In [1]:
import numpy as np
import pandas as pd
import re
import warnings
warnings.filterwarnings('ignore')

In [2]:
df=pd.read_csv('blogtext.csv')
df.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...


In [3]:
df.shape

(681284, 7)

In [4]:
# dropping duplicate values if any
df.drop_duplicates(inplace=True)

### Working on sample as dataset is too large

In [5]:
data=df.sample(frac=0.25,random_state=42)
data.shape

(169150, 7)

In [6]:
data.isnull().sum()

id        0
gender    0
age       0
topic     0
sign      0
date      0
text      0
dtype: int64

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 169150 entries, 255678 to 329229
Data columns (total 7 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   id      169150 non-null  int64 
 1   gender  169150 non-null  object
 2   age     169150 non-null  int64 
 3   topic   169150 non-null  object
 4   sign    169150 non-null  object
 5   date    169150 non-null  object
 6   text    169150 non-null  object
dtypes: int64(2), object(5)
memory usage: 10.3+ MB


### Preprocess rows of the “text” column

In [8]:
#Remove unwanted characters
data['new_text']=data['text'].apply(lambda x: re.sub(r'[^A-Za-z]+',' ',x))

In [9]:
#convert text to lower case
data['new_text']=data['new_text'].apply(lambda x: x.lower())

In [10]:
#Remove unwanted spaces
data['new_text']=data['new_text'].apply(lambda x: x.strip())

In [11]:
#Remove stopwords
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words=set(stopwords.words('english'))
data['new_text']=data['new_text'].apply(lambda x: ' '.join([i for i in x.split() if i not in stop_words]))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\AMAN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
print(data['text'].iloc[1],data['new_text'].iloc[1],sep='\n')

       For the first time since the end of school I met up with Brandon and Sean to PK at Vincent Elementary, and DAMN WAS IT AWESOME!!  I got there about 20 minutes before everyone else and pk'd some, but really not a lot seeing as I feel tired.  AANYWAY.  I drilled monkey vaults and attempted some kongs, which I still freak out on and dont put my legs through my arms.  Oh well.  Um, we talked a lot and ate food some.  Then when Brandon got there we started going around the playground a little bit.  Much to my surprise, I found that rolling on woodchips is not comfortable after a 10 foot drop, they feel like little rocks digging into your spine.  So I went back to vaulting this little tube thing, that was fun. Then Sean came and we practiced wall-runs, stretched, and did some flow activities.  I need to relax for a little bit, but not moving for a day is rough.  I'm addicted!!!    
first time since end school met brandon sean pk vincent elementary damn awesome got minutes everyone els

### Label columns to merge: “gender”, “age”, “topic”, “sign”

In [13]:
data['label']=data.apply(lambda x: [x['gender'],str(x['age']),x['topic'],x['sign']], axis=1)
data.head()

Unnamed: 0,id,gender,age,topic,sign,date,text,new_text,label
255678,1772041,male,14,Student,Sagittarius,"04,August,2004",ahhh hayo.. i dunt believe we are...,ahhh hayo dunt believe yr carazy yes well weir...,"[male, 14, Student, Sagittarius]"
606383,3353160,female,14,Arts,Leo,"26,June,2004",For the first time since the end of sch...,first time since end school met brandon sean p...,"[female, 14, Arts, Leo]"
40902,3798944,male,23,Banking,Capricorn,"01,July,2004",Wow... today has been HECTIC to say...,wow today hectic say least busy year hope sign...,"[male, 23, Banking, Capricorn]"
626648,1944324,male,16,Student,Taurus,"08,November,2003",Celebrating 10 posts of Jonah! (Hosted ...,celebrating posts jonah hosted laurence fishbu...,"[male, 16, Student, Taurus]"
71623,479019,male,24,Student,Gemini,"02,July,2004",Man... I've been having some majo...,man majorly trippy dreams past week two last n...,"[male, 24, Student, Gemini]"


In [14]:
data=data[['new_text','label']]
data.head()

Unnamed: 0,new_text,label
255678,ahhh hayo dunt believe yr carazy yes well weir...,"[male, 14, Student, Sagittarius]"
606383,first time since end school met brandon sean p...,"[female, 14, Arts, Leo]"
40902,wow today hectic say least busy year hope sign...,"[male, 23, Banking, Capricorn]"
626648,celebrating posts jonah hosted laurence fishbu...,"[male, 16, Student, Taurus]"
71623,man majorly trippy dreams past week two last n...,"[male, 24, Student, Gemini]"


### Separate features and labels, and split the data into training and testing

In [15]:
from sklearn.model_selection import train_test_split
X=data['new_text']
y=data['label']
x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

### Vectorize the features

Create a Bag of Words using count vectorizer

    -i.Use ngram_range=(1, 2)
    -ii.Vectorize training and testing features

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer=CountVectorizer(binary=True,ngram_range=(1,2))
x_train=vectorizer.fit_transform(x_train)
x_test=vectorizer.transform(x_test)

##### Printing Term-Document Matrix

In [17]:
vectorizer.get_feature_names()

['aa',
 'aa aa',
 'aa aaa',
 'aa ad',
 'aa advert',
 'aa always',
 'aa anyway',
 'aa apparently',
 'aa assertive',
 'aa ball',
 'aa batteried',
 'aa batteries',
 'aa bdaee',
 'aa beautiful',
 'aa bf',
 'aa bhii',
 'aa big',
 'aa bigbook',
 'aa bit',
 'aa book',
 'aa bra',
 'aa bring',
 'aa brown',
 'aa bukannya',
 'aa business',
 'aa certain',
 'aa championship',
 'aa class',
 'aa club',
 'aa coming',
 'aa compare',
 'aa considers',
 'aa corti',
 'aa degree',
 'aa div',
 'aa doesnt',
 'aa done',
 'aa dragon',
 'aa enough',
 'aa ers',
 'aa etc',
 'aa example',
 'aa favoring',
 'aa finished',
 'aa flights',
 'aa gaya',
 'aa gaye',
 'aa gayi',
 'aa gift',
 'aa girls',
 'aa going',
 'aa good',
 'aa great',
 'aa grins',
 'aa ha',
 'aa haha',
 'aa hchannalhxaa',
 'aa hmm',
 'aa htm',
 'aa html',
 'aa hyper',
 'aa jao',
 'aa jeff',
 'aa kk',
 'aa kokoro',
 'aa lah',
 'aa lazy',
 'aa like',
 'aa lizzy',
 'aa long',
 'aa lot',
 'aa magrini',
 'aa masti',
 'aa meeting',
 'aa meetings',
 'aa memb

### Create a dictionary to get the count of every label i.e. the key will be label name and value will be the total count of the label.

In [19]:
label_count=dict()

for labels in y.values:
    for label in labels:
        if label in label_count:
            label_count[label]+=1
        else:
            label_count[label]=1
print(label_count)

{'male': 85506, '14': 6830, 'Student': 38480, 'Sagittarius': 12354, 'female': 83644, 'Arts': 8097, 'Leo': 13363, '23': 18095, 'Banking': 1044, 'Capricorn': 12254, '16': 18285, 'Taurus': 15347, '24': 19939, 'Gemini': 12617, '40': 1238, 'Accounting': 933, 'Scorpio': 14256, '17': 19943, '27': 11508, 'Law': 2258, 'Aries': 16287, '34': 5336, 'Religion': 1299, 'indUnk': 62041, 'Communications-Media': 5105, '33': 4387, 'Libra': 15467, '15': 10390, 'Pisces': 13289, 'Technology': 10383, 'Cancer': 16157, '13': 3076, '25': 16709, '26': 13704, 'Virgo': 15209, '47': 527, 'Internet': 3953, 'Marketing': 1133, 'Education': 7402, 'Museums-Libraries': 774, 'Engineering': 2825, '35': 4272, 'Telecommunications': 964, 'Architecture': 385, 'Non-Profit': 3676, 'Aquarius': 12550, '38': 1836, 'HumanResources': 761, 'Publishing': 1923, 'Chemicals': 955, 'Agriculture': 332, 'Science': 1851, '39': 1363, '46': 642, 'Biotech': 562, '37': 2359, '45': 1115, 'Government': 1720, 'BusinessServices': 1095, 'Automotive': 

### Convert your train and test labels using MultiLabelBinarizer

In [20]:
from sklearn.preprocessing import MultiLabelBinarizer
binarizer=MultiLabelBinarizer(classes=sorted(label_count.keys()))
y_train=binarizer.fit_transform(y_train.values)
y_test=binarizer.transform(y_test.values)

### Making Prediction

In [21]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
clf=LogisticRegression(solver='lbfgs')
clf=OneVsRestClassifier(clf)
clf.fit(x_train,y_train)

OneVsRestClassifier(estimator=LogisticRegression())

In [23]:
print('Train Score: ',clf.score(x_train,y_train))

Train Score:  0.8590156665681348


In [26]:
from sklearn.metrics import accuracy_score,f1_score,average_precision_score,recall_score
y_pred=clf.predict(x_test)
print('Accuracy score: ', accuracy_score(y_test, y_pred))
print('F1 score: ', f1_score(y_test, y_pred,average='micro'))
print('Average precision score: ', average_precision_score(y_test, y_pred,average='micro'))
print('Average recall score: ', recall_score(y_test, y_pred,average='micro'))

Accuracy score:  0.01776529707360331
F1 score:  0.35323349252763236
Average precision score:  0.18836380205444334
Average recall score:  0.24951226721844516


### Comparing Actual and Predicted

In [27]:
y_test_inversed = binarizer.inverse_transform(y_test)
y_pred_inversed = binarizer.inverse_transform(y_pred)

In [29]:
for i in range(0,5):
    print('Text:\t{}\nActual Labels:\t{}\nPredicted labels:\t{}\n\n'.format(
        x_test[i],
        ','.join(y_test_inversed[i]),
        ','.join(y_pred_inversed[i])
    ))

Text:	  (0, 5528)	1
  (0, 5595)	1
  (0, 5766)	1
  (0, 6176)	1
  (0, 6460)	1
  (0, 8852)	1
  (0, 8861)	1
  (0, 10993)	1
  (0, 11204)	1
  (0, 24468)	1
  (0, 24478)	1
  (0, 41458)	1
  (0, 41662)	1
  (0, 47645)	1
  (0, 48179)	1
  (0, 73589)	1
  (0, 73668)	1
  (0, 74014)	1
  (0, 74522)	1
  (0, 74807)	1
  (0, 74847)	1
  (0, 92053)	1
  (0, 93009)	1
  (0, 93063)	1
  (0, 98022)	1
  :	:
  (0, 6378797)	1
  (0, 6379777)	1
  (0, 6381177)	1
  (0, 6415635)	1
  (0, 6416109)	1
  (0, 6435299)	1
  (0, 6435929)	1
  (0, 6480426)	1
  (0, 6481247)	1
  (0, 6497536)	1
  (0, 6499296)	1
  (0, 6538678)	1
  (0, 6541545)	1
  (0, 6541549)	1
  (0, 6543978)	1
  (0, 6546068)	1
  (0, 6546425)	1
  (0, 6552831)	1
  (0, 6555902)	1
  (0, 6560809)	1
  (0, 6561936)	1
  (0, 6564035)	1
  (0, 6589130)	1
  (0, 6589132)	1
  (0, 6589134)	1
Actual Labels:	24,Taurus,indUnk,male
Predicted labels:	indUnk,male


Text:	  (0, 158395)	1
  (0, 159441)	1
  (0, 162068)	1
  (0, 166916)	1
  (0, 168271)	1
  (0, 177459)	1
  (0, 177695)	1
  (0, 19