# PROJECT B

In [56]:
# importing the libraries
import numpy as np
import pandas as pd
import re 
import nltk
import random

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score   

import warnings
warnings.filterwarnings('ignore')

# Loading dataset

In [57]:
#importing the dataset
blog_df = pd.read_csv("Dataset - blogtext.csv")

In [58]:
#checking sample data to see the overview of the dataset
blog_df.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...


In [59]:
# checking the info of the dataset
blog_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 681284 entries, 0 to 681283
Data columns (total 7 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   id      681284 non-null  int64 
 1   gender  681284 non-null  object
 2   age     681284 non-null  int64 
 3   topic   681284 non-null  object
 4   sign    681284 non-null  object
 5   date    681284 non-null  object
 6   text    681284 non-null  object
dtypes: int64(2), object(5)
memory usage: 36.4+ MB


In [60]:
# checking for any null values
blog_df.isna().sum()

id        0
gender    0
age       0
topic     0
sign      0
date      0
text      0
dtype: int64

In [61]:
# checking the shape of the dataset
blog_df.shape

(681284, 7)

In [62]:
# taking the subset of the dataset
blog_df_subset = blog_df.head(20000)

In [63]:
blog_df_subset.reset_index(drop=True, inplace=True)

In [64]:
blog_df_subset.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...


In [65]:
blog_df_subset.shape

(20000, 7)

# Preprocessing the text column

In [66]:
# text column
blog_df_subset['text']

0                   Info has been found (+/- 100 pages,...
1                   These are the team members:   Drewe...
2                   In het kader van kernfusie op aarde...
3                         testing!!!  testing!!!          
4                     Thanks to Yahoo!'s Toolbar I can ...
                               ...                        
19995                     Writing tests are retarded......
19996                     Ok nothing happened that was ...
19997                     So Daniel who do you want to ...
19998                     I finally got around to seein...
19999                     My chemistry teacher must thi...
Name: text, Length: 20000, dtype: object

In [67]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aruns\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [68]:
from nltk.corpus import stopwords
stopwords=set(stopwords.words('english'))

In [69]:
# function to process the text
def Text_Processing(text):
    # removing the unwanted characters
    process_text = re.sub(r'[^A-Za-z]+',' ',text)
    
    # converting into lower text
    process_text = process_text.lower()
    
    # removing the trailing and leading spaces
    process_text = process_text.strip()
    
    #removing stop words
    process_text = ' '.join([words for words in process_text.split() if words not in stopwords])
    
    # return processed text
    return process_text 

In [70]:
# applying the text processing to the text column
blog_df_subset['processed_text'] = blog_df_subset['text'].apply(Text_Processing)

In [71]:
# Processed data example
blog_df_subset['processed_text'][10]

'ah korean language looks difficult first figure read hanguel korea surprisingly easy learn alphabet characters seems easy vocabulary starts oh backwards us sentence structure yikes luckily many options us slow witted foreigners take language course could list urllink joongang article says lot resources urllink well guy motivation jeon ji hyun latest something actually star movies cfs hear means commercial feature positive saw latest movie sunday night hard describe name english version windstruck korean version yeochinso short ne yeojachingu rul sogayhamnida like introduce girlfriend surprisingly titles make sense like website korean english looks quite good actually urllink movie shown theatres subtitles special times info urllink list many theatres seoul click urllink urllink great reason learn korean already married went foreigners well local korean national course korean take picture put urllink movie hof bar update bud mine passed urllink link giordano ad apparently aired korea n

In [72]:
# Original data example
blog_df_subset['text'][10]

"             Ah, the Korean language...it looks so difficult at first, then as you figure out how to read Hanguel (Korea's surprisingly easy-to-learn alphabet of 24 characters) it seems so easy. Then the vocabulary starts. Oh no. Then the backwards (to us) sentence structure.  Yikes!  Luckily there are many options for us slow-witted foreigners to take on the language.  Of course I could list them here but  urlLink this JoongAng article  says a lot and there are more resources  urlLink here .    Well, if you're a guy here is some motivation for you: Jeon Ji Hyun (전지현), the latest 20-something (24, actually) star of movies and CFs (I hear this means Commercial Feature, but not positive).  I saw her latest movie on Sunday night.  It's hard to describe the name...the English version is 'Windstruck' but the Korean version is 여친소 (yeochinso) which is short for 내여자친구를소개합니다 (ne yeojachingu rul sogayhamnida) or 'I'd like to introduce you to my girlfriend'.  Surprisingly, both titles make sens

In [18]:
blog_df_subset['text'] = blog_df_subset['processed_text']

# Merge columns for multi-label classification

In [19]:
# age column converting to object type
blog_df_subset['age']=blog_df_subset['age'].astype('object')

In [20]:
# merging the label columns
blog_df_subset['labels']=blog_df_subset.apply(lambda col: [col['gender'],str(col['age']),col['topic'],col['sign']], axis=1)

In [21]:
blog_df_subset

Unnamed: 0,id,gender,age,topic,sign,date,text,processed_text,labels
0,2059027,male,15,Student,Leo,"14,May,2004",info found pages mb pdf files wait untill team...,info found pages mb pdf files wait untill team...,"[male, 15, Student, Leo]"
1,2059027,male,15,Student,Leo,"13,May,2004",team members drewes van der laag urllink mail ...,team members drewes van der laag urllink mail ...,"[male, 15, Student, Leo]"
2,2059027,male,15,Student,Leo,"12,May,2004",het kader van kernfusie op aarde maak je eigen...,het kader van kernfusie op aarde maak je eigen...,"[male, 15, Student, Leo]"
3,2059027,male,15,Student,Leo,"12,May,2004",testing testing,testing testing,"[male, 15, Student, Leo]"
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",thanks yahoo toolbar capture urls popups means...,thanks yahoo toolbar capture urls popups means...,"[male, 33, InvestmentBanking, Aquarius]"
...,...,...,...,...,...,...,...,...,...
19995,1019710,male,16,Student,Pisces,"10,February,2004",writing tests retarded along chemistry project...,writing tests retarded along chemistry project...,"[male, 16, Student, Pisces]"
19996,1019710,male,16,Student,Pisces,"31,March,2004",ok nothing happened crazy except chipped back ...,ok nothing happened crazy except chipped back ...,"[male, 16, Student, Pisces]"
19997,1019710,male,16,Student,Pisces,"30,March,2004",daniel want win election get question alot ok ...,daniel want win election get question alot ok ...,"[male, 16, Student, Pisces]"
19998,1019710,male,16,Student,Pisces,"29,March,2004",finally got around seeing passion christ well ...,finally got around seeing passion christ well ...,"[male, 16, Student, Pisces]"


In [22]:
# removing the columns id, gender, age, topic, sign, date, text
blog_df_subset = blog_df_subset.drop(['id','gender','age','topic','sign','date', 'text'], axis=1)

In [23]:
blog_df_subset

Unnamed: 0,processed_text,labels
0,info found pages mb pdf files wait untill team...,"[male, 15, Student, Leo]"
1,team members drewes van der laag urllink mail ...,"[male, 15, Student, Leo]"
2,het kader van kernfusie op aarde maak je eigen...,"[male, 15, Student, Leo]"
3,testing testing,"[male, 15, Student, Leo]"
4,thanks yahoo toolbar capture urls popups means...,"[male, 33, InvestmentBanking, Aquarius]"
...,...,...
19995,writing tests retarded along chemistry project...,"[male, 16, Student, Pisces]"
19996,ok nothing happened crazy except chipped back ...,"[male, 16, Student, Pisces]"
19997,daniel want win election get question alot ok ...,"[male, 16, Student, Pisces]"
19998,finally got around seeing passion christ well ...,"[male, 16, Student, Pisces]"


# Separating features & labels and split into training and testing

In [24]:
X = blog_df_subset['processed_text']

In [25]:
y = blog_df_subset['labels']

In [26]:
Xtrain,Xtest,Ytrain,Ytest=train_test_split(X,y,test_size=0.2)

In [27]:
print("Xtrain shape", Xtrain.shape)
print("Xtest shape", Xtest.shape)
print("Ytrain shape", Ytrain.shape)
print("Ytest shape", Ytest.shape)

Xtrain shape (16000,)
Xtest shape (4000,)
Ytrain shape (16000,)
Ytest shape (4000,)


# Vectorizing features

In [28]:
# initializing the vectorization
vectorizer=CountVectorizer(binary=True, ngram_range=(1,2))

In [29]:
# vectorizing features
X=vectorizer.fit_transform(X)

In [30]:
# checking few samples
vectorizer.get_feature_names()[:50]

['aa',
 'aa aa',
 'aa advert',
 'aa amazing',
 'aa anger',
 'aa batteries',
 'aa compared',
 'aa ended',
 'aa htm',
 'aa keeps',
 'aa months',
 'aa nice',
 'aa process',
 'aa real',
 'aa sd',
 'aa species',
 'aa sudden',
 'aa ting',
 'aaa',
 'aaa aaa',
 'aaa andy',
 'aaa assistance',
 'aaa come',
 'aaa discount',
 'aaa get',
 'aaa hahahaha',
 'aaa joe',
 'aaa looks',
 'aaa rated',
 'aaa someone',
 'aaa take',
 'aaa tow',
 'aaa travel',
 'aaaa',
 'aaaa jet',
 'aaaaaa',
 'aaaaaa comes',
 'aaaaaa honest',
 'aaaaaaa',
 'aaaaaaa love',
 'aaaaaaaaaaaah',
 'aaaaaaaaaaah',
 'aaaaaaaaaaah drove',
 'aaaaaaaaaaahhhhhhhhhhhhhhhhhhh',
 'aaaaaaaaaaahhhhhhhhhhhhhhhhhhh hw',
 'aaaaaaaaaaand',
 'aaaaaaaaaaand done',
 'aaaaaaaaaaauuugh',
 'aaaaaaaaaaauuugh least',
 'aaaaaaaaaaawwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww']

In [31]:
# printing the document term matrix for few samples
df = pd.DataFrame(X[:100,:150].toarray(), columns=vectorizer.get_feature_names()[:150])
df

Unnamed: 0,aa,aa aa,aa advert,aa amazing,aa anger,aa batteries,aa compared,aa ended,aa htm,aa keeps,...,aaahhhh diva,aaahhhh excited,aaahing,aaahing much,aaanyway,aaanyway found,aaanyway shut,aaargh,aaargh damn,aaargh decided
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
96,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
97,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
98,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
# Vectorize training and testing features
Xtrain_vec = vectorizer.transform(Xtrain)

In [33]:
Xtest_vec = vectorizer.transform(Xtest)

In [34]:
print(Xtrain_vec[0])

  (0, 639842)	1
  (0, 640989)	1
  (0, 759527)	1
  (0, 762526)	1
  (0, 1138396)	1
  (0, 1138558)	1
  (0, 1213285)	1
  (0, 1217425)	1


In [35]:
print(Xtest_vec[0])

  (0, 8815)	1
  (0, 9208)	1
  (0, 10282)	1
  (0, 10388)	1
  (0, 12179)	1
  (0, 12180)	1
  (0, 45643)	1
  (0, 45644)	1
  (0, 55093)	1
  (0, 55116)	1
  (0, 55276)	1
  (0, 55317)	1
  (0, 55506)	1
  (0, 56854)	1
  (0, 56860)	1
  (0, 60509)	1
  (0, 60557)	1
  (0, 62930)	1
  (0, 63171)	1
  (0, 64586)	1
  (0, 64593)	1
  (0, 66026)	1
  (0, 66040)	1
  (0, 67829)	1
  (0, 67940)	1
  :	:
  (0, 1235945)	1
  (0, 1236045)	1
  (0, 1236254)	1
  (0, 1236267)	1
  (0, 1247112)	1
  (0, 1247161)	1
  (0, 1254528)	1
  (0, 1254760)	1
  (0, 1256810)	1
  (0, 1257518)	1
  (0, 1258102)	1
  (0, 1258539)	1
  (0, 1258726)	1
  (0, 1263851)	1
  (0, 1264169)	1
  (0, 1265436)	1
  (0, 1265559)	1
  (0, 1269944)	1
  (0, 1269981)	1
  (0, 1271567)	1
  (0, 1271569)	1
  (0, 1283558)	1
  (0, 1283701)	1
  (0, 1289827)	1
  (0, 1289848)	1


# Dictionary to get the count of every label 

In [36]:
# getting the label counts
label_counts=dict()

for labels in blog_df_subset.labels.values:
    for label in labels:
        if label in label_counts:
            label_counts[label]+=1
        else:
            label_counts[label]=1

In [37]:
label_counts

{'male': 11354,
 '15': 1097,
 'Student': 2637,
 'Leo': 1732,
 '33': 769,
 'InvestmentBanking': 71,
 'Aquarius': 1313,
 'female': 8646,
 '14': 811,
 'indUnk': 7789,
 'Aries': 5209,
 '25': 1190,
 'Capricorn': 930,
 '17': 1961,
 'Gemini': 780,
 '23': 1963,
 'Non-Profit': 204,
 'Cancer': 1536,
 'Banking': 89,
 '37': 130,
 'Sagittarius': 2153,
 '26': 919,
 '24': 1557,
 'Scorpio': 1485,
 '27': 2320,
 'Education': 759,
 '45': 72,
 'Engineering': 357,
 'Libra': 983,
 'Science': 87,
 '34': 871,
 '41': 82,
 'Communications-Media': 414,
 'BusinessServices': 184,
 'Sports-Recreation': 120,
 'Virgo': 871,
 'Taurus': 1330,
 'Arts': 358,
 'Pisces': 1678,
 '44': 9,
 '16': 1236,
 'Internet': 778,
 'Museums-Libraries': 67,
 'Accounting': 35,
 '39': 105,
 '35': 2494,
 'Technology': 2989,
 '36': 1726,
 'Law': 47,
 '46': 188,
 'Consulting': 166,
 'Automotive': 111,
 '42': 47,
 'Religion': 182,
 '13': 113,
 'Fashion': 1622,
 '38': 85,
 '43': 6,
 'Publishing': 70,
 '40': 1,
 'Marketing': 207,
 'LawEnforcemen

# Transform labels

In [38]:
# initializing the multilabel binarizer
binarizer=MultiLabelBinarizer(classes=sorted(label_counts.keys()))

In [39]:
# transforming training labels
Ytrain=binarizer.fit_transform(Ytrain)

In [40]:
# transforming testing labels
Ytest = binarizer.fit_transform(Ytest)

# Classification

In [41]:
clf = LogisticRegression(solver = 'lbfgs', max_iter=500)

#One-vs-Rest approach
clf = OneVsRestClassifier(clf)

In [42]:
clf.fit(Xtrain_vec, Ytrain)

OneVsRestClassifier(estimator=LogisticRegression(max_iter=500))

In [43]:
pred=clf.predict(Xtest_vec)

In [44]:
clf_Train = clf.score(Xtrain_vec, Ytrain)
print("Training Accuracy score: ", clf_Train)

clf_Test = clf.score(Xtest_vec,Ytest)
print("Testing Accuracy score: ", clf_Test)

pred = clf.predict(Xtest_vec)

clf_f1 = f1_score(Ytest, pred,average='micro')
print("f1 score: ",clf_f1)

Training Accuracy score:  0.93475
Testing Accuracy score:  0.162
f1 score:  0.5300136708231492


In [45]:
def scores_averaging(Ytest, pred, avg_metric):
    print('Accuracy score: ', accuracy_score(Ytest, pred))
    print('F1 score: ', f1_score(Ytest, pred, average=avg_metric))
    print('Average precision score: ', average_precision_score(Ytest, pred, average=avg_metric))
    print('Average recall score: ', recall_score(Ytest, pred, average=avg_metric))

In [46]:
# Average with micro
scores_averaging(Ytest, pred, "micro")

Accuracy score:  0.162
F1 score:  0.5300136708231492
Average precision score:  0.345828963534897
Average recall score:  0.3998125


In [47]:
# Average with macro
scores_averaging(Ytest, pred, "macro")

Accuracy score:  0.162
F1 score:  0.1939167429826198
Average precision score:  nan
Average recall score:  0.1345445223904179


In [48]:
# Average with weighted
scores_averaging(Ytest, pred, "weighted")

Accuracy score:  0.162
F1 score:  0.478011203813244
Average precision score:  0.40694379720974916
Average recall score:  0.3998125


Macro-average : This will calculate the average for each class independently and then take the average. 

Weighted - average : This will take into consideration of contribution of each label in the class and then returns the average considering the proportion for each label in the dataset.

<b>Micro-average : This will take into consideration of contribution of all the classes to compute the average. Hence as we are seeing a class imbalance on the labels, this metric is favorable.<b>
    


In [50]:
# Function to print true label and predicted label for any five random values
for i in range(5):
    j = random.randint(0,len(Ytest))
    print("Predicted - ", binarizer.inverse_transform(pred)[j])
    print("Actual - ", binarizer.inverse_transform(Ytest)[j])  
    print("\n")

Predicted -  ('17', 'male')
Actual -  ('17', 'Cancer', 'Student', 'male')


Predicted -  ('25', 'Virgo', 'female')
Actual -  ('25', 'Virgo', 'female', 'indUnk')


Predicted -  ('male',)
Actual -  ('23', 'Marketing', 'Scorpio', 'male')


Predicted -  ('male',)
Actual -  ('16', 'Cancer', 'indUnk', 'male')


Predicted -  ('female', 'indUnk')
Actual -  ('23', 'Scorpio', 'female', 'indUnk')


