In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report,f1_score, accuracy_score, recall_score, precision_score
from nltk.stem import WordNetLemmatizer
from sklearn.svm import SVC

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import os
os.chdir('/content/drive/My Drive')

In [4]:
df = pd.read_csv("blogtext.csv")

In [5]:
df.head(5)

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...


In [6]:
df.shape

(681284, 7)

In [7]:
#There are 681284 rows, so performing operations on smaller dataset first.

In [8]:
df = df[:3000]
print(df.shape)
df["text"].loc[0]

(3000, 7)


'           Info has been found (+/- 100 pages, and 4.5 MB of .pdf files) Now i have to wait untill our team leader has processed it and learns html.         '

In [9]:
# Pre-Preocessing text coloumn

In [10]:
# Removing unwanted and special characters.
df["text"] = df["text"].str.replace('[^A-Za-z]',' ')
df["text"].loc[0]

'           Info has been found          pages  and     MB of  pdf files  Now i have to wait untill our team leader has processed it and learns html          '

In [11]:
#Converting letters to lower case
df['text'] = df['text'].str.lower()
df["text"].loc[0]

'           info has been found          pages  and     mb of  pdf files  now i have to wait untill our team leader has processed it and learns html          '

In [12]:
#Space removal
df["text"] = df["text"].str.strip()
df["text"].loc[0]

'info has been found          pages  and     mb of  pdf files  now i have to wait untill our team leader has processed it and learns html'

In [13]:
#Splitting each row of text data into individual words and then removing stop words.
df["text"] = df["text"].str.split()

import nltk
nltk.download('stopwords')

stop = stopwords.words('english')
def removestopwords(y): 
 stopwordsremoved = [w for w in y if w not in stop]
 return(" ".join(stopwordsremoved))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
text_column_size = df["text"].size
print("text column size :", text_column_size)

# Initialize an empty list to hold the text after stop word removal
df_text = []

# Looping over each text
for i in range( 0, text_column_size):
    df_text.append(removestopwords(df["text"][i]))

text column size : 3000


In [15]:
df_text[10]

'ah korean language looks difficult first figure read hanguel korea surprisingly easy learn alphabet characters seems easy vocabulary starts oh backwards us sentence structure yikes luckily many options us slow witted foreigners take language course could list urllink joongang article says lot resources urllink well guy motivation jeon ji hyun latest something actually star movies cfs hear means commercial feature positive saw latest movie sunday night hard describe name english version windstruck korean version yeochinso short ne yeojachingu rul sogayhamnida like introduce girlfriend surprisingly titles make sense like website korean english looks quite good actually urllink movie shown theatres subtitles special times info urllink list many theatres seoul click urllink urllink great reason learn korean already married went foreigners well local korean national course korean take picture put urllink movie hof bar update bud mine passed urllink link giordano ad apparently aired korea n

In [16]:
df["text"]

0       [info, has, been, found, pages, and, mb, of, p...
1       [these, are, the, team, members, drewes, van, ...
2       [in, het, kader, van, kernfusie, op, aarde, ma...
3                                      [testing, testing]
4       [thanks, to, yahoo, s, toolbar, i, can, now, c...
                              ...                        
2995    [but, that, zoo, exhibit, thing, was, mucho, m...
2996    [my, fave, song, for, the, day, aimee, mann, p...
2997           [urllink, america, s, best, zoo, exhibits]
2998    [the, less, one, makes, declaritive, statement...
2999    [while, his, status, as, a, media, personality...
Name: text, Length: 3000, dtype: object

In [17]:
# Replacing df["text"] column with df_text
df["text"] = df_text

In [18]:
# Lemmatization
import nltk
nltk.download('wordnet')

w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    lemm = [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)]
    return(" ".join(lemm)) 

df["text"] = df.text.apply(lemmatize_text)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [19]:
df["text"][10]

'ah korean language look difficult first figure read hanguel korea surprisingly easy learn alphabet character seems easy vocabulary start oh backwards u sentence structure yikes luckily many option u slow witted foreigner take language course could list urllink joongang article say lot resource urllink well guy motivation jeon ji hyun latest something actually star movie cf hear mean commercial feature positive saw latest movie sunday night hard describe name english version windstruck korean version yeochinso short ne yeojachingu rul sogayhamnida like introduce girlfriend surprisingly title make sense like website korean english look quite good actually urllink movie shown theatre subtitle special time info urllink list many theatre seoul click urllink urllink great reason learn korean already married went foreigner well local korean national course korean take picture put urllink movie hof bar update bud mine passed urllink link giordano ad apparently aired korea nothing xxx sensibil

In [20]:
# Merging
df.head(2)

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004",info found page mb pdf file wait untill team l...
1,2059027,male,15,Student,Leo,"13,May,2004",team member drewes van der laag urllink mail r...


In [21]:
# Merging gender, age, topic and sign
df['age'] = df['age'].astype(str)
df['Labels'] = df[['gender','age','topic','sign']].apply(lambda x: ','.join(x), axis = 1) 
df_merged = df.drop(labels = ['date','gender', 'age','topic','sign','id'], axis = 1)
df_merged.head()

Unnamed: 0,text,Labels
0,info found page mb pdf file wait untill team l...,"male,15,Student,Leo"
1,team member drewes van der laag urllink mail r...,"male,15,Student,Leo"
2,het kader van kernfusie op aarde maak je eigen...,"male,15,Student,Leo"
3,testing testing,"male,15,Student,Leo"
4,thanks yahoo toolbar capture url popups mean s...,"male,33,InvestmentBanking,Aquarius"


In [22]:
df_merged.shape

(3000, 2)

In [23]:
# Training and Testing
feature = df_merged['text']
df_merged['Labels'] = df_merged['Labels'].str.lower()
labels = df_merged['Labels']
X_train, X_test, Y_train, Y_test = train_test_split(feature,labels, test_size = 0.33, random_state = 143)
Y_train.shape

(2010,)

In [24]:
# Vectorization
# Using count vectorizer
vectorizer = CountVectorizer(min_df = 2,ngram_range = (1,2),stop_words = "english")
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)
print("X_train shape & sample",X_train.shape)
X_train[0]

X_train shape & sample (2010, 16018)


<1x16018 sparse matrix of type '<class 'numpy.int64'>'
	with 27 stored elements in Compressed Sparse Row format>

In [25]:
# Dictionary to get the count of every label.
vectorizer_labels = CountVectorizer(min_df = 1,ngram_range = (1,1),stop_words = "english")
labels_vector = vectorizer_labels.fit_transform(labels)
vectorizer_labels.vocabulary_

#Extracting only key value from dictionary above
label_classes = []  
for key in vectorizer_labels.vocabulary_.keys():
    label_classes.append(key)
    
print(sorted(label_classes))

['14', '15', '16', '17', '23', '24', '25', '26', '27', '33', '34', '35', '37', '39', '41', '44', '45', 'accounting', 'aquarius', 'aries', 'arts', 'banking', 'businessservices', 'cancer', 'capricorn', 'communications', 'education', 'engineering', 'female', 'gemini', 'indunk', 'internet', 'investmentbanking', 'leo', 'libra', 'libraries', 'male', 'media', 'museums', 'non', 'pisces', 'profit', 'recreation', 'sagittarius', 'science', 'scorpio', 'sports', 'student', 'taurus', 'technology', 'virgo']


In [26]:
# Transforming the Labels.
# Should transform labels into binary form
# Using MultilabelBinarizer
mlb = MultiLabelBinarizer(classes = label_classes)

In [27]:
# Converting Labels to fit the format required by MultiLabelBinarizer
labels = [["".join(re.findall("\w",f)) for f in lst] for lst in [s.split(",") for s in labels]]
labels[35]

['male', '33', 'investmentbanking', 'aquarius']

In [28]:
labels_trans = mlb.fit(labels) # transforming entire set of lables
labels_trans

MultiLabelBinarizer(classes=['male', '15', 'student', 'leo', '33',
                             'investmentbanking', 'aquarius', 'female', '14',
                             'indunk', 'aries', '25', 'capricorn', '17',
                             'gemini', '23', 'non', 'profit', 'cancer',
                             'banking', '37', 'sagittarius', '26', '24',
                             'scorpio', '27', 'education', '45', 'engineering',
                             'libra', ...],
                    sparse_output=False)

In [29]:
# Converting Y_train to fit the format required by MultiLabelBinarizer
Y_train = [["".join(re.findall("\w",f)) for f in lst] for lst in [s.split(",") for s in Y_train]]
Y_train[35]

['male', '15', 'science', 'libra']

In [30]:
# Transforming train labels using mlb
Y_train_trans = mlb.transform(Y_train)
Y_train_trans[35]

  .format(sorted(unknown, key=str)))


array([1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0])

In [31]:
# Now converting Y_test into format required by mlb
Y_test = [["".join(re.findall("\w",f)) for f in lst] for lst in [s.split(",") for s in Y_test]]
Y_test_trans = mlb.transform(Y_test) # transforming test labels.
Y_test[35]

  .format(sorted(unknown, key=str)))


['male', '35', 'technology', 'aries']

CLassifier

*Using One vs Rest Approach, Number of tags are trained.

*Using Logistic Regression

In [32]:
clf = LogisticRegression(solver = 'lbfgs',max_iter = 1000)  # initiating the classifier
clf = OneVsRestClassifier(clf)

In [33]:
# Fitting the classifier.
clf.fit(X_train,Y_train_trans)

  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))


OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=1000,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [34]:
# Accuracy

print("Train Accuracy:",clf.score(X_train,Y_train_trans))


Train Accuracy: 0.972139303482587


In [35]:
Y_pred = clf.predict(X_test)

In [36]:
print("Test Accuracy:" + str(accuracy_score(Y_test_trans, Y_pred)))
print("F1: " + str(f1_score(Y_test_trans, Y_pred, average='micro')))
print("Precision: " + str(precision_score(Y_test_trans, Y_pred, average='micro')))
print("Recall: " + str(recall_score(Y_test_trans, Y_pred, average='micro')))

Test Accuracy:0.5686868686868687
F1: 0.7584394023242943
Precision: 0.8270971635485818
Recall: 0.7003065917220235


In [37]:
# Printing true labels and predicted labels.
# Inverse transforming
Y_pred_inv = mlb.inverse_transform(Y_pred)
Y_test_trans_inv =  mlb.inverse_transform(Y_test_trans)

In [38]:
print("Ex. 1 - predicted :",Y_pred_inv[20])
print("Ex.  1 - Actual :",Y_test_trans_inv[20])
print("Ex. 1 - Actual_before mlb transformation :",Y_test[20])

Ex. 1 - predicted : ('male', 'aries', '35', 'technology')
Ex.  1 - Actual : ('male', 'aries', '35', 'technology')
Ex. 1 - Actual_before mlb transformation : ['male', '35', 'technology', 'aries']


In [39]:
print("Ex. 1 - predicted :",Y_pred_inv[225])
print("Ex.  1 - Actual :",Y_test_trans_inv[225])
print("Ex. 1 - Actual_before mlb transformation :",Y_test[225])

Ex. 1 - predicted : ('female',)
Ex.  1 - Actual : ('female', 'indunk', '24', 'scorpio')
Ex. 1 - Actual_before mlb transformation : ['female', '24', 'indunk', 'scorpio']
