## Mini prject 5: Enhancing Text Classification in Information Retrieval: A Comprehensive Approach with TF-IDF, Naive Bayes, Word Embeddings, LSA, and SVM


In this section we import needed libraries.

In [1]:
import pandas as pd
import numpy as np 
import math 
from numpy import linalg as LA

from nltk.tokenize import regexp_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string

import matplotlib.pyplot as plt 
import gensim
from sklearn.svm import SVC
from gensim.models import Word2Vec
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize

In [2]:
# Read data from dataset and store
train_pos_df = pd.read_csv("data/train_pos.csv")
train_neg_df = pd.read_csv("data/train_neg.csv")
test_pos_df  = pd.read_csv("data/test_pos.csv")
test_neg_df  = pd.read_csv("data/test_neg.csv")

In [48]:
# delete index column in data frames
train_pos_df.drop(train_pos_df.columns[0], axis=1, inplace=True)
train_neg_df.drop(train_neg_df.columns[0], axis=1, inplace=True)
test_pos_df.drop(test_pos_df.columns[0], axis=1, inplace=True)
test_neg_df.drop(test_neg_df.columns[0], axis=1, inplace=True)

Now, let's see what are the data frames look like!

In [4]:
train_pos_df.head()

Unnamed: 0,ID,text,rating
0,0,Bromwell High is a cartoon comedy. It ran at t...,9
1,10000,Homelessness (or Houselessness as George Carli...,8
2,10001,Brilliant over-acting by Lesley Ann Warren. Be...,10
3,10002,This is easily the most underrated film inn th...,7
4,10003,This is not the typical Mel Brooks film. It wa...,8


In [5]:
train_neg_df.head()

Unnamed: 0,ID,text,rating
0,0,Story of a man who has unnatural feelings for ...,3
1,10000,Airport '77 starts as a brand new luxury 747 p...,4
2,10001,This film lacked something I couldn't put my f...,4
3,10002,"Sorry everyone,,, I know this is supposed to b...",1
4,10003,When I was little my parents took me along to ...,1


In [6]:
test_pos_df.head()

Unnamed: 0,ID,text,rating
0,0,I went and saw this movie last night after bei...,10
1,10000,Actor turned director Bill Paxton follows up h...,7
2,10001,As a recreational golfer with some knowledge o...,9
3,10002,"I saw this film in a sneak preview, and it is ...",8
4,10003,Bill Paxton has taken the true story of the 19...,8


In [7]:
test_neg_df.head()

Unnamed: 0,ID,text,rating
0,0,Once again Mr. Costner has dragged out a movie...,2
1,10000,This is an example of why the majority of acti...,4
2,10001,"First of all I hate those moronic rappers, who...",1
3,10002,Not even the Beatles could write songs everyon...,3
4,10003,Brass pictures (movies is not a fitting word f...,3


## Preprocessing

Define a function to do the data preprocessing which does the following tasks:
<ul>
    <li>Tokenization</li><br>
    <li>Delete stop words</li><br>
    <li>Lowercase words</li><br>
    <li>Delete punctuations</li><br>
    <li>Stemming</li><br>

</ul>

In [8]:
def preprocessing(document):
    '''
    Define a regular expression pattern for tokenization 
    (matching words, 3-digit numbers, numbers with more than 3 digits and 
    have thousands seperators in which n > 3, and floating-point numbers)
    '''
    if document is not None:
        pattern = r'\d{1,3}(?:,\d{3})*(?:\.\d+)?|\w+'

        # Tokenize the text using the regular expression pattern    
        tokens = regexp_tokenize(document, pattern)

        # Lowercase the tokens
        tokens = [word.lower() for word in tokens if word.isalpha()]


        # Get the set of English stopwords
        stop_words = set(stopwords.words('english'))

        # Remove stopwords
        tokens = [word for word in tokens if word not in stop_words]

        # Remove punctuation
        tokens = [word for word in tokens if word not in string.punctuation]

        # Stemming
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(word) for word in tokens]

        # return the processed document
        return ' '.join(tokens)
    else:
        return ''

##### Document preprocessing
In this section we want to do preprocessing on data and store the processed text in 'preprocessed' column of every data frame.

Train data preproccessing here

In [9]:
# Apply the preprocessing function to the 'text' column
# and make a new column named preprocessed to store the 
# text which is preprocessed for each df
train_pos_df['preprocessed'] = train_pos_df['text'].apply(preprocessing)
train_neg_df['preprocessed'] = train_neg_df['text'].apply(preprocessing)
test_pos_df['preprocessed']  = test_pos_df['text'].apply(preprocessing)
test_neg_df['preprocessed']  = test_neg_df['text'].apply(preprocessing)

Now, look what is the new shape of positive class data frame as an example!

In [10]:
train_pos_df.head()

Unnamed: 0,ID,text,rating,preprocessed
0,0,Bromwell High is a cartoon comedy. It ran at t...,9,bromwel high cartoon comedi ran time program s...
1,10000,Homelessness (or Houselessness as George Carli...,8,homeless houseless georg carlin state issu yea...
2,10001,Brilliant over-acting by Lesley Ann Warren. Be...,10,brilliant act lesley ann warren best dramat ho...
3,10002,This is easily the most underrated film inn th...,7,easili underr film inn brook cannon sure flaw ...
4,10003,This is not the typical Mel Brooks film. It wa...,8,typic mel brook film much less slapstick movi ...


### Compute TF
compute term frequency for each document in this part using count_term_frequency function and look what is the shape of new data frames!

In [11]:
# Insert a new column to the df to store tf for eacg document
def count_term_frequency(document):
    tokens = document.split()
    tf = {}

    for term in tokens:
        if term not in tf:
            tf[term] = 1
        else:
            tf[term] += 1
    
    return tf

In [12]:
train_pos_df['tf'] = train_pos_df['preprocessed'].apply(count_term_frequency)
train_neg_df['tf'] = train_neg_df['preprocessed'].apply(count_term_frequency)
test_pos_df['tf']  = test_pos_df['preprocessed'].apply(count_term_frequency)
test_neg_df['tf']  = test_neg_df['preprocessed'].apply(count_term_frequency)

In [13]:
train_pos_df.head()

Unnamed: 0,ID,text,rating,preprocessed,tf
0,0,Bromwell High is a cartoon comedy. It ran at t...,9,bromwel high cartoon comedi ran time program s...,"{'bromwel': 4, 'high': 5, 'cartoon': 1, 'comed..."
1,10000,Homelessness (or Houselessness as George Carli...,8,homeless houseless georg carlin state issu yea...,"{'homeless': 4, 'houseless': 1, 'georg': 1, 'c..."
2,10001,Brilliant over-acting by Lesley Ann Warren. Be...,10,brilliant act lesley ann warren best dramat ho...,"{'brilliant': 1, 'act': 1, 'lesley': 1, 'ann':..."
3,10002,This is easily the most underrated film inn th...,7,easili underr film inn brook cannon sure flaw ...,"{'easili': 1, 'underr': 1, 'film': 2, 'inn': 1..."
4,10003,This is not the typical Mel Brooks film. It wa...,8,typic mel brook film much less slapstick movi ...,"{'typic': 1, 'mel': 1, 'brook': 2, 'film': 1, ..."


### Compute CF
Now, this is the time to compute collection fraquency.

In [14]:
class_pos_dict = {}
class_neg_dict = {}

# define collection size as T
T = 0
class_pos_size, class_neg_size = 0, 0
trian_distinct_terms = 0

# fill the dictionary of the pos class
for document in train_pos_df['preprocessed']:
    for term in document.split():
        if term not in class_pos_dict:
            class_pos_dict[term] = 1
        else:
            class_pos_dict[term] += 1
    T += len(document.split())

# compute pos class size
class_pos_size = T

# fill the dictionary of the neg class
for document in train_neg_df['preprocessed']:
    for term in document.split():
        if term not in class_neg_dict:
            class_neg_dict[term] = 1
        else:
            class_neg_dict[term] += 1
    T += len(document.split())

# compute neg class size
class_neg_size = T - class_pos_size
trian_distinct_terms = len(class_neg_dict) + len(class_pos_dict)

print(f"Train set size:          {T}")
print(f"Class positive size:     {class_pos_size}\nClass negative size:     {class_neg_size}")
print(f"Distinct collection terms: {trian_distinct_terms}")
print(class_pos_dict)
print(class_neg_dict)

Train set size:          3088439
Class positive size:     1573809
Class negative size:     1514630
Distinct collection terms: 72536
{'bromwel': 8, 'high': 1105, 'cartoon': 357, 'comedi': 2010, 'ran': 122, 'time': 8499, 'program': 179, 'school': 818, 'life': 4211, 'teacher': 180, 'year': 4055, 'teach': 158, 'profess': 53, 'lead': 1396, 'believ': 1888, 'satir': 212, 'much': 4687, 'closer': 126, 'realiti': 653, 'scrambl': 6, 'surviv': 343, 'financi': 83, 'insight': 186, 'student': 335, 'see': 7493, 'right': 1837, 'pathet': 65, 'pomp': 5, 'petti': 51, 'whole': 1312, 'situat': 662, 'remind': 530, 'knew': 462, 'saw': 1699, 'episod': 1698, 'repeatedli': 52, 'tri': 2510, 'burn': 305, 'immedi': 298, 'recal': 165, 'classic': 1391, 'line': 1503, 'inspector': 84, 'sack': 20, 'one': 14173, 'welcom': 163, 'expect': 1421, 'mani': 3776, 'adult': 538, 'age': 1077, 'think': 4295, 'far': 1359, 'fetch': 63, 'piti': 133, 'homeless': 81, 'houseless': 1, 'georg': 555, 'carlin': 9, 'state': 560, 'issu': 406, 

The output above shows that for instance, the word 'high' is used in the class positive 1105 times.

## Naive Bayes
In this part i want to use Bayes classifier to do the classification task on test data.

#### Classification rule:

\begin{equation*}
c_{map} = argmax_{c\in C} \left[log\hat{P}(c) + \sum_{1 \le k \le n_d} log\hat{P}(t_k|c) \right]
\end{equation*}

As there are 2 classes and both of them have size 12500, it is not requered to compute $\hat{P}(c)=\frac{N_c}{N}$. So, we have the below rule:
\begin{equation*}
c_{map} = argmax_{c\in C} \left[\sum_{1 \le k \le n_d} log\hat{P}(t_k|c) \right]
\end{equation*}

\begin{equation*}
\hat{P}(t_k|c) = \frac{T_{ct}+1}{\sum_{t'\in V} (T_{ct'}+1)} = \frac{T_{ct}+1}{(\sum_{t'\in V} T_{ct'})+B}
\end{equation*}

### Class POSITIVE

We have to compute the last condotional probability for all of the terms in class positive.

In [None]:
P_pos = {}

for term in class_pos_dict:
    P_pos[term] = (class_pos_dict[term] + 1)/(class_pos_size + trian_distinct_terms)



### Class NEGATIVE

In [16]:
P_neg = {}

for term in class_neg_dict:
    P_neg[term] = (class_neg_dict[term] + 1)/(class_neg_size + trian_distinct_terms)

## Evalute test set : using multiplication

#### Fisrt part: Evaluating positive labels of test set

In [50]:
classification_result = []

for document in test_pos_df['preprocessed']:
    # As we used multiplication in this section, we set the initial 
    # amounts as 1
    P_c1 = 1
    P_c2 = 1
    '''
    for reach term  in the documnet, if it is in class positive(negative), 
    we compute P_pos(P_neg) for that term and add it to the dict P_pos(P_neg)
    '''
    for term in document.split():
        if term not in P_pos:
            P_pos[term] = 1 / (class_pos_size + trian_distinct_terms)
        P_c1 *= P_pos[term] # multiplication

        if term not in P_neg:
            P_neg[term] = 1 / (class_neg_size + trian_distinct_terms)
        P_c2 *= P_neg[term]

    '''
    if the probability of belonging to the class positive is more than class negative,
    then append 1(for class positive) to the results array. else, append 
    2(for class negative)
    '''
    if P_c1 >= P_c2:
        classification_result.append(1)
    else: 
        classification_result.append(2)
    

print("Result of classification on positive test documents: ")
print(f"Number of documents in test set whcih are predicted to be in class pos: {classification_result.count(1)}")
print(f"Number of documents in test set whcih are predicted to be in class neg: {classification_result.count(2)}")
pos_acc = classification_result.count(1)/len(test_pos_df)
print(f"Accuracy: {pos_acc}")


Result of classification on positive test documents: 
Number of documents in test set whcih are predicted to be in class pos: 10732
Number of documents in test set whcih are predicted to be in class neg: 1768
Accuracy: 0.85856


#### Fisrt part: Evaluating negative labels of test set

In [51]:
for document in test_neg_df['preprocessed']:
    P_c1 = 1
    P_c2 = 1
    '''
    for reach term  in the documnet, if it is in class positive(negative), 
    we compute P_pos(P_neg) for that term and add it to the dict P_pos(P_neg)
    '''
    for term in document.split():
        if term not in P_pos:
            P_pos[term] = 1 / (class_pos_size + trian_distinct_terms)
        P_c1 *= P_pos[term]

        if term not in P_neg:
            P_neg[term] = 1 / (class_neg_size + trian_distinct_terms)
        P_c2 *= P_neg[term]
    '''
    if the probability of belonging to the class positive is more than class negative,
    then append 1(for class positive) to the results array. else, append 
    2(for class negative)
    '''
    if P_c1 >= P_c2:
        classification_result.append(1)
    else: 
        classification_result.append(2)
    


print("Result of classification on negative test documents: ")
print(f"Number of documents in test set whcih are predicted to be in class pos: {classification_result[12500:].count(1)}")
print(f"Number of documents in test set whcih are predicted to be in class neg: {classification_result[12500:].count(2)}")
neg_acc = classification_result[12500:].count(2)/len(test_pos_df)
print(f"Accuracy: {neg_acc}")


Result of classification on negative test documents: 
Number of documents in test set whcih are predicted to be in class pos: 6396
Number of documents in test set whcih are predicted to be in class neg: 6104
Accuracy: 0.48832


## Using log

In [52]:
'''
We use the above code here too. the only difference in that we use log to 
convert multiplication to sum and improve tha results.
'''
classification_result = []

for document in test_pos_df['preprocessed']:
    # As we use log in this section, the initial amounts should be set on 0
    P_c1 = 0
    P_c2 = 0
    for term in document.split():
        if term not in P_pos:
            P_pos[term] = 1 / (class_pos_size + trian_distinct_terms)
        P_c1 += math.log2(P_pos[term]) # Log is here

        if term not in P_neg:
            P_neg[term] = 1 / (class_neg_size + trian_distinct_terms)
        P_c2 += math.log2(P_neg[term])

    if P_c1 >= P_c2:
        classification_result.append(1)
    else: 
        classification_result.append(2)
    

TP1 = classification_result.count(1)
print("Result of classification on positive test documents: ")
print(f"Number of documents in test set whcih are predicted to be in class pos: {classification_result.count(1)}")
print(f"Number of documents in test set whcih are predicted to be in class neg: {classification_result.count(2)}")
pos_acc = classification_result.count(1)/len(test_pos_df)
print(f"Accuracy: {pos_acc}")

Result of classification on positive test documents: 
Number of documents in test set whcih are predicted to be in class pos: 9525
Number of documents in test set whcih are predicted to be in class neg: 2975
Accuracy: 0.762


In [53]:
for document in test_neg_df['preprocessed']:
    P_c1 = 1
    P_c2 = 1
    for term in document.split():
        if term not in P_pos:
            P_pos[term] = 1 / (class_pos_size + trian_distinct_terms)
        P_c1 += math.log2(P_pos[term])

        if term not in P_neg:
            P_neg[term] = 1 / (class_neg_size + trian_distinct_terms)
        P_c2 += math.log2(P_neg[term])

    if P_c1 >= P_c2:
        classification_result.append(1)
    else: 
        classification_result.append(2)
    

TP2 = classification_result[12500:].count(2)
print("Result of classification on negative test documents: ")
print(f"Number of documents in test set whcih are predicted to be in class pos: {classification_result[12500:].count(1)}")
print(f"Number of documents in test set whcih are predicted to be in class neg: {classification_result[12500:].count(2)}")
neg_acc = classification_result[12500:].count(2)/len(test_pos_df)
print(f"Accuracy: {neg_acc}")


Result of classification on negative test documents: 
Number of documents in test set whcih are predicted to be in class pos: 1574
Number of documents in test set whcih are predicted to be in class neg: 10926
Accuracy: 0.87408


Now, compute the accuracy of the classification:

In [54]:
print(f"Naive Bayes classifier accuracy: {(TP1+TP2)/(len(test_neg_df) + len(test_pos_df))}")

Naive Bayes classifier accuracy: 0.81804


# Word embedding and LSA with SVM classifier

In [22]:
# Consider the whole train data as a single data frame
df = pd.concat([train_pos_df, train_neg_df], ignore_index=True)
df.index += 1
df

Unnamed: 0,ID,text,rating,preprocessed,tf
1,0,Bromwell High is a cartoon comedy. It ran at t...,9,bromwel high cartoon comedi ran time program s...,"{'bromwel': 4, 'high': 5, 'cartoon': 1, 'comed..."
2,10000,Homelessness (or Houselessness as George Carli...,8,homeless houseless georg carlin state issu yea...,"{'homeless': 4, 'houseless': 1, 'georg': 1, 'c..."
3,10001,Brilliant over-acting by Lesley Ann Warren. Be...,10,brilliant act lesley ann warren best dramat ho...,"{'brilliant': 1, 'act': 1, 'lesley': 1, 'ann':..."
4,10002,This is easily the most underrated film inn th...,7,easili underr film inn brook cannon sure flaw ...,"{'easili': 1, 'underr': 1, 'film': 2, 'inn': 1..."
5,10003,This is not the typical Mel Brooks film. It wa...,8,typic mel brook film much less slapstick movi ...,"{'typic': 1, 'mel': 1, 'brook': 2, 'film': 1, ..."
...,...,...,...,...,...
24996,9998,"Towards the end of the movie, I felt it was to...",4,toward end movi felt technic felt like classro...,"{'toward': 1, 'end': 1, 'movi': 7, 'felt': 2, ..."
24997,9999,This is the kind of movie that my enemies cont...,3,kind movi enemi content watch time bloodi true...,"{'kind': 3, 'movi': 3, 'enemi': 1, 'content': ..."
24998,999,I saw 'Descent' last night at the Stockholm Fi...,3,saw descent last night stockholm film festiv o...,"{'saw': 1, 'descent': 1, 'last': 1, 'night': 1..."
24999,99,Some films that you pick up for a pound turn o...,1,film pick pound turn rather good rd centuri fi...,"{'film': 7, 'pick': 1, 'pound': 1, 'turn': 1, ..."


In [23]:
# Consider the whole test data as a single data frame
test_set = pd.concat([test_pos_df, test_neg_df], ignore_index=True)
test_set.index += 1
test_set

Unnamed: 0,ID,text,rating,preprocessed,tf
1,0,I went and saw this movie last night after bei...,10,went saw movi last night coax friend mine admi...,"{'went': 1, 'saw': 2, 'movi': 4, 'last': 1, 'n..."
2,10000,Actor turned director Bill Paxton follows up h...,7,actor turn director bill paxton follow promis ...,"{'actor': 1, 'turn': 1, 'director': 1, 'bill':..."
3,10001,As a recreational golfer with some knowledge o...,9,recreat golfer knowledg sport histori pleas di...,"{'recreat': 1, 'golfer': 1, 'knowledg': 1, 'sp..."
4,10002,"I saw this film in a sneak preview, and it is ...",8,saw film sneak preview delight cinematographi ...,"{'saw': 1, 'film': 3, 'sneak': 1, 'preview': 1..."
5,10003,Bill Paxton has taken the true story of the 19...,8,bill paxton taken true stori us golf open made...,"{'bill': 1, 'paxton': 1, 'taken': 1, 'true': 1..."
...,...,...,...,...,...
24996,9998,I occasionally let my kids watch this garbage ...,1,occasion let kid watch garbag understand pathe...,"{'occasion': 1, 'let': 1, 'kid': 1, 'watch': 1..."
24997,9999,When all we have anymore is pretty much realit...,1,anymor pretti much realiti tv show peopl make ...,"{'anymor': 1, 'pretti': 1, 'much': 1, 'realiti..."
24998,999,The basic genre is a thriller intercut with an...,3,basic genr thriller intercut uncomfort menag t...,"{'basic': 1, 'genr': 1, 'thriller': 2, 'interc..."
24999,99,Four things intrigued me as to this film - fir...,3,four thing intrigu film firstli star carli pop...,"{'four': 1, 'thing': 1, 'intrigu': 1, 'film': ..."


Construct the labels for train and test datasets

In [24]:
y_train = np.concatenate([np.ones(12500), np.zeros(12500)])
y_test = np.copy(y_train)

#### Preprocess train and test set + compute TF:

In [27]:
# Apply preprocessing on train and test dataframes and compute term frequency
df['preprocessed'] = df['text'].apply(preprocessing)
df['tf'] = df['preprocessed'].apply(count_term_frequency)

test_set['preprocessed'] = test_set['text'].apply(preprocessing)
test_set['tf'] = test_set['preprocessed'].apply(count_term_frequency)

Make new pandas series for test and train datasets


In [28]:
train_text = df.text.apply(preprocessing)
train_text = train_text.apply(gensim.utils.simple_preprocess)

In [29]:
test_text = test_set.text.apply(preprocessing)
test_text = test_text.apply(gensim.utils.simple_preprocess)

This is the form of pandas series:

In [30]:
print(train_text.loc[1])

['bromwel', 'high', 'cartoon', 'comedi', 'ran', 'time', 'program', 'school', 'life', 'teacher', 'year', 'teach', 'profess', 'lead', 'believ', 'bromwel', 'high', 'satir', 'much', 'closer', 'realiti', 'teacher', 'scrambl', 'surviv', 'financi', 'insight', 'student', 'see', 'right', 'pathet', 'teacher', 'pomp', 'petti', 'whole', 'situat', 'remind', 'school', 'knew', 'student', 'saw', 'episod', 'student', 'repeatedli', 'tri', 'burn', 'school', 'immedi', 'recal', 'high', 'classic', 'line', 'inspector', 'sack', 'one', 'teacher', 'student', 'welcom', 'bromwel', 'high', 'expect', 'mani', 'adult', 'age', 'think', 'bromwel', 'high', 'far', 'fetch', 'piti']


In [31]:
print(df.text.loc[1])

Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High's satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I'm here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn't!


In [32]:
df.head()

Unnamed: 0,ID,text,rating,preprocessed,tf
1,0,Bromwell High is a cartoon comedy. It ran at t...,9,bromwel high cartoon comedi ran time program s...,"{'bromwel': 4, 'high': 5, 'cartoon': 1, 'comed..."
2,10000,Homelessness (or Houselessness as George Carli...,8,homeless houseless georg carlin state issu yea...,"{'homeless': 4, 'houseless': 1, 'georg': 1, 'c..."
3,10001,Brilliant over-acting by Lesley Ann Warren. Be...,10,brilliant act lesley ann warren best dramat ho...,"{'brilliant': 1, 'act': 1, 'lesley': 1, 'ann':..."
4,10002,This is easily the most underrated film inn th...,7,easili underr film inn brook cannon sure flaw ...,"{'easili': 1, 'underr': 1, 'film': 2, 'inn': 1..."
5,10003,This is not the typical Mel Brooks film. It wa...,8,typic mel brook film much less slapstick movi ...,"{'typic': 1, 'mel': 1, 'brook': 2, 'film': 1, ..."


In [33]:
test_set.head()

Unnamed: 0,ID,text,rating,preprocessed,tf
1,0,I went and saw this movie last night after bei...,10,went saw movi last night coax friend mine admi...,"{'went': 1, 'saw': 2, 'movi': 4, 'last': 1, 'n..."
2,10000,Actor turned director Bill Paxton follows up h...,7,actor turn director bill paxton follow promis ...,"{'actor': 1, 'turn': 1, 'director': 1, 'bill':..."
3,10001,As a recreational golfer with some knowledge o...,9,recreat golfer knowledg sport histori pleas di...,"{'recreat': 1, 'golfer': 1, 'knowledg': 1, 'sp..."
4,10002,"I saw this film in a sneak preview, and it is ...",8,saw film sneak preview delight cinematographi ...,"{'saw': 1, 'film': 3, 'sneak': 1, 'preview': 1..."
5,10003,Bill Paxton has taken the true story of the 19...,8,bill paxton taken true stori us golf open made...,"{'bill': 1, 'paxton': 1, 'taken': 1, 'true': 1..."


# W2V
in this section, we used word2vec as the word embedding and we will do the classification using this.

In [34]:
# consider each vector size for each word in train set as 100.
# window: 5, which means that the model will consider 5 words befor and afterm the target word
# min_count: 1, even consider words which are used only once
train_model = Word2Vec(sentences=train_text, vector_size=100, window=5, min_count=1, workers=4)

In [35]:
train_model.wv.index_to_key

['br',
 'movi',
 'film',
 'one',
 'like',
 'time',
 'good',
 'make',
 'charact',
 'get',
 'see',
 'watch',
 'stori',
 'even',
 'would',
 'realli',
 'well',
 'scene',
 'look',
 'show',
 'much',
 'end',
 'peopl',
 'bad',
 'go',
 'great',
 'also',
 'first',
 'love',
 'think',
 'way',
 'act',
 'play',
 'made',
 'thing',
 'could',
 'know',
 'say',
 'seem',
 'work',
 'plot',
 'two',
 'year',
 'actor',
 'come',
 'mani',
 'seen',
 'take',
 'life',
 'want',
 'never',
 'littl',
 'best',
 'tri',
 'man',
 'ever',
 'give',
 'better',
 'still',
 'perform',
 'find',
 'feel',
 'part',
 'back',
 'director',
 'use',
 'someth',
 'actual',
 'interest',
 'lot',
 'real',
 'old',
 'cast',
 'though',
 'live',
 'star',
 'enjoy',
 'guy',
 'anoth',
 'new',
 'role',
 'noth',
 'funni',
 'music',
 'point',
 'start',
 'set',
 'girl',
 'origin',
 'day',
 'world',
 'everi',
 'believ',
 'turn',
 'quit',
 'direct',
 'us',
 'thought',
 'fact',
 'minut',
 'horror',
 'kill',
 'action',
 'comedi',
 'pretti',
 'young',
 'won

Now, we have to make our train data such that construct a vector for each documnet as the mean of all its words vectoes.

In [37]:
X_train = []
for i in range(len(train_text)):
    doc_model = np.zeros(100)
    for word in train_text[i+1]:
        doc_model += train_model.wv[word]
    X_train.append(doc_model/len(train_text[i+1]))

print(len(X_train))

25000


We have to convert all test documnets to vectors too as above:

In [38]:
X_test = []
for i in range(len(test_text)):
    doc_model = np.zeros(100)
    for word in test_text[i+1]:
        if word in train_model.wv.key_to_index:
            doc_model += train_model.wv[word]
        else:
            doc_model += np.zeros(100)
    X_test.append(doc_model/len(train_text[i+1]))

print(len(X_test))

25000


# SVM
This is the time of classification using SVM!

we will do the classification using different parameters to find which combination is the best for this data set.

In [39]:
svm_classifier = SVC()
svm_classifier.fit(X_train, y_train)

y_pred1 = svm_classifier.predict(X_test)
accuracy1 = svm_classifier.score(X_test, y_test)
print(classification_report(y_test, y_pred1))
print(f"accuracy using default kernel: {accuracy1}")

              precision    recall  f1-score   support

         0.0       0.61      0.66      0.63     12500
         1.0       0.63      0.58      0.61     12500

    accuracy                           0.62     25000
   macro avg       0.62      0.62      0.62     25000
weighted avg       0.62      0.62      0.62     25000

accuracy using default kernel: 0.6212


In [58]:
svm_classifier_poly_ = SVC(kernel='linear', C=1)
svm_classifier_poly_.fit(X_train, y_train)
y_pred_ = svm_classifier_poly_.predict(X_test)
accuracy_ = svm_classifier_poly_.score(X_test, y_test)
print(classification_report(y_test, y_pred_))
print(f"accuracy using poly kernel: {accuracy_}")

              precision    recall  f1-score   support

         0.0       0.59      0.63      0.61     12500
         1.0       0.60      0.56      0.58     12500

    accuracy                           0.60     25000
   macro avg       0.60      0.60      0.60     25000
weighted avg       0.60      0.60      0.60     25000

accuracy using poly kernel: 0.59752


as it is obvious from the above result, the linear kernel does dot work good for us. Let's consider another kernels.

In [29]:
svm_classifier_poly1 = SVC(kernel='poly', C=0.1)
svm_classifier_poly1.fit(X_train, y_train)
y_pred2 = svm_classifier_poly1.predict(X_test)
accuracy2 = svm_classifier_poly1.score(X_test, y_test)
print(classification_report(y_test, y_pred2))
print(f"accuracy using poly kernel: {accuracy2}")

              precision    recall  f1-score   support

         0.0       0.71      0.82      0.76     12500
         1.0       0.78      0.66      0.72     12500

    accuracy                           0.74     25000
   macro avg       0.74      0.74      0.74     25000
weighted avg       0.74      0.74      0.74     25000

accuracy using poly kernel: 0.73848


In [30]:
svm_classifier_poly2 = SVC(kernel='poly', C=0.08)
svm_classifier_poly2.fit(X_train, y_train)
y_pred3 = svm_classifier_poly2.predict(X_test)
accuracy3 = svm_classifier_poly2.score(X_test, y_test)
print(classification_report(y_test, y_pred3))
print(f"accuracy using poly kernel: {accuracy3}")

              precision    recall  f1-score   support

         0.0       0.76      0.80      0.78     12500
         1.0       0.79      0.75      0.77     12500

    accuracy                           0.77     25000
   macro avg       0.77      0.77      0.77     25000
weighted avg       0.77      0.77      0.77     25000

accuracy using poly kernel: 0.77316


In [55]:
svm_classifier_poly2 = SVC(kernel='poly', C=0.08, gamma='auto')
svm_classifier_poly2.fit(X_train, y_train)
y_pred3 = svm_classifier_poly2.predict(X_test)
accuracy3 = svm_classifier_poly2.score(X_test, y_test)
print(classification_report(y_test, y_pred3))
print(f"accuracy using poly kernel: {accuracy3}")

              precision    recall  f1-score   support

         0.0       0.55      0.29      0.38     12500
         1.0       0.52      0.77      0.62     12500

    accuracy                           0.53     25000
   macro avg       0.53      0.53      0.50     25000
weighted avg       0.53      0.53      0.50     25000

accuracy using poly kernel: 0.52664


In [32]:
svm_classifier_poly3 = SVC(kernel='poly', C=0.09)
svm_classifier_poly3.fit(X_train, y_train)
y_pred4 = svm_classifier_poly3.predict(X_test)
accuracy4= svm_classifier_poly3.score(X_test, y_test)
print(classification_report(y_test, y_pred4))
print(f"accuracy using poly kernel: {accuracy4}")

              precision    recall  f1-score   support

         0.0       0.72      0.81      0.76     12500
         1.0       0.79      0.68      0.73     12500

    accuracy                           0.75     25000
   macro avg       0.75      0.75      0.75     25000
weighted avg       0.75      0.75      0.75     25000

accuracy using poly kernel: 0.7494


In [57]:
svm_classifier_poly4 = SVC(kernel='sigmoid', C=0.1)
svm_classifier_poly4.fit(X_train, y_train)
y_pred4 = svm_classifier_poly4.predict(X_test)
accuracy4 = svm_classifier_poly4.score(X_test, y_test)
print(classification_report(y_test, y_pred4))
print(f"accuracy using poly kernel: {accuracy4}")

              precision    recall  f1-score   support

         0.0       0.53      0.76      0.62     12500
         1.0       0.57      0.32      0.41     12500

    accuracy                           0.54     25000
   macro avg       0.55      0.54      0.52     25000
weighted avg       0.55      0.54      0.52     25000

accuracy using poly kernel: 0.53844


As it is shown above, when we set C=0.08 and the kernel is set on 'poly', we get the best accuracy and f1-score. 
but it is still lower that what we got by Bayes classifier which was near 82.

# LSA

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.svm import SVC
from sklearn.metrics import classification_report

In [46]:
# To do the LSA, we have to first construct our tf-idf matrix 
# and then apply svd and dimention reduction.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
X_tfidf = tfidf_vectorizer.fit_transform(df['preprocessed'])

In [59]:
# Apply SVD and dimention reduction
lsa1 = TruncatedSVD(n_components=100)
X_lsa1 = lsa1.fit_transform(X_tfidf)
X_test_lsa1 = lsa1.fit_transform(X_tfidf)

# Train the SVM Classifier
svm_classifier1 = SVC(kernel='linear')

# Initialize and train SVM classifier
svm_classifier1.fit(X_lsa1, y_train)

# Predict labels for test data
y_pred = svm_classifier1.predict(X_test_lsa1)

# Print classification report
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

         0.0       0.85      0.80      0.82     12500
         1.0       0.81      0.85      0.83     12500

    accuracy                           0.83     25000
   macro avg       0.83      0.83      0.83     25000
weighted avg       0.83      0.83      0.83     25000



In [62]:
# Apply SVD and dimention reduction
lsa2 = TruncatedSVD(n_components=300)
X_lsa2 = lsa2.fit_transform(X_tfidf)
X_test_lsa2 = lsa2.fit_transform(X_tfidf)

# Train the SVM Classifier
svm_classifier2 = SVC(kernel='linear')

# Initialize and train SVM classifier
svm_classifier2.fit(X_lsa2, y_train)

# Predict labels for test data
y_pred = svm_classifier2.predict(X_test_lsa2)

# Print classification report
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

         0.0       0.86      0.83      0.84     12500
         1.0       0.83      0.87      0.85     12500

    accuracy                           0.85     25000
   macro avg       0.85      0.85      0.85     25000
weighted avg       0.85      0.85      0.85     25000



2 above cells means that by increasing the number of components, the accuracy increases too!

In [None]:
# Train the SVM classifier with poly kernel
svm_classifier2 = SVC(kernel='poly')

# Initialize and train SVM classifier
svm_classifier2.fit(X_lsa, y_train)

y_pred = svm_classifier.predict(X_test_lsa)

# Print classification report
print(classification_report(y_test, y_pred))


As we tested on the dataset, we got 82%, 77%, and 85% accuracy for Bayes classification, W2V and LSA using SVM classifier. 
Thus, as we predicted, the best result on our data was obtaied using SVM on LSA.

## FastText
we just tested another word embedding for fun((:

In [31]:
from gensim.models import FastText

model2 = FastText(sentences=train_text, vector_size=100, window=5, min_count=1, workers=4, seed=42)

In [32]:
model2_test = FastText(sentences=test_text, vector_size=100, window=5, min_count=1, workers=4, seed=42)

In [33]:
X_train2 = []
for i in range(len(train_text)):
    doc_model = np.zeros(100)
    for word in train_text[i+1]:
        doc_model += model2.wv[word]
    X_train2.append(doc_model/len(train_text[i+1]))

In [34]:
X_test2 = []
for i in range(len(test_text)):
    doc_model = np.zeros(100)
    for word in test_text[i+1]:
        if word in model2.wv.key_to_index:
            doc_model += model2.wv[word]
        else:
            doc_model += np.zeros(100)
    X_test2.append(doc_model/len(train_text[i+1]))

In [35]:
svm_classifier_poly = SVC(kernel='poly', C=0.1)
svm_classifier_poly.fit(X_train2, y_train)

accuracy_5 = svm_classifier_poly.score(X_test2, y_test)
print(f"accuracy using poly kernel: {accuracy_5}")

accuracy using poly kernel: 0.51344


In [37]:
svm_classifier_poly = SVC(kernel='poly', C=0.5, gamma=0.1)
svm_classifier_poly.fit(X_train2, y_train)

accuracy_6 = svm_classifier_poly.score(X_test2, y_test)
print(f"accuracy using poly kernel: {accuracy_6}")

accuracy using poly kernel: 0.50516


In [38]:
svm_classifier_poly = SVC(kernel='poly', C=0.5, gamma=0.01)
svm_classifier_poly.fit(X_train2, y_train)

accuracy_7 = svm_classifier_poly.score(X_test2, y_test)
print(f"accuracy using poly kernel: {accuracy_7}")

In [None]:
svm_classifier_poly = SVC(kernel='poly', C=0.5, gamma=1)
svm_classifier_poly.fit(X_train2, y_train)

accuracy_8 = svm_classifier_poly.score(X_test2, y_test)
print(f"accuracy using poly kernel: {accuracy_8}")

In [None]:
test_set['preprocessed'] = test_set['text'].apply(preprocessing)