In [86]:
import pandas as pd
import numpy as np
import re

In [87]:
df = pd.read_csv('./data/imdb_master.csv',encoding="ISO-8859-1")

In [88]:
df.head()

Unnamed: 0.1,Unnamed: 0,type,review,label,file
0,0,test,Once again Mr. Costner has dragged out a movie...,neg,0_2.txt
1,1,test,This is an example of why the majority of acti...,neg,10000_4.txt
2,2,test,"First of all I hate those moronic rappers, who...",neg,10001_1.txt
3,3,test,Not even the Beatles could write songs everyon...,neg,10002_3.txt
4,4,test,Brass pictures (movies is not a fitting word f...,neg,10003_3.txt


In [89]:
#remove columns that are not required
df.drop(['Unnamed: 0','file'], axis=1, inplace=True)
df.head()

Unnamed: 0,type,review,label
0,test,Once again Mr. Costner has dragged out a movie...,neg
1,test,This is an example of why the majority of acti...,neg
2,test,"First of all I hate those moronic rappers, who...",neg
3,test,Not even the Beatles could write songs everyon...,neg
4,test,Brass pictures (movies is not a fitting word f...,neg


In [90]:
#split dataframe into train and test sets

train = df[df['type']== 'train']
print('No. of observations in train set: ' + str(len(train)))

test = df[df['type']== 'test']
print('No. of observations in test set: ' + str(len(test)))


No. of observations in train set: 75000
No. of observations in test set: 25000


In [91]:
# remove unlabelled samples

train = train[train['label'] != 'unsup']
test = test[test['label'] != 'unsup']

print('No. of observations in train set after removing unlabelled samples: ' + str(len(train)))
print('No. of observations in test set after removing unlabelled samples: ' + str(len(test)))

No. of observations in train set after removing unlabelled samples: 25000
No. of observations in test set after removing unlabelled samples: 25000


### Text preprocessing

In [92]:
train.head()

Unnamed: 0,type,review,label
25000,train,Story of a man who has unnatural feelings for ...,neg
25001,train,Airport '77 starts as a brand new luxury 747 p...,neg
25002,train,This film lacked something I couldn't put my f...,neg
25003,train,"Sorry everyone,,, I know this is supposed to b...",neg
25004,train,When I was little my parents took me along to ...,neg


In [93]:
train.tail()

Unnamed: 0,type,review,label
49995,train,"Seeing as the vote average was pretty low, and...",pos
49996,train,"The plot had some wretched, unbelievable twist...",pos
49997,train,I am amazed at how this movie(and most others ...,pos
49998,train,A Christmas Together actually came before my t...,pos
49999,train,Working-class romantic drama from director Mar...,pos


In [94]:
train['label'].value_counts()

pos    12500
neg    12500
Name: label, dtype: int64

In [95]:
train['label'] = train['label'].map({'pos': 1, 'neg': 0})
len(train[train['label'] == 0])

12500

In [96]:
test['label'] = test['label'].map({'pos': 1, 'neg': 0})

In [97]:
len(train[train['label'] == 1])

12500

In [98]:
train['label'].value_counts()

1    12500
0    12500
Name: label, dtype: int64

In [99]:
#lower casing
def lower_case(text):
    return text.lower()


#remove html_tags
def remove_htmltags(text):
    html_pattern = re.compile('<.*?>')
    return html_pattern.sub(r'', text)


#remove numbers
def remove_numbers(text):
    output = re.sub(r'\d+', '', text)
    return output

# remove punctuation

import string
def remove_punctuation(text):
    text_p = "".join([char for char in text if char not in string.punctuation])
    return text_p

#tokenize text

import nltk
nltk.download('punkt')
from nltk import word_tokenize

def tokenize(text):
    words = word_tokenize(text)
    return words

#remove stopwords
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = stopwords.words('english')

def remove_stopwords(text):
    filtered_words = [word for word in text if word not in stop_words]
    return filtered_words

#stemming
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()

def stem_words(text):
    stemmed = [stemmer.stem(word) for word in text]
    return stemmed


[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [100]:
class Preprocessor():
    def __init__(self):
        pass
    
    def preprocess(self,df_column, steps):
        self.df_column = df_column
        self.steps = steps
    
        if 'lower_case' in self.steps:
            self.df_column = self.df_column.apply(lambda x: lower_case(x))
        
        if 'remove_htmltags' in self.steps:
            self.df_column = self.df_column.apply(lambda x: remove_htmltags(x))
        
        if 'remove_numbers' in self.steps:
            self.df_column = self.df_column.apply(lambda x: remove_numbers(x))
        
        if 'remove_punctuation' in self.steps:
            self.df_column = self.df_column.apply(lambda x: remove_punctuation(x))           
        
        if 'tokenize' in self.steps:
            self.df_column = self.df_column.apply(lambda x: tokenize(x))
        
        if 'stopwords' in self.steps:
            self.df_column = self.df_column.apply(lambda x: remove_stopwords(x))
        
        if 'stemming' in self.steps:
            self.df_column = self.df_column.apply(lambda x: stem_words(x))
            
        return self.df_column    
        
          

In [101]:
steps = ['lower_case','remove_html_tags','remove_numbers','remove_punctuation',
        'tokenize','stopwords','stemming']
processor = Preprocessor()
train_processed = processor.preprocess(train['review'],steps)
test_processed = processor.preprocess(test['review'],steps)

In [103]:
from sklearn.feature_extraction.text import CountVectorizer

def dummy(text):
    return text


train_reviews = list(train_processed)
test_reviews = list(test_processed)

cv = CountVectorizer(tokenizer=dummy,preprocessor=dummy, lowercase=False)
X_train = cv.fit_transform(train_reviews)
X_test = cv.transform(test_reviews)

In [106]:
y_train = train['label']
y_test = test['label']

from sklearn.naive_bayes import MultinomialNB
MNB = MultinomialNB()
MNB.fit(X_train,y_train)
ypred_train = MNB.predict(X_train)
ypred_test = MNB.predict(X_test)

In [107]:
from sklearn.metrics import f1_score
train_score = f1_score(ypred_train,y_train)
test_score = f1_score(ypred_test,y_test)
print('Sklearn Naive Bayes, train set F1-score: {:.1f} %'.format(train_score*100))
print('Sklearn Naive Bayes, test set F1-score: {:.1f} %'.format(test_score*100))

Sklearn Naive Bayes, train set F1-score: 91.0 %
Sklearn Naive Bayes, test set F1-score: 80.7 %


In [111]:
class Naive_bayes():
    def __init__(self):
        pass
    
    def fit(self,X_train, y_train):
        
        self.X_train = X_train
        self.y_train = y_train
        
        #prior probabilities
        positive_class_prob = len(y_train[y_train==1])/len(y_train)
        negative_class_prob = len(y_train[y_train==0])/len(y_train)
        
        #vocabulary size
        V = X_train.shape[1]
        
        #posterior probabilities for each token in vocabulary
        negative_matrix = X_train[0:12500,:]
        positive_matrix = X_train[12500::,:]

        positive_count = positive_matrix.sum(axis=0)
        negative_count = negative_matrix.sum(axis=0)

        positive_totalcount = positive_matrix.sum()
        negative_totalcount = negative_matrix.sum()
        
        log_positive_probs = np.log10((positive_count+1)/(positive_totalcount +V))
        log_negative_probs = np.log10((negative_count+1)/(negative_totalcount +V))
        
        self.positive_class_prob = positive_class_prob
        self.negative_class_prob = negative_class_prob
        self.log_positive_probs = log_positive_probs
        self.log_negative_probs = log_negative_probs
    
    
    def predict(self,X):
        self.X = X

        X_positive = X @ self.log_positive_probs.T + np.log10(self.positive_class_prob)
        X_negative = X @ self.log_negative_probs.T + np.log10(self.negative_class_prob)
        
        bool_array = np.asarray(X_positive > X_negative).flatten()
        ypred = bool_array.astype(int)
        
        return ypred
           

In [112]:
def f1_score(y_true,y_pred):
    positives_index = list(y_true[y_true == 1].index)
    negatives_index = list(y_true[y_true == 0].index)
    
    tp = y_pred[positives_index].sum()
    tn = len(y_true[y_true == 0]) - y_pred[negatives_index].sum()
    fp = y_pred[negatives_index].sum()
    fn = len(y_true[y_true == 1]) - y_pred[positives_index].sum()
    
    f1_score = tp/(tp + 0.5*(fp + fn))
    
    return f1_score

In [113]:
clf = Naive_bayes()
clf.fit(X_train,y_train)
ytrain_pred = clf.predict(X_train)
ytest_pred = clf.predict(X_test)

In [114]:
#reset index for y_train
y_train_reset = y_train.reset_index(drop=True)

In [115]:
train_f1 = f1_score(y_train_reset,ytrain_pred)
test_f1 = f1_score(y_test,ytest_pred)
print('Naive Bayes implementation from scratch, train set F1-score: {:.1f} % '.format(train_f1*100))
print('Naive Bayes implementation from scratch, test set F1-score: {:.1f} %'.format(test_f1*100))

Naive Bayes implementation from scratch, train set F1-score: 91.0 % 
Naive Bayes implementation from scratch, test set F1-score: 80.7 %
