In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('./data/imdb_master.csv',encoding="ISO-8859-1")

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,type,review,label,file
0,0,test,Once again Mr. Costner has dragged out a movie...,neg,0_2.txt
1,1,test,This is an example of why the majority of acti...,neg,10000_4.txt
2,2,test,"First of all I hate those moronic rappers, who...",neg,10001_1.txt
3,3,test,Not even the Beatles could write songs everyon...,neg,10002_3.txt
4,4,test,Brass pictures (movies is not a fitting word f...,neg,10003_3.txt


In [4]:
#remove columns that are not required
df.drop(['Unnamed: 0','file'], axis=1, inplace=True)
df.head()

Unnamed: 0,type,review,label
0,test,Once again Mr. Costner has dragged out a movie...,neg
1,test,This is an example of why the majority of acti...,neg
2,test,"First of all I hate those moronic rappers, who...",neg
3,test,Not even the Beatles could write songs everyon...,neg
4,test,Brass pictures (movies is not a fitting word f...,neg


In [5]:
#split dataframe into train and test sets

train = df[df['type']== 'train']
print('No. of observations in train set: ' + str(len(train)))

test = df[df['type']== 'test']
print('No. of observations in test set: ' + str(len(test)))


No. of observations in train set: 75000
No. of observations in test set: 25000


In [6]:
# remove unlabelled samples

train = train[train['label'] != 'unsup']
test = test[test['label'] != 'unsup']

print('No. of observations in train set after removing unlabelled samples: ' + str(len(train)))
print('No. of observations in test set after removing unlabelled samples: ' + str(len(test)))

No. of observations in train set after removing unlabelled samples: 25000
No. of observations in test set after removing unlabelled samples: 25000


### Text preprocessing

In [7]:
train.head()

Unnamed: 0,type,review,label
25000,train,Story of a man who has unnatural feelings for ...,neg
25001,train,Airport '77 starts as a brand new luxury 747 p...,neg
25002,train,This film lacked something I couldn't put my f...,neg
25003,train,"Sorry everyone,,, I know this is supposed to b...",neg
25004,train,When I was little my parents took me along to ...,neg


In [8]:
train.tail()

Unnamed: 0,type,review,label
49995,train,"Seeing as the vote average was pretty low, and...",pos
49996,train,"The plot had some wretched, unbelievable twist...",pos
49997,train,I am amazed at how this movie(and most others ...,pos
49998,train,A Christmas Together actually came before my t...,pos
49999,train,Working-class romantic drama from director Mar...,pos


In [9]:
train['label'].value_counts()

pos    12500
neg    12500
Name: label, dtype: int64

In [10]:
train['label'] = train['label'].map({'pos': 1, 'neg': 0})
len(train[train['label'] == 0])

12500

In [11]:
test['label'] = test['label'].map({'pos': 1, 'neg': 0})

In [12]:
len(train[train['label'] == 1])

12500

In [13]:
train['label'].value_counts()

1    12500
0    12500
Name: label, dtype: int64

In [14]:
#lower casing
def lower_case(text):
    return text.lower()
train['review'] = train['review'].apply(lambda x: lower_case(x))
train.head()

Unnamed: 0,type,review,label
25000,train,story of a man who has unnatural feelings for ...,0
25001,train,airport '77 starts as a brand new luxury 747 p...,0
25002,train,this film lacked something i couldn't put my f...,0
25003,train,"sorry everyone,,, i know this is supposed to b...",0
25004,train,when i was little my parents took me along to ...,0


In [15]:
test['review'] = test['review'].apply(lambda x: lower_case(x))

In [16]:
train.iloc[24993,1]

"i have not read the other comments on the film, but judging from the average rating i can see that they are unlikely to be very complementary.<br /><br />i watched it for the second time with my children. they absolutely loved it. true, it did not have the adults rolling around the floor, but the sound of the children's enjoyment made it seem so.<br /><br />it is a true mel brooks farce, with plenty of moral content - how sad it is to be loved for our money, not for whom we are, and how fickle are our friends and associates. there are many other films on a similar subject matter, no doubt, many of which will have a greater comic or emotional impact on adults. it's hard for me to imagine such an impact on the junior members of the family, however.<br /><br />hence, for the children, a 9/10 from me."

In [17]:
import re

def remove_htmltags(text):
    html_pattern = re.compile('<.*?>')
    return html_pattern.sub(r'', text)

remove_htmltags(train.iloc[24993,1])

"i have not read the other comments on the film, but judging from the average rating i can see that they are unlikely to be very complementary.i watched it for the second time with my children. they absolutely loved it. true, it did not have the adults rolling around the floor, but the sound of the children's enjoyment made it seem so.it is a true mel brooks farce, with plenty of moral content - how sad it is to be loved for our money, not for whom we are, and how fickle are our friends and associates. there are many other films on a similar subject matter, no doubt, many of which will have a greater comic or emotional impact on adults. it's hard for me to imagine such an impact on the junior members of the family, however.hence, for the children, a 9/10 from me."

In [18]:
train['review'] = train['review'].apply(lambda x: remove_htmltags(x))
train.iloc[24993,1]

"i have not read the other comments on the film, but judging from the average rating i can see that they are unlikely to be very complementary.i watched it for the second time with my children. they absolutely loved it. true, it did not have the adults rolling around the floor, but the sound of the children's enjoyment made it seem so.it is a true mel brooks farce, with plenty of moral content - how sad it is to be loved for our money, not for whom we are, and how fickle are our friends and associates. there are many other films on a similar subject matter, no doubt, many of which will have a greater comic or emotional impact on adults. it's hard for me to imagine such an impact on the junior members of the family, however.hence, for the children, a 9/10 from me."

In [19]:
test['review']  =test['review'].apply(lambda x: remove_htmltags(x))

In [20]:
def remove_numbers(text):
    output = re.sub(r'\d+', '', text)
    return output

train['review'] = train['review'].apply(lambda x: remove_numbers(x))
train.iloc[24993,1]


"i have not read the other comments on the film, but judging from the average rating i can see that they are unlikely to be very complementary.i watched it for the second time with my children. they absolutely loved it. true, it did not have the adults rolling around the floor, but the sound of the children's enjoyment made it seem so.it is a true mel brooks farce, with plenty of moral content - how sad it is to be loved for our money, not for whom we are, and how fickle are our friends and associates. there are many other films on a similar subject matter, no doubt, many of which will have a greater comic or emotional impact on adults. it's hard for me to imagine such an impact on the junior members of the family, however.hence, for the children, a / from me."

In [21]:
test['review'] = test['review'].apply(lambda x: remove_numbers(x))

In [22]:
import string

def remove_punctuation(text):
    text_p = "".join([char for char in text if char not in string.punctuation])
    return text_p

train['review'] = train['review'].apply(lambda x: remove_punctuation(x))
train.head()

Unnamed: 0,type,review,label
25000,train,story of a man who has unnatural feelings for ...,0
25001,train,airport starts as a brand new luxury plane i...,0
25002,train,this film lacked something i couldnt put my fi...,0
25003,train,sorry everyone i know this is supposed to be a...,0
25004,train,when i was little my parents took me along to ...,0


In [23]:
test['review'] = test['review'].apply(lambda x: remove_punctuation(x))

In [24]:
train.iloc[24993,1]

'i have not read the other comments on the film but judging from the average rating i can see that they are unlikely to be very complementaryi watched it for the second time with my children they absolutely loved it true it did not have the adults rolling around the floor but the sound of the childrens enjoyment made it seem soit is a true mel brooks farce with plenty of moral content  how sad it is to be loved for our money not for whom we are and how fickle are our friends and associates there are many other films on a similar subject matter no doubt many of which will have a greater comic or emotional impact on adults its hard for me to imagine such an impact on the junior members of the family howeverhence for the children a  from me'

In [25]:
import nltk
nltk.download('punkt')
from nltk import word_tokenize

def tokenize(text):
    words = word_tokenize(text)
    return words

train['review'] = train['review'].apply(lambda x: tokenize(x))
train.head()

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,type,review,label
25000,train,"[story, of, a, man, who, has, unnatural, feeli...",0
25001,train,"[airport, starts, as, a, brand, new, luxury, p...",0
25002,train,"[this, film, lacked, something, i, couldnt, pu...",0
25003,train,"[sorry, everyone, i, know, this, is, supposed,...",0
25004,train,"[when, i, was, little, my, parents, took, me, ...",0


In [26]:
test['review'] = test['review'].apply(lambda x: tokenize(x))

In [27]:
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = stopwords.words('english')
print(stop_words)
def remove_stopwords(text):
    filtered_words = [word for word in text if word not in stop_words]
    return filtered_words



['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [28]:
train['review'] = train['review'].apply(lambda x: remove_stopwords(x))
train.head()

Unnamed: 0,type,review,label
25000,train,"[story, man, unnatural, feelings, pig, starts,...",0
25001,train,"[airport, starts, brand, new, luxury, plane, l...",0
25002,train,"[film, lacked, something, couldnt, put, finger...",0
25003,train,"[sorry, everyone, know, supposed, art, film, w...",0
25004,train,"[little, parents, took, along, theater, see, i...",0


In [29]:
test['review'] = test['review'].apply(lambda x: remove_stopwords(x))

In [30]:
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()

def stem_words(text):
    stemmed = [stemmer.stem(word) for word in text]
    return stemmed

train['review'] = train['review'].apply(lambda x: stem_words(x))
train.head()

Unnamed: 0,type,review,label
25000,train,"[stori, man, unnatur, feel, pig, start, open, ...",0
25001,train,"[airport, start, brand, new, luxuri, plane, lo...",0
25002,train,"[film, lack, someth, couldnt, put, finger, fir...",0
25003,train,"[sorri, everyon, know, suppos, art, film, wow,...",0
25004,train,"[littl, parent, took, along, theater, see, int...",0


In [31]:
test['review'] = test['review'].apply(lambda x: stem_words(x))

In [32]:
# def remove_numbers(text):
#     return [word for word in text if not word.isdigit()]
# train['review'] = train['review'].apply(lambda x: remove_numbers(x))
# train.head()

In [33]:
all_reviews = train['review'].tolist()
vocab = [item for sublist in all_reviews for item in sublist ]
len(vocab)
    

3000532

In [34]:
vocab = list(set(vocab))
len(vocab)

109953

In [48]:
from sklearn.feature_extraction.text import CountVectorizer

def dummy(text):
    return text


train_reviews = list(train['review'])
test_reviews = list(test['review'])

cv = CountVectorizer(tokenizer=dummy,preprocessor=dummy, lowercase=False)
X_train = cv.fit_transform(train_reviews)
X_test = cv.transform(test_reviews)

In [83]:
y_train = train['label']

from sklearn.naive_bayes import MultinomialNB
MNB = MultinomialNB()
MNB.fit(X_train,y_train)
ypred_train = MNB.predict(X_train)
ypred_test = MNB.predict(X_test)


from sklearn.metrics import accuracy_score
train_acc = accuracy_score(ypred_train,y_train)
test_acc = accuracy_score(ypred_test,y_test)
print('Sklearn Naive Bayes train accuracy: {:.1f} %'.format(train_acc*100))
print('Sklearn Naive Bayes test accuracy: {:.1f} %'.format(test_acc*100))

Sklearn Naive Bayes train accuracy: 91.8 %
Sklearn Naive Bayes test accuracy: 81.7 %


In [84]:
from sklearn.metrics import f1_score
train_score = f1_score(ypred_train,y_train)
test_score = f1_score(ypred_test,y_test)
print('Sklearn Naive Bayes train F1-score: {:.1f} %'.format(train_score*100))
print('Sklearn Naive Bayes test F1-score: {:.1f} %'.format(test_score*100))

Sklearn Naive Bayes train F1-score: 91.6 %
Sklearn Naive Bayes test F1-score: 80.6 %


In [46]:
#length of vocabulary
V = X_train.shape[1]
V

109953

In [56]:
type(y_train)

pandas.core.series.Series

In [57]:
len(y_train[y_train==0])/len(y_train)

0.5

In [74]:
class Naive_bayes():
    def __init__(self):
        pass
    
    def fit(self,X_train, y_train):
        
        self.X_train = X_train
        self.y_train = y_train
        
        #prior probabilities
        positive_class_prob = len(y_train[y_train==1])/len(y_train)
        negative_class_prob = len(y_train[y_train==0])/len(y_train)
        
        #vocabulary size
        V = X_train.shape[1]
        
        #posterior probabilities for each token in vocabulary
        negative_matrix = X_train[0:12500,:]
        positive_matrix = X[12500::,:]

        positive_count = positive_matrix.sum(axis=0)
        negative_count = negative_matrix.sum(axis=0)

        positive_totalcount = positive_matrix.sum()
        negative_totalcount = negative_matrix.sum()
        
        log_positive_probs = np.log10((positive_count+1)/(positive_totalcount +V))
        log_negative_probs = np.log10((negative_count+1)/(negative_totalcount +V))
        
        self.positive_class_prob = positive_class_prob
        self.negative_class_prob = negative_class_prob
        self.log_positive_probs = log_positive_probs
        self.log_negative_probs = log_negative_probs
    
    
    def predict(self,X):
        self.X = X

        X_positive = X @ self.log_positive_probs.T + np.log10(self.positive_class_prob)
        X_negative = X @ self.log_negative_probs.T + np.log10(self.negative_class_prob)
        
        bool_array = np.asarray(X_positive > X_negative).flatten()
        ypred = bool_array.astype(int)
        
        return ypred
           

In [77]:
clf = Naive_bayes()
clf.fit(X_train,y_train)
ytrain_pred = clf.predict(X_train)
ytest_pred = clf.predict(X_test)

In [76]:
accuracy_score(ytrain_pred,y_train)

0.91768

In [78]:
accuracy_score(ytest_pred,y_test)

0.81676

In [89]:
(y_test[:12500]==0).sum()

12500

In [None]:
precision = 