In [47]:
import pandas as pd
import numpy as np

In [48]:
df = pd.read_csv('./data/imdb_master.csv',encoding="ISO-8859-1")

In [49]:
df.head()

Unnamed: 0.1,Unnamed: 0,type,review,label,file
0,0,test,Once again Mr. Costner has dragged out a movie...,neg,0_2.txt
1,1,test,This is an example of why the majority of acti...,neg,10000_4.txt
2,2,test,"First of all I hate those moronic rappers, who...",neg,10001_1.txt
3,3,test,Not even the Beatles could write songs everyon...,neg,10002_3.txt
4,4,test,Brass pictures (movies is not a fitting word f...,neg,10003_3.txt


In [50]:
#remove columns that are not required
df.drop(['Unnamed: 0','file'], axis=1, inplace=True)
df.head()

Unnamed: 0,type,review,label
0,test,Once again Mr. Costner has dragged out a movie...,neg
1,test,This is an example of why the majority of acti...,neg
2,test,"First of all I hate those moronic rappers, who...",neg
3,test,Not even the Beatles could write songs everyon...,neg
4,test,Brass pictures (movies is not a fitting word f...,neg


In [51]:
#split dataframe into train and test sets

train = df[df['type']== 'train']
print('No. of observations in train set: ' + str(len(train)))

test = df[df['type']== 'test']
print('No. of observations in test set: ' + str(len(test)))


No. of observations in train set: 75000
No. of observations in test set: 25000


In [52]:
# remove unlabelled samples

train = train[train['label'] != 'unsup']
test = test[test['label'] != 'unsup']

print('No. of observations in train set after removing unlabelled samples: ' + str(len(train)))
print('No. of observations in test set after removing unlabelled samples: ' + str(len(test)))

No. of observations in train set after removing unlabelled samples: 25000
No. of observations in test set after removing unlabelled samples: 25000


### Text preprocessing

In [53]:
train.head()

Unnamed: 0,type,review,label
25000,train,Story of a man who has unnatural feelings for ...,neg
25001,train,Airport '77 starts as a brand new luxury 747 p...,neg
25002,train,This film lacked something I couldn't put my f...,neg
25003,train,"Sorry everyone,,, I know this is supposed to b...",neg
25004,train,When I was little my parents took me along to ...,neg


In [54]:
train.tail()

Unnamed: 0,type,review,label
49995,train,"Seeing as the vote average was pretty low, and...",pos
49996,train,"The plot had some wretched, unbelievable twist...",pos
49997,train,I am amazed at how this movie(and most others ...,pos
49998,train,A Christmas Together actually came before my t...,pos
49999,train,Working-class romantic drama from director Mar...,pos


In [55]:
train['label'].value_counts()

pos    12500
neg    12500
Name: label, dtype: int64

In [56]:
train['label'] = train['label'].map({'pos': 1, 'neg': 0})
len(train[train['label'] == 0])

12500

In [57]:
len(train[train['label'] == 1])

12500

In [58]:
train['label'].value_counts()

1    12500
0    12500
Name: label, dtype: int64

In [59]:
#lower casing
def lower_case(text):
    return text.lower()
train['review'] = train['review'].apply(lambda x: lower_case(x))
train.head()

Unnamed: 0,type,review,label
25000,train,story of a man who has unnatural feelings for ...,0
25001,train,airport '77 starts as a brand new luxury 747 p...,0
25002,train,this film lacked something i couldn't put my f...,0
25003,train,"sorry everyone,,, i know this is supposed to b...",0
25004,train,when i was little my parents took me along to ...,0


In [60]:
train.iloc[24993,1]

"i have not read the other comments on the film, but judging from the average rating i can see that they are unlikely to be very complementary.<br /><br />i watched it for the second time with my children. they absolutely loved it. true, it did not have the adults rolling around the floor, but the sound of the children's enjoyment made it seem so.<br /><br />it is a true mel brooks farce, with plenty of moral content - how sad it is to be loved for our money, not for whom we are, and how fickle are our friends and associates. there are many other films on a similar subject matter, no doubt, many of which will have a greater comic or emotional impact on adults. it's hard for me to imagine such an impact on the junior members of the family, however.<br /><br />hence, for the children, a 9/10 from me."

In [61]:
import re

def remove_htmltags(text):
    html_pattern = re.compile('<.*?>')
    return html_pattern.sub(r'', text)

remove_htmltags(train.iloc[24993,1])

"i have not read the other comments on the film, but judging from the average rating i can see that they are unlikely to be very complementary.i watched it for the second time with my children. they absolutely loved it. true, it did not have the adults rolling around the floor, but the sound of the children's enjoyment made it seem so.it is a true mel brooks farce, with plenty of moral content - how sad it is to be loved for our money, not for whom we are, and how fickle are our friends and associates. there are many other films on a similar subject matter, no doubt, many of which will have a greater comic or emotional impact on adults. it's hard for me to imagine such an impact on the junior members of the family, however.hence, for the children, a 9/10 from me."

In [None]:
train['review'] = train['review'].apply(lambda x: remove_htmltags(x))
train.iloc[24993,1]

"i have not read the other comments on the film, but judging from the average rating i can see that they are unlikely to be very complementary.i watched it for the second time with my children. they absolutely loved it. true, it did not have the adults rolling around the floor, but the sound of the children's enjoyment made it seem so.it is a true mel brooks farce, with plenty of moral content - how sad it is to be loved for our money, not for whom we are, and how fickle are our friends and associates. there are many other films on a similar subject matter, no doubt, many of which will have a greater comic or emotional impact on adults. it's hard for me to imagine such an impact on the junior members of the family, however.hence, for the children, a 9/10 from me."

In [None]:
def remove_numbers(text):
    output = re.sub(r'\d+', '', text)
    return output

train['review'] = train['review'].apply(lambda x: remove_numbers(x))
train.iloc[24993,1]


"i have not read the other comments on the film, but judging from the average rating i can see that they are unlikely to be very complementary.i watched it for the second time with my children. they absolutely loved it. true, it did not have the adults rolling around the floor, but the sound of the children's enjoyment made it seem so.it is a true mel brooks farce, with plenty of moral content - how sad it is to be loved for our money, not for whom we are, and how fickle are our friends and associates. there are many other films on a similar subject matter, no doubt, many of which will have a greater comic or emotional impact on adults. it's hard for me to imagine such an impact on the junior members of the family, however.hence, for the children, a / from me."

In [None]:
import string

def remove_punctuation(text):
    text_p = "".join([char for char in text if char not in string.punctuation])
    return text_p

train['review'] = train['review'].apply(lambda x: remove_punctuation(x))
train.head()

Unnamed: 0,type,review,label
25000,train,story of a man who has unnatural feelings for ...,0
25001,train,airport starts as a brand new luxury plane i...,0
25002,train,this film lacked something i couldnt put my fi...,0
25003,train,sorry everyone i know this is supposed to be a...,0
25004,train,when i was little my parents took me along to ...,0


In [None]:
train.iloc[24993,1]

'i have not read the other comments on the film but judging from the average rating i can see that they are unlikely to be very complementaryi watched it for the second time with my children they absolutely loved it true it did not have the adults rolling around the floor but the sound of the childrens enjoyment made it seem soit is a true mel brooks farce with plenty of moral content  how sad it is to be loved for our money not for whom we are and how fickle are our friends and associates there are many other films on a similar subject matter no doubt many of which will have a greater comic or emotional impact on adults its hard for me to imagine such an impact on the junior members of the family howeverhence for the children a  from me'

In [None]:
import nltk
nltk.download('punkt')
from nltk import word_tokenize

def tokenize(text):
    words = word_tokenize(text)
    return words

train['review'] = train['review'].apply(lambda x: tokenize(x))
train.head()

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = stopwords.words('english')
print(stop_words)
def remove_stopwords(text):
    filtered_words = [word for word in text if word not in stop_words]
    return filtered_words



In [None]:
train['review'] = train['review'].apply(lambda x: remove_stopwords(x))
train.head()

In [None]:
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()

def stem_words(text):
    stemmed = [stemmer.stem(word) for word in text]
    return stemmed

train['review'] = train['review'].apply(lambda x: stem_words(x))
train.head()

In [None]:
# def remove_numbers(text):
#     return [word for word in text if not word.isdigit()]
# train['review'] = train['review'].apply(lambda x: remove_numbers(x))
# train.head()

In [None]:
a = train.iloc[0,1]
b = train.iloc[1,1]
c = list(set(a+b))
len(c)

In [None]:
len(a)

In [None]:
len(b)

In [None]:
all_reviews = train['review'].tolist()
vocab = [item for sublist in all_reviews for item in sublist ]
len(vocab)
    

In [None]:
vocab = list(set(vocab))
len(vocab)

In [None]:
bag_vec = np.zeros(shape=(1,len(vocab)))

bag_vec.sum()

In [None]:
a = train.iloc[5,1]
for word in a:
    for i,token in enumerate(vocab):
        if token == word:
            bag_vec[0,i] += 1
        
        
    

In [None]:
list(train['review'])

In [None]:
# train['label'] = train['label'].map({'pos': 1, 'neg': 0})
# train['label'].value_counts()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

def dummy(text):
    return text


all_reviews = list(train['review'])

cv = CountVectorizer(tokenizer=dummy,preprocessor=dummy, lowercase=False)
X = cv.fit_transform(all_reviews)
len(cv.get_feature_names())

In [None]:
X_train = all_reviews
y_train = train['label']

from sklearn.naive_bayes import MultinomialNB
MNB = MultinomialNB()
MNB.fit(X,y_train)

from sklearn.metrics import accuracy_score
y_pred = MNB.predict(X)
train_acc = accuracy_score(y_pred,y_train)
train_acc

In [None]:
X_train = all_reviews
y_train = train['label']

from sklearn.naive_bayes import BernoulliNB
BNB = BernoulliNB()
BNB.fit(X,y_train)

from sklearn.metrics import accuracy_score
y_pred = BNB.predict(X)
train_acc = accuracy_score(y_pred,y_train)
train_acc

In [None]:
#prior probabilities
p_pos = len(train[train['label']== 1])/len(train)
p_neg = len(train[train['label']== 0])/len(train)

In [None]:
#vocabulary size
skvocab = cv.get_feature_names()
V = len(skvocab)
V

In [None]:
#posterior probabilities for each token in vocabulary
neg_mat = X[0:12500,:]
pos_mat = X[12500::,:]

pos_count = pos_mat.sum(axis=0)
neg_count = neg_mat.sum(axis=0)

pos_all = pos_mat.sum()
neg_all = neg_mat.sum()


In [None]:
pos_count = np.asarray(pos_count).reshape(-1)
neg_count = np.asarray(neg_count).reshape(-1)

In [None]:
pos_prob = (pos_count+1)/(pos_all+V)
neg_prob = (neg_count+1)/(neg_all+V)
pos_prob_log = np.log10(pos_prob)
neg_prob_log = np.log10(neg_prob)

In [None]:
spos = np.multiply(X[-2,:].todense(),pos_prob)
spos = np.asarray(spos).reshape(-1)
sposp = np.prod(spos, where = spos>0) * 0.5

sneg = np.multiply(X[-2,:].todense(),neg_prob)
sneg = np.asarray(sneg).reshape(-1)
snegp = np.prod(sneg, where = sneg>0) * 0.5


In [None]:
pos_prob.reshape(1,-1).shape

In [None]:
spos = np.multiply(X[-2,:],pos_prob)

In [None]:
# pos_probs = np.multiply(X.todense(), pos_prob.reshape(1,-1), where =X.todense()>0, keepdims=True)

In [None]:
sposp

In [None]:
snegp

In [None]:
snegp>sposp

In [None]:
#s1_p = np.multiply(X.todense(),pos_prob_log, where = X.todense() >0)
#s1_n = np.multiply(X.todense(),neg_prob_log, where = X.todense() >0)

In [None]:
s1_pos = np.sum(np.asarray(s1_p).reshape(-1)) + np.log10(0.5)
s1_n = np.sum(np.asarray(s1_n).reshape(-1)) + np.log10(0.5)

In [None]:
s1_posp = 10**s1_pos
s1_negp = 10**s1_n

In [None]:
test_count1 = X[0,:]/(pos_all + V) + pos_prob
test_count2 = X[0,:]/(neg_all +V) + neg_prob
print(np.prod(test_count1))
print(np.prod(test_count2))

In [None]:
s1_p = np.multiply(X[0,:].todense(), pos_prob)


In [None]:
s1_posp

In [None]:
s1_negp