In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import numpy as np
import re

In [2]:
df=pd.read_csv("imdb_data.csv")
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [3]:
df['review'][0]

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

In [4]:
df['sentiment'].value_counts()/(len(df))*100
# '''The data is balanced'''

positive    50.0
negative    50.0
Name: sentiment, dtype: float64

In [5]:
df.info() #  no missing data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [6]:
df.shape

(50000, 2)

# Text Cleaning:
1. Sampling 1000 rows
2. Replacing positive and negative sentences to 1 and 0 respectively
3. Removing html tags
4. Removing special characters
5. Converting everything to lower case
6. Removing stop words
7. stemming

In [8]:
df1=df.sample(15000)

In [9]:
df1.shape

(15000, 2)

In [10]:
df1.info() 
#no null values are there

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15000 entries, 35045 to 23885
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     15000 non-null  object
 1   sentiment  15000 non-null  object
dtypes: object(2)
memory usage: 351.6+ KB


In [11]:
df1['sentiment'].replace({'positive':1 , 'negative':0},inplace=True)
#Replacing positive and negative sentences to 1 and 0 respectively


In [12]:
#Function to clean html/unwanted tags
def clean_tags(text):
    clean=re.compile(r'<.*?>')
    return clean.sub('', text)


#applying the function on the dataset
df1['review']=df1['review'].apply(clean_tags)

In [13]:
df1['review'].iloc[2]

"Jacqueline Hyde starts like any other normal day for telemarketing individual Jackie Hyde (co-producer Gabriella Hall) until her boss (Robert Donovan) fires her for taking personal calls at work, however it's not all bad news as the call she took was from a lawyer informing her that her Grandfather (Malcolm Bennett) has recently died & that he left her his mansion & fortune (why doesn't stuff like that ever happen to me? Sigh). Very excited Jackie heads on over there & makes herself right at home, while looking for the thermostat late one night Jackie stumbles upon a secret room where her Grandfather stashes the bright red formula that he invented that allows whoever drinks it to change their appearance. Being a bit on the porky side Jackie finally settles on the glamorous Jacqueline (Blythe Metz), however Jackie's better looking alter-ego starts to take control...Written, co-produced & directed by Rolfe Kanefsky I thought Jacqueline Hyde was complete total & utter crap from start to 

In [14]:
#Function to convert text to lower case
def con_lower(text):
    return text.lower()


#applying the function on the dataset
df1['review']=df1['review'].apply(con_lower)

In [15]:
df1['review'].iloc[2]

"jacqueline hyde starts like any other normal day for telemarketing individual jackie hyde (co-producer gabriella hall) until her boss (robert donovan) fires her for taking personal calls at work, however it's not all bad news as the call she took was from a lawyer informing her that her grandfather (malcolm bennett) has recently died & that he left her his mansion & fortune (why doesn't stuff like that ever happen to me? sigh). very excited jackie heads on over there & makes herself right at home, while looking for the thermostat late one night jackie stumbles upon a secret room where her grandfather stashes the bright red formula that he invented that allows whoever drinks it to change their appearance. being a bit on the porky side jackie finally settles on the glamorous jacqueline (blythe metz), however jackie's better looking alter-ego starts to take control...written, co-produced & directed by rolfe kanefsky i thought jacqueline hyde was complete total & utter crap from start to 

In [16]:
#Function to remove special char

def rem_spcl(text):
    x=''
    for i in text:
        if i.isalnum() == True:
            x=x+i
        else:
            x=x+' '
    return x



#applying the function on the dataset
df1['review']=df1['review'].apply(rem_spcl)

In [17]:
df1['review'].iloc[2]

'jacqueline hyde starts like any other normal day for telemarketing individual jackie hyde  co producer gabriella hall  until her boss  robert donovan  fires her for taking personal calls at work  however it s not all bad news as the call she took was from a lawyer informing her that her grandfather  malcolm bennett  has recently died   that he left her his mansion   fortune  why doesn t stuff like that ever happen to me  sigh   very excited jackie heads on over there   makes herself right at home  while looking for the thermostat late one night jackie stumbles upon a secret room where her grandfather stashes the bright red formula that he invented that allows whoever drinks it to change their appearance  being a bit on the porky side jackie finally settles on the glamorous jacqueline  blythe metz   however jackie s better looking alter ego starts to take control   written  co produced   directed by rolfe kanefsky i thought jacqueline hyde was complete total   utter crap from start to 

In [18]:
# for removing stopwords
import nltk
from nltk.corpus import stopwords

In [19]:
len(stopwords.words('English'))

179

In [20]:
#Function to remove stopwords
def rem_stopwords(text):
    x=[]
    for i in text.split():
        if i not in stopwords.words('english'):
            x.append(i)
    y=x[:]
    x.clear()
    return y



#applying the function on the dataset
df1['review']=df1['review'].apply(rem_stopwords)

In [21]:
print(df1['review'].iloc[2] , end='')

['jacqueline', 'hyde', 'starts', 'like', 'normal', 'day', 'telemarketing', 'individual', 'jackie', 'hyde', 'co', 'producer', 'gabriella', 'hall', 'boss', 'robert', 'donovan', 'fires', 'taking', 'personal', 'calls', 'work', 'however', 'bad', 'news', 'call', 'took', 'lawyer', 'informing', 'grandfather', 'malcolm', 'bennett', 'recently', 'died', 'left', 'mansion', 'fortune', 'stuff', 'like', 'ever', 'happen', 'sigh', 'excited', 'jackie', 'heads', 'makes', 'right', 'home', 'looking', 'thermostat', 'late', 'one', 'night', 'jackie', 'stumbles', 'upon', 'secret', 'room', 'grandfather', 'stashes', 'bright', 'red', 'formula', 'invented', 'allows', 'whoever', 'drinks', 'change', 'appearance', 'bit', 'porky', 'side', 'jackie', 'finally', 'settles', 'glamorous', 'jacqueline', 'blythe', 'metz', 'however', 'jackie', 'better', 'looking', 'alter', 'ego', 'starts', 'take', 'control', 'written', 'co', 'produced', 'directed', 'rolfe', 'kanefsky', 'thought', 'jacqueline', 'hyde', 'complete', 'total', 'utt

In [22]:
df1

Unnamed: 0,review,sentiment
35045,"[remember, trailer, legend, zu, quite, impress...",1
38508,"[recently, watched, film, 30, th, gothenburg, ...",0
48522,"[jacqueline, hyde, starts, like, normal, day, ...",0
965,"[never, want, see, movie, dreadfully, bad, sta...",0
13006,"[nothing, dull, movie, held, together, fully, ...",1
...,...,...
26280,"[one, original, idea, story, themes, pulled, v...",0
41647,"[haines, excellent, brash, cadet, thinks, west...",1
38716,"[dislike, film, seem, think, loved, one, someh...",1
17232,"[awful, film, badly, written, badly, acted, cl...",0


In [23]:
#porter stemmer
from nltk.stem.porter import PorterStemmer
ps=PorterStemmer()
y=[]
def stem_words(text):
    
    for i in text:
        y.append(ps.stem(i))
    z=y[:]
    y.clear()
    return z



#applying the function on the dataset
df1['review']=df1['review'].apply(stem_words)

In [24]:
print(df1['review'].iloc[2] , end='')

['jacquelin', 'hyde', 'start', 'like', 'normal', 'day', 'telemarket', 'individu', 'jacki', 'hyde', 'co', 'produc', 'gabriella', 'hall', 'boss', 'robert', 'donovan', 'fire', 'take', 'person', 'call', 'work', 'howev', 'bad', 'news', 'call', 'took', 'lawyer', 'inform', 'grandfath', 'malcolm', 'bennett', 'recent', 'die', 'left', 'mansion', 'fortun', 'stuff', 'like', 'ever', 'happen', 'sigh', 'excit', 'jacki', 'head', 'make', 'right', 'home', 'look', 'thermostat', 'late', 'one', 'night', 'jacki', 'stumbl', 'upon', 'secret', 'room', 'grandfath', 'stash', 'bright', 'red', 'formula', 'invent', 'allow', 'whoever', 'drink', 'chang', 'appear', 'bit', 'porki', 'side', 'jacki', 'final', 'settl', 'glamor', 'jacquelin', 'blyth', 'metz', 'howev', 'jacki', 'better', 'look', 'alter', 'ego', 'start', 'take', 'control', 'written', 'co', 'produc', 'direct', 'rolf', 'kanefski', 'thought', 'jacquelin', 'hyde', 'complet', 'total', 'utter', 'crap', 'start', 'finish', 'simpl', 'straight', 'forward', 'accord', '

In [25]:
print(df1['review'].iloc[9] , end='')

['movi', 'everyth', 'want', 'action', 'movi', 'explos', 'shootout', 'bad', 'guy', 'wors', 'guy', 'fun', 'see', 'jame', 'belushi', 'use', 'humor', 'get', 'troubl', 'gotten', 'sinc', 'stole', '12', 'million', 'dollar', 'ultim', 'big', 'boss', 'skipper', 'sound', 'cheesi', 'cours', 'boy', 'fun', 'watch', 'movi', 'whole', 'lot', 'better', 'direct', 'dvd', 'garbag', 'made', 'nowaday', 'get', 'silli', 'plot', 'find', 'movi', 'quit', 'surpris', 'store', 'could', 'argu', 'twist', 'predict', 'fast', 'pace', 'movi', 'give', 'time', 'think', 'much', 'bless', 'sinc', 'movi', 'reveal', 'ultim', 'twist', 'journey', 'moment', 'titl', 'bit', 'mislead', 'could', 'reason', 'mani', 'peopl', 'hate', 'movi', 'probabl', 'expect', 'movi', 'mobster', 'stead', 'crook', 'doubl', 'cross', 'pure', 'fun']

In [26]:
df1

Unnamed: 0,review,sentiment
35045,"[rememb, trailer, legend, zu, quit, impress, f...",1
38508,"[recent, watch, film, 30, th, gothenburg, film...",0
48522,"[jacquelin, hyde, start, like, normal, day, te...",0
965,"[never, want, see, movi, dread, bad, stand, se...",0
13006,"[noth, dull, movi, held, togeth, fulli, realiz...",1
...,...,...
26280,"[one, origin, idea, stori, theme, pull, variou...",0
41647,"[hain, excel, brash, cadet, think, west, point...",1
38716,"[dislik, film, seem, think, love, one, somehow...",1
17232,"[aw, film, badli, written, badli, act, clich, ...",0


In [27]:
def join_back(text):
    return ' '.join(text)


df1['review']=df1['review'].apply(join_back)

In [28]:
df1

Unnamed: 0,review,sentiment
35045,rememb trailer legend zu quit impress fan man ...,1
38508,recent watch film 30 th gothenburg film festiv...,0
48522,jacquelin hyde start like normal day telemarke...,0
965,never want see movi dread bad stand see hero s...,0
13006,noth dull movi held togeth fulli realiz charac...,1
...,...,...
26280,one origin idea stori theme pull variou sourc ...,0
41647,hain excel brash cadet think west point realli...,1
38716,dislik film seem think love one somehow belong...,1
17232,aw film badli written badli act clich hackney ...,0


In [29]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer()

In [30]:
X=cv.fit_transform(df1['review']).toarray()

In [31]:
X.shape

(15000, 43046)

In [32]:
y=df1.iloc[:,-1].values

In [33]:
y.shape

(15000,)

In [34]:
from sklearn.model_selection import train_test_split
X_train,X_test, y_train,y_test=train_test_split(X,y,test_size=0.2)

In [35]:
X_train.shape

(12000, 43046)

In [36]:
y_train.shape

(12000,)

In [37]:
X_test.shape

(3000, 43046)

In [38]:
y_test.shape

(3000,)

In [39]:
from sklearn.naive_bayes import GaussianNB , BernoulliNB , MultinomialNB

In [40]:
cf1=GaussianNB()
cf2=BernoulliNB()
cf3=MultinomialNB()

In [41]:
cf1.fit(X_train,y_train)
cf2.fit(X_train,y_train)
cf3.fit(X_train,y_train)

MultinomialNB()

In [42]:
y_pred1=cf1.predict(X_test)
y_pred2=cf2.predict(X_test)
y_pred3=cf3.predict(X_test)

In [43]:
y_test.shape

(3000,)

In [44]:
y_pred1.shape

(3000,)

In [45]:
y_pred_train1=cf1.predict(X_train)
y_pred_train2=cf2.predict(X_train)
y_pred_train3=cf3.predict(X_train)

In [46]:
from sklearn.metrics import accuracy_score , classification_report , confusion_matrix

In [47]:
print('Training Accuracy for Gaussian Naive Bayes: ',accuracy_score(y_train,y_pred_train1))
print('Training Accuracy for Bernoulli Naive Bayes: ',accuracy_score(y_train,y_pred_train2))
print('Training Accuracy for Multinomial Naive Bayes: ',accuracy_score(y_train,y_pred_train3))

Training Accuracy for Gaussian Naive Bayes:  0.883
Training Accuracy for Bernoulli Naive Bayes:  0.9195
Training Accuracy for Multinomial Naive Bayes:  0.9168333333333333


In [48]:
print('Test Accuracy for Gaussian Naive Bayes: ',accuracy_score(y_test,y_pred1))
print('Test Accuracy for Bernoulli Naive Bayes: ',accuracy_score(y_test,y_pred2))
print('Test Accuracy for Multinomial Naive Bayes: ',accuracy_score(y_test,y_pred3))

Test Accuracy for Gaussian Naive Bayes:  0.6333333333333333
Test Accuracy for Bernoulli Naive Bayes:  0.8416666666666667
Test Accuracy for Multinomial Naive Bayes:  0.8396666666666667
