In [13]:
import numpy as np
import pandas as pd
import chardet 

In [16]:
with open('IMDB Dataset.csv', 'rb') as f:
    result = chardet.detect(f.read())  # or readline if the file is large
df = pd.read_csv('IMDB Dataset.csv', encoding=result['encoding'])

In [17]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [18]:
df = df.iloc[:10000]

In [20]:
df['review'][1]

'A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master\'s of comedy and his life. <br /><br />The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell\'s murals decorating every surface) are terribly well d

In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     10000 non-null  object
 1   sentiment  10000 non-null  object
dtypes: object(2)
memory usage: 156.4+ KB


In [23]:
df['sentiment'].value_counts()

sentiment
positive    5028
negative    4972
Name: count, dtype: int64

In [31]:
df.isnull().sum()  #No null value present

review       0
sentiment    0
dtype: int64

In [32]:
df.duplicated().sum() #there are 17 duplicate value present


17

In [34]:
df.drop_duplicates(inplace=True)

In [35]:
df.duplicated().sum()

0

# Basic Preprocessing of Data 
# Remove tags
# lowercase
# remove stopwords

In [37]:
import re 
def remove_tags(raw_text):
    cleaned_text = re.sub(re.compile('<.*?>'), '', raw_text)
    return cleaned_text

In [40]:
df['review'] = df['review'].apply(remove_tags)  #remove html tags from the text 

In [43]:
df['review'] = df['review'].apply(lambda x : x.lower())  #lowecasing the letters 

In [45]:
from nltk.corpus import stopwords
sw_list = stopwords.words('english')
df['review'] = df['review'].apply(lambda x: [item for item in x.split() if item not in sw_list]).apply(lambda x:" ".join(x))

In [47]:
df.sample(5)

Unnamed: 0,review,sentiment
8846,movie serves timely warning anyone thinks writ...,negative
3162,"long-time fan superman comics, 1950s series, f...",negative
7513,"upon time, sweden, poor salvation army sister....",positive
3751,"great german slasher, that's often quite suspe...",positive
5060,"unfortunately, film typical watering good film...",negative


In [48]:
x = df.iloc[:,0:1]
y=df['sentiment']

In [49]:
x

Unnamed: 0,review
0,one reviewers mentioned watching 1 oz episode ...
1,wonderful little production. filming technique...
2,thought wonderful way spend time hot summer we...
3,basically there's family little boy (jake) thi...
4,"petter mattei's ""love time money"" visually stu..."
...,...
9995,"fun, entertaining movie wwii german spy (julie..."
9996,"give break. anyone say ""good hockey movie""? kn..."
9997,movie bad movie. watching endless series bad h...
9998,"movie probably made entertain middle school, e..."


In [50]:
y

0       positive
1       positive
2       positive
3       negative
4       positive
          ...   
9995    positive
9996    negative
9997    negative
9998    negative
9999    positive
Name: sentiment, Length: 9983, dtype: object

In [56]:
#Label Encoding the value 
#changing the y coloumn value to numbers 

from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
y= encoder.fit_transform(y)

In [54]:
y

array([1, 1, 1, ..., 0, 0, 1])

In [79]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=2)

In [84]:
x_train.shape
#x_test.shape

(7986, 1)

Applying CBOW

In [81]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer()

In [82]:
#This method converts the sparse matrix representation generated by CountVectorizer into a dense numpy array.
x_train_bow = cv.fit_transform(x_train['review']).toarray()
x_test_bow = cv.transform(x_test['review']).toarray()

In [85]:
x_train_bow.shape


(7986, 48220)

Applying Model

In [87]:
from sklearn.naive_bayes import MultinomialNB,GaussianNB
mnb = MultinomialNB()
gnb = GaussianNB()

In [91]:
#forMultinomialNB 
mnb.fit(x_train_bow, y_train)

In [99]:
y_pred = mnb.predict(x_test_bow)
from sklearn.metrics import accuracy_score,confusion_matrix
accuracy_score(y_test,y_pred)


0.8452679018527791

In [98]:
confusion_matrix(y_test,y_pred)

array([[879, 118],
       [191, 809]], dtype=int64)

In [101]:
#using GaussianNB 
gnb = GaussianNB()
gnb.fit(x_train_bow,y_train)
y_pred1 = gnb.predict(x_test_bow)
from sklearn.metrics import accuracy_score,confusion_matrix
accuracy_score(y_test,y_pred1)


0.6129193790686029

In [100]:
confusion_matrix(y_test,y_pred1)

array([[701, 296],
       [477, 523]], dtype=int64)

In [106]:
#applying Random Forest 
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(x_train_bow,y_train)
y_pred2 = rf.predict(x_test_bow)
accuracy_score(y_test,y_pred2)

0.8457686529794692

#so after comparing 3 model Random forest is slight upper hand on Naive Bayes Algorithm
#I have also tired to tune the max feature hyperparameter and TFIDF vectorizer, but it was reducing the accuracy 