In [1]:
import pandas as pd
import numpy as np
from wordcloud import WordCloud
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('/content/drive/MyDrive/SentimentAnalysis/SentimentAnalysis/IMDB Dataset.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [4]:
df.sample(7)

Unnamed: 0,review,sentiment
14834,The worst movie ever made. If anyone asks you ...,negative
7147,What is most disturbing about this film is not...,positive
17611,"My first Ichikawa in many years, and the first...",positive
32000,I think this is the worst movie I have seen si...,negative
206,"If you liked William Hickey in ""Prizzi's Honor...",negative
29699,I'll give it a two because it has a lot of mus...,negative
44324,"Although I had some hopes for this film, parti...",negative


In [5]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [6]:
df['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,25000
negative,25000


In [7]:
import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize,sent_tokenize
from string import punctuation
from nltk.stem import PorterStemmer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Text Preprocessing

In [8]:
def preprocess(txt):
  ps = PorterStemmer()
  out = re.findall(r'https?://\S+|www\.\S+|.com',txt)

  if(len(out)):
    for i in out:
      txt = txt.replace(i,"")

  out = re.findall(r'<.*?>',txt)
  if(len(out)):
    for i in out:
      txt = txt.replace(i,"")
  txt = txt.lower()
  if(len(out)):
    for i in out:
      txt = txt.replace(i,"")
  word_removal = stopwords.words('english')
  y = []
  for i in word_tokenize(txt):
    if (i not in word_removal and i not in punctuation):
      y.append(i)
  txt = y[:]
  y.clear()
  txt = " ".join(txt)
  txt = re.sub(r'\.{2,}', '', txt)
  y = []
  for i in word_tokenize(txt):
    y.append(ps.stem(i))
  txt = y[:]
  y.clear()
  txt = " ".join(txt)
  return txt

In [9]:
df['no_words'] = df['review'].str.len()

In [10]:
df

Unnamed: 0,review,sentiment,no_words
0,One of the other reviewers has mentioned that ...,positive,1761
1,A wonderful little production. <br /><br />The...,positive,998
2,I thought this was a wonderful way to spend ti...,positive,926
3,Basically there's a family where a little boy ...,negative,748
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1317
...,...,...,...
49995,I thought this movie did a down right good job...,positive,1008
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative,642
49997,I am a Catholic taught in parochial elementary...,negative,1280
49998,I'm going to have to disagree with the previou...,negative,1234


In [11]:
df['review'] = df['review'].apply(preprocess)

In [12]:
df['review']

Unnamed: 0,review
0,one review mention watch 1 oz episod 'll hook ...
1,wonder littl product film techniqu unassuming-...
2,thought wonder way spend time hot summer weeke...
3,basic 's famili littl boy jake think 's zombi ...
4,petter mattei 's `` love time money `` visual ...
...,...
49995,thought movi right good job n't creativ origin...
49996,bad plot bad dialogu bad act idiot direct anno...
49997,cathol taught parochi elementari school nun ta...
49998,'m go disagre previous side maltin one second ...


In [13]:
Ps = PorterStemmer()
Ps.stem('mentioned')

'mention'

In [14]:
preprocess(df['review'][0])

"one review mention watch 1 oz episod 'll hook right exactli happen me.th first thing struck oz brutal unflinch scene violenc set right word go trust show faint heart timid show pull punch regard drug sex violenc hardcor classic use word.it call oz nicknam given oswald maximum secur state penitentari focu mainli emerald citi experi section prison cell glass front face inward privaci high agenda em citi home mani aryan muslim gangsta latino christian italian irish scuffl death stare dodgi deal shadi agreement never far away.i would say main appeal show due fact goe show would n't dare forget pretti pictur paint mainstream audienc forget charm forget romanc oz n't mess around first episod ever saw struck nasti surreal could n't say readi watch develop tast oz got accustom high level graphic violenc violenc injust crook guard 'll sold nickel inmat 'll kill order get away well manner middl class inmat turn prison bitch due lack street skill prison experi watch oz may befort ufort view get 

In [15]:
df['no_words'] = df['review'].str.len()

In [16]:
df

Unnamed: 0,review,sentiment,no_words
0,one review mention watch 1 oz episod 'll hook ...,positive,1028
1,wonder littl product film techniqu unassuming-...,positive,574
2,thought wonder way spend time hot summer weeke...,positive,546
3,basic 's famili littl boy jake think 's zombi ...,negative,403
4,petter mattei 's `` love time money `` visual ...,positive,787
...,...,...,...
49995,thought movi right good job n't creativ origin...,positive,483
49996,bad plot bad dialogu bad act idiot direct anno...,negative,354
49997,cathol taught parochi elementari school nun ta...,negative,741
49998,'m go disagre previous side maltin one second ...,negative,744


In [17]:
df['sentiment'].replace({'positive':1,'negative':0},inplace=True)

In [18]:
from collections import Counter

In [19]:
df_neg = df[df['sentiment']==0]
df_pos = df[df['sentiment']==1]


In [20]:
txt_pos = []
for i in df_pos['review'].values:
   for j in word_tokenize(i):
    txt_pos.append(j)
txt_neg = []
for i in df_neg['review'].values:
   for j in word_tokenize(i):
    txt_neg.append(j)


In [21]:
len(txt_pos)

3155210

In [22]:
len(txt_neg)

3092144

In [23]:
c1 = Counter(txt_pos)
c2 = Counter(txt_neg)

In [24]:
c1.most_common(10)

[('``', 63471),
 ("'s", 62939),
 ('film', 49254),
 ('movi', 43392),
 ('one', 27088),
 ("n't", 26466),
 ('like', 19933),
 ('time', 15582),
 ('see', 14728),
 ('good', 14579)]

In [25]:
c2.most_common(10)

[('``', 68408),
 ("'s", 58714),
 ('movi', 56221),
 ('film', 43264),
 ("n't", 39509),
 ('one', 25833),
 ('like', 24019),
 ('make', 15403),
 ('would', 15198),
 ('even', 15190)]

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB,BernoulliNB,ComplementNB
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [27]:
X = TfidfVectorizer(max_features=2000)

In [28]:
bow = X.fit_transform(df['review'])

In [29]:
X_train,X_test,y_train,y_test = train_test_split(bow.toarray(),df['sentiment'],test_size=0.2)

In [30]:
m1 = MultinomialNB()
m2 = BernoulliNB()
m3 = ComplementNB()
m4 = DecisionTreeClassifier()
m5 = RandomForestClassifier()
m6 = AdaBoostClassifier()
m7 = GradientBoostingClassifier()
m8 = LogisticRegression()

In [31]:
est = {'MNB':m1,'BNB':m2,'CNB':m3}

In [32]:
def create_models(est,X_train,X_test,y_train,y_test):
  X = []
  cnt = 1
  col_names = ['Model','Accuracy']
  for i in est.keys():
    a = []
    est[i].fit(X_train,y_train)
    y_pred = est[i].predict(X_test)
    a.append(i)
    a.append(accuracy_score(y_pred,y_test))
    X.append(a)
    print(cnt ,"Ho Gaya",end='\n')
    cnt += 1
  return pd.DataFrame(X,columns=col_names)


In [33]:
ans = create_models(est,X_train,X_test,y_train,y_test)

1 Ho Gaya
2 Ho Gaya
3 Ho Gaya


In [34]:
ans

Unnamed: 0,Model,Accuracy
0,MNB,0.8411
1,BNB,0.8429
2,CNB,0.8409
