In [1]:
#Import necessary libraries

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

In [2]:
df = pd.read_csv("IMDB Dataset.csv")
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
df.shape

(50000, 2)

In [6]:
df['sentiment'].value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

In [9]:
df['sentiment_encoded'] = df['sentiment'].apply(lambda x:1 if x=="positive" else 0)
df.head()

Unnamed: 0,review,sentiment,sentiment_encoded
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1
3,Basically there's a family where a little boy ...,negative,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1


In [11]:
df.drop(columns=['sentiment'],axis = 1,inplace=True)

In [16]:
df.sample(3)

Unnamed: 0,review,sentiment_encoded
210,"I have just given a 10 for Thieves Highway, I ...",0
35009,A charming boy and his mother move to a middle...,0
27118,"Guy is a loser. Can't get girls, needs to buil...",0


In [17]:
X_train, X_test, y_train, y_test = train_test_split(df['review'], df['sentiment_encoded'], test_size=0.20)

Exercise-1

    using sklearn pipeline module create a classification pipeline to classify the movie review's positive or negative.


In [18]:
clf = Pipeline([
    ("vectorizer",CountVectorizer()),
    ("rfc",RandomForestClassifier(n_estimators = 50,criterion="entropy"))
])


In [19]:
clf.fit(X_train,y_train)

In [36]:
data = {"y_pred":clf.predict(X_test),"y_test":y_test}

In [37]:
df_pred = pd.DataFrame(data)
df_pred

Unnamed: 0,y_pred,y_test
17946,0,0
27989,1,1
37776,1,1
1325,1,1
39311,0,0
...,...,...
32678,1,1
34642,1,0
32724,0,0
26483,1,1


In [38]:
print(classification_report(df_pred['y_test'],df_pred['y_pred']))

              precision    recall  f1-score   support

           0       0.84      0.85      0.85      4996
           1       0.85      0.84      0.84      5004

    accuracy                           0.84     10000
   macro avg       0.85      0.84      0.84     10000
weighted avg       0.85      0.84      0.84     10000



## KNN

In [41]:
clf2 = Pipeline([
    ("vectorizer",CountVectorizer()),
    ("knncf",KNeighborsClassifier(n_neighbors=10,metric="euclidean"))
])

In [42]:
clf2.fit(X_train,y_train)

In [43]:
df_pred2 = pd.DataFrame(data = {"y_test":y_test,"y_pred":clf2.predict(X_test)})
df_pred2

Unnamed: 0,y_test,y_pred
17946,0,0
27989,1,1
37776,1,1
1325,1,1
39311,0,0
...,...,...
32678,1,1
34642,0,1
32724,0,0
26483,1,1


In [47]:
print(classification_report(df_pred2['y_test'],df_pred2['y_pred']))

              precision    recall  f1-score   support

           0       0.65      0.67      0.66      4996
           1       0.66      0.64      0.65      5004

    accuracy                           0.65     10000
   macro avg       0.65      0.65      0.65     10000
weighted avg       0.65      0.65      0.65     10000



In [48]:
clf3 = Pipeline([
    ("vectorizer",CountVectorizer()),
    ("mnb",MultinomialNB())
])

In [49]:
clf3.fit(X_train,y_train)

In [50]:
df_pred3 = pd.DataFrame(data = {"y_test":y_test,"y_pred":clf3.predict(X_test)})
df_pred3

Unnamed: 0,y_test,y_pred
17946,0,0
27989,1,1
37776,1,1
1325,1,1
39311,0,0
...,...,...
32678,1,1
34642,0,0
32724,0,0
26483,1,1


In [51]:
print(classification_report(df_pred3['y_test'],df_pred3['y_pred']))

              precision    recall  f1-score   support

           0       0.82      0.88      0.85      4996
           1       0.87      0.81      0.84      5004

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000



In [62]:
cv = CountVectorizer()
cv.fit(X_train)

In [64]:
cv.transform(["Family Guy"]).toarray()

array([[0, 0, 0, ..., 0, 0, 0]])

In [82]:
clf.predict(['Phil the Alien is a weird movie with bad plot and terrible sound'])

array([0])

In [78]:
df[df['sentiment_encoded']==0]

Unnamed: 0,review,sentiment_encoded
3,Basically there's a family where a little boy ...,0
7,"This show was an amazing, fresh & innovative i...",0
8,Encouraged by the positive comments about this...,0
10,Phil the Alien is one of those quirky films wh...,0
11,I saw this movie when I was about 12 when it c...,0
...,...,...
49994,This is your typical junk comedy.<br /><br />T...,0
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",0
49997,I am a Catholic taught in parochial elementary...,0
49998,I'm going to have to disagree with the previou...,0
