In [29]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from  sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report,accuracy_score

In [30]:
df = pd.read_csv("IMDB Dataset.csv")

In [3]:
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [4]:
df.shape

(50000, 2)

In [5]:
df.columns.tolist()

['review', 'sentiment']

In [6]:
df['positive'] = df['sentiment'].apply(lambda x : 1 if x == 'positive' else 0)

In [7]:
df

Unnamed: 0,review,sentiment,positive
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1
3,Basically there's a family where a little boy ...,negative,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1
...,...,...,...
49995,I thought this movie did a down right good job...,positive,1
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative,0
49997,I am a Catholic taught in parochial elementary...,negative,0
49998,I'm going to have to disagree with the previou...,negative,0


In [8]:
df[:5]

Unnamed: 0,review,sentiment,positive
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1
3,Basically there's a family where a little boy ...,negative,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1


In [9]:
df.sentiment.value_counts()
# balanced dataset

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [10]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(df.review,df.positive,test_size=0.2)

In [11]:
X_train[:7]

45635    For the first fifteen minutes the story of NAK...
45507    I just did not enjoy this film. But then I lov...
28590    Travolta and Thurman deserved a better movie. ...
23104    One of Disney's best films that I can enjoy wa...
47688    This might quite possibly be the worst movie I...
22381    The writer came up with a pretty decent idea f...
13618    Two sisters, Su-mi (IM Soo-jung) and Su-yeon (...
Name: review, dtype: object

In [12]:
y_train[:7]

45635    0
45507    0
28590    0
23104    1
47688    0
22381    0
13618    1
Name: positive, dtype: int64

In [13]:
# using Random Classifier algorithm 

from sklearn.pipeline import Pipeline
clf = Pipeline([
    ('vectorizer',CountVectorizer()),
    ('randomclassifier',RandomForestClassifier(n_estimators=50,criterion='entropy',random_state=42))
])
    

In [14]:
clf.fit(X_train,y_train)

In [31]:
y_pred = clf.predict(X_test)
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.83      0.85      0.84      4932
           1       0.85      0.83      0.84      5068

    accuracy                           0.84     10000
   macro avg       0.84      0.84      0.84     10000
weighted avg       0.84      0.84      0.84     10000

0.841


In [22]:
#using k nearest neighbour algorithm
pdf = Pipeline([
    ('vectorizer',CountVectorizer()),
    ('knn',KNeighborsClassifier(n_neighbors=10,metric='euclidean'))
])
    

In [23]:
pdf.fit(X_train,y_train)

In [35]:
y_pred = pdf.predict(X_test)
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.65      0.68      0.66      4932
           1       0.67      0.64      0.66      5068

    accuracy                           0.66     10000
   macro avg       0.66      0.66      0.66     10000
weighted avg       0.66      0.66      0.66     10000

0.6598


In [25]:
#using naive bayes multinomialnb
jkf = Pipeline([
    ('vectorizer',CountVectorizer()),
    ('multinomialnb',MultinomialNB())
])
    

In [26]:
jkf.fit(X_train,y_train)

In [32]:
y_pred = jkf.predict(X_test)
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.83      0.88      0.85      4932
           1       0.87      0.82      0.85      5068

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000

0.8507


In [38]:
#accuracy of different algorithms
#RandomForestClassifier - 0.84
#KNeighboursClassifier  - 0.65(fails)
#MultinomialNB          - 0.85

#reasons might be:
#K-Nearest Neighbors is sensitive to the curse of dimensionality, meaning that its performance might degrade as the number of features increases