# Movie review classifier


Importing libraries

In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

Importing data

In [6]:
# Load the IMDB reviews CSV; the relative path is resolved against the notebook's working directory.
df=pd.read_csv("IMDB Dataset.csv")

# Show the loaded frame as the cell output.
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [7]:
# Encode the sentiment label numerically: "positive" -> 1, anything else -> 0.
df["category"] = df["sentiment"].eq("positive").astype("int64")
df["category"]

0        1
1        1
2        1
3        0
4        1
        ..
49995    1
49996    0
49997    0
49998    0
49999    0
Name: category, Length: 50000, dtype: int64

Preprocessing the data by removing stop words and punctuation, and converting the remaining words to their lemmas

In [8]:
import spacy

# Small English spaCy pipeline; it supplies the stop-word / punctuation flags
# and the lemmas that preprocess() relies on.
nlp = spacy.load("en_core_web_sm")


def preprocess(text):
    """Return `text` reduced to a space-separated string of lemmas.

    Tokens flagged by spaCy as stop words or punctuation are dropped; every
    remaining token is replaced by its lemma, and the lemmas are joined with
    single spaces.
    """
    kept_lemmas = [
        token.lemma_
        for token in nlp(text)
        if not (token.is_stop or token.is_punct)
    ]
    return " ".join(kept_lemmas)


Adding a new column that holds the preprocessed version of each review

In [9]:
# Run every raw review through preprocess() and keep the result next to the original text.
df["preprocessed_review"] = df["review"].map(preprocess)
df

Unnamed: 0,review,sentiment,category,preprocessed_review
0,One of the other reviewers has mentioned that ...,positive,1,reviewer mention watch 1 oz episode hook right...
1,A wonderful little production. <br /><br />The...,positive,1,wonderful little production < br /><br />the f...
2,I thought this was a wonderful way to spend ti...,positive,1,think wonderful way spend time hot summer week...
3,Basically there's a family where a little boy ...,negative,0,basically family little boy Jake think zombie ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1,Petter Mattei love Time money visually stunnin...
...,...,...,...,...
49995,I thought this movie did a down right good job...,positive,1,think movie right good job creative original e...
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative,0,bad plot bad dialogue bad act idiotic directin...
49997,I am a Catholic taught in parochial elementary...,negative,0,Catholic teach parochial elementary school nun...
49998,I'm going to have to disagree with the previou...,negative,0,go disagree previous comment Maltin second rat...


Splitting the data into train and test sets with a test size of 0.2

In [10]:
# Hold out 20% of the reviews for testing. Stratifying on the label preserves
# the class proportions in both splits, and the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    df["preprocessed_review"],
    df["category"],
    test_size=0.2,
    stratify=df["category"],
    random_state=23,
)

In [11]:
# Size of the training split produced by train_test_split.
x_train.shape

(40000,)

In [12]:
# Class balance of the training labels (0 = negative, 1 = positive).
y_train.value_counts()

0    20000
1    20000
Name: category, dtype: int64

In [13]:
# Class balance of the test labels (0 = negative, 1 = positive).
y_test.value_counts()

0    5000
1    5000
Name: category, dtype: int64

In [14]:
# Size of the held-out test split.
x_test.shape

(10000,)

Creating a pipeline that combines CountVectorizer (bag of words) with MultinomialNB.
The pipeline is then trained on the training set and used to predict the test set.
Finally, the classification_report is printed for model analysis.

In [15]:
# Baseline model: raw bag-of-words counts fed into Multinomial Naive Bayes.
clf_1 = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('Multi NB', MultinomialNB()),
])

# Learn the vocabulary and the classifier from the training split only.
clf_1.fit(x_train, y_train)

# Predict a label for every held-out review.
y_pred = clf_1.predict(x_test)

# Per-class precision, recall and F1 on the test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.88      0.86      5000
           1       0.87      0.83      0.85      5000

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000



Creating a pipeline that combines CountVectorizer (using the ngram_range parameter, trigrams only) with RandomForestClassifier.
The pipeline is then trained on the training set and used to predict the test set.
Finally, the classification_report is printed for model analysis.

In [16]:

# Trigram-only bag-of-words features fed into a random forest.
clf_2 = Pipeline([
    # ngram_range=(3, 3): only sequences of exactly three words become features.
    ('vectorizer_n_grams', CountVectorizer(ngram_range=(3, 3))),
    # A random forest is stochastic; pinning the seed (same value as the
    # train/test split) makes the report below reproducible on a fresh run.
    ('random_forest', RandomForestClassifier(random_state=23)),
])

# Fit the vectorizer and the forest on the training split only.
clf_2.fit(x_train, y_train)

# Predict a label for every held-out review.
y_pred = clf_2.predict(x_test)

# Per-class precision, recall and F1 on the test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.43      0.56      5000
           1       0.61      0.90      0.73      5000

    accuracy                           0.66     10000
   macro avg       0.71      0.66      0.65     10000
weighted avg       0.71      0.66      0.65     10000



Creating a pipeline that combines TfidfVectorizer with RandomForestClassifier.
The pipeline is then trained on the training set and used to predict the test set.
Finally, the classification_report is printed for model analysis.

In [19]:
# TF-IDF features fed into a 200-tree random forest.
clf_3 = Pipeline([
    ("vectorizer", TfidfVectorizer()),
    # A random forest is stochastic; pinning the seed (same value as the
    # train/test split) keeps this model, and any prediction made with it,
    # reproducible on a fresh run.
    ("classifer", RandomForestClassifier(n_estimators=200, criterion='gini', random_state=23)),
])

# Fit the vectorizer and the forest on the training split only.
clf_3.fit(x_train, y_train)

# Predict a label for every held-out review.
y_preds = clf_3.predict(x_test)

# Per-class precision, recall and F1 on the test split.
print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

           0       0.86      0.86      0.86      5000
           1       0.86      0.86      0.86      5000

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000



In [20]:

# TF-IDF features with a 3-nearest-neighbours classifier.
clf_4 = Pipeline(steps=[
    ("vectorizer", TfidfVectorizer()),
    ("classifer", KNeighborsClassifier(n_neighbors=3)),
])

# Fit on the training split only.
clf_4.fit(x_train, y_train)

# Predict a label for every held-out review.
y_preds = clf_4.predict(x_test)

# Per-class precision, recall and F1 on the test split.
print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

           0       0.80      0.72      0.76      5000
           1       0.75      0.82      0.78      5000

    accuracy                           0.77     10000
   macro avg       0.77      0.77      0.77     10000
weighted avg       0.77      0.77      0.77     10000



In [21]:
# Two hand-written reviews used as a quick sanity check of a trained pipeline.
review = [
    "that movie was amazing i really liked it but kind of average ",
    "this is a disaster no one should watch this it would be their waste fo time.",
]


An output of 1 means the review is POSITIVE, and an output of 0 means it is NEGATIVE.

In [25]:
# Classify the sample reviews with the TF-IDF + random forest pipeline (clf_3).
clf_3.predict(review)

array([1, 0], dtype=int64)