In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [40]:
import warnings
# NOTE(review): with no category or module given, this hides every warning
# raised for the rest of the session, including any emitted while the models
# are fitted. Consider narrowing the filter to the specific warning category
# that was noisy.
warnings.filterwarnings('ignore')

In [2]:
# Load the review data set; the path is relative to the notebook's working directory.
df = pd.read_csv('IMDB Dataset.csv')

In [3]:
# Preview the first rows to confirm the `review` and `sentiment` columns loaded.
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
# Summary statistics for the two object columns (count / unique / top / freq).
# NOTE(review): the recorded output reports 49,582 unique reviews among 50,000
# rows, so some reviews appear more than once. Duplicates can end up on both
# sides of a train/test split -- consider de-duplicating before splitting.
df.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [5]:
# Column dtypes, non-null counts and memory footprint of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [12]:
# Count missing values per column.
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [14]:
# Collect the index labels of reviews that are empty or contain only
# whitespace. An empty list means every review has real text.
#
# `not text.strip()` is used instead of `text.isspace()` because
# `''.isspace()` is False, so zero-length reviews would otherwise be missed.
# Iterating over the `review` column alone (rather than unpacking whole rows)
# keeps the check working if the frame gains or loses other columns.
# Non-string values (e.g. NaN) are skipped, as before.

blanks = [
    idx
    for idx, text in df['review'].items()
    if isinstance(text, str) and not text.strip()
]

blanks

[]

In [16]:
# Separate the model input (raw review text) from the target (sentiment label).

X, y = df['review'], df['sentiment']

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
# Hold out 33% of the reviews for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [19]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
import spacy

In [23]:
# Load spaCy's small English pipeline.
# NOTE(review): in the cells visible here it is only used to inspect the
# default stop-word list -- confirm whether the full pipeline is needed.
nlp = spacy.load('en_core_web_sm')

In [26]:
# Show spaCy's default English stop words as a reference for hand-picking a
# smaller list. NOTE(review): this prints the entire set in one line; a sorted
# sample would be easier to read.
print(nlp.Defaults.stop_words)

{'does', 'fifty', 'myself', 'there', 'they', 'amongst', 'these', 'am', 'cannot', 'almost', 'themselves', '‘s', 'after', 'seems', 'about', '’re', 'neither', 'upon', 'within', 'afterwards', 'noone', 'made', 'any', 'most', 'how', 'another', 'anyone', 'thereby', 'from', 'this', 'have', 'no', 'else', 'may', 'has', 'thereafter', 'had', 'alone', 'moreover', 'put', 'enough', 'himself', 'an', 'that', '’m', 'above', 'before', 'below', 'first', "'d", 'less', 'must', 'someone', 'done', 'was', 'throughout', 'then', 'say', 'elsewhere', 'its', 'front', 'herself', 'hers', 'really', 'via', 'get', 'her', 'me', 'everything', 'once', 'somewhere', 'also', 'some', 'thru', 'sixty', 'whither', 'except', 'why', 'did', 'former', 'whoever', 'wherein', 'across', '’d', 'per', 'hereby', 'just', '’ve', 'under', 'because', 'whence', 'often', 'one', 'whereas', 'can', 'among', 'four', 'against', 'make', 'due', 'are', 'namely', "'m", 'several', 'anything', 'few', 'fifteen', 'still', 'beforehand', 'amount', 'formerly', '

In [27]:
# We do not use every available stop word because some of them may carry
# sentiment in a review. This hand-picked list keeps only neutral function
# words; note that negations such as 'not' and 'no' are not in it.

stopwords = [
    'a', 'about', 'an', 'and', 'are', 'as', 'at',
    'be', 'been', 'but', 'by',
    'can',
    'even', 'ever',
    'for', 'from',
    'get',
    'had', 'has', 'have', 'he', 'her', 'hers', 'his', 'how',
    'i', 'if', 'in', 'into', 'is', 'it', 'its',
    'just',
    'me', 'my',
    'of', 'on', 'or',
    'see', 'seen', 'she', 'so',
    'than', 'that', 'the', 'their', 'there', 'they', 'this', 'to',
    'was', 'we', 'were', 'what', 'when', 'which', 'who', 'will', 'with',
    'you',
]

In [31]:
# Build two pipelines, one per model. Both share the same TF-IDF front end
# (with the hand-picked stop-word list) and differ only in the final estimator.
# The vectorizer step keeps its original name 'clf' so any code that looks the
# step up by name keeps working.

# Naive Bayes model:
tfidf_nb = Pipeline([
    ('clf', TfidfVectorizer(stop_words=stopwords)),
    ('nb', MultinomialNB()),
])

# Linear SVC model:
# LinearSVC's solver can draw on a random number generator, so it is seeded
# with the same value as the train/test split to make re-runs reproducible.
tfidf_lsvc = Pipeline([
    ('clf', TfidfVectorizer(stop_words=stopwords)),
    ('lsvc', LinearSVC(random_state=42)),
])

# Fit (train) both pipelines on the training split:

tfidf_nb.fit(X_train, y_train)
tfidf_lsvc.fit(X_train, y_train)



In [32]:
# Score the held-out reviews with each fitted pipeline.

predictions_lsvc = tfidf_lsvc.predict(X_test)

prediction_nb = tfidf_nb.predict(X_test)

In [33]:
# Now to evaluate these models
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [34]:
# Naive Bayes evaluation: per-class report, confusion matrix, then accuracy.
# `sep` and `end` reproduce the blank lines between and after the two printed
# sections; the accuracy is left as the cell's last expression so it is
# displayed as the cell result.

print(
    classification_report(y_test, prediction_nb),
    confusion_matrix(y_test, prediction_nb),
    sep='\n\n\n',
    end='\n\n\n',
)
accuracy_score(y_test, prediction_nb)

              precision    recall  f1-score   support

    negative       0.85      0.88      0.87      8208
    positive       0.88      0.84      0.86      8292

    accuracy                           0.86     16500
   macro avg       0.86      0.86      0.86     16500
weighted avg       0.86      0.86      0.86     16500



[[7263  945]
 [1302 6990]]




0.8638181818181818

In [35]:
# Linear SVC evaluation: per-class report, confusion matrix, then accuracy.
# In the recorded run it scores about 0.90 accuracy versus about 0.86 for the
# Naive Bayes model, so it did the better job here.
# `sep` and `end` reproduce the blank lines between and after the two printed
# sections; the accuracy is left as the cell's last expression.

print(
    classification_report(y_test, predictions_lsvc),
    confusion_matrix(y_test, predictions_lsvc),
    sep='\n\n\n',
    end='\n\n\n',
)
accuracy_score(y_test, predictions_lsvc)

              precision    recall  f1-score   support

    negative       0.91      0.89      0.90      8208
    positive       0.89      0.91      0.90      8292

    accuracy                           0.90     16500
   macro avg       0.90      0.90      0.90     16500
weighted avg       0.90      0.90      0.90     16500



[[7296  912]
 [ 760 7532]]




0.8986666666666666

In [36]:
# Let's try both models on a hand-written review. The text is kept exactly as
# originally typed, because it is the input the recorded predictions refer to.

review = (
    'A movie had a good start and was overall good untill it completely '
    'deteriorated on the middle and had a really bad ending for my taste'
)

In [37]:
# Naive Bayes prediction for the hand-written review; the review is wrapped in
# a list so the pipeline receives a collection containing one document.
print(tfidf_nb.predict([review]))

['negative']


In [39]:
# Linear SVC prediction for the same review; the review is wrapped in a list
# so the pipeline receives a collection containing one document.
print(tfidf_lsvc.predict([review]))

['negative']


In [None]:
# Feel free to try it yourself