In [148]:
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup
from langdetect import detect_langs
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Importing Dataset

In [2]:
df_reviews = pd.read_csv('IMDB Dataset.csv')
df_reviews.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


# Exploring Dataset

In [3]:
df_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [4]:
df_reviews['sentiment'].value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

# Detecting Language 
### Detecting Language of each review so that only 'English' reviews extracted.

In [15]:
df_reviews['language'] = df_reviews['review'].apply(detect_langs)
df_reviews.head()

Unnamed: 0,review,sentiment,language
0,One of the other reviewers has mentioned that ...,positive,[en:0.9999972620757107]
1,A wonderful little production. <br /><br />The...,positive,[en:0.9999970949756234]
2,I thought this was a wonderful way to spend ti...,positive,[en:0.999995988717917]
3,Basically there's a family where a little boy ...,negative,[en:0.9999966227841428]
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,[en:0.9999957500510976]


In [18]:
df_reviews['lang'] = df_reviews['language'].apply(lambda lang: str(lang).split(':')[0][1:])

In [27]:
df_reviews.head()

Unnamed: 0,review,sentiment,language,lang
0,One of the other reviewers has mentioned that ...,positive,[en:0.9999972620757107],en
1,A wonderful little production. <br /><br />The...,positive,[en:0.9999970949756234],en
2,I thought this was a wonderful way to spend ti...,positive,[en:0.999995988717917],en
3,Basically there's a family where a little boy ...,negative,[en:0.9999966227841428],en
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,[en:0.9999957500510976],en


In [19]:
df_reviews['lang'].value_counts()

en    49999
id        1
Name: lang, dtype: int64

In [28]:
df_reviews_en = df_reviews[df_reviews['lang']=='en']
df_reviews_en = df_reviews_en[['review','sentiment']]
df_reviews_en.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


# Pre-processing Text
### Pre-processing the text by removing html tags, special characters and stopwords

In [134]:
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

def remove_special_characters(text):
    pattern=r'[^a-z0-9\s]'
    text=re.sub(pattern,'',text)
    return text

def remove_stopwords(text):
    tokens = word_tokenize(text)
    filtered_text = ' '.join([token for token in tokens if token not in english_stops])
    return filtered_text

english_stops = stopwords.words('english')

def text_preprocessing(text):
    text = strip_html(text)
    text = text.lower()
    text = remove_special_characters(text)
    text = remove_stopwords(text)
    text = text.strip()
    return text
    

df_reviews_en['pro_text'] = df_reviews_en['review'].apply(text_preprocessing)
df_reviews_en.head()

  soup = BeautifulSoup(text, "html.parser")


Unnamed: 0,review,sentiment,pro_text
0,One of the other reviewers has mentioned that ...,positive,one reviewers mentioned watching 1 oz episode ...
1,A wonderful little production. <br /><br />The...,positive,wonderful little production filming technique ...
2,I thought this was a wonderful way to spend ti...,positive,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,negative,basically theres family little boy jake thinks...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter matteis love time money visually stunni...


# Feature Selection & Splitting data for training and testing

In [144]:
X = df_reviews_en['pro_text']
y = df_reviews_en['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# LinearSVC Model

In [145]:
lin_svc_pipe = Pipeline(([('tfidf',TfidfVectorizer()),('clf',LinearSVC(dual=True))]))
lin_svc_pipe.fit(X_train,y_train)
lin_svc_pred = lin_svc_pipe.predict(X_test)
print(confusion_matrix(y_test,lin_svc_pred))
print()
print(classification_report(y_test,lin_svc_pred))

[[6664  795]
 [ 732 6809]]

              precision    recall  f1-score   support

    negative       0.90      0.89      0.90      7459
    positive       0.90      0.90      0.90      7541

    accuracy                           0.90     15000
   macro avg       0.90      0.90      0.90     15000
weighted avg       0.90      0.90      0.90     15000



# Logistic Regression Model

In [146]:
lr_pipe = Pipeline(([('tfidf',TfidfVectorizer()),('clf',LogisticRegression())]))
lr_pipe.fit(X_train,y_train)
lr_pipe_pred = lr_pipe.predict(X_test)
print(confusion_matrix(y_test,lr_pipe_pred))
print()
print(classification_report(y_test,lr_pipe_pred))

[[6641  818]
 [ 719 6822]]

              precision    recall  f1-score   support

    negative       0.90      0.89      0.90      7459
    positive       0.89      0.90      0.90      7541

    accuracy                           0.90     15000
   macro avg       0.90      0.90      0.90     15000
weighted avg       0.90      0.90      0.90     15000



# Naive Bayes Model

In [147]:
nb_pipe = Pipeline(([('tfidf',TfidfVectorizer()),('clf',MultinomialNB())]))
nb_pipe.fit(X_train,y_train)
nb_pipe_pred = nb_pipe.predict(X_test)
print(confusion_matrix(y_test,nb_pipe_pred))
print()
print(classification_report(y_test,nb_pipe_pred))

[[6614  845]
 [1129 6412]]

              precision    recall  f1-score   support

    negative       0.85      0.89      0.87      7459
    positive       0.88      0.85      0.87      7541

    accuracy                           0.87     15000
   macro avg       0.87      0.87      0.87     15000
weighted avg       0.87      0.87      0.87     15000



# Vader Lexicon

In [149]:
sia = SentimentIntensityAnalyzer()

In [151]:
df_reviews['scores'] = df_reviews['review'].apply(lambda review: sia.polarity_scores(review))
df_reviews['compound'] = df_reviews['scores'].apply(lambda score_dict: score_dict['compound'])
df_reviews['comp_score'] = df_reviews['compound'].apply(lambda c: 'positive' if c>=0 else 'negative')
df_reviews.head()

Unnamed: 0,review,sentiment,language,lang,scores,compound,comp_score
0,One of the other reviewers has mentioned that ...,positive,[en:0.9999972620757107],en,"{'neg': 0.203, 'neu': 0.748, 'pos': 0.048, 'co...",-0.9951,negative
1,A wonderful little production. <br /><br />The...,positive,[en:0.9999970949756234],en,"{'neg': 0.053, 'neu': 0.776, 'pos': 0.172, 'co...",0.9641,positive
2,I thought this was a wonderful way to spend ti...,positive,[en:0.999995988717917],en,"{'neg': 0.094, 'neu': 0.714, 'pos': 0.192, 'co...",0.9605,positive
3,Basically there's a family where a little boy ...,negative,[en:0.9999966227841428],en,"{'neg': 0.138, 'neu': 0.797, 'pos': 0.065, 'co...",-0.9213,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,[en:0.9999957500510976],en,"{'neg': 0.052, 'neu': 0.801, 'pos': 0.147, 'co...",0.9744,positive


In [152]:
print(confusion_matrix(df_reviews['sentiment'],df_reviews['comp_score']))
print()
print(classification_report(df_reviews['sentiment'],df_reviews['comp_score']))

[[13410 11590]
 [ 3597 21403]]

              precision    recall  f1-score   support

    negative       0.79      0.54      0.64     25000
    positive       0.65      0.86      0.74     25000

    accuracy                           0.70     50000
   macro avg       0.72      0.70      0.69     50000
weighted avg       0.72      0.70      0.69     50000



# Evaluating Models

In [158]:
print('Linear SVC : ',accuracy_score(y_test,lin_svc_pred))
print('Linear Regression : ',accuracy_score(y_test,lr_pipe_pred))
print('Naive Bayes : ',accuracy_score(y_test,nb_pipe_pred))
print('Vader Lexicon : ',accuracy_score(df_reviews['sentiment'],df_reviews['comp_score']))

Linear SVC :  0.8982
Linear Regression :  0.8975333333333333
Naive Bayes :  0.8684
Vader Lexicon :  0.69626
