In [1]:
# importing libraries
import numpy as np

import pandas as pd

import matplotlib.pyplot as plt

import seaborn as sns 

%matplotlib inline

In [2]:
# importing some NLP libraries

import nltk

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.preprocessing import LabelBinarizer

from nltk.corpus import stopwords

from nltk.stem.porter import PorterStemmer

from wordcloud import WordCloud, STOPWORDS

from nltk.stem import WordNetLemmatizer

from nltk.tokenize import word_tokenize,sent_tokenize

from bs4 import BeautifulSoup

import spacy

import re, string, unicodedata

from nltk.tokenize.toktok import ToktokTokenizer

from nltk.stem import LancasterStemmer, WordNetLemmatizer

from sklearn.linear_model import LogisticRegression, SGDClassifier

from sklearn.naive_bayes import MultinomialNB

from sklearn.svm import SVC

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [3]:
%%HTML
<style type="text/css">
table.dataframe td, table.dataframe th 
{
    border: 2px  black solid !important;  color: black !important;
}
</style>

In [4]:
# Code to display all the columns in the dataset
pd.set_option('display.max_columns', None)

In [5]:
import warnings
warnings.filterwarnings('ignore')

In [6]:
df = pd.read_csv("IMDB-Dataset 1.csv")

In [7]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [8]:
df.shape

(24999, 2)

In [9]:
# installing text blob
#!pip install textblob

In [10]:
from textblob import TextBlob

from textblob import Word

#### Exploratery data analysis

In [11]:
df.describe()

Unnamed: 0,review,sentiment
count,24999,24999
unique,24896,2
top,Loved today's show!!! It was a variety and not...,negative
freq,4,12525


In [12]:
# Checking data count for review
df["sentiment"].value_counts()

negative    12525
positive    12474
Name: sentiment, dtype: int64

In [13]:
# Balanced dataset

In [14]:
# splitting review and sentiment columns into seprate dataframes
df_train_review = df["review"]

df_train_sentiment = df["sentiment"]

In [15]:
print("Train Review shape ====>",df_train_review.shape)
print("Train Sentiment shape ====>",df_train_sentiment.shape)

Train Review shape ====> (24999,)
Train Sentiment shape ====> (24999,)


In [16]:
# no null values

In [17]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [18]:
# Tokenization of text

tokenizer = ToktokTokenizer()

In [19]:
# setting stopwords

stopwords_list = nltk.corpus.stopwords.words("english")

In [20]:
# We need to remove html from the text

def remove_html_tags(text):
    
    soup = BeautifulSoup(text, "html.parser")
    
    return soup.get_text()

In [21]:
# Removing brackets 

def remove_sq_brackets(text):
    
    return re.sub('\[[^]]*\]', '',text)

In [22]:
# removing noisy text

def denoise_text(text):
    
    text = remove_html_tags(text)
    
    text = remove_sq_brackets(text)
    
    return text

In [23]:
# applying function on review column
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [24]:
df["review"] = df["review"].apply(denoise_text) 

In [25]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming tec...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [26]:
# Function to remove special characters

def remove_special_char(text, remove_digits = True):
    
    pattern = r'[^a-zA-z0-9\s]'
    
    text = re.sub(pattern,"",text)
    
    return text

In [27]:
# APplying the function
df["review"] = df["review"].apply(remove_special_char)

In [28]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production The filming tech...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically theres a family where a little boy J...,negative
4,Petter Matteis Love in the Time of Money is a ...,positive


In [29]:
# Stemming the text
def simple_seaamer(text):
    
    # using porter stemmer
    
    ps = nltk.porter.PorterStemmer()
    
    text = " ".join([ps.stem(word) for word in text.split()])
    
    return text

In [30]:
# applying stemmer on review column
df["review"] = df["review"].apply(simple_seaamer)

In [31]:
df.head()

Unnamed: 0,review,sentiment
0,one of the other review ha mention that after ...,positive
1,A wonder littl product the film techniqu is ve...,positive
2,I thought thi wa a wonder way to spend time on...,positive
3,basic there a famili where a littl boy jake th...,negative
4,petter mattei love in the time of money is a v...,positive


In [32]:
# Stop words
stop = set(stopwords.words("english"))

In [33]:
print(stop)

{"shan't", 'isn', 'under', 'those', 'down', 'what', 'but', 'before', 'then', "aren't", "doesn't", 'wasn', 'between', 'who', 'not', 'she', 'here', 'd', 'in', 'so', "isn't", 'our', 'as', 'has', 'how', "you'd", 'we', "she's", 'theirs', 'y', 'any', "mightn't", 'weren', 'this', 'herself', 'did', 'you', 'should', 'through', 'wouldn', 'against', "you'll", 'by', 'off', "it's", "wasn't", 'each', 'doesn', 'there', 'them', 'am', 'won', 'their', 'have', 'shan', 'was', 'haven', 'a', 'ourselves', 'themselves', 'aren', 'my', 'or', 'myself', 'with', 'its', 'above', 'from', 'other', 'and', 'of', "hasn't", 'where', 'which', 'further', 'll', "couldn't", 'more', 'these', 'ours', 'all', 'shouldn', 'after', 'hadn', 'couldn', 'mightn', 'don', "you've", 'why', 'needn', 'too', "that'll", "didn't", 'ain', "don't", 'no', 'into', "mustn't", 'both', "hadn't", 'they', 'been', 'few', 'him', 'same', 'm', 'himself', 'up', 'than', 're', 'do', 'for', 't', 'nor', 'ma', 'will', 'very', 'the', 'had', 'because', 'is', 've',

In [34]:
stopword_list = nltk.corpus.stopwords.words("english")

In [35]:
# removing stop words

def remove_stopwords(text, is_lower_case=False):
    
    tokens = tokenizer.tokenize(text)
    
    tokens = [token.strip() for token in tokens]
    
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    
    filtered_text = ' '.join(filtered_tokens)    
    
    return filtered_text

In [36]:
# removing stopwords
df["review"] = df["review"].apply(remove_stopwords)

In [37]:
df.head()

Unnamed: 0,review,sentiment
0,one review ha mention watch 1 Oz episod youll ...,positive
1,wonder littl product film techniqu veri unassu...,positive
2,thought thi wa wonder way spend time hot summe...,positive
3,basic famili littl boy jake think zombi hi clo...,negative
4,petter mattei love time money visual stun film...,positive


In [38]:
df["review"][0]

'one review ha mention watch 1 Oz episod youll hook right thi exactli happen meth first thing struck Oz wa brutal unflinch scene violenc set right word GO trust thi show faint heart timid thi show pull punch regard drug sex violenc hardcor classic use wordit call OZ nicknam given oswald maximum secur state penitentari focus mainli emerald citi experiment section prison cell glass front face inward privaci high agenda Em citi home manyaryan muslim gangsta latino christian italian irish moreso scuffl death stare dodgi deal shadi agreement never far awayi would say main appeal show due fact goe show wouldnt dare forget pretti pictur paint mainstream audienc forget charm forget romanceoz doesnt mess around first episod ever saw struck nasti wa surreal couldnt say wa readi watch develop tast Oz got accustom high level graphic violenc violenc injustic crook guard wholl sold nickel inmat wholl kill order get away well manner middl class inmat turn prison bitch due lack street skill prison exp

In [39]:
# Using Bag of words model

In [40]:
# It is used to convert text documents to numerical vectors or bag of words.

In [41]:
df.head()

Unnamed: 0,review,sentiment
0,one review ha mention watch 1 Oz episod youll ...,positive
1,wonder littl product film techniqu veri unassu...,positive
2,thought thi wa wonder way spend time hot summe...,positive
3,basic famili littl boy jake think zombi hi clo...,negative
4,petter mattei love time money visual stun film...,positive


In [42]:
df.shape

(24999, 2)

In [45]:
# we need to seprate the train and test data 
train_df = df.iloc[:20000,:]

In [46]:
train_df.head()

Unnamed: 0,review,sentiment
0,one review ha mention watch 1 Oz episod youll ...,positive
1,wonder littl product film techniqu veri unassu...,positive
2,thought thi wa wonder way spend time hot summe...,positive
3,basic famili littl boy jake think zombi hi clo...,negative
4,petter mattei love time money visual stun film...,positive


In [47]:
train_df.shape

(20000, 2)

In [54]:
# test dataframe
test_df= df.iloc[20000:,:]

In [55]:
test_df.head()

Unnamed: 0,review,sentiment
20000,huge fan northern exposur men tree complet kno...,negative
20001,watch thi movi starz let go thing thought coul...,negative
20002,stori hare rama hare krishna actual came dev a...,positive
20003,oddli enough fred macmurray play screwi part t...,negative
20004,fan horror comedi like thi one might get quick...,negative


In [56]:
test_df.shape

(4999, 2)

In [57]:
train_df.shape

(20000, 2)

In [58]:
test_df.shape

(4999, 2)

In [59]:
#Count vectorizer for bag of words

cv=CountVectorizer(min_df=0,max_df=1,binary=False,ngram_range=(1,3))

In [60]:
#transformed train reviews

cv_train_reviews=cv.fit_transform(train_df["review"])

In [61]:
#transformed test reviews

cv_test_reviews=cv.transform(test_df["review"])

In [62]:
print('Bag of Words cv train:',cv_train_reviews.shape)
print("\n")
print('Bag of Words cv test:',cv_test_reviews.shape)

Bag of Words cv train: (20000, 3378380)


Bag of Words cv test: (4999, 3378380)


### Term Frequency-Inverse Document Frequency model (TFIDF)

It is used to convert text documents to matrix of tfidf features.

In [63]:
#Tfidf vectorizer

tv=TfidfVectorizer(min_df=0,max_df=1,use_idf=True,ngram_range=(1,3))

In [64]:
#transformed train reviews

tv_train_reviews=tv.fit_transform(train_df["review"])

In [65]:
#transformed test reviews

tv_test_reviews=tv.transform(test_df["review"])

In [66]:
print('Tfidf_train:',tv_train_reviews.shape)
print("\n")
print('Tfidf_test:',tv_test_reviews.shape)

Tfidf_train: (20000, 3378380)


Tfidf_test: (4999, 3378380)


In [67]:
#labeling the sentient data

lb=LabelBinarizer()

In [69]:
#transformed sentiment data
sentiment_data=lb.fit_transform(df['sentiment'])

print(sentiment_data.shape)

(24999, 1)


In [71]:
sentiment_data

array([[1],
       [1],
       [1],
       ...,
       [0],
       [1],
       [1]])

In [73]:
#Spliting the sentiment data
train_sentiments=sentiment_data[:20000]

test_sentiments=sentiment_data[20000:]

print(train_sentiments.shape)
print(test_sentiments.shape)

(20000, 1)
(4999, 1)


In [74]:
#training the model
Lr = LogisticRegression(penalty='l2',max_iter=500,C=1,random_state=42)

In [75]:
#Fitting the model for Bag of words

Lr_bow= Lr.fit(cv_train_reviews,train_sentiments)

In [76]:
Lr_bow

LogisticRegression(C=1, max_iter=500, random_state=42)

In [77]:
Lr_bow.score(cv_train_reviews,train_sentiments)

0.9977

In [78]:
#Fitting the model for tfidf features

Lr_tfidf= Lr.fit(tv_train_reviews,train_sentiments)

In [79]:
print(Lr_tfidf)

LogisticRegression(C=1, max_iter=500, random_state=42)


In [80]:
Lr_tfidf.score(tv_train_reviews,train_sentiments)

0.9977

In [81]:
#Predicting the model for bag of words

lr_bow_predict=Lr.predict(cv_test_reviews)

In [82]:
print(lr_bow_predict)

[0 0 1 ... 1 1 1]


In [83]:
##Predicting the model for tfidf features

lr_tfidf_predict= Lr.predict(tv_test_reviews)

In [84]:
print(lr_tfidf_predict)

[0 0 1 ... 0 1 1]


### Accuracy of the model

In [86]:
#Accuracy score for bag of words

lr_bow_score=accuracy_score(test_sentiments,lr_bow_predict)

print("lr_bow_score :",lr_bow_score)

lr_bow_score : 0.7295459091818364


In [87]:
#Accuracy score for tfidf features

lr_tfidf_score=accuracy_score(test_sentiments,lr_tfidf_predict)

print("lr_tfidf_score :",lr_tfidf_score)

lr_tfidf_score : 0.7065413082616523


In [89]:
#Classification report for bag of words 
lr_bow_report=classification_report(test_sentiments,lr_bow_predict,target_names=['Positive','Negative'])
print(lr_bow_report)

              precision    recall  f1-score   support

    Positive       0.70      0.78      0.74      2428
    Negative       0.77      0.68      0.72      2571

    accuracy                           0.73      4999
   macro avg       0.73      0.73      0.73      4999
weighted avg       0.73      0.73      0.73      4999



In [90]:
#Classification report for tfidf features
lr_tfidf_report=classification_report(test_sentiments,lr_tfidf_predict,target_names=['Positive','Negative'])
print(lr_tfidf_report)

              precision    recall  f1-score   support

    Positive       0.64      0.89      0.75      2428
    Negative       0.84      0.53      0.65      2571

    accuracy                           0.71      4999
   macro avg       0.74      0.71      0.70      4999
weighted avg       0.74      0.71      0.70      4999



In [91]:
#confusion matrix for bag of words
cm_bow=confusion_matrix(test_sentiments,lr_bow_predict,labels=[1,0])
print(cm_bow)

[[1747  824]
 [ 528 1900]]


In [92]:
#confusion matrix for tfidf features
cm_tfidf=confusion_matrix(test_sentiments,lr_tfidf_predict,labels=[1,0])
print(cm_tfidf)

[[1375 1196]
 [ 271 2157]]
