# Import Modules

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Load Data

In [2]:
df=pd.read_csv("IMDB Dataset.csv")

# Display Data

In [3]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


# Encode the sentiment labels as integers (positive -> 1, negative -> 0)

In [4]:
# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection: the in-place form operates on an intermediate object, which
# pandas warns about and which leaves `df` unchanged under Copy-on-Write.
# The explicit astype(int) keeps the label column numeric regardless of how
# replace() handles dtype downcasting; it is a no-op if the cell is re-run.
df['sentiment'] = df['sentiment'].replace({'positive': 1, 'negative': 0}).astype(int)

In [5]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


# Remove HTML tags using RegEx

In [6]:
# Test the RegEx
clean = re.compile('<.*?>')
re.sub(clean,'',df.iloc[2].review)

'I thought this was a wonderful way to spend time on a too hot summer weekend, sitting in the air conditioned theater and watching a light-hearted comedy. The plot is simplistic, but the dialogue is witty and the characters are likable (even the well bread suspected serial killer). While some may be disappointed when they realize this is not Match Point 2: Risk Addiction, I thought it was proof that Woody Allen is still fully in control of the style many of us have grown to love.This was the most I\'d laughed at one of Woody\'s comedies in years (dare I say a decade?). While I\'ve never been impressed with Scarlet Johanson, in this she managed to tone down her "sexy" image and jumped right into a average, but spirited young woman.This may not be the crown jewel of his career, but it was wittier than "Devil Wears Prada" and more interesting than "Superman" a great comedy to go see with friends.'

In [7]:
# Function to clean HTML tags
def clean_html(text):
    """Return ``text`` with every ``<...>`` span removed.

    The pattern is non-greedy, so each tag is matched on its own and the
    text between tags is preserved. Tags are replaced by the empty string
    (no separator is inserted where a tag used to be).
    """
    tag_pattern = '<.*?>'
    return re.sub(tag_pattern, '', text)

In [8]:
# Removing HTML tags from reviews column
df['review'] = df['review'].apply(clean_html)
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. The filming tec...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


# Convert all the reviews to lowercase

In [9]:
def convert_lower(text):
    """Return a lowercased copy of the string ``text``."""
    lowered = text.lower()
    return lowered
df['review'] = df['review'].apply(convert_lower)
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,1
1,a wonderful little production. the filming tec...,1
2,i thought this was a wonderful way to spend ti...,1
3,basically there's a family where a little boy ...,0
4,"petter mattei's ""love in the time of money"" is...",1


# Remove special Characters

In [10]:
def remove_special(text):
    """Replace every non-alphanumeric character in ``text`` with a space.

    Alphanumeric characters (as defined by ``str.isalnum``) are kept as-is;
    everything else, including existing whitespace and punctuation, becomes
    a single space, so the output has the same length as the input.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        The cleaned string.
    """
    # Build the result with one join instead of repeated `x = x + t`
    # concatenation, which re-copies the growing string on every character.
    return ''.join(ch if ch.isalnum() else ' ' for ch in text)

In [11]:
df['review'] = df['review'].apply(remove_special)
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,1
1,a wonderful little production the filming tec...,1
2,i thought this was a wonderful way to spend ti...,1
3,basically there s a family where a little boy ...,0
4,petter mattei s love in the time of money is...,1


# Remove Stopwords

In [12]:
def remove_stopwords(text, stopwords=None):
    """Split ``text`` on whitespace and drop every token that is a stopword.

    Parameters
    ----------
    text : str
        The string to tokenize; it is split with ``str.split()``.
    stopwords : collection of str, optional
        Tokens to discard. When omitted, the module-level ``STOPWORDS`` set
        is used, which matches the original behaviour.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    if stopwords is None:
        stopwords = STOPWORDS
    # A single comprehension replaces the append loop and the redundant
    # copy-then-clear of the intermediate list.
    return [token for token in text.split() if token not in stopwords]

In [13]:
df['review'] = df['review'].apply(remove_stopwords)
df.head()

Unnamed: 0,review,sentiment
0,"[one, reviewers, mentioned, watching, 1, oz, e...",1
1,"[wonderful, little, production, filming, techn...",1
2,"[thought, wonderful, way, spend, time, hot, su...",1
3,"[basically, family, little, boy, jake, thinks,...",0
4,"[petter, mattei, love, time, money, visually, ...",1


In [14]:
# Join back
def join_back(list_input):
    """Concatenate the strings in ``list_input`` with single spaces between them."""
    separator = " "
    return separator.join(list_input)

In [15]:
df['review'] = df['review'].apply(join_back)
df.head()

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching 1 oz episode ...,1
1,wonderful little production filming technique ...,1
2,thought wonderful way spend time hot summer we...,1
3,basically family little boy jake thinks zombie...,0
4,petter mattei love time money visually stunnin...,1


# Vectorization 

In [18]:
# NOTE(review): CountVectorizer is already imported in the first cell, so this
# re-import is redundant; imports are easier to audit when kept in one place.
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer with the vocabulary capped at 5000 features
# (per the scikit-learn docs, the most frequent terms across the corpus are kept).
cv = CountVectorizer(max_features=5000)

In [19]:
# Learn the vocabulary from the cleaned reviews and build the count matrix.
# .toarray() converts the vectorizer's output into a dense NumPy array.
# NOTE(review): a dense reviews-by-features array is memory-hungry for a large
# corpus; the estimator used below may accept the sparse matrix directly —
# confirm before dropping .toarray().
X = cv.fit_transform(df['review']).toarray()

In [20]:
X.shape

(50000, 5000)

In [21]:
y = df.iloc[:,-1].values
y.shape

(50000,)

# Naive Bayes Algorithm

In [22]:
clf = BernoulliNB()

In [23]:
print("BernoulliNB accuracy : ", cross_val_score(clf,X,y,cv=10,scoring='accuracy').mean()*100 , ' %')

GaussianNB accuracy :  74.71  %
MultinomialNB accuracy :  84.946  %
BernoulliNB accuracy :  85.394  %


In [24]:
clf.fit(X,y)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [25]:
BernoulliNB()

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [26]:
func1 = FunctionTransformer(func = clean_html)
# Eg:
func1.transform("<p> Swades is an excellent, mindblowing movie played by Shah Rukh Khan </p>")

' Swades is an excellent, mindblowing movie played by Shah Rukh Khan '

In [27]:
func2 = FunctionTransformer(func = convert_lower)
# Eg: 
func2.transform(' Swades is an excellent, mindblowing movie played by Shah Rukh Khan ')

' swades is an excellent, mindblowing movie played by shah rukh khan '

In [28]:
func3 = FunctionTransformer(func = remove_special)
# Eg: 
func3.transform(' swades is an excellent, mindblowing movie played by shah rukh khan ')

' swades is an excellent  mindblowing movie played by shah rukh khan '

In [29]:
func4 = FunctionTransformer(func = remove_stopwords)
# Eg: 
func4.transform(' swades is an excellent  mindblowing movie played by shah rukh khan ')

['swades',
 'excellent',
 'mindblowing',
 'movie',
 'played',
 'shah',
 'rukh',
 'khan']

In [30]:
func5 = FunctionTransformer(func = join_back)
# Eg: 
func5.transform(['swade', 'excel', 'mindblow', 'movi', 'play', 'shah', 'rukh', 'khan'])


'swade excel mindblow movi play shah rukh khan'

# Building a Readily Deployable Pipeline

In [31]:
# Chain the text-cleaning transformers in the same order the cleaning functions
# were applied to the training reviews: strip HTML -> lowercase -> drop special
# characters -> drop stopwords -> join tokens back into one string.
pipe = Pipeline(
    steps=[
        ('func1', func1),
        ('func2', func2),
        ('func3', func3),
        ('func4', func4),
        ('func5', func5),
    ]
)


In [32]:
review1 = 'Bhool Bhoolaiya, Phir Hera pheri, De Dana Dan and Bhaagam Bhaag are a few of the good comedy movies played by Akshay Kumar as a lead actor'
review2 = 'Golmaal 2 is the worst movie in the entire frenchise'

In [33]:
def sentiment_analyzer(text):
    """Classify a single raw review string as positive or negative.

    The review is cleaned with the module-level ``pipe``, vectorized with the
    fitted ``cv`` and scored with the fitted classifier ``clf``.

    Parameters
    ----------
    text : str
        The raw review text.

    Returns
    -------
    str
        ``'the review is negative'`` when the predicted label is 0,
        otherwise ``'the review is positive'``.
    """
    cleaned_review = pipe.transform(text)
    # cv.transform expects an iterable of documents, so wrap the single
    # cleaned review in a list and take the first (only) prediction.
    # The original code called `clf3`, which is never defined in this
    # notebook and fails on a fresh kernel; `clf` is the model fitted above.
    predicted_label = clf.predict(cv.transform([cleaned_review]))[0]

    if predicted_label == 0:
        return 'the review is negative'
    return 'the review is positive'

In [34]:
sentiment_analyzer(review1)

'the review is positive'

In [35]:
sentiment_analyzer(review2)

'the review is negative'