In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re #
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Loading Dataset
df = pd.read_csv('IMDB Dataset.csv')
# Shuffle (and cap at 50000 rows) with a fixed seed so that the row order --
# and every result derived from it -- is the same on each Restart & Run All.
# min() keeps the call valid if the CSV holds fewer than 50000 rows.
df = df.sample(n=min(50000, len(df)), random_state=9)

In [None]:
df.head()

Unnamed: 0,review,sentiment
8352,"Did people expect ""Jurassic Park 3"" to be full...",negative
35758,This movie deviated from the Bible and fell so...,negative
15671,"I'm not a big fan of musicals, although this t...",negative
21554,The people who bash this movie were looking fo...,negative
43423,Obnoxious Eva Longoria dies on her wedding day...,negative


Text Cleaning:
- Removing HTML tags
- Removing special characters
- Converting uppercase to lowercase
- Removing stop words (e.g., and, is, the)
- Stemming words (e.g., play, playing and played are reduced to the same stem)

In [None]:
df.shape

(50000, 2)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 50000 entries, 8352 to 37410
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 1.1+ MB


In [None]:
# Encode the labels as integers: positive -> 1, negative -> 0.
# The result is assigned back to the column rather than calling
# replace(..., inplace=True) on `df.sentiment`: that chained in-place call
# operates on a temporary Series and leaves `df` untouched when pandas
# Copy-on-Write is active.
# astype(int) guarantees an integer column and fails loudly if any label
# other than 'positive'/'negative' is present.
df['sentiment'] = df['sentiment'].replace({'positive': 1, 'negative': 0}).astype(int)

In [None]:
df.head()

Unnamed: 0,review,sentiment
8352,"Did people expect ""Jurassic Park 3"" to be full...",0
35758,This movie deviated from the Bible and fell so...,0
15671,"I'm not a big fan of musicals, although this t...",0
21554,The people who bash this movie were looking fo...,0
43423,Obnoxious Eva Longoria dies on her wedding day...,0


In [None]:
# Function to clean html tags
def clean_html(text):
    """Strip HTML-style tags from ``text``.

    Each ``<...>`` span is replaced with a single space instead of being
    deleted outright, so that words separated only by markup
    (e.g. ``great<br /><br />movie``) are not fused into one token.
    The extra whitespace is harmless because the text is tokenized later.

    Args:
        text: Raw review string that may contain markup.

    Returns:
        The string with every tag replaced by one space.
    """
    return re.sub(r'<.*?>', ' ', text)
# Strip markup from every review, overwriting the `review` column.
df['review'] = df['review'].apply(clean_html)

In [None]:
# Function to replace special (non-alphanumeric) characters with spaces
def remove_special(text):
    """Replace every non-alphanumeric character in ``text`` with a space.

    Characters for which ``str.isalnum()`` is true are kept unchanged; all
    others (punctuation, symbols, existing whitespace) become a single space
    each, so the output has the same length as the input.

    Args:
        text: The string to sanitise.

    Returns:
        A new string containing only alphanumeric characters and spaces.
    """
    # One join over a generator avoids repeated string concatenation.
    return ''.join(ch if ch.isalnum() else ' ' for ch in text)

# Blank out punctuation and symbols in every review.
df['review'] = df['review'].apply(remove_special)

In [None]:
# Lowercasing helper
def to_lower(text):
    """Return a copy of ``text`` with all cased characters lowercased."""
    lowered = text.lower()
    return lowered

# Normalise the case of every review.
df['review'] = df['review'].apply(to_lower)

In [None]:
df['review'][49997]

'i am a catholic taught in parochial elementary schools by nuns  taught by jesuit priests in high school   college  i am still a practicing catholic but would not be considered a  good catholic  in the church s eyes because i don t believe certain things or act certain ways just because the church tells me to so back to the movie   its bad because two people are killed by this nun who is supposed to be a satire as the embodiment of a female religious figurehead  there is no comedy in that and the satire is not done well by the over acting of diane keaton  i never saw the play but if it was very different from this movies then it may be good at first i thought the gun might be a fake and the first shooting all a plan by the female lead of the four former students as an attempt to demonstrate sister mary s emotional and intellectual bigotry of faith  but it turns out the bullets were real and the story has tragedy   the tragedy of loss of life  besides the two former students   the lives

In [None]:
# Function to remove stopwords
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them once."""
    if 'english' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORD_CACHE['english']


def remove_stopwords(text):
    """Tokenize ``text`` and drop English stop words.

    The stop-word set is built on the first call and reused afterwards,
    instead of being rebuilt from the corpus for every single review.

    Args:
        text: The string to tokenize with ``word_tokenize``.

    Returns:
        A list of the tokens that are not in NLTK's English stop-word list,
        in their original order.
    """
    stop_words = _english_stop_words()
    return [w for w in word_tokenize(text) if w not in stop_words]

# Tokenize each review and filter out stop words; the column now holds token lists.
df['review'] = df['review'].apply(remove_stopwords)

In [None]:
df.head()

Unnamed: 0,review,sentiment
8352,"[people, expect, jurassic, park, 3, full, surp...",0
35758,"[movie, deviated, bible, fell, bar, 1956, movi...",0
15671,"[big, fan, musicals, although, technically, mi...",0
21554,"[people, bash, movie, looking, cool, slick, fi...",0
43423,"[obnoxious, eva, longoria, dies, wedding, day,...",0


In [None]:
# Perform stemming
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [None]:
def stem_words(text, stemmer=None):
    """Stem every token in ``text``.

    The result is built in a local list. The function no longer borrows a
    module-level ``y`` as scratch space, which broke as soon as ``y`` was
    rebound to something else later in the notebook.

    Args:
        text: Iterable of word tokens.
        stemmer: Optional object exposing ``stem(word)``. Defaults to the
            notebook-level ``ps`` stemmer.

    Returns:
        A new list with the stem of each token, in the original order.
    """
    # Only touch the global `ps` when the caller did not supply a stemmer.
    active = ps if stemmer is None else stemmer
    return [active.stem(token) for token in text]

stem_words(['I','loved','loving','it'])

['i', 'love', 'love', 'it']

In [None]:
# Reduce every token of every review to its stem.
df['review'] = df['review'].apply(stem_words)

In [None]:
# Re-assemble a token list into a single string
def join_back(list_input):
    """Concatenate the strings in ``list_input`` with single spaces between them."""
    separator = " "
    return separator.join(list_input)

# Turn each token list back into one space-separated string.
df['review'] = df['review'].apply(join_back)

In [None]:
df.review

Unnamed: 0,review
8352,peopl expect jurass park 3 full surpris one mo...
35758,movi deviat bibl fell bar 1956 movi hate repla...
15671,big fan music although technic might qualifi m...
21554,peopl bash movi look cool slick first one movi...
43423,obnoxi eva longoria die wed day ice sculptur a...
...,...
2624,wonder film non stop patter take sever watch f...
5638,prue piper bring dr griffith home save sauc as...
32033,commentari noth polit sentiment found film fac...
34925,talk market poster home video cover new twenti...


Now we get to the machine learning part: we will use __Naive Bayes__ algorithms to classify the reviews.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 2000)
# max_features = 2000 limits the vocabulary to the 2000 most frequent terms across the corpus

In [None]:
# NOTE(review): the vectorizer is fitted on all reviews before the train/test
# split, so the vocabulary is chosen using test-set text as well -- consider
# fitting on the training portion only. toarray() also turns the sparse
# result into a dense array, which is memory-hungry at this size.
X = cv.fit_transform(df.review).toarray()
X.shape # A 2D array where each row represents a review and each column holds the count of one vocabulary word

(50000, 2000)

In [None]:
# Target vector: the values of the last column of df (the encoded sentiment labels).
y = df.iloc[:,-1].values
y.shape

(50000,)

In [None]:
# splitting training and test set
# 20% of the rows are held out for testing; random_state is fixed so the split is repeatable.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=9)

In [None]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

In [None]:
# One instance of each Naive Bayes variant, all with default settings, for a side-by-side comparison.
clf1 = GaussianNB()
clf2 = MultinomialNB()
clf3 = BernoulliNB()

In [None]:
# Train all three classifiers on the same training split.
clf1.fit(X_train, y_train)
clf2.fit(X_train, y_train)
clf3.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out test split with each fitted classifier.
y_pred1 = clf1.predict(X_test)
y_pred2 = clf2.predict(X_test)
y_pred3 = clf3.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score
# Report test-set accuracy for each classifier as a percentage.
# Two fixed decimals keep floating-point artefacts (e.g. 84.92000000000001)
# out of the output; the "Bernoulli" label spelling is corrected.
for label, predictions in (
    ("Gaussian", y_pred1),
    ("Multinomial", y_pred2),
    ("Bernoulli", y_pred3),
):
    print("{}: {:.2f}%".format(label, accuracy_score(y_test, predictions) * 100))

Gaussian: 75.39%
Multinomial: 84.92%
Bernaulli: 85.38%
