In [1]:
import pandas as pd
import numpy as np
import pickle

In [2]:
movie_review=pd.read_csv('../datasets/imdb-movie-reviews.csv')
movie_review.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


#### Convert positive to 1 and negative to 0

In [3]:
movie_review['sentiment']=movie_review['sentiment'].replace({'positive':1,'negative':0})
movie_review.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [4]:
movie_review['review'][0]

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

In [5]:
movie_review.isna().sum()

review       0
sentiment    0
dtype: int64

In [6]:
movie_review.duplicated().sum()

418

In [7]:
movie_review.drop_duplicates(subset=None, keep='first', inplace=True)

In [8]:
movie_review.duplicated().sum()

0

### Problems with The Data

* HTML Tags
* Special Characters
* Convert eveything to lowercase
* Stops words
* Stemming

#### Remove HTML Tags

In [10]:
import re
clean = re.compile('<.*?>')
re.sub(clean, '',movie_review.iloc[0].review)

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.I would say the main appeal of the show is due to the fact that it goes where other shows wo

In [11]:
def remove_html(text):
    clean = re.compile('<.*?>')
    return re.sub(clean, '',text)

In [12]:
movie_review['review']=movie_review['review'].apply(remove_html)
movie_review.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. The filming tec...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


#### Convert to lower

In [13]:
def to_lower(text):
    return text.lower()

In [14]:
movie_review['review']=movie_review['review'].apply(to_lower)
movie_review.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,1
1,a wonderful little production. the filming tec...,1
2,i thought this was a wonderful way to spend ti...,1
3,basically there's a family where a little boy ...,0
4,"petter mattei's ""love in the time of money"" is...",1


#### Removing Special Characters

In [15]:
def remove_special_character(text):
    new_text=''
    for character in text:
        if character.isalnum():
            new_text += character
        else:
            new_text+=" "
    return new_text

In [16]:
movie_review['review']=movie_review['review'].apply(remove_special_character)
movie_review.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,1
1,a wonderful little production the filming tec...,1
2,i thought this was a wonderful way to spend ti...,1
3,basically there s a family where a little boy ...,0
4,petter mattei s love in the time of money is...,1


#### Remove Stopwords

In [17]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\saree\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [19]:
def remove_stopwords(text):
    new_text=[]
    for word in text.split():
        if word not in stopwords.words('english'):
            new_text.append(word)
    return new_text

In [20]:
remove_stopwords('i thought this movie did a down right good job i m going to have to disagree with the previous no one expects the star trek movies to be high')

['thought',
 'movie',
 'right',
 'good',
 'job',
 'going',
 'disagree',
 'previous',
 'one',
 'expects',
 'star',
 'trek',
 'movies',
 'high']

In [21]:
movie_review['review']=movie_review['review'].apply(remove_stopwords)
movie_review.head()

Unnamed: 0,review,sentiment
0,"[one, reviewers, mentioned, watching, 1, oz, e...",1
1,"[wonderful, little, production, filming, techn...",1
2,"[thought, wonderful, way, spend, time, hot, su...",1
3,"[basically, family, little, boy, jake, thinks,...",0
4,"[petter, mattei, love, time, money, visually, ...",1


#### Stemming

In [22]:
from nltk.stem.porter import PorterStemmer
ps=PorterStemmer()

In [23]:
def stemming(text):
    new_words=[]
    for words in text:
        new_words.append(ps.stem(words))
    return new_words

In [24]:
stemming(['I','loved','love','loving'])

['i', 'love', 'love', 'love']

In [25]:
movie_review['review']=movie_review['review'].apply(stemming)
movie_review.head()

Unnamed: 0,review,sentiment
0,"[one, review, mention, watch, 1, oz, episod, h...",1
1,"[wonder, littl, product, film, techniqu, unass...",1
2,"[thought, wonder, way, spend, time, hot, summe...",1
3,"[basic, famili, littl, boy, jake, think, zombi...",0
4,"[petter, mattei, love, time, money, visual, st...",1


 #### Join all the words back to a string

In [26]:
def join_back(list):
    return " ".join(list)

In [27]:
movie_review['review']=movie_review['review'].apply(join_back)
movie_review.head()

Unnamed: 0,review,sentiment
0,one review mention watch 1 oz episod hook righ...,1
1,wonder littl product film techniqu unassum old...,1
2,thought wonder way spend time hot summer weeke...,1
3,basic famili littl boy jake think zombi closet...,0
4,petter mattei love time money visual stun film...,1


In [28]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(max_features=10000)

In [83]:
vector=cv.fit_transform(movie_review['review']).toarray()
pickle.dump(cv,open('CountVectorizer.pkl','wb'))


In [31]:
vector.shape

(49582, 10000)

In [32]:
vector[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [33]:
vector[0].mean()

0.0153

In [34]:
y=movie_review.iloc[:,-1].values

In [35]:
y.shape

(49582,)

Train Test Split

In [36]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(vector,y,test_size=0.2)

In [37]:
X_train.shape,y_train.shape

((39665, 10000), (39665,))

In [38]:
X_test.shape,y_test.shape

((9917, 10000), (9917,))

In [39]:
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
from sklearn.ensemble import RandomForestClassifier

In [None]:
clf1=GaussianNB()
clf2=MultinomialNB()
clf3=BernoulliNB()
clf4=RandomForestClassifier()

In [54]:
clf1.fit(X_train,y_train)
clf2.fit(X_train,y_train)
clf3.fit(X_train,y_train)
clf4.fit(X_train,y_train)

In [56]:
y_pred1=clf1.predict(X_test)
y_pred2=clf2.predict(X_test)
y_pred3=clf3.predict(X_test)
y_pred4=clf4.predict(X_test)

In [57]:
y_pred1.shape

(9917,)

In [58]:
from sklearn.metrics import accuracy_score

In [60]:
print("Gaussian ",accuracy_score(y_test,y_pred1))
print("Multinomial ",accuracy_score(y_test,y_pred2))
print("Bernoulli ",accuracy_score(y_test,y_pred3))
print("Random-Forest ",accuracy_score(y_test,y_pred4))

Gaussian  0.6844811939094484
Multinomial  0.8448119390944843
Bernoulli  0.8438035696279117
Random-Forest  0.8489462539074317


In [84]:
pickle.dump(clf1,open('Gaussian.pkl','wb'))
pickle.dump(clf2,open('Multinomial.pkl','wb'))
pickle.dump(clf3,open('Bernoulli.pkl','wb'))
pickle.dump(clf4,open('RandomForest.pkl','wb'))

In [44]:
movie_review.to_csv('./movie_review_cleaned.csv',index=False)

### Testing

In [80]:

text="It is an out and out action movie, with no excuse & relentless thrills but with context & lead up to climax. You can't make out the VFX, it seems all real so it turns out exhilarating. Amazing camera angles & vivid colors through out. 2 action pieces were standout for me - opening fight sequence with fireworks and tram chase standoff sequence. Dhanush is great in the few scenes he got but boy he leaves solid impact and you kind of want his backstory or something more out of this character. Let's see if we get Dhanush as lead in Russo brothers next...."
all_stopwords=stopwords.words('english')
all_stopwords.remove('not')
corpus=[]
text = re.sub('[^a-zA-Z]', ' ', text)
text = text.lower()
text = text.split()
text = [ps.stem(word) for word in text if not word in all_stopwords]
text = ' '.join(text)
corpus.append(text)
print(corpus)


['action movi excus relentless thrill context lead climax make vfx seem real turn exhilar amaz camera angl vivid color action piec standout open fight sequenc firework tram chase standoff sequenc dhanush great scene got boy leav solid impact kind want backstori someth charact let see get dhanush lead russo brother next']


In [81]:
X_fresh = cv.transform(corpus).toarray()
X_fresh.shape

(1, 10000)

In [82]:

y_pred = clf4.predict(X_fresh)
print(y_pred)

[1]
