In [1]:
import pandas as pd
import numpy as np

import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.corpus import stopwords
import texthero as hero
from texthero import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [2]:
df_original = pd.read_csv('IMDB Dataset.csv') #This dataframe will not be changed to ensure data integrity.
df = df_original.copy()

df['num_label'] = df['sentiment'].map({'negative':0, 'positive':1})
y = df.iloc[:,-1]

In [3]:
df.head()

Unnamed: 0,review,sentiment,num_label
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1
3,Basically there's a family where a little boy ...,negative,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1


In [4]:
# Adding a 'Word Count' column via list comprehension.
# This column will need updating after each round of pruning.

df['Word Count'] = [len(x.split()) for x in df['review'].tolist()]

In [5]:
df.head()

Unnamed: 0,review,sentiment,num_label,Word Count
0,One of the other reviewers has mentioned that ...,positive,1,307
1,A wonderful little production. <br /><br />The...,positive,1,162
2,I thought this was a wonderful way to spend ti...,positive,1,166
3,Basically there's a family where a little boy ...,negative,0,138
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1,230


In [6]:
print("Average review length = %.2f" % df['Word Count'].mean())

Average review length = 231.16


In [7]:
# Helper method that re-calculates the Word Count column, prints first 5 samples, and average word count/sample

def display_dataframe():
    df['Word Count'] = [len(x.split()) for x in df['review'].tolist()]
    print(df.head())
    print("\nAverage review length = %.2f" % df['Word Count'].mean())

In [8]:
# Bringing all text to lowercase.
df['review'] = df['review'].str.lower()

In [9]:
display_dataframe()

                                              review sentiment  num_label  \
0  one of the other reviewers has mentioned that ...  positive          1   
1  a wonderful little production. <br /><br />the...  positive          1   
2  i thought this was a wonderful way to spend ti...  positive          1   
3  basically there's a family where a little boy ...  negative          0   
4  petter mattei's "love in the time of money" is...  positive          1   

   Word Count  
0         307  
1         162  
2         166  
3         138  
4         230  

Average review length = 231.16


In [10]:
# Remove HTML tags.

def remove_html(text):
    '''Good but slow, needs rewriting'''
    return BeautifulSoup(text, "lxml").text

In [11]:
# Remove all special characters.

def remove_noise(text):
    return re.sub('[^A-Za-z0-9\s]+', '', text)

In [12]:
df['review'] = df['review'].apply(remove_html)
df['review'] = df['review'].apply(remove_noise)

In [13]:
display_dataframe()

                                              review sentiment  num_label  \
0  one of the other reviewers has mentioned that ...  positive          1   
1  a wonderful little production the filming tech...  positive          1   
2  i thought this was a wonderful way to spend ti...  positive          1   
3  basically theres a family where a little boy j...  negative          0   
4  petter matteis love in the time of money is a ...  positive          1   

   Word Count  
0         301  
1         156  
2         162  
3         129  
4         222  

Average review length = 226.20


In [14]:
stop = stopwords.words('english')
df['review'] = df['review'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

In [15]:
display_dataframe()

                                              review sentiment  num_label  \
0  one reviewers mentioned watching 1 oz episode ...  positive          1   
1  wonderful little production filming technique ...  positive          1   
2  thought wonderful way spend time hot summer we...  positive          1   
3  basically theres family little boy jake thinks...  negative          0   
4  petter matteis love time money visually stunni...  positive          1   

   Word Count  
0         168  
1          84  
2          86  
3          67  
4         125  

Average review length = 119.79


In [16]:
df['sentiment'].value_counts(normalize=True)

negative    0.5
positive    0.5
Name: sentiment, dtype: float64

In [17]:
count_vec = CountVectorizer(decode_error = 'ignore')
x = count_vec.fit_transform(df['review'])

In [18]:
xtrain, xtest, ytrain, ytest = train_test_split(x,y, test_size = 0.3)

In [19]:
clf = MultinomialNB(alpha = 1)
clf.fit(xtrain,ytrain)

MultinomialNB(alpha=1)

In [20]:
print("Training Score:", clf.score(xtrain,ytrain))
print("Testing Score:", clf.score(xtest,ytest))

Training Score: 0.9261142857142857
Testing Score: 0.8594


In [21]:
df['prediction'] = clf.predict(x)

In [22]:
sneaky_spam = df[(df['prediction'] == 0) & (df['num_label'] == 1)]
not_actually_spam = df[(df['prediction'] == 1) & (df['num_label'] == 0)]

In [23]:
confusion_matrix = pd.crosstab(df['num_label'], df['prediction'], rownames=['Actual'], colnames=['Predicted'])
confusion_matrix

Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,23174,1826
1,2869,22131


In [24]:
# False Negatives
len(sneaky_spam)

2869

In [25]:
# False Positives
len(not_actually_spam)

1826

In [62]:
stringy = 'I didnt like the movie. It was a disappointment. The story wasnt bad, but it was unnecessarily long. Although there is alternative history, I did not know about cousin kings :) Interesting information. Overall I didnt like the movie. The action scenes were good, but only those were good. The movie isnt bad in my opinion, but its below average.'
# https://www.imdb.com/review/rw7903997/

In [73]:
df_new_but_not = df.copy() # Use df_original.copy() is you want to also clean it again, which means don't use it!!!

# Here is the torture of cleaning
#   df_new_but_not['review'] = df_new_but_not['review'].str.lower()
#   df_new_but_not['review'] = df_new_but_not['review'].apply(remove_html)
#   df_new_but_not['review'] = df_new_but_not['review'].apply(remove_noise)
#   df_new_but_not['review'] = df_new_but_not['review'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

# Find a better way to do this becuase it is super slow
string_list = {'String':[stringy]}
df_input = pd.DataFrame(string_list) 
df_inputs = df_input['String'].append(df_new_but_not['review'], ignore_index = True)
x_new_inputs = count_vec.transform(df_inputs)

In [74]:
df_inputs.loc[len(df_new_but_not)] = [[],[]]
df_inputs['prediction'] = clf.predict(x_new_inputs)

In [75]:
if df_inputs['prediction'][0] == 0:
    print('The Review:', '"'+stringy+'"', 'has a negative sentiment')
else:
    print('The Review:', '"'+stringy+'"', 'has a positive sentiment')

The Review: "I didnt like the movie. It was a disappointment. The story wasnt bad, but it was unnecessarily long. Although there is alternative history, I did not know about cousin kings :) Interesting information. Overall I didnt like the movie. The action scenes were good, but only those were good. The movie isnt bad in my opinion, but its below average." has a negative sentiment


In [76]:
df_inputs[0]

'I didnt like the movie. It was a disappointment. The story wasnt bad, but it was unnecessarily long. Although there is alternative history, I did not know about cousin kings :) Interesting information. Overall I didnt like the movie. The action scenes were good, but only those were good. The movie isnt bad in my opinion, but its below average.'

In [77]:
df_inputs['prediction'][0]

0