In [1]:
import pandas as pd

In [2]:
reviews=pd.read_csv("data/processed_file.csv",index_col=0)

In [3]:
reviews.shape

(50000, 3)

In [4]:
reviews.head()

Unnamed: 0,review,label,file
26247,Fame is one of the best movies I've seen about...,-1,11122_8.txt
35067,This movie fully deserves to be one of the top...,-1,7811_10.txt
34590,"in a time of predictable movies, in which abou...",-1,7382_10.txt
16668,I saw this on TV the other nightÂ or rather I...,0,2501_1.txt
12196,I am a huge fan of Simon Pegg and have watched...,1,9728_7.txt


In [5]:
reviews = reviews[["review", "label", "file"]].sample(frac=1, random_state=1)
train = reviews[reviews.label!=-1].sample(frac=0.6, random_state=1)
valid = reviews[reviews.label!=-1].drop(train.index)
test = reviews[reviews.label==-1]
print(train.shape)
print(valid.shape)
print(test.shape)

(15000, 3)
(10000, 3)
(25000, 3)


# Data Preprocessing
The next step is data preprocessing. The following class behaves like your typical SKLearn vectorizer.

It can perform the following operations.

Discard non alpha-numeric characters
Set everything to lower case
Stems all words using PorterStemmer, and change the stems back to the most occurring existent word.
Discard non-Egnlish words (not by default).

In [6]:
import os
import numpy as np
from IPython.display import HTML

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text 
from sklearn.decomposition import PCA

from tensorflow.python.keras.models import Sequential, load_model
from tensorflow.python.keras.layers import Dense, Dropout
from tensorflow.python.keras import optimizers

import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import words
from nltk.corpus import wordnet 
allEnglishWords = words.words() + [w for w in wordnet.words()]
allEnglishWords = np.unique([x.lower() for x in allEnglishWords])

import plotly.offline as py
import plotly.graph_objs as go
py.init_notebook_mode(connected=True)

import warnings
warnings.filterwarnings('ignore')

  from ._conv import register_converters as _register_converters


In [7]:
import re
import numpy as np
class Preprocessor(object):
    ''' Preprocess data for NLP tasks. '''

    def __init__(self, alpha=True, lower=True, stemmer=True, english=False):
        self.alpha = alpha
        self.lower = lower
        self.stemmer = stemmer
        self.english = english
        
        self.uniqueWords = None
        self.uniqueStems = None
        
    def fit(self, texts):
        texts = self._doAlways(texts)

        allwords = pd.DataFrame({"word": np.concatenate(texts.apply(lambda x: x.split()).values)})
        self.uniqueWords = allwords.groupby(["word"]).size().rename("count").reset_index()
        self.uniqueWords = self.uniqueWords[self.uniqueWords["count"]>1]
        if self.stemmer:
            self.uniqueWords["stem"] = self.uniqueWords.word.apply(lambda x: PorterStemmer().stem(x)).values
            self.uniqueWords.sort_values(["stem", "count"], inplace=True, ascending=False)
            self.uniqueStems = self.uniqueWords.groupby("stem").first()
        
        #if self.english: self.words["english"] = np.in1d(self.words["mode"], allEnglishWords)
        print("Fitted.")
            
    def transform(self, texts):
        texts = self._doAlways(texts)
        if self.stemmer:
            allwords = np.concatenate(texts.apply(lambda x: x.split()).values)
            uniqueWords = pd.DataFrame(index=np.unique(allwords))
            uniqueWords["stem"] = pd.Series(uniqueWords.index).apply(lambda x: PorterStemmer().stem(x)).values
            uniqueWords["mode"] = uniqueWords.stem.apply(lambda x: self.uniqueStems.loc[x, "word"] if x in self.uniqueStems.index else "")
            texts = texts.apply(lambda x: " ".join([uniqueWords.loc[y, "mode"] for y in x.split()]))
        #if self.english: texts = self.words.apply(lambda x: " ".join([y for y in x.split() if self.words.loc[y,"english"]]))
        print("Transformed.")
        return(texts)

    def fit_transform(self, texts):
        texts = self._doAlways(texts)
        self.fit(texts)
        texts = self.transform(texts)
        return(texts)
    
    def _doAlways(self, texts):
        # Remove parts between <>'s
        texts = texts.apply(lambda x: re.sub('<.*?>', ' ', x))
        # Keep letters and digits only.
        if self.alpha: texts = texts.apply(lambda x: re.sub('[^a-zA-Z0-9 ]+', ' ', x))
        # Set everything to lower case
        if self.lower: texts = texts.apply(lambda x: x.lower())
        return texts

In [8]:
preprocess = Preprocessor(alpha=True, lower=True, stemmer=True)

In [9]:
%%time
trainX = preprocess.fit_transform(train.review)
validX = preprocess.transform(valid.review)

Fitted.
Transformed.
Transformed.
Wall time: 6min 2s


In [10]:
trainX.head()

15710    the earlier part of the film was rather enjoy ...
14400    there is only one reason to watch this movie i...
313      if your idea of a thriller is car chase explos...
14601    sadly the print of the film we were go to watc...
20691    sure one of the most ill advise remake of a cl...
Name: review, dtype: object

In [11]:
trainX.describe()

count                                                 15000
unique                                                14964
top       this show come up with interesting location as...
freq                                                      3
Name: review, dtype: object

In [12]:
print(preprocess.uniqueWords.shape)
preprocess.uniqueWords[preprocess.uniqueWords.word.str.contains("disappoint")]

(38409, 3)


Unnamed: 0,word,count,stem
15085,disappointingly,8,disappointingli
15083,disappointed,581,disappoint
15084,disappointing,257,disappoint
15086,disappointment,245,disappoint
15082,disappoint,62,disappoint
15089,disappoints,23,disappoint
15088,disappointments,14,disappoint


In [13]:
print(preprocess.uniqueStems.shape)
preprocess.uniqueStems[preprocess.uniqueStems.word.str.contains("disappoint")]


(25433, 2)


Unnamed: 0_level_0,word,count
stem,Unnamed: 1_level_1,Unnamed: 2_level_1
disappoint,disappointed,581
disappointingli,disappointingly,8


# Feature Engineering
Next, we take the preprocessed texts as input and calculate their TF-IDF's (info). We retain 10000 features per text.

In [14]:
stop_words = text.ENGLISH_STOP_WORDS.union(["thats","weve","dont","lets","youre","im","thi","ha",
    "wa","st","ask","want","like","thank","know","susan","ryan","say","got","ought","ive","theyre"])
tfidf = TfidfVectorizer(min_df=2, max_features=10000, stop_words=stop_words) #, ngram_range=(1,3)

In [15]:
%%time
trainX = tfidf.fit_transform(trainX).toarray()
validX = tfidf.transform(validX).toarray()

Wall time: 4min 25s


In [16]:
print(trainX.shape)
print(validX.shape)

(15000, 10000)
(10000, 10000)


In [17]:
trainY = train.label
validY = valid.label

In [18]:
print(trainX.shape, trainY.shape)
print(validX.shape, validY.shape)

(15000, 10000) (15000,)
(10000, 10000) (10000,)


# Feature Selection
Next, we take the 10k dimensional tfidf's as input, and keep the 2000 dimensions that correlate the most with our sentiment target. The corresponding words - see below - make sense.

In [19]:
from scipy.stats.stats import pearsonr

In [20]:
getCorrelation = np.vectorize(lambda x: pearsonr(trainX[:,x], trainY)[0])
correlations = getCorrelation(np.arange(trainX.shape[1]))
print(correlations)

[-0.00785785 -0.02112479  0.01765996 ...  0.01993091  0.02072059
  0.00056014]


In [21]:
allIndeces = np.argsort(-correlations)
bestIndeces = allIndeces[np.concatenate([np.arange(1000), np.arange(-1000, 0)])]

In [22]:
vocabulary = np.array(tfidf.get_feature_names())
print(vocabulary[bestIndeces][:10])
print(vocabulary[bestIndeces][-10:])

['great' 'love' 'excellent' 'beautiful' 'best' 'perfect' 'enjoy' 'amazing'
 'favorite' 'performance']
['horrible' 'poor' 'stupid' 'terrible' 'worse' 'boring' 'awful' 'waste'
 'worst' 'bad']


In [23]:
trainX = trainX[:,bestIndeces]
validX = validX[:,bestIndeces]

In [24]:
print(trainX.shape, trainY.shape)
print(validX.shape, validY.shape)

(15000, 2000) (15000,)
(10000, 2000) (10000,)


# K fold Validation

In [25]:
from numpy import array
from sklearn.model_selection import KFold
# data sample
data = array([trainX.shape,trainY.shape])
# prepare cross validation
kfold = KFold(2, True, 1)
# enumerate splits
for train, test in kfold.split(data):
	print('train: %s, test: %s' % (data[train], data[test]))

train: [(15000,)], test: [(15000, 2000)]
train: [(15000, 2000)], test: [(15000,)]


In [26]:
DROPOUT = 0.5
ACTIVATION = "tanh"

model = Sequential([    
    Dense(int(trainX.shape[1]/2), activation=ACTIVATION, input_dim=trainX.shape[1]),
    Dropout(DROPOUT),
    Dense(int(trainX.shape[1]/2), activation=ACTIVATION, input_dim=trainX.shape[1]),
    Dropout(DROPOUT),
    Dense(int(trainX.shape[1]/4), activation=ACTIVATION),
    Dropout(DROPOUT),
    Dense(100, activation=ACTIVATION),
    Dropout(DROPOUT),
    Dense(20, activation=ACTIVATION),
    Dropout(DROPOUT),
    Dense(5, activation=ACTIVATION),
    Dropout(DROPOUT),
    Dense(1, activation='sigmoid'),
])

In [27]:
model.compile(optimizer=optimizers.Adam(0.00005), loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 1000)              2001000   
_________________________________________________________________
dropout (Dropout)            (None, 1000)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 1000)              1001000   
_________________________________________________________________
dropout_1 (Dropout)          (None, 1000)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 500)               500500    
_________________________________________________________________
dropout_2 (Dropout)          (None, 500)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 100)               50100     
__________

# Model Training

In [28]:
EPOCHS = 7
BATCHSIZE = 1500

In [29]:
model.fit(trainX, trainY, epochs=EPOCHS, batch_size=BATCHSIZE, validation_data=(validX, validY))

Train on 15000 samples, validate on 10000 samples
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<tensorflow.python.keras.callbacks.History at 0x26a64160>

In [30]:
x = np.arange(EPOCHS)
history = model.history.history

data = [
    go.Scatter(x=x, y=history["acc"], name="Train Accuracy", marker=dict(size=5), yaxis='y2'),
    go.Scatter(x=x, y=history["val_acc"], name="Valid Accuracy", marker=dict(size=5), yaxis='y2'),
    go.Scatter(x=x, y=history["loss"], name="Train Loss", marker=dict(size=5)),
    go.Scatter(x=x, y=history["val_loss"], name="Valid Loss", marker=dict(size=5))
]
layout = go.Layout(
    title="Model Training Evolution", font=dict(family='Palatino'), xaxis=dict(title='Epoch', dtick=1),
    yaxis1=dict(title="Loss", domain=[0, 0.45]), yaxis2=dict(title="Accuracy", domain=[0.55, 1]),
)
py.iplot(go.Figure(data=data, layout=layout), show_link=False)


# Model Evaluation
Accuracy AND LOSS

In [31]:
train["probability"] = model.predict(trainX)
train["prediction"] = train.probability-0.5>0
train["truth"] = train.label==1
train.tail()

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [30]:
print(model.evaluate(trainX, trainY))
print((train.truth==train.prediction).mean())

[0.28194266799290973, 0.9213333333015442]
0.9213333333333333


In [31]:
valid["probability"] = model.predict(validX)
valid["prediction"] = valid.probability-0.5>0
valid["truth"] = valid.label==1
valid.tail()

Unnamed: 0,review,label,file,probability,prediction,truth
20070,Probably the worst Dolph film ever. There's no...,0,5564_2.txt,0.131978,False,False
8612,If you want just about everything you want to ...,1,6501_10.txt,0.854555,True,True
3341,I was really surprised with this movie. Going ...,1,1758_10.txt,0.134732,False,True
22644,This film is a portrait of the half-spastic te...,0,7881_2.txt,0.141678,False,False
10802,This charmingly pleasant and tenderhearted seq...,1,8473_8.txt,0.785451,True,True


In [32]:
print(model.evaluate(validX, validY))
print((valid.truth==valid.prediction).mean())

[0.34729949016571043, 0.8717]
0.8717


# Error Analysis

In [33]:
trainCross = train.groupby(["prediction", "truth"]).size().unstack()
trainCross

truth,False,True
prediction,Unnamed: 1_level_1,Unnamed: 2_level_1
False,6911,627
True,553,6909


In [34]:
validCross = valid.groupby(["prediction", "truth"]).size().unstack()
validCross

truth,False,True
prediction,Unnamed: 1_level_1,Unnamed: 2_level_1
False,4400,647
True,636,4317


In [35]:
truepositives = valid[(valid.truth==True)&(valid.truth==valid.prediction)]
print(len(truepositives), "true positives.")
truepositives.sort_values("probability", ascending=False).head(3)

4317 true positives.


Unnamed: 0,review,label,file,probability,prediction,truth
10694,One of the best movies out there. Yeah maybe t...,1,8376_10.txt,0.870353,True,True
9005,This is a taut suspenseful masterpiece from Br...,1,6856_10.txt,0.870348,True,True
6180,If you have not seen this excellent movie abou...,1,4312_10.txt,0.870308,True,True


In [36]:
truenegatives = valid[(valid.truth==False)&(valid.truth==valid.prediction)]
print(len(truenegatives), "true negatives.")
truenegatives.sort_values("probability", ascending=True).head(3)

4400 true negatives.


Unnamed: 0,review,label,file,probability,prediction,truth
16094,Must have to agree with the other reviewer. Th...,0,1986_1.txt,0.129942,False,False
23041,This film was positively the worst film I have...,0,8238_1.txt,0.129966,False,False
23196,This movie is most possibly the worst movie I ...,0,8378_1.txt,0.129979,False,False


In [37]:
falsepositives = valid[(valid.truth==True)&(valid.truth!=valid.prediction)]
print(len(falsepositives), "false positives.")
falsepositives.sort_values("probability", ascending=True).head(3)


647 false positives.


Unnamed: 0,review,label,file,probability,prediction,truth
12340,I wouldn't go so far as to not recommend this ...,1,9858_7.txt,0.130624,False,True
7172,A bunch of American students and their tutor d...,1,5205_7.txt,0.130877,False,True
12303,This flick is sterling example of the state of...,1,9824_10.txt,0.131126,False,True


In [38]:
falsenegatives = valid[(valid.truth==False)&(valid.truth!=valid.prediction)]
print(len(falsenegatives), "false negatives.")
falsenegatives.sort_values("probability", ascending=False).head(3)

636 false negatives.


Unnamed: 0,review,label,file,probability,prediction,truth
14449,this is a great movie. I love the series on tv...,0,11755_3.txt,0.869631,True,False
14715,I was so excited and hyped up about watching t...,0,11995_4.txt,0.868042,True,False
19440,this one of the best celebrity's reality shows...,0,4998_3.txt,0.867258,True,False


# Model Application
Custom Reviews
To use this model, we would store the model, along with the preprocessing vectorizers, and run the unseen texts through following pipeline.

In [39]:
unseen = pd.Series("this movie very good")

In [40]:
unseen = preprocess.transform(unseen)       # Text preprocessing
unseen = tfidf.transform(unseen).toarray()  # Feature engineering
unseen = unseen[:,bestIndeces]              # Feature selection
probability = model.predict(unseen)[0,0]  # Network feedforward

Transformed.


In [41]:
print(probability)
print("Positive!") if probability > 0.5 else print("Negative!")

0.67211
Positive!
