## Sentiment Analysis to predict whether the review is Positive or Negative.

In [1]:
'''
    Movie Reviews Sentiment Analysis
    
    Author: Ashish Kumar
'''
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
import pandas as pd
import numpy as np
import time


In [2]:
import os
import sys
import time
import re
import unicodecsv as csv
import unicodedata as un
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

### Importing Natural Language Toolkit(NLTK) for symbolic and statistical natural language processing for English written words.

In [3]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ashish/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Reading the train csv file generated by combining all the negative and positive reviews.

In [4]:
df_train = pd.read_csv('/Users/ashish/Box/jupyter data/movieSentiment/Data-Science/aclImdb_v1/aclImdb/train.csv')

In [5]:
df_train.head()

Unnamed: 0,label,data
0,0,ashishashishvirtualbox filehomeashishconfiglib...
1,1,i get tired of my and year old daughters con...
2,1,this movie is outrageous funny ribald sophisti...
3,1,i have to start saying it has been a long time...
4,1,if the very thought of arthur askey twists you...


### Reading the test csv file generated by combining all the negative and positive reviews.

In [6]:
df_test = pd.read_csv('/Users/ashish/Box/jupyter data/movieSentiment/Data-Science/aclImdb_v1/aclImdb/test.csv')

In [7]:
df_test.head()

Unnamed: 0,label,data
0,1,ok so it can be done we have here the perfect ...
1,1,i thought this was a very good movie someone s...
2,1,are we allowed to interfere with our fellowmen...
3,1,i described woody allens manhattan as perfect ...
4,1,married to the mob was one of the first vhs ta...


### Counting the number of tokens in train & test dataset 

In [8]:
counter = 0
counter1 = 0
for each_row in df_train['data']:
    counter += len(each_row)

for each_row in df_test['data']:
    counter1 += len(each_row)
print("Tokens in Train = ",counter)
print("Tokens in Test = ", counter1)

Tokens in Train =  31408174
Tokens in Test =  30660597


### Printing average number of tokens in each sentence in train and test dataset.

In [9]:
print("Average Tokens in Train in each sentence= ",counter/df_train.shape[0])
print("Average Tokens in Test in seach sentence = ", counter1/df_test.shape[0])

Average Tokens in Train in each sentence=  1256.2767089316428
Average Tokens in Test in seach sentence =  1226.42388


In [10]:
from nltk.util import ngrams
from collections import Counter

tokens_list_train = df_train.data.tolist()
tokens_list_test = df_train.data.tolist()

In [11]:
tokens = []
for i in tokens_list_train:
    for j in i.split():
        tokens.append(j)
        
        

In [12]:
tokens[1:10]

['filehomeashishconfiglibreoffice',
 'i',
 'get',
 'tired',
 'of',
 'my',
 'and',
 'year',
 'old']

In [13]:
tokens_list_test[3].split()[2]

'to'

### Displaying the most common trigrams

In [14]:
trigram = ngrams(tokens,3)

Counter(trigram).most_common(10)
    

[(('one', 'of', 'the'), 4815),
 (('this', 'movie', 'is'), 2476),
 (('of', 'the', 'film'), 2363),
 (('a', 'lot', 'of'), 2255),
 (('this', 'is', 'a'), 2140),
 (('of', 'the', 'movie'), 2039),
 (('some', 'of', 'the'), 1855),
 (('is', 'one', 'of'), 1746),
 (('the', 'film', 'is'), 1686),
 (('this', 'film', 'is'), 1669)]

In [15]:
from nltk import TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [16]:
tokenizer = TweetTokenizer()


### "TF-idf" term frequency–inverse document frequency, is a numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus.

In [17]:
vectorizer = TfidfVectorizer(ngram_range=(1,2), tokenizer=tokenizer.tokenize)

In [18]:
#merge train & test list
full_text = tokens_list_train + tokens_list_test

In [19]:
%%time
vectorizer.fit(full_text)


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<bound method TweetTokenizer.tokenize of <nltk.tokenize.casual.TweetTokenizer object at 0x1a33ee4b00>>,
        use_idf=True, vocabulary=None)

In [20]:
'''Shuffle the data before vectorizing it
    
    
'''
df_train = df_train.sample(frac=1).reset_index(drop=True)
df_test = df_test.sample(frac=1).reset_index(drop=True)

In [21]:
df_train.head()

Unnamed: 0,label,data
0,2,lot of silly plot holes in the film first we s...
1,2,i gave this stars out of a possible because...
2,2,the only reason i am commenting is because i f...
3,2,there are so many puns to play on the title of...
4,2,there was a genie played by shaq his name was ...


In [22]:
df_test.head()

Unnamed: 0,label,data
0,2,i decided to hire out this movie along with a ...
1,2,sometimes i think that somewhere in the lifeti...
2,1,the brighton has a traumatic drama in the brea...
3,1,i was skeptical before going to this because o...
4,1,this is a great film for pure entertainment no...


In [23]:
#transform our train & test data
train_vectorized = vectorizer.transform(df_train.data)
test_vectorized = vectorizer.transform(df_test.data)

In [24]:
#check the shape of transform data
print(train_vectorized.shape)

(25001, 1621817)


### Machine Learning Part starts here... 

### Training the model using Logistic Regression with One vs Rest Classifier. 

In [27]:
%%time
#Logistic Regression and One Vs Rest Classifier 

logreg = LogisticRegression()
ovr = OneVsRestClassifier(logreg)

CPU times: user 679 µs, sys: 464 µs, total: 1.14 ms
Wall time: 581 µs


In [28]:
%%time
ovr.fit(train_vectorized, df_train.label.values)

CPU times: user 28.4 s, sys: 952 ms, total: 29.3 s
Wall time: 15.2 s


OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
          n_jobs=1)

### Accuracy of logistic Regression.

In [29]:

ovr.score(test_vectorized, df_test.label.values)

0.88392

In [38]:
review = ['I hate this movies']
review_vectorized = vectorizer.transform(review)

In [39]:
y_predict = ovr.predict(review_vectorized)

In [40]:
pred_review = "Positive"

for each_review, each_y_pred in zip(review, y_predict):
    if each_y_pred == 1:
        pred_review = "Positive"
    else:
        pred_review = "Negative"
    print("{0}: {1}".format(each_review, pred_review))

I hate this movies: Negative


In [41]:
%%time
from sklearn.model_selection import cross_val_score
scores = cross_val_score(ovr, train_vectorized, df_train.label.values, scoring='accuracy', n_jobs=-1, cv=3)
print('Cross-validation mean accuracy {0:.2f}%, std {1:.2f}.'.format(np.mean(scores) * 100, np.std(scores) * 100))



Cross-validation mean accuracy 87.91%, std 0.53.
CPU times: user 944 ms, sys: 365 ms, total: 1.31 s
Wall time: 35.3 s


In [33]:
#Try with different algorithms.
## - LogReg/SVM/NB/DT/NN
#    CrossVal

In [42]:

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from scipy.stats import uniform
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

In [48]:
%%time
svc = SVC(gamma = 0.01, C = 100)
svc.fit(train_vectorized, df_train.label.values)
# scores_svc.fit(train_vectorized, df_train.label.values)

CPU times: user 1h 4min 37s, sys: 28.9 s, total: 1h 5min 6s
Wall time: 1h 6min 42s


In [49]:
svm_acc = svc.score(test_vectorized, df_test.label.values)
print(svm_acc)

0.89912


In [47]:
%%time
ovr = LogisticRegression()
param_grid = {'C': [0.1,10, 1000,5000]}
logistic_grid = GridSearchCV(ovr, param_grid, cv=10, refit=True, verbose=1)
logistic_grid.fit(train_vectorized, df_train.label.values)



Fitting 10 folds for each of 4 candidates, totalling 40 fits


[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed: 21.6min finished


CPU times: user 26min 33s, sys: 33.7 s, total: 27min 6s
Wall time: 22min 27s


In [50]:
log_acc = logistic_grid.score(test_vectorized, df_test.label.values)
print(log_acc)


0.90008


In [74]:
review = ['I hate this movies', 'It was normal', 'It was not good']
review_vectorized = vectorizer.transform(review)
y_predict = logistic_grid.predict(review_vectorized)

pred_review = "Positive"

for each_review, each_y_pred in zip(review, y_predict):
    if each_y_pred == 1:
        pred_review = "Positive"
    else:
        pred_review = "Negative"
    print("{0}: {1}".format(each_review, pred_review))

I hate this movies: Negative
It was normal: Positive
It was not good: Negative


In [55]:
%%time
# Multinomial NB
mnb = MultinomialNB()
mnb.fit(train_vectorized, df_train.label.values)
mnb_acc = mnb.score(test_vectorized, df_test.label.values)
print(mnb_acc)

0.86664
CPU times: user 569 ms, sys: 55.1 ms, total: 624 ms
Wall time: 575 ms


In [64]:
%%time
# BernouliNB
bnb = BernoulliNB()
bnb.fit(train_vectorized, df_train.label.values)
bnb_acc = mnb.score(test_vectorized, df_test.label.values)
print(mnb_acc)

0.86664
CPU times: user 710 ms, sys: 136 ms, total: 846 ms
Wall time: 790 ms


In [75]:
%%time
#KNN Classifier
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(train_vectorized, df_train.label.values)
knn_acc = neigh.score(test_vectorized, df_test.label.values)

CPU times: user 1min 48s, sys: 1min 36s, total: 3min 24s
Wall time: 10min 17s


In [76]:
print(knn_acc)

0.66272


In [71]:
%%time
#Perceptron
from sklearn.linear_model import Perceptron
perc = Perceptron(tol=1e-3, random_state=0)
perc.fit(train_vectorized, df_train.label.values)
perc_acc = perc.score(test_vectorized, df_test.label.values)
print(perc_acc)

0.8768
CPU times: user 1.69 s, sys: 66.1 ms, total: 1.75 s
Wall time: 1.04 s


In [85]:
review = ['the actor was eating', 'cat, dog and animals', 'My favorite actor was killed ']
review_vectorized = vectorizer.transform(review)
y_predict = perc.predict(review_vectorized)

pred_review = "Positive"

for each_review, each_y_pred in zip(review, y_predict):
    if each_y_pred == 1:
        pred_review = "Positive"
    else:
        pred_review = "Negative"
    print("{0}: {1}".format(each_review, pred_review))

the actor was eating: Negative
cat, dog and animals: Negative
My favorite actor was killed : Positive


In [50]:
%%time 
#KNN with GridSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

param_grid = {'n_neighbors': [3, 4, 5]}
ovr = KNeighborsClassifier()
logistic_grid = GridSearchCV(ovr, param_grid, cv=10, refit=True, verbose=1)
logistic_grid.fit(train_vectorized, df_train.label.values)
logistic_grid_acc = logistic_grid.score(train_vectorized, df_train.label.values)
print(logistic_grid_acc)

Fitting 10 folds for each of 3 candidates, totalling 30 fits


[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed: 959.9min finished


0.8851645934162633
CPU times: user 50min 19s, sys: 23min 59s, total: 1h 14min 18s
Wall time: 2d 21h 47min 7s


In [26]:
%%time
#MLP Classifier for score = 'adam'
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(solver='adam', alpha=1e-5,hidden_layer_sizes=(5, 2), random_state=1)
clf.fit(train_vectorized, df_train.label.values)
clf_acc = clf.score(test_vectorized, df_test.label.values)
print(clf_acc)

0.5
CPU times: user 1h 16min 29s, sys: 16min 21s, total: 1h 32min 50s
Wall time: 48min 57s


In [None]:
%%time
#MLP Classifier for score = 'lbfgs'
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(5, 2), random_state=1)
clf.fit(train_vectorized, df_train.label.values)
clf_acc = clf.score(test_vectorized, df_test.label.values)
print(clf_acc)

In [None]:
%%time
#MLP Classifier for score = 'sgd'
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(solver='sgd', alpha=1e-5,hidden_layer_sizes=(5, 2), random_state=1)
clf.fit(train_vectorized, df_train.label.values)
clf_acc = clf.score(test_vectorized, df_test.label.values)
print(clf_acc)

In [35]:
#MLP Classifier with GridSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier

In [37]:
%%time 
param_grid = {'solver':['lbfgs', 'sgd', 'adam'] }
ovr = MLPClassifier(alpha=1e-5, hidden_layer_sizes = (5,2))
mlp_grid = GridSearchCV(ovr, param_grid, cv=5, refit=True, verbose=1)
mlp_grid_fit = mlp_grid.fit(train_vectorized, df_train.label.values)

print(mlp_grid_fit.best_score_)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed: 1018.1min finished


0.7394504219831207
CPU times: user 7h 16min 29s, sys: 1h 19min 48s, total: 8h 36min 17s
Wall time: 17h 2min


In [38]:
print(mlp_grid_fit.best_estimator_)

MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(5, 2), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='lbfgs', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)


In [49]:
%%time
## Adaboost with logistic Regression.
from sklearn.ensemble import AdaBoostClassifier
log = LogisticRegression()
ada = AdaBoostClassifier(base_estimator=log, n_estimators=50, learning_rate=1.0,random_state=None)
ada.fit(train_vectorized, df_train.label.values)

CPU times: user 7min 54s, sys: 18 s, total: 8min 12s
Wall time: 4min 27s


In [50]:
%%time
ada_acc = ada.score(test_vectorized, df_test.label.values)
print(ada_acc)

0.78412
CPU times: user 18.5 s, sys: 2 s, total: 20.5 s
Wall time: 11.8 s


In [52]:
%%time
## Adaboost with Percepton.
from sklearn.linear_model import Perceptron
perc = Perceptron(tol=1e-3, random_state=0)
ada = AdaBoostClassifier(base_estimator=perc, n_estimators=50, learning_rate=1.0,random_state=None, algorithm='SAMME')
ada.fit(train_vectorized, df_train.label.values)
ada_acc = ada.score(test_vectorized, df_test.label.values)
print(ada_acc)

0.86944
CPU times: user 1min 19s, sys: 2.99 s, total: 1min 22s
Wall time: 51.4 s


In [69]:
%%time
## GradientBoostingClassifier with MLP Classifier.
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier()
gbc.fit(train_vectorized, df_train.label.values)
gbc_acc = gbc.score(test_vectorized, df_test.label.values)
print(gbc_acc)

0.82404
CPU times: user 27min 53s, sys: 12 s, total: 28min 5s
Wall time: 27min 42s


In [None]:
''' 
    TO-DO List
    
    Decision Tree, Random Forest
    Ensemble methods
    Word Embeding: Word2vec, glove
    
    Visualization:
    Tag Clouds
    Colocate clouds
    nltk Vader
    High & low top words
    
'''