# IMDB Review


In [55]:
# Import Libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [7]:
# Read Data
data=pd.read_csv('imdb_master.csv',encoding='latin',index_col=[0])
data.head()

Unnamed: 0,type,review,label,file
0,test,Once again Mr. Costner has dragged out a movie...,neg,0_2.txt
1,test,This is an example of why the majority of acti...,neg,10000_4.txt
2,test,"First of all I hate those moronic rappers, who...",neg,10001_1.txt
3,test,Not even the Beatles could write songs everyon...,neg,10002_3.txt
4,test,Brass pictures (movies is not a fitting word f...,neg,10003_3.txt


In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 4 columns):
type      100000 non-null object
review    100000 non-null object
label     100000 non-null object
file      100000 non-null object
dtypes: object(4)
memory usage: 3.8+ MB


In [9]:
data.type.value_counts()

train    75000
test     25000
Name: type, dtype: int64

In [10]:
train=data[data.type=='train']
test=data[data.type=='test']

In [16]:
train=train[train.label!='unsup']

In [17]:
train_X=train['review']
train_Y=train['label']
test_X=test['review']
test_Y=test['label']

In [18]:
train_Y.value_counts()

pos    12500
neg    12500
Name: label, dtype: int64

In [19]:
test_Y.value_counts()

pos    12500
neg    12500
Name: label, dtype: int64

In [21]:
token='[A-Za-z0-9]+(?=\\s+)'

In [71]:
cv=CountVectorizer(token_pattern=token,ngram_range=(1,2))

In [72]:
cv

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='[A-Za-z0-9]+(?=\\s+)',
        tokenizer=None, vocabulary=None)

In [73]:
tf=TfidfVectorizer(token_pattern=token,ngram_range=(1,2))

In [74]:
tf

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='[A-Za-z0-9]+(?=\\s+)', tokenizer=None, use_idf=True,
        vocabulary=None)

In [75]:
hv=HashingVectorizer(token_pattern=token,ngram_range=(1,2), non_negative=True)

In [76]:
hv

HashingVectorizer(alternate_sign=True, analyzer='word', binary=False,
         decode_error='strict', dtype=<class 'numpy.float64'>,
         encoding='utf-8', input='content', lowercase=True,
         n_features=1048576, ngram_range=(1, 2), non_negative=True,
         norm='l2', preprocessor=None, stop_words=None, strip_accents=None,
         token_pattern='[A-Za-z0-9]+(?=\\s+)', tokenizer=None)

In [77]:
train_x_cv=cv.fit_transform(train_X)


In [78]:
train_x_tf=tf.fit_transform(train_X)


In [79]:
train_x_hv=hv.fit_transform(train_X)



In [80]:
clf_cv=MultinomialNB()

In [81]:
clf_cv.fit(train_x_cv,train_Y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [95]:
accuracy_score(train_Y,clf_cv.predict(train_x_cv))

0.98488

In [82]:
accuracy_score(test_Y,clf_cv.predict(cv.transform(test_X)))

0.842

In [83]:
clf_tf=MultinomialNB()

In [84]:
clf_tf.fit(train_x_tf,train_Y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [93]:
accuracy_score(train_Y,clf_tf.predict(train_x_tf))

0.965

In [85]:
accuracy_score(test_Y,clf_tf.predict(tf.transform(test_X)))

0.8536

In [86]:
clf_hv=MultinomialNB()
clf_hv.fit(train_x_hv,train_Y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [97]:
accuracy_score(train_Y,clf_hv.predict(train_x_hv))

0.89152

In [87]:
accuracy_score(test_Y,clf_hv.predict(hv.transform(test_X)))



0.82292

In [88]:
train_x_hv.shape

(25000, 1048576)

In [89]:
train_x_cv.shape

(25000, 1179246)

In [90]:
train_x_tf.shape

(25000, 1179246)

In [91]:
data.review[0]

"Once again Mr. Costner has dragged out a movie for far longer than necessary. Aside from the terrific sea rescue sequences, of which there are very few I just did not care about any of the characters. Most of us have ghosts in the closet, and Costner's character are realized early on, and then forgotten until much later, by which time I did not care. The character we should really care about is a very cocky, overconfident Ashton Kutcher. The problem is he comes off as kid who thinks he's better than anyone else around him and shows no signs of a cluttered closet. His only obstacle appears to be winning over Costner. Finally when we are well past the half way point of this stinker, Costner tells us all about Kutcher's ghosts. We are told why Kutcher is driven to be the best with no prior inkling or foreshadowing. No magic here, it was all I could do to keep from turning it off an hour in."

In [98]:
from sklearn.linear_model import LogisticRegression

In [99]:
clf_lg=LogisticRegression()

In [101]:
mapping={'neg':0,'pos':1}
train1_Y=train_Y.apply(lambda x: mapping[x])
test1_Y=test_Y.apply(lambda x: mapping[x])

In [103]:
clf_lg.fit(train_x_tf,train1_Y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [105]:
accuracy_score(train1_Y,clf_lg.predict(train_x_tf))

0.94764

In [107]:
accuracy_score(test1_Y,clf_lg.predict(tf.transform(test_X)))

0.8704