### obtaining the movie review dataset

#### http://ai.stanford.edu/~amaas/data/sentiment/

In [1]:
import pandas as pd
import os

In [2]:
basepath = 'C:/sentiment_analysis/aclImdb/'





In [3]:
labels = {'pos':1,'neg':0}

In [4]:
df = pd.DataFrame()

In [5]:
for s in ('test','train'):
    for l in ('pos','neg'):
        path = os.path.join(basepath,s,l)
        
        for fil in os.listdir(path):
            
            with open(os.path.join(path,fil),'r',encoding='utf-8') as f:
                txt = f.read()
            df = df.append([[txt,labels[l]]],ignore_index=True)
            
df.columns=['review','sentiment']

In [6]:
df.head()

Unnamed: 0,review,sentiment
0,I went and saw this movie last night after bei...,1
1,Actor turned director Bill Paxton follows up h...,1
2,As a recreational golfer with some knowledge o...,1
3,"I saw this film in a sneak preview, and it is ...",1
4,Bill Paxton has taken the true story of the 19...,1


In [7]:
df.sentiment.value_counts()

1    25000
0    25000
Name: sentiment, dtype: int64

In [8]:
import numpy as np

In [9]:
from sklearn.utils import shuffle

In [10]:
df = shuffle(df)

In [11]:
df.head()

Unnamed: 0,review,sentiment
42130,"After watching this film, I was left with a tw...",0
47992,How hard is it to write a watchable film with ...,0
39121,"Not much to say beyond the summary, save that ...",0
46874,When I caught a glimpse of the title I thought...,0
34012,When people ask me why do I like movies so muc...,1


In [12]:
df.to_csv('movie_reviews.csv',index=False)

In [13]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords

In [14]:
stop = stopwords.words('english')

In [15]:
df_new = pd.read_csv('movie_reviews.csv')

In [16]:
df_new.head()

Unnamed: 0,review,sentiment
0,"After watching this film, I was left with a tw...",0
1,How hard is it to write a watchable film with ...,0
2,"Not much to say beyond the summary, save that ...",0
3,When I caught a glimpse of the title I thought...,0
4,When people ask me why do I like movies so muc...,1


In [17]:
df_new.loc[0,'review'][-50:]

'nut-hugging Chen fans. For me, "Dog Bite This DVD"'

In [18]:
import re
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)',text)
    text = (re.sub('[\W]+', ' ', text.lower()) +
            ' '.join(emoticons).replace('-', ''))
    return text

# \W =matches any non-alphanumeric character;
# \D = matches any non-digit character
#<[^>]*> :all tag  ex..<br />, <a>
# [^>] :except '>' 

In [19]:
preprocessor("</a>This is a test :-)!</a>")

'this is a test :'

In [20]:
df_new['review'] = df_new['review'].apply(preprocessor)

In [21]:
X_train = df_new.loc[:2500, 'review'].values
y_train = df_new.loc[:2500, 'sentiment'].values
X_test = df_new.loc[2500:5000, 'review'].values
y_test = df_new.loc[2500:5000, 'sentiment'].values

In [22]:
print(np.bincount(y_test))
print(np.unique(y_test))

[1216 1285]
[0 1]


In [23]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

In [24]:
tfidf = TfidfVectorizer(stop_words='english')

param_grid = {'clf__C': [1.0, 10.0, 100.0]}

In [25]:
lr_tfidf = Pipeline([('vect', tfidf),
                     ('clf', LogisticRegression())])

gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid,
                           scoring='accuracy',)

#lr_tfidf.steps

In [26]:
gs_lr_tfidf.fit(X_train, y_train)
#print('Best parameter set: %s ' % gs_lr_tfidf.best_params_)
#print('CV Accuracy: %.3f' % gs_lr_tfidf.best_score_)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
  ...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'clf__C': [1.0, 10.0, 100.0]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn', scoring='accuracy',
       verbose=0)

In [27]:
print('Best parameter set: %s ' % gs_lr_tfidf.best_params_)
print('CV Accuracy: %.3f' % gs_lr_tfidf.best_score_)

Best parameter set: {'clf__C': 1.0} 
CV Accuracy: 0.826


In [28]:
clf = gs_lr_tfidf.best_estimator_

In [29]:
clf.score(X_test,y_test)

0.837265093962415

In [30]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

In [31]:
nb = Pipeline([('vect', tfidf),
               ('clf', MultinomialNB())])


In [32]:
nb.fit(X_train,y_train)

Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
  ...rue,
        vocabulary=None)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [33]:
nb.score(X_train,y_train)

0.9724110355857657

In [34]:
nb.score(X_test,y_test)

0.8168732506997202

In [35]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

In [36]:
#cv = CountVectorizer(stop_words='english')
cv = TfidfVectorizer(stop_words='english')

In [37]:
new_data = cv.fit_transform(X_train)
new_test = cv.transform(X_test)

In [38]:
new_data.shape

(2501, 28065)

In [39]:
nb = MultinomialNB()
nb.fit(new_data,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [40]:
nb.score(new_test,y_test)

0.8168732506997202

In [41]:
nb.score(new_data,y_train)

0.9724110355857657

In [42]:
df

Unnamed: 0,review,sentiment
42130,"After watching this film, I was left with a tw...",0
47992,How hard is it to write a watchable film with ...,0
39121,"Not much to say beyond the summary, save that ...",0
46874,When I caught a glimpse of the title I thought...,0
34012,When people ask me why do I like movies so muc...,1
19099,"Hip, distinguished doctor James Coburn (as Pet...",0
39539,Ultimately too silly and pointless. Yes there ...,0
9175,While most movies that pit humans against horr...,1
34171,Will Smith is one of the best actors of all ti...,1
40574,In the title I write that the story is ludicro...,0


In [43]:
df.loc[0]

review       I went and saw this movie last night after bei...
sentiment                                                    1
Name: 0, dtype: object

In [44]:
df.loc[0,'review']

"I went and saw this movie last night after being coaxed to by a few friends of mine. I'll admit that I was reluctant to see it because from what I knew of Ashton Kutcher he was only able to do comedy. I was wrong. Kutcher played the character of Jake Fischer very well, and Kevin Costner played Ben Randall with such professionalism. The sign of a good movie is that it can toy with our emotions. This one did exactly that. The entire theater (which was sold out) was overcome by laughter during the first half of the movie, and were moved to tears during the second half. While exiting the theater I not only saw many women in tears, but many full grown men as well, trying desperately not to let anyone see them crying. This movie was great, and I suggest that you go see it before you judge."

In [45]:
df.loc[2400,'review']

'With all of mainland Europe under his control Hitler prepares for the last obstacle in his way before heading for North America, Great Britain. With an overwhelming edge in aircraft Goering\'s Luftwaffe looks unstoppable on paper. Once in the air however the RAF tenaciously disrupts the paradigm by blowing the enemy out of sky air at a seven to one rate. The Battle of Britain rages on for a over a year as the Island nation is bloodied but unbowed providing crucial time for their American allies to produce more arms for the inevitable struggle. <br /><br />Using more staged footage than the three previous documentaries in the Why We Fight series the Battle of Britain has a more propaganda like feel to it with the dramatized (some with unmistakable Warners music score ) scenes glaringly obvious to newsreel. In an ironic twist amid the devastation caused by German air attacks Beethoven\'s Seventh Symphony is employed to underscore the visual suffering. The story itself is one of remarkab

In [46]:
df.loc[2400,'sentiment']

1

In [47]:
df.loc[1234,'review']

'Fame, I think, was the best movie that I have ever seen. In ways it was funny and dramatic, but that is what makes a movie. True, it has a few loose ends, actually a lot, but I still think that it is a terrific movie. Some of the funny things happen in the audition at the beginning of the movie. I think it is hilarious when the girl tries to act out O.J. Simpson in "The Towering Inferno" and Raul/Ralph goes around to every art department saying that his father was great at every one. He says that his dad danced with the Rockets and left Ralph his tap shoes. The Rockets, as far as I know, are made up of women. And the tap shoes were just regular shoes with bottle caps on the bottom. Also the guy who read the lines of Juliet in the Romeo and Juliet play was funny. One thing about the movie that just turns me on is the music. I have never heard anything like it!!! My favorite song is "I Sing the Body Electric" and my second is the theme song itself "Fame". Irene Cara has a great voice an