## Movie Review Sentiment Analysis

#### 1. Load data

Data are from preserved datasets in nltk library

In [1]:
import nltk                           
nltk.download("movie_reviews")         
from nltk.corpus import movie_reviews

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\Zhimin\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


In [2]:
#'neg/...' means it contains negative review, and 'pos/...' positive review
files=movie_reviews.fileids()
files[:5]+files[-5:]

['neg/cv000_29416.txt',
 'neg/cv001_19502.txt',
 'neg/cv002_17424.txt',
 'neg/cv003_12683.txt',
 'neg/cv004_12641.txt',
 'pos/cv995_21821.txt',
 'pos/cv996_11592.txt',
 'pos/cv997_5046.txt',
 'pos/cv998_14111.txt',
 'pos/cv999_13106.txt']

In [3]:
len(files)

2000

#### 2. Train and test dataset 

In [4]:
#first shuffle files randomly 
from random import shuffle
shuffle(files)

In [5]:
#split into 80% train and 20% test
train=files[:1600]
test=files[1600:]

In [6]:
X_train=[movie_reviews.raw(movie) for movie in train]
X_test=[movie_reviews.raw(movie) for movie in test]

In [7]:
#get the y label
def label(file):
    if file[:3]=='pos':
        return 'positive'
    else:
        return 'negative'

y_train=[label(file) for file in train]
y_test=[label(file) for file in test]

#### 3. Model selection

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer

nlp = spacy.load('en_core_web_sm', disable=['parser','tagger','ner','textcat'])

def tokenize_lemma(text):
    """
    function to stem the words, eg: 'apple', 'apples' both return 'apple'
    """
    return [w.lemma_ for w in nlp(text)]

#since text is stemmed, stop_words should also be stemmed.
stop_words_lemma = set(w.lemma_ for w in nlp(' '.join(STOP_WORDS)))

vectorizer=TfidfVectorizer(stop_words=stop_words_lemma, tokenizer=tokenize_lemma)
classifier=SGDClassifier(max_iter=50)

pipe=Pipeline([('vectorizer', vectorizer), ('classifier', classifier)])

In [17]:
#grid search to find the best estimater
from sklearn.model_selection import GridSearchCV

parameters={#(1,1) means features are single word, (1,2) has single word but also has phrase
            'vectorizer__ngram_range':[(1,1), (1,2)],
            #alpha is regularization parameter, the smaller, the heavier regu;arization
           'classifier__alpha': (0.001, 0.0001, 0.00001),
            #log means linear regression, hinge means linear SVM
           'classifier__loss': ('log', 'hinge')}

gridsearch=GridSearchCV(pipe, parameters, cv=5)
gridsearch.fit(X_train, y_train)
gridsearch.score(X_test, y_test)



0.855

In [18]:
#get the best model
best_model=gridsearch.best_estimator_

In [19]:
#get the parameters in vectorizer of best model
best_model.named_steps['vectorizer']

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words={'anyone', 'fifteen', 'whereby', 'therein', 'until', 'whereupon', 'name', 'we', 'hence', 'take', 'the', 'far', 'when', 'fifty', 'once', 'nevertheless', 'wherein', 'one', 'toward', 'show', 'otherwise', 'by', 'while', 'her', 'indeed', 'top', 'too', 'beside', 'much', 'below', 'nowhere', 'yet...ometime', 'such', 'full', 'myself', '\ufeff1', 'only', 'six', 'whereafter', 'eight', 'quite', 'any'},
        strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function tokenize_lemma at 0x0000028F6A395F28>,
        use_idf=True, vocabulary=None)

In [20]:
#get the parameters in classifier of the best model
best_model.named_steps['classifier']

SGDClassifier(alpha=1e-05, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='log', max_iter=50,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',
       power_t=0.5, random_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False)

#### 4. Model Prediction

In [41]:
best_model.fit(X_train+X_test, y_train+y_test)

Pipeline(memory=None,
     steps=[('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=...m_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False))])

In [50]:
#get some of reviews about movie <Abe> from Rotten Tomatoes
review=["With a more streamlined script, or even fewer characters and more developed relationships, \"Abe\" could have made a real impact. As it stands, there are too many cooks in the kitchen.",
       "A great measure of \"Abe\"'s success is that it made me hungry. More than that, it's the first movie in quite some time to make me smile.",
       "Abe could have been one unforgettable meal, but instead, its aftertaste is barely memorable."
       ]

In [51]:
best_model.predict(review)

array(['negative', 'positive', 'positive'], dtype='<U8')