# Sentiment Analysis

Steps:

Load IMBDb movie review dataset

Build feature vector from text data
- Bag-of-Words
- TF-IDF
- Tokens/n-grams
- Stemming
- Stop-words

Train ML model to classify positive vs negative reviews

Implement online algorithm and Out-of-Core learning

1. obtain the movie dataset

In [2]:
import pandas as pd
import os
import pyprind

pwd = os.getcwd()
print(pwd+'/aclImdb/')
file = os.listdir(pwd)
basepath = '/Users/zhangdoudou/Desktop/aclImdb'

pbar = pyprind.ProgBar(50000)

labels = {'pos':1, 'neg':0}

df = pd.DataFrame()

for s in ('test','train'):
    for l in ('pos','neg'):
        path = os.path.join(basepath,s,l)
        for file in sorted(os.listdir(path)):
            with open(os.path.join(path, file), 'r', encoding='utf-8') as infile:

                try:
                    txt = infile.read()
                    df = df.append([ [txt, labels[l]] ], ignore_index=True)
                    pbar.update()
                except:
                    pass

df.columns = ['review', 'sentiment']



/Users/zhangdoudou/Desktop/Lecture 4/aclImdb/


0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:02:49


In [3]:
import numpy as np

np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))
df.to_csv('movie_data.csv', index = False, encoding='utf-8')

In [4]:
import pandas as pd
import os

pwd = os.getcwd()

df = pd.read_csv( pwd+'/movie_data.csv', encoding='utf-8')
df.columns = ['review', 'sentiment']

print(df.shape)

(50000, 2)


2. preprocess

bag-of-words model

In [5]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer()

docs = np.array(['The sun is shining', 
                 'The weather is sweet',
                 'The sun is shining and the weather is sweet'])

bag = count.fit_transform(docs)

print(count.vocabulary_)
print(bag.toarray())

{'the': 5, 'sun': 3, 'is': 1, 'shining': 2, 'weather': 6, 'sweet': 4, 'and': 0}
[[0 1 1 1 0 1 0]
 [0 1 0 0 1 1 1]
 [1 2 1 1 1 2 1]]


cleaning text data

In [6]:
import re

def preprocessor(text): 
# find '<' then anything not '>' [^>], [^>]* 0 or more prefix, then close with '>'    
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text) 
    # eyes[:,;,=], optional nose [-], and mouth[),(,D,P)]
    text = re.sub('[\W]+', ' ', text.lower()) +\
        ' '.join(emoticons).replace('-', '')
    return text

df['review'] = df['review'].apply(preprocessor)

In [7]:
df.iloc[0,0]

'in 1974 the teenager martha moxley maggie grace moves to the high class area of belle haven greenwich connecticut on the mischief night eve of halloween she was murdered in the backyard of her house and her murder remained unsolved twenty two years later the writer mark fuhrman christopher meloni who is a former la detective that has fallen in disgrace for perjury in o j simpson trial and moved to idaho decides to investigate the case with his partner stephen weeks andrew mitchell with the purpose of writing a book the locals squirm and do not welcome them but with the support of the retired detective steve carroll robert forster that was in charge of the investigation in the 70 s they discover the criminal and a net of power and money to cover the murder murder in greenwich is a good tv movie with the true story of a murder of a fifteen years old girl that was committed by a wealthy teenager whose mother was a kennedy the powerful and rich family used their influence to cover the mur

3. Training Logistic Regression for document classification

logistic regression

In [30]:
X_train = df.loc[:25000, 'review'].values
y_train = df.loc[:25000, 'sentiment'].values

X_test  = df.loc[25000:, 'review'].values
y_test  = df.loc[25000:, 'sentiment'].values

In [31]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression 
from sklearn.feature_extraction.text import TfidfVectorizer

In [32]:
tfidf = TfidfVectorizer(strip_accents = None,
                        lowercase = False,
                        preprocessor = None)



In [33]:
param_grid = [
              {'vect__ngram_range':[(1,1)],
               'vect__stop_words': [ None], # stop
               'vect__tokenizer': [tokenizer], #, tokenizer_porter],
               'clf__penalty': ['l1'], # , 'l2'
               'clf__C': [100]}, # 0.1, 1.0, 10.0, 100.0]},
              
              {'vect__ngram_range': [(1,1)],
               'vect__stop_words': [None],
               'vect__tokenizer': [tokenizer], #, tokenizer_porter],
               'vect__use_idf':[False],
               'vect__norm':[None],
               'clf__penalty': ['l1'], # , 'l2'
               'clf__C':[100]} #[0.1, 1.0,10.0,100.0]}
                ]



In [34]:
lr_tfidf = Pipeline([ ('vect', tfidf) ,
                      ('clf',  LogisticRegression(random_state=0))])



In [35]:
gs_lr_tfidf = GridSearchCV( lr_tfidf, param_grid,
                          scoring = 'accuracy',
                          cv = 5, verbose = 1,n_jobs=-1) # n_jobs -1 uses all computer cores

In [36]:
gs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  2.4min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...e, penalty='l2', random_state=0, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid=[{'vect__ngram_range': [(1, 1)], 'vect__stop_words': [None], 'vect__tokenizer': [<function tokenizer at 0x1a23db8d90>], 'clf__penalty': ['l1'], 'clf__C': [100]}, {'vect__ngram_range': [(1, 1)], 'vect__stop_words': [None], 'vect__tokenizer': [<function tokenizer at 0x1a23db8d90>], 'vect__use_idf': [False], 'vect__norm': [None], 'clf__penalty': ['l1'], 'clf__C': [100]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_scor

In [37]:
print('The Best parameter set: %s' % gs_lr_tfidf.best_params_)

The Best parameter set: {'clf__C': 100, 'clf__penalty': 'l1', 'vect__ngram_range': (1, 1), 'vect__stop_words': None, 'vect__tokenizer': <function tokenizer at 0x1a23db8d90>}


In [38]:
print('CV Accuracy: %.3f'
     % gs_lr_tfidf.best_score_)

clf = gs_lr_tfidf.best_estimator_

print('Test Accuracy: %.3f' % clf.score(X_test, y_test))



CV Accuracy: 0.868
Test Accuracy: 0.875


-Use f1 score as our metric

In [39]:
gs_lr_tfidf = GridSearchCV( lr_tfidf, param_grid,
                          scoring = 'f1',
                          cv = 5, verbose = 1,n_jobs=-1) 

In [40]:
gs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  2.4min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...e, penalty='l2', random_state=0, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid=[{'vect__ngram_range': [(1, 1)], 'vect__stop_words': [None], 'vect__tokenizer': [<function tokenizer at 0x1a23db8d90>], 'clf__penalty': ['l1'], 'clf__C': [100]}, {'vect__ngram_range': [(1, 1)], 'vect__stop_words': [None], 'vect__tokenizer': [<function tokenizer at 0x1a23db8d90>], 'vect__use_idf': [False], 'vect__norm': [None], 'clf__penalty': ['l1'], 'clf__C': [100]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_scor

In [41]:
print('The Best parameter set: %s' % gs_lr_tfidf.best_params_)

The Best parameter set: {'clf__C': 100, 'clf__penalty': 'l1', 'vect__ngram_range': (1, 1), 'vect__stop_words': None, 'vect__tokenizer': <function tokenizer at 0x1a23db8d90>}


In [42]:
print('CV F1 Score: %.3f'
     % gs_lr_tfidf.best_score_)

clf = gs_lr_tfidf.best_estimator_

print('Test F1 Score: %.3f' % clf.score(X_test, y_test))



CV F1 Score: 0.870
Test F1 Score: 0.875


-Using ROC-AUC as our evaluation metric

In [47]:
gs_lr_tfidf = GridSearchCV( lr_tfidf, param_grid,
                          scoring = 'roc_auc',
                          cv = 5, verbose = 1,n_jobs=-1) # n_jobs -1 uses all computer cores

In [48]:
gs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  2.4min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...e, penalty='l2', random_state=0, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid=[{'vect__ngram_range': [(1, 1)], 'vect__stop_words': [None], 'vect__tokenizer': [<function tokenizer at 0x1a23db8d90>], 'clf__penalty': ['l1'], 'clf__C': [100]}, {'vect__ngram_range': [(1, 1)], 'vect__stop_words': [None], 'vect__tokenizer': [<function tokenizer at 0x1a23db8d90>], 'vect__use_idf': [False], 'vect__norm': [None], 'clf__penalty': ['l1'], 'clf__C': [100]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_scor

In [45]:
print('The Best parameter set: %s' % gs_lr_tfidf.best_params_)

The Best parameter set: {'clf__C': 100, 'clf__penalty': 'l1', 'vect__ngram_range': (1, 1), 'vect__stop_words': None, 'vect__tokenizer': <function tokenizer at 0x1a23db8d90>}


In [49]:
print('CV ROC/AUC: %.3f'
     % gs_lr_tfidf.best_score_)

clf = gs_lr_tfidf.best_estimator_

print('Test ROC/AUC: %.3f' % clf.score(X_test, y_test))

CV ROC/AUC: 0.942
Test ROC/AUC: 0.875
