# Train a simple sentiment analysis model

In [336]:
# Load libraries
import pandas as pd
import numpy as np
import re
import glob
import os

from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.externals import joblib
from sklearn import metrics
from nltk.corpus import stopwords

In [337]:
# Load data
# The first row of the CSV is the header (header=0). Later cells rely on the
# `type`, `label` and `review` columns of this frame.
df = pd.read_csv("../Data/imdb_master.csv", header=0)

In [338]:
def get_data(df, typeof):
    """Return the review texts and labels for one split of the dataset.

    Rows are kept when their `type` column equals `typeof` and their `label`
    is anything other than 'unsup'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns `type`, `label` and `review`.
    typeof : str
        Value of the `type` column to select (e.g. 'train' or 'test').

    Returns
    -------
    tuple of (pandas.Series, pandas.Series)
        The `review` column and the `label` column of the selected rows,
        keeping the original index.
    """
    in_split = df['type'] == typeof
    is_labelled = df['label'] != 'unsup'
    # Only the text and its label are needed downstream
    selected = df.loc[in_split & is_labelled, ['review', 'label']]
    return selected['review'], selected['label']

# Split the labelled rows into train and test reviews/labels via the `type` column
train_x, train_Y = get_data(df, 'train')
test_x, test_Y = get_data(df, 'test')

In [339]:
# Replace every character that is not an ASCII letter or digit with a space,
# then lowercase and trim each training review
features = [
    re.sub(r'[^a-zA-Z0-9]', ' ', review).lower().strip()
    for review in train_x
]

In [340]:
# Tokenise each cleaned review on whitespace and keep only purely alphabetic
# tokens (this drops numbers and mixed tokens such as "2nd").
# Every review is kept -- even one that is empty after cleaning -- so that
# `features` stays aligned row-for-row with `train_Y`. The earlier
# `if features != ''` guard compared the whole list to a string, was always
# true, and therefore never filtered anything.
features = [re.split(r'\s+', D) for D in features]
features = [[x for x in s if x.isalpha()] for s in features]

In [341]:
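# Remove stopwords
# NOTE: this step is disabled for the training data, although the same step is
# applied to the test data (In [352]) and inside predict_review. At training
# time the only stop-word filtering is the vectorizer's stop_words='english'.
#stop = set(stopwords.words('english'))
#features = [[w for w in s if w not in stop] for s in features]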

In [342]:
# Re-assemble each token list into a single space-separated string
features = list(map(' '.join, features))

In [343]:
# Vectorize
# TF-IDF over unigrams and bigrams (ngram_range=(1,2)), with scikit-learn's
# built-in English stop-word list removed by the vectorizer.
tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1,2))

In [344]:
tfidf

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [345]:
# Fit the vectorizer on the training reviews and build the training feature matrix
X = tfidf.fit_transform(features)

In [346]:
# Multinomial Naive Bayes classifier, constructed with its default settings
nb_model = MultinomialNB()

In [347]:
# Train the classifier on the TF-IDF training matrix against the training labels
nb_model.fit(X = X, y = train_Y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [348]:
# Predict on the same matrix the model was just trained on
preds = nb_model.predict(X)

In [349]:
# Training-set accuracy: the model has already seen these reviews, so this is
# not an estimate of performance on unseen data
metrics.accuracy_score(train_Y, preds)

0.9846

In [350]:
# Replace every character that is not an ASCII letter or digit with a space,
# then lowercase and trim each test review
features_test = [
    re.sub(r'[^a-zA-Z0-9]', ' ', review).lower().strip()
    for review in test_x
]

In [351]:
# Tokenise each cleaned test review on whitespace and keep only purely
# alphabetic tokens (drops numbers and mixed tokens such as "2nd").
# Reviews that are empty after cleaning are kept as empty token lists rather
# than filtered out, so `features_test` stays aligned row-for-row with `test_Y`.
features_test = [re.split(r'\s+', D) for D in features_test]
features_test = [[x for x in s if x.isalpha()] for s in features_test]

In [352]:
# Remove NLTK English stopwords from each tokenised test review.
# NOTE(review): the equivalent step for the training features (In [341]) is
# commented out, so the test text is preprocessed differently from the text
# the vectorizer and model were fit on -- confirm whether this is intentional.
stop = set(stopwords.words('english'))
features_test = [[w for w in s if w not in stop] for s in features_test]

In [353]:
# Re-assemble each token list into a single space-separated string
features_test = list(map(' '.join, features_test))

In [354]:
# Reuse the vectorizer fitted on the training reviews (transform only, no refit)
X_test = tfidf.transform(features_test)

In [355]:
# Predict labels for the test reviews
pred_test = nb_model.predict(X_test)

In [356]:
# Accuracy on the test split
metrics.accuracy_score(test_Y, pred_test)

0.85596

In [357]:
pred_test

array(['neg', 'neg', 'neg', ..., 'pos', 'pos', 'pos'], dtype='<U3')

In [358]:
def predict_review(review):
    """Predict the sentiment label of one or more raw review texts.

    Applies the same cleaning steps this notebook uses for the test set, then
    vectorises with the fitted module-level `tfidf` and classifies with the
    fitted module-level `nb_model`.

    Parameters
    ----------
    review : str or iterable of str
        A single review, or a collection of reviews. A bare string is treated
        as one review rather than being iterated character by character.

    Returns
    -------
    numpy.ndarray
        One predicted label per input review, in input order. Reviews that
        are empty after cleaning are still scored (as an all-zero feature row)
        instead of being dropped, so the output length matches the input.
    """
    if isinstance(review, str):
        review = [review]

    # Replace anything that is not an ASCII letter or digit with a space
    cleaned = [re.sub(r'[^a-zA-Z0-9]', ' ', D).lower().strip() for D in review]
    # Split on whitespace and keep only purely alphabetic tokens (drops numbers)
    tokens = [[x for x in re.split(r'\s+', D) if x.isalpha()] for D in cleaned]
    # Remove NLTK English stopwords, as done for the test set
    stop = set(stopwords.words('english'))
    tokens = [[w for w in s if w not in stop] for s in tokens]
    docs = [' '.join(s) for s in tokens]
    return nb_model.predict(tfidf.transform(docs))

In [361]:
predict_review(['not interesting'])

array(['neg'], dtype='<U3')

In [362]:
# Save out model
# Persist both fitted objects: prediction needs the vectorizer
# (tfidf.transform) as well as the classifier (nb_model.predict).
# NOTE(review): `joblib` is imported via `sklearn.externals` at the top of this
# notebook; that alias was deprecated in scikit-learn 0.21 and removed in 0.23,
# so newer versions need the standalone `joblib` package instead.
joblib.dump(tfidf, '../FunctionsCode/sentiment-predictor/tfidf.pkl')
joblib.dump(nb_model, '../FunctionsCode/sentiment-predictor/model.pkl')

['../FunctionsCode/sentiment-predictor/model.pkl']