In [1]:
# Data manipulation
import numpy as np
import pandas as pd
np.random.seed(2)
pd.options.display.max_columns = None

# Plotting
import seaborn as sns
import matplotlib.pyplot as plt
sns.set()
%matplotlib inline

# Modelling 
import lightgbm as lgb
from scipy.stats import expon, randint, uniform
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import cross_validate, GridSearchCV, RandomizedSearchCV, train_test_split

# NLP
from gensim.models import Word2Vec
from nltk import word_tokenize

# Other
import utility as util
import importlib 
importlib.reload(util)
import joblib

# Display all outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## Load preprocessed data

In [2]:
reviews_preprocessed = pd.read_csv('../Datasets/processedAnimeReviews.csv', index_col = 'id')

In [3]:
reviews_preprocessed.head(10)

Unnamed: 0_level_0,workName,overallRating,review,sentiment
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
8121,Cowboy_Bebop,10,cowboy bebop episodic series episodic mean one...,1
63480,Utawarerumono,8,utawarerumono manages one harem anime anyone p...,1
8452,Hajime_no_Ippo,10,first let say fan boxing fact pretty much hate...,1
66544,Gensoumaden_Saiyuuki,9,saiyuki one anime grab first episode let go ev...,1
55936,Ranma_½,7,comedy romance based manga rumiko takahashi ra...,1
22039,Kino_no_Tabi__The_Beautiful_World,9,say anime traveler journeying different countr...,1
68626,Kareshi_Kanojo_no_Jijou,8,kare kano romance anime could become incredibl...,1
18797,Hunter_x_Hunter,10,overall best anime actually seen anything else...,1
43899,Golden_Boy,10,overall honestly really care others opinion an...,1
18796,Hunter_x_Hunter,10,think hear anime people killing poor cute anim...,1


## Apply Word2Vec

In [6]:
# load model
w2v_model = Word2Vec.load('../Models/w2vmodel.bin')

In [7]:
# Get mean feature vector of all words in a sentence
def meanFeatureVec(sentence, word_vectors):
    word_vecs = [word_vectors[word] for word in word_tokenize(sentence)]
    mean_vec = np.asarray(word_vecs).mean(axis=0)
    return mean_vec

# Takes a dataframe of the reviews and returns a new dataframe of the word embeddings per review
def reviewToVectors(sentences, word_vectors):
    sent_vecs = [meanFeatureVec(sentence, word_vectors) for sentence in sentences]
    df = pd.DataFrame(sent_vecs, index=sentences.index)
    return df

In [8]:
# # Convert reviews to word embeddings
# X_vectors = reviewToVectors(reviews_preprocessed['review'], w2v_model.wv)

# Load presaved results
X_vectors = pd.read_csv('../Datasets/reviewsWordVectors.csv', index_col='id')

# # Save reviews
# X_vectors.to_csv('../Datasets/reviewsWordVectors.csv')

In [9]:
# Split into train and test
y = reviews_preprocessed[['sentiment', 'overallRating']]

X_train, X_test, y_train, y_test = train_test_split(X_vectors, y, test_size=0.1, random_state=2)

## Dummy model

In [10]:
metrics = ['accuracy', 'recall', 'precision', 'f1', 'roc_auc']

In [10]:
clf_dummy = DummyClassifier()
util.cross_validate_scores(clf_dummy, X_train, y_train['sentiment'], cv=5, metrics=metrics)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Training scores
accuracy: 0.6903 (0.0010)
recall: 0.8088 (0.0004)
precision: 0.8088 (0.0007)
f1: 0.8082 (0.0005)
roc_auc: 0.5008 (0.0014)

Validation Scores
accuracy: 0.6915 (0.0009)
recall: 0.8091 (0.0019)
precision: 0.8093 (0.0015)
f1: 0.8094 (0.0017)
roc_auc: 0.4990 (0.0032)


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.9s finished


Above is the worst baseline/worst accuracy we will be using. The above strategy randomly predicts a class based on the probability distribution of the observed samples.

## LightGBM Model

In [59]:
clf_lgb = lgb.LGBMClassifier(n_jobs=-1)

util.cross_validate_scores(clf_lgb, X_train, y_train['sentiment'], cv=5, metrics=metrics)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Training scores
accuracy: 0.8699 (0.0006)
recall: 0.9725 (0.0002)
precision: 0.8795 (0.0004)
f1: 0.9237 (0.0003)
roc_auc: 0.9016 (0.0004)

Validation Scores
accuracy: 0.8528 (0.0016)
recall: 0.9641 (0.0013)
precision: 0.8684 (0.0010)
f1: 0.9137 (0.0010)
roc_auc: 0.8592 (0.0029)


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   38.6s finished


From the average fold validation scores it seems like the model is doing a pretty good job of not overfitting. As well it does much better than the baseline dummy model. We can perform some grid searching to try and gain lower bias ...

### Randomized search

Randomized search was chosen over the brute force grid search as grid searching would've taken too long and random search provides decent results in a much smaller amount of time

In [10]:
# Start with frehs model
clf_lgb = lgb.LGBMClassifier(n_jobs=-1)

lgb_params_dists = {
    'num_leaves': randint(1, 100),
    'max_depth': [-1, 2, 4, 8],
    'min_child_samples': randint(1, 30),
    'min_child_weight': uniform(1e-4, 1e-1),
    'n_estimators': randint(100, 2000),
    'reg_alpha': uniform(1e-3, 1e2),
    'reg_lambda': uniform(1e-3, 1e2),
}

lgb_random_search = RandomizedSearchCV(
    estimator = clf_lgb, 
    param_distributions = lgb_params_dists, 
    n_iter = 100,
    scoring = metrics,
    refit = 'accuracy',
    cv = 5,
    iid = True,
    n_jobs = -1,
    return_train_score = True,
    verbose = 1)

In [None]:
# Fit grid search
lgb_random_search.fit(X_train, y_train['sentiment'])

# Save grid search results 
joblib.dump(lgb_random_search, '../Models/lgb_grid_search.pkl')

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 20.0min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 82.2min
exception calling callback for <Future at 0x20b5e3ed4c8 state=finished raised TerminatedWorkerError>
Traceback (most recent call last):
  File "D:\ProgramData\Anaconda3\envs\csi-4106-project\lib\site-packages\joblib\externals\loky\_base.py", line 625, in _invoke_callbacks
    callback(self)
  File "D:\ProgramData\Anaconda3\envs\csi-4106-project\lib\site-packages\joblib\parallel.py", line 340, in __call__
    self.parallel.dispatch_next()
  File "D:\ProgramData\Anaconda3\envs\csi-4106-project\lib\site-packages\joblib\parallel.py", line 768, in dispatch_next
    if not self.dispatch_one_batch(self._original_iterator):
  File "D:\ProgramData\Anaconda3\envs\csi-4106-project\lib\site-packages\joblib\parallel.py", line 834, in dispatch_one_batch
    self._dispatch(tasks)
  File "D:\ProgramData\A