In [23]:
import pandas as pd
import pickle
import numpy as np
from sklearn.linear_model import LogisticRegressionCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
import string
import ast
import re
import warnings
from meter import *
warnings.filterwarnings('ignore')

In [7]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D, Dropout
from keras.layers import Conv1D, MaxPooling1D, Embedding, Flatten, LSTM
from keras.models import Model, Sequential, load_model
from keras.optimizers import Adam
from gensim.models.word2vec import Word2Vec

### Data preprocessing

In [11]:
# read in data
imdb_movie = pd.read_csv('data/imdb_multilabel.csv')

# drop movies with unknown plot
imdb_movie['plot'] = imdb_movie['plot'].apply(lambda sentence: 'drop' if (('plot ' in sentence.lower()) 
                                              and ('unknown' in sentence.lower()))
                                              or (len(sentence.split()) < 100) 
                                              else sentence)
imdb_movie = imdb_movie[imdb_movie['plot'] != 'drop']

# single-label encoding
genre_dict = dict(zip(imdb_movie.genre.unique(), range(20)))
genre_dict['sci-fi'] = 8
imdb_movie['genre_code'] = imdb_movie.genre.replace(genre_dict).values

# multi-label encoded as an array
def multi_label_encoder(all_genre_list):
    """ This function takes a list of genre with a dictionary that keeps track of the index of the genre
    INPUTS
    ------
    all_genre_list: list of genres
    genre_dict: dictionary of indexs
    
    OUTPUTS
    -------
    np array in {0, 1}
    """
    encode = np.zeros(20)
    all_genre_list = ast.literal_eval(all_genre_list)
    for genre in all_genre_list:
        if genre.lower() in genre_dict:
            encode[genre_dict[genre.lower()]] = 1
    return list(encode)

imdb_movie['all_genre_encode'] = imdb_movie['all_genre'].apply(multi_label_encoder)
imdb_movie['plot_list'] = imdb_movie['plot_list'].apply(ast.literal_eval)

# train test split
X_train, X_test, y_train, y_test = train_test_split(imdb_movie['plot'], 
                                                    imdb_movie['all_genre_encode'],
                                                    test_size = 0.2,
                                                    random_state = 209,
                                                    stratify = imdb_movie['genre'],
                                                    shuffle = True
                                                    )

# reshape y_train and y_test
y_train = np.array(list(y_train))
y_test = np.array(list(y_test))

In [3]:
MAX_SEQUENCE_LENGTH = 200
EMBEDDING_DIM = 300

### Load data

In [10]:
w2v_matrix = np.load('w2v_matrix.npy')
X_w2v_train = np.load('X_w2v_train.npy')
y_train = np.load('y_train.npy')
X_w2v_test = np.load('X_w2v_test.npy')
y_test = np.load('y_test.npy')

### GPU data preprocessing (don't rerun the following part on JupyterHub)

---

In [12]:
train_texts = X_train.apply(lambda x: 
                            re.sub('['+string.punctuation+']', 
                                   '', x.strip())).values
test_texts = X_test.apply(lambda x: 
                          re.sub('['+string.punctuation+']', 
                                 '', x.strip())).values

In [13]:
w2v_model = Word2Vec([t.split() for t in train_texts], size=EMBEDDING_DIM, min_count=1)

tokenizer = Tokenizer(num_words=None)
tokenizer.fit_on_texts(train_texts)
word_index = tokenizer.word_index
num_words = len(word_index) + 1

sequences = tokenizer.texts_to_sequences(train_texts)
X_w2v_train = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

test_sequences = tokenizer.texts_to_sequences(test_texts)
X_w2v_test = pad_sequences(test_sequences, 
                             maxlen=MAX_SEQUENCE_LENGTH)

In [14]:
w2v_matrix = np.zeros((num_words, EMBEDDING_DIM))

for word, i in word_index.items():
    if word in w2v_model.wv:
        embedding_vector = w2v_model.wv[word]
    else:
        embedding_vector = 0
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        w2v_matrix[i] = embedding_vector

In [101]:
np.save('w2v_matrix.npy', w2v_matrix)
np.save('X_w2v_train.npy', X_w2v_train)
np.save('y_train.npy', y_train)
np.save('X_w2v_test.npy', X_w2v_test)
np.save('y_test.npy', y_test)

### Models - Mean of word2vec embedding

In [15]:
class MeanEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.dim = EMBEDDING_DIM


    def transform(self, X):
        return np.array([
            np.mean([self.word2vec[w] for w in re.sub('['+string.punctuation+']', '', words.strip()) if w in self.word2vec] or [np.zeros(self.dim)], axis=0)
            for words in X
        ])

mean_embed = MeanEmbeddingVectorizer(w2v_model.wv)
word2vec_X_train = mean_embed.transform(X_train)
word2vec_X_test = mean_embed.transform(X_test)

In [16]:
word2vec_X_train.shape

(3489, 300)

### Random Forests

In [17]:
rf_tuning_parameters = {'estimator__n_estimators':[50, 100, 200], 
                        'estimator__max_depth':[100, 200, 500]}

In [20]:
rf = OneVsRestClassifier(RandomForestClassifier())
rf = GridSearchCV(rf, param_grid = rf_tuning_parameters, cv=5, n_jobs=-1)
rf.fit(word2vec_X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=OneVsRestClassifier(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          n_jobs=1),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'estimator__n_estimators': [50, 100, 200], 'estimator__max_depth': [100, 200, 500]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [21]:
rf.best_params_

{'estimator__max_depth': 200, 'estimator__n_estimators': 200}

In [24]:
# save model
filename = 'rf_w2v.pkl'
pickle.dump(rf, open(filename, 'wb'))

In [25]:
# load the model from disk
loaded_rf = pickle.load(open(filename, 'rb'))

In [26]:
rf_train_pred = loaded_rf.predict(word2vec_X_train)
rf_test_pred = loaded_rf.predict(word2vec_X_test)

In [27]:
# evaluation
print('Random Forest Avg Accuracy on train : {}'.format(score(y_train, rf_train_pred, 'avg')))
print('Random Forest Exact Accuracy on train : {}'.format(score(y_train, rf_train_pred, 'exact')))
print('Random Forest Precision on train : {}'.format(score(y_train, rf_train_pred, 'precision')))
print('Random Forest Recall on train : {}'.format(score(y_train, rf_train_pred, 'recall')))
print('Random Forest Hit Rate on train : {}'.format(score(y_train, rf_train_pred, 'hit')))
print('Random Forest F1 Score on train : {}'.format(score(y_train, rf_train_pred, 'f1')))

Random Forest Avg Accuracy on train : 1.0
Random Forest Exact Accuracy on train : 1.0
Random Forest Precision on train : 1.0
Random Forest Recall on train : 1.0
Random Forest Hit Rate on train : 1.0
Random Forest F1 Score on train : 1.0


In [28]:
# evaluation
print('Random Forest Avg Accuracy on test : {}'.format(score(y_test, rf_test_pred, 'avg')))
print('Random Forest Exact Accuracy on test : {}'.format(score(y_test, rf_test_pred, 'exact')))
print('Random Forest Precision on test : {}'.format(score(y_test, rf_test_pred, 'precision')))
print('Random Forest Recall on test : {}'.format(score(y_test, rf_test_pred, 'recall')))
print('Random Forest Hit Rate on test : {}'.format(score(y_test, rf_test_pred, 'hit')))
print('Random Forest F1 Score on test : {}'.format(score(y_test, rf_test_pred, 'f1')))

Random Forest Avg Accuracy on test : 0.8361970217640321
Random Forest Exact Accuracy on test : 0.013745704467353952
Random Forest Precision on test : 0.1545772624185405
Random Forest Recall on test : 0.050955823328328154
Random Forest Hit Rate on test : 0.40778923253150057
Random Forest F1 Score on test : 0.07664568111507343


### Logistic Regression

In [30]:
lr = OneVsRestClassifier(LogisticRegressionCV(cv = 5))
lr.fit(word2vec_X_train, y_train)
lr_train_pred = lr.predict(word2vec_X_train)
lr_test_pred = lr.predict(word2vec_X_test)

In [31]:
# save model
filename = 'lr_w2v.pkl'
pickle.dump(lr, open(filename, 'wb'))

In [32]:
# load the model from disk
loaded_lr = pickle.load(open(filename, 'rb'))

In [33]:
# evaluation
print('Logistic Regression Avg Accuracy on train : {}'.format(score(y_train, lr_train_pred, 'avg')))
print('Logistic Regression Exact Accuracy on train : {}'.format(score(y_train, lr_train_pred, 'exact')))
print('Logistic Regression Precision on train : {}'.format(score(y_train, lr_train_pred, 'precision')))
print('Logistic Regression Recall on train : {}'.format(score(y_train, lr_train_pred, 'recall')))
print('Logistic Regression Hit Rate on train : {}'.format(score(y_train,lr_train_pred, 'hit')))
print('Logistic Regression F1 Score on train : {}'.format(score(y_train, lr_train_pred, 'f1')))

Logistic Regression Avg Accuracy on train : 0.8476784178847807
Logistic Regression Exact Accuracy on train : 0.022069360848380626
Logistic Regression Precision on train : 0.17117753447775025
Logistic Regression Recall on train : 0.038822209695275486
Logistic Regression Hit Rate on train : 0.4044138721696761
Logistic Regression F1 Score on train : 0.06329045937446505


In [34]:
# evaluation
print('Logistic Regression Avg Accuracy on test : {}'.format(score(y_test, lr_test_pred, 'avg')))
print('Logistic Regression Exact Accuracy on test : {}'.format(score(y_test, lr_test_pred, 'exact')))
print('Logistic Regression Precision on test : {}'.format(score(y_test, lr_test_pred, 'precision')))
print('Logistic Regression Recall on test : {}'.format(score(y_test, lr_test_pred, 'recall')))
print('Logistic Regression Hit Rate on test : {}'.format(score(y_test, lr_test_pred, 'hit')))
print('Logistic Regression F1 Score on test : {}'.format(score(y_test, lr_test_pred, 'f1')))

Logistic Regression Avg Accuracy on test : 0.8456471935853379
Logistic Regression Exact Accuracy on test : 0.018327605956471937
Logistic Regression Precision on test : 0.05637432188065099
Logistic Regression Recall on test : 0.039566905309413196
Logistic Regression Hit Rate on test : 0.4066437571592211
Logistic Regression F1 Score on test : 0.04649841410336052


### SVM

In [35]:
svm_tuning_parameters = {'estimator__C': [0.1, 5, 50, 100, 1000],
                         'estimator__kernel': ['rbf']}
svm = OneVsRestClassifier(SVC())
svm = GridSearchCV(svm, param_grid = svm_tuning_parameters, cv=5, n_jobs=-1)

In [36]:
svm.fit(word2vec_X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=OneVsRestClassifier(estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
          n_jobs=1),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'estimator__C': [0.1, 5, 50, 100, 1000], 'estimator__kernel': ['rbf']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [37]:
# save model
filename = 'svm_word2vec.pkl'
pickle.dump(svm, open(filename, 'wb'))

In [38]:
loaded_svm = pickle.load(open(filename, 'rb'))

In [39]:
svm_train_pred = loaded_svm.predict(word2vec_X_train)
svm_test_pred = loaded_svm.predict(word2vec_X_test)

In [40]:
# evaluation
print('SVM Avg Accuracy on train : {}'.format(score(y_train, svm_train_pred, 'avg')))
print('SVM Exact Accuracy on train : {}'.format(score(y_train, svm_train_pred, 'exact')))
print('SVM Precision on train : {}'.format(score(y_train, svm_train_pred, 'precision')))
print('SVM Recall on train : {}'.format(score(y_train, svm_train_pred, 'recall')))
print('SVM Hit Rate on train : {}'.format(score(y_train,lr_train_pred, 'hit')))
print('SVM F1 Score on train : {}'.format(score(y_train, lr_train_pred, 'f1')))

SVM Avg Accuracy on train : 0.8427199770707939
SVM Exact Accuracy on train : 0.02722843221553454
SVM Precision on train : 0.026769848094009747
SVM Recall on train : 0.05
SVM Hit Rate on train : 0.4044138721696761
SVM F1 Score on train : 0.06329045937446505


In [41]:
# evaluation
print('SVM Avg Accuracy on test : {}'.format(score(y_test, svm_test_pred, 'avg')))
print('SVM Exact Accuracy on test : {}'.format(score(y_test, svm_test_pred, 'exact')))
print('SVM Precision on test : {}'.format(score(y_test, svm_test_pred, 'precision')))
print('SVM Recall on test : {}'.format(score(y_test, svm_test_pred, 'recall')))
print('SVM Hit Rate on test : {}'.format(score(y_test, svm_test_pred, 'hit')))
print('SVM F1 Score on test : {}'.format(score(y_test, svm_test_pred, 'f1')))

SVM Avg Accuracy on test : 0.8403780068728522
SVM Exact Accuracy on test : 0.026345933562428408
SVM Precision on test : 0.026403207331042382
SVM Recall on test : 0.05
SVM Hit Rate on test : 0.5280641466208477
SVM F1 Score on test : 0.034557721139430286


### LSTM

In [19]:
lstm_w2v = Sequential()
e = Embedding(w2v_matrix.shape[0], EMBEDDING_DIM, 
              weights=[w2v_matrix], 
              input_length=MAX_SEQUENCE_LENGTH, 
              trainable=False)
lstm_w2v.add(e)
# lstm_w2v.add(Dropout(0.5))
# lstm_w2v.add(Conv1D(40, 2))
lstm_w2v.add(LSTM(80, dropout = 0.5, recurrent_dropout=0.5))
# lstm_w2v.add(Dense(40, activation='relu'))
lstm_w2v.add(Dense(20, activation='sigmoid'))
lstm_w2v.compile(optimizer=Adam(lr=0.002), loss='binary_crossentropy', metrics=['acc'])

In [4]:
lstm_w2v.compile(optimizer=Adam(lr=0.001), loss='binary_crossentropy', metrics=['acc'])
lstm_w2v.fit(X_w2v_train, y_train, validation_data= (X_w2v_test, y_test),
               batch_size=128,
               epochs=120);

In [44]:
# lstm_w2v.save('lstm_w2v_55_25_150')

In [4]:
lstm_w2v = load_model('lstm_w2v_55_25_150')

In [5]:
lstm_w2v.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 200, 300)          13495200  
_________________________________________________________________
lstm_4 (LSTM)                (None, 80)                121920    
_________________________________________________________________
dense_4 (Dense)              (None, 20)                1620      
Total params: 13,618,740
Trainable params: 123,540
Non-trainable params: 13,495,200
_________________________________________________________________


In [6]:
lstm_pred_w2v_test = lstm_w2v.predict(X_w2v_test)
# evaluation
print('Word2Vec Avg Accuracy on test : {}'.format(score_thres(y_test, lstm_pred_w2v_test, method='avg')))
print('Word2Vec Exact Accuracy on test : {}'.format(score_thres(y_test, lstm_pred_w2v_test, method='exact')))
print('Word2Vec Precision on test : {}'.format(score_thres(y_test, lstm_pred_w2v_test, method='precision')))
print('Word2Vec Recall on test : {}'.format(score_thres(y_test, lstm_pred_w2v_test, method='recall')))
print('Word2Vec Hit Rate on test : {}'.format(score_thres(y_test, lstm_pred_w2v_test, method='hit')))
print('Word2VecF1 on test : {}'.format(score_thres(y_test, lstm_pred_w2v_test, method='f1')))


Word2Vec Avg Accuracy on test : 0.8624856815578466
Word2Vec Exact Accuracy on test : 0.061855670103092786
Word2Vec Precision on test : 0.5525787967604396
Word2Vec Recall on test : 0.2585768591891645
Word2Vec Hit Rate on test : 0.7800687285223368
Word2VecF1 on test : 0.3522975859758274
