## Data Science Final Project : Glove
### Group Members : Jiachang Shi, Boyuan Sun, Xiangru Shu

### Library Used

In [87]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegressionCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
import string
import ast
import re
import warnings
from meter import *
warnings.filterwarnings('ignore')

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding, Flatten, LSTM, Dropout, RNN
from keras.models import Model, Sequential
from keras.optimizers import SGD, RMSprop, Adam

### Data Proprocessing

In [88]:
# read in data
imdb_movie = pd.read_csv('data/imdb_multilabel.csv')

# drop movies with unknown plot
imdb_movie['plot'] = imdb_movie['plot'].apply(lambda sentence: 'drop' if (('plot ' in sentence.lower()) 
                                              and ('unknown' in sentence.lower()))
                                              or (len(sentence.split()) < 100) 
                                              else sentence)
imdb_movie = imdb_movie[imdb_movie['plot'] != 'drop']

# single-label encoding
genre_dict = dict(zip(imdb_movie.genre.unique(), range(20)))
genre_dict['sci-fi'] = 8
imdb_movie['genre_code'] = imdb_movie.genre.replace(genre_dict).values

# multi-label encoded as an array
def multi_label_encoder(all_genre_list):
    """ This function takes a list of genre with a dictionary that keeps track of the index of the genre
    INPUTS
    ------
    all_genre_list: list of genres
    genre_dict: dictionary of indexs
    
    OUTPUTS
    -------
    np array in {0, 1}
    """
    encode = np.zeros(20)
    all_genre_list = ast.literal_eval(all_genre_list)
    for genre in all_genre_list:
        if genre.lower() in genre_dict:
            encode[genre_dict[genre.lower()]] = 1
    return list(encode)

imdb_movie['all_genre_encode'] = imdb_movie['all_genre'].apply(multi_label_encoder)
imdb_movie['plot_list'] = imdb_movie['plot_list'].apply(ast.literal_eval)

# train test split
X_train, X_test, y_train, y_test = train_test_split(imdb_movie['plot'], 
                                                    imdb_movie['all_genre_encode'],
                                                    test_size = 0.2,
                                                    random_state = 209,
                                                    stratify = imdb_movie['genre'],
                                                    shuffle = True
                                                    )

# reshape y_train and y_test
y_train = np.array(list(y_train))
y_test = np.array(list(y_test))

###  Glove Representation of Words

In [89]:
with open("glove.6B.300d.txt", "rb") as lines:
    w2v = {line.decode("utf-8").split()[0]: np.array(line.split()[1:]).astype(float)
           for line in lines}

In [90]:
class MeanEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.dim = 300


    def transform(self, X):
        return np.array([
            np.mean([self.word2vec[w] for w in re.sub('['+string.punctuation+']', '', words.strip()) if w in self.word2vec] or [np.zeros(self.dim)], axis=0)
            for words in X
        ])

mean_embed = MeanEmbeddingVectorizer(w2v)
glove_X_train = mean_embed.transform(X_train)
glove_X_test = mean_embed.transform(X_test)

In [91]:
# from collections import defaultdict
# class TfidfEmbeddingVectorizer(object):
#     def __init__(self, word2vec):
#         self.word2vec = word2vec
#         self.word2weight = None
#         self.dim = 300

#     def fit(self, X):
#         tfidf = TfidfVectorizer(analyzer=lambda x: x)
#         tfidf.fit(X)
#         # if a word was never seen - it must be at least as infrequent
#         # as any of the known words - so the default idf is the max of 
#         # known idf's
#         max_idf = max(tfidf.idf_)
#         self.word2weight = defaultdict(
#             lambda: max_idf,
#             [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

#         return self

#     def transform(self, X):
#         return np.array([
#                 np.mean([self.word2vec[w] * self.word2weight[w]
#                          for w in words if w in self.word2vec] or
#                         [np.zeros(self.dim)], axis=0)
#                 for words in X
#             ])
    
# tf_embed = TfidfEmbeddingVectorizer(w2v)
# tf_embed.fit(X_train)
# glove_X_train = tf_embed.transform(X_train)
# glove_X_test = tf_embed.transform(X_test)

In [92]:
glove_X_train.shape

(3489, 300)

### Random Forests

In [93]:
rf = OneVsRestClassifier(RandomForestClassifier())
rf.fit(glove_X_train, y_train)

rf_train_pred = rf.predict(glove_X_train)
rf_test_pred = rf.predict(glove_X_test)

In [94]:
# evaluation
print('Random Forest Avg Accuracy on train : {}'.format(score(y_train, rf_train_pred, 'avg')))
print('Random Forest Exact Accuracy on train : {}'.format(score(y_train, rf_train_pred, 'exact')))
print('Random Forest Precision on train : {}'.format(score(y_train, rf_train_pred, 'precision')))
print('Random Forest Recall on train : {}'.format(score(y_train, rf_train_pred, 'recall')))

Random Forest Avg Accuracy on train : 0.9825738033820578
Random Forest Exact Accuracy on train : 0.707079392376039
Random Forest Precision on train : 0.9991640440168593
Random Forest Recall on train : 0.849986741459887


In [95]:
# evaluation
print('Random Forest Avg Accuracy on test : {}'.format(score(y_test, rf_test_pred, 'avg')))
print('Random Forest Exact Accuracy on test : {}'.format(score(y_test, rf_test_pred, 'exact')))
print('Random Forest Precision on test : {}'.format(score(y_test, rf_test_pred, 'precision')))
print('Random Forest Recall on test : {}'.format(score(y_test, rf_test_pred, 'recall')))

Random Forest Avg Accuracy on test : 0.8313287514318443
Random Forest Exact Accuracy on test : 0.014891179839633447
Random Forest Precision on test : 0.25802607561588775
Random Forest Recall on test : 0.06957369331005318


### Logistic Regression

In [97]:
lr = OneVsRestClassifier(LogisticRegressionCV(cv = 5))
lr.fit(glove_X_train, y_train)
lr_train_pred = lr.predict(glove_X_train)
lr_test_pred = lr.predict(glove_X_test)

In [100]:
# evaluation
print('Logistic Regression Avg Accuracy on train : {}'.format(score(y_train, lr_train_pred, 'avg')))
print('Logistic Regression Exact Accuracy on train : {}'.format(score(y_train, lr_train_pred, 'exact')))
print('Logistic Regression Precision on train : {}'.format(score(y_train, lr_train_pred, 'precision')))
print('Logistic Regression Recall on train : {}'.format(score(y_train, lr_train_pred, 'recall')))

Logistic Regression Avg Accuracy on train : 0.850902837489252
Logistic Regression Exact Accuracy on train : 0.025508741759816565
Logistic Regression Precision on train : 0.49102516368430926
Logistic Regression Recall on train : 0.06456805635475486


In [98]:
# evaluation
print('Logistic Regression Avg Accuracy on test : {}'.format(score(y_test, lr_test_pred, 'avg')))
print('Logistic Regression Exact Accuracy on test : {}'.format(score(y_test, lr_test_pred, 'exact')))
print('Logistic Regression Precision on test : {}'.format(score(y_test, lr_test_pred, 'precision')))
print('Logistic Regression Recall on test : {}'.format(score(y_test, lr_test_pred, 'recall')))

Logistic Regression Avg Accuracy on test : 0.8460481099656357
Logistic Regression Exact Accuracy on test : 0.024054982817869417
Logistic Regression Precision on test : 0.2552112387653064
Logistic Regression Recall on test : 0.059678851243138896


### SVM

In [103]:
svm_tuning_parameters = {'estimator__C': [5, 50, 100, 1000],
                         'estimator__kernel': ['rbf']}

In [104]:
svm = OneVsRestClassifier(SVC())
svm = GridSearchCV(svm, param_grid = svm_tuning_parameters, cv=5, n_jobs=-1)
svm.fit(glove_X_train, y_train)
svm_train_pred = svm.predict(glove_X_train)
svm_test_pred = svm.predict(glove_X_test)

SVM Accuracy on Train : 0.431303669008587
SVM Accuracy on Test : 0.08580343213728549


In [105]:
svm.best_params_

{'estimator__C': 100, 'estimator__kernel': 'rbf'}

In [106]:
# evaluation
print('SVM Avg Accuracy on train : {}'.format(score(y_train, svm_train_pred, 'avg')))
print('SVM Exact Accuracy on train : {}'.format(score(y_train, svm_train_pred, 'exact')))
print('SVM Precision on train : {}'.format(score(y_train, svm_train_pred, 'precision')))
print('SVM Recall on train : {}'.format(score(y_train, svm_train_pred, 'recall')))

SVM Avg Accuracy on train : 0.932211163153786
SVM Exact Accuracy on train : 0.431303669008587
SVM Precision on train : 0.938492996003603
SVM Recall on train : 0.4921213354419586


In [107]:
# evaluation
print('SVM Avg Accuracy on test : {}'.format(score(y_test, svm_test_pred, 'avg')))
print('SVM Exact Accuracy on test : {}'.format(score(y_test, svm_test_pred, 'exact')))
print('SVM Precision on test : {}'.format(score(y_test, svm_test_pred, 'precision')))
print('SVM Recall on test : {}'.format(score(y_test, svm_test_pred, 'recall')))

SVM Avg Accuracy on test : 0.8849453978159126
SVM Exact Accuracy on test : 0.08580343213728549
SVM Precision on test : 0.7342857827952918
SVM Recall on test : 0.28096075952803456


In [None]:
# evaluation
print('SVM Avg Accuracy on test : {}'.format(score(y_test, svm_test_pred, 'avg')))
print('SVM Exact Accuracy on test : {}'.format(score(y_test, svm_test_pred, 'exact')))
print('SVM Precision on test : {}'.format(score(y_test, svm_test_pred, 'precision')))
print('SVM Recall on test : {}'.format(score(y_test, svm_test_pred, 'recall')))

### Embedding Layer

In [111]:
embeddings_index = {}
with open("glove.6B.300d.txt", "rb") as f:
    for line in f:
        values = line.decode("utf-8").split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [112]:
# MAX_WORDS = 40000
MAX_SEQUENCE_LENGTH = 300
EMBEDDING_DIM = 300

In [113]:
train_texts = X_train.apply(lambda x: 
                            re.sub('['+string.punctuation+']', 
                                   '', x.strip())).values

In [114]:
tokenizer = Tokenizer(num_words=None)
tokenizer.fit_on_texts(train_texts)
word_index = tokenizer.word_index

In [115]:
sequences = tokenizer.texts_to_sequences(train_texts)
X_glove_train = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [116]:
#
test_texts = X_test.apply(lambda x: 
                           re.sub('['+string.punctuation+']', 
                                  '', x.strip())).values
test_sequences = tokenizer.texts_to_sequences(test_texts)
X_glove_test = pad_sequences(test_sequences, 
                             maxlen=MAX_SEQUENCE_LENGTH)

In [117]:
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

for word, i in word_index.items():
#     if i >= MAX_WORDS:
#         continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [131]:
X_glove_train.shape

(3489, 300)

### GPU Data Preparation

In [132]:
np.save('embedding_matrix.npy', embedding_matrix)
np.save('X_glove_train.npy', X_glove_train)
np.save('y_train.npy', y_train)
np.save('X_glove_test.npy', X_glove_test)
np.save('y_test.npy', y_test)

### load data

In [133]:
embedding_matrix = np.load('embedding_matrix.npy')
X_glove_train = np.load('X_glove_train.npy')
y_train = np.load('y_train.npy')
X_glove_test = np.load('X_glove_test.npy')
y_test = np.load('y_test.npy')

In [134]:
# MAX_WORDS = 40000
MAX_SEQUENCE_LENGTH = 300
EMBEDDING_DIM = 300

In [136]:
num_words = 44984

## CNN

In [126]:
model = Sequential()
e = Embedding(num_words, EMBEDDING_DIM, 
              weights=[embedding_matrix], 
              input_length=MAX_SEQUENCE_LENGTH, 
              trainable=False)
model.add(e)
model.add(Dropout(0.5))
model.add(Conv1D(32, 3, activation='relu'))
model.add(MaxPooling1D(pool_size=3))
model.add(Flatten())
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(20, activation='sigmoid'))

In [127]:
# compile the model
model.compile(optimizer=Adam(lr=0.0005), loss='binary_crossentropy', metrics=['acc'])

In [128]:
model.fit(X_glove_train, y_train,
          batch_size=64,
          epochs=20, validation_data = (X_glove_test, y_test))

Train on 3489 samples, validate on 873 samples
Epoch 1/20

KeyboardInterrupt: 

In [123]:
pred_glove_test = model.predict(X_glove_test)
pred_glove_train = model.predict(X_glove_train)

In [124]:
# evaluation
print('GloVE Embedding Accuracy on train : {}'.format(score_thres(y_train, pred_glove_train, method='avg')))
print('GloVE Embedding Exact Accuracy on train : {}'.format(score_thres(y_train, pred_glove_train, method='exact')))
print('GloVE Embedding Precision on train : {}'.format(score_thres(y_train, pred_glove_train, method='precision')))
print('GloVE Embedding Recall on train : {}'.format(score_thres(y_train, pred_glove_train, method='recall')))

GloVE Embedding Accuracy on train : 1.0
GloVE Embedding Exact Accuracy on train : 1.0
GloVE Embedding Precision on train : 1.0
GloVE Embedding Recall on train : 1.0


In [125]:
# evaluation
print('GloVE Embedding Accuracy on test : {}'.format(score_thres(y_test, pred_glove_test, method='avg')))
print('GloVE Embedding Exact Accuracy on test : {}'.format(score_thres(y_test, pred_glove_test, method='exact')))
print('GloVE Embedding Precision on test : {}'.format(score_thres(y_test, pred_glove_test, method='precision')))
print('GloVE Embedding Recall on test : {}'.format(score_thres(y_test, pred_glove_test, method='recall')))

GloVE Embedding Accuracy on test : 0.8747422680412372
GloVE Embedding Exact Accuracy on test : 0.06872852233676977
GloVE Embedding Precision on test : 0.5830024639655649
GloVE Embedding Recall on test : 0.3606915678387714


## CNN + LSTM

In [31]:
lstm = Sequential()
e = Embedding(num_words, EMBEDDING_DIM, 
              weights=[embedding_matrix], 
              input_length=MAX_SEQUENCE_LENGTH, 
              trainable=False)
lstm.add(e)
lstm.add(Dropout(0.2))
lstm.add(Conv1D(32, 5, activation='relu'))
lstm.add(LSTM(100, recurrent_dropout=0.2))
lstm.add(Dense(20, activation='sigmoid'))
# compile the model

In [32]:
lstm.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 100, 300)          12000000  
_________________________________________________________________
dropout_5 (Dropout)          (None, 100, 300)          0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 96, 32)            48032     
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_5 (Dense)              (None, 20)                2020      
Total params: 12,103,252
Trainable params: 103,252
Non-trainable params: 12,000,000
_________________________________________________________________


In [33]:
lstm.compile(optimizer=Adam(lr=0.0005), loss='binary_crossentropy', metrics=['acc'])
lstm.fit(X_glove_train, y_train, validation_data= (X_glove_test, y_test),
          batch_size=128,
          epochs=20)

Train on 3489 samples, validate on 873 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1a3183fa58>

In [266]:
lstm_pred_glove_test = lstm.predict(X_glove_test)
# evaluation
print('LSTM Embedding Accuracy on test : {}'.format(score_thres(y_test, lstm_pred_glove_test, method='avg')))
print('LSTM Embedding Exact Accuracy on test : {}'.format(score_thres(y_test, lstm_pred_glove_test, method='exact')))
print('LSTM Embedding Precision on test : {}'.format(score_thres(y_test, lstm_pred_glove_test, method='precision')))
print('LSTM Embedding Recall on test : {}'.format(score_thres(y_test, lstm_pred_glove_test, method='recall')))

LSTM Embedding Accuracy on test : 0.8847877358490567
LSTM Embedding Exact Accuracy on test : 0.09355345911949685
LSTM Embedding Precision on test : 0.5810460229570971
LSTM Embedding Recall on test : 0.4279486141494283


## LSTM

In [270]:
lstm_glove = Sequential()
e = Embedding(num_words, EMBEDDING_DIM, 
              weights=[embedding_matrix], 
              input_length=MAX_SEQUENCE_LENGTH, 
              trainable=False)
lstm_glove.add(e)
lstm_glove.add(LSTM(100, dropout = 0.2, recurrent_dropout=0.2))
lstm_glove.add(Dense(20, activation='sigmoid'))

In [272]:
lstm_glove.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_23 (Embedding)     (None, 100, 300)          6000000   
_________________________________________________________________
lstm_6 (LSTM)                (None, 100)               160400    
_________________________________________________________________
dense_33 (Dense)             (None, 20)                2020      
Total params: 6,162,420
Trainable params: 162,420
Non-trainable params: 6,000,000
_________________________________________________________________


In [None]:
lstm_glove.compile(optimizer=Adam(lr=0.0005), loss='binary_crossentropy', metrics=['acc'])

In [277]:
lstm_glove.fit(X_glove_train, y_train, validation_data= (X_glove_test, y_test),
          batch_size=128,
          epochs=20)

Train on 5084 samples, validate on 1272 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1a9e723080>

In [34]:
lstm_pred_glove_test = lstm_glove.predict(X_glove_test)
# evaluation
print('LSTM Embedding Accuracy on test : {}'.format(score_thres(y_test, lstm_pred_glove_test, method='avg')))
print('LSTM Embedding Exact Accuracy on test : {}'.format(score_thres(y_test, lstm_pred_glove_test, method='exact')))
print('LSTM Embedding Precision on test : {}'.format(score_thres(y_test, lstm_pred_glove_test, method='precision')))
print('LSTM Embedding Recall on test : {}'.format(score_thres(y_test, lstm_pred_glove_test, method='recall')))

NameError: name 'lstm_glove' is not defined

## LSTM - 2 layers

In [278]:
lstm_glove_2layers = Sequential()
e = Embedding(num_words, EMBEDDING_DIM, 
              weights=[embedding_matrix], 
              input_length=MAX_SEQUENCE_LENGTH, 
              trainable=False)
lstm_glove_2layers.add(e)
#lstm_glove_2layers.add(LSTM(200, dropout = 0.2, recurrent_dropout=0.2, return_sequences=True))
lstm_glove_2layers.add(LSTM(200, dropout = 0.2, recurrent_dropout=0.2))
lstm_glove_2layers.add(Dense(20, activation='sigmoid'))

In [None]:
lstm_glove.compile(optimizer=Adam(lr=0.0005), loss='binary_crossentropy', metrics=['acc'])

### best model

In [280]:
### load test
from keras.models import load_model
load_model = load_model('lstm_glove_68_51') 

In [413]:
lstm_pred_glove_test = load_model.predict(X_glove_test)
# evaluation
print('LSTM Embedding Accuracy on test : {}'.format(score_thres(y_test, lstm_pred_glove_test, method='avg')))
print('LSTM Embedding Exact Accuracy on test : {}'.format(score_thres(y_test, lstm_pred_glove_test, method='exact')))
print('LSTM Embedding Precision on test : {}'.format(score_thres(y_test, lstm_pred_glove_test, method='precision')))
print('LSTM Embedding Recall on test : {}'.format(score_thres(y_test, lstm_pred_glove_test, method='recall')))

LSTM Embedding Accuracy on test : 0.8899198167239404
LSTM Embedding Exact Accuracy on test : 0.11798396334478808
LSTM Embedding Precision on test : 0.6867441944938573
LSTM Embedding Recall on test : 0.5190011893170453
