# Models

In [3]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.multiclass import OneVsRestClassifier
import string
import ast
import re
import warnings
warnings.filterwarnings('ignore')

In [4]:
# read in data
imdb_movie = pd.read_csv('data/imdb_multilabel.csv')

In [5]:
imdb_movie.head()

Unnamed: 0.1,Unnamed: 0,title,imdb_id,topRank,bottomRank,metaScore,plot,rating,ratingCount,reviewCount,runningTimeInMinutes,userRatingCount,userScore,year,all_genre,genre,plot_list,genreCount,genre_code,all_genre_encode
0,0,"I, Tonya",tt5580036,930.0,17643.0,77.0,From the proverbial wrong side of the tracks i...,7.6,67667.0,46,120.0,235,7.8,2017.0,"['Biography', 'Comedy', 'Drama', 'Sport']",sport,"['From', 'the', 'proverbial', 'wrong', 'side',...",4,0,[1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...
1,1,Cars 3,tt3606752,2256.0,11547.0,59.0,Blindsided by a new generation of blazing-fast...,6.8,41896.0,41,102.0,232,6.9,2017.0,"['Animation', 'Adventure', 'Comedy', 'Family',...",sport,"['Blindsided', 'by', 'a', 'new', 'generation',...",5,0,[1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. ...
2,2,Creed,tt3076658,847.0,17840.0,82.0,Adonis Johnson is the son of the famous boxing...,7.6,193206.0,42,133.0,614,8.0,2015.0,"['Drama', 'Sport']",sport,"['Adonis', 'Johnson', 'is', 'the', 'son', 'of'...",2,0,[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...
3,3,Battle of the Sexes,tt4622512,2303.0,11228.0,73.0,In the wake of the sexual revolution and the r...,6.8,27960.0,46,121.0,102,6.3,2017.0,"['Biography', 'Comedy', 'Drama', 'Sport']",sport,"['In', 'the', 'wake', 'of', 'the', 'sexual', '...",4,0,[1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...
4,4,Borg McEnroe,tt5727282,,12891.0,57.0,The story of the 1980s tennis rivalry between ...,7.0,9800.0,13,107.0,0,,2017.0,"['Biography', 'Drama', 'Sport']",sport,"['The', 'story', 'of', 'the', '1980s', 'tennis...",3,0,[1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...


### Encoding

#### single-label encode

In [6]:
genre_dict = dict(zip(imdb_movie.genre.unique(), range(20)))

imdb_movie['genre_code'] = imdb_movie.genre.replace(genre_dict).values

In [7]:
imdb_movie.head()

Unnamed: 0.1,Unnamed: 0,title,imdb_id,topRank,bottomRank,metaScore,plot,rating,ratingCount,reviewCount,runningTimeInMinutes,userRatingCount,userScore,year,all_genre,genre,plot_list,genreCount,genre_code,all_genre_encode
0,0,"I, Tonya",tt5580036,930.0,17643.0,77.0,From the proverbial wrong side of the tracks i...,7.6,67667.0,46,120.0,235,7.8,2017.0,"['Biography', 'Comedy', 'Drama', 'Sport']",sport,"['From', 'the', 'proverbial', 'wrong', 'side',...",4,0,[1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...
1,1,Cars 3,tt3606752,2256.0,11547.0,59.0,Blindsided by a new generation of blazing-fast...,6.8,41896.0,41,102.0,232,6.9,2017.0,"['Animation', 'Adventure', 'Comedy', 'Family',...",sport,"['Blindsided', 'by', 'a', 'new', 'generation',...",5,0,[1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. ...
2,2,Creed,tt3076658,847.0,17840.0,82.0,Adonis Johnson is the son of the famous boxing...,7.6,193206.0,42,133.0,614,8.0,2015.0,"['Drama', 'Sport']",sport,"['Adonis', 'Johnson', 'is', 'the', 'son', 'of'...",2,0,[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...
3,3,Battle of the Sexes,tt4622512,2303.0,11228.0,73.0,In the wake of the sexual revolution and the r...,6.8,27960.0,46,121.0,102,6.3,2017.0,"['Biography', 'Comedy', 'Drama', 'Sport']",sport,"['In', 'the', 'wake', 'of', 'the', 'sexual', '...",4,0,[1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...
4,4,Borg McEnroe,tt5727282,,12891.0,57.0,The story of the 1980s tennis rivalry between ...,7.0,9800.0,13,107.0,0,,2017.0,"['Biography', 'Drama', 'Sport']",sport,"['The', 'story', 'of', 'the', '1980s', 'tennis...",3,0,[1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...


In [8]:
genre_dict['sci-fi'] = 17

#### multi-label encode as an array

In [9]:
def multi_label_encoder(all_genre_list):
    """ This function takes a list of genre with a dictionary that keeps track of the index of the genre
    INPUTS
    ------
    all_genre_list: list of genres
    genre_dict: dictionary of indexs
    
    OUTPUTS
    -------
    np array in {0, 1}
    """
    encode = np.zeros(20)
    all_genre_list = ast.literal_eval(all_genre_list)
    for genre in all_genre_list:
        if genre.lower() in genre_dict:
            encode[genre_dict[genre.lower()]] = 1
    return list(encode)

In [10]:
imdb_movie['all_genre_encode'] = imdb_movie['all_genre'].apply(multi_label_encoder)

In [11]:
imdb_movie['all_genre_encode'].iloc[0]

[1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0]

In [12]:
imdb_movie['plot_list'] = imdb_movie['plot_list'].apply(ast.literal_eval)

In [13]:
imdb_movie['plot_list'].iloc[0]

['From',
 'the',
 'proverbial',
 'wrong',
 'side',
 'of',
 'the',
 'tracks',
 'in',
 'Portland,',
 'Oregon,',
 'former',
 'competitive',
 'figure',
 'skater',
 'Tonya',
 'Harding',
 'was',
 'never',
 'fully',
 'accepted',
 'in',
 'the',
 'figure',
 'skating',
 'community',
 'for',
 'not',
 'inherently',
 'being',
 'the',
 'image',
 'of',
 'grace,',
 'breeding',
 'and',
 'privilege',
 'that',
 'the',
 'community',
 'wanted',
 'to',
 'portray,',
 'despite',
 'she',
 'being',
 'naturally',
 'gifted',
 'in',
 'the',
 'sport',
 'athletically.',
 'Despite',
 'ultimately',
 'garnering',
 'some',
 'success',
 'in',
 'figure',
 'skating',
 'being',
 'national',
 'champion,',
 'a',
 'world',
 'championship',
 'medalist,',
 'an',
 'Olympian,',
 'and',
 'being',
 'the',
 'first',
 'American',
 'woman',
 'to',
 'complete',
 'a',
 'Triple',
 'Axel',
 'in',
 'competition,',
 'she',
 'is',
 'arguably',
 'best',
 'known',
 'for',
 'her',
 'association',
 'to',
 '"the',
 'incident":',
 'the',
 'leg',
 '

In [14]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(imdb_movie['plot'], 
                                                                            imdb_movie['all_genre_encode'],
                                                                            test_size = 0.2,
                                                                            random_state = 209,
                                                                            )

### Evaluation Function

In [15]:
def score(y_true, y_score, method='avg'):
    if method == 'avg':
        return np.mean(np.mean(y_true == y_score, axis=1))
    elif method == 'exact':
        return metrics.accuracy_score(y_true, y_score)
    elif method == 'recall':
        return np.mean([metrics.recall_score(y_true[:,i], y_score[:,i]) for i in range(y_score.shape[1])])
    elif method == 'precision':
        return np.mean([metrics.precision_score(y_true[:,i], y_score[:,i]) for i in range(y_score.shape[1])])

## Baseline Model with Bag-of-Words Representation
### Naive Bayes 

In [16]:
# bag of words representation
vectorizer = CountVectorizer(stop_words='english')

In [17]:
naive_X_train = vectorizer.fit_transform(X_train)
naive_X_test = vectorizer.transform(X_test)

In [18]:
# reshape y_train and y_test
y_train = np.array(list(y_train))
y_test = np.array(list(y_test))

In [19]:
y_train.shape

(5124, 20)

In [20]:
y_train

array([[ 0.,  0.,  0., ...,  1.,  1.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  1.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 1.,  0.,  1., ...,  0.,  0.,  1.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [21]:
nb = OneVsRestClassifier(MultinomialNB())
nb.fit(naive_X_train, y_train)

OneVsRestClassifier(estimator=MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
          n_jobs=1)

In [22]:
naive_train_pred = nb.predict(naive_X_train)
naive_test_pred = nb.predict(naive_X_test)
print('Naive Bayes Accuracy on Train : {}'.format(metrics.accuracy_score(y_train, naive_train_pred)))
print('Naive Bayes Accuracy on Test : {}'.format(metrics.accuracy_score(y_test, naive_test_pred)))

Naive Bayes Accuracy on Train : 0.5915300546448088
Naive Bayes Accuracy on Test : 0.14508580343213728


In [23]:
nb.predict_proba(naive_X_train)[0]

array([  5.37012637e-33,   4.31795630e-31,   3.59365748e-27,
         2.63671879e-10,   1.11016946e-28,   1.01828361e-21,
         3.29233909e-25,   1.31202345e-21,   0.00000000e+00,
         1.39414154e-08,   1.33012783e-02,   4.52588102e-15,
         8.36423998e-01,   3.23838390e-14,   1.86421000e-18,
         3.69704282e-05,   8.66331663e-01,   9.99998836e-01,
         9.93130417e-01,   8.06854819e-08])

In [24]:
# evaluation
print('Naive Bayes Avg Accuracy on test : {}'.format(score(y_test, naive_test_pred, 'avg')))
print('Naive Bayes Exact Accuracy on test : {}'.format(score(y_test, naive_test_pred, 'exact')))
print('Naive Bayes Precision on test : {}'.format(score(y_test, naive_test_pred, 'precision')))
print('Naive Bayes Recall on test : {}'.format(score(y_test, naive_test_pred, 'recall')))

Naive Bayes Avg Accuracy on test : 0.8982059282371295
Naive Bayes Exact Accuracy on test : 0.14508580343213728
Naive Bayes Precision on test : 0.6978311158754985
Naive Bayes Recall on test : 0.4300237937094945


### Random Forests

In [25]:
from sklearn.ensemble import RandomForestClassifier

In [26]:
rfc = OneVsRestClassifier(RandomForestClassifier())
rfc.fit(naive_X_train, y_train)
rfc_train_pred = rfc.predict(naive_X_train)
rfc_test_pred = rfc.predict(naive_X_test)
print('Random Forest Accuracy on Train : {}'.format(metrics.accuracy_score(y_train, rfc_train_pred)))
print('Random Forest Accuracy on Test : {}'.format(metrics.accuracy_score(y_test, rfc_test_pred)))

Random Forest Accuracy on Train : 0.7767369242779079
Random Forest Accuracy on Test : 0.0483619344773791


In [27]:
# evaluation
print('Random Forest Avg Accuracy on test : {}'.format(score(y_test, rfc_test_pred, 'avg')))
print('Random Forest Exact Accuracy on test : {}'.format(score(y_test, rfc_test_pred, 'exact')))
print('Random Forest Precision on test : {}'.format(score(y_test, rfc_test_pred, 'precision')))
print('Random Forest Recall on test : {}'.format(score(y_test, rfc_test_pred, 'recall')))

Random Forest Avg Accuracy on test : 0.8673556942277689
Random Forest Exact Accuracy on test : 0.0483619344773791
Random Forest Precision on test : 0.6802939829569972
Random Forest Recall on test : 0.16846224552879877


## GloVE

In [28]:
with open("glove.6B.300d.txt", "rb") as lines:
    w2v = {line.decode("utf-8").split()[0]: np.array(line.split()[1:]).astype(float)
           for line in lines}

In [29]:
# def mean_emb(sentence, n_embedding=300):
#     count = 0
#     emb = np.zeros(n_embedding)
#     for w in re.sub('['+string.punctuation+']', '', sentence.strip()).split():
#         if w in w2v:
#             emb += w2v[w]
#             count += 1
#     return emb / count

# def mean_transform(X, n_embedding=300):
#     transformed_x = np.zeros((len(X), n_embedding))
#     for i, sentence in enumerate(X):
#         transformed_x[i] = mean_emb(sentence, n_embedding)
#     return transformed_x

In [30]:
# glove_X_train = mean_transform(X_train)
# glove_X_test = mean_transform(X_test)

In [31]:
class MeanEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.dim = 300


    def transform(self, X):
        return np.array([
            np.mean([self.word2vec[w] for w in re.sub('['+string.punctuation+']', '', words.strip()) if w in self.word2vec] or [np.zeros(self.dim)], axis=0)
            for words in X
        ])

mean_embed = MeanEmbeddingVectorizer(w2v)
glove_X_train = mean_embed.transform(X_train)
glove_X_test = mean_embed.transform(X_test)

In [32]:
glove_X_train.shape

(5124, 300)

### Random Forests

In [33]:
rf = OneVsRestClassifier(RandomForestClassifier())
rf.fit(glove_X_train, y_train)

rf_train_pred = rf.predict(glove_X_train)
rf_test_pred = rf.predict(glove_X_test)
print('Random Forest Accuracy on Train : {}'.format(metrics.accuracy_score(y_train, rf_train_pred)))
print('Random Forest Accuracy on Test : {}'.format(metrics.accuracy_score(y_test, rf_test_pred)))

Random Forest Accuracy on Train : 0.703551912568306
Random Forest Accuracy on Test : 0.02106084243369735


In [34]:
# evaluation
print('Random Forest Avg Accuracy on test : {}'.format(score(y_test, rf_test_pred, 'avg')))
print('Random Forest Exact Accuracy on test : {}'.format(score(y_test, rf_test_pred, 'exact')))
print('Random Forest Precision on test : {}'.format(score(y_test, rf_test_pred, 'precision')))
print('Random Forest Recall on test : {}'.format(score(y_test, rf_test_pred, 'recall')))

Random Forest Avg Accuracy on test : 0.8397035881435257
Random Forest Exact Accuracy on test : 0.02106084243369735
Random Forest Precision on test : 0.21232926014672718
Random Forest Recall on test : 0.0602242071896536


### Logistic Regression

In [36]:
lr = OneVsRestClassifier(LogisticRegression())
lr.fit(glove_X_train, y_train)

lr_train_pred = lr.predict(glove_X_train)
lr_test_pred = lr.predict(glove_X_test)
print('Random Forest Accuracy on Train : {}'.format(metrics.accuracy_score(y_train, lr_train_pred)))
print('Random Forest Accuracy on Test : {}'.format(metrics.accuracy_score(y_test, lr_test_pred)))

Random Forest Accuracy on Train : 0.024785323965651834
Random Forest Accuracy on Test : 0.033541341653666144


In [37]:
# evaluation
print('Logistic Regression Avg Accuracy on test : {}'.format(score(y_test, lr_test_pred, 'avg')))
print('Logistic Regression Exact Accuracy on test : {}'.format(score(y_test, lr_test_pred, 'exact')))
print('Logistic Regression Precision on test : {}'.format(score(y_test, lr_test_pred, 'precision')))
print('Logistic Regression Recall on test : {}'.format(score(y_test, lr_test_pred, 'recall')))

Logistic Regression Avg Accuracy on test : 0.8517550702028079
Logistic Regression Exact Accuracy on test : 0.033541341653666144
Logistic Regression Precision on test : 0.13437848383500556
Logistic Regression Recall on test : 0.04199852722650322
