# Models

In [51]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.multiclass import OneVsRestClassifier
import string
import ast
import re

In [2]:
# read in data
imdb_movie = pd.read_csv('data/imdb_multilabel.csv')

In [3]:
imdb_movie.head()

Unnamed: 0.1,Unnamed: 0,title,imdb_id,topRank,bottomRank,metaScore,plot,rating,ratingCount,reviewCount,runningTimeInMinutes,userRatingCount,userScore,year,all_genre,genre,plot_list,genreCount,genre_code,all_genre_encode
0,0,"I, Tonya",tt5580036,930.0,17643.0,77.0,From the proverbial wrong side of the tracks i...,7.6,67667.0,46,120.0,235,7.8,2017.0,"['Biography', 'Comedy', 'Drama', 'Sport']",sport,"['From', 'the', 'proverbial', 'wrong', 'side',...",4,0,[1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...
1,1,Cars 3,tt3606752,2256.0,11547.0,59.0,Blindsided by a new generation of blazing-fast...,6.8,41896.0,41,102.0,232,6.9,2017.0,"['Animation', 'Adventure', 'Comedy', 'Family',...",sport,"['Blindsided', 'by', 'a', 'new', 'generation',...",5,0,[1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. ...
2,2,Creed,tt3076658,847.0,17840.0,82.0,Adonis Johnson is the son of the famous boxing...,7.6,193206.0,42,133.0,614,8.0,2015.0,"['Drama', 'Sport']",sport,"['Adonis', 'Johnson', 'is', 'the', 'son', 'of'...",2,0,[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...
3,3,Battle of the Sexes,tt4622512,2303.0,11228.0,73.0,In the wake of the sexual revolution and the r...,6.8,27960.0,46,121.0,102,6.3,2017.0,"['Biography', 'Comedy', 'Drama', 'Sport']",sport,"['In', 'the', 'wake', 'of', 'the', 'sexual', '...",4,0,[1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...
4,4,Borg McEnroe,tt5727282,,12891.0,57.0,The story of the 1980s tennis rivalry between ...,7.0,9800.0,13,107.0,0,,2017.0,"['Biography', 'Drama', 'Sport']",sport,"['The', 'story', 'of', 'the', '1980s', 'tennis...",3,0,[1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...


#### encode the genre as a single-label array and a multi-label array

#### single-label encode

In [4]:
genre_dict = dict(zip(imdb_movie.genre.unique(), range(20)))

imdb_movie['genre_code'] = imdb_movie.genre.replace(genre_dict).values

In [5]:
imdb_movie.head()

Unnamed: 0.1,Unnamed: 0,title,imdb_id,topRank,bottomRank,metaScore,plot,rating,ratingCount,reviewCount,runningTimeInMinutes,userRatingCount,userScore,year,all_genre,genre,plot_list,genreCount,genre_code,all_genre_encode
0,0,"I, Tonya",tt5580036,930.0,17643.0,77.0,From the proverbial wrong side of the tracks i...,7.6,67667.0,46,120.0,235,7.8,2017.0,"['Biography', 'Comedy', 'Drama', 'Sport']",sport,"['From', 'the', 'proverbial', 'wrong', 'side',...",4,0,[1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...
1,1,Cars 3,tt3606752,2256.0,11547.0,59.0,Blindsided by a new generation of blazing-fast...,6.8,41896.0,41,102.0,232,6.9,2017.0,"['Animation', 'Adventure', 'Comedy', 'Family',...",sport,"['Blindsided', 'by', 'a', 'new', 'generation',...",5,0,[1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. ...
2,2,Creed,tt3076658,847.0,17840.0,82.0,Adonis Johnson is the son of the famous boxing...,7.6,193206.0,42,133.0,614,8.0,2015.0,"['Drama', 'Sport']",sport,"['Adonis', 'Johnson', 'is', 'the', 'son', 'of'...",2,0,[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...
3,3,Battle of the Sexes,tt4622512,2303.0,11228.0,73.0,In the wake of the sexual revolution and the r...,6.8,27960.0,46,121.0,102,6.3,2017.0,"['Biography', 'Comedy', 'Drama', 'Sport']",sport,"['In', 'the', 'wake', 'of', 'the', 'sexual', '...",4,0,[1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...
4,4,Borg McEnroe,tt5727282,,12891.0,57.0,The story of the 1980s tennis rivalry between ...,7.0,9800.0,13,107.0,0,,2017.0,"['Biography', 'Drama', 'Sport']",sport,"['The', 'story', 'of', 'the', '1980s', 'tennis...",3,0,[1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...


In [6]:
genre_dict['sci-fi'] = 17

#### multi-label encode as an array

In [7]:
def multi_label_encoder(all_genre_list):
    """ This function takes a list of genre with a dictionary that keeps track of the index of the genre
    INPUTS
    ------
    all_genre_list: list of genres
    genre_dict: dictionary of indexs
    
    OUTPUTS
    -------
    np array in {0, 1}
    """
    encode = np.zeros(20)
    all_genre_list = ast.literal_eval(all_genre_list)
    for genre in all_genre_list:
        if genre.lower() in genre_dict:
            encode[genre_dict[genre.lower()]] = 1
    return list(encode)

In [8]:
imdb_movie['all_genre_encode'] = imdb_movie['all_genre'].apply(multi_label_encoder)

In [9]:
imdb_movie['all_genre_encode'].iloc[0]

[1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0]

In [10]:
imdb_movie['plot_list'] = imdb_movie['plot_list'].apply(ast.literal_eval)

In [11]:
imdb_movie['plot_list'].iloc[0]

['From',
 'the',
 'proverbial',
 'wrong',
 'side',
 'of',
 'the',
 'tracks',
 'in',
 'Portland,',
 'Oregon,',
 'former',
 'competitive',
 'figure',
 'skater',
 'Tonya',
 'Harding',
 'was',
 'never',
 'fully',
 'accepted',
 'in',
 'the',
 'figure',
 'skating',
 'community',
 'for',
 'not',
 'inherently',
 'being',
 'the',
 'image',
 'of',
 'grace,',
 'breeding',
 'and',
 'privilege',
 'that',
 'the',
 'community',
 'wanted',
 'to',
 'portray,',
 'despite',
 'she',
 'being',
 'naturally',
 'gifted',
 'in',
 'the',
 'sport',
 'athletically.',
 'Despite',
 'ultimately',
 'garnering',
 'some',
 'success',
 'in',
 'figure',
 'skating',
 'being',
 'national',
 'champion,',
 'a',
 'world',
 'championship',
 'medalist,',
 'an',
 'Olympian,',
 'and',
 'being',
 'the',
 'first',
 'American',
 'woman',
 'to',
 'complete',
 'a',
 'Triple',
 'Axel',
 'in',
 'competition,',
 'she',
 'is',
 'arguably',
 'best',
 'known',
 'for',
 'her',
 'association',
 'to',
 '"the',
 'incident":',
 'the',
 'leg',
 '

In [12]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(imdb_movie['plot'], 
                                                                            imdb_movie['all_genre_encode'],
                                                                            test_size = 0.2,
                                                                            random_state = 209,
                                                                            )

#### Evaluation

In [99]:
def score(y_true, y_score, method='avg'):
    if method == 'avg':
        return np.mean(np.mean(y_true == y_score, axis=1))
    elif method == 'exact':
        return metrics.average_precision_score(y_true, y_score)
    elif method == 'recall':
        return np.mean([metrics.recall_score(y_true[:,i], y_score[:,i]) for i in range(y_score.shape[1])])
    elif method == 'precision':
        return np.mean([metrics.precision_score(y_true[:,i], y_score[:,i]) for i in range(y_score.shape[1])])

## Baseline Model with bag-of-words 
### Naive Bayes 

In [13]:
# bag of words representation
vectorizer = CountVectorizer(stop_words='english')

In [14]:
naive_X_train = vectorizer.fit_transform(X_train)
naive_X_test = vectorizer.transform(X_test)

In [15]:
# reshape y_train and y_test
y_train = np.array(list(y_train))
y_test = np.array(list(y_test))

In [16]:
y_train.shape

(5124, 20)

In [17]:
y_train

array([[0., 0., 0., ..., 1., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 1., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [18]:
nb = OneVsRestClassifier(MultinomialNB())
nb.fit(naive_X_train, y_train)

  str(classes[c]))


OneVsRestClassifier(estimator=MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
          n_jobs=1)

In [19]:
naive_train_pred = nb.predict(naive_X_train)
naive_test_pred = nb.predict(naive_X_test)
print('Naive Bayes Accuracy on Train : {}'.format(metrics.accuracy_score(y_train, naive_train_pred)))
print('Naive Bayes Accuracy on Test : {}'.format(metrics.accuracy_score(y_test, naive_test_pred)))

Naive Bayes Accuracy on Train : 0.5915300546448088
Naive Bayes Accuracy on Test : 0.14508580343213728


In [20]:
nb.predict_proba(naive_X_train)[0]

array([5.37012637e-33, 4.31795630e-31, 3.59365748e-27, 2.63671879e-10,
       1.11016946e-28, 1.01828361e-21, 3.29233909e-25, 1.31202345e-21,
       0.00000000e+00, 1.39414154e-08, 1.33012783e-02, 4.52588102e-15,
       8.36423998e-01, 3.23838390e-14, 1.86421000e-18, 3.69704282e-05,
       8.66331663e-01, 9.99998836e-01, 9.93130417e-01, 8.06854819e-08])

### Random Forests

In [21]:
from sklearn.ensemble import RandomForestClassifier

In [22]:
rfc = OneVsRestClassifier(RandomForestClassifier())
rfc.fit(naive_X_train, y_train)
rfc_train_pred = rfc.predict(naive_X_train)
rfc_test_pred = rfc.predict(naive_X_test)
print('Random Forest Accuracy on Train : {}'.format(metrics.accuracy_score(y_train, rfc_train_pred)))
print('Random Forest Accuracy on Test : {}'.format(metrics.accuracy_score(y_test, rfc_test_pred)))

  str(classes[c]))


Random Forest Accuracy on Train : 0.7792740046838408
Random Forest Accuracy on Test : 0.046021840873634944


## GloVE

In [24]:
with open("data/glove.6B.300d.txt", "rb") as lines:
    w2v = {line.decode("utf-8").split()[0]: np.array(line.split()[1:]).astype(float)
           for line in lines}

In [53]:
# def mean_emb(sentence, n_embedding=300):
#     count = 0
#     emb = np.zeros(n_embedding)
#     for w in re.sub('['+string.punctuation+']', '', sentence.strip()).split():
#         if w in w2v:
#             emb += w2v[w]
#             count += 1
#     return emb / count

# def mean_transform(X, n_embedding=300):
#     transformed_x = np.zeros((len(X), n_embedding))
#     for i, sentence in enumerate(X):
#         transformed_x[i] = mean_emb(sentence, n_embedding)
#     return transformed_x

In [63]:
# glove_X_train = mean_transform(X_train)
# glove_X_test = mean_transform(X_test)

In [58]:
class MeanEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec

        self.dim = 300

    def fit(self, X, y):
        return self

    def transform(self, X):
        return np.array([
            np.mean([self.word2vec[w] for w in re.sub('['+string.punctuation+']', '', words.strip()) if w in self.word2vec] or [np.zeros(self.dim)], axis=0)
            for words in X
        ])

mean_embed = MeanEmbeddingVectorizer(w2v)
glove_X_train1 = mean_embed.transform(X_train)
glove_X_test1 = mean_embed.transform(X_test)

In [62]:
glove_X_train1.shape

(5124, 300)

In [64]:
rf = OneVsRestClassifier(RandomForestClassifier())
rf.fit(glove_X_train, y_train)

rf_train_pred = rf.predict(glove_X_train)
rf_test_pred = rf.predict(glove_X_test)
print('Glove Accuracy on Train : {}'.format(metrics.accuracy_score(y_train, rf_train_pred)))
print('Glove Accuracy on Test : {}'.format(metrics.accuracy_score(y_test, rf_test_pred)))

  str(classes[c]))


Glove Accuracy on Train : 0.7630757220921155
Glove Accuracy on Test : 0.0374414976599064


In [69]:
metrics.average_precision_score(y_test, rf_test_pred)

  recall = tps / tps[-1]


nan

In [75]:
np.mean(np.mean(y_test == rf_test_pred, axis=1))

(1282,)

In [77]:
dummy = np.zeros_like(y_test)
score(y_test, dummy)

0.8443447737909515

In [101]:
score(y_test, naive_test_pred, 'precision'), score(y_test, naive_test_pred, 'recall')

(0.6978311158754985, 0.4300237937094945)

In [102]:
score(y_test, rf_test_pred, 'precision'), score(y_test, rf_test_pred, 'recall')

  'precision', 'predicted', average, warn_for)


(0.5029178053427759, 0.12788670973685107)

In [103]:
score(y_test, dummy, 'precision'), score(y_test, dummy, 'recall')

  'precision', 'predicted', average, warn_for)


(0.0, 0.0)

In [98]:
score(y_train, naive_train_pred, 'recall')

0.8142066172114966