In [1]:
# -*- coding: utf-8 -*-

**TODO:**
* bag of means (word2vec)
* label distribution, c_hat vs c
* corpus on all movie descriptions vs only the test set (needed for on the fly classification)
* normalization
* test distinct pred_labels for specific movie in test data

In [2]:
import re

In [3]:
from functools import partial

In [4]:
import numpy as np
import pandas as pd

In [5]:
from sklearn import cross_validation
from sklearn.feature_extraction.text import CountVectorizer

In [6]:
from sklearn.multiclass import OneVsRestClassifier # binary relevance
# from sklearn.multiclass import LabelPowerSetClassifier # not existing anymore

from skmultilearn.meta.br import BinaryRelevance
from skmultilearn.meta.lp import LabelPowerset 

In [7]:
# these are the metrics we want to use for evaluation
from sklearn.metrics import hamming_loss
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score

In [8]:
# actual estimators
from sklearn.ensemble import RandomForestClassifier
# from sklearn.svm import LinearSVC
from sklearn import svm
from sklearn.naive_bayes import GaussianNB, BernoulliNB

In [9]:
# scoring metrics used for evaluation. namely precision, accuracy, hamming loss (recall)
# and f_1-score with several different averages
METRICS = ['precision_macro', 'recall_macro', 'f1_macro', 'precision_micro', 'recall_micro', 'f1_micro']

'\nMETRICS = {\n    "hamming_loss": hamming_loss,\n    "subset_accuracy": accuracy_score,\n    "precision": precision_score,\n    "macro-f1": partial(f1_score, average="macro"),\n    "samples-f1": partial(f1_score, average="samples"),\n    "weighted-f1": partial(f1_score, average="weighted"),\n    "micro-f1": partial(f1_score, average="micro"),\n}\n'

## Step 1: Prepare Training Data

In [10]:
data_csv = {
    'prepped_train_X_Doc2Vec_dbow_d100_n5_mc2_t2':'prepped_train_X_Doc2Vec_dbow_d100_n5_mc2_t2.csv',
    'prepped_train_X_Doc2Vec_dm_c_d100_n5_w5_mc2_t2': 'prepped_train_X_Doc2Vec_dm_c_d100_n5_w5_mc2_t2.csv',
    'prepped_train_X_Doc2Vec_dm_m_d100_n5_w10_mc2_t2': 'prepped_train_X_Doc2Vec_dm_m_d100_n5_w10_mc2_t2.csv',
    'prepped_train_X_Doc2Vec_dbow_dmc': 'prepped_train_X_dbow_dmc.csv',
    'prepped_train_X_Doc2Vec_dbow_dmm': 'prepped_train_X_dbow_dmm.csv',
    'prepped_train_X_bow_tfidf': 'prepped_train_X_bow_tfidf.csv',
    'prepped_train_X_bigrams': 'prepped_train_X_bigrams.csv',
    'prepped_train_X_trigrams': 'prepped_train_X_trigrams.csv',
    'prepped_train_X_bigrams_tfidf': 'prepped_train_X_bigrams_tfidf.csv',
    'prepped_train_X_trigrams_tfidf': 'prepped_train_X_trigrams_tfidf.csv',
    'prepped_train_X_bow': 'prepped_train_X_bow.csv'
}

In [11]:
# feature vectors per word model
train_data = {}

for f in data_csv.keys():
    d = pd.read_csv('../data/' + data_csv[f])

    train_data[f] = d

In [12]:
y = pd.read_csv('../data/prepped_train_y.csv').as_matrix()

# remove the first column containing index numbers
y = np.delete(y, 0, 1)

In [13]:
OVR_ESTIMATORS = {
    "OVR Random Forest": OneVsRestClassifier(RandomForestClassifier(n_estimators = 100)),
    "OVR LinearSVC": OneVsRestClassifier(LinearSVC(random_state=1)),
    "OVR Gaussian Naive Bayes": OneVsRestClassifier(GaussianNB()),
    "OVR Bernoulli Naive Bayes": OneVsRestClassifier(BernoulliNB())
}
BR_ESTIMATORS = {
    "Random Forest": BinaryRelevance(RandomForestClassifier(n_estimators = 100)),
    "LinearSVC": BinaryRelevance(LinearSVC(random_state=1)),
    "Gaussian Naive Bayes": BinaryRelevance(GaussianNB()),
    "Bernoulli Naive Bayes": BinaryRelevance(BernoulliNB())
}
LP_ESTIMATORS = {
    "LP Random Forest": LabelPowerset(RandomForestClassifier(n_estimators = 100)),
    "LP LinearSVC": LabelPowerset(LinearSVC(random_state=1)),
    "LP Gaussian Naive Bayes": LabelPowerset(GaussianNB()),
    "LP Bernoulli Naive Bayes": LabelPowerset(BernoulliNB())
}

# merge all dicts
ESTIMATORS = BR_ESTIMATORS.copy()
ESTIMATORS.update(LP_ESTIMATORS)
# ESTIMATORS.update(OVR_ESTIMATOR)
#ESTIMATORS = OVR_ESTIMATORS.copy()

In [None]:
from sklearn.utils.multiclass import check_classification_targets
from sklearn.base import clone as sk_clone
import time

def train(e, X, y):
    """
    Train all the estimators on the current dataset.
    The fit method should reset internals anyway.
    """
    e.fit(X, y)

def test(e, X, y):
    """ calculating metrics based on the training set """
    for metric in METRICS:
        cv = cross_validation.ShuffleSplit(len(y), random_state=0)
        scores = cross_validation.cross_val_score(e, X, y, cv=cv, scoring=metric)

        print "\t\tmean %s: %s" % (metric, scores.sum() / 10)

def run_est(X, y):
    """
    Train and test the estimators on the given dataset
    """
    tic = time.time()

    # all means of given METRICS
    means = []

    for e_name, e in ESTIMATORS.items():
        print "\t-> testing ", e_name

        ms = test(e, X, y)
        print "\t-> %ds elapsed for testing" % (time.time() - tic,)

        means.append(ms)

    return means

In [None]:
# binary relevance training
ESTIMATORS = BR_ESTIMATORS.copy()
for k in train_data.keys():
    # convert pandas dataframe to np.array
    X = train_data[k].as_matrix()
    # remove continuous index numbers
    X = np.delete(X, 0, 1)

    assert(X.shape[0] == len(y))

    print "[#] Dataset: " + k
    run_est(X, y)

In [None]:
# label powerset training
ESTIMATORS = LP_ESTIMATORS

for k in train_data.keys():
    # convert pandas dataframe to np.array
    X = train_data[k].as_matrix()
    # remove continuous index numbers
    X = np.delete(X, 0, 1)

    assert(X.shape[0] == len(y))

    print "[#] Dataset: " + k
    run_est(X, y)

In [None]:
import os
os.system('say "Done."')

In [45]:
import pickle

# let's save the models
for e_name, e in ESTIMATORS.items():
    with open('../data/estimator_' + e_name + '.pkl', 'wb') as f:
        pickle.dump(e, f)

## Real World Examples

In [None]:
def print_labels(labels):
    atmos = ["food_for_thought", "funny", "action", "emotional", "romantic", "dark", "brutal", "thrilling"]
    
    for a, l in zip(atmos, labels):
        if l:
            print a

In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer
# n-grams (for bow is n = 1)
from sklearn.feature_extraction.text import CountVectorizer

# the normal n-gram models are typically high dimensional sparse vector spaces
# with *lots* of zeroes.
# i.e. "bow_tfidf" will be of shape (2146, 7000),
# while "bow" will be (2146, 40008). 2146 is the 
# number of examples, the second number is the size
# of the vector representing each training document.
named_models = {
    'bow': CountVectorizer(min_df=1),
    'bigrams': CountVectorizer(ngram_range=(1, 2),stop_words='english'),
    'trigrams': CountVectorizer(ngram_range=(1, 3), stop_words='english'),
    'bow_tfidf': TfidfVectorizer(stop_words='english'),
    'bigrams_tfidf': TfidfVectorizer(ngram_range=(1, 2), stop_words='english'),
    'trigrams_tfidf': TfidfVectorizer(ngram_range=(1, 3), stop_words='english'),
}

In [47]:
atmosphere_train_data = pd.read_csv('../data/atmosphere_train.csv', delimiter=",")

In [48]:
corpus = []

for idx, row in atmosphere_train_data.iterrows():
    if idx > 0:# and idx < 50:
        corpus.append(row["descr"])

# train the models on the corpus
X = []

for model in named_models.keys():
    # train on the corpus
    named_models[model].fit_transform(corpus)
    
    # extract test data
    x = named_models[model].transform(corpus)
    X += [x]

In [49]:
def make_wordlist(plot):
    """
    Function that cleans the movie description text. Removes
    non-alphabetical letters.
    """
    # first step is to remove non-alphabetical characters
    text = re.sub("[^a-zA-Z]", " ", plot)
    
    return text.lower().split()

def get_feature_vec(plot, model):
    return model.transform(plot)

In [69]:
# let's test an actual movie! movie: "her"
plot = """Theodore is a lonely man in the final stages of his divorce. When he's not working as a letter writer, his down time is spent playing video games and occasionally hanging out with friends. He decides to purchase the new OS1, which is advertised as the world's first artificially intelligent operating system, "It's not just an operating system, it's a consciousness," the ad states. Theodore quickly finds himself drawn in with Samantha, the voice behind his OS1. As they start spending time together they grow closer and closer and eventually find themselves in love. Having fallen in love with his OS, Theodore finds himself dealing with feelings of both great joy and doubt. As an OS, Samantha has powerful intelligence that she uses to help Theodore in ways others hadn't, but how does she help him deal with his inner conflict of being in love with an OS?"""
X_i = get_feature_vec(make_wordlist(plot), named_models['bow_tfidf']).toarray()


print "food_for_thought, funny, action, emotional, romantic, dark, brutal, thrilling"
for d in data[10][5]:
    y_pred = d.predict(X_i)

    print "# ", y_pred[0]

food_for_thought, funny, action, emotional, romantic, dark, brutal, thrilling
#  [0 0 0 0 1 1 1 0]
#  [1 1 1 1 0 0 1 1]
#  [1 1 1 1 0 0 1 1]
#  [1 1 1 1 0 0 1 1]


In [98]:
# movie: forrest gump
plot = """Forrest Gump is a simple man with a low I.Q. but good intentions. He is running through childhood with his best and only friend Jenny. His 'mama' teaches him the ways of life and leaves him to choose his destiny. Forrest joins the army for service in Vietnam, finding new friends called Dan and Bubba, he wins medals, creates a famous shrimp fishing fleet, inspires people to jog, starts a ping-pong craze, create the smiley, write bumper stickers and songs, donating to people and meeting the president several times. However, this is all irrelevant to Forrest who can only think of his childhood sweetheart Jenny Curran. Who has messed up her life. Although in the end all he wants to prove is that anyone can love anyone."""
X_i = get_feature_vec(make_wordlist(plot), named_models['bow_tfidf']).toarray()

print "food_for_thought, funny, action, emotional, romantic, dark, brutal, thrilling"
for e_name, e in ESTIMATORS.items():
    y_pred = e.predict(X_i)

    print "# ", e_name, y_pred[0]

food_for_thought, funny, action, emotional, romantic, dark, brutal, thrilling
#  BR Gaussian Naive Bayes [1 1 0 1 1 1 0 0]
#  BR LinearSVC [1 1 1 1 0 0 1 1]
#  BR Random Forest [1 1 1 1 0 0 1 1]
#  BR Bernoulli Naive Bayes [1 1 1 1 0 0 0 1]
