In [1]:
# -*- coding: utf-8 -*-

In [3]:
import re

In [4]:
from functools import partial

In [5]:
from gensim.models import Word2Vec

In [6]:
import numpy as np
import pandas as pd

In [7]:
from sklearn import cross_validation
from sklearn.feature_extraction.text import CountVectorizer

In [8]:
from sklearn.multiclass import OneVsRestClassifier
# from sklearn.multiclass import LabelPowerSetClassifier

In [9]:
# these are the metrics we want to use for evaluation
from sklearn.metrics import hamming_loss
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score

In [10]:
# actual estimators
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB, BernoulliNB

In [28]:
# scoring metrics used for evaluation. namely precision, accuracy, hamming loss (recall)
# and f_1-score with several different averages
METRICS = ['precision_macro', 'recall_macro', 'f1_macro', 'precision_micro', 'recall_micro', 'f1_micro']

"""
METRICS = {
    "hamming_loss": hamming_loss,
    "subset_accuracy": accuracy_score,
    "precision": precision_score,
    "macro-f1": partial(f1_score, average="macro"),
    "micro-f1": partial(f1_score, average="micro"),
}
"""

'\nMETRICS = {\n    "hamming_loss": hamming_loss,\n    "subset_accuracy": accuracy_score,\n    "precision": precision_score,\n    "macro-f1": partial(f1_score, average="macro"),\n    "micro-f1": partial(f1_score, average="micro"),\n}\n'

## Step 1: Prepare Training Data

In [12]:
test_data = pd.read_csv('../data/atmosphere_train.csv', delimiter=",")

In [13]:
# loading the already trained word embedding model
model = Word2Vec.load("../prep/200features_40minwords_10context")

The $j$-th component of the label vector is $1$ if the $j$-th label value is greater or equal to three. $0$ is used otherwise.

$v_{ij} = I[l_{ij} \geq 3]$

In [14]:
def make_wordlist(text, remove_stops=False):
    """
    Function that cleans the movie description text. Removes
    non-alphabetical letters and optionally english stop words.
    """
    # first step is to remove non-alphabetical characters
    text = re.sub("[^a-zA-Z]", " ", text)
    
    words = text.lower().split()
    
    if remove_stops:
        stops = set(stopwords.words("english"))
        return [w for w in words if not w in stops]
    
    return words

In [15]:
def get_feature_vec(words, f_size, model):
    n = .0
    x_i = np.zeros(f_size, dtype="float32")
    
    # internal word list of word2vec
    idx2words = set(model.index2word)
    
    s = filter(lambda e: e in idx2words, words)

    for w in s:
        n += 1.
        x_i = np.add(x_i, model[w])
    
    return np.divide(x_i, n)

In [16]:
def raw_row_to_X_y(row, f_size, model):
    """
    Outputs tuple for an instance containing of the (Xi, yi) feature vector/label vector pair.
    The label vector is given by the equation above.
    """
    
    # prepare the labels
    labels = row["labels"].split(",")

    y = [
        int("atmosphere_food_for_thought" in labels),
        int("atmosphere_funny" in labels),
        int("atmosphere_action" in labels),
        int("atmosphere_emotional" in labels),
        int("atmosphere_romantic" in labels),
        int("atmosphere_dark" in labels),
        int("atmosphere_brutal" in labels),
        int("atmosphere_thrilling" in labels)
    ]
    
    # create feature vector with word2vec model
    X = get_feature_vec(make_wordlist(row["descr"]), f_size, model)
    
    return (X, y)

In [17]:
X = []
y = []

for idx, movie in test_data.iterrows():
    if idx > 1:# and idx < 50:
        t = raw_row_to_X_y(movie, 200, model)
        X.append(t[0])
        y.append(t[1])

y = np.array(y)

In [38]:
import time

def train(e, X, y):
    """
    Train all the estimators on the current dataset.
    The fit method should reset internals anyway.
    """
    e.fit(X, y)
        

def test(e, X, y):
    # calculating metrics based on the training set
    for metric in METRICS:
        scores = cross_validation.cross_val_score(e, X, y, cv=10, scoring=metric)

        print "\t\tmean %s: %s" % (metric, scores.sum() / len(scores))


def run_est(X, y):
    """
    Prepare, train and test the estimators on the given dataset.
    """
    tic = time.time()

    X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y,
                                            test_size=0.1, random_state=0)

    # all means of given METRICS
    means = []

    for e_name, e in ESTIMATORS.items():
        # create new estimator with equivalent parameters as the current one
        e_ = e # .clone() FIXME: clone the classifier?

        print "\t-> training + testing ", e_name

        train(e_, X_train, y_train)
        print "\t-> %ds elapsed for training" % (time.time() - tic,)

        ms = test(e_, X_test, y_test)
        print "\t-> %ds elapsed for testing" % (time.time() - tic,)

        means.append(ms)

    return (X_train, y_train, X_test, y_test, means)

In [39]:
ESTIMATORS = {
    "OVR Random Forest": OneVsRestClassifier(RandomForestClassifier(n_estimators = 100)),
    "OVR LinearSVC": OneVsRestClassifier(LinearSVC(random_state=1)),
    "OVR Gaussian Naive Bayes": OneVsRestClassifier(GaussianNB()),
    "OVR Bernoulli Naive Bayes": OneVsRestClassifier(BernoulliNB()),
}

In [None]:
data = run_est(X, y)

	-> training + testing  OVR Gaussian Naive Bayes
	-> 0s elapsed for training
		mean precision_macro: 0.710877334471
		mean recall_macro: 0.636445378667
		mean f1_macro: 0.656347760863
		mean precision_micro: 0.723994611371
		mean recall_micro: 0.647858788645
		mean f1_micro: 0.680624210158
	-> 1s elapsed for testing
	-> training + testing  OVR LinearSVC
	-> 1s elapsed for training
		mean precision_macro: 0.70814716657
		mean recall_macro: 0.704340730541
		mean f1_macro: 0.679625088525
		mean precision_micro: 0.71852231449
		mean recall_micro: 0.752661325719
		mean f1_micro: 0.733229381496
	-> 5s elapsed for testing
	-> training + testing  OVR Random Forest
	-> 18s elapsed for training
		mean precision_macro: 0.681784115223
		mean recall_macro: 0.717555665654
		mean f1_macro: 0.693270063145

In [36]:
# let's test an actual movie!
plot = """Theodore is a lonely man in the final stages of his divorce. When he's not working as a letter writer, his down time is spent playing video games and occasionally hanging out with friends. He decides to purchase the new OS1, which is advertised as the world's first artificially intelligent operating system, "It's not just an operating system, it's a consciousness," the ad states. Theodore quickly finds himself drawn in with Samantha, the voice behind his OS1. As they start spending time together they grow closer and closer and eventually find themselves in love. Having fallen in love with his OS, Theodore finds himself dealing with feelings of both great joy and doubt. As an OS, Samantha has powerful intelligence that she uses to help Theodore in ways others hadn't, but how does she help him deal with his inner conflict of being in love with an OS?"""
X_i = get_feature_vec(make_wordlist(plot), 200, model)

print "food_for_thought, funny, action, emotional, romantic, dark, brutal, thrilling"
for e_name, e in ESTIMATORS.items():
    y_pred = e.predict(X_i.reshape(1, -1))
    print e_name, ": ", y_pred[0]

food_for_thought, funny, action, emotional, romantic, dark, brutal, thrilling
OVR Gaussian Naive Bayes :  [0 1 0 1 1 0 0 0]
OVR LinearSVC :  [1 1 0 1 1 0 0 1]
OVR Random Forest :  [1 1 0 1 1 0 0 1]
OVR Bernoulli Naive Bayes :  [0 1 0 1 1 0 0 0]


In [37]:
plot = """In Paris, the aristocratic and intellectual Philippe is a quadriplegic millionaire who is interviewing candidates for the position of his carer, with his red-haired secretary Magalie. Out of the blue, the rude African Driss cuts the line of candidates and brings a document from the Social Security and asks Phillipe to sign it to prove that he is seeking a job position so he can receive his unemployment benefit. Philippe challenges Driss, offering him a trial period of one month to gain experience helping him. Then Driss can decide whether he would like to stay with him or not. Driss accepts the challenge and moves to the mansion, changing the boring life of Phillipe and his employees."""
X_i = get_feature_vec(make_wordlist(plot), 200, model)

print "food_for_thought, funny, action, emotional, romantic, dark, brutal, thrilling"
for e_name, e in ESTIMATORS.items():
    y_pred = e.predict(X_i.reshape(1, -1))
    print e_name, ": ", y_pred[0]

food_for_thought, funny, action, emotional, romantic, dark, brutal, thrilling
OVR Gaussian Naive Bayes :  [0 1 0 0 0 0 0 0]
OVR LinearSVC :  [1 1 1 1 0 0 0 1]
OVR Random Forest :  [1 1 1 1 0 0 0 1]
OVR Bernoulli Naive Bayes :  [0 1 0 1 1 0 0 0]
