In [1]:
# -*- coding: utf-8 -*-

In [2]:
import re

In [3]:
from functools import partial

In [4]:
from gensim.models import Word2Vec

In [5]:
import numpy as np
import pandas as pd

In [6]:
from sklearn import cross_validation
from sklearn.feature_extraction.text import CountVectorizer

In [7]:
from sklearn.multiclass import OneVsRestClassifier
# from sklearn.multiclass import LabelPowerSetClassifier

In [8]:
# these are the metrics we want to use for evaluation
from sklearn.metrics import hamming_loss
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score

In [9]:
# actual estimators
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB, BernoulliNB

In [10]:
# scoring metrics used for evaluation. namely precision, accuracy, hamming loss (recall)
# and f_1-score with several different averages
METRICS = {
    "hamming_loss": hamming_loss,
    "subset_accuracy": accuracy_score,
    "precision": precision_score,
    "macro-f1": partial(f1_score, average="macro"),
    "samples-f1": partial(f1_score, average="samples"),
    "weighted-f1": partial(f1_score, average="weighted"),
    "micro-f1": partial(f1_score, average="micro"),
}

## Step 1: Prepare Training Data

In [11]:
test_data = pd.read_csv('../data/atmosphere_train.csv', delimiter=",")

In [12]:
# loading the already trained word embedding model
model = Word2Vec.load("../prep/200features_40minwords_10context")

The $j$-th component of the label vector is $1$ if the $j$-th label value is greater or equal to three. $0$ is used otherwise.

$v_{ij} = I[l_{ij} \geq 3]$

In [13]:
def make_wordlist(text, remove_stops=False):
    """
    Function that cleans the movie description text. Removes
    non-alphabetical letters and optionally english stop words.
    """
    # first step is to remove non-alphabetical characters
    text = re.sub("[^a-zA-Z]", " ", text)
    
    words = text.lower().split()
    
    if remove_stops:
        stops = set(stopwords.words("english"))
        return [w for w in words if not w in stops]
    
    return words

In [14]:
def get_feature_vec(words, f_size, model):
    n = .0
    x_i = np.zeros(f_size, dtype="float32")
    
    # internal word list of word2vec
    idx2words = set(model.index2word)
    
    s = filter(lambda e: e in idx2words, words)

    for w in s:
        n += 1.
        x_i = np.add(x_i, model[w])
    
    return np.divide(x_i, n)

In [15]:
def raw_row_to_X_y(row, f_size, model):
    """
    Outputs tuple for an instance containing of the (Xi, yi) feature vector/label vector pair.
    The label vector is given by the equation above.
    """
    
    # prepare the labels
    labels = row["labels"].split(",")

    y = [
        int("atmosphere_food_for_thought" in labels),
        int("atmosphere_funny" in labels),
        int("atmosphere_action" in labels),
        int("atmosphere_emotional" in labels),
        int("atmosphere_romantic" in labels),
        int("atmosphere_dark" in labels),
        int("atmosphere_brutal" in labels),
        int("atmosphere_thrilling" in labels)
    ]
    
    # create feature vector with word2vec model
    X = get_feature_vec(make_wordlist(row["descr"]), f_size, model)
    
    return (X, y)

In [16]:
X = []
y = []

for idx, movie in test_data.iterrows():
    if idx > 1:# and idx < 50:
        t = raw_row_to_X_y(movie, 200, model)
        X.append(t[0])
        y.append(t[1])
y = np.array(y)

In [17]:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.1, random_state=0)

In [18]:
ESTIMATORS = {
    "Random Forest": OneVsRestClassifier(RandomForestClassifier(n_estimators = 100)).fit(X_train, y_train),
    "LinearSVC": OneVsRestClassifier(LinearSVC(random_state=1)).fit(X_train, y_train),
    "Gaussian Naive Bayes": OneVsRestClassifier(GaussianNB()).fit(X_train, y_train),
    "Bernoulli Naive Bayes": OneVsRestClassifier(BernoulliNB()).fit(X_train, y_train),
}

In [19]:
# calculating metrics based on the training set
for e_name, e in ESTIMATORS.items():
    print "# ", e_name
    y_pred = e.predict(X_test)

    for metric_name, metric in METRICS.items():
        print "%s: %s " % (metric_name, metric(y_test , y_pred))

#  LinearSVC
micro-f1: 0.776015857284 
precision: 0.74575432547 
macro-f1: 0.739539497611 
samples-f1: 0.75784992785 
subset_accuracy: 0.093023255814 

  sample_weight=sample_weight)
  sample_weight=sample_weight)



weighted-f1: 0.767463422506 
hamming_loss: 0.262790697674 
#  Bernoulli Naive Bayes
micro-f1: 0.713462545836 
precision: 0.735508761207 
macro-f1: 0.69740386028 
samples-f1: 0.69151388663 
subset_accuracy: 0.0744186046512 
weighted-f1: 0.717538346106 
hamming_loss: 0.318023255814 
#  Random Forest
micro-f1: 0.785534907082 
precision: 0.762112104718 
macro-f1: 0.760258006023 
samples-f1: 0.766326283536 
subset_accuracy: 0.125581395349 
weighted-f1: 0.782480859522 
hamming_loss: 0.248255813953 
#  Gaussian Naive Bayes
micro-f1: 0.71696094168 
precision: 0.756424797341 
macro-f1: 0.704559903765 
samples-f1: 0.691873243036 
subset_accuracy: 0.093023255814 
weighted-f1: 0.720507400855 
hamming_loss: 0.307558139535 


  sample_weight=sample_weight)
  sample_weight=sample_weight)


In [20]:
# let's test an actual movie!
her_plot = """Theodore is a lonely man in the final stages of his divorce. When he's not working as a letter writer, his down time is spent playing video games and occasionally hanging out with friends. He decides to purchase the new OS1, which is advertised as the world's first artificially intelligent operating system, "It's not just an operating system, it's a consciousness," the ad states. Theodore quickly finds himself drawn in with Samantha, the voice behind his OS1. As they start spending time together they grow closer and closer and eventually find themselves in love. Having fallen in love with his OS, Theodore finds himself dealing with feelings of both great joy and doubt. As an OS, Samantha has powerful intelligence that she uses to help Theodore in ways others hadn't, but how does she help him deal with his inner conflict of being in love with an OS?"""
her_X_i = get_feature_vec(make_wordlist(her_plot), 200, model)

print "food_for_thought, funny, action, emotional, romantic, dark, brutal, thrilling"
for e_name, e in ESTIMATORS.items():
    y_pred = e.predict(her_X_i)
    print "# ", e_name, y_pred[0]
    print y_pred.shape

food_for_thought, funny, action, emotional, romantic, dark, brutal, thrilling
#  LinearSVC [1 1 0 1 1 0 0 1]
(200, 8)
#  Bernoulli Naive Bayes [0 1 0 1 1 0 0 0]
(200, 8)
#  Random Forest [1 1 0 1 1 0 0 1]
(200, 8)
#  Gaussian Naive Bayes [0 1 0 1 1 0 0 0]
(200, 8)


In [46]:
vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = 5000) 

In [49]:
bow = []

for idx, movie in test_data.iterrows():
    if idx > 1:
        bow.append(" ".join(make_wordlist(movie["descr"])))
bow

['danny ocean reunites with his old flame and the rest of his merry band of thieves in a caper concerning three huge heists in rome paris and amsterdam but europol agent isabel lahiri is hot on their heels',
 'when three friends finally come to after a raucous night of bachelor party revelry they find a baby in the closet and a tiger in the bathroom but they can t seem to locate their best friend doug who s supposed to be tying the knot launching a frantic search for doug the trio perseveres through a nasty hangover to try to make it to the church on time',
 'a young boy wins a tour through the most magnificent chocolate factory in the world led by the world s most unusual candy maker',
 'when the curator of the louvre is found murdered in the famed museum s hallowed halls harvard professor robert langdon and cryptographer sophie neve must untangle a deadly web of deceit involving the works of leonardo da vinci',
 'having never fully recovered from a prom date that became a total disas

In [None]:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.1, random_state=0)