In [1]:
# -*- coding: utf-8 -*-

In [2]:
import re

In [3]:
from functools import partial

In [4]:
from gensim.models import Word2Vec

In [5]:
import numpy as np
import pandas as pd

In [6]:
from sklearn import cross_validation
from sklearn.feature_extraction.text import CountVectorizer

In [7]:
from skmultilearn.meta.br import BinaryRelevance
from skmultilearn.meta.lp import LabelPowerset 

In [8]:
# these are the metrics we want to use for evaluation
from sklearn.metrics import hamming_loss
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score

In [18]:
# actual estimators
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB

In [10]:
# scoring metrics used for evaluation. namely precision, accuracy, hamming loss (recall)
# and f_1-score with several different averages
METRICS = ['precision_macro', 'recall_macro', 'f1_macro', 'precision_micro', 'recall_micro', 'f1_micro']

'\nMETRICS = {\n    "hamming_loss": hamming_loss,\n    "subset_accuracy": accuracy_score,\n    "precision": precision_score,\n    "macro-f1": partial(f1_score, average="macro"),\n    "micro-f1": partial(f1_score, average="micro"),\n}\n'

## Step 1: Prepare Training Data

In [11]:
test_data = pd.read_csv('../data/atmosphere_train.csv', delimiter=",")

In [12]:
# loading the already trained word embedding model
model = Word2Vec.load("../prep/300features_40minwords_10context")

The $j$-th component of the label vector is $1$ if the $j$-th label value is greater or equal to three. $0$ is used otherwise.

$v_{ij} = I[l_{ij} \geq 3]$

In [13]:
def make_wordlist(text, remove_stops=False):
    """
    Function that cleans the movie description text. Removes
    non-alphabetical letters and optionally english stop words.
    """
    # first step is to remove non-alphabetical characters
    text = re.sub("[^a-zA-Z]", " ", text)
    
    words = text.lower().split()
    
    if remove_stops:
        stops = set(stopwords.words("english"))
        return [w for w in words if not w in stops]
    
    return words

In [14]:
def get_feature_vec(words, f_size, model):
    n = .0
    x_i = np.zeros(f_size, dtype="float32")
    
    # internal word list of word2vec
    idx2words = set(model.index2word)
    
    s = filter(lambda e: e in idx2words, words)

    for w in s:
        n += 1.
        x_i = np.add(x_i, model[w])
    
    return np.divide(x_i, n)

In [15]:
def raw_row_to_X_y(row, f_size, model):
    """
    Outputs tuple for an instance containing of the (Xi, yi) feature vector/label vector pair.
    The label vector is given by the equation above.
    """
    
    # prepare the labels
    labels = row["labels"].split(",")

    y = [
        int("atmosphere_food_for_thought" in labels),
        int("atmosphere_funny" in labels),
        int("atmosphere_action" in labels),
        int("atmosphere_emotional" in labels),
        int("atmosphere_romantic" in labels),
        int("atmosphere_dark" in labels),
        int("atmosphere_brutal" in labels),
        int("atmosphere_thrilling" in labels)
    ]
    
    # create feature vector with word2vec model
    X = get_feature_vec(make_wordlist(row["descr"]), f_size, model)
    
    return (X, y)

In [16]:
X = []
y = []

for idx, movie in test_data.iterrows():
    if idx > 1:
        t = raw_row_to_X_y(movie, 300, model)
        X.append(t[0])
        y.append(t[1])

y = np.array(y)

In [17]:
import time

def train(e, X, y):
    """
    Train all the estimators on the current dataset.
    The fit method should reset internals anyway.
    """
    e.fit(X, y)
        

def test(e, X, y):
    # calculating metrics based on the training set
    for metric in METRICS:
        scores = cross_validation.cross_val_score(e, X, y, cv=10, scoring=metric)

        print "\t\tmean %s: %s" % (metric, scores.sum() / len(scores))


def run_est(X, y):
    """
    Prepare, train and test the estimators on the given dataset.
    """
    tic = time.time()

    X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y,
                                            test_size=0.1, random_state=0)

    # all means of given METRICS
    means = []

    for e_name, e in ESTIMATORS.items():
        # create new estimator with equivalent parameters as the current one
        e_ = e # .clone() FIXME: clone the classifier?

        print "\t-> training + testing ", e_name

        train(e_, X_train, y_train)
        print "\t-> %ds elapsed for training" % (time.time() - tic,)

        ms = test(e_, X_test, y_test)
        print "\t-> %ds elapsed for testing" % (time.time() - tic,)

        means.append(ms)

    return (X_train, y_train, X_test, y_test, means)

In [19]:
BR_ESTIMATORS = {
    #"BR Random Forest": BinaryRelevance(RandomForestClassifier(n_estimators = 100)),
    #"BR LinearSVC": BinaryRelevance(LinearSVC(random_state=1)),
    #"BR Gaussian Naive Bayes": BinaryRelevance(GaussianNB()),
    #"BR Bernoulli Naive Bayes": BinaryRelevance(BernoulliNB()),
    "BR LinearSVM": BinaryRelevance(svm.SVC(kernel='linear', probability=True))
}
LP_ESTIMATORS = {
    #"LP Random Forest": LabelPowerset(RandomForestClassifier(n_estimators = 100)),
    #"LP LinearSVC": LabelPowerset(LinearSVC(random_state=1)),
    #"LP Gaussian Naive Bayes": LabelPowerset(GaussianNB()),
    #"LP Bernoulli Naive Bayes": LabelPowerset(BernoulliNB()),
    "LP LinearSVM": LabelPowerset(svm.SVC(kernel='linear', probability=True))
}

# merge all dicts
# ESTIMATORS = BR_ESTIMATORS.copy()
# ESTIMATORS.update(LP_ESTIMATORS)
# ESTIMATORS.update(OVR_ESTIMATOR)
#ESTIMATORS = LP_ESTIMATORS.copy()

In [21]:
ESTIMATORS = LP_ESTIMATORS.copy()
data = run_est(X, y)

	-> training + testing  LP LinearSVM
	-> 13s elapsed for training
		mean precision_macro: 0.451352813853
		mean recall_macro: 0.75
		mean f1_macro: 0.553609353336
		mean precision_micro: 0.601803751804
		mean recall_micro: 0.803875678274
		mean f1_micro: 0.687302810125
	-> 36s elapsed for testing


In [22]:
ESTIMATORS = BR_ESTIMATORS.copy()
data = run_est(X, y)

	-> training + testing  BR LinearSVM
	-> 49s elapsed for training
		mean precision_macro: 0.591517896931
		mean recall_macro: 0.695348401598
		mean f1_macro: 0.602270939252
		mean precision_micro: 0.673959331204
		mean recall_micro: 0.774159196396
		mean f1_micro: 0.718461619928
	-> 80s elapsed for testing


In [107]:
from sklearn import preprocessing
X_normalized = preprocessing.normalize(X, norm='l2')
data = run_est(X_normalized, y)

	-> training + testing  OVR Gaussian Naive Bayes
	-> 0s elapsed for training
		mean precision_macro: 0.708004513868
		mean recall_macro: 0.643365078119
		mean f1_macro: 0.658558061253
		mean precision_micro: 0.720409322982
		mean recall_micro: 0.65420049392
		mean f1_micro: 0.682610524093
	-> 2s elapsed for testing
	-> training + testing  OVR LinearSVC
	-> 3s elapsed for training
		mean precision_macro: 0.69828177681
		mean recall_macro: 0.706454682014
		mean f1_macro: 0.682575866338
		mean precision_micro: 0.719455060709
		mean recall_micro: 0.749756364628
		mean f1_micro: 0.731668948538
	-> 10s elapsed for testing
	-> training + testing  OVR Random Forest
	-> 30s elapsed for training
		mean precision_macro: 0.685258792742
		mean recall_macro: 0.70580813139
		mean f1_macro: 0.683321498422
		mean precision_micro: 0.720142806777
		mean recall_micro: 0.756564608885
		mean f1_micro: 0.741582402595
	-> 306s elapsed for testing
	-> training + testing  OVR Bernoulli Naive Bayes
	-> 307s elap

In [48]:
def print_labels(labels):
    atmos = ["food_for_thought", "funny", "action", "emotional", "romantic", "dark", "brutal", "thrilling"]
    ats = []
    
    for a, l in zip(atmos, labels):
        if l:
            ats.append(a)
    print ','.join(ats)

In [106]:
# let's test an actual movie!
plot = """Theodore is a lonely man in the final stages of his divorce. When he's not working as a letter writer, his down time is spent playing video games and occasionally hanging out with friends. He decides to purchase the new OS1, which is advertised as the world's first artificially intelligent operating system, "It's not just an operating system, it's a consciousness," the ad states. Theodore quickly finds himself drawn in with Samantha, the voice behind his OS1. As they start spending time together they grow closer and closer and eventually find themselves in love. Having fallen in love with his OS, Theodore finds himself dealing with feelings of both great joy and doubt. As an OS, Samantha has powerful intelligence that she uses to help Theodore in ways others hadn't, but how does she help him deal with his inner conflict of being in love with an OS?"""
ys = []

for e_name, e in ESTIMATORS.items():
    y_pred = e.predict(X_i.reshape(1, -1))
    print "-> ", e_name
    print_labels(y_pred[0])

    try:
        print ["food_for_thought", "funny", "action", "emotional", "romantic", "dark", "brutal", "thrilling"]
        print e.predict_proba(X_i.reshape(1,-1))[0]
        ys += [e.predict_proba(X_i.reshape(1,-1))[0]]
    except:
        pass

print "means"
for i in range(0,8):
    a = [ys[0][i], ys[1][i], ys[2][i]]
    print sum(a) / 3

->  OVR Gaussian Naive Bayes
funny,romantic
['food_for_thought', 'funny', 'action', 'emotional', 'romantic', 'dark', 'brutal', 'thrilling']
[  9.36186838e-12   1.00000000e+00   7.40274917e-05   2.84274658e-07
   9.58459953e-01   1.45838440e-09   2.07169483e-09   5.25604542e-08]
->  OVR LinearSVC
funny,thrilling
['food_for_thought', 'funny', 'action', 'emotional', 'romantic', 'dark', 'brutal', 'thrilling']
->  OVR Random Forest
funny,action,thrilling
['food_for_thought', 'funny', 'action', 'emotional', 'romantic', 'dark', 'brutal', 'thrilling']
[ 0.2980119   0.87392857  0.58349856  0.4053254   0.28361111  0.11227381
  0.22160132  0.61161255]
->  OVR Bernoulli Naive Bayes
funny,romantic
['food_for_thought', 'funny', 'action', 'emotional', 'romantic', 'dark', 'brutal', 'thrilling']
[  2.02092454e-05   9.99996098e-01   2.70660183e-03   1.07987025e-02
   8.92001349e-01   6.21757080e-05   1.42615937e-05   8.20677117e-04]
means
0.0993440380056
0.957974889715
0.19542639544
0.138708127876
0.711

In [103]:
plot = """In Paris, the aristocratic and intellectual Philippe is a quadriplegic millionaire who is interviewing candidates for the position of his carer, with his red-haired secretary Magalie. Out of the blue, the rude African Driss cuts the line of candidates and brings a document from the Social Security and asks Phillipe to sign it to prove that he is seeking a job position so he can receive his unemployment benefit. Philippe challenges Driss, offering him a trial period of one month to gain experience helping him. Then Driss can decide whether he would like to stay with him or not. Driss accepts the challenge and moves to the mansion, changing the boring life of Phillipe and his employees."""
X_i = get_feature_vec(make_wordlist(plot), 200, model)
ys = []

for e_name, e in ESTIMATORS.items():
    y_pred = e.predict(X_i.reshape(1, -1))
    print "-> ", e_name
    print_labels(y_pred[0])

    try:
        print ["food_for_thought", "funny", "action", "emotional", "romantic", "dark", "brutal", "thrilling"]
        print e.predict_proba(X_i.reshape(1,-1))[0]
        ys += [e.predict_proba(X_i.reshape(1,-1))[0]]
    except:
        pass

print "means"
for i in range(0,8):
    a = [ys[0][i], ys[1][i], ys[2][i]]
    print sum(a) / 3

->  OVR Gaussian Naive Bayes
funny
['food_for_thought', 'funny', 'action', 'emotional', 'romantic', 'dark', 'brutal', 'thrilling']
[  4.03274939e-04   9.98498123e-01   1.19921116e-01   1.83141993e-01
   4.09833452e-02   1.31541334e-02   1.80469442e-05   7.63107041e-02]
->  OVR LinearSVC
food_for_thought,funny,action,emotional,thrilling
['food_for_thought', 'funny', 'action', 'emotional', 'romantic', 'dark', 'brutal', 'thrilling']
->  OVR Random Forest
food_for_thought,funny,action,emotional,thrilling
['food_for_thought', 'funny', 'action', 'emotional', 'romantic', 'dark', 'brutal', 'thrilling']
[ 0.6265363   0.59740278  0.63141522  0.74914646  0.29734524  0.33130014
  0.37263095  0.81935714]
->  OVR Bernoulli Naive Bayes
funny,emotional,romantic
['food_for_thought', 'funny', 'action', 'emotional', 'romantic', 'dark', 'brutal', 'thrilling']
[ 0.24435795  0.97975817  0.02872858  0.96303076  0.72018321  0.03704753
  0.00624693  0.17363003]
means
0.290432507289
0.858553023968
0.26002163975

In [105]:
plot = """Jim, Oz, Finch and Kevin are four friends who make a pact that before they graduate they will all lose their virginity. The hard job now is how to reach that goal by prom night. Whilst Oz begins singing to grab attention and Kevin tries to persuade his girlfriend, Finch tries any easy route of spreading rumors and Jim fails miserably. Whether it is being caught on top of a pie or on the Internet, Jim always end up with his trusty sex advice from his father. Will they achieve their goal of getting laid by prom night? or will they learn something much different."""
X_i = get_feature_vec(make_wordlist(plot), 200, model)
ys = []

for e_name, e in ESTIMATORS.items():
    y_pred = e.predict(X_i.reshape(1, -1))
    print "-> ", e_name
    print_labels(y_pred[0])

    try:
        print ["food_for_thought", "funny", "action", "emotional", "romantic", "dark", "brutal", "thrilling"]
        print e.predict_proba(X_i.reshape(1,-1))[0]
        ys += [e.predict_proba(X_i.reshape(1,-1))[0]]
    except:
        pass

print "means"
for i in range(0,8):
    a = [ys[0][i], ys[1][i], ys[2][i]]
    print sum(a) / 3

->  OVR Gaussian Naive Bayes
funny,romantic
['food_for_thought', 'funny', 'action', 'emotional', 'romantic', 'dark', 'brutal', 'thrilling']
[  9.36186838e-12   1.00000000e+00   7.40274917e-05   2.84274658e-07
   9.58459953e-01   1.45838440e-09   2.07169483e-09   5.25604542e-08]
->  OVR LinearSVC
funny,thrilling
['food_for_thought', 'funny', 'action', 'emotional', 'romantic', 'dark', 'brutal', 'thrilling']
->  OVR Random Forest
funny,action,thrilling
['food_for_thought', 'funny', 'action', 'emotional', 'romantic', 'dark', 'brutal', 'thrilling']
[ 0.2980119   0.87392857  0.58349856  0.4053254   0.28361111  0.11227381
  0.22160132  0.61161255]
->  OVR Bernoulli Naive Bayes
funny,romantic
['food_for_thought', 'funny', 'action', 'emotional', 'romantic', 'dark', 'brutal', 'thrilling']
[  2.02092454e-05   9.99996098e-01   2.70660183e-03   1.07987025e-02
   8.92001349e-01   6.21757080e-05   1.42615937e-05   8.20677117e-04]
means
0.0993440380056
0.957974889715
0.19542639544
0.138708127876
0.711

In [64]:
plot = """Death Row guards at a penitentiary, in the 1930's, have a moral dilemma with their job when they discover one of their prisoners, a convicted murderer, has a special gift."""
X_i = get_feature_vec(make_wordlist(plot), 200, model)

for e_name, e in ESTIMATORS.items():
    y_pred = e.predict(X_i.reshape(1, -1))
    print "-> ", e_name
    print_labels(y_pred[0])


->  OVR Gaussian Naive Bayes
food_for_thought,action,dark,brutal,thrilling
->  OVR LinearSVC
food_for_thought,action,emotional,dark,brutal,thrilling
->  OVR Random Forest
food_for_thought,action,emotional,dark,brutal,thrilling
->  OVR Bernoulli Naive Bayes
food_for_thought,action,dark,brutal,thrilling


In [65]:
plot = """Thomas A. Anderson is a man living two lives. By day he is an average computer programmer and by night a hacker known as Neo. Neo has always questioned his reality, but the truth is far beyond his imagination. Neo finds himself targeted by the police when he is contacted by Morpheus, a legendary computer hacker branded a terrorist by the government. Morpheus awakens Neo to the real world, a ravaged wasteland where most of humanity have been captured by a race of machines that live off of the humans' body heat and electrochemical energy and who imprison their minds within an artificial reality known as the Matrix. As a rebel against the machines, Neo must return to the Matrix and confront the agents: super-powerful computer programs devoted to snuffing out Neo and the entire human rebellion."""
X_i = get_feature_vec(make_wordlist(plot), 200, model)

for e_name, e in ESTIMATORS.items():
    y_pred = e.predict(X_i.reshape(1, -1))
    print "-> ", e_name
    print_labels(y_pred[0])

->  OVR Gaussian Naive Bayes
food_for_thought,action,dark,brutal,thrilling
->  OVR LinearSVC
food_for_thought,action,emotional,dark,brutal,thrilling
->  OVR Random Forest
food_for_thought,action,dark,brutal,thrilling
->  OVR Bernoulli Naive Bayes
food_for_thought,action,dark,brutal,thrilling


In [66]:
plot = """An adaptation of F. Scott Fitzgerald's Long Island-set novel, where Midwesterner Nick Carraway is lured into the lavish world of his neighbor, Jay Gatsby. Soon enough, however, Carraway will see through the cracks of Gatsby's nouveau riche existence, where obsession, madness, and tragedy await."""
X_i = get_feature_vec(make_wordlist(plot), 200, model)

for e_name, e in ESTIMATORS.items():
    y_pred = e.predict(X_i.reshape(1, -1))
    print "-> ", e_name
    print_labels(y_pred[0])

->  OVR Gaussian Naive Bayes
action,emotional,dark,thrilling
->  OVR LinearSVC
food_for_thought,action,emotional,dark,thrilling
->  OVR Random Forest
food_for_thought,funny,action,emotional,thrilling
->  OVR Bernoulli Naive Bayes
food_for_thought,action,emotional,dark,thrilling


In [67]:
plot = """The script begins as a young Hughes directs one of Scorsese's favorite films, Hell's Angels. Hughes was so obsessed with perfection in the aerial sequences that he waits forever for perfect conditions, right down to cloud formations. The Aviator ends in 1946, when Hughes was still a dashing young man and romancing actresses like Ava Gardner and Katharine Hepburn."""
X_i = get_feature_vec(make_wordlist(plot), 200, model)

for e_name, e in ESTIMATORS.items():
    y_pred = e.predict(X_i.reshape(1, -1))
    print "-> ", e_name
    print_labels(y_pred[0])

->  OVR Gaussian Naive Bayes
funny,emotional,romantic
->  OVR LinearSVC
food_for_thought,funny,emotional,thrilling
->  OVR Random Forest
food_for_thought,funny,action,emotional,thrilling
->  OVR Bernoulli Naive Bayes
funny,emotional,romantic


In [104]:
plot = """84 years later, a 101-year-old woman named Rose DeWitt Bukater tells the story to her granddaughter Lizzy Calvert, Brock Lovett, Lewis Bodine, Bobby Buell and Anatoly Mikailavich on the Keldysh about her life set in April 10th 1912, on a ship called Titanic when young Rose boards the departing ship with the upper-class passengers and her mother, Ruth DeWitt Bukater, and her fiancé, Caledon Hockley. Meanwhile, a drifter and artist named Jack Dawson and his best friend Fabrizio De Rossi win third-class tickets to the ship in a game. And she explains the whole story from departure until the death of Titanic on its first and last voyage April 15th, 1912 at 2:20 in the morning."""
X_i = get_feature_vec(make_wordlist(plot), 200, model)

for e_name, e in ESTIMATORS.items():
    y_pred = e.predict(X_i.reshape(1, -1))
    print "-> ", e_name
    print_labels(y_pred[0])
    try:
        print ["food_for_thought", "funny", "action", "emotional", "romantic", "dark", "brutal", "thrilling"]
        print e.predict_proba(X_i.reshape(1,-1))[0]
    except:
        pass

->  OVR Gaussian Naive Bayes
food_for_thought,funny,emotional,romantic
['food_for_thought', 'funny', 'action', 'emotional', 'romantic', 'dark', 'brutal', 'thrilling']
[  9.91403262e-01   6.18741479e-01   4.35746252e-05   9.99999898e-01
   9.99940118e-01   1.41430354e-03   5.22108293e-05   2.66620874e-01]
->  OVR LinearSVC
food_for_thought,emotional,romantic,thrilling
['food_for_thought', 'funny', 'action', 'emotional', 'romantic', 'dark', 'brutal', 'thrilling']
->  OVR Random Forest
food_for_thought,action,emotional,romantic,thrilling
['food_for_thought', 'funny', 'action', 'emotional', 'romantic', 'dark', 'brutal', 'thrilling']
[ 0.69231828  0.26387516  0.72052617  1.          1.          0.25954075
  0.37003805  1.        ]
->  OVR Bernoulli Naive Bayes
food_for_thought,funny,emotional,romantic
['food_for_thought', 'funny', 'action', 'emotional', 'romantic', 'dark', 'brutal', 'thrilling']
[  9.56000993e-01   8.42366746e-01   3.12821475e-04   9.99974417e-01
   9.99772876e-01   6.92070

In [70]:
plot = """Former dentist, Dr. King Schultz, buys the freedom of a slave, Django, and trains him with the intent to make him his deputy bounty hunter. Instead, he is led to the site of Django's wife who is under the hands of Calvin Candie, a ruthless plantation owner."""
X_i = get_feature_vec(make_wordlist(plot), 200, model)

for e_name, e in ESTIMATORS.items():
    y_pred = e.predict(X_i.reshape(1, -1))
    print "-> ", e_name
    print_labels(y_pred[0])

->  OVR Gaussian Naive Bayes
food_for_thought,action,dark,brutal,thrilling
->  OVR LinearSVC
food_for_thought,funny,action,emotional,dark,brutal,thrilling
->  OVR Random Forest
food_for_thought,funny,action,emotional,brutal,thrilling
->  OVR Bernoulli Naive Bayes
food_for_thought,action,dark,brutal,thrilling


In [71]:
plot = """It's 1954, and up-and-coming U.S. marshal Teddy Daniels is assigned to investigate the disappearance of a patient from Boston's Shutter Island Ashecliffe Hospital. He's been pushing for an assignment on the island for personal reasons, but before long he wonders whether he hasn't been brought there as part of a twisted plot by hospital doctors whose radical treatments range from unethical to illegal to downright sinister. Teddy's shrewd investigating skills soon provide a promising lead, but the hospital refuses him access to records he suspects would break the case wide open. As a hurricane cuts off communication with the mainland, more dangerous criminals "escape" in the confusion, and the puzzling, improbable clues multiply, Teddy begins to doubt everything - his memory, his partner, even his own sanity."""
X_i = get_feature_vec(make_wordlist(plot), 200, model)

for e_name, e in ESTIMATORS.items():
    y_pred = e.predict(X_i.reshape(1, -1))
    print "-> ", e_name
    print_labels(y_pred[0])


->  OVR Gaussian Naive Bayes
action,dark,brutal,thrilling
->  OVR LinearSVC
food_for_thought,action,emotional,dark,brutal,thrilling
->  OVR Random Forest
food_for_thought,action,emotional,brutal,thrilling
->  OVR Bernoulli Naive Bayes
food_for_thought,action,dark,brutal,thrilling


In [72]:
plot = """Forrest Gump is a simple man with a low I.Q. but good intentions. He is running through childhood with his best and only friend Jenny. His 'mama' teaches him the ways of life and leaves him to choose his destiny. Forrest joins the army for service in Vietnam, finding new friends called Dan and Bubba, he wins medals, creates a famous shrimp fishing fleet, inspires people to jog, starts a ping-pong craze, create the smiley, write bumper stickers and songs, donating to people and meeting the president several times. However, this is all irrelevant to Forrest who can only think of his childhood sweetheart Jenny Curran. Who has messed up her life. Although in the end all he wants to prove is that anyone can love anyone."""
X_i = get_feature_vec(make_wordlist(plot), 200, model)

for e_name, e in ESTIMATORS.items():
    y_pred = e.predict(X_i.reshape(1, -1))
    print "-> ", e_name
    print_labels(y_pred[0])


->  OVR Gaussian Naive Bayes
funny,emotional,romantic
->  OVR LinearSVC
food_for_thought,funny,emotional,romantic,thrilling
->  OVR Random Forest
food_for_thought,funny,action,emotional,thrilling
->  OVR Bernoulli Naive Bayes
funny,emotional,romantic


In [73]:
plot = """In Paris, the aristocratic and intellectual Philippe is a quadriplegic millionaire who is interviewing candidates for the position of his carer, with his red-haired secretary Magalie. Out of the blue, the rude African Driss cuts the line of candidates and brings a document from the Social Security and asks Phillipe to sign it to prove that he is seeking a job position so he can receive his unemployment benefit. Philippe challenges Driss, offering him a trial period of one month to gain experience helping him. Then Driss can decide whether he would like to stay with him or not. Driss accepts the challenge and moves to the mansion, changing the boring life of Phillipe and his employees."""
X_i = get_feature_vec(make_wordlist(plot), 200, model)

for e_name, e in ESTIMATORS.items():
    y_pred = e.predict(X_i.reshape(1, -1))
    print "-> ", e_name
    print_labels(y_pred[0])


->  OVR Gaussian Naive Bayes
funny
->  OVR LinearSVC
food_for_thought,funny,action,emotional,thrilling
->  OVR Random Forest
food_for_thought,funny,action,emotional,thrilling
->  OVR Bernoulli Naive Bayes
funny,emotional,romantic
