In [1]:
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import re
import warnings
warnings.filterwarnings("ignore")

In [3]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import KFold, cross_val_score, cross_val_predict, StratifiedKFold
from sklearn.svm import LinearSVC

In [4]:
from nltk.tokenize.casual import TweetTokenizer
from nltk.stem import PorterStemmer

In [5]:
from tqdm import tqdm_notebook as p
from sklearn.metrics import recall_score, precision_score, f1_score

---

# The Data

In [6]:
df_train = pd.read_csv('train.csv', converters={'genres': eval})

In [173]:
df_test = pd.read_csv('df_test.csv').drop_duplicates().drop(columns=['movie', 'movie_name'])

In [7]:
df_train.head()

Unnamed: 0,id,movie,dialogue,genres
0,0,0,I thought you were in a meeting--? <BR> I am. ...,"[drama, romance]"
1,1,1,Are you sure you're okay? You're pale. <BR> I...,[drama]
2,2,2,Go on! Get out! <BR> Mom look don't say anythi...,[comedy]
3,3,3,I could have lost my fucking hands. <BR> That ...,"[mystery, thriller]"
4,4,4,Stick with me on this Gloria. I need you... <...,"[crime, thriller]"


In [174]:
df_test.head()

Unnamed: 0,id,dialogue,genres
0,0,Boy! Did you see the way Mama whopped that dep...,drama
1,1,"Gordon, the insurance people are balking on th...",drama
2,2,Very fancy. Did you design the bottle? <BR> W...,drama
3,3,It makes me so mad. Steven Schwimmer ready to ...,comedy drama
4,4,Something ought to loosen him up ... how comes...,action thriller


## We should validate with respect to movie id because dialogs in one movie are othen similar. 
Puttin dialog from movie witch was in train part into the validation part we can get too optimistic results

In [8]:
df_train.sort_values('movie', inplace=True)
df_train.reset_index(drop=True, inplace=True)

In [9]:
val = int(df_train.shape[0]*0.85)

In [10]:
df_train.loc[val]

id                                                      22212
movie                                                     370
dialogue    We have one motto: Peace on Earth. <BR> And Go...
genres                               [action, comedy, sci-fi]
Name: 31442, dtype: object

In [11]:
validation_part = df_train[df_train['movie'] == 371].index.min()

Join train and test:

In [14]:
df = pd.concat([df_train, df_test.drop(columns=['genres'])], sort=True)
df.drop(columns=["movie"], inplace=True)

In [15]:
1 - validation_part/df_train.shape[0]

0.14830634478656968

In [16]:
i_train = np.arange(0, validation_part)
i_val = np.arange(validation_part, df_train.shape[0])
i_test = np.arange(df_train.shape[0], df.shape[0])
i_full_train = np.arange(0, i_val.max())

Extract `x` and `y`:

In [17]:
x = df["dialogue"].values
y = df["genres"].values

In [156]:
y[:5]

array([list(['drama', 'romance']), list(['drama', 'romance']),
       list(['mystery', 'romance']), list(['drama', 'fantasy']),
       list(['drama', 'romance'])], dtype=object)

In [19]:
mlb = MultiLabelBinarizer()

In [20]:
yy_train = mlb.fit_transform(y[i_train])
yy_val = mlb.transform(y[i_val])
yy_test = mlb.transform(df_test['genres'].apply(lambda x: x.split(' ')).values)
yy = np.vstack((yy_train, yy_val, yy_test)).astype(int)

In [21]:
yy.shape

(46394, 20)

---
# Metric

In [22]:
def f1_scorer(estimator, x, y):
    return f1_score(y, estimator.predict(x), average='samples')

---
# Models

In [35]:
def evaluate(estimator, features, target, val_features, val_target, param, values):
    best_score = None
    best_value = None
    cv = KFold(n_splits=3, shuffle=True, random_state=42)
    for value in p(values):
        print(f"{param} = {value}")
        estimator.set_params(**{param: value})
        estimator.fit(features, target)
        train_score = f1_score(target, estimator.predict(features) , average='samples')
        print(f"train_score = {train_score:.3f}")
        test_score = cross_val_score(
            estimator, features, target, scoring=f1_scorer, cv=cv)
        m, s = test_score.mean(), test_score.std()
        val_score = f1_score(val_target, estimator.predict(val_features) , average='samples') 
        print(f"cross_val test score = {m:.3f} ± {s:.3f}")
        print(f"validation score = {val_score:.3f}")
        print()
        if best_score is None:
            best_score = val_score
            best_value = value
        elif val_score > best_score:
            print(f"new best! {value}\n")
            best_score = val_score
            best_value = value
    return best_value

In [25]:
def make_feature(estimator, features, target, method, shape, i_train, i_test, cv):
    result = np.zeros(shape, dtype=float)
    for i1, i2 in p(cv.split(features[i_train], target[i_train]), total=cv.get_n_splits()):
        estimator.fit(features[i_train][i1], target[i_train][i1])
        result[i_train[i2]] = getattr(estimator, method)(features[i_train][i2])
    estimator.fit(features[i_train], target[i_train])
    result[i_test] = getattr(estimator, method)(features[i_test])
    return result

## Tfidf features

In [26]:
tfidf_30k_1_1 = TfidfVectorizer(max_features=30000, ngram_range=(1, 1))

In [27]:
f_tfidf_30k_1_1 = tfidf_30k_1_1.fit_transform(x)

## Logistic regression

On a Tf-Idf from text.

In [28]:
lr = OneVsRestClassifier(LogisticRegression(random_state=43, C=5))

In [29]:
evaluate(lr, features=f_tfidf_30k_1_1[i_train], target=yy[i_train],
         val_features=f_tfidf_30k_1_1[i_val], val_target=yy[i_val], 
         param="estimator__penalty", values=['l1', 'l2'])

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))

estimator__penalty = l1
train_score = 0.845
cross_val test score = 0.612 ± 0.004
validation score = 0.519
estimator__penalty = l2
train_score = 0.795
cross_val test score = 0.600 ± 0.004
validation score = 0.492



'l1'

In [30]:
lr.set_params(estimator__penalty='l1');

In [34]:
evaluate(lr, features=f_tfidf_30k_1_1[i_train], target=yy[i_train],
         val_features=f_tfidf_30k_1_1[i_val], val_target=yy[i_val],
         param="estimator__C", values=[0.01, 5, 10, 20])

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))

estimator__C = 0.01
train_score = 0.252
cross_val test score = 0.302 ± 0.001
validation score = 0.197

estimator__C = 5
train_score = 0.845
cross_val test score = 0.612 ± 0.004
validation score = 0.519

new best!
estimator__C = 10
train_score = 0.929
cross_val test score = 0.603 ± 0.003
validation score = 0.507

estimator__C = 20
train_score = 0.958
cross_val test score = 0.589 ± 0.002
validation score = 0.487




5

In [37]:
lr.set_params(estimator__C=5)

OneVsRestClassifier(estimator=LogisticRegression(C=5, class_weight={0: 1, 1: 5}, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l1', random_state=43,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False),
          n_jobs=1)

In [38]:
evaluate(lr, features=f_tfidf_30k_1_1[i_train], target=yy[i_train],
         val_features=f_tfidf_30k_1_1[i_val], val_target=yy[i_val],
         param="estimator__class_weight", values=[{0:1,1:5}, {0:1,1:10}, {0:1,1:15}, None])

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))

estimator__class_weight = {0: 1, 1: 5}
train_score = 0.919
cross_val test score = 0.640 ± 0.001
validation score = 0.551

estimator__class_weight = {0: 1, 1: 10}
train_score = 0.906
cross_val test score = 0.634 ± 0.001
validation score = 0.544

estimator__class_weight = {0: 1, 1: 15}
train_score = 0.899
cross_val test score = 0.630 ± 0.001
validation score = 0.540

estimator__class_weight = None
train_score = 0.845
cross_val test score = 0.612 ± 0.004
validation score = 0.519




{0: 1, 1: 5}

In [39]:
lr.set_params(estimator__class_weight={0:1, 1:5})

OneVsRestClassifier(estimator=LogisticRegression(C=5, class_weight={0: 1, 1: 5}, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l1', random_state=43,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False),
          n_jobs=1)

## Tfidf features on preprocessed data

In [40]:
tokenizer = TweetTokenizer(reduce_len=True)

def preprocessor(row):
    row = re.sub('<BR>|<u>|</u>', '', row)
    row = tokenizer.tokenize(row)
    row = ' '.join(row)
    return row

In [41]:
x_preprocessed = df["dialogue"].apply(lambda x: preprocessor(x)).values

In [42]:
x[0]

'I thought you were in a meeting--? <BR> I am.  With you.'

In [43]:
x_preprocessed[0]

'I thought you were in a meeting - - ? I am . With you .'

In [44]:
tfidf_10k_1_1_v2 = TfidfVectorizer(max_features=10000, ngram_range=(1, 1))

In [45]:
f_tfidf_10k_1_1_v2 = tfidf_10k_1_1_v2.fit_transform(x_preprocessed)

## Logistic regression v.2

Tf-Idf on preprocessed text.

In [46]:
lr_v2 = OneVsRestClassifier(LogisticRegression(random_state=43, C=3))

In [48]:
evaluate(lr_v2, features=f_tfidf_10k_1_1_v2[i_train], target=yy[i_train],
         val_features=f_tfidf_10k_1_1_v2[i_val], val_target=yy[i_val],
         param="estimator__penalty", values=['l1', 'l2'])

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))

estimator__penalty = l1
train_score = 0.753
cross_val test score = 0.614 ± 0.003
validation score = 0.523

estimator__penalty = l2
train_score = 0.727
cross_val test score = 0.594 ± 0.004
validation score = 0.496



'l1'

In [49]:
lr_v2.set_params(estimator__penalty='l1');

In [50]:
evaluate(lr_v2, features=f_tfidf_10k_1_1_v2[i_train], target=yy[i_train],
         val_features=f_tfidf_10k_1_1_v2[i_val], val_target=yy[i_val],
         param="estimator__C", values=[3, 3.5, 4, 4.5, 5])

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

estimator__C = 3
train_score = 0.753
cross_val test score = 0.614 ± 0.003
validation score = 0.523

estimator__C = 3.5
train_score = 0.769
cross_val test score = 0.615 ± 0.004
validation score = 0.523

new best! 3.5

estimator__C = 4
train_score = 0.783
cross_val test score = 0.615 ± 0.003
validation score = 0.523

estimator__C = 4.5
train_score = 0.796
cross_val test score = 0.614 ± 0.003
validation score = 0.524

new best! 4.5

estimator__C = 5
train_score = 0.807
cross_val test score = 0.614 ± 0.003
validation score = 0.521



4.5

In [51]:
lr_v2.set_params(estimator__C=3.5)

OneVsRestClassifier(estimator=LogisticRegression(C=3.5, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=43, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
          n_jobs=1)

In [52]:
evaluate(lr_v2, features=f_tfidf_10k_1_1_v2[i_train], target=yy[i_train],
         val_features=f_tfidf_10k_1_1_v2[i_val], val_target=yy[i_val],
         param="estimator__class_weight", values=[{0:1,1:5}, {0:1,1:10}, {0:1,1:15}, None, 'balanced'])

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

estimator__class_weight = {0: 1, 1: 5}
train_score = 0.858
cross_val test score = 0.642 ± 0.002
validation score = 0.555

estimator__class_weight = {0: 1, 1: 10}
train_score = 0.835
cross_val test score = 0.630 ± 0.002
validation score = 0.542

estimator__class_weight = {0: 1, 1: 15}
train_score = 0.822
cross_val test score = 0.623 ± 0.002
validation score = 0.533

estimator__class_weight = None
train_score = 0.769
cross_val test score = 0.615 ± 0.004
validation score = 0.523

estimator__class_weight = balanced
train_score = 0.808
cross_val test score = 0.620 ± 0.002
validation score = 0.543



{0: 1, 1: 5}

In [53]:
lr_v2.set_params(estimator__class_weight={0:1, 1:5})

OneVsRestClassifier(estimator=LogisticRegression(C=3.5, class_weight={0: 1, 1: 5}, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l1', random_state=43,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False),
          n_jobs=1)

## Linear SVC

On a Tf-Idf from text.

In [54]:
svm = OneVsRestClassifier(LinearSVC(random_state=43, C=5, dual=False))

In [55]:
evaluate(svm, features=f_tfidf_30k_1_1[i_train], target=yy[i_train],
         val_features=f_tfidf_30k_1_1[i_val], val_target=yy[i_val],
         param="estimator__penalty", values=['l1', 'l2'])

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))

estimator__penalty = l1
train_score = 0.954
cross_val test score = 0.585 ± 0.001
validation score = 0.483

estimator__penalty = l2
train_score = 0.942
cross_val test score = 0.600 ± 0.002
validation score = 0.492

new best! l2



'l2'

In [56]:
svm = OneVsRestClassifier(LinearSVC(random_state=43, C=5))
evaluate(svm, features=f_tfidf_30k_1_1[i_train], target=yy[i_train],
         val_features=f_tfidf_30k_1_1[i_val], val_target=yy[i_val],
         param="estimator__loss", values=['hinge', 'squared_hinge'])

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))

estimator__loss = hinge
train_score = 0.893
cross_val test score = 0.615 ± 0.003
validation score = 0.512

estimator__loss = squared_hinge
train_score = 0.942
cross_val test score = 0.600 ± 0.002
validation score = 0.492



'hinge'

In [57]:
svm.set_params(estimator__penalty='l2', estimator__loss='hinge');

In [58]:
evaluate(svm, features=f_tfidf_30k_1_1[i_train], target=yy[i_train],
         val_features=f_tfidf_30k_1_1[i_val], val_target=yy[i_val],
         param="estimator__C", values=[2.5, 3, 3.5, 4, 4.5])

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

estimator__C = 2.5
train_score = 0.845
cross_val test score = 0.619 ± 0.005
validation score = 0.512

estimator__C = 3
train_score = 0.860
cross_val test score = 0.619 ± 0.004
validation score = 0.514

new best! 3

estimator__C = 3.5
train_score = 0.871
cross_val test score = 0.619 ± 0.005
validation score = 0.515

new best! 3.5

estimator__C = 4
train_score = 0.880
cross_val test score = 0.618 ± 0.004
validation score = 0.513

estimator__C = 4.5
train_score = 0.887
cross_val test score = 0.616 ± 0.003
validation score = 0.512



3.5

In [59]:
svm.set_params(estimator__C=3)

OneVsRestClassifier(estimator=LinearSVC(C=3, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='hinge', max_iter=1000, multi_class='ovr',
     penalty='l2', random_state=43, tol=0.0001, verbose=0),
          n_jobs=1)

In [60]:
evaluate(svm, features=f_tfidf_30k_1_1[i_train], target=yy[i_train],
         val_features=f_tfidf_30k_1_1[i_val], val_target=yy[i_val],
         param="estimator__class_weight", values=[{0:1,1:1.5}, {0:1,1:2}, None])

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

estimator__class_weight = {0: 1, 1: 1.5}
train_score = 0.894
cross_val test score = 0.637 ± 0.002
validation score = 0.538

estimator__class_weight = {0: 1, 1: 2}
train_score = 0.903
cross_val test score = 0.640 ± 0.002
validation score = 0.545

new best! {0: 1, 1: 2}

estimator__class_weight = None
train_score = 0.860
cross_val test score = 0.619 ± 0.004
validation score = 0.514



{0: 1, 1: 2}

In [61]:
svm.set_params(estimator__class_weight={0:1, 1:1.5})

OneVsRestClassifier(estimator=LinearSVC(C=3, class_weight={0: 1, 1: 1.5}, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='hinge', max_iter=1000, multi_class='ovr',
     penalty='l2', random_state=43, tol=0.0001, verbose=0),
          n_jobs=1)

## Linear SVC v.2

Tf-Idf on preprocessed text.

In [63]:
svm_v2 = OneVsRestClassifier(LinearSVC(random_state=43, C=5))
evaluate(svm_v2, features=f_tfidf_10k_1_1_v2[i_train], target=yy[i_train],
         val_features=f_tfidf_10k_1_1_v2[i_val], val_target=yy[i_val],
         param="estimator__loss", values=['hinge', 'squared_hinge'])

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))

estimator__loss = hinge
train_score = 0.835
cross_val test score = 0.612 ± 0.004
validation score = 0.520

estimator__loss = squared_hinge
train_score = 0.885
cross_val test score = 0.594 ± 0.003
validation score = 0.500



'hinge'

In [64]:
svm_v2 = OneVsRestClassifier(LinearSVC(random_state=43, C=5, dual=False))
evaluate(svm_v2, features=f_tfidf_10k_1_1_v2[i_train], target=yy[i_train],
         val_features=f_tfidf_10k_1_1_v2[i_val], val_target=yy[i_val],
         param="estimator__penalty", values=['l1', 'l2'])

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))

estimator__penalty = l1
train_score = 0.904
cross_val test score = 0.578 ± 0.003
validation score = 0.484

estimator__penalty = l2
train_score = 0.885
cross_val test score = 0.594 ± 0.003
validation score = 0.500

new best! l2



'l2'

In [65]:
svm_v2.set_params(estimator__loss='hinge', estimator__penalty='l2', estimator__dual=True);

In [66]:
evaluate(svm_v2, features=f_tfidf_10k_1_1_v2[i_train], target=yy[i_train],
         val_features=f_tfidf_10k_1_1_v2[i_val], val_target=yy[i_val],
         param="estimator__C", values=[2.5, 3, 3.5, 4, 4.5])

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

estimator__C = 2.5
train_score = 0.794
cross_val test score = 0.616 ± 0.004
validation score = 0.521

estimator__C = 3
train_score = 0.806
cross_val test score = 0.616 ± 0.004
validation score = 0.521

new best! 3

estimator__C = 3.5
train_score = 0.815
cross_val test score = 0.616 ± 0.005
validation score = 0.522

new best! 3.5

estimator__C = 4
train_score = 0.823
cross_val test score = 0.614 ± 0.004
validation score = 0.520

estimator__C = 4.5
train_score = 0.830
cross_val test score = 0.613 ± 0.004
validation score = 0.519



3.5

In [67]:
svm_v2.set_params(estimator__C=3)

OneVsRestClassifier(estimator=LinearSVC(C=3, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='hinge', max_iter=1000, multi_class='ovr',
     penalty='l2', random_state=43, tol=0.0001, verbose=0),
          n_jobs=1)

In [68]:
evaluate(svm_v2, features=f_tfidf_10k_1_1_v2[i_train], target=yy[i_train],
         val_features=f_tfidf_10k_1_1_v2[i_val], val_target=yy[i_val],
         param="estimator__class_weight", values=[{0:1,1:1.5}, {0:1,1:2},
                                                  {0:1,1:2.5}, {0:1,1:3},
                                                  {0:1,1:3.5}, {0:1,1:4},
                                                  None, 'balanced'])

HBox(children=(IntProgress(value=0, max=8), HTML(value='')))

estimator__class_weight = {0: 1, 1: 1.5}
train_score = 0.847
cross_val test score = 0.638 ± 0.001
validation score = 0.549

estimator__class_weight = {0: 1, 1: 2}
train_score = 0.859
cross_val test score = 0.641 ± 0.002
validation score = 0.558

new best! {0: 1, 1: 2}

estimator__class_weight = {0: 1, 1: 2.5}
train_score = 0.861
cross_val test score = 0.640 ± 0.002
validation score = 0.555

estimator__class_weight = {0: 1, 1: 3}
train_score = 0.859
cross_val test score = 0.637 ± 0.001
validation score = 0.554

estimator__class_weight = {0: 1, 1: 3.5}
train_score = 0.856
cross_val test score = 0.635 ± 0.002
validation score = 0.551

estimator__class_weight = {0: 1, 1: 4}
train_score = 0.852
cross_val test score = 0.632 ± 0.002
validation score = 0.548

estimator__class_weight = None
train_score = 0.806
cross_val test score = 0.616 ± 0.004
validation score = 0.521

estimator__class_weight = balanced
train_score = 0.818
cross_val test score = 0.616 ± 0.001
validation score = 0.535



{0: 1, 1: 2}

In [69]:
lr_v2.set_params(estimator__class_weight={0:1, 1:1.5})

OneVsRestClassifier(estimator=LogisticRegression(C=3.5, class_weight={0: 1, 1: 1.5}, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l1', random_state=43,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False),
          n_jobs=1)

# Meta-feature.

In [70]:
cv = KFold(n_splits=10, random_state=42, shuffle=True)

In [72]:
lr_predict = make_feature(lr, f_tfidf_30k_1_1, yy, "predict", yy.shape,
                          i_full_train, i_test, cv)
lr_predict_proba = make_feature(lr, f_tfidf_30k_1_1, yy, "predict_proba",
                                yy.shape, i_full_train, i_test, cv)

lr_predict_v2 = make_feature(lr_v2, f_tfidf_10k_1_1_v2, yy, "predict", yy.shape,
                             i_full_train, i_test, cv)
lr_predict_proba_v2 = make_feature(lr_v2, f_tfidf_10k_1_1_v2, yy, "predict_proba",
                                   yy.shape, i_full_train, i_test, cv)

svm_predict = make_feature(svm, f_tfidf_30k_1_1, yy, "predict", yy.shape, i_full_train,
                           i_test, cv)
svm_predict_proba = make_feature(svm, f_tfidf_30k_1_1, yy, "decision_function", yy.shape,
                                 i_full_train, i_test, cv)

svm_predict_v2 = make_feature(svm_v2, f_tfidf_10k_1_1_v2, yy, "predict", yy.shape,
                              i_full_train, i_test, cv)
svm_predict_proba_v2 = make_feature(svm_v2, f_tfidf_10k_1_1_v2, yy, "decision_function",
                                    yy.shape, i_full_train, i_test, cv)

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

In [73]:
svm_predict_proba_norm = (svm_predict_proba - svm_predict_proba.min(axis=1)[:, np.newaxis]) / (
    (svm_predict_proba.max(axis=1) - svm_predict_proba.min(axis=1)))[:, np.newaxis]

svm_predict_proba_v2_norm = (svm_predict_proba_v2 - svm_predict_proba_v2.min(axis=1)[:, np.newaxis]) / (
    (svm_predict_proba_v2.max(axis=1) - svm_predict_proba_v2.min(axis=1)))[:, np.newaxis]

In [74]:
f_all = np.hstack((lr_predict, lr_predict_proba, lr_predict_v2, lr_predict_proba_v2,
                   svm_predict, svm_predict_proba_norm, svm_predict_v2, svm_predict_proba_v2_norm))

In [75]:
f_all[i_val.max()-2].astype(int).reshape((8,20)).max(axis=0)

array([1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0])

In [76]:
yy[i_val.max()-2]

array([1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

## LightGBM

As a second-level model.

In [77]:
lgbm = OneVsRestClassifier(LGBMClassifier(random_state=42))

In [79]:
evaluate(lgbm, features=f_all[i_train], target=yy[i_train],
         val_features=f_all[i_val], val_target=yy[i_val],
         param="estimator__n_estimators", values=[50, 70, 100, 130])

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))

estimator__n_estimators = 50
train_score = 0.721
cross_val test score = 0.648 ± 0.003
validation score = 0.608

estimator__n_estimators = 70
train_score = 0.744
cross_val test score = 0.649 ± 0.003
validation score = 0.607

estimator__n_estimators = 100
train_score = 0.780
cross_val test score = 0.648 ± 0.002
validation score = 0.603

estimator__n_estimators = 130
train_score = 0.812
cross_val test score = 0.647 ± 0.002
validation score = 0.606



50

In [80]:
lgbm.set_params(estimator__n_estimators=70);

In [81]:
evaluate(lgbm, features=f_all[i_train], target=yy[i_train],
         val_features=f_all[i_val], val_target=yy[i_val],
         param="estimator__num_leaves", values=[6, 7, 8, 9, 10, 11, 12])

HBox(children=(IntProgress(value=0, max=7), HTML(value='')))

estimator__num_leaves = 6
train_score = 0.664
cross_val test score = 0.651 ± 0.003
validation score = 0.613

estimator__num_leaves = 7
train_score = 0.665
cross_val test score = 0.649 ± 0.003
validation score = 0.613

estimator__num_leaves = 8
train_score = 0.668
cross_val test score = 0.650 ± 0.002
validation score = 0.610

estimator__num_leaves = 9
train_score = 0.673
cross_val test score = 0.649 ± 0.003
validation score = 0.610

estimator__num_leaves = 10
train_score = 0.676
cross_val test score = 0.650 ± 0.003
validation score = 0.610

estimator__num_leaves = 11
train_score = 0.680
cross_val test score = 0.649 ± 0.003
validation score = 0.611

estimator__num_leaves = 12
train_score = 0.683
cross_val test score = 0.650 ± 0.002
validation score = 0.611



6

In [82]:
lgbm.set_params(estimator__num_leaves=6);

In [83]:
evaluate(lgbm, features=f_all[i_train], target=yy[i_train],
         val_features=f_all[i_val], val_target=yy[i_val],
         param="estimator__min_data_in_leaf", values=np.arange(50, 131, 20))

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

estimator__min_data_in_leaf = 50
train_score = 0.664
cross_val test score = 0.652 ± 0.003
validation score = 0.614

estimator__min_data_in_leaf = 70
train_score = 0.663
cross_val test score = 0.653 ± 0.003
validation score = 0.613

estimator__min_data_in_leaf = 90
train_score = 0.665
cross_val test score = 0.653 ± 0.003
validation score = 0.615

new best! 90

estimator__min_data_in_leaf = 110
train_score = 0.664
cross_val test score = 0.652 ± 0.003
validation score = 0.613

estimator__min_data_in_leaf = 130
train_score = 0.664
cross_val test score = 0.653 ± 0.003
validation score = 0.614



90

In [84]:
lgbm.set_params(estimator__min_data_in_leaf=90);

In [85]:
f_lgbm_predict_proba = make_feature(lgbm, f_all, yy, "predict_proba", yy.shape, i_full_train, i_test, cv)

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

In [86]:
f_lgbm_predict = make_feature(lgbm, f_all, yy, "predict", yy.shape, i_full_train, i_test, cv)

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

One threshold for all genres works bad:

In [87]:
f1_score(yy, f_lgbm_predict, average='samples')

0.5144229959949664

In [88]:
preds = lgbm.predict(f_all[i_val])

In [89]:
lgbm_proba = lgbm.predict_proba(f_all[i_val])

## Threshold selection

In [90]:
scores = {}
for k in range(20):
    li = []
    for treshold in range(100):
        li.append(f1_score(yy[:,k], f_lgbm_predict_proba[:,k] > treshold / 100))
    scores[k] = np.argmax(li) / 100

In [91]:
lgbm_preds = list(map(lambda row: np.array([1 if proba >= max(0.37, scores[k])
                                            else 0 for k, proba in enumerate(row)]), f_lgbm_predict_proba))
lgbm_preds = np.vstack(lgbm_preds)

In [95]:
y_pred = lgbm_preds[i_test].astype('int')
f1_score(yy_test, y_pred, average='samples')

0.6146970149682051

In [96]:
', '.join(list(mlb.inverse_transform(lgbm_preds[0:1])[0]))

'drama'

In [97]:
lgbm.fit(f_all[i_full_train], yy[i_full_train])

OneVsRestClassifier(estimator=LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_data_in_leaf=130,
        min_split_gain=0.0, n_estimators=70, n_jobs=-1, num_leaves=6,
        objective=None, random_state=42, reg_alpha=0.0, reg_lambda=0.0,
        silent=True, subsample=1.0, subsample_for_bin=200000,
        subsample_freq=0),
          n_jobs=1)

In [98]:
import pickle

In [99]:
with open("models.pickle", "wb") as f:
    pickle.dump({
        'lgbm': lgbm,
        'lr_1': lr,
        'lr_2': lr_v2,
        'svm_1': svm,
        'svm_2': svm_v2,
        'tfidf_1': tfidf_30k_1_1,
        'tfidf_2': tfidf_10k_1_1_v2,
        'mlb': mlb,
        'scores': scores
    }, f)

---

# Predictions

In [100]:
test = df_test["dialogue"].values
tfidf_1 = tfidf_30k_1_1.transform(test)
tfidf_2 = tfidf_10k_1_1_v2.transform(test)
        
lr_predict_1 = lr.predict(tfidf_1)
lr_predict_proba_1 = lr.predict_proba(tfidf_1)
        
lr_predict_2 = lr_v2.predict(tfidf_2)
lr_predict_proba_2 = lr_v2.predict_proba(tfidf_2)
        
svm_predict_1 = svm.predict(tfidf_1)
svm_predict_proba_1 = svm.decision_function(tfidf_1)
        
svm_predict_2 = svm_v2.predict(tfidf_2)
svm_predict_proba_2 = svm_v2.decision_function(tfidf_2)
        
svm_predict_proba_1_norm = (svm_predict_proba_1 - svm_predict_proba_1.min(axis=1)[:, np.newaxis]) / (
    (svm_predict_proba_1.max(axis=1) - svm_predict_proba_1.min(axis=1)))[:, np.newaxis]

svm_predict_proba_2_norm = (svm_predict_proba_2 - svm_predict_proba_2.min(axis=1)[:, np.newaxis]) / (
    (svm_predict_proba_2.max(axis=1) - svm_predict_proba_2.min(axis=1)))[:, np.newaxis]
        
f_all = np.hstack((lr_predict_1, lr_predict_proba_1,
                   lr_predict_2, lr_predict_proba_2,
                   svm_predict_1, svm_predict_proba_1_norm,
                   svm_predict_2, svm_predict_proba_2_norm))
        
lgbm_predict_proba = lgbm.predict_proba(f_all)
lgbm_preds = list(map(lambda row: np.array([1 if proba >= max(0.37, scores[k])
                                            else 0 for k, proba in enumerate(row)]), lgbm_predict_proba))

In [101]:
result = np.vstack(lgbm_preds)

In [102]:
result[sum(result.T)==0] = np.array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

Without replacement of zero-predictions:

In [154]:
print(f"{f1_score(yy_test, np.vstack(lgbm_preds), average='samples'):.3}")

0.615


Replacement of zero-predictions with most popular genre - "drama" makes predictions a bit better:

In [155]:
print(f"{f1_score(yy_test, result, average='samples'):.3}")

0.618


In [167]:
df_preds = pd.DataFrame(columns=['id', 'genres'])
df_preds['id'] = np.arange(df_test.shape[0])

In [168]:
df_preds["genres"] = [" ".join(z) for z in mlb.inverse_transform(result)]

In [169]:
df_preds.head()

Unnamed: 0,id,genres
0,0,drama
1,1,drama
2,2,drama
3,3,drama romance
4,4,action thriller


$\textbf {Some predictions are correct}$

In [108]:
n = 1
print('dialog:\n{}'.format(test_answ.iloc[n,1]), '\n')
print('actual genres:\n', test_answ.iloc[n,2], '\n')
print('predicted genres:\n', df_preds.iloc[n,1])

dialog:
Gordon, the insurance people are balking on the logging trucks... <BR> Tell those spineless toads we'll self-insure if they don't write it... You fire 33 vice presidents and nothing changes...  You eating twinkies today, Bud, or are you schtupping some stewardess... 

actual genres:
 drama 

predicted genres:
 drama


In [109]:
n = 10
print('dialog:\n{}'.format(test_answ.iloc[n,1]), '\n')
print('actual genres:\n', test_answ.iloc[n,2], '\n')
print('predicted genres:\n', df_preds.iloc[n,1])

dialog:
Go onto the next question. <BR> Good advice for the French not for the English!  Do you think God hates the English? <BR> I don't know but you're all men of the church... why not ask Him yourself? 

actual genres:
 drama war 

predicted genres:
 drama war


$\textbf {Some predictions are absolutely wrong}$

In [147]:
n = 100
print('dialog:\n{}'.format(test_answ.iloc[n,1]), '\n')
print('actual genres:\n', test_answ.iloc[n,2], '\n')
print('predicted genres:\n', df_preds.iloc[n,1])

dialog:
I'm sorry. I don't know what else to say except I'm sorry. <BR> No I'm the one who's sorry. I misjudged you. Now if you'll excuse me I have some oozing to do. 

actual genres:
 thriller 

predicted genres:
 comedy drama


$\textbf {And some predictions are partly correct}$

In [146]:
n = 1000
print('dialog:\n{}'.format(test_answ.iloc[n,1]), '\n')
print('actual genres:\n', test_answ.iloc[n,2], '\n')
print('predicted genres:\n', df_preds.iloc[n,1])

dialog:
When you slaughter a goat and wrench its heart out with your bare hands do you then summon hellfire? <BR> I mean what are you actually doing here? What is the hidden agenda? 

actual genres:
 comedy drama 

predicted genres:
 crime drama
