### Modeling using OneVsRest
---
**Goal:** Fit a multi-label classification model on the training set, then score it on the test set.

This notebook is meant to be run on Google Colab.

In [1]:
# Mount Google Drive inside the Colab runtime at /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
import os
# Work from the project folder on the mounted Drive so that the relative
# 'data/...' and 'models/...' paths used in later cells resolve.
os.chdir("/content/gdrive/MyDrive/Colab/github/IMDbXMTC")

In [3]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 500)
import matplotlib.pyplot as plt
%matplotlib inline
import joblib

Import train and test dataframes from previous step. They are large files.

In [18]:
%%time
# Read the tab-separated train/test frames; the first column of each file is
# used as the DataFrame index.
train = pd.read_csv('data/netflix_train_dataframe.tsv', sep='\t', index_col=0)
test = pd.read_csv('data/netflix_test_dataframe.tsv', sep='\t', index_col=0)

CPU times: user 891 ms, sys: 41.5 ms, total: 933 ms
Wall time: 975 ms


Put the genre column names and the feature names that aren't words into lists for easy use.

In [19]:
# All column names of the training frame, in file order.
cols = train.columns.tolist()

In [20]:
# Take the last 42 columns as the genre label columns and print them as a check.
# NOTE(review): this is positional — it relies on the label columns being the
# final 42 columns of the file; confirm against the previous step.
genre_cols = cols[-42:]
print(len(genre_cols))
print(genre_cols)

42
['g_Independent Movies', 'g_Faith & Spirituality', 'g_Documentaries', 'g_LGBTQ Movies', 'g_International TV Shows', 'g_TV Thrillers', 'g_TV Dramas', 'g_Stand-Up Comedy & Talk Shows', 'g_Thrillers', 'g_Anime Features', 'g_Science & Nature TV', 'g_TV Horror', 'g_Movies', 'g_Korean TV Shows', 'g_Teen TV Shows', 'g_Action & Adventure', 'g_Crime TV Shows', 'g_Anime Series', 'g_Cult Movies', 'g_Docuseries', 'g_Sci-Fi & Fantasy', 'g_TV Sci-Fi & Fantasy', 'g_Dramas', 'g_Sports Movies', 'g_TV Comedies', 'g_Horror Movies', 'g_Stand-Up Comedy', 'g_British TV Shows', 'g_Music & Musicals', 'g_TV Action & Adventure', 'g_Spanish-Language TV Shows', 'g_TV Mysteries', 'g_Reality TV', 'g_TV Shows', 'g_Comedies', 'g_Romantic TV Shows', 'g_Romantic Movies', "g_Kids' TV", 'g_Classic Movies', 'g_International Movies', 'g_Classic & Cult TV', 'g_Children & Family Movies']


In [21]:
# First two columns of the frame; they are excluded from X in the split below.
# NOTE(review): presumably non-word metadata columns — confirm against the previous step.
f_names = cols[:2]

Separate X and y from the train and test dataframes. We want JUST the genre columns for `y`, and everything except the genre columns (and the non-word feature columns in `f_names`) for `X`.

In [22]:
# Boolean column masks: label columns, and columns that must not be used as
# features (the labels themselves plus the leading columns listed in f_names).
train_is_label = train.columns.isin(genre_cols)
train_is_excluded = train.columns.isin(genre_cols + f_names)
test_is_label = test.columns.isin(genre_cols)
test_is_excluded = test.columns.isin(genre_cols + f_names)

# y keeps only the genre columns; X keeps everything that is not excluded.
y_train = train.loc[:, train_is_label]
X_train = train.loc[:, ~train_is_excluded]

X_test = test.loc[:, ~test_is_excluded]
y_test = test.loc[:, test_is_label]

---

Before running a model, we need to scale our data. Both standard and min-max were tested, but standard scaler came out on top.

In [23]:
%%time
# Scale data (Standard Scaler): the scaler is fitted on the training features
# only and the same fitted scaler is then applied to the test features.
from sklearn.preprocessing import StandardScaler

my_standard_scaler = StandardScaler()
X_train_s = my_standard_scaler.fit_transform(X_train)
X_test_s = my_standard_scaler.transform(X_test)

#joblib.dump(my_standard_scaler, 'models/my_standard_scaler.pkl')

CPU times: user 101 ms, sys: 22.2 ms, total: 123 ms
Wall time: 123 ms


In [24]:
# Scale data (MinMax Scaler): fitted on the training features only, then the
# same fitted scaler is applied to the test features.
from sklearn.preprocessing import MinMaxScaler

my_minmax_scaler = MinMaxScaler()
X_train_mm = my_minmax_scaler.fit_transform(X_train)
X_test_mm = my_minmax_scaler.transform(X_test)

#joblib.dump(my_minmax_scaler, 'models/my_minmax_scaler.pkl')

---

### Please note
MANY models were tested and pickled. The optimized model comes first; everything below it is testing of other models, scalers, scoring approaches, and hyperparameter tuning. I normally would not include all of it, but it remains here for completeness.

In the end, OneVsRest with Logistic Regression (C=0.01, solver='lbfgs') when scaled with a standard scaler was the best option.

In [25]:
import joblib
#my_model = joblib.load('models/my_1vr_linear_svc_default.pkl')

In [26]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

In [27]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the one-vs-rest logistic regression on the
# standardized training data. No `scoring` is passed, so the estimator's
# default scorer is used.
base_logreg = LogisticRegression(random_state=123, solver='lbfgs', max_iter=3000, C=0.01, n_jobs=-1)
my_log_model = OneVsRestClassifier(base_logreg, n_jobs=-1)

scores = cross_val_score(my_log_model, X_train_s, y_train, cv=5)
print(scores)

# Per-fold scores followed by their mean.
for fold_no, fold_score in enumerate(scores, start=1):
    print(f"Fold {fold_no}: {fold_score}")
print(f"Average Score:{np.mean(scores)}")

[0.08402725 0.10370931 0.08629826 0.08856927 0.07948524]
Fold 1: 0.08402725208175625
Fold 2: 0.10370931112793338
Fold 3: 0.08629825889477669
Fold 4: 0.08856926570779712
Fold 5: 0.07948523845571537
Average Score:0.08841786525359575


In [28]:
%%time
# Fit the one-vs-rest logistic regression on the full standardized training set.
ovr_logreg = OneVsRestClassifier(
    LogisticRegression(random_state=123, solver='lbfgs', max_iter=3000, C=0.01, n_jobs=-1),
    n_jobs=-1,
)
my_log_model = ovr_logreg.fit(X_train_s, y_train)

CPU times: user 214 ms, sys: 49.7 ms, total: 264 ms
Wall time: 15.8 s


In [29]:
# Outputs of predict() and predict_proba() for the standardized train and test features.
y_train_pred = my_log_model.predict(X_train_s)
y_train_proba = my_log_model.predict_proba(X_train_s)
y_test_pred = my_log_model.predict(X_test_s)
y_test_proba = my_log_model.predict_proba(X_test_s)

In [30]:
from sklearn.metrics import accuracy_score

# With multilabel targets accuracy_score is subset (exact-match) accuracy:
# a row only counts as correct when every genre label matches.
train_acc = accuracy_score(y_train, y_train_pred)
test_acc = accuracy_score(y_test, y_test_pred)
print(f'Training score: {train_acc:0.5f}')
print(f'    Test score: {test_acc:0.5f}')

Training score: 0.24557
    Test score: 0.08583


In [31]:
# Wrap the test predictions in a frame keyed by genre name.
y_pred_df = pd.DataFrame(y_test_pred)
y_pred_df.columns = genre_cols

# Accuracy of each genre column on the test set, one line per genre.
for genre in genre_cols:
    genre_acc = accuracy_score(y_test[genre], y_pred_df[genre])
    print(f'{genre_acc:0.4f}  {genre}')

0.8987  g_Independent Movies
0.9927  g_Faith & Spirituality
0.9314  g_Documentaries
0.9914  g_LGBTQ Movies
0.8279  g_International TV Shows
0.9923  g_TV Thrillers
0.8946  g_TV Dramas
0.9923  g_Stand-Up Comedy & Talk Shows
0.9223  g_Thrillers
0.9932  g_Anime Features
0.9905  g_Science & Nature TV
0.9891  g_TV Horror
0.9936  g_Movies
0.9837  g_Korean TV Shows
0.9927  g_Teen TV Shows
0.9005  g_Action & Adventure
0.9387  g_Crime TV Shows
0.9805  g_Anime Series
0.9936  g_Cult Movies
0.9655  g_Docuseries
0.9696  g_Sci-Fi & Fantasy
0.9927  g_TV Sci-Fi & Fantasy
0.7439  g_Dramas
0.9791  g_Sports Movies
0.9214  g_TV Comedies
0.9623  g_Horror Movies
0.9777  g_Stand-Up Comedy
0.9687  g_British TV Shows
0.9605  g_Music & Musicals
0.9782  g_TV Action & Adventure
0.9791  g_Spanish-Language TV Shows
0.9886  g_TV Mysteries
0.9709  g_Reality TV
0.9995  g_TV Shows
0.8102  g_Comedies
0.9569  g_Romantic TV Shows
0.9187  g_Romantic Movies
0.9605  g_Kids' TV
0.9868  g_Classic Movies
0.7039  g_International 

In [32]:
# Persist the fitted logistic-regression model under models/.
joblib.dump(my_log_model, 'models/my_logistic_model.pkl')

['models/my_logistic_model.pkl']


---

## Model testing and optimization (ongoing)

In [None]:
%%time
# One-vs-rest LinearSVC (no C passed, so the library default is used), fit on the standardized training data.
my_model = OneVsRestClassifier(LinearSVC(random_state=123, max_iter=3000), n_jobs=-1).fit(X_train_s, y_train)

In [None]:
# Export and save the fitted LinearSVC one-vs-rest model under models/.
joblib.dump(my_model, 'models/netflix_1vr_linear_svc_default.pkl')

In [None]:
# Test-set predictions from my_model (requires the fitting cell above to have been run).
y_pred = my_model.predict(X_test_s)

NameError: ignored

In [None]:
# Training-set predictions from my_model; note this rebinds y_train_pred.
y_train_pred = my_model.predict(X_train_s)

In [None]:
# Display the fitted classifier's multilabel_ attribute.
my_model.multilabel_
#my_model.predict_proba(X_train_s)

In [None]:
from sklearn.metrics import accuracy_score
# Overall accuracy of the my_model predictions on the train and test sets.
print(f'Training score: {accuracy_score(y_train, y_train_pred):0.5f}')
print(f'    Test score: {accuracy_score(y_test, y_pred):0.5f}')

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import multilabel_confusion_matrix

In [None]:
# Confusion Matrix: multilabel_confusion_matrix yields one matrix per label;
# each is wrapped in a labelled DataFrame.
cm = multilabel_confusion_matrix(y_test, y_pred)

cm_col_labels = ['Predicted Negative (0)', 'Predicted Positive (1)']
cm_row_labels = ['True Negative (0)', 'True Positive (1)']
g_cm_list = [
    pd.DataFrame(label_cm, columns=cm_col_labels, index=cm_row_labels)
    for label_cm in cm
]

# Show the raw values of the matrix at position 10.
g_cm_list[10].values

In [None]:
# Prediction arrays as DataFrames with one column per genre.
y_train_pred_df = pd.DataFrame(y_train_pred, columns=genre_cols)
y_pred_df = pd.DataFrame(y_pred, columns=genre_cols)

In [None]:
# Per-genre test accuracy, keyed by the genre name without its 'g_' prefix.
test_acc_dict = {}
for genre in genre_cols:
    genre_acc = accuracy_score(y_test[genre], y_pred_df[genre])
    test_acc_dict[genre[2:]] = genre_acc
    print(f'{genre_acc:0.4f}  {genre}')

In [None]:
# One row per genre (dict keys become the index) with a single 'score' column.
test_scores = pd.DataFrame.from_dict(test_acc_dict, orient='index', columns=['score'])

In [None]:
# Write the per-genre scores to CSV, labelling the index column 'genre'.
test_scores.to_csv('test_scores_last_model.csv', index_label='genre')

In [None]:
# Stack the coefficient row of each per-genre binary estimator into one array
# (one row per genre). The aggregated `OneVsRestClassifier.coef_` attribute was
# deprecated in scikit-learn 0.24 and later removed, so read the fitted
# sub-estimators directly; this gives the same array on older versions too.
coefs = np.vstack([est.coef_ for est in my_model.estimators_])

In [None]:
# Label the coefficient array (rows = genres, columns = feature names), then
# transpose so each feature is a row and each genre a column; display the result.
coef_df = pd.DataFrame(coefs, index=genre_cols, columns=X_train.columns)
coef_tdf = coef_df.T
coef_tdf

In [None]:
# Save the transposed coefficient table as a tab-separated file.
coef_tdf.to_csv('my_1vr_linear_svc_default_coef.tsv', sep='\t')

In [None]:
%%time
# One-vs-rest logistic regression with no C passed (library default), fit on the
# standardized training data.
# NOTE: this rebinds my_log_model, replacing the C=0.01 model fitted earlier in the notebook.
my_log_model = OneVsRestClassifier(LogisticRegression(random_state=123, max_iter=3000), n_jobs=-1).fit(X_train_s, y_train)

# EXPORT AND SAVE THE MODEL
joblib.dump(my_log_model, 'models/my_1vr_logreg_default.pkl')

In [None]:
%%time
# One-vs-rest logistic regression with C=0.01, fit on the min-max-scaled training data.
my_log_model_mm = OneVsRestClassifier(LogisticRegression(random_state=123, max_iter=3000, C=0.01), n_jobs=-1).fit(X_train_mm, y_train)

# EXPORT AND SAVE THE MODEL
joblib.dump(my_log_model_mm, 'models/my_1vr_logreg_minmax_0.01.pkl')

In [None]:
from sklearn.metrics import accuracy_score

# Evaluate the min-max-scaled logistic regression on both splits.
y_pred_log_mm = my_log_model_mm.predict(X_test_mm)
y_train_pred_log_mm = my_log_model_mm.predict(X_train_mm)
print(f'Train: {accuracy_score(y_train, y_train_pred_log_mm)}')
# Score this model's own test predictions. The original passed `y_pred_log`,
# which belongs to a different model and is only defined in a later cell, so it
# either raised NameError or silently reported another model's test score.
print(f' Test: {accuracy_score(y_test, y_pred_log_mm)}')

# Prediction arrays as DataFrames with one column per genre.
y_train_pred_log_mm_df = pd.DataFrame(y_train_pred_log_mm, columns=genre_cols)
y_pred_log_mm_df = pd.DataFrame(y_pred_log_mm, columns=genre_cols)

#test_acc_dict = {}
# Test set predictions: accuracy of each genre column.
for g in genre_cols:
    score = accuracy_score(y_test[g], y_pred_log_mm_df[g])
    #test_acc_dict.update( {g[2:] : score} )
    print(f'{score:0.4f}  {g}')

In [None]:
# Test-set predictions from whichever model my_log_model is currently bound to.
y_pred_log = my_log_model.predict(X_test_s)

In [None]:
# Training-set predictions from whichever model my_log_model is currently bound to.
y_train_pred_log = my_log_model.predict(X_train_s)

In [None]:
# Display the fitted classifier's multilabel_ attribute.
my_log_model.multilabel_
#my_model.predict_proba(X_train_s)

In [None]:
from sklearn.metrics import accuracy_score
# Training-set accuracy of the logistic-regression predictions (shown as the cell output).
accuracy_score(y_train, y_train_pred_log)

In [None]:
from sklearn.metrics import accuracy_score
# Test-set accuracy of the logistic-regression predictions (shown as the cell output).
accuracy_score(y_test, y_pred_log)

In [None]:
# Prediction arrays as DataFrames with one column per genre.
y_train_pred_log_df = pd.DataFrame(y_train_pred_log, columns=genre_cols)
y_pred_log_df = pd.DataFrame(y_pred_log, columns=genre_cols)

In [None]:
# Per-genre test accuracy, keyed by the genre name without its 'g_' prefix.
test_acc_dict = {}
for genre in genre_cols:
    genre_acc = accuracy_score(y_test[genre], y_pred_log_df[genre])
    test_acc_dict[genre[2:]] = genre_acc
    print(f'{genre_acc:0.4f}  {genre}')

In [None]:
# One row per genre (dict keys become the index) with a single 'score' column.
test_scores_log = pd.DataFrame.from_dict(test_acc_dict, orient='index', columns=['score'])

In [None]:
# Write the per-genre scores to CSV, labelling the index column 'genre'.
test_scores_log.to_csv('test_scores_model1.csv', index_label='genre')

In [None]:
# Coefficients of the logistic-regression model, one row per genre, stacked
# from the fitted per-genre binary estimators. The original reused `coefs`,
# which was extracted from the LinearSVC model (`my_model`), so the table
# exported as logistic-regression coefficients actually held SVC coefficients.
log_coefs = np.vstack([est.coef_ for est in my_log_model.estimators_])

# Label rows by genre and columns by feature name, then transpose so each
# feature is a row and each genre a column; display the result.
coef_df = pd.DataFrame(log_coefs, index=genre_cols, columns=X_train.columns)
coef_tdf = coef_df.T
coef_tdf

In [None]:
# Save the transposed coefficient table as a tab-separated file.
coef_tdf.to_csv('my_1vr_logreg_default_coef.tsv', sep='\t')

In [None]:
%%time

from sklearn.metrics import accuracy_score

# Sweep the regularization parameter C for a 'sag' logistic regression; for
# each value the model is fitted, pickled, scored on train/test, and its
# per-genre test accuracies are exported.
c_values = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1]
train_scores = []
test_scores = []

for c_val in c_values:
    sag_logreg = LogisticRegression(random_state=123, solver='sag', max_iter=3000, C=c_val, n_jobs=-1)
    my_log_model = OneVsRestClassifier(sag_logreg, n_jobs=-1).fit(X_train_s, y_train)

    # Export and save the model for this C value
    joblib.dump(my_log_model, f'models/my_1vr_logreg_sag_{c_val}.pkl')

    # Predictions on both splits
    y_train_pred_log = my_log_model.predict(X_train_s)
    y_pred_log = my_log_model.predict(X_test_s)

    # Overall accuracies, recorded for the plot that follows
    train_acc = accuracy_score(y_train, y_train_pred_log)
    test_acc = accuracy_score(y_test, y_pred_log)
    train_scores.append(train_acc)
    test_scores.append(test_acc)
    print(f'C:  {c_val}')
    print(f'Train score: {train_acc:0.5f}')
    print(f' Test score: {test_acc:0.5f}')

    # Prediction arrays as DataFrames with one column per genre
    y_train_pred_log_df = pd.DataFrame(y_train_pred_log, columns=genre_cols)
    y_pred_log_df = pd.DataFrame(y_pred_log, columns=genre_cols)

    # Per-genre test accuracy, keyed by genre name without the 'g_' prefix
    test_acc_dict = {}
    for genre in genre_cols:
        genre_acc = accuracy_score(y_test[genre], y_pred_log_df[genre])
        test_acc_dict[genre[2:]] = genre_acc
        print(f'{genre_acc:0.4f}  {genre}')

    # Export genre scores for this C value
    test_scores_log = pd.DataFrame.from_dict(test_acc_dict, orient='index', columns=['score'])
    test_scores_log.to_csv(f'test_scores_logreg_sag_{c_val}.csv', index_label='genre')

In [None]:
# Train vs. test accuracy for each C value tried in the sweep, on a log x-axis.
plt.figure()
plt.plot(c_values, train_scores, label='train')
plt.plot(c_values, test_scores, label='test')
plt.xscale('log')
plt.xlabel('C')
plt.ylabel('Accuracy')
plt.title('Accuracy vs. C')
# The series labels are only rendered when a legend is requested.
plt.legend()
# The original had `plt.show` without parentheses, which never calls the function.
plt.show()

In [None]:
# Display the training accuracies collected for each C value.
train_scores

In [None]:
# Display the test accuracies collected for each C value.
test_scores