# MultiLabel Classifier

In [1]:
import numpy as np
import pickle

from scipy.stats import entropy
from scipy.sparse import save_npz, load_npz

from sklearn.multiclass import OneVsRestClassifier

from sklearn.externals import joblib

In [2]:
def save_obj(obj, name):
    """Pickle ``obj`` into the file ``name + '.pkl'``.

    The file is opened in binary write mode (an existing file is replaced)
    and the object is serialized with ``pickle.HIGHEST_PROTOCOL``.
    """
    target_path = name + '.pkl'
    with open(target_path, 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    """Return the object unpickled from the file ``name + '.pkl'``.

    NOTE: unpickling can execute arbitrary code, so only use this on
    files coming from a trusted source.
    """
    source_path = name + '.pkl'
    with open(source_path, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded

In [3]:
# https://www.youtube.com/watch?v=3mHy4OSyRf0 à 17min19
# https://stackoverflow.com/questions/15880133/jensen-shannon-divergence

def JS_Divergence(P, Q):
    """Jensen-Shannon divergence between two non-negative vectors.

    Both inputs are first rescaled by their L1 norm, then compared to their
    average ``M`` using ``scipy.stats.entropy(pk, qk)`` (relative entropy).
    The result is the mean of the two relative entropies.
    """
    p_norm, q_norm = (vec / np.linalg.norm(vec, ord=1) for vec in (P, Q))
    mixture = (p_norm + q_norm) * 0.5
    return (entropy(p_norm, mixture) + entropy(q_norm, mixture)) * 0.5

## Chargement des données

In [4]:
# Load the pickled train/test inputs (load_obj appends ".pkl" to each path).
X_train = load_obj("datas/X_train")
X_test = load_obj("datas/X_test")

# Pickled tag list.
taglist = load_obj("datas/taglist")

# Raw (non-binarized) targets for both splits.
y_train_clean = load_obj("datas/y_train")
y_test_clean = load_obj("datas/y_test")

# Reload the persisted binarizer and encode both target sets with it.
# NOTE(review): presumably a fitted sklearn MultiLabelBinarizer producing a
# 0/1 indicator matrix (one column per tag) — confirm against the training notebook.
mlb = joblib.load("models/MultiLabelBinarizer")
y_train = mlb.transform(y_train_clean)
y_test = mlb.transform(y_test_clean)

In [5]:
# Preparation of the test-set TF-IDF matrix (left commented out; the saved
# 'datas/tfidfMatrix_test.npz' file is what gets loaded afterwards).

# tfidf = joblib.load("models/TfidfVectorizer")  # load the TF-IDF vectorizer fitted on the train set
# tfidfMatrix_test = tfidf.transform(X_test)
# save_npz('datas/tfidfMatrix_test.npz', tfidfMatrix_test)

In [6]:
# Load the sparse TF-IDF matrices saved as .npz files.
# Original author's note: the test-set matrix was already computed previously.
tfidfMatrix = load_npz('datas/tfidfMatrix.npz')
tfidfMatrix_test = load_npz('datas/tfidfMatrix_test.npz')

## 1ere evaluation des Modèles

In [7]:
# autre que http://scikit-learn.org/stable/modules/model_evaluation.html#multilabel-ranking-metrics

def to_index(matrix):
    """Turn an indicator matrix into per-row lists of active column indices.

    ``matrix`` must provide ``tolist()`` returning a list of rows. For every
    row, the positions whose value equals 1 are returned in ascending order;
    a row without any 1 yields an empty list.
    """
    return [
        [position for position, value in enumerate(row) if value == 1]
        for row in matrix.tolist()
    ]

def score_custom(y_true, y_pred, nb_elem=5):
    """Mean per-sample recall of the true labels within the top-``nb_elem`` predictions.

    Parameters
    ----------
    y_true : sequence of sequences of int
        For each sample, the column indices of its true labels
        (e.g. the output of ``to_index``).
    y_pred : 2-D array-like
        One row of scores per sample and one column per label; a higher
        score means a more likely label.
    nb_elem : int, default 5
        Number of highest-scoring labels kept for each sample.

    Returns
    -------
    float
        Average over all samples of ``len(top_k & true) / len(true)``.
        A sample without any true label contributes 0 to the sum (instead
        of raising ``ZeroDivisionError``), and 0.0 is returned when
        ``y_pred`` has no rows.
    """
    # Column indices of the nb_elem largest scores of each row.
    top_k = np.argsort(y_pred, axis=1)[:, -nb_elem:]
    nb_samples = top_k.shape[0]
    if nb_samples == 0:
        return 0.0

    sum_score = 0.0
    for i in range(nb_samples):
        nb_choice = len(y_true[i])
        if nb_choice == 0:
            # Recall is undefined without true labels; count the sample as 0.
            continue
        intersection = len(np.intersect1d(top_k[i], y_true[i]))
        sum_score += intersection / nb_choice
    return sum_score / nb_samples

# Convert the binarized targets into per-sample lists of label indices,
# the format score_custom expects for its first argument.
y_test_index = to_index(y_test)
y_train_index = to_index(y_train)

## Multiclass avec OVR

### SGDC

In [28]:
from sklearn.linear_model import SGDClassifier

# Baseline: one SGD classifier with logistic loss per label (one-vs-rest),
# 5 passes over the data with tol=None.
# NOTE(review): no random_state is set, so scores may vary between runs.
mdl = SGDClassifier(loss="log", max_iter=5, tol=None)
ovr = OneVsRestClassifier(mdl)
ovr.fit(tfidfMatrix, y_train)

# Per-label probabilities for both splits.
proba_train = ovr.predict_proba(tfidfMatrix)
proba_test = ovr.predict_proba(tfidfMatrix_test)

# Top-5 recall (score_custom default nb_elem=5) on train and test.
print("train_score = ", score_custom(y_train_index, proba_train))
print("test_score = ", score_custom(y_test_index, proba_test))

train_score =  0.6268096311461413
test_score =  0.6057808982742091


### AdaBoostClassifier

In [None]:
# Disabled experiment: AdaBoost wrapped in one-vs-rest.
# NOTE(review): `mdl` is created below but never passed to AdaBoostClassifier,
# so the ensemble would use its default base estimator — confirm intent before re-enabling.

# from sklearn.ensemble import AdaBoostClassifier

# mdl = SGDClassifier(loss="log", max_iter=5, tol=None)
# ens = AdaBoostClassifier(n_estimators=5)
# ovr = OneVsRestClassifier(ens)
# ovr.fit(tfidfMatrix, y_train)

# proba_train = ovr.predict_proba(tfidfMatrix)
# proba_test = ovr.predict_proba(tfidfMatrix_test)

# print("train_score = ", score_custom(y_train_index, proba_train))
# print("test_score = ", score_custom(y_test_index, proba_test))

Très très lent

### GradientBoostingClassifier

In [None]:
# from sklearn.ensemble import GradientBoostingClassifier

# mdl = GradientBoostingClassifier(loss="deviance", n_estimators=10, max_depth=3)
# ovr = OneVsRestClassifier(mdl)
# ovr.fit(tfidfMatrix, y_train)

# proba_train = ovr.predict_proba(tfidfMatrix)
# proba_test = ovr.predict_proba(tfidfMatrix_test)

# print("train_score = ", score_custom(y_train_index, proba_train))
# print("test_score = ", score_custom(y_test_index, proba_test))

Entraîne des arbres de décision (très très lent).

### GaussianProcessClassifier

In [None]:
# from sklearn.gaussian_process import GaussianProcessClassifier

# mdl = GaussianProcessClassifier(multi_class="one_vs_rest", n_jobs = -1)
# ovr = OneVsRestClassifier(mdl)
# ovr.fit(tfidfMatrix, y_train)

# proba_train = ovr.predict_proba(tfidfMatrix)
# proba_test = ovr.predict_proba(tfidfMatrix_test)

# print("train_score = ", score_custom(y_train_index, proba_train))
# print("test_score = ", score_custom(y_test_index, proba_test))

Nécessite des matrices denses (non sparse) => MemoryError.

## Multilabel

### ExtraTreesClassifier

In [138]:
from sklearn.ensemble import ExtraTreesClassifier

# Fit the forest directly on the binarized target matrix (no OneVsRest wrapper).
# NOTE(review): no random_state is set, so results are not reproducible run to run.
mdl = ExtraTreesClassifier(n_estimators=20, max_depth=13)
mdl.fit(tfidfMatrix, y_train)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=13, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [139]:
# NOTE(review): with a 2-D target, predict_proba presumably returns one
# (n_samples, 2) array per label rather than a single matrix — confirm.
proba = mdl.predict_proba(tfidfMatrix_test)

In [150]:
# Display the entry at index 22 of the predict_proba output.
proba[22]

array([[ 0.99859832,  0.00140168],
       [ 0.99805606,  0.00194394],
       [ 0.99897252,  0.00102748],
       ..., 
       [ 0.99875011,  0.00124989],
       [ 0.99859832,  0.00140168],
       [ 0.99890041,  0.00109959]])

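La probabilité de la première colonne (classe 0, tag absent) est toujours proche de 1 : le modèle ne prédit quasiment jamais de tag.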

### RandomForestClassifier

In [151]:
from sklearn.ensemble import RandomForestClassifier

# Same setup as the ExtraTrees experiment: the forest is fitted directly on
# the binarized target matrix (no OneVsRest wrapper).
# NOTE(review): no random_state is set, so results are not reproducible run to run.
mdl = RandomForestClassifier(n_estimators=20, max_depth=13)
mdl.fit(tfidfMatrix, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=13, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [152]:
# NOTE(review): with a 2-D target, predict_proba presumably returns one
# (n_samples, 2) array per label rather than a single matrix — confirm.
proba = mdl.predict_proba(tfidfMatrix_test)

In [155]:
# Display the entry at index 10 of the predict_proba output.
proba[10]

array([[  9.99036105e-01,   9.63895235e-04],
       [  9.99279174e-01,   7.20825757e-04],
       [  9.99612224e-01,   3.87776437e-04],
       ..., 
       [  9.99115480e-01,   8.84520323e-04],
       [  9.99075808e-01,   9.24192488e-04],
       [  9.98936830e-01,   1.06317025e-03]])

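La probabilité de la première colonne (classe 0, tag absent) est toujours proche de 1 : le modèle ne prédit quasiment jamais de tag.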

### MLPClassifier

In [33]:
# The import must be live: on a fresh kernel (Restart & Run All) this cell
# otherwise fails with NameError, since MLPClassifier is only imported in a
# later cell.
from sklearn.neural_network import MLPClassifier

# Multilabel MLP with two hidden layers (200 then 100 units); early stopping
# is enabled.
mdl = MLPClassifier(hidden_layer_sizes=(200, 100), early_stopping=True)
mdl.fit(tfidfMatrix, y_train)

# Per-label probabilities for both splits.
proba_train = mdl.predict_proba(tfidfMatrix)
proba_test = mdl.predict_proba(tfidfMatrix_test)

# Top-5 recall (score_custom default nb_elem=5) on train and test.
print("train_score = ", score_custom(y_train_index, proba_train))
print("test_score = ", score_custom(y_test_index, proba_test))

train_score =  0.8205158163933511
test_score =  0.6889105656197734


<b>Attention</b>, ce score n'est peut-être pas très fiable car on a 3000 features (colonnes du tf-idf) en entrée et 773 en sortie (nb_classes). De ce fait, le MLP a une couche d'entrée de 3000 neurones, puis 200, puis 100, puis 773 en sortie. On perd beaucoup d'information à cause des couches cachées. Cependant, au vu du résultat, une évaluation sera faite avec Keras (plus rapide car sur GPU) par la suite.

### KNeighborsClassifier

In [None]:
# Disabled experiment: k-NN using JS_Divergence as the distance metric.
# NOTE(review): the predict_proba calls below use `ovr` (a model from an
# earlier cell) instead of the `mdl` fitted here — fix before re-enabling.

# from sklearn.neighbors import KNeighborsClassifier

# mdl = KNeighborsClassifier(n_neighbors=5, metric=JS_Divergence)
# mdl.fit(tfidfMatrix.todense(), y_train)

# proba_train = ovr.predict_proba(tfidfMatrix.todense())
# proba_test = ovr.predict_proba(tfidfMatrix_test.todense())

# print("train_score = ", score_custom(y_train_index, proba_train))
# print("test_score = ", score_custom(y_test_index, proba_test))

OOB car il faut des matrices denses.

### RidgeClassifierCV

In [None]:
# Disabled experiment: RidgeClassifierCV fitted on the binarized targets.
# NOTE(review): `y_index` is not defined anywhere in this notebook (probably
# meant `y_test_index`), and `proba` holds the output of predict(), not
# probabilities — fix before re-enabling.

# from sklearn.linear_model import RidgeClassifierCV

# mdl = RidgeClassifierCV()
# mdl.fit(tfidfMatrix, y_train)

# proba = mdl.predict(tfidfMatrix_test)
# print(score_custom(y_index, proba))

OOB car modèle linéaire avec inversion de $X^T X$.

## Fine tuning

In [34]:
from sklearn.model_selection import ParameterGrid

# Grid explored for the one-vs-rest SGD classifier: only the number of
# iterations and the penalty vary. (A duplicated "tol" entry was removed;
# duplicate keys in a dict literal are silently collapsed.)
parameters = {
    "loss": ["log"],
    "max_iter": [10, 20],
    "tol": [1e-3],
    "n_jobs": [-1],
    "penalty": [None, "l1", "l2"]
}

# Fit and score one model per parameter combination.
for params in ParameterGrid(parameters):
    print(params)
    mdl = SGDClassifier(**params)
    ovr = OneVsRestClassifier(mdl)
    ovr.fit(tfidfMatrix, y_train)

    proba_train = ovr.predict_proba(tfidfMatrix)
    proba_test = ovr.predict_proba(tfidfMatrix_test)

    # Top-5 recall (score_custom default nb_elem=5) on train and test.
    print("train_score = ", score_custom(y_train_index, proba_train))
    print("test_score = ", score_custom(y_test_index, proba_test), "\n")

{'loss': 'log', 'max_iter': 10, 'n_jobs': -1, 'penalty': None, 'tol': 0.001}
train_score =  0.7745893390133173
test_score =  0.7068510379819415 

{'loss': 'log', 'max_iter': 10, 'n_jobs': -1, 'penalty': 'l1', 'tol': 0.001}
train_score =  0.6661014261982294
test_score =  0.6609254296637892 

{'loss': 'log', 'max_iter': 10, 'n_jobs': -1, 'penalty': 'l2', 'tol': 0.001}
train_score =  0.6315917723046102
test_score =  0.6101582877764785 

{'loss': 'log', 'max_iter': 20, 'n_jobs': -1, 'penalty': None, 'tol': 0.001}
train_score =  0.7753182924223373
test_score =  0.706694894057977 

{'loss': 'log', 'max_iter': 20, 'n_jobs': -1, 'penalty': 'l1', 'tol': 0.001}
train_score =  0.6642151084641562
test_score =  0.6597823989709679 

{'loss': 'log', 'max_iter': 20, 'n_jobs': -1, 'penalty': 'l2', 'tol': 0.001}
train_score =  0.6308397267544997
test_score =  0.6092982456140463 



Les régularisations diminuent le score mais rapprochent les résultats du train set et du test set. Au final, la régularisation permet de mieux généraliser mais avec de moins bons résultats. Le nombre d'itérations n'aide pas particulièrement au-delà de 10 itérations. On va garder pour l'API le modèle linéaire sans régularisation car le score sur le test set reste plus élevé.

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import ParameterGrid

# Grid explored for the multilabel MLP: architecture and activation vary.
# The single-layer architecture is written as the 1-tuple (2000,): a bare
# (2000) is just the integer 2000.
parameters = {
    "early_stopping": [True],
    "hidden_layer_sizes": [(2000,), (2000, 1000), (200, 100)],
    "validation_fraction": [0.1],
    "activation": ["identity", "logistic", "tanh", "relu"]
}

# Fit and score one model per parameter combination.
for params in ParameterGrid(parameters):
    print(params)
    mdl = MLPClassifier(**params)
    mdl.fit(tfidfMatrix, y_train)

    proba = mdl.predict_proba(tfidfMatrix_test)
    # Score against the test label indices; the previous name `y_index` is
    # not defined anywhere and raised NameError after the first fit.
    print(score_custom(y_test_index, proba))

{'activation': 'identity', 'early_stopping': True, 'hidden_layer_sizes': 2000, 'validation_fraction': 0.1}


In [16]:
# NOTE: `Activation` and `load_model` are imported here but not used in this cell.
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.callbacks import EarlyStopping
from keras.models import load_model

# Input width = number of TF-IDF columns, output width = number of label columns.
nb_input = tfidfMatrix_test.shape[1]
nb_output = y_test.shape[1]

# One hidden layer of 1500 units followed by a sigmoid output per label.
# NOTE(review): the hidden Dense layer is given no activation argument
# (presumably linear by Keras default) — confirm this is intended.
model = Sequential()
model.add(Dense(1500, input_shape=(nb_input,)))
model.add(Dense(nb_output, activation="sigmoid"))

# NOTE(review): "top_k_categorical_accuracy" is presumably designed for
# single-label targets; its value on multilabel targets should be checked.
model.compile(optimizer='Nadam',
              loss='binary_crossentropy',
              metrics=["top_k_categorical_accuracy"])

# Early-stopping callback monitoring val_loss with patience=1.
early = EarlyStopping(monitor='val_loss', min_delta=0, patience=1, verbose=0, mode='auto')

In [17]:
# Train for at most 20 epochs with batches of 100, holding out 20% of the
# data for validation and using the early-stopping callback.
# The sparse TF-IDF matrix is converted to a dense one before being passed to Keras.
model.fit(x=tfidfMatrix.todense(), y=y_train, batch_size=100, epochs=20, validation_split=0.2, callbacks=[early])

Train on 38685 samples, validate on 9672 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20


<keras.callbacks.History at 0x239040b4cc0>

In [18]:
# Save the trained model to 'my_model.h5' (path relative to the working directory).
model.save('my_model.h5')

In [8]:
from keras.models import load_model

# Reload the saved model and predict on densified train and test matrices,
# in batches of 1000 rows.
model = load_model('my_model.h5')
proba_train = model.predict(tfidfMatrix.todense(), batch_size=1000)
proba_test = model.predict(tfidfMatrix_test.todense(), batch_size=1000)

Using TensorFlow backend.


In [9]:
# Top-5 recall (score_custom default nb_elem=5) of the Keras model on train and test.
print("train_score = ", score_custom(y_train_index, proba_train))
print("test_score = ", score_custom(y_test_index, proba_test), "\n")

train_score =  0.8602663523377974
test_score =  0.7215771608246868 



Overfitting malgré l'early stopping ? Par contre, meilleures performances, mais impossible à passer en prod :(