# MultiLabel Classifier

In [1]:
import numpy as np
import pickle

from scipy.stats import entropy
from scipy.sparse import save_npz, load_npz

from sklearn.multiclass import OneVsRestClassifier

from sklearn.externals import joblib

import warnings
warnings.filterwarnings('ignore')

In [2]:
def save_obj(obj, name ):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)

In [3]:
# https://www.youtube.com/watch?v=3mHy4OSyRf0 à 17min19
# https://stackoverflow.com/questions/15880133/jensen-shannon-divergence

def JS_Divergence(P, Q):
    _P = P / np.linalg.norm(P, ord=1)
    _Q = Q / np.linalg.norm(Q, ord=1)
    _M = 0.5 * (_P + _Q)
    return 0.5 * (entropy(_P, _M) + entropy(_Q, _M))

In [4]:
# autre que http://scikit-learn.org/stable/modules/model_evaluation.html#multilabel-ranking-metrics

def to_index(matrix):
    matrix = matrix.tolist()
    result = []
    for row in matrix:
        r = []
        for index, col in enumerate(row):
            if col == 1:
                r.append(index)
        result.append(r)
    return result

def score_custom(y_true, y_pred, nb_elem=5):
    sum_score = 0
    y_pred = np.argsort(y_pred, axis=1)[:, -nb_elem:]
    for i in range(y_pred.shape[0]):
        intersection = len(np.intersect1d(y_pred[i], y_true[i]))
        nb_choice = len(y_true[i])
        sum_score += intersection/nb_choice
    return sum_score/y_pred.shape[0]

## Chargement des données

In [5]:
X_train = load_obj("datas/X_train")
X_test = load_obj("datas/X_test")

taglist = load_obj("datas/taglist")

y_train_clean = load_obj("datas/y_train")
y_test_clean = load_obj("datas/y_test")

mlb = joblib.load("models/MultiLabelBinarizer")
y_train = mlb.transform(y_train_clean)
y_test = mlb.transform(y_test_clean)

y_test_index = to_index(y_test)
y_train_index = to_index(y_train)

In [6]:
# preparation de la tfidf de test

# tfidf = joblib.load("models/TfidfVectorizer")  # chargement du tfidf trained sur le train set
# tfidfMatrix_test = tfidf.transform(X_test)
# save_npz('datas/tfidfMatrix_test.npz', tfidfMatrix_test)

In [7]:
tfidfMatrix = load_npz('datas/tfidfMatrix.npz')             # on a deja le test set de calculé precedemment
tfidfMatrix_test = load_npz('datas/tfidfMatrix_test.npz')

# 1ere evaluation des Modèles

## Multiclass avec MultiOutputClassifier

### SGDC

In [28]:
# from sklearn.linear_model import SGDClassifier

# mdl = SGDClassifier(loss="log", max_iter=5, tol=None)
# ovr = OneVsRestClassifier(mdl)
# ovr.fit(tfidfMatrix, y_train)

# proba_train = ovr.predict_proba(tfidfMatrix)
# proba_test = ovr.predict_proba(tfidfMatrix_test)

# print("train_score = ", score_custom(y_train_index, proba_train))
# print("test_score = ", score_custom(y_test_index, proba_test))

train_score =  0.6268096311461413
test_score =  0.6057808982742091


In [130]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import SGDClassifier

# https://github.com/scikit-learn/scikit-learn/issues/10113
class Simple(SGDClassifier):
    # be sure to add the rest of the parameters if you need them
    def __init__(self, loss='log'):
        super(Simple, self).__init__(loss=loss, penalty=None, max_iter=10, tol=None)
    
    def predict_proba(self, X):
        return super(Simple, self).predict_proba(X)

# mdl = SGDClassifier(loss="log", max_iter=10, tol=None)
moc = MultiOutputClassifier(Simple())
moc.fit(tfidfMatrix, y_train)

proba_train = moc.predict_proba(tfidfMatrix)
proba_test = moc.predict_proba(tfidfMatrix_test)

train = np.delete(proba_train, 0, axis=2)[:, :, 0].T
test = np.delete(proba_test, 0, axis=2)[:, :, 0].T

print("train_score = ", score_custom(y_train_index, train))
print("train_score = ", score_custom(y_test_index, test))

train_score =  0.7933666963073261
train_score =  0.7140818951656324


### AdaBoostClassifier

In [None]:
# from sklearn.ensemble import AdaBoostClassifier

# mdl = SGDClassifier(loss="log", max_iter=5, tol=None)
# ens = AdaBoostClassifier(n_estimators=5)
# ovr = OneVsRestClassifier(ens)
# ovr.fit(tfidfMatrix, y_train)

# proba_train = ovr.predict_proba(tfidfMatrix)
# proba_test = ovr.predict_proba(tfidfMatrix_test)

# print("train_score = ", score_custom(y_train_index, proba_train))
# print("test_score = ", score_custom(y_test_index, proba_test))

Très très lent

### GradientBoostingClassifier

In [None]:
# from sklearn.ensemble import GradientBoostingClassifier

# mdl = GradientBoostingClassifier(loss="deviance", n_estimators=10, max_depth=3)
# ovr = OneVsRestClassifier(mdl)
# ovr.fit(tfidfMatrix, y_train)

# proba_train = ovr.predict_proba(tfidfMatrix)
# proba_test = ovr.predict_proba(tfidfMatrix_test)

# print("train_score = ", score_custom(y_train_index, proba_train))
# print("test_score = ", score_custom(y_test_index, proba_test))

Entraine des arbres de decision (tres tres lent)

### GaussianProcessClassifier

In [None]:
# from sklearn.gaussian_process import GaussianProcessClassifier

# mdl = GaussianProcessClassifier(multi_class="one_vs_rest", n_jobs = -1)
# ovr = OneVsRestClassifier(mdl)
# ovr.fit(tfidfMatrix, y_train)

# proba_train = ovr.predict_proba(tfidfMatrix)
# proba_test = ovr.predict_proba(tfidfMatrix_test)

# print("train_score = ", score_custom(y_train_index, proba_train))
# print("test_score = ", score_custom(y_test_index, proba_test))

Necessite des matrices non sparse => Memory Error

## Multilabel

### ExtraTreesClassifier

In [63]:
from sklearn.ensemble import ExtraTreesClassifier

mdl = ExtraTreesClassifier(n_estimators=20, max_depth=13)
mdl.fit(tfidfMatrix, y_train)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=13, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [64]:
proba_train = mdl.predict_proba(tfidfMatrix)
proba_test = mdl.predict_proba(tfidfMatrix_test)

train = np.delete(proba_train, 0, axis=2)[:, :, 0]
test = np.delete(proba_test, 0, axis=2)[:, :, 0]

print("train_score = ", score_custom(y_train_index, train.T))
print("train_score = ", score_custom(y_test_index, test.T))

train_score =  0.37547373355116703
train_score =  0.3616882838460878


### RandomForestClassifier

In [65]:
from sklearn.ensemble import RandomForestClassifier

mdl = RandomForestClassifier(n_estimators=20, max_depth=13)
mdl.fit(tfidfMatrix, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=13, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [66]:
proba_train = mdl.predict_proba(tfidfMatrix)
proba_test = mdl.predict_proba(tfidfMatrix_test)

train = np.delete(proba_train, 0, axis=2)[:, :, 0]
test = np.delete(proba_test, 0, axis=2)[:, :, 0]

print("train_score = ", score_custom(y_train_index, train.T))
print("train_score = ", score_custom(y_test_index, test.T))

train_score =  0.4712651046728819
train_score =  0.4517043627398417


### MLPClassifier

In [33]:
# from sklearn.neural_network import MLPClassifier

mdl = MLPClassifier(hidden_layer_sizes=(200, 100), early_stopping=True)
mdl.fit(tfidfMatrix, y_train)

proba_train = mdl.predict_proba(tfidfMatrix)
proba_test = mdl.predict_proba(tfidfMatrix_test)

print("train_score = ", score_custom(y_train_index, proba_train))
print("test_score = ", score_custom(y_test_index, proba_test))

train_score =  0.8205158163933511
test_score =  0.6889105656197734


<b>Attention</b>, cette precision n'est peut-être pas tres accurate car on a 3000 features (col_tfidf) en entrée et 773 en sorties (nb_classes). De ce fait, le MLP a 3000 layer puis 200 puis 100 puis 773. On perd beacoup d'information a cause des hidden layers. Cependant, au vu du resultat une évalusation sera faite avec Keras (plus rapide car sur GPU) par la suite

### KNeighborsClassifier

In [None]:
# from sklearn.neighbors import KNeighborsClassifier

# mdl = KNeighborsClassifier(n_neighbors=5, metric=JS_Divergence)
# mdl.fit(tfidfMatrix.todense(), y_train)

# proba_train = ovr.predict_proba(tfidfMatrix.todense())
# proba_test = ovr.predict_proba(tfidfMatrix_test.todense())

# print("train_score = ", score_custom(y_train_index, proba_train))
# print("test_score = ", score_custom(y_test_index, proba_test))

OOB car il faut des marices denses

### RidgeClassifierCV

In [None]:
# from sklearn.linear_model import RidgeClassifierCV

# mdl = RidgeClassifierCV()
# mdl.fit(tfidfMatrix, y_train)

# proba = mdl.predict(tfidfMatrix_test)
# print(score_custom(y_index, proba))

OOB car modele lienaire avec inversion de X*tX

## Fine tuning

In [16]:
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import ShuffleSplit
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import SGDClassifier
    

# https://github.com/scikit-learn/scikit-learn/issues/10113
class Classifier(SGDClassifier):
    def __init__(self, penalty=None, alpha=0.0001):
        super(Classifier, self).__init__(n_jobs=-1, 
                                         loss="log",
                                         penalty=penalty, 
                                         alpha=alpha, 
                                         max_iter=20,
                                         tol=1e-3)
    
    def predict_proba(self, X):
        return super(Classifier, self).predict_proba(X)

parameters_list = [
{
    "penalty": ["l1", "l2"],
    "alpha": [1e-6, 1e-5, 1e-4, 1e-3, 1e-2]
}, {
    "penalty": "none"
}
]

rs = ShuffleSplit(n_splits=3, test_size=.2, random_state=0)

for params in ParameterGrid(parameters_list):
    print(params)
    avg_train = []
    avg_test = []
    mdl = Classifier(**params)
    moc = MultiOutputClassifier(mdl)
    for train_index, test_index in rs.split(tfidfMatrix):
        X_train_cv, X_test_cv = tfidfMatrix[train_index], tfidfMatrix[test_index]
        y_train_cv, y_test_cv = y_train[train_index], y_train[test_index]
        y_indexed_train =  [y_train_index[i] for i in train_index]
        y_indexed_test =  [y_train_index[i] for i in test_index]
        
        moc.fit(X_train_cv, y_train_cv)
        
        proba_train = moc.predict_proba(X_train_cv)
        proba_test = moc.predict_proba(X_test_cv)
        
        pred_train = np.delete(proba_train, 0, axis=2)[:, :, 0].T
        pred_test = np.delete(proba_test, 0, axis=2)[:, :, 0].T

        score_train = score_custom(y_indexed_train, pred_train)
        score_test  = score_custom(y_indexed_test, pred_test)

        avg_train.append(score_train)
        avg_test.append(score_test)
    print("Avg train :", sum(avg_train)/len(avg_train))
    print("Avg test :", sum(avg_test)/len(avg_test))

#     score_train, score_test = custom_predict(moc.predict_proba(tfidfMatrix_test), None, y_test, None)
#     print("validation_score = ", score_train, "\n")

{'alpha': 1e-06, 'penalty': 'l1'}
Avg train : 0.9877822296899462
Avg test : 0.6494928085653927
{'alpha': 1e-06, 'penalty': 'l2'}
Avg train : 0.9816750678557588
Avg test : 0.6803481986949774
{'alpha': 1e-05, 'penalty': 'l1'}
Avg train : 0.8290205793230494
Avg test : 0.731448166528819
{'alpha': 1e-05, 'penalty': 'l2'}
Avg train : 0.8770924705240523
Avg test : 0.7281930658946861
{'alpha': 0.0001, 'penalty': 'l1'}
Avg train : 0.6656441629687199
Avg test : 0.6605757742854554
{'alpha': 0.0001, 'penalty': 'l2'}
Avg train : 0.6378524550141541
Avg test : 0.6067755720981541
{'alpha': 0.001, 'penalty': 'l1'}
Avg train : 0.3847140005457325
Avg test : 0.38721165334068885
{'alpha': 0.001, 'penalty': 'l2'}
Avg train : 0.38658265477576964
Avg test : 0.3845918343902252
{'alpha': 0.01, 'penalty': 'l1'}
Avg train : 0.18403285798398925
Avg test : 0.1868457632570514
{'alpha': 0.01, 'penalty': 'l2'}
Avg train : 0.2618663564689171
Avg test : 0.2624753009833658


TypeError: 'NoneType' object is not iterable

In [17]:
mdl = Classifier(penalty="l1", alpha=1e-4)
moc = MultiOutputClassifier(mdl)

moc.fit(tfidfMatrix, y_train)

proba_train = moc.predict_proba(tfidfMatrix)
proba_test = moc.predict_proba(tfidfMatrix_test)

train = np.delete(proba_train, 0, axis=2)[:, :, 0].T
test = np.delete(proba_test, 0, axis=2)[:, :, 0].T

print("train_score = ", score_custom(y_train_index, train))
print("train_score = ", score_custom(y_test_index, test))

train_score =  0.6672873972055308
train_score =  0.6618855182763589


In [18]:
joblib.dump(moc, "models/MOC")

['models/MOC']

Les régularisation diminue le score mais rapproche le train et test set au niveau des resultats. Au final la regualrisation permet de mieux généraliser mais avec de moins bons resultats. Le nombre d'iteration n'aide pas particulierement a partir de 10 itérations. On va garder pour l'API le modèle linéaire sans régularisation car le test set reste plus haut.

### Results

In [33]:
index_eval = 0

elem = y_test_clean[index_eval]
y_pred = moc.predict_proba(tfidfMatrix_test[index_eval])

y_pred = np.delete(y_pred, 0, axis=2)[:, :, 0].T
classes = np.argsort(y_pred, axis=1)[:, -5:].tolist()[0][::-1]
mlb = joblib.load("models/MultiLabelBinarizer")
classname = mlb.classes_
tags = [classname[classnum] for classnum in classes]

print("Les tags associés au post choisis sont : ")
for tag in elem:
    print("\t", tag)
print("\nLa prédiction du SGDC est :")
for index, tag in enumerate(tags):
    print("\t{} ({:.2f})%".format(tag, 100*y_pred[0][classes[index]]))

Les tags associés au post choisis sont : 
	 python
	 iterator
	 iteration

La prédiction du SGDC est :
	python (45.54)%
	c++ (21.22)%
	c (3.86)%
	java (3.71)%
	.net (2.32)%


### Analyse poids

In [35]:
import nltk

stemmer = nltk.stem.PorterStemmer()
tokenizer = nltk.RegexpTokenizer(r'(\w+|\d+)')

def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    tokens = tokenizer.tokenize(text)
    stems = stem_tokens(tokens, stemmer)
    return stems

tfidf = joblib.load("models/TfidfVectorizer")

vocab = []
for i in range(len(tfidf.vocabulary_)):
    for key, pos in tfidf.vocabulary_.items():
        if pos == i:
            vocab.append(key)
            continue

In [43]:
mlb = joblib.load("models/MultiLabelBinarizer")
classname = mlb.classes_
for i, c in enumerate(classname):
    print(i, c)

0 .htaccess
1 .net
2 .net-2.0
3 .net-3.5
4 .net-4.0
5 3d
6 64bit
7 abstract-class
8 accessibility
9 actionscript-3
10 active-directory
11 activerecord
12 ado.net
13 agile
14 air
15 ajax
16 algorithm
17 amazon-ec2
18 amazon-s3
19 amazon-web-services
20 android
21 android-actionbar
22 android-activity
23 android-emulator
24 android-fragments
25 android-gradle
26 android-intent
27 android-layout
28 android-listview
29 android-ndk
30 android-studio
31 android-viewpager
32 angular
33 angularjs
34 angularjs-directive
35 animation
36 annotations
37 ant
38 apache
39 apache-spark
40 api
41 app-config
42 architecture
43 arraylist
44 arrays
45 artificial-intelligence
46 asp-classic
47 asp.net
48 asp.net-ajax
49 asp.net-core
50 asp.net-mvc
51 asp.net-mvc-3
52 asp.net-mvc-4
53 asp.net-web-api
54 assemblies
55 assembly
56 async-await
57 asynchronous
58 attributes
59 audio
60 authentication
61 autocomplete
62 automated-tests
63 automatic-ref-counting
64 automation
65 awk
66 azure
67 backbone.js
68 ba

In [36]:
for i in [423, 506, 176, 16, 281, 285, 259, 547, 548, 549]:
    classifieur = moc.estimators_[i]
    index = np.argsort(classifieur.coef_, axis=1)[:, -5:].tolist()[0][::-1]
    print(classname[i] , [vocab[i] for i in index])

machine-learning ['zoom', 'fatal', 'feedback', 'feed', 'featur']
pandas ['panda', 'datafram', 'seri', 'feedback', 'feed']
dataset ['dataset', 'zoom', 'feel', 'feed', 'featur']
algorithm ['algorithm', 'n', 'number', 'tree', 'given']
geolocation ['citi', 'zoom', 'fatal', 'feedback', 'feed']
git ['git', 'commit', 'branch', 'repositori', 'repo']
flexbox ['zoom', 'fault', 'feel', 'feedback', 'feed']
python ['python', 'numpi', 'panda', 'django', 'matplotlib']
python-2.7 ['python', 'zoom', 'feed', 'feasibl', 'fb']
python-3.x ['python', '3', 'zoom', 'featur', 'feasibl']


## MLP

In [6]:
y_train_clean = load_obj("datas/y_train")
y_test_clean = load_obj("datas/y_test")

mlb = joblib.load("models/MultiLabelBinarizer")
y_train = mlb.transform(y_train_clean)
y_test = mlb.transform(y_test_clean)

y_test_index = to_index(y_test)
y_train_index = to_index(y_train)

tfidfMatrix = load_npz('datas/tfidfMatrix.npz')             # on a deja le test set de calculé precedemment
tfidfMatrix_test = load_npz('datas/tfidfMatrix_test.npz')

In [7]:
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.callbacks import EarlyStopping
from keras.models import load_model
from keras import regularizers

In [10]:
# https://stackoverflow.com/questions/44495698/keras-difference-between-kernel-and-activity-regularizers

nb_input = tfidfMatrix_test.shape[1]
nb_output = y_test.shape[1]

model = Sequential()
model.add(Dense(1500, input_shape=(nb_input,), kernel_regularizer=regularizers.l1(0.0000005)))  # kernel_regularizer=regularizers.l2(0.01)  https://keras.io/regularizers/
model.add(Dense(nb_output, activation="sigmoid"))

model.compile(optimizer='Nadam',
              loss='binary_crossentropy',
              metrics=["top_k_categorical_accuracy"])  # top 5 par defaut

early = EarlyStopping(monitor='val_loss', min_delta=0, patience=1, verbose=0, mode='auto')

In [11]:
model.fit(x=tfidfMatrix.todense(), y=y_train, batch_size=100, epochs=30, validation_split=0.2, callbacks=[early])

Train on 38685 samples, validate on 9672 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30


<keras.callbacks.History at 0x21cb3b93f98>

In [12]:
model.save('models/my_model.h5')

In [8]:
from keras.models import load_model

model = load_model('models/my_model.h5')
proba_train = model.predict(tfidfMatrix.todense(), batch_size=1000)
proba_test = model.predict(tfidfMatrix_test.todense(), batch_size=1000)

In [9]:
print("train_score = ", score_custom(y_train_index, proba_train))
print("test_score = ", score_custom(y_test_index, proba_test), "\n")

train_score =  0.6713774634489525
test_score =  0.6457433808554145 



Sans résularisation on a aussi de l'overfitting malgré un Early Stop (85% train vs 72 % test). Par contre avec regularisation, on atteint des résultat meilleur (notament avec l2 = 5e-5). Avec une l1 regularisation a 5e-6, on arrive environ aux perfs du SGDC mais avec un tout petit peu d'overfitting (67% train vs 64.5% test) 

### Résultats

In [15]:
index_eval = 0

elem = y_test_clean[index_eval]
y_pred = model.predict(tfidfMatrix_test[index_eval].todense())
classes = np.argsort(y_pred, axis=1)[:, -5:].tolist()[0][::-1]
mlb = joblib.load("models/MultiLabelBinarizer")
classname = mlb.classes_
tags = [classname[classnum] for classnum in classes]

print("Les tags associés au post choisis sont : ")
for tag in elem:
    print("\t{}".format(tag))
print("\nLa prédiction du MLPClassifier est :")
for index, tag in enumerate(tags):
    print("\t{} ({:.2f})%".format(tag, y_pred[0][classes[index]]))

Les tags associés au post choisis sont : 
	python
	iterator
	iteration

La prédiction du SGDC est :
	python (0.72)%
	c++ (0.24)%
	performance (0.11)%
	optimization (0.02)%
	.net (0.02)%
