# PART 3 - Machine Learning models
#### Dans cette partie, nous allons diviser nos données, créer des features (OneHot, TF-IDF) et construire nos modèles (Baselines et modèles améliorés)

In [1]:
import pandas as pd 
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np
import spacy
from urllib.parse import urlparse
import re
import regex
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer 
# nltk.download('stopwords')
from nltk.stem.snowball import SnowballStemmer
from unidecode import unidecode
from nltk.corpus import stopwords
from wordcloud import WordCloud,STOPWORDS
from sklearn.compose import ColumnTransformer
import fastparquet

In [2]:
data = Path("../data/").glob("*.parquet") 
data = list(data)

[print(parquet.name) for parquet in data]

part-00000-1b8fcd71-6348-4510-a9dc-bdd7dcf82f2d-c000.snappy.parquet
part-00001-1b8fcd71-6348-4510-a9dc-bdd7dcf82f2d-c000.snappy.parquet
part-00002-1b8fcd71-6348-4510-a9dc-bdd7dcf82f2d-c000.snappy.parquet
part-00003-1b8fcd71-6348-4510-a9dc-bdd7dcf82f2d-c000.snappy.parquet
part-00004-1b8fcd71-6348-4510-a9dc-bdd7dcf82f2d-c000.snappy.parquet


[None, None, None, None, None]

In [3]:
df = pd.concat((pd.read_parquet(parquet, engine='fastparquet') for parquet in data))
pd.set_option("max_colwidth", None)


In [4]:
df = df.reset_index(drop=True)
df 

Unnamed: 0,url,target,day
0,https://www.cdiscount.com/bricolage/electricite/batterie-plomb-6v-4ah-ova51023e-pour-toplux/f-16614-ova2009927775303.html,"[1831, 1751, 1192, 745, 1703]",4
1,https://www.mystalk.net/profile/vitoriafcorrea,"[847, 978, 582, 1381, 529]",4
2,https://www.lequipe.fr/Tennis/TennisFicheJoueur1500000000003017.html,"[20, 1077, 294]",4
3,http://m.jeuxvideo.com/forums/42-32625-60180057-1-0-1-0-la-guilde-fourmi-legionnaire-recrute.htm,"[381, 935, 1343, 622, 933]",4
4,https://context.reverso.net/traduction/espagnol-francais/Para+ir,"[692, 1265, 725, 1264, 1266]",4
...,...,...,...
67590,https://www.jeu-concours.biz/gagner-cafetiere-expresso.html,"[1276, 65, 1113]",1
67591,https://www.sto.cx/book-186042-471.html,"[608, 617, 1033, 220, 1021]",1
67592,http://jeu.info/solution/4-images-1-mot-niveau-2023-a-2060.html,"[381, 925, 622, 1494, 937]",16
67593,https://grossesse.aufeminin.com/forum/levres-gonflees-et-accouchement-fd4242229,"[638, 253, 419, 558, 401]",1


In [5]:
def url_parse(url):
    parse_result = urlparse(url)
    result = [parse_result.scheme, parse_result.netloc, parse_result.path, parse_result.params, parse_result.query, parse_result.fragment]
    return result

df_parsed = pd.concat([df, 
                       pd.DataFrame(list(map(url_parse, df.url)),
                    columns= ['scheme','netloc','path','params','quer','fragment'],
                   index=df.url.index) 
                       ], axis=1)
df_parsed.drop(['params', 'quer', 'fragment'], axis=1, inplace=True)

In [6]:
class PathTokenizer():
    """ A simple class to tokenize the URL with a various combination of functions 
        """
    
    def __init__(self):
        self.stemmer = SnowballStemmer(language='french')
        self.stopwords = [unidecode(x) for x in stopwords.words('french')]
        self.special_words = ['htm', 'php', 'aspx', 'html']

    def _clean_text(self, text:str):
        """ 
        remove the symbols from the a url  
        """
        if isinstance(text, str):
            regex = '(\d+|[A-Z][a-z]*)|[+;,\s.!:\'/_%#&$@?~*]|-'
            t = list(filter(None, re.split(regex, text)))
            return t
        else:
            raise TypeError("text must be list")
    
    def _lowercase_text(self, tokens: list):
        if isinstance(tokens, list):
            return [t.lower() for t in tokens]
        else:
            raise TypeError("text must be list")
    
    def _remove_stopwords(self, tokens:list):
        if isinstance(tokens, list):
            return [t for t in tokens if t not in self.stopwords]
        else:
            raise TypeError("tokens must be a list")
            
    def _remove_single(self, tokens: list):
        "remove single elements from list "
        if isinstance(tokens, list):
            return [t for t in tokens if len(t)>1]
        else:
            raise TypeError("tokens must be a list")
            
    def _remove_specials(self, tokens:list):
        if isinstance(tokens, list):
            return [t for t in tokens if t not in self.special_words]
        else:
            raise TypeError("tokens must be a list")
    
    def _remove_numbers(self, tokens:list):
        if isinstance(tokens, list):
            # return [x for x in text if not any(x1.isdigit() for x1 in x)]
            return [t for t in tokens if not t.isdigit()]
        else:
            raise TypeError("tokens must be a list") 
            
    def _stem_text(self, tokens:list):
        if isinstance(tokens, list):        
            return [self.stemmer.stem(token) for token in tokens]
        else:
            raise TypeError("tokens must be a list")        
        
    def _tokenize_text(self, text:str):
        return word_tokenize(text, language='french')
        
    def _join_words(self, text:list):
        """ build a sentence from a list of words and separates them with a sapce"""
        return " ".join(text)
    
    def _split_words(self, text:str):
        return text.split(' ')
    
    def clean_df(self, df_column, funcs_list):
        "Apply multiple functions on a column of a dataframe"
        for func in funcs_list:
            df_column = df_column.apply(func)
        return df_column

In [7]:
def split_netloc(netloc:str):
    splited_netloc = netloc.rsplit('.', 2)
    if len(splited_netloc) == 2:
        splited_netloc.insert(0, "www")
    return splited_netloc

df_parsed_2 = pd.concat([df_parsed, 
                       pd.DataFrame(list(map(split_netloc, df_parsed.netloc)),
                    columns= ['sous_domaine','domaine','top_domaine'],
                   index=df_parsed.netloc.index) 
                       ], axis=1)

path_tokenizer = PathTokenizer()

funcs = [path_tokenizer._clean_text, path_tokenizer._remove_numbers, path_tokenizer._remove_single, path_tokenizer._stem_text,
        path_tokenizer._lowercase_text, path_tokenizer._remove_specials, path_tokenizer._remove_stopwords ]

df_parsed_2['tokens_path'] = path_tokenizer.clean_df(df_parsed_2.path, funcs)

df_cleaned = df_parsed_2.drop(['url', 'path', 'scheme', 'netloc'], axis=1)

mlb = MultiLabelBinarizer()
targets_encoded = pd.DataFrame(mlb.fit_transform(df_cleaned.target),
                   columns=mlb.classes_,
                   index=df_cleaned.target.index)
df_cleaned_2 = pd.concat([df_cleaned, targets_encoded], axis=1)
df_cleaned_3 = df_cleaned_2.copy(deep=True).drop(columns=['target'])
df_cleaned_3["tokens_path"] = df_cleaned_3.tokens_path.apply(path_tokenizer._join_words)

In [8]:
df_cleaned_3.head(5)

Unnamed: 0,day,sous_domaine,domaine,top_domaine,tokens_path,100,1000,1001,1002,1003,...,990,991,992,993,994,995,996,997,998,999
0,4,www,cdiscount,com,bricolag electricit batter plomb ah ova toplux ova,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,4,www,mystalk,net,profil vitoriafcorr,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4,www,lequipe,fr,ten ten fich joueur,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,m,jeuxvideo,com,forum guild fourm legionnair recrut,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,context,reverso,net,traduct espagnol franc ir,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Trois approches peuvent etre prises pour la colonne sous_domaine:
- La prétraiter (enlever les www et les lettres, tokenizer...) et l'ajouter à la colonne path pour former une description
- L'enlever 
- La prétraiter afin de construire une feature catégorielle.
- Construire directement une feature catégorielle sans prétraitement

Pour l'instant nous optons pour la 2eme approche

In [41]:
from sklearn.model_selection import train_test_split


y = df_cleaned_3.iloc[:, 5:]
x = df_cleaned_3.iloc[:, : 5].drop(['sous_domaine'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

print(X_train.shape)
print(X_test.shape)
X_train

(50696, 4)
(16899, 4)


Unnamed: 0,day,domaine,top_domaine,tokens_path
9231,1,rakuten,com,victor gadoury monnai francais
17680,7,programme-tv,net,new tv lemiss terr quand elis lucet fait trembl respons supermarch
17283,4,linternaute,fr,musiqu biograph benjamin britten biograph court dat citat
64034,18,jeuxvideo,com,forum avis locat objet
27471,9,allocine,fr,film fichefilm critiqu spectateur recent
...,...,...,...,...
20276,12,automobile,fr,voitur ford fiest trend dr pk nav vhc car pgn pg srt dat sro desc frn dmg tru pg vipcar
63782,13,lafaimdesbananes,com,tourt poulet
49492,16,lejdc,fr,pougu eau actualit franco fil recompens employ
42015,6,lachainemeteo,com,meteo franc vill previs meteo chaumont mard


## Réduction de la dimension des labels
Nous avons 1903 labels pour notre target ce qui reprensente un nombre assez elevé pour de la classification multi-label.
Nous allons donc effectuer une réduction sur l'espace de dimension des labels.
Plusieurs algorithmes existent pour la réduction de l'espace des labels: compressed sensing (CS) et la  Principal Label Space Transformation (PLST) qui est l'équivalente du PCA sur les features. Egalement des techniques d'embeddings entre targets
Contrairement à ces techniques qui sont indépendentes de nos entrées, il en existe aussi d'autres qui le sont comme par exemple la CPLST ( Conditional Principal Label Space Transformation ) 

Un autre moyen serait d'essayer d'utiliser une approches par embedding sur les labels afin d'extraire les labels qui sont souvent ensemble.


Pour la classification multi-label, il existe une librarie nommée `scikit-multilearn` qui s'appuie sur la librarie scikit-learn. Elle contient aussi un wrapper autour de  MEKA qui propose une implémentation des méthodes d'apprentissage et d'évaluation multi-labels.

## Construction des features à partir du path
Différentes approches peuvent etre utilisées sur la colonne path:
#### L'utilisation de TF-IDF 
- TF-IDF: refléte l'importance d'un mot pour un document dans une collection (corpus) mais ne prend pas en compte le sens sémantique des mots. TF signifie la probabilité d'occurrence d'un mot dans une phrase.
- TF-IDF donne plus d'importance aux mots qui apparaissent moins fréquemment dans l'ensemble du corpus et donne également de l'importance aux mots les plus fréquents qui apparaissent dans chaque donnée.

Nous testerons les featurizations suivantes:
- TF-IDF: unigrams, bigrams, trigrams et word n-grams.
- TF-IDF based character: unigrams, bigrams, trigrams. (considérer une séquence de caractères plutôt qu'une séquence de mots)


#### L'utilisation des Embeddings (word2vec)
- Word2vec est l'un des modèle de l'état de l'art pour les embeddings.  Il permet de convertir du texte en vecteurs numériques tout en préservantles relations sémantiques entre les mots.
- vecteur de 300 dimensions 

#### L'utilisation d'une combinaison de TF-IDF et moyenne des Embeddings:
Nombiner les n-grammes de caractères avec les vecteurs obtenus avec Word2Vec

In [10]:
labels = df.explode('target')['target'].unique()
labels
len_labels = len(labels)

In [10]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import (KNeighborsClassifier,
                               NeighborhoodComponentsAnalysis)
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [11]:
from sklearn.base import BaseEstimator,TransformerMixin
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import OneHotEncoder


categorical_features = ["day","domaine","top_domaine"]

tfidf = TfidfVectorizer(min_df=0)
vectorizer = TfidfVectorizer(min_df=0.00009, smooth_idf=True, norm="l2", tokenizer = lambda x: x.split(" "), sublinear_tf=False, ngram_range=(1,1))

transformer = ColumnTransformer([ ('categorical', OneHotEncoder(sparse=False, handle_unknown = "ignore"), categorical_features),
                            ("vectorizer", vectorizer, "tokens_path"),
                         ], remainder="passthrough")

X_train_multilabel = transformer.fit_transform(X_train)
X_test_multilabel = transformer.transform(X_test)
y_train_multilabel = y_train 
y_test_multilabel = y_test 

### Création des modèles :
#### Baseslines: 
- Dans un premier temps, nous allons construire nos modèles Baselines avec l'ensemble des targets présentes dans notre dataset. Chaque modèle essaiera de prédire les targets sur soit le nombre total de 1009 targets ( Ce qui n'est peut etre pas une bonne approche), ou sur la réduction de dimension des 1009 targets. 
- Après avoir construit les baslines models, nous essaierons d'améliorer les modèles avec des hyperparamètres pour voir s'il y a une augmentation du score F1.


#### Vu le peu de temps que j'ai en soirée après la journée au stage, j'ai préféré au lieu de me focaliser sur l'amélioration d'une seule approche, me documenter sur d'autres approches parallelement à l'entrainement des modèles et essayer de les implementer.

### Multi-label classification
#### 1.  OneVsRest: 
Le problème est décomposé en un problème de classification binaire multiple. Nous choisissons une classe et formons un classificateur binaire avec les échantillons de la classe sélectionnée d'un côté et tous les autres échantillons de l'autre côté. Ainsi, nous obtiendrons N classificateurs pour N étiquettes et lors du test nous classerons simplement l'échantillon comme appartenant à la classe avec le score maximum parmi les N classificateurs.

In [None]:
classifier1 = OneVsRestClassifier(LogisticRegression(penalty='l1', solver='liblinear', class_weight="balanced"), n_jobs=-1)
classifier1.fit(X_train_multilabel, y_train_multilabel)
predictions = classifier1.predict(X_test_multilabel)

In [37]:
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score, f1_score

print("Accuracy :",metrics.accuracy_score(y_test_multilabel, predictions))
print("Hamming loss ",metrics.hamming_loss(y_test_multilabel,predictions))
precision = precision_score(y_test_multilabel, predictions, average='micro')
recall = recall_score(y_test_multilabel, predictions, average='micro')
f1 = f1_score(y_test_multilabel, predictions, average='micro')
 
print("\nMicro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))
precision = precision_score(y_test_multilabel, predictions, average='macro')
recall = recall_score(y_test_multilabel, predictions, average='macro')
f1 = f1_score(y_test_multilabel, predictions, average='macro')
 
print("\nMacro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))
print("\nClassification Report")
print (metrics.classification_report(y_test_multilabel, predictions))

Accuracy : 0.007278537191549796
Hamming loss  0.0068070643314176215

Micro-average quality numbers
Precision: 0.2285, Recall: 0.7251, F1-measure: 0.3475

Macro-average quality numbers
Precision: 0.1337, Recall: 0.3929, F1-measure: 0.1884

Classification Report
              precision    recall  f1-score   support

           0       0.05      0.35      0.08        23
           1       0.00      0.00      0.00         5
           2       0.00      0.00      0.00         1
           3       0.10      0.20      0.13         5
           4       0.20      0.70      0.31        43
           5       0.09      0.33      0.14        18
           6       0.00      0.00      0.00         0
           7       0.07      0.23      0.10        22
           8       0.06      0.15      0.09        13
           9       0.03      0.25      0.06         4
          10       0.33      0.20      0.25         5
          11       0.12      0.54      0.19        52
          12       0.13      0.36   

#### Prendre les X labels les plus occurrents

In [91]:
categorical_features = ["day","domaine","top_domaine"]

tfidf = TfidfVectorizer(min_df=0)
vectorizer = TfidfVectorizer(min_df=0.00009, smooth_idf=True, norm="l2", tokenizer = lambda x: x.split(" "), sublinear_tf=False, ngram_range=(1,1))

transformer = ColumnTransformer([ ('categorical', OneHotEncoder(sparse=False, handle_unknown = "ignore"), categorical_features),
                            ("vectorizer", vectorizer, "tokens_path"),
                         ], remainder="passthrough")

X_train_multilabel = transformer.fit_transform(X_train)
X_test_multilabel = transformer.transform(X_test)
y_train_multilabel = y_train 
y_test_multilabel = y_test 

In [92]:
y_test_inversed = mlb.inverse_transform(np.array(y_test))
y_test_inversed = [" ".join(list(ele)) for ele in y_test_inversed]

y_train_inversed = mlb.inverse_transform(np.array(y_train))
y_train_inversed = [" ".join(list(ele)) for ele in y_train_inversed]

y_train_inversed 

['1686 18 531 61 814',
 '1048 112 1781 211 358',
 '433 565 586 690 870',
 '1045 1311 1343 41 925',
 '1094 1095 1096 1097 1107',
 '1236 244 325 536 648',
 '1343 191 546 694',
 '1292 1602 1870 1872 816',
 '408 5527 572',
 '107 1080 1710 29 96',
 '1192 30 309 310 745',
 '1259 182 184 358 507',
 '251 419 557 635 943',
 '1193 1222 184 34 507',
 '105 401',
 '371 372 377 5186 799',
 '1264 1265 1266 666 708',
 '1143 210 22 355 531',
 '1155 1259 184 358 507',
 '250 5187 572 634 947',
 '1532 1556 1557 907 914',
 '1155 1259 184 358 507',
 '1225 1505 566 68 98',
 '1164 16 572 5887 784',
 '1311 1343 609 622 925',
 '1485 433 692 702',
 '1119 1254 5689 63',
 '120 1348 324 907 951',
 '1071 1171 1192 1277 1533',
 '1687 1690 1693 1694 852',
 '1048 1193 1781 22 608',
 '1494 937',
 '1311 1343 353 41 925',
 '1726 1728 269 748 950',
 '1133 1134 1135 1260 690',
 '1143 1145 1146 210 211',
 '1086 1116 184 34 358',
 '1071 1171 1192 1277 1533',
 '1094 1095 1096 1097 1107',
 '1043 1046 381 622 925',
 '1077 1253 1

#### 300 Labels

In [93]:
labels_vectorizer = CountVectorizer(binary='true', max_features=300).fit(y_train_inversed)


In [94]:
y_train_5labels = labels_vectorizer.transform(y_train_inversed)

In [104]:
y_test_5labels = labels_vectorizer.transform(y_test_inversed)

In [122]:
def display_score(y_test_5labels, predictions):
    print("Accuracy :",metrics.accuracy_score(y_test_5labels, predictions))
    print("Hamming loss ",metrics.hamming_loss(y_test_5labels,predictions))
    precision = precision_score(y_test_5labels, predictions, average='micro')
    recall = recall_score(y_test_5labels, predictions, average='micro')
    f1 = f1_score(y_test_5labels, predictions, average='micro')

    print("\nMicro-average quality numbers")
    print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))
    precision = precision_score(y_test_5labels, predictions, average='macro')
    recall = recall_score(y_test_5labels, predictions, average='macro')
    f1 = f1_score(y_test_5labels, predictions, average='macro')

    print("\nMacro-average quality numbers")
    print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))
    print("\nClassification Report")
    print (metrics.classification_report(y_test_5labels, predictions))

In [97]:
labels_vectorizer.inverse_transform(y_train_5labels)

[array(['1686', '18', '531', '61'], dtype='<U4'),
 array(['1048', '1781', '211', '358'], dtype='<U4'),
 array(['433', '690', '870'], dtype='<U4'),
 array(['1311', '1343', '41', '925'], dtype='<U4'),
 array(['1094', '1095', '1096', '1097', '1107'], dtype='<U4'),
 array(['244', '325'], dtype='<U4'),
 array(['1343', '191', '694'], dtype='<U4'),
 array(['1292', '1602', '816'], dtype='<U4'),
 array(['408', '5527', '572'], dtype='<U4'),
 array(['107', '1080', '1710', '96'], dtype='<U4'),
 array(['1192', '30', '309', '310'], dtype='<U4'),
 array(['1259', '182', '184', '358', '507'], dtype='<U4'),
 array(['419'], dtype='<U4'),
 array(['1193', '1222', '184', '34', '507'], dtype='<U4'),
 array(['105', '401'], dtype='<U4'),
 array(['371', '377', '5186'], dtype='<U4'),
 array(['1264', '1265', '1266'], dtype='<U4'),
 array(['1143', '210', '22', '531'], dtype='<U4'),
 array(['1155', '1259', '184', '358', '507'], dtype='<U4'),
 array(['572'], dtype='<U4'),
 array(['1556', '907', '914'], dtype='<U4'),

In [None]:
# comparer les résultats avec y_train et y_train_max_features

In [None]:
classifier1 = OneVsRestClassifier(LogisticRegression(penalty='l1', solver='liblinear', class_weight="balanced"), n_jobs=-1)
classifier1.fit(X_train_multilabel, y_train_5labels)


In [106]:
predictions = classifier1.predict(X_test_multilabel) 

In [108]:
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score, f1_score

print("Accuracy :",metrics.accuracy_score(y_test_5labels, predictions))
print("Hamming loss ",metrics.hamming_loss(y_test_5labels,predictions))
precision = precision_score(y_test_5labels, predictions, average='micro')
recall = recall_score(y_test_5labels, predictions, average='micro')
f1 = f1_score(y_test_5labels, predictions, average='micro')
 
print("\nMicro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))
precision = precision_score(y_test_5labels, predictions, average='macro')
recall = recall_score(y_test_5labels, predictions, average='macro')
f1 = f1_score(y_test_5labels, predictions, average='macro')
 
print("\nMacro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))
print("\nClassification Report")
print (metrics.classification_report(y_test_5labels, predictions))

Accuracy : 0.03213207882123203
Hamming loss  0.022857762786752667

Micro-average quality numbers
Precision: 0.3017, Recall: 0.8126, F1-measure: 0.4400

Macro-average quality numbers
Precision: 0.2855, Recall: 0.7531, F1-measure: 0.3999

Classification Report
              precision    recall  f1-score   support

           0       0.30      0.79      0.44        78
           1       0.26      0.76      0.38       226
           2       0.52      0.87      0.65        89
           3       0.20      0.76      0.31        85
           4       0.21      0.59      0.31       126
           5       0.22      0.71      0.33        70
           6       0.10      0.57      0.17        93
           7       0.15      0.60      0.24       180
           8       0.17      0.64      0.26       152
           9       0.25      0.75      0.37       169
          10       0.23      0.91      0.36       163
          11       0.26      0.80      0.40        85
          12       0.34      0.81     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
#### Compare predictions et le vrai y_train

#### 30 features

In [109]:
### 30 features
labels_vectorizer = CountVectorizer(binary='true', max_features=30).fit(y_train_inversed)
y_train_5labels = labels_vectorizer.transform(y_train_inversed)
y_test_5labels = labels_vectorizer.transform(y_test_inversed)

classifier1 = OneVsRestClassifier(LogisticRegression(penalty='l1', solver='liblinear', class_weight="balanced"), n_jobs=-1)
classifier1.fit(X_train_multilabel, y_train_5labels)
predictions = classifier1.predict(X_test_multilabel) 

In [124]:
predictions_train = classifier1.predict(X_train_multilabel)
display_score(y_train_5labels, predictions_train)

Accuracy : 0.5401609594445321
Hamming loss  0.027827310504444796

Micro-average quality numbers
Precision: 0.5580, Recall: 0.9957, F1-measure: 0.7152

Macro-average quality numbers
Precision: 0.5845, Recall: 0.9955, F1-measure: 0.7225

Classification Report
              precision    recall  f1-score   support

           0       0.50      1.00      0.67      1666
           1       0.59      1.00      0.74      1281
           2       0.84      1.00      0.91      1494
           3       0.45      0.99      0.62      1312
           4       0.62      0.99      0.77      2205
           5       0.93      1.00      0.96      1697
           6       0.79      1.00      0.88      2727
           7       0.50      1.00      0.66      1369
           8       0.52      1.00      0.69      1522
           9       0.75      1.00      0.86      2651
          10       0.53      0.99      0.69      1375
          11       0.62      1.00      0.76      1913
          12       0.62      0.99      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [123]:
display_score(y_test_5labels, predictions)

Accuracy : 0.480679330137878
Hamming loss  0.03218928141704638

Micro-average quality numbers
Precision: 0.5253, Recall: 0.9113, F1-measure: 0.6664

Macro-average quality numbers
Precision: 0.5517, Recall: 0.9041, F1-measure: 0.6723

Classification Report
              precision    recall  f1-score   support

           0       0.45      0.92      0.60       556
           1       0.53      0.89      0.67       428
           2       0.86      0.97      0.91       510
           3       0.39      0.85      0.54       409
           4       0.61      0.90      0.73       750
           5       0.95      0.97      0.96       563
           6       0.79      0.99      0.88       955
           7       0.47      0.93      0.63       433
           8       0.49      0.91      0.64       469
           9       0.74      0.99      0.85       916
          10       0.44      0.86      0.58       443
          11       0.58      0.95      0.73       618
          12       0.58      0.93      0.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### Le modèle arrive a obtenir un score F1 de 0.67 sur 30 labels, et 0.44 avec 300 labels, cependant cette approche présente quelques inconvenients. Nous pouvons maintenant, au lieu de prédire sur les N tops labels:
- Faire un clustering sur les labels avec un nombre de clusters= 30, voir 100 par exemple et rassembler les labels qui sont proches
- Faire un clustering sur une les N top labels (300 par exemple) avec n clusters = 30

#### Clustering des labels

In [138]:
y_train.shape 

(50696, 1903)

In [140]:
from sklearn.cluster import KMeans
sse = []
k_list = [30, 100, 300]
for k in k_list:
    km = KMeans(n_clusters=k)
    km.fit(y_train)
    sse.append([k, km.inertia_])


KeyboardInterrupt: 

In [None]:
    
oca_results_scale = pd.DataFrame({'Cluster': k_list, 'SSE': sse})
plt.figure(figsize=(12,6))
plt.plot(pd.DataFrame(sse)[0], pd.DataFrame(sse)[1], marker='o')
plt.title('Optimal Number of Clusters using Elbow Method (Scaled Data)')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')

In [117]:
import numpy as np
from sklearn import datasets, cluster

agglo = cluster.FeatureAgglomeration(n_clusters=30)
agglo.fit(y_train)


FeatureAgglomeration(n_clusters=30)

In [119]:
y_train_agglo = agglo.transform(y_train)
y_test_agglo = agglo.transform(y_test)
y_train_agglo.shape 

(50696, 30)

In [129]:
np.unique(y_train_agglo[:, 5], return_counts=True)

(array([0. , 0.2, 0.4, 0.6, 0.8, 1. ]),
 array([49121,  1284,    73,    25,    32,   161], dtype=int64))

In [None]:
classifier1 = OneVsRestClassifier(LogisticRegression(penalty='l1', solver='liblinear', class_weight="balanced"), n_jobs=-1)
classifier1.fit(X_train+_multilabel, y_train_5labels)
predictions = classifier1.predict(y_test_agglo) 

In [141]:
### pca

import numpy as np
from sklearn.decomposition import PCA

pca = PCA(n_components=300)
pca.fit(y_train)

print(pca.explained_variance_ratio_)

print(pca.singular_values_)
x = pca.transform(y_train)
x

[0.04023017 0.02378634 0.0214826  0.01987679 0.01907776 0.01691163
 0.01512947 0.01429146 0.01341969 0.01299743 0.01276402 0.01260639
 0.01229661 0.00986634 0.00925893 0.00892538 0.00807739 0.00747416
 0.00683351 0.00665718 0.00658867 0.00633852 0.00603958 0.0056604
 0.00547554 0.00530705 0.00529029 0.00526304 0.00512818 0.00498063
 0.00489969 0.00486983 0.00479707 0.004758   0.00462715 0.00451874
 0.00444162 0.00434504 0.00427258 0.00423812 0.00409509 0.0040688
 0.00403352 0.00379453 0.00378066 0.00371746 0.00368894 0.00359392
 0.00349911 0.00342886 0.00341173 0.00336716 0.0033303  0.00323817
 0.0031973  0.00315694 0.00311608 0.00307391 0.00302119 0.00299054
 0.00293676 0.00291525 0.00290566 0.00285111 0.00281001 0.00277336
 0.00273424 0.00268316 0.00267709 0.00261533 0.00260371 0.00257965
 0.00256445 0.00254102 0.00250735 0.00248395 0.00247005 0.00244284
 0.00243602 0.00241528 0.00240609 0.00237899 0.00236474 0.00234598
 0.00231422 0.00229826 0.00226536 0.00226253 0.00226043 0.002237

array([[-0.1531074 , -0.13709765, -0.14150413, ...,  0.00165985,
        -0.06639837, -0.02434831],
       [-0.1374602 , -0.13699235, -0.07111012, ...,  0.03358345,
         0.00925326,  0.06158339],
       [-0.07734802, -0.03322356, -0.03963993, ...,  0.05977975,
        -0.03153488, -0.03975597],
       ...,
       [-0.11922781, -0.0688332 , -0.10828375, ...,  0.12095328,
         0.00529817, -0.05248345],
       [-0.15467942, -0.17424683, -0.49243248, ...,  0.00290304,
         0.00304785,  0.01091503],
       [-0.14229253, -0.12355337, -0.23098496, ...,  0.01569203,
        -0.00784741,  0.06029934]])

In [168]:
mlb.inverse_transform(np.array(y_train))

[('1686', '18', '531', '61', '814'),
 ('1048', '112', '1781', '211', '358'),
 ('433', '565', '586', '690', '870'),
 ('1045', '1311', '1343', '41', '925'),
 ('1094', '1095', '1096', '1097', '1107'),
 ('1236', '244', '325', '536', '648'),
 ('1343', '191', '546', '694'),
 ('1292', '1602', '1870', '1872', '816'),
 ('408', '5527', '572'),
 ('107', '1080', '1710', '29', '96'),
 ('1192', '30', '309', '310', '745'),
 ('1259', '182', '184', '358', '507'),
 ('251', '419', '557', '635', '943'),
 ('1193', '1222', '184', '34', '507'),
 ('105', '401'),
 ('371', '372', '377', '5186', '799'),
 ('1264', '1265', '1266', '666', '708'),
 ('1143', '210', '22', '355', '531'),
 ('1155', '1259', '184', '358', '507'),
 ('250', '5187', '572', '634', '947'),
 ('1532', '1556', '1557', '907', '914'),
 ('1155', '1259', '184', '358', '507'),
 ('1225', '1505', '566', '68', '98'),
 ('1164', '16', '572', '5887', '784'),
 ('1311', '1343', '609', '622', '925'),
 ('1485', '433', '692', '702'),
 ('1119', '1254', '5689', '6

In [197]:

mlb.inverse_transform(y_train.values)[100]

('1311', '211', '381', '622', '925')

In [198]:

a = [0 if round(v, 0)<0.5 else 1 for v in pca.inverse_transform(x[100])]
mlb.inverse_transform(np.array([a]))


[('1311', '211', '381', '622', '925')]

#### 2.Binary Relevance:
Transformer un problème multi-label avec K étiquettes en des problèmes de classification binaire séparés. Chaque classificateur prédit si une étiquette est présente ou non.
Cette technique ignore les relations entre les étiquettes.

Les deux techniques présentées en haut traitent le problème multi-label (choix multiples) en une série de questions oui/non.

In [111]:
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.svm import SVC

# initialize Binary Relevance multi-label classifier
# with an SVM classifier
# SVM in scikit only supports the X matrix in sparse representation

binary_relevance = BinaryRelevance(
    classifier = SVC(),
    require_dense = [False, False]
)

# train
binary_relevance.fit(X_train_multilabel.values, y_train.values)

# predict
predictions = binary_relevance.predict(X_test_multilabel.values)

AttributeError: values not found

#### 3.Classifier Chains:
Un classificateur 1 sera formé sur les données d'entrée. La sortie du classificateur 1 sera alimentée en entrée pour le classificateur 2, qui prédit la deuxième étiquette, la sortie du classificateur 2 sera alimentée en entrée pour le classificateur 3 et ainsi de suite.

### Exploration: transformer le problème en un problème de regression après une réduction de dimension avec SVD
(je ne suis pas sur que cette technique soit efficace, car transformer le problème en regression n'est pas une bonne technique en général dans un problème de classification normal, cependant, le SVD permet de réduire la dimension d'une matrice sparse (nos labels) et donc peut s'averer interessant de tester cette méthode)

In [169]:
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import random as sparse_random

svd = TruncatedSVD(n_components=300, random_state=42)
svd.fit(y_train)
print(svd.explained_variance_ratio_.sum())



0.7796891562732401


In [170]:
y_train_pca = svd.transform(y_train)
y_test_pca = svd.transform(y_test)

In [175]:
y_train_pca.shape

(50696, 300)

#### MultiOutputRegressor
Cela suppose que les sorties sont indépendantes les unes des autres, ce qui pourrait ne pas être une hypothèse correcte pour nos labels et que l'enchainement séquentiel n'a pas d'importance

In [14]:
from numpy import mean
from numpy import std
from numpy import absolute
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.multioutput import MultiOutputRegressor
from sklearn.svm import LinearSVR

In [15]:
model = LinearSVR()
wrapper = MultiOutputRegressor(model)
#cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(wrapper, X_train_multilabel, y_train_pca, scoring='neg_mean_absolute_error', n_jobs=-1)
n_scores = absolute(n_scores)
print('MAE: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

MAE: 0.044 (0.000)


In [16]:
# define base model
model_0 = LinearSVR()
multi_output_classifier = MultiOutputRegressor(model_0, n_jobs=-1).fit(X_train_multilabel, y_train_pca)

In [17]:
from numpy import absolute
n_scores = absolute(multi_output_classifier.score(X_train_multilabel, y_train_pca))
print(n_scores)
n_scores = absolute(multi_output_classifier.score(X_test_multilabel, y_test_pca))
print(n_scores)

0.3443672188400165
0.18439190280448134


#### Chained Multioutput Regression

In [19]:
from sklearn.multioutput import RegressorChain
# define base model
model_1 = LinearSVR()
# define the chained multioutput wrapper model
regressor_chain = RegressorChain(model_1).fit(X_train_multilabel, y_train_pca,)





In [20]:
from numpy import absolute
n_scores = absolute(regressor_chain.score(X_train_multilabel, y_train_pca))
print(n_scores)
n_scores = absolute(regressor_chain.score(X_test_multilabel, y_test_pca))
print(n_scores)

0.29684088143200754
0.17343953055569605


In [24]:
# save the model to disk
import pickle
filename = 'regressor_chain.sav'
pickle.dump(regressor_chain, open(filename, 'wb'))

filename = 'multi_output_classifier.sav'
pickle.dump(multi_output_classifier, open(filename, 'wb'))


In [25]:
loaded_model = pickle.load(open("regressor_chain.sav", 'rb'))
result = loaded_model.score(X_test_multilabel, y_test_pca)
print(result)

0.17343953055569605


In [None]:
# TO do :
# Simple neural network with n_components sigmoid in output