# <center> Projet 5 : Catégorisez automatiquement des questions sur Stack Overflow

Dans cette partie nous allons tester les différents models, supervisés

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# Import Python libraries
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import joblib
import seaborn as sns
import re
import nltk
import pickle
import time
import spacy
import logging
import gensim
import gensim.corpora as corpora
import multiprocessing
%matplotlib inline
logging.disable(logging.WARNING) # disable WARNING, INFO and DEBUG logging everywhere
import warnings
warnings.filterwarnings("ignore")

In [3]:
from tqdm import tqdm 
from ast import literal_eval
from bs4 import BeautifulSoup
from sklearn import cluster, metrics
from sklearn import manifold, decomposition
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import EnglishStemmer
from wordcloud import WordCloud
from spacy import displacy

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score, jaccard_score, roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline

In [5]:
X_train_tfidf = joblib.load('../Variables/X_train_tfidf.joblib')
X_test_tfidf = joblib.load('../Variables/X_test_tfidf.joblib')

In [6]:
Xw2v_train = joblib.load('../Variables/Xw2v_train.joblib')
Xw2v_test = joblib.load('../Variables/Xw2v_test.joblib')

In [7]:
X_train_bert_hf = joblib.load('../Variables/X_train_bert_hf.joblib')
X_test_bert_hf = joblib.load('../Variables/X_test_bert_hf.joblib')

In [8]:
X_train_use = joblib.load('../Variables/X_train_use.joblib')
X_test_use = joblib.load('../Variables/X_test_use.joblib')

In [9]:
X_train = joblib.load('../Variables/X_train.joblib')
X_test = joblib.load('../Variables/X_test.joblib')

In [10]:
y_train = joblib.load('../Variables/y_train.joblib')
y_test = joblib.load('../Variables/y_test.joblib')

In [11]:
multilabel = joblib.load('../Models/multilabel.joblib')

In [12]:
def modeltrainer(Xtrain, Xtest, ytrain, ytest, model):
    """
    Cette fonction va nous permettre d'entrainer les modèles puis génerer les feteares importances de chaque model. 
    """
    print("=="*45)
    print("Modele utilise : {}".format(model))
    score =  {}
    time1 = time.time()
    model.fit(Xtrain, ytrain)
    y_pred = model.predict(Xtest)
    time2 = time.time()
    print("Temps d'entrainement : {:.2f}s".format(time2 - time1))
    # Evaluate model
    print("Evaluation du modele ... ")    
    f1 = f1_score(ytest, y_pred, average='micro')
    jaccard = jaccard_score(ytest, y_pred, average='micro')
    roc_auc =roc_auc_score(ytest, y_pred, average='micro')
    time3 = time.time()
    # Dictionnary of score
    score['f1_score'] = f1
    score['jaccard_score'] = jaccard
    score['ROC_Auc_score'] = roc_auc 
    print("Temps d'evaluation : {:.2f}s".format(time3 - time2))
    times = time3 - time1

    return y_pred, score, model, times

In [13]:
def predict_tags(multilabel, y_pred):
    pred_tags = multilabel.inverse_predict(y_pred)
    return pred_tags

## TFIDF

### Logistic Regression


In [14]:
log_reg_model = OneVsRestClassifier(estimator=LogisticRegression(penalty='l1',
                                                                 solver='saga',
                                                                 C=100,
                                                                 n_jobs=-1), 
                                    n_jobs=-1)

In [15]:
ytfidf_pred_lr, score_tfidf_lr, model_tfidf_lr, time_tfidf_lr = modeltrainer(
    X_train_tfidf, X_test_tfidf, y_train, y_test, log_reg_model) 

Modele utilise : OneVsRestClassifier(estimator=LogisticRegression(C=100, n_jobs=-1, penalty='l1',
                                                 solver='saga'),
                    n_jobs=-1)
Temps d'entrainement : 1364.39s
Evaluation du modele ... 
Temps d'evaluation : 2.16s


In [16]:
multilabel.inverse_transform(ytfidf_pred_lr)

[(),
 ('c++',),
 (),
 (),
 (),
 (),
 ('post',),
 (),
 (),
 ('c#',),
 (),
 (),
 ('python',),
 (),
 ('ios', 'ios7', 'objective-c', 'swift', 'uikit'),
 ('javascript',),
 (),
 ('collections',),
 ('css',),
 ('json', 'post'),
 (),
 (),
 ('javascript',),
 ('asp.net-mvc',),
 (),
 ('android',),
 ('arrays', 'java'),
 ('javascript',),
 (),
 (),
 ('unit-testing',),
 ('sql',),
 (),
 (),
 (),
 (),
 ('android',),
 ('java', 'spring', 'spring-boot', 'spring-mvc'),
 ('git',),
 ('json', 'serialization'),
 (),
 (),
 (),
 ('sql', 'sql-server'),
 (),
 (),
 (),
 ('python',),
 (),
 (),
 (),
 (),
 (),
 ('ios', 'iphone', 'objective-c'),
 (),
 ('arrays', 'javascript', 'jquery', 'object'),
 (),
 (),
 ('android',),
 (),
 (),
 ('performance',),
 (),
 (),
 ('javascript', 'jquery'),
 ('java',),
 ('assembly', 'c', 'c++'),
 (),
 ('c#', 'visual-studio'),
 (),
 (),
 (),
 (),
 (),
 ('.net-core', 'c#'),
 (),
 ('linux',),
 (),
 (),
 ('date',),
 (),
 ('java', 'multithreading'),
 ('php',),
 (),
 ('java',),
 ('java', 'performa

In [17]:
score_tfidf_lr

{'f1_score': 0.32651113733959464,
 'jaccard_score': 0.19510804321728692,
 'ROC_Auc_score': 0.6097414936715185}

In [18]:
joblib.dump(ytfidf_pred_lr, open("../Variables/ytfidf_pred_lr.joblib", 'wb'), compress=True)
joblib.dump(score_tfidf_lr, open("../Scores/score_tfidf_lr.joblib", 'wb'), compress=True)
joblib.dump(model_tfidf_lr, open("../Models/model_tfidf_lr.joblib", 'wb'), compress=True)

### Random Forest

In [19]:
rfc_model = OneVsRestClassifier(estimator=RandomForestClassifier(n_jobs=-1), n_jobs=-1)

In [20]:
ytfidf_pred_rf, score_tfidf_rf, model_tfidf_rf, time_tfidf_rf = modeltrainer(
    X_train_tfidf, X_test_tfidf, y_train, y_test, rfc_model)  

Modele utilise : OneVsRestClassifier(estimator=RandomForestClassifier(n_jobs=-1), n_jobs=-1)
Temps d'entrainement : 1560.35s
Evaluation du modele ... 
Temps d'evaluation : 2.12s


In [21]:
score_tfidf_rf

{'f1_score': 0.261217813125758,
 'jaccard_score': 0.15023032505028222,
 'ROC_Auc_score': 0.5784086660625621}

## Word2Vec

### Logistic regression

In [22]:
log_reg_model = OneVsRestClassifier(estimator=LogisticRegression(penalty='l1',
                                                                 solver='saga',
                                                                 C=100,
                                                                 n_jobs=-1), 
                                    n_jobs=-1)

In [23]:
yw2v_pred_lr, score_w2v_lr, model_w2v_lr, time_w2v_lr = modeltrainer(
    Xw2v_train, Xw2v_test, y_train, y_test, log_reg_model)  

Modele utilise : OneVsRestClassifier(estimator=LogisticRegression(C=100, n_jobs=-1, penalty='l1',
                                                 solver='saga'),
                    n_jobs=-1)
Temps d'entrainement : 1447.18s
Evaluation du modele ... 
Temps d'evaluation : 1.06s


In [24]:
score_w2v_lr

{'f1_score': 0.4104587833471799,
 'jaccard_score': 0.25822468712796537,
 'ROC_Auc_score': 0.6501723633603349}

In [25]:
joblib.dump(yw2v_pred_lr, open('../Variables/yw2v_pred_lr.joblib', 'wb'), compress=True)
joblib.dump(score_w2v_lr, open("../Scores/score_w2v_lr.joblib", 'wb'), compress=True)
joblib.dump(model_w2v_lr, open("../Models/model_w2v_lr.joblib", 'wb'), compress=True)

### Random Forest Classifier

In [26]:
rfc_model = OneVsRestClassifier(estimator=RandomForestClassifier(n_jobs=-1), n_jobs=-1)

In [27]:
yw2v_pred_rf, score_w2v_rf, model_w2v_lr, time_w2v_rf = modeltrainer(
    Xw2v_train, Xw2v_test, y_train, y_test, rfc_model) 

Modele utilise : OneVsRestClassifier(estimator=RandomForestClassifier(n_jobs=-1), n_jobs=-1)
Temps d'entrainement : 3955.50s
Evaluation du modele ... 
Temps d'evaluation : 2.17s


In [28]:
score_w2v_rf

{'f1_score': 0.14006332650400447,
 'jaccard_score': 0.07530542759863809,
 'ROC_Auc_score': 0.538223873400425}

In [29]:
multilabel.inverse_transform(yw2v_pred_rf)

[(),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 ('ios',),
 ('javascript',),
 (),
 (),
 (),
 (),
 (),
 ('c', 'c++'),
 (),
 (),
 (),
 ('android',),
 (),
 ('javascript',),
 (),
 (),
 (),
 (),
 (),
 ('android',),
 (),
 ('java',),
 ('android',),
 ('java', 'spring'),
 (),
 ('json',),
 (),
 (),
 (),
 (),
 ('java',),
 (),
 (),
 ('python',),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 ('javascript',),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 ('javascript',),
 (),
 ('c',),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 ('c#',),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 ('mysql', 'php'),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 ('ios',),
 (),
 ('ios',),
 ('python',),
 (),
 (),
 (),
 ('python',),
 (),
 (),
 (),
 (),
 (),
 (),
 ('c++',),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 ('c#',),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 ('reactjs',),
 (),
 (),
 (),
 ('css',),
 (),
 (),
 ('c++',),
 (),
 (),
 (),
 (),
 ('python',),
 (),
 (),
 (),
 (),
 (),
 (),
 ('html', '

In [30]:
joblib.dump(yw2v_pred_rf, open("../Variables/yw2v_pred_rf.joblib", 'wb'), compress=True)
joblib.dump(score_w2v_rf, open("../Scores/score_w2v_rf.joblib", 'wb'), compress=True)
joblib.dump(model_w2v_lr, open("../Models/model_w2v_rf.joblib", 'wb'), compress=True)

## USE

### Logistic regression

In [31]:
log_reg_model = OneVsRestClassifier(estimator=LogisticRegression(penalty='l1',
                                                                 solver='saga',
                                                                 C=100,
                                                                 n_jobs=-1), 
                                    n_jobs=-1)

In [32]:
yuse_pred_lr, score_use_lr, model_use_lr, time_use_lr = modeltrainer(
    X_train_use, X_test_use, y_train, y_test, log_reg_model)  

Modele utilise : OneVsRestClassifier(estimator=LogisticRegression(C=100, n_jobs=-1, penalty='l1',
                                                 solver='saga'),
                    n_jobs=-1)
Temps d'entrainement : 3225.77s
Evaluation du modele ... 
Temps d'evaluation : 1.70s


In [33]:
multilabel.inverse_transform(yuse_pred_lr)

[('ios',),
 ('c++', 'java', 'null'),
 ('c#', 'excel'),
 ('dataframe', 'pandas', 'python', 'scala'),
 ('angular', 'https', 'javascript'),
 ('c#', 'data-binding', 'wpf', 'xaml'),
 ('firefox', 'http'),
 ('asp.net', 'c#', 'pdf', 'security', 'ssl'),
 (),
 ('.net', 'c#', 'collections', 'dictionary'),
 ('java',),
 ('version-control',),
 ('python',),
 ('asp.net',),
 ('animation', 'ios'),
 ('dom', 'html', 'javascript', 'jquery'),
 ('encryption', 'mysql', 'php'),
 ('functional-programming',),
 ('ajax', 'asynchronous', 'javascript', 'jquery'),
 ('machine-learning', 'python'),
 (),
 ('c', 'c++', 'pointers', 'string'),
 (),
 ('asp.net-mvc',),
 ('java', 'oracle'),
 ('android', 'android-layout'),
 ('arrays', 'java'),
 ('javascript', 'jquery'),
 ('debugging',),
 ('ruby-on-rails', 'testing'),
 ('swift', 'unit-testing'),
 ('function',),
 (),
 ('android',),
 ('asp.net-core', 'exception'),
 (),
 ('android', 'java'),
 ('java', 'spring', 'spring-boot', 'spring-mvc'),
 ('ios', 'xcode'),
 ('json',),
 ('archit

In [34]:
score_use_lr

{'f1_score': 0.5087794599656625,
 'jaccard_score': 0.3411832430593715,
 'ROC_Auc_score': 0.71993813794796}

In [35]:
joblib.dump(yuse_pred_lr, open("../Variables/yuse_pred_lr.joblib", 'wb'), compress=True)
joblib.dump(score_use_lr, open("../Scores/score_use_lr.joblib", 'wb'), compress=True)
joblib.dump(model_use_lr, open("../Models/model_use_lr.joblib", 'wb'), compress=True)

### SVC

In [36]:
svc_model = OneVsRestClassifier(estimator=SVC(), n_jobs=-1) 

In [37]:
yuse_pred_sv, score_use_sv, model_use_sv, time_use_sv = modeltrainer(
    X_train_use, X_test_use, y_train, y_test, svc_model)  

Modele utilise : OneVsRestClassifier(estimator=SVC(), n_jobs=-1)
Temps d'entrainement : 30221.43s
Evaluation du modele ... 
Temps d'evaluation : 0.67s


In [38]:
score_use_sv

{'f1_score': 0.4841233974740148,
 'jaccard_score': 0.3193686060377832,
 'ROC_Auc_score': 0.6724140669678402}

### Random Forest Classifier

In [39]:
rfc_model = OneVsRestClassifier(estimator=RandomForestClassifier(n_jobs=-1), n_jobs=-1)

In [40]:
yuse_pred_rf, score_use_rf, model_use_rf, time_use_rf = modeltrainer(
    X_train_use, X_test_use, y_train, y_test, rfc_model) 

Modele utilise : OneVsRestClassifier(estimator=RandomForestClassifier(n_jobs=-1), n_jobs=-1)
Temps d'entrainement : 4666.82s
Evaluation du modele ... 
Temps d'evaluation : 1.25s


In [41]:
score_use_rf

{'f1_score': 0.31215469613259667,
 'jaccard_score': 0.18494271685761046,
 'ROC_Auc_score': 0.595760149593853}

## BERT

### Logistic regression

In [42]:
log_reg_model = OneVsRestClassifier(estimator=LogisticRegression(n_jobs=-1), n_jobs=-1) 

In [43]:
yber_pred_lr, score_ber_lr, model_ber_lr, time_ber_lr = modeltrainer(
    X_train_bert_hf, X_test_bert_hf, y_train, y_test, log_reg_model) 

Modele utilise : OneVsRestClassifier(estimator=LogisticRegression(n_jobs=-1), n_jobs=-1)
Temps d'entrainement : 241.05s
Evaluation du modele ... 
Temps d'evaluation : 0.93s


In [44]:
score_ber_lr

{'f1_score': 0.40632399722547774,
 'jaccard_score': 0.25496022812546904,
 'ROC_Auc_score': 0.6435873991533534}

In [45]:
multilabel.inverse_transform(yber_pred_lr)

[(),
 (),
 ('excel',),
 ('c#',),
 (),
 ('c#', 'mvvm', 'wpf'),
 (),
 (),
 ('css',),
 (),
 ('java',),
 (),
 ('python',),
 (),
 ('animation', 'ios'),
 ('colors', 'html'),
 ('mysql',),
 ('functional-programming',),
 (),
 (),
 (),
 ('c++', 'pointers'),
 ('node.js',),
 ('asp.net-mvc', 'c#'),
 ('java', 'oracle'),
 ('android',),
 ('java',),
 ('javascript',),
 (),
 (),
 ('unit-testing',),
 ('function',),
 (),
 ('android',),
 ('.net', 'c#'),
 ('java', 'pdf'),
 ('android',),
 ('spring',),
 (),
 (),
 (),
 (),
 ('authentication', 'cookies', 'hash', 'session'),
 (),
 ('java', 'jvm'),
 ('postgresql',),
 (),
 ('python',),
 ('javascript',),
 ('python', 'url'),
 ('sql',),
 (),
 (),
 ('ios',),
 ('visual-studio', 'visual-studio-2010'),
 (),
 ('javascript',),
 (),
 (),
 ('swift',),
 (),
 ('stl',),
 (),
 (),
 (),
 ('android', 'docker', 'gradle', 'java'),
 ('c',),
 (),
 (),
 (),
 (),
 (),
 ('javascript',),
 (),
 (),
 (),
 (),
 ('php',),
 ('postgresql',),
 ('date', 'jquery', 'sql-server', 'time', 'tsql'),
 ()

In [46]:
joblib.dump(yber_pred_lr, open("../Variables/yber_pred_lr.joblib", 'wb'), compress=True)
joblib.dump(score_ber_lr, open("../Scores/score_ber_lr.joblib", 'wb'), compress=True)
joblib.dump(model_ber_lr, open("../Models/model_ber_lr.joblib", 'wb'), compress=True)

### Random Forest Classifier

In [47]:
rfc_model = OneVsRestClassifier(estimator=RandomForestClassifier(n_jobs=-1), n_jobs=-1)

In [48]:
yber_pred_rf, score_ber_rf, model_ber_rf, time_ber_rf = modeltrainer(
    X_train_bert_hf, X_test_bert_hf, y_train, y_test, log_reg_model) 

Modele utilise : OneVsRestClassifier(estimator=LogisticRegression(n_jobs=-1), n_jobs=-1)
Temps d'entrainement : 250.61s
Evaluation du modele ... 
Temps d'evaluation : 0.89s


In [49]:
score_ber_rf

{'f1_score': 0.40632399722547774,
 'jaccard_score': 0.25496022812546904,
 'ROC_Auc_score': 0.6435873991533534}

In [52]:
metrics = ['f1_score', 'jaccard_score', 'ROC_Auc_score']
raw = [["BERT", "TFIDF", "USE", "W2V"]]

In [53]:
# Create table of scoring
Scores = pd.DataFrame(columns=metrics)
# Affecting socring for each models
for metric in metrics:
    Scores[metric] = [score_ber_lr[metric], score_tfidf_lr[metric],
                      score_use_lr[metric], score_w2v_lr[metric]]
# Set index with corresponding models    
Scores.set_index(raw)

Unnamed: 0,f1_score,jaccard_score,ROC_Auc_score
BERT,0.406324,0.25496,0.643587
TFIDF,0.326511,0.195108,0.609741
USE,0.508779,0.341183,0.719938
W2V,0.410459,0.258225,0.650172
