Dans ce notebook, je présente l'implémentation et les résultats du modèle Baseline du projet 5. 

# Modélisation

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import html5lib
from bs4 import BeautifulSoup
import re
from string import punctuation
import nltk.corpus
from ast import literal_eval


# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.model_selection import GridSearchCV
from pprint import pprint
# spacy for lemmatization
import spacy
import sklearn.metrics as metrics

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
#
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df = pd.read_feather('cleaned_data_stackoverflow_questions.feather')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43801 entries, 0 to 43800
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Title_cleaned    43801 non-null  object
 1   Body_cleaned     43801 non-null  object
 2   Tags_cleaned     43801 non-null  object
 3   Title_tokenized  43801 non-null  object
 4   Body_tokenized   43801 non-null  object
 5   Tags_tokenized   43801 non-null  object
 6   Tags_list        43801 non-null  object
 7   number_of_tags   43801 non-null  int64 
 8   Id               43801 non-null  int64 
 9   Score            43801 non-null  int64 
 10  ViewCount        43801 non-null  int64 
 11  AnswerCount      43801 non-null  int64 
dtypes: int64(5), object(7)
memory usage: 4.0+ MB


In [4]:
df.head()

Unnamed: 0,Title_cleaned,Body_cleaned,Tags_cleaned,Title_tokenized,Body_tokenized,Tags_tokenized,Tags_list,number_of_tags,Id,Score,ViewCount,AnswerCount
0,when porting java code to objc how best to rep...,i am working on porting a java codebase to coc...,java objective-c cocoa macos porting,"[porting, java, code, objc, best, represent, c...","[working, porting, java, codebase, cocoaobject...","[java, objective-c, cocoa, macos, porting]","[java, objective-c, macos]",3,1117384,8,1346,6
1,is there a way for hiding some enum values for...,i have enum lets say for example and have two ...,c# .net winforms windows-forms-designer prope...,"[way, hiding, enum, values, specific, property...","[enum, lets, say, example, two, classes, prope...","[c#, .net, winforms, windows-forms-designer, p...","[c#, .net]",2,59024032,8,2172,1
2,fixing words with spaces using a dictionary lo...,i have extracted the list of sentences from a ...,python python-2.7 dictionary nltk text-segmen...,"[fixing, words, spaces, using, dictionary, loo...","[extracted, list, sentences, document, preproc...","[python, python-2.7, dictionary, nltk, text-se...",[python],1,19675106,19,6507,7
3,how to split a string at line breaks in python,i want to copy some tabular data from excel in...,python arrays list clipboard pywin32,"[split, string, line, breaks, python]","[want, copy, tabular, data, excel, python, arr...","[python, arrays, list, clipboard, pywin32]","[python, arrays]",2,21205074,6,7848,5
4,how do you express binary literals in python,how do you express an integer as a binary numb...,python syntax binary integer literals,"[express, binary, literals, python]","[express, integer, binary, number, python, lit...","[python, syntax, binary, integer, literals]",[python],1,1476,394,317942,8


In [5]:
# concaténation de title + body
df['text'] = [list(x.tolist()+ y.tolist()) for x, y in zip(df['Title_tokenized'], df['Body_tokenized'])]

In [6]:

df['text_sentences'] = df['Title_cleaned']+' '+ df['Body_cleaned']

In [7]:
df.set_index('Id', inplace=True)
X = df["text"]
y = df["Tags_list"]

In [8]:
df['Body_cleaned'][21205074]

"i want to copy some tabular data from excel into a python array that is user willselect a range in an excel table press copy ctrl+c so that the range will be copied to clipboard then i will get this clipboard data into a python array list i use to get clipboard data into an array i copy the following range from excel when i use the function above i get a string like how to split this string into a list so that the list will look like i use method but it doesn't give me what i want "

In [10]:


# Multilabel binarizer for targets
multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(y)
y_binarized = multilabel_binarizer.transform(y)

print("Shape of y: {}".format(y_binarized.shape))

Shape of y: (43801, 50)


Nous allons tester plusieurs métriques sur ce modèle LDA :

- Accuracy score :
- F1 score :
- Jaccard similarity score :
- Recall :
- Precision :

In [27]:


def metrics_score(model, df, y_true, y_pred):
    """Compilation function of metrics specific to multi-label
    classification problems in a Pandas DataFrame.
    This dataFrame will have 1 row per metric
    and 1 column per model tested. 

    Parameters
    ----------------------------------------
    model : string
        Name of the tested model
    df : DataFrame 
        DataFrame to extend. 
        If None : Create DataFrame.
    y_true : array
        Array of true values to test
    y_pred : array
        Array of predicted values to test
    ----------------------------------------
    """
    if(df is not None):
        temp_df = df
    else:
        temp_df = pd.DataFrame(index=["Accuracy", "F1",
                                      "Jaccard", "Recall",
                                      "Precision"],
                               columns=[model])
        
    scores = []
    scores.append(metrics.accuracy_score(y_true, y_pred))
    scores.append(metrics.f1_score(y_pred, 
                                   y_true, 
                                   average='weighted'))
    scores.append(metrics.jaccard_score(y_true, 
                                        y_pred, 
                                        average='weighted'))
    scores.append(metrics.recall_score(y_true, 
                                       y_pred, 
                                       average='weighted'))
    scores.append(metrics.precision_score(y_true, 
                                          y_pred, 
                                          average='weighted'))
    temp_df[model] = scores
    
    return temp_df

# Supervised approaches

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics as metrics
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer

In [12]:
# Initialize the "CountVectorizer" TFIDF for Doc
vectorizer = TfidfVectorizer(analyzer="word",
                             max_df=.6,
                             min_df=0.005,
                             tokenizer=None,
                             preprocessor=' '.join,
                             stop_words=None,
                             lowercase=False)

vectorizer.fit(X)
X_tfidf = vectorizer.transform(X)


In [13]:
# Create train and test split (30%)
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y_binarized,
                                                    test_size=0.3, random_state=8)
print("X_train shape : {}".format(X_train.shape))
print("X_test shape : {}".format(X_test.shape))
print("y_train shape : {}".format(y_train.shape))
print("y_test shape : {}".format(y_test.shape))

X_train shape : (30660, 1573)
X_test shape : (13141, 1573)
y_train shape : (30660, 50)
y_test shape : (13141, 50)


In [14]:
from nltk.tokenize import sent_tokenize, word_tokenize

def tokenizer_fct(sentence) :
    # print(sentence)
    sentence_clean = sentence.replace('-', ' ').replace('+', ' ').replace('/', ' ').replace('#', ' ')
    word_tokens = word_tokenize(sentence_clean)
    return word_tokens

# Stop words
from nltk.corpus import stopwords
stop_w = list(set(stopwords.words('english'))) + ['[', ']', ',', '.', ':', '?', '(', ')']

def stop_word_filter_fct(list_words) :
    filtered_w = [w for w in list_words if not w in stop_w]
    filtered_w2 = [w for w in filtered_w if len(w) > 2]
    return filtered_w2

# lower case et alpha
def lower_start_fct(list_words) :
    lw = [w.lower() for w in list_words if (not w.startswith("@")) 
    #                                   and (not w.startswith("#"))
                                       and (not w.startswith("http"))]
    return lw

# Lemmatizer (base d'un mot)
from nltk.stem import WordNetLemmatizer

def lemma_fct(list_words) :
    lemmatizer = WordNetLemmatizer()
    lem_w = [lemmatizer.lemmatize(w) for w in list_words]
    return lem_w

# Fonction de préparation du texte pour le bag of words (Countvectorizer et Tf_idf, Word2Vec)
def transform_bow_fct(desc_text) :
    word_tokens = tokenizer_fct(desc_text)
    sw = stop_word_filter_fct(word_tokens)
    lw = lower_start_fct(sw)
    # lem_w = lemma_fct(lw)    
    transf_desc_text = ' '.join(lw)
    return transf_desc_text

# Fonction de préparation du texte pour le bag of words avec lemmatization
def transform_bow_lem_fct(desc_text) :
    word_tokens = tokenizer_fct(desc_text)
    sw = stop_word_filter_fct(word_tokens)
    lw = lower_start_fct(sw)
    lem_w = lemma_fct(lw)    
    transf_desc_text = ' '.join(lem_w)
    return transf_desc_text

# Fonction de préparation du texte pour le Deep learning (USE et BERT)
def transform_dl_fct(desc_text) :
    word_tokens = tokenizer_fct(desc_text)
#    sw = stop_word_filter_fct(word_tokens)
    lw = lower_start_fct(word_tokens)
    # lem_w = lemma_fct(lw)    
    transf_desc_text = ' '.join(lw)
    return transf_desc_text

In [15]:
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Twins\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [16]:
df['sentence_bow'] = df['text_sentences'].apply(lambda x : transform_bow_fct(x))
df['sentence_bow_lem'] = df['text_sentences'].apply(lambda x : transform_bow_lem_fct(x))
df['sentence_dl'] = df['text_sentences'].apply(lambda x : transform_dl_fct(x))

##  best modèle avec SBERT transformer

SBERT (Sentence-BERT) est une implémentation spécifique de modèles de Sentence Transformers qui utilise BERT comme composant d'encodage de phrases. Contrairement à BERT qui a été conçu pour traiter des séquences de mots entières, SBERT est conçu pour encoder des phrases entières pour produire des embeddings de phrases sémantiquement similaires.

SBERT utilise une méthode d'entraînement appelée "siamese and triplet network" pour apprendre des embeddings sémantiques de phrases en utilisant des paires ou des triplets de phrases étiquetées. Le réseau siamois prend en entrée deux phrases et produit des vecteurs d'embeddings de phrases qui sont comparés pour évaluer leur similarité. Le réseau triplet prend en entrée trois phrases (une ancre, une phrase positive et une phrase négative) et utilise la distance entre les embeddings pour maximiser la similarité entre l'ancre et la phrase positive, tout en minimisant la similarité entre l'ancre et la phrase négative.

SBERT est souvent utilisé pour des tâches telles que la recherche de phrases similaires, la classification de textes et le résumé automatique. En utilisant SBERT, il est possible d'obtenir des représentations vectorielles de haute qualité pour les phrases qui capturent leur signification et leur contexte sémantique.

In [17]:
import tensorflow as tf
# import tensorflow_hub as hub
import tensorflow.keras
from tensorflow.keras import backend as K

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import metrics as kmetrics
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers.legacy import Adam

# Bert
import os
import transformers
from transformers import *

  from .autonotebook import tqdm as notebook_tqdm


In [18]:
from sentence_transformers import SentenceTransformer

In [19]:
model_bert = SentenceTransformer('all-MiniLM-L6-v2')

loading configuration file C:\Users\Twins/.cache\torch\sentence_transformers\sentence-transformers_all-MiniLM-L6-v2\config.json
Model config BertConfig {
  "_name_or_path": "C:\\Users\\Twins/.cache\\torch\\sentence_transformers\\sentence-transformers_all-MiniLM-L6-v2\\",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 1536,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.24.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file C:\Users\Twins/.cache\torch\sentence_transformers\sentence-transformers_all-MiniLM-L6-v2\pytorch_model.bin
All model ch

pour de raison de mémoire, la fonction suivante construit la matrice des features de Bert morceau par morceau

In [20]:
sentences = df['sentence_dl'].to_list()

In [21]:
sentence_embeddings = model_bert.encode(sentences)

In [22]:
# split features
Bertfet_train, Bertfet_test, y_train, y_test = train_test_split(sentence_embeddings, y_binarized, test_size=0.30)
reg_logit_clf = OneVsRestClassifier(LogisticRegression(C= 10, penalty= 'l1', solver= 'saga', random_state = 9))
reg_logit_clf.fit(Bertfet_train, y_train)





In [23]:
#Prediction
bert_pred_lr = reg_logit_clf.predict(Bertfet_test)

In [24]:
Bertfet_test.shape

(13141, 384)

In [25]:
y_test.shape

(13141, 50)

In [58]:
df_metrics_compare = metrics_score("logitR_BERT", df=df_metrics_compare,
                                   y_true=y_test,
                                   y_pred=lr_pred_use)
df_metrics_compare

Unnamed: 0,logitR_BERT,logitR_USE
Accuracy,0.327824,0.31621
F1,0.670116,0.652936
Jaccard,0.49688,0.475392
Recall,0.587398,0.566866
Precision,0.733199,0.717987


In [98]:
df_metrics_compare = metrics_score("logitR_BERT_ACP", df=df_metrics_compare,
                                   y_true=y_test,
                                   y_pred=bert_pred_lr_PCA)
df_metrics_compare

Unnamed: 0,logitR_BERT,logitR_USE,logitR_BERT_ACP
Accuracy,0.327824,0.31621,0.294494
F1,0.670116,0.652936,0.64614
Jaccard,0.49688,0.475392,0.453536
Recall,0.587398,0.566866,0.530658
Precision,0.733199,0.717987,0.718362


# selection du modèle et sauvegarde

le modèle choisis : logistic regression + sentence BERT transformer .

In [82]:
final_model = reg_logit_clf # using sentence_transformer - logisticRegression

In [83]:
# Export fitted model and Preprocessor
from pickle import dump

#Modèle serialisation
dump(final_model, open("final_model.pkl","wb"))

In [100]:
dump(multilabel_binarizer, open("multilabel_binarizer.pkl","wb"))