In [1]:
# Import Python libraries
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
from ast import literal_eval

In [2]:
data = pd.read_csv('StackOverflow_cleaned.csv',sep=";", index_col=0,converters={"Title": literal_eval,
                                                                                 "Body": literal_eval,
                                                                                  "Tags": literal_eval})

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24044 entries, 0 to 27048
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Title   24044 non-null  object
 1   Body    24044 non-null  object
 2   Score   24044 non-null  int64 
 3   Tags    24044 non-null  object
dtypes: int64(1), object(3)
memory usage: 939.2+ KB


In [4]:
data

Unnamed: 0,Title,Body,Score,Tags
0,"[core, dump, linux, segmentation, fault]","[process, linux, segmentation, fault, core, dump]",237,"[linux, bash]"
1,"[php, server, mysql, server, time, zone]","[hosting, package, godaddy, network, solution,...",12,"[php, mysql]"
2,"[constraint, landscape, orientation]","[constraint, device, example, image, portrait,...",48,"[ios, cocoa-touch]"
3,"[loading, system, servicemodel, configuration,...","[net, wcf, wcf, configuration, client, applica...",64,"[c#, .net]"
4,"[xml, serialization, inherited, type]","[question, object, model, xml, problem, quelle...",86,[c#]
...,...,...,...,...
27043,"[python, process, pdf, report, chart]","[database, survey, university, professor, pyth...",41,"[python, pandas]"
27044,"[network, connection, type, android, pie]","[android, pie, request, encryption, default, s...",192,"[java, android]"
27045,"[react, router, position, component]","[react, router, card, list, card, game, detail...",28,[javascript]
27046,"[operation, library]","[project, crash, request, page, problem, docum...",46,"[ios, swift]"


In [5]:
import tensorflow as tf
import tensorflow.keras
from tensorflow.keras import backend as K

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import metrics as kmetrics
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model
import gensim

#X = data["Body"]
y = data["Tags"]

def transform_stc(my_text) :
    
    transf_desc_text = ' '.join(my_text)
    return transf_desc_text

X =  data['Body'].apply(lambda x : transform_stc(x))

In [6]:
X

0               process linux segmentation fault core dump
1        hosting package godaddy network solution conve...
2        constraint device example image portrait lands...
3        net wcf wcf configuration client application n...
4        question object model xml problem quelle surpr...
                               ...                        
27043    database survey university professor python sc...
27044    android pie request encryption default system ...
27045    react router card list card game detail card g...
27046    project crash request page problem documentati...
Name: Body, Length: 24044, dtype: object

## Création du modèle Word2Vec

In [7]:
w2v_size=300
w2v_window=5
w2v_min_count=1
w2v_epochs=100
maxlen = 24 # adapt to length of sentences
sentences = X.to_list()
sentences = [gensim.utils.simple_preprocess(text) for text in sentences]

In [8]:
# Création et entraînement du modèle Word2Vec

print("Build & train Word2Vec model ...")
w2v_model = gensim.models.Word2Vec(min_count=w2v_min_count, window=w2v_window,
                                                vector_size=w2v_size,
                                                seed=42,
                                                workers=1)

w2v_model.build_vocab(sentences)
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=w2v_epochs)
model_vectors = w2v_model.wv
w2v_words = model_vectors.index_to_key
print("Vocabulary size: %i" % len(w2v_words))
print("Word2Vec trained")

Build & train Word2Vec model ...
Vocabulary size: 25416
Word2Vec trained


In [9]:
# Préparation des sentences (tokenization)

print("Fit Tokenizer ...")
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
x_sentences = pad_sequences(tokenizer.texts_to_sequences(sentences),
                                                     maxlen=maxlen,
                                                     padding='post') 
                                                   
num_words = len(tokenizer.word_index) + 1
print("Number of unique words: %i" % num_words)

Fit Tokenizer ...
Number of unique words: 25417


## Création de la matrice d'embedding

In [10]:
# Création de la matrice d'embedding

print("Create Embedding matrix ...")
w2v_size = 300
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1
embedding_matrix = np.zeros((vocab_size, w2v_size))
i=0
j=0
    
for word, idx in word_index.items():
    i +=1
    if word in w2v_words:
        j +=1
        embedding_vector = model_vectors[word]
        if embedding_vector is not None:
            embedding_matrix[idx] = model_vectors[word]
            
word_rate = np.round(j/i,4)
print("Word embedding rate : ", word_rate)
print("Embedding matrix: %s" % str(embedding_matrix.shape))

Create Embedding matrix ...
Word embedding rate :  1.0
Embedding matrix: (25417, 300)


## Création du modèle d'embedding

In [11]:
# Création du modèle

input=Input(shape=(len(x_sentences),maxlen),dtype='float64')
word_input=Input(shape=(maxlen,),dtype='float64')  
word_embedding=Embedding(input_dim=vocab_size,
                         output_dim=w2v_size,
                         weights = [embedding_matrix],
                         input_length=maxlen)(word_input)
word_vec=GlobalAveragePooling1D()(word_embedding)  
embed_model = Model([word_input],word_vec)

embed_model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 24)]              0         
                                                                 
 embedding (Embedding)       (None, 24, 300)           7625100   
                                                                 
 global_average_pooling1d (G  (None, 300)              0         
 lobalAveragePooling1D)                                          
                                                                 
Total params: 7,625,100
Trainable params: 7,625,100
Non-trainable params: 0
_________________________________________________________________


## Exécution du modèle

In [12]:
X_w2v= embed_model.predict(x_sentences)

In [13]:
X_w2v.shape

(24044, 300)

RandomForest ONeVsRest

In [14]:
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multioutput import ClassifierChain
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics as metrics
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split



# transform output : 
multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(y)
y_bin = multilabel_binarizer.transform(y)

X_train, X_test, y_train, y_test = train_test_split(X_w2v, y_bin, test_size=0.3, random_state=8)



In [15]:
# Initialize RandomForest with OneVsRest
param_rfc = {"estimator__max_depth": [5, 25, 50],
             "estimator__min_samples_leaf": [1, 5, 10],
             "estimator__class_weight": ["balanced"]}

multi_rfc_cv = GridSearchCV(OneVsRestClassifier(RandomForestClassifier()),
                            param_grid=param_rfc,
                            n_jobs=-1,
                            cv=2,
                            scoring="f1_weighted",
                            return_train_score = True,
                            refit=True,
                            verbose=3)
# Fit on Sample data
multi_rfc_cv.fit(X_train, y_train)

Fitting 2 folds for each of 9 candidates, totalling 18 fits


GridSearchCV(cv=2,
             estimator=OneVsRestClassifier(estimator=RandomForestClassifier()),
             n_jobs=-1,
             param_grid={'estimator__class_weight': ['balanced'],
                         'estimator__max_depth': [5, 25, 50],
                         'estimator__min_samples_leaf': [1, 5, 10]},
             return_train_score=True, scoring='f1_weighted', verbose=3)

In [16]:
rfc_cv_results = pd.DataFrame.from_dict(multi_rfc_cv.cv_results_)
rfc_best_params = multi_rfc_cv.best_params_
print(rfc_best_params)

{'estimator__class_weight': 'balanced', 'estimator__max_depth': 5, 'estimator__min_samples_leaf': 5}


In [17]:
rfc_best_params_ok = {}
for k, v in rfc_best_params.items():
    rfc_best_params_ok[k.replace("estimator__","")] = v

In [18]:
# Refit RandomForestClassifier best_params with full dataset
rfc_final_model = OneVsRestClassifier(RandomForestClassifier(**rfc_best_params_ok))
rfc_final_model.fit(X_train, y_train)

# Predict
y_test_predicted_labels_tfidf_rfc = rfc_final_model.predict(X_test)

# Inverse transform
y_test_pred_inversed_rfc = multilabel_binarizer.inverse_transform(y_test_predicted_labels_tfidf_rfc)
y_test_inversed = multilabel_binarizer.inverse_transform(y_test)
print("Predicted:", y_test_pred_inversed_rfc[0:5])
print("True:", y_test_inversed[0:5])

Predicted: [('algorithm', 'arrays', 'c++', 'numpy', 'pandas', 'performance', 'python', 'r'), ('algorithm', 'python'), ('database', 'json', 'mysql', 'performance', 'php', 'python', 'sql', 'sql-server'), ('.net', 'ajax', 'asp.net', 'asp.net-mvc', 'c#', 'java', 'javascript', 'json', 'spring'), ('c', 'c++', 'c++11', 'python')]
True: [('algorithm', 'arrays', 'c'), ('java',), ('sql-server',), ('asp.net-mvc', 'javascript', 'json'), ('javascript',)]


In [19]:
def metrics_score(model, df, y_true, y_pred):
 
    if(df is not None):
        temp_df = df
    else:
        temp_df = pd.DataFrame(index=["Accuracy", "F1",
                                      "Jaccard", "Recall",
                                      "Precision"],
                               columns=[model])
        
    scores = []
    scores.append(metrics.accuracy_score(y_true, 
                                         y_pred))
    scores.append(metrics.f1_score(y_pred, 
                                   y_true, 
                                   average='weighted'))
    scores.append(metrics.jaccard_score(y_true, 
                                        y_pred, 
                                        average='weighted'))
    scores.append(metrics.recall_score(y_true, 
                                       y_pred, 
                                       average='weighted'))
    scores.append(metrics.precision_score(y_true, 
                                          y_pred, 
                                          average='weighted'))
    temp_df[model] = scores
    
    return temp_df
    
df_metrics_compare = metrics_score("RandomForest", 
                                   df=None,
                                   y_true = y_test,
                                   y_pred = y_test_predicted_labels_tfidf_rfc)
df_metrics_compare


Unnamed: 0,RandomForest
Accuracy,0.023704
F1,0.352814
Jaccard,0.24546
Recall,0.689521
Precision,0.278463
