In [1]:
import pandas as pd 
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
import mlflow
import matplotlib.pyplot as plt
import gensim
from gensim.models import Word2Vec
from transformers import BertTokenizer, BertModel
from sklearn.base import BaseEstimator, TransformerMixin
import torch
from sklearn.manifold import TSNE
import seaborn as sns
import pyLDAvis
import pyLDAvis.lda_model
import time

  from .autonotebook import tqdm as notebook_tqdm


### Metrique d'évaluation utilisé
  
  
Perplexity : La perplexité est une mesure de la qualité du modèle. Une perplexité plus faible indique un meilleur modèle. Vous pouvez obtenir ce score en utilisant la méthode perplexity.  
  
Coherence Score : Le score de cohérence mesure la cohérence sémantique des sujets générés par le modèle LDA. Un score de cohérence plus élevé indique que les mots les plus importants d'un sujet sont plus sémantiquement similaires les uns aux autres. Le score de cohérence n'est pas directement disponible dans scikit-learn, mais peut être calculé en utilisant la bibliothèque Gensim.
  
### LDA.  
L'Analyse de la Répartition Latente de Dirichlet (LDA) est une technique de modélisation de sujets qui est largement utilisée en traitement du langage naturel. Elle permet de découvrir les sujets cachés dans un ensemble de documents en attribuant des distributions de probabilité à la fois aux documents (sur les sujets) et aux sujets (sur les mots), ce qui permet d'obtenir une représentation non supervisée et interprétable des données textuelles.



In [2]:
# Définir l'expérience MLflow
mlflow.set_experiment("mon_experiment")


<Experiment: artifact_location='file:///Users/bahia/Desktop/MLE-P5-V2/mlruns/637386724232269704', creation_time=1715868222514, experiment_id='637386724232269704', last_update_time=1715868222514, lifecycle_stage='active', name='mon_experiment', tags={}>

### Lda avec gensim

In [3]:
from gensim.models import LdaModel
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

# Classe de prétraitement
class GensimPreprocessor:
    def __init__(self, stop_words=stopwords.words('english')):
        self.stop_words = stop_words
        
    def tokenize(self, text):
        return [word for word in word_tokenize(text.lower()) if word.isalpha()]
    
    def remove_stopwords(self, tokens):
        return [word for word in tokens if word not in self.stop_words]
    
    def preprocess(self, documents):
        preprocessed = []
        for document in documents:
            tokens = self.tokenize(document)
            tokens = self.remove_stopwords(tokens)
            preprocessed.append(tokens)
        return preprocessed

# Classe pour l'application de LDA de Gensim
class GensimLDA:
    def __init__(self, num_topics=8, passes=10):
        self.num_topics = num_topics
        self.passes = passes
        self.dictionary = None
        self.model = None
        
    def fit(self, documents):
        self.dictionary = Dictionary(documents)
        corpus = [self.dictionary.doc2bow(text) for text in documents]
        self.model = LdaModel(corpus=corpus, id2word=self.dictionary, num_topics=self.num_topics, passes=self.passes, random_state=42)
        
    def get_coherence(self, documents):
        coherence_model = CoherenceModel(model=self.model, texts=documents, dictionary=self.dictionary, coherence='c_v')
        return coherence_model.get_coherence()


[nltk_data] Downloading package stopwords to /Users/bahia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
def visualize_topics(document_topics, dim_names, document_topics_max):
    # Appliquer t-SNE aux sujets
    tsne_model = TSNE(n_components=2, random_state=0)
    tsne_topics = tsne_model.fit_transform(document_topics)

    # Créer un DataFrame avec les résultats t-SNE et les sujets
    df_tsne = pd.DataFrame(data=tsne_topics, columns=dim_names)
    df_tsne['Topic'] = document_topics_max

    # Créer un graphique de dispersion des sujets avec Seaborn
    plt.figure(figsize=(5, 5))
    sns.scatterplot(x=dim_names[0], y=dim_names[1], hue="Topic", palette="deep", data=df_tsne)
    plt.savefig(f"tsne.png")
    mlflow.log_artifact(f"tsne.png")
    plt.show()

In [5]:
def stocker_results(name = str,loglikhood_train=None, perplexity_train=None, loglikhood_test=None, perplexity_test=None, fit_time=None, transform_time=None):
    # Enregistrer les résultats dans un DataFrame
    results = pd.DataFrame({'Vectorizer': [name], 'Log Likelihood_train': [loglikhood_train], 'Perplexity_train': [perplexity_train], 
    'Log Likelihood Test': [loglikhood_test], 'Perplexity Test': [perplexity_test], 'Fit Time': [fit_time]})
    display(results)
    return results


In [6]:
data = pd.read_csv('Data/df_tags_filtered.csv')
data['combined'] = data['Title'] + " " + data['Body']

display(data[:1])

def words_tokenize(text):
  text = text.split()
  return text

#data['Body'] = data.Body.apply(lambda text: words_tokenize(text))
#data['Title'] = data.Title.apply(lambda text: words_tokenize(text))
data['Tags'] = data.Tags.apply(lambda text: words_tokenize(text))
display(data[:1])

  if distutils.version.LooseVersion(version) < minimum_version:
  other = LooseVersion(other)


Unnamed: 0,Title,Body,Tags,Body_word_count,combined
0,android jetpack navigation bottomnavigationvie...,android jetpack navigation bottomnavigationvie...,android android-architecture-components bottom...,131,android jetpack navigation bottomnavigationvie...


Unnamed: 0,Title,Body,Tags,Body_word_count,combined
0,android jetpack navigation bottomnavigationvie...,android jetpack navigation bottomnavigationvie...,"[android, android-architecture-components, bot...",131,android jetpack navigation bottomnavigationvie...


In [7]:
X = data['combined']
y = data['Tags']


from sklearn.model_selection import train_test_split
df = pd.DataFrame(data)
def merge_and_keep_first_occurrence(row):
    merged = row['Title'] + row['Body']
    return list(dict.fromkeys(merged))

display(X[:1])

X_train, X_test, y_train, y_test = train_test_split(X,y,
    test_size=0.2,
    random_state=42
)

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)


0    android jetpack navigation bottomnavigationvie...
Name: combined, dtype: object

X_train shape: (26863,)
X_test shape: (6716,)
y_train shape: (26863,)
y_test shape: (6716,)


## Test avec LDA Gensim

train set

In [8]:
#documents = X_fulltrain['Body']
documents =  X_train
display(documents)
print(documents.shape)
preprocessor = GensimPreprocessor()
preprocessed_documents = preprocessor.preprocess(documents)
lda = GensimLDA(num_topics=6, passes=10)
lda.fit(preprocessed_documents)

coherence_score = lda.get_coherence(preprocessed_documents)
corpus = [lda.dictionary.doc2bow(text) for text in preprocessed_documents]


print("coherence score =",coherence_score)

23366    replaceenvironmentnewline work expect display ...
28980    how i programmatically download file browser n...
1873     how i find data directory sql server instance ...
9188     how correctly standardly compare float every t...
12898    could find aaptprotojar see link suspect pom f...
                               ...                        
16850    no debuggable process logcat phone detect andr...
6265     bug v live unit test minus work tldr edit narr...
11284    hierarchical enums c work message parsergenera...
860      what difference among collg colmd colsm bootst...
15795    how find small number common among array quest...
Name: combined, Length: 26863, dtype: object

(26863,)
coherence score = 0.5011265507000353


Test set

In [9]:
documents = X_test
print(documents.shape)
#preprocessor = GensimPreprocessor()
preprocessed_test_documents = preprocessor.preprocess(X_test)
print(X_test.shape)
# Supposons que `dictionary` est votre dictionnaire Gensim créé pendant l'entraînement
test_corpus = [lda.dictionary.doc2bow(doc) for doc in preprocessed_test_documents]

# Supposons que `lda_model` est votre modèle LDA Gensim entraîné
test_documents_topics = [lda.model.get_document_topics(bow) for bow in test_corpus]

most_probable_topics = [max(doc, key=lambda x: x[1])[0] for doc in test_documents_topics]
# Calcul du score de cohérence sur le set de test
#coherence_model_test = CoherenceModel(model=lda.model, texts=preprocessed_test_documents, dictionary=lda.dictionary, coherence='c_v')
coherence_score_test = lda.get_coherence(preprocessed_test_documents)
print(f"Score de cohérence sur le set de test: {coherence_score_test}")



(6716,)
(6716,)
Score de cohérence sur le set de test: 0.5004027276975475


In [10]:
test_documents_topics = lda.model.get_document_topics(test_corpus, minimum_probability=0.0)
documents_topics_count_vec = np.array([[prob for _, prob in doc_topics] for doc_topics in test_documents_topics])
document_topics_max = np.argmax(documents_topics_count_vec, axis=1)

In [11]:
import pyLDAvis.gensim_models
import pyLDAvis
# Assurez-vous d'appeler pyLDAvis.enable_notebook() dans un environnement Jupyter pour l'interaction inline
pyLDAvis.enable_notebook()
import os

print(os.path.dirname(pyLDAvis.__file__))

# Préparation des données pour PyLDAVis
vis = pyLDAvis.gensim_models.prepare(lda.model, test_corpus, dictionary=lda.dictionary)

# Visualisation
pyLDAvis.display(vis)

/opt/anaconda3/envs/newenvP5/lib/python3.8/site-packages/pyLDAvis


  _nlv = LooseVersion(_np_version)
  np_version_under1p17 = _nlv < LooseVersion("1.17")
  np_version_under1p18 = _nlv < LooseVersion("1.18")
  _np_version_under1p19 = _nlv < LooseVersion("1.19")
  _np_version_under1p20 = _nlv < LooseVersion("1.20")
  other = LooseVersion(other)
  _nlv = LooseVersion(_np_version)
  np_version_under1p17 = _nlv < LooseVersion("1.17")
  np_version_under1p18 = _nlv < LooseVersion("1.18")
  _np_version_under1p19 = _nlv < LooseVersion("1.19")
  _np_version_under1p20 = _nlv < LooseVersion("1.20")
  other = LooseVersion(other)
  _nlv = LooseVersion(_np_version)
  np_version_under1p17 = _nlv < LooseVersion("1.17")
  np_version_under1p18 = _nlv < LooseVersion("1.18")
  _np_version_under1p19 = _nlv < LooseVersion("1.19")
  _np_version_under1p20 = _nlv < LooseVersion("1.20")
  other = LooseVersion(other)
  _nlv = LooseVersion(_np_version)
  np_version_under1p17 = _nlv < LooseVersion("1.17")
  np_version_under1p18 = _nlv < LooseVersion("1.18")
  _np_version_under1p1

In [12]:
# obtain topic distributions for each document
topic_dist = pd.DataFrame(test_documents_topics)
for topic in topic_dist.columns:
    topic_dist[topic] = topic_dist[topic].apply(lambda x : x[1])

print('matrix of document/topic distribution shape:', topic_dist.shape)
display(topic_dist.head())


matrix of document/topic distribution shape: (6716, 6)


Unnamed: 0,0,1,2,3,4,5
0,0.760363,0.002114,0.002105,0.002094,0.231235,0.002089
1,0.929836,0.002538,0.002548,0.002548,0.059989,0.002541
2,0.514477,0.001965,0.001969,0.079289,0.243532,0.158768
3,0.847218,0.037943,0.001747,0.00175,0.001758,0.109583
4,0.120295,0.001279,0.04913,0.405605,0.422413,0.001278


In [13]:

from sklearn.preprocessing import MultiLabelBinarizer


mlb = MultiLabelBinarizer()

# Adapter MultiLabelBinarizer et transformer les étiquettes
y_binarized = mlb.fit_transform(y_train)
y_test_binarized = mlb.transform(y_test) 
display(y_test_binarized.shape)




(6716, 10378)

In [14]:
topic_tag = np.matmul(topic_dist.T, y_test_binarized)
# normalizing 
topic_tag = topic_tag/np.sum(y_test_binarized,axis=0)

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.multiclass import OneVsRestClassifier


time1 =time.time()
x_train, x_test, y_train_lr, y_test_lr = train_test_split(topic_dist, y_test_binarized, test_size=0.2, random_state=100)
print("x_train shape:", x_train.shape)
print("x_test shape:", x_test.shape)
print("y_train shape:", y_train_lr.shape)
print("y_test shape:", y_test_lr.shape)
lr = OneVsRestClassifier(LogisticRegression()).fit(x_train, y_train_lr)
y_pred_lr = lr.predict_proba(x_test)
fit_time = np.round(time.time() - time1,1)
#score of prediction



x_train shape: (5372, 6)
x_test shape: (1344, 6)
y_train shape: (5372, 10378)
y_test shape: (1344, 10378)




In [16]:
probabilities = y_pred_lr

print(probabilities.shape, y_test_lr.shape)
# Définir un seuil personnalisé pour sélectionner les tags
threshold = 0.15  # baisser le seuil pour inclure plus de tags

def select_top_n_tags(probabilities, threshold, top_n=5):
    # Créer une matrice de zéros de la même forme que probabilities
    final_tags = np.zeros(probabilities.shape)
    
    for i in range(probabilities.shape[0]):
        # Trouver les indices qui satisfont le seuil
        eligible_indices = np.where(probabilities[i] >= threshold)[0]
        
        # Trier ces indices en fonction de leur probabilité
        top_indices = eligible_indices[np.argsort(probabilities[i][eligible_indices])][::-1]
        
        # Sélectionner les top N indices
        top_n_indices = top_indices[:top_n]
        
        # Mettre à 1 les positions correspondant aux tags sélectionnés
        final_tags[i, top_n_indices] = 1
    
    return final_tags

# Utiliser la fonction pour obtenir les tags prédits

binary_predictions = select_top_n_tags(probabilities, threshold, top_n=7)

(1344, 10378) (1344, 10378)


In [17]:
print("Shape of true values:", y_test_lr.shape)
print("Shape of predictions:", binary_predictions.shape)

Shape of true values: (1344, 10378)
Shape of predictions: (1344, 10378)


In [18]:
from sklearn.metrics import jaccard_score


print("jaccord score:", jaccard_score(binary_predictions, y_test_lr, average='samples'))
print("Precision:", precision_score(binary_predictions, y_test_lr, average='samples'))
print("Recall:", recall_score(binary_predictions, y_test_lr, average='samples'))
print("F1 Score:", f1_score(binary_predictions, y_test_lr, average='samples'))


jaccord score: 0.13773660821651892
Precision: 0.19124503968253967


  _warn_prf(average, modifier, msg_start, len(result))


Recall: 0.3119224773242631
F1 Score: 0.22058515641104925


In [19]:
# Afficher les labels des prédictions
predictions_labels =mlb.inverse_transform(y_test_lr)
true_labels = mlb.inverse_transform(binary_predictions)
print(predictions_labels)
print(true_labels)

[('c#', 'c++', 'java', 'python'), ('c#', 'c++', 'java', 'python'), ('android',), ('c#', 'java', 'python'), ('android', 'c#'), ('android', 'c#', 'python'), ('android', 'c#', 'python'), ('c#', 'java', 'python'), ('android', 'android-activity', 'android-fragments', 'android-intent', 'android-layout', 'android-recyclerview', 'java'), ('c#', 'c++', 'java', 'python'), ('javascript',), ('c#', 'python'), ('c#', 'python'), ('c#', 'python'), ('android', 'c#', 'python'), ('javascript',), ('c#', 'java', 'javascript'), ('c#', 'c++', 'java', 'python'), ('android',), ('android', 'java'), ('android', 'c#', 'python'), ('android', 'c#', 'python'), ('android',), ('c#', 'c++', 'java', 'python'), ('android', 'c#', 'python'), ('c#', 'java'), ('c#', 'python'), ('android', 'c#', 'java'), ('c#', 'c++', 'java'), ('android', 'c#', 'python'), ('c#', 'java', 'python'), ('android', 'c#', 'python'), ('javascript',), ('android', 'android-fragments', 'android-layout', 'java'), ('c#', 'python'), ('c#', 'python'), ('ios

In [20]:


# Créer un DataFrame à partir des labels de prédiction et des vrais labels
df = pd.DataFrame({
    'Predicted Labels': predictions_labels,
    'True Labels': true_labels
})

display(df)

Unnamed: 0,Predicted Labels,True Labels
0,"(abstract-class, arrays, c#, pass-by-reference)","(c#, c++, java, python)"
1,"(c++, exponentiation, integer-overflow, modulo)","(c#, c++, java, python)"
2,"(database, geocoding, latitude-longitude, sql)","(android,)"
3,"(firebase, google-cloud-firestore, javascript)","(c#, java, python)"
4,"(android, android-2.2-froyo, android-service, ...","(android, c#)"
...,...,...
1339,"(android, android-button, android-layout, andr...","(android, android-activity, android-fragments,..."
1340,"(cocoa, compilation, compiler-errors, objectiv...","(c++, java, python)"
1341,"(annotations, autowired, java, spring, spring-...","(android, java, javascript, reactjs, spring, s..."
1342,"(dictionary, grouping, list, python, reorganize)","(c#, c++, java, python)"
