In [1]:
!pip install mlflow

Collecting mlflow
  Downloading mlflow-2.20.4-py3-none-any.whl.metadata (30 kB)
Collecting mlflow-skinny==2.20.4 (from mlflow)
  Downloading mlflow_skinny-2.20.4-py3-none-any.whl.metadata (31 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.20.4->mlflow)
  Downloading databricks_sdk-0.46.0-py3-none-any.whl.metadata (38 kB)
Collecting graphql-core<3.3,>=3.1 (from graphene<4->mlflow)
  Downloading graphql_core-3.2.6-py3-none-any.whl.metadata (11 kB)
Collecting graphql-relay<3.3,>=3.1 (from graphene<4->mlflow)
  Downloading graphql_relay-3.2.0-py3-none-any.whl.metadata (12 kB)
Downloading mlflow-2.20.4-py3-none-any.whl (28.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m28.4/28.4 MB[0m [31m53.7 MB/s[0m eta [36m0:00:00[0m
[?25hDown

In [2]:
import gensim
from gensim.utils import simple_preprocess
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import jaccard_score
import matplotlib.pyplot as plt

import mlflow
import mlflow.sklearn
import os
import shutil
import pickle

In [3]:
# Create the directory used for saved models; do nothing if it already exists.
os.makedirs(name="/kaggle/working/models", exist_ok=True)

In [4]:
# Store MLflow runs under the local mlruns directory, then select the
# experiment that the runs of this notebook are attached to.
mlflow.set_tracking_uri(uri="/kaggle/working/mlruns")
mlflow.set_experiment(experiment_name="WORD2VEC")

# Confirmation message for the notebook output.
print("MLflow configuré sur Kaggle ! Logs enregistrés dans /kaggle/working/mlruns")

2025/03/13 21:54:46 INFO mlflow.tracking.fluent: Experiment with name 'WORD2VEC' does not exist. Creating a new experiment.


MLflow configuré sur Kaggle ! Logs enregistrés dans /kaggle/working/mlruns


In [5]:
# Example tokenization function
def preprocess_text(text):
    """Tokenize ``text`` by delegating to gensim's ``simple_preprocess``.

    Args:
        text: The raw text to tokenize.

    Returns:
        Whatever ``simple_preprocess`` returns for ``text``.
    """
    tokens = simple_preprocess(text)
    return tokens

In [6]:
# Load the preprocessed dataset exported by the exploration notebook.
data_og = pd.read_csv(
    "/kaggle/input/bigot-benjamin-1-notebook-exploration-022025/preprocessed_data.csv"
)

# Work on a fixed-seed random sample of 10,000 rows and preview it.
data = data_og.sample(random_state=42, n=10000)
data.head(n=5)

Unnamed: 0,Id,Title,Body,Tags,Score,ViewCount,AnswerCount,CreationDate,Processed
33553,76601348,How do I pass in parameters into arrow functio...,"<p><a href=""https://i.sstatic.net/u7abJ.png"" r...",<reactjs><react-hooks><jsx><parameter-passing>...,-2,488,2,2023-07-03 01:16:37,"['pass', 'parameter', 'arrow', 'function', 're..."
9427,75749445,How to test @NotBlank spring validation with J...,<p>I am trying to test validation I have a con...,<java><spring><spring-boot><validation><testing>,0,1582,2,2023-03-15 19:51:22,"['test', 'spring', 'validation', 'junit', 'try..."
199,75900198,"DBeaver - unable to update driver: ""Network un...",<p>Always getting this error when I try to upd...,<ssl><network-programming><ssl-certificate><dr...,5,21968,1,2023-03-31 15:22:24,"['update', 'driver', 'network', 'certificate',..."
12447,76021053,Youtube player API to get server url few times...,<p>I am using this code in PHP to get the Yout...,<php><youtube><youtube-api><youtube-data-api><...,0,1251,1,2023-04-15 08:08:22,"['player', 'api', 'server', 'url', 'time', 're..."
39489,78823804,Where is it specified the path to look for ker...,<p>I am launching an Anconda prompt from a Win...,<python><json><anaconda><kernel><jupyter>,0,398,1,2024-08-02 04:42:33,"['specify', 'path', 'look', 'launch', 'window'..."


In [7]:
# Read the train/test tables that were exported to CSV.
# NOTE(review): the df_test preview shows an "Unnamed: 0" column, which looks
# like a saved index that ends up inside X_train / X_test — confirm it is wanted.
df_train, df_test = (
    pd.read_csv("/kaggle/input/embeddings/X_train.csv"),
    pd.read_csv("/kaggle/input/embeddings/X_test.csv"),
)

# NumPy versions of both tables.
X_train, X_test = df_train.to_numpy(), df_test.to_numpy()

In [8]:
# Preview the first five rows of the test table.
df_test.head(n=5)

Unnamed: 0,0
0,"['pytorch', 'the', 'number', 'of', 'sizes', 'p..."
1,"['the', 'best', 'and', 'fast', 'way', 'to', 'c..."
2,"['why', 'my', 'mistral', 'model', 'generate', ..."
3,"['is', 'there', 'any', 'way', 'beeware', 'is',..."
4,"['how', 'to', 'use', 'ffmpeg', 'with', 'node',..."


In [9]:
# Load the Word2Vec feature tables and the label tables from CSV.
df_train_w2v = pd.read_csv("/kaggle/input/embeddings/X_train_W2V.csv")
df_test_w2v = pd.read_csv("/kaggle/input/embeddings/X_test_W2V.csv")
df_y_train = pd.read_csv("/kaggle/input/embeddings/y_train.csv")
df_y_test = pd.read_csv("/kaggle/input/embeddings/y_test.csv")

# Convert every table to a NumPy array for the scikit-learn estimators.
X_train_w2v = df_train_w2v.to_numpy()
X_test_w2v = df_test_w2v.to_numpy()
y_train = df_y_train.to_numpy()
y_test = df_y_test.to_numpy()

# Check the size of the datasets
print(f"Taille du train : {len(X_train_w2v)}")
print(f"Taille du test : {len(X_test_w2v)}")

Taille du train : 8000
Taille du test : 2000


In [10]:
# Baseline: one logistic regression per label (one-vs-rest) fitted on the
# Word2Vec features.
clf = OneVsRestClassifier(estimator=LogisticRegression(max_iter=1000))
clf.fit(X_train_w2v, y_train)

# Score the test-set predictions with the sample-averaged Jaccard score.
y_pred = clf.predict(X_test_w2v)
score = jaccard_score(y_test, y_pred, average="samples")
print("Jaccard Score avec Word2Vec :", score)

# Record the run in MLflow: parameters, metric and the fitted model.
with mlflow.start_run():
    mlflow.log_params(
        {
            "model": "W2V : OneVsRest + LogisticRegression",
            "max_iter": 1000,
        }
    )
    mlflow.log_metric("jaccard_score", score)

    # Save the model
    mlflow.sklearn.log_model(clf, "model")

  _warn_prf(average, modifier, msg_start, len(result))


Jaccard Score avec Word2Vec : 0.20130421653509886




In [11]:
# y_train and y_test are assumed to hold the (already encoded) labels.
# SGD shuffles the training data, so the seed is fixed to make the fitted model
# and the logged score reproducible across reruns. The hyper-parameters are
# defined once so the values given to the estimator and the values logged to
# MLflow cannot drift apart.
sgd_max_iter = 1000
sgd_random_state = 42  # same seed as the data sampling earlier in the notebook

clf_sgd = OneVsRestClassifier(
    SGDClassifier(
        loss="log_loss",
        max_iter=sgd_max_iter,
        random_state=sgd_random_state,
    )
)
clf_sgd.fit(X_train_w2v, y_train)

# Score the test-set predictions with the sample-averaged Jaccard score.
y_pred = clf_sgd.predict(X_test_w2v)
score = jaccard_score(y_test, y_pred, average="samples")
print("Jaccard Score avec Word2Vec :", score)

# Record the run in MLflow: parameters (including the seed), metric and model.
with mlflow.start_run():
    mlflow.log_param("model", "W2V : OneVsRest + SGDClassifier")
    mlflow.log_param("max_iter", sgd_max_iter)
    mlflow.log_param("random_state", sgd_random_state)
    mlflow.log_metric("jaccard_score", score)

    # Save the model
    mlflow.sklearn.log_model(clf_sgd, "model")

  _warn_prf(average, modifier, msg_start, len(result))


Jaccard Score avec Word2Vec : 0.21125368974897726




In [12]:
# Per-label probabilities predicted by the SGD one-vs-rest model on the test features.
y_proba = clf_sgd.predict_proba(X_test_w2v)

# Reload the pickled object whose ``classes_`` attribute lists the tag names.
# NOTE(review): pickle.load can execute arbitrary code — only load trusted files.
with open("/kaggle/input/embeddings/mlb.pkl", mode="rb") as mlb_file:
    mlb_loaded = pickle.load(mlb_file)

print("Classes rechargées :", mlb_loaded.classes_)

# Probability threshold
threshold = 0.7

# Return the tags whose probability reaches the threshold
def get_tags_with_threshold(probas, threshold=0.3, classes=None):
    """Map one row of per-label probabilities to a list of tag names.

    Args:
        probas: 1-D sequence or array of probabilities, one per label.
        threshold: Minimum probability for a label to be kept.
        classes: Label names indexed like ``probas``. When omitted, the
            notebook-level ``mlb_loaded.classes_`` is used, as before.

    Returns:
        The names of all labels with ``proba >= threshold``. If no label
        reaches the threshold, a one-element list holding the most probable
        label. An empty list when ``probas`` is empty.
    """
    if classes is None:
        # Backward-compatible default: the classes reloaded in the notebook.
        classes = mlb_loaded.classes_

    # Accept plain Python sequences as well as NumPy arrays.
    probas = np.asarray(probas)
    if probas.size == 0:
        # np.argmax would raise on an empty array; there is nothing to tag.
        return []

    indices = np.where(probas >= threshold)[0]
    if len(indices) == 0:
        # Always return at least one tag: fall back to the most probable label.
        indices = [np.argmax(probas)]
    return [classes[i] for i in indices]

# Apply the threshold rule to every row of predicted probabilities
predicted_tags = list(
    map(lambda row_probas: get_tags_with_threshold(row_probas, threshold), y_proba)
)


Classes rechargées : ['python' 'javascript' 'reactjs' ... 'quic' 'apple-developer' 'appkit']


In [13]:
# Test features plus one extra column holding the predicted tags of each row.
X_test_with_predictions = pd.DataFrame(X_test).assign(Predicted_Tags=predicted_tags)

# Wrap features and labels in DataFrames so they can be concatenated.
X_train, X_test = pd.DataFrame(X_train), pd.DataFrame(X_test)
y_train, y_test = pd.DataFrame(y_train), pd.DataFrame(y_test)

# Stack train on top of test (row-wise) with a fresh 0..n-1 row index.
X_total = pd.concat([X_train, X_test], axis=0, ignore_index=True)
Y_total = pd.concat([y_train, y_test], axis=0, ignore_index=True)
X_total_reset = X_total.reset_index(drop=True)
Y_total_reset = Y_total.reset_index(drop=True)

# Concatenate horizontally (axis=1)
# NOTE(review): both halves are built with pd.DataFrame(...) without column
# names, so when the inputs are plain arrays their integer column labels may
# collide in df_total — confirm this is acceptable for the exported CSV.
df_total = pd.concat([X_total_reset, Y_total_reset], axis=1)

# Write both result tables to the current working directory, without the row index.
X_test_with_predictions.to_csv("X_test_with_predictions_W2V.csv", index=False)
df_total.to_csv("df_final_W2V.csv", index=False)

In [14]:
# Zip the whole mlruns directory
shutil.make_archive(
    base_name="/kaggle/working/mlruns_w2v",
    format="zip",
    root_dir="/kaggle/working/mlruns",
)

'/kaggle/working/mlruns_w2v.zip'