# Pipeline de classification de post StackOverflow

In [1]:
import pandas as pd
import re
import string
import numpy as np
import joblib
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, FunctionTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Téléchargement des ressources NLTK 
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

# Fonctions de prétraitement
def preprocess_text(text):
    text = BeautifulSoup(text, "html.parser").get_text()
    text = text.lower()
    text = re.sub(r'https?\S+', ' ', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\d+', '', text)
    tokens = nltk.word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    return tokens

def preprocess_dataset(dataset, text_columns):
    for col in text_columns:
        dataset[col] = dataset[col].apply(preprocess_text)
    return dataset

def apply_stemming_lemmatization(tokens):
    ps = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    stemmed = [ps.stem(word) for word in tokens]
    lemmatized = [lemmatizer.lemmatize(word) for word in stemmed]
    return lemmatized

def preprocess_and_transform(dataset):
    text_columns = dataset.columns
    dataset = preprocess_dataset(dataset, text_columns)
    for col in text_columns:
        dataset[col] = dataset[col].apply(apply_stemming_lemmatization)
    return dataset

# Fonction pour joindre les tokens en une seule chaîne de caractères
def join_tokens(df):
    for col in df.columns:
        df[col] = df[col].apply(lambda x: ' '.join(x))
    return df

def combine_text_columns(df):
    return df['Title'] + ' ' + df['Body']

def tag_cleaning(df):
    extracted_tags = []
    for tags in df['Tags']:
        try:
            tag_list = re.findall(r'<(.*?)>', tags)
            first_tag = tag_list[0] if tag_list else 'vide'
            extracted_tags.append(first_tag)
        except:
            extracted_tags.append('vide')
    
    df['new_tags'] = extracted_tags
    df = df[df['new_tags'] != 'vide']
    top_15_tags = df['new_tags'].value_counts().head(15).index.tolist()
    df = df[df['new_tags'].isin(top_15_tags)]
    
    return df
# Fonction qui encode les tags
def encode_labels(df):
    label_encoder = LabelEncoder()
    df['encoded_tags'] = label_encoder.fit_transform(df['new_tags'])

    
    # Sauvegarder les labels originaux pour retourner le nom du tags 
    with open('labels.txt', 'w') as f:
        for label in label_encoder.classes_:
            f.write(f"{label}\n")

    return df, label_encoder

# Chargement et prétraitement des données
df = pd.read_csv('QueryResults.csv')
df = tag_cleaning(df)
df, label_encoder = encode_labels(df)

# /!\ Toutes les etapes avant la pipeline doivent etre des transformation de données 

# Création de la pipeline sklearn

pipeline = Pipeline(steps=[
    ('preprocess', FunctionTransformer(func=preprocess_and_transform, validate=False)),
    ('join_tokens', FunctionTransformer(func=join_tokens, validate=False)), 
    ('combine', FunctionTransformer(func=combine_text_columns, validate=False)),
    ('tfidf', TfidfVectorizer()),
    ('lr', LogisticRegression(max_iter=1000)) # /!\ on ne peut pas mettre inverse transform dans la pipeline, il faut trouver un autre moyen
    ])

# Préparer les données pour l'entraînement
X = df[['Title', 'Body']]
y = df['encoded_tags']

# Entraîner le modèle
pipeline.fit(X, y)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\bleye\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\bleye\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bleye\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\bleye\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
  text = BeautifulSoup(text, "html.parser").get_text()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
 

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# Diviser les données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Entraîner le modèle
pipeline.fit(X_train, y_train)

# Faire des prédictions sur l'ensemble de test
y_pred = pipeline.predict(X_test)

# Afficher les métriques de performance
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))


Classification Report:
               precision    recall  f1-score   support

      asp.net       0.76      0.67      0.71       345
            c       0.63      0.15      0.24       145
           c#       0.61      0.83      0.70      1259
          c++       0.70      0.71      0.70       533
         html       0.76      0.57      0.65       125
         java       0.90      0.81      0.85       706
   javascript       0.85      0.86      0.86       401
          php       0.94      0.79      0.86       348
       python       0.95      0.85      0.89       414
ruby-on-rails       0.89      0.73      0.80       118
          sql       0.69      0.74      0.71       287
   sql-server       0.66      0.51      0.57       176
visual-studio       0.69      0.63      0.66       117
      windows       0.63      0.50      0.56       152
          wpf       0.79      0.57      0.66       125

     accuracy                           0.74      5251
    macro avg       0.76      0.66      

# Faire une prediction

In [3]:
# Création un Df pour tester des nouvelles données
new_data = pd.DataFrame({
    'Title': ['help ! my function not working'],
    'Body': ['i need help with my python function']
})

# Faire une prédiction avec la pipeline
predictions = pipeline.predict(new_data)

# Décoder les étiquettes prédites
decoded_predictions = label_encoder.inverse_transform(predictions)

# Afficher les résultats
print(decoded_predictions)

['python']


In [5]:
import cloudpickle

# Enregistrer la pipeline
with open('pipeline.pkl', 'wb') as f:
    cloudpickle.dump(pipeline, f)

# Charger la pipeline
with open('pipeline.pkl', 'rb') as f:
    loaded_pipeline = cloudpickle.load(f)


new_data = pd.DataFrame({
    'Title': ['help ! my function not working'],
    'Body': ['i need help with my python function']
})

# Utiliser la pipeline chargée
X_transformed = loaded_pipeline.predict(new_data)
print(X_transformed)

[8]


### Afficher la doc d'une fonction 

In [23]:
help(encode_labels)

Help on function encode_labels in module __main__:

encode_labels(df)



# Enregistrer la pipeline dans mlflow

In [6]:
import mlflow
import mlflow.sklearn
import mlflow
import mlflow.sklearn
from sklearn.pipeline import Pipeline

# Configurer le tracking URI de MLflow
mlflow.set_tracking_uri("http://localhost:5000")

# Démarrer une nouvelle expérience MLflow
mlflow.set_experiment("Text Classification")

with mlflow.start_run():
    # Entraîner la pipeline
    pipeline.fit(X, y)
    
    # Enregistrer la pipeline dans MLflow
    mlflow.sklearn.log_model(pipeline, "model")

    # Enregistrer les paramètres et les métriques
    mlflow.log_params({"vectorizer": "TF-IDF", "classifier": "Logistic Regression"})
    mlflow.log_metric("accuracy", pipeline.score(X, y))
    
    print("Modèle enregistré avec succès dans MLflow.")


2024/09/19 14:32:49 INFO mlflow.tracking.fluent: Experiment with name 'Text Classification' does not exist. Creating a new experiment.
The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh(<full-path-to-git-executable>)

All git commands will error until this is rectified.

This initial message can be silenced or aggravated in the future by setting the
$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - quiet|q|silence|s|silent|none|n|0: for no message or exception
    - error|e|exception|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  datas

Modèle enregistré avec succès dans MLflow.


http://127.0.0.1:5000/#/experiments/849489080888736322/runs/f456f5602e9648d98bed7f4729ccd337

In [31]:
# from evidently.report import Report
# from evidently.metric_preset import DataDriftPreset

# # Générer un rapport de dérive de données avec Evidently
# report = Report(metrics=[DataDriftPreset()])
# report.run(reference_data=X_train, current_data=X_test)
# report.save_html("pipeline_data_drift_report.html")

# # Enregistrer le rapport Evidently dans MLflow
# mlflow.log_artifact("pipeline_data_drift_report.html")
