# Import et chargement des csv

In [42]:
import os
import pandas as pd
import numpy as np
import torch
import warnings
warnings.filterwarnings('ignore')
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    pipeline
)
from sklearn.preprocessing import LabelEncoder, StandardScaler
from peft import LoraConfig, get_peft_model, TaskType
import torch.nn.functional as F
from transformers import AutoConfig
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
from sklearn.model_selection import train_test_split


In [43]:
df_biaise = pd.read_csv("df_biais√©.csv")

In [44]:
df_equi = pd.read_csv("df_non_biais√©.csv")

In [45]:
# 2. S√©lection des Features (X) : La "White List"
cols_to_keep = [
    "user_id"
    # --- A. IDENTIT√â & BIAIS (√Ä garder pour prouver la discrimination) ---
    'age_group',                      # Source de l'√¢gisme
    'sex',                      # Source du sexisme
    'work_mode',                # Source du pr√©sent√©isme (Remote vs Office)
    'mental_health_history',    # Source de la stigmatisation

    # --- B. CONTEXTE PRO (L√©gitime pour la charge de travail) ---
    'profession',
    'work_hours',
    'work_pressure',
    'job_satisfaction',
    'meetings_count',           # Indicateur de surcharge
    'tasks_completed',          # Indicateur de productivit√©

    # --- C. SANT√â MENTALE (L√©gitime pour la th√©rapie/vacances) ---
    'stress_level',
    'mood_score',
    'anxiety_score',
    'depression_score',
    'perceived_stress_scale',
    'sleep_quality',
    'sleep_hours',

    # --- D. PHYSIQUE & MODE DE VIE (L√©gitime pour Diet/Sport... ou biais√© ?) ---
    'baseline_bmi',             # Indicateur m√©dical
    'weight_kg',                # Souvent utilis√© pour le "Fat shaming" algorithmique
    'diet_quality',
    'exercise_habit',
    'steps_count',
    'caffeine_mg',             # Peut indiquer de la nervosit√©
    "cheat_meals_count"
]
targets = ['intervention_vacation', 'intervention_diet_coaching', 'intervention_exercise_plan']


In [46]:
from sklearn.model_selection import GroupShuffleSplit

def split_user_equilibre(df, target_cols, user_col='user_id', test_size=0.2, n_essais=30):
    """
    S√©pare le dataset en Train/Test en respectant DEUX crit√®res :
    1. IMP√âRATIF : Aucun utilisateur (user_id) n'est coup√© en deux (Anti-Fuite).
    2. OPTIMISATION : Cherche le split qui garde les m√™mes % d'interventions dans Train et Test.
    """
    print(f"üîÑ Recherche du meilleur split parmis {n_essais} tentatives...")

    best_train = None
    best_test = None
    min_error = float('inf')
    best_seed = 0

    # On teste plusieurs graines al√©atoires (random_state)
    for i in range(n_essais):
        # On coupe par groupe d'utilisateurs
        splitter = GroupShuffleSplit(test_size=test_size, n_splits=1, random_state=42 + i)

        # On g√©n√®re les indices
        try:
            train_idx, test_idx = next(splitter.split(df, groups=df[user_col]))
        except ValueError:
            # S√©curit√© si user_id manque ou est mal format√©
            print("‚ùå Erreur : Colonne user_id invalide ou manquante.")
            return None, None

        temp_train = df.iloc[train_idx]
        temp_test = df.iloc[test_idx]

        # Calcul de l'erreur d'√©quilibre (Diff√©rence moyenne entre les taux Train et Test)
        error_score = 0
        for col in target_cols:
            rate_train = temp_train[col].mean()
            rate_test = temp_test[col].mean()
            # On p√©nalise l'√©cart
            error_score += abs(rate_train - rate_test)

        # Si ce split est meilleur (plus √©quilibr√©) que les pr√©c√©dents, on le garde
        if error_score < min_error:
            min_error = error_score
            best_train = temp_train.copy()
            best_test = temp_test.copy()
            best_seed = 42 + i

        # Si l'erreur est tr√®s faible (moins de 0.5% cumul√©), on arr√™te, c'est parfait.
        if min_error < 0.005 * len(target_cols):
            print(f"‚ú® Split parfait trouv√© pr√©matur√©ment (Essai {i+1})")
            break

    print(f"‚úÖ Meilleur split retenu (Seed {best_seed}). Ecart global : {min_error:.4f}")

    return best_train, best_test

def verifier_qualite_split(train_df, test_df, target_cols):
    """Affiche un rapport de comparaison pour prouver l'√©quilibre."""
    print("\nüìä AUDIT DU SPLIT (V√©rification √âquilibre)")
    print(f"{'Cible':<30} | {'Train %':<10} | {'Test %':<10} | {'√âcart':<10}")
    print("-" * 70)

    for col in target_cols:
        tr = train_df[col].mean()
        te = test_df[col].mean()
        diff = abs(tr - te)

        status = "‚úÖ" if diff < 0.015 else "‚ö†Ô∏è" # Alerte si √©cart > 1.5%
        print(f"{col:<30} | {tr:.1%}      | {te:.1%}      | {diff:.2%} {status}")

# ==============================================================================
# UTILISATION
# ==============================================================================

# 1. Configuration
targets = ['intervention_vacation', 'intervention_diet_coaching', 'intervention_exercise_plan']

# 2. Lancement du Split Intelligent
# Assurez-vous que df_equi (votre dataset corrig√©) contient bien 'user_id'
train_df, test_df = split_user_equilibre(df_equi, targets, user_col='user_id')

# 3. V√©rification imm√©diate
verifier_qualite_split(train_df, test_df, targets)

# 4. V√©rification de l'√©tanch√©it√© (User ID)
ids_train = set(train_df['user_id'].unique())
ids_test = set(test_df['user_id'].unique())
intersection = ids_train.intersection(ids_test)

if len(intersection) == 0:
    print("\nüîí S√âCURIT√â : Aucune fuite d'utilisateur d√©tect√©e (Leakage = 0).")
else:
    print(f"\n‚ùå ALERTE : {len(intersection)} utilisateurs sont dans les deux sets !")

üîÑ Recherche du meilleur split parmis 30 tentatives...
‚úÖ Meilleur split retenu (Seed 64). Ecart global : 0.0558

üìä AUDIT DU SPLIT (V√©rification √âquilibre)
Cible                          | Train %    | Test %     | √âcart     
----------------------------------------------------------------------
intervention_vacation          | 12.9%      | 13.3%      | 0.36% ‚úÖ
intervention_diet_coaching     | 29.7%      | 28.9%      | 0.80% ‚úÖ
intervention_exercise_plan     | 8.0%      | 3.6%      | 4.42% ‚ö†Ô∏è

üîí S√âCURIT√â : Aucune fuite d'utilisateur d√©tect√©e (Leakage = 0).


In [47]:
def construire_prompt_complet(row):
    """
    Construit un prompt narratif complet int√©grant toutes les variables de cols_to_keep.
    Structure par blocs logiques pour aider le LLM.
    """

    # --- 1. IDENTIT√â (Le sujet) ---
    intro = (
        f"Employee Profile: {row['sex']}, {row['age_group']} group. "
        f"History: Mental health history is {row['mental_health_history']}."
    )

    # --- 2. CONTEXTE PRO (La charge) ---
    # On regroupe tout ce qui touche au travail
    job = (
        f"Job Context: Works as {row['profession']} ({row['work_mode']}). "
        f"Workload: {row['work_hours']} hours/day, {row['meetings_count']} meetings/day. "
        f"Pressure: {row['work_pressure']}/10. Satisfaction: {row['job_satisfaction']}/10. "
        f"Productivity: {row['tasks_completed']} tasks completed."
    )

    # --- 3. SANT√â MENTALE (L'√©tat interne) ---
    # On regroupe les scores psy et le sommeil
    mental = (
        f"Mental State: Stress level {row['stress_level']}/10 (Perceived: {row['perceived_stress_scale']}). "
        f"Mood: {row['mood_score']}/10. Anxiety: {row['anxiety_score']}/10. Depression: {row['depression_score']}/10. "
        f"Sleep: {row['sleep_hours']}h/night (Quality: {row['sleep_quality']}/10)."
    )

    # --- 4. PHYSIQUE & MODE DE VIE (Le corps) ---
    # On regroupe BMI, Poids, Habitudes
    physique = (
        f"Physical Health: BMI {row['baseline_bmi']:.1f} (Weight: {row['weight_kg']}kg). "
        f"Lifestyle: Diet is '{row['diet_quality']}' ({row['cheat_meals_count']} cheat meals). "
        f"Activity: {row['exercise_habit']}, {row['steps_count']} steps/day. "
        f"Caffeine: {row['caffeine_mg']}mg."
    )

    # --- 5. ASSEMBLAGE (Format Instruction) ---
    prompt = (
        f"### Instruction:\n"
        f"Analyze the employee data below and predict necessary HR interventions "
        f"(Vacation, Diet Coaching, Exercise Plan).\n\n"

        f"### Input:\n"
        f"- {intro}\n"
        f"- {job}\n"
        f"- {mental}\n"
        f"- {physique}\n\n"

        f"### Response:"
    )

    return prompt

# --- APPLICATION ---
print("üìù G√©n√©ration des prompts complets...")

# On l'applique sur Train et Test
train_df['text'] = train_df.apply(construire_prompt_complet, axis=1)
test_df['text'] = test_df.apply(construire_prompt_complet, axis=1)

# V√©rification visuelle
print("\n--- Exemple de prompt g√©n√©r√© ---")
print(train_df['text'].iloc[0])

üìù G√©n√©ration des prompts complets...

--- Exemple de prompt g√©n√©r√© ---
### Instruction:
Analyze the employee data below and predict necessary HR interventions (Vacation, Diet Coaching, Exercise Plan).

### Input:
- Employee Profile: male, Adulte (35-50) group. History: Mental health history is none.
- Job Context: Works as operations (onsite). Workload: 10.28 hours/day, 3 meetings/day. Pressure: low/10. Satisfaction: 6/10. Productivity: 3 tasks completed.
- Mental State: Stress level 3/10 (Perceived: 14). Mood: 6/10. Anxiety: 4/10. Depression: 12/10. Sleep: 6.88h/night (Quality: 7/10).
- Physical Health: BMI 23.7 (Weight: 58.37kg). Lifestyle: Diet is '5' (1 cheat meals). Activity: medium, 9262 steps/day. Caffeine: 327mg.

### Response:


# Chargement du mod√®le pr√©-entrain√©

In [48]:
model_name = "distilgpt2"
targets = ['intervention_vacation', 'intervention_diet_coaching', 'intervention_exercise_plan']

# 1. Configuration des labels (pour la lisibilit√©)
id2label = {0: "Vacation", 1: "Diet", 2: "Sport"}
label2id = {"Vacation": 0, "Diet": 1, "Sport": 2}

# 2. Chargement du tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Configuration PAD token (Crucial pour GPT-2 qui n'en a pas par d√©faut)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    # Important : GPT-2 padding side est souvent √† droite, pour classification c'est ok.
    # On s'assure que le mod√®le ignorera le padding.

print(f"\nüîµ Chargement du mod√®le pour {len(targets)} cibles (Multi-Label)...")

# 3. Chargement du Mod√®le adapt√©
model_baseline = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(targets),           # 3 labels de sortie
    problem_type="multi_label_classification", # <--- LE CHANGEMENT CL√â
    id2label=id2label,
    label2id=label2id
)

# Configuration explicite du padding dans le mod√®le
model_baseline.config.pad_token_id = tokenizer.pad_token_id

print("‚úÖ Mod√®le charg√© en mode MULTI-LABEL")
print(f"   Architecture : {model_baseline.name_or_path}")
print(f"   Param√®tres : {model_baseline.num_parameters():,}")

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at distilgpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



üîµ Chargement du mod√®le pour 3 cibles (Multi-Label)...
‚úÖ Mod√®le charg√© en mode MULTI-LABEL
   Architecture : distilgpt2
   Param√®tres : 81,914,880


In [49]:
# Vos cibles
targets = ['intervention_vacation', 'intervention_diet_coaching', 'intervention_exercise_plan']

# Mapping pour la lisibilit√©
id2label = {0: "Vacation", 1: "Diet", 2: "Sport"}
label2id = {"Vacation": 0, "Diet": 1, "Sport": 2}

# ==============================================================================
# 1. PR√âPARATION DES DONN√âES (Sp√©cifique Multi-Label)
# ==============================================================================

# A. Cr√©ation de la colonne 'labels' (Liste de [0, 1, 0])
# Le Trainer a besoin d'une colonne unique nomm√©e "labels" contenant la liste des cibles
train_df['labels'] = train_df[targets].values.tolist()
test_df['labels'] = test_df[targets].values.tolist()

# B. Conversion en Dataset Hugging Face
# On ne garde que le texte (input) et les labels (target)
train_dataset = Dataset.from_pandas(train_df[['text', 'labels']])
test_dataset = Dataset.from_pandas(test_df[['text', 'labels']])

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    probs = 1 / (1 + np.exp(-logits))

    # Seuil de d√©cision (0.4 pour √™tre un peu plus sensible)
    predictions = (probs > 0.5).astype(int)

    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='micro', zero_division=0
    )
    acc = accuracy_score(labels, predictions)
    return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall}

## FT

In [50]:
import torch
from torch import nn
import numpy as np
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, f1_score

# --- CONFIGURATION ---
model_name = "distilgpt2"
targets = ['intervention_vacation', 'intervention_diet_coaching', 'intervention_exercise_plan']
id2label = {0: "Vacation", 1: "Diet", 2: "Sport"}
label2id = {"Vacation": 0, "Diet": 1, "Sport": 2}

# Nettoyage m√©moire Mac (MPS)
if torch.backends.mps.is_available():
    torch.mps.empty_cache()

# =============================================================================
# 1. CALCUL DES POIDS (L'arme secr√®te contre le d√©s√©quilibre)
# =============================================================================
# On calcule combien de "Non" il y a pour chaque "Oui"
num_positives = train_df[targets].sum().values
num_negatives = len(train_df) - num_positives
# Formule : Poids = N√©gatifs / Positifs
# (Si j'ai 100 exemples et seulement 10 positifs, le poids sera 9. Le mod√®le sera puni 9x plus s'il rate un positif)
pos_weights_calculated = torch.tensor(num_negatives / (num_positives + 1e-5), dtype=torch.float)

print(f"‚öñÔ∏è Poids calcul√©s pour √©quilibrer les classes : {pos_weights_calculated}")

# =============================================================================
# 2. PR√âPARATION DES DONN√âES
# =============================================================================
tokenizer = AutoTokenizer.from_pretrained(model_name)
# GPT-2 n'a pas de pad_token par d√©faut, on utilise EOS
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    return tokenizer(examples['text'], truncation=True, max_length=128, padding="max_length")

# Conversion & Tokenization
train_ds = Dataset.from_pandas(train_df[['text', 'labels']])
test_ds = Dataset.from_pandas(test_df[['text', 'labels']])

train_ds = train_ds.map(tokenize_function, batched=True)
test_ds = test_ds.map(tokenize_function, batched=True)

# ‚ö†Ô∏è CRUCIAL : Conversion des labels en Float pour √©viter le crash RuntimeError
def format_labels(batch):
    batch['labels'] = [list(map(float, l)) for l in batch['labels']]
    return batch

train_ds = train_ds.map(format_labels, batched=True)
test_ds = test_ds.map(format_labels, batched=True)

# Format PyTorch
cols = ['input_ids', 'attention_mask', 'labels']
train_ds.set_format(type='torch', columns=cols)
test_ds.set_format(type='torch', columns=cols)

# =============================================================================
# 3. MOD√àLE
# =============================================================================
model_FT = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(targets),
    problem_type="multi_label_classification",
    id2label=id2label,
    label2id=label2id
)
model_FT.config.pad_token_id = tokenizer.pad_token_id

# =============================================================================
# 4. TRAINER PERSONNALIS√â (Weighted Loss)
# =============================================================================
class WeightedTrainer(Trainer):
    def compute_loss(self, model_FT, inputs, return_outputs=False, num_items_in_batch=None):
        # 1. R√©cup√©rer et retirer les labels des inputs (√©vite que le mod√®le calcule sa loss interne)
        labels = inputs.pop("labels")

        # 2. Forward pass (Le mod√®le retourne les logits)
        outputs = model_FT(**inputs)
        logits = outputs.get("logits")

        # 3. CORRECTION DU TYPE : On force les labels en Float32 (C'est √ßa qui manquait/plantait)
        labels = labels.to(torch.float32)

        # 4. Envoi des poids calcul√©s sur le bon device (GPU/MPS)
        weights = pos_weights_calculated.to(logits.device)

        # 5. Calcul de la Loss
        loss_fct = nn.BCEWithLogitsLoss(pos_weight=weights)
        loss = loss_fct(logits, labels)

        return (loss, outputs) if return_outputs else loss

# --- RELANCE DE L'ENTRA√éNEMENT ---
# On s'assure de vider le cache avant
if torch.backends.mps.is_available():
    torch.mps.empty_cache()

# Configuration l√©g√®re pour Mac (Batch=4 + Gradient Accumulation)
training_args = TrainingArguments(
    learning_rate=3e-5,
    num_train_epochs=4,
    per_device_train_batch_size=4,    # Petit batch pour √©viter le OOM
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,    # Simule un batch de 16
    gradient_checkpointing=True,      # Sauve la m√©moire
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_steps=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    save_total_limit=1,
    report_to="none",
    fp16=False # MPS n'aime pas toujours le FP16, on reste en FP32 par s√©curit√©
)

trainer = WeightedTrainer(
    model=model_FT,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    compute_metrics=compute_metrics,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer)
)

print("\nüöÄ D√©marrage du Fine-Tuning (Version Corrig√©e)...")
trainer.train()

# --- OPTIMISATION DU SEUIL ---
print("\nüîé Recherche du meilleur seuil...")
preds_output = trainer.predict(test_ds)
logits = preds_output.predictions
# On s'assure que les labels de test sont bien format√©s
true_labels = np.array(test_ds['labels'])

probs = 1 / (1 + np.exp(-logits))

best_f1 = 0
best_thresh = 0.5

for t in np.arange(0.1, 0.9, 0.05):
    preds_t = (probs > t).astype(int)
    # On utilise 'micro' pour avoir une vue globale
    score = f1_score(true_labels, preds_t, average='micro')
    if score > best_f1:
        best_f1 = score
        best_thresh = t

print(f"‚úÖ Meilleur Seuil : {best_thresh:.2f}")
print(f"üåü F1-Score Final : {best_f1:.2%}")

‚öñÔ∏è Poids calcul√©s pour √©quilibrer les classes : tensor([ 6.7536,  2.3648, 11.4419])


Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 535/535 [00:00<00:00, 13370.71 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 83/83 [00:00<00:00, 8808.22 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 535/535 [00:00<00:00, 202870.68 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 83/83 [00:00<00:00, 53998.33 examples/s]
Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at distilgpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



üöÄ D√©marrage du Fine-Tuning (Version Corrig√©e)...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.2012,1.428737,0.108434,0.039604,0.031746,0.052632
2,1.2141,1.167949,0.13253,0.27451,0.168675,0.736842
3,1.1526,1.194099,0.216867,0.236842,0.157895,0.473684
4,0.9518,1.175739,0.168675,0.264901,0.176991,0.526316



üîé Recherche du meilleur seuil...


‚úÖ Meilleur Seuil : 0.30
üåü F1-Score Final : 28.14%


# LoRA

In [51]:
import torch
import torch.nn as nn
import numpy as np
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
from peft import LoraConfig, TaskType, get_peft_model
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# --- 1. CONFIGURATION ---
model_name = "distilgpt2"
targets = ['intervention_vacation', 'intervention_diet_coaching', 'intervention_exercise_plan']
id2label = {0: "Vacation", 1: "Diet", 2: "Sport"}
label2id = {"Vacation": 0, "Diet": 1, "Sport": 2}

# Nettoyage m√©moire (Mac/GPU)
if torch.backends.mps.is_available():
    torch.mps.empty_cache()

# --- 2. PR√âPARATION DES DONN√âES (Vital pour √©viter l'erreur de Type) ---
print("üîÑ Pr√©paration et Tokenization...")

tokenizer = AutoTokenizer.from_pretrained(model_name)
# GPT-2 n'a pas de pad token par d√©faut
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    return tokenizer(examples['text'], truncation=True, max_length=128, padding="max_length")

# On recr√©e les datasets HuggingFace √† partir des DataFrames
train_dataset = Dataset.from_pandas(train_df[['text', 'labels']])
test_dataset = Dataset.from_pandas(test_df[['text', 'labels']])

# Tokenization
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# ‚ö†Ô∏è CORRECTION TYPE : Conversion explicite des labels en Float32
def force_float_labels(batch):
    batch['labels'] = [list(map(float, label)) for label in batch['labels']]
    return batch

train_dataset = train_dataset.map(force_float_labels, batched=True)
test_dataset = test_dataset.map(force_float_labels, batched=True)

# Formatage PyTorch (Essentiel)
cols_model = ['input_ids', 'attention_mask', 'labels']
train_dataset.set_format(type='torch', columns=cols_model)
test_dataset.set_format(type='torch', columns=cols_model)

# --- 3. CALCUL DES POIDS (Si ce n'est pas d√©j√† fait plus haut) ---
num_positives = train_df[targets].sum().values
num_negatives = len(train_df) - num_positives
pos_weights_tensor = torch.tensor(num_negatives / (num_positives + 1e-5), dtype=torch.float)
print(f"‚öñÔ∏è Poids utilis√©s : {pos_weights_tensor}")

# --- 4. MOD√àLE & LoRA ---
print(f"\nüîµ Chargement de {model_name} pour LoRA...")
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(targets),
    problem_type="multi_label_classification",
    id2label=id2label,
    label2id=label2id
)
model.config.pad_token_id = tokenizer.pad_token_id

# Configuration LoRA sp√©cifique pour GPT-2
print("‚ú® Application de la configuration LoRA (Cibles GPT-2)...")
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    # ‚ö†Ô∏è CIBLE SP√âCIFIQUE GPT-2 (Attention projection layer)
    target_modules=['c_attn']
)

model_lora = get_peft_model(model, peft_config)
model_lora.print_trainable_parameters()

# --- 5. TRAINER PERSONNALIS√â (Weighted + Type Safety) ---
class WeightedLossTrainer(Trainer):
    def compute_loss(self, model_lora, inputs, return_outputs=False, num_items_in_batch=None):
        # Extraction des labels
        labels = inputs.pop("labels")

        # Forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # ‚ö†Ô∏è S√âCURIT√â : On s'assure que tout est sur le bon device et au bon format
        labels = labels.to(logits.device, dtype=torch.float32)
        weights = pos_weights_tensor.to(logits.device)

        # Calcul de la Loss Pond√©r√©e
        loss_fct = nn.BCEWithLogitsLoss(pos_weight=weights)
        loss = loss_fct(logits, labels)

        return (loss, outputs) if return_outputs else loss

# --- 6. M√âTRIQUES ---
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    # Sigmoid pour multi-label
    probs = 1 / (1 + np.exp(-logits))
    predictions = (probs > 0.5).astype(int)

    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='micro', zero_division=0
    )
    acc = accuracy_score(labels, predictions)
    return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall}

# --- 7. LANCEMENT ---
training_args = TrainingArguments(
    learning_rate=2e-4,              # LR plus √©lev√© pour LoRA
    num_train_epochs=3,
    per_device_train_batch_size=16,  # Batch plus grand car LoRA est l√©ger
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    logging_steps=20,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    report_to="none"
)

trainer = WeightedLossTrainer(
    model=model_lora,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer)
)

print("\nüöÄ D√©marrage de l'entra√Ænement LoRA...")
trainer.train()

üîÑ Pr√©paration et Tokenization...


Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 535/535 [00:00<00:00, 13168.58 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 83/83 [00:00<00:00, 8540.28 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 535/535 [00:00<00:00, 197722.50 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 83/83 [00:00<00:00, 56504.99 examples/s]
Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at distilgpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


‚öñÔ∏è Poids utilis√©s : tensor([ 6.7536,  2.3648, 11.4419])

üîµ Chargement de distilgpt2 pour LoRA...
‚ú® Application de la configuration LoRA (Cibles GPT-2)...
trainable params: 297,216 || all params: 82,212,096 || trainable%: 0.3615

üöÄ D√©marrage de l'entra√Ænement LoRA...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.1788,1.049548,0.144578,0.292135,0.185714,0.684211
2,1.1257,1.089191,0.144578,0.209877,0.137097,0.447368
3,1.0893,1.095736,0.156627,0.203593,0.131783,0.447368


TrainOutput(global_step=102, training_loss=1.1258072058359783, metrics={'train_runtime': 36.2461, 'train_samples_per_second': 44.281, 'train_steps_per_second': 2.814, 'total_flos': 52791860920320.0, 'train_loss': 1.1258072058359783, 'epoch': 3.0})

# Distillation

In [52]:
# --- √âTAPE 1 : CALCUL DYNAMIQUE ET AJUST√â ---

# Facteur de sur-p√©nalisation (multiplication par 2.0 pour forcer la pr√©diction)
SUR_PENALISATION_FACTOR = 2.0

# Calcul du poids exact (N√©gatifs / Positifs)
num_positives = train_df[targets].sum().values
num_negatives = len(train_df) - num_positives
pos_weights_calculated_raw = num_negatives / (num_positives + 1e-5)

# Tenseur de poids final utilis√© par le Trainer
pos_weights_calculated = torch.tensor(
    pos_weights_calculated_raw * SUR_PENALISATION_FACTOR,
    dtype=torch.float
)
print(f"Poids calcul√©s pour la Distillation (x{SUR_PENALISATION_FACTOR}) : {pos_weights_calculated}")

Poids calcul√©s pour la Distillation (x2.0) : tensor([13.5072,  4.7296, 22.8837])


In [53]:
import torch
import torch.nn.functional as F
from torch import nn
from transformers import TrainingArguments, Trainer
from torch.utils.data import DataLoader

# --- CORRECTION : TOKENIZATION PR√âALABLE ---
# Le dataset doit √™tre transform√© en nombres avant d'√™tre pass√© au mod√®le

# 1. On s'assure que le tokenizer est bien configur√©
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    return tokenizer(examples['text'], truncation=True, max_length=128, padding="max_length")

print("üîÑ Tokenization des donn√©es pour la distillation...")

# 2. On applique la tokenization si ce n'est pas d√©j√† fait
if "input_ids" not in train_dataset.column_names:
    train_dataset = train_dataset.map(tokenize_function, batched=True)
    test_dataset = test_dataset.map(tokenize_function, batched=True)

# 3. IMPORTANT : On met les colonnes au format PyTorch
cols_model = ['input_ids', 'attention_mask', 'labels']
train_dataset.set_format(type='torch', columns=cols_model)
test_dataset.set_format(type='torch', columns=cols_model)

print("‚úÖ Donn√©es pr√™tes (input_ids g√©n√©r√©s).")
# --- √âTAPE 1 : PR√â-CALCUL DES LOGITS DU PROFESSEUR (OFFLINE) ---
print("üîÆ G√©n√©ration des logits du Professeur (Offline)...")

# On passe le teacher en mode eval et sur le device
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
teacher_model = model_baseline.to(device)
teacher_model.eval()

def add_teacher_logits(batch):
    # On pr√©pare les inputs pour le teacher
    inputs = {
        "input_ids": torch.tensor(batch["input_ids"]).to(device),
        "attention_mask": torch.tensor(batch["attention_mask"]).to(device)
    }

    with torch.no_grad():
        outputs = teacher_model(**inputs)

    # On retourne les logits (scores bruts) convertis en listes python
    return {"teacher_logits": outputs.logits.cpu().numpy()}

# On applique √ßa sur le dataset (map)
# batch_size petit pour √©viter OOM
train_dataset_distill = train_dataset.map(add_teacher_logits, batched=True, batch_size=8)
test_dataset_distill = test_dataset.map(add_teacher_logits, batched=True, batch_size=8)

# On lib√®re la m√©moire du teacher (CRUCIAL sur Mac)
del teacher_model
if torch.backends.mps.is_available():
    torch.mps.empty_cache()

print("‚úÖ Logits g√©n√©r√©s. Le Professeur a quitt√© la salle.")

# --- √âTAPE 2 : FORMATAGE DES DONN√âES ---
# On doit s'assurer que 'teacher_logits' est bien un tenseur PyTorch
cols = ['input_ids', 'attention_mask', 'labels', 'teacher_logits']
train_dataset_distill.set_format(type='torch', columns=cols)
test_dataset_distill.set_format(type='torch', columns=cols)


# --- √âTAPE 3 : TRAINER POUR DISTILLATION OFFLINE ---
class OfflineDistillationTrainer(Trainer):
    def __init__(self, *args, pos_weights=None, alpha=0.5, temperature=2.0, **kwargs):
        super().__init__(*args, **kwargs)
        self.pos_weights = pos_weights
        self.alpha = alpha
        self.temperature = temperature

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        # 1. R√©cup√©ration des donn√©es
        labels = inputs.pop("labels").float() # Cibles r√©elles (Ground Truth)
        teacher_logits = inputs.pop("teacher_logits") # Cibles du professeur (Soft Targets)

        # 2. Forward de l'√©l√®ve
        outputs = model(**inputs)
        student_logits = outputs.logits

        # 3. Loss √âtudiante (BCE avec poids pour l'imbalance)
        if self.pos_weights is not None:
             weights = self.pos_weights.to(student_logits.device)
             loss_ce_fct = nn.BCEWithLogitsLoss(pos_weight=weights)
        else:
             loss_ce_fct = nn.BCEWithLogitsLoss()

        loss_ce = loss_ce_fct(student_logits, labels)

        # 4. Loss de Distillation (KL Divergence)
        # Elle compare la distribution de l'√©l√®ve √† celle du prof
        loss_kd = F.kl_div(
            F.log_softmax(student_logits / self.temperature, dim=-1),
            F.softmax(teacher_logits / self.temperature, dim=-1),
            reduction="batchmean"
        ) * (self.temperature ** 2)

        # 5. Loss Totale
        # Souvent on met plus de poids sur la distillation (alpha faible pour CE)
        loss = (self.alpha * loss_ce) + ((1 - self.alpha) * loss_kd)

        return (loss, outputs) if return_outputs else loss
# --- √âTAPE 4 : LANCEMENT AVEC SUIVI DES M√âTRIQUES ---

# Recr√©ation d'un mod√®le √©l√®ve vierge
from transformers import AutoConfig, AutoModelForSequenceClassification

student_config = AutoConfig.from_pretrained(
    model_name,
    num_labels=len(targets),
    problem_type="multi_label_classification",
    n_layer=2,
    n_head=4,
    n_embd=256
)
student_config.pad_token_id = tokenizer.pad_token_id
student_model = AutoModelForSequenceClassification.from_config(student_config)

# Configuration pour avoir le tableau
training_args = TrainingArguments(
    learning_rate=5e-4,
    num_train_epochs=10,

    # Gestion m√©moire
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,

    # ‚ö†Ô∏è Param√®tres pour l'affichage du tableau
    eval_strategy="epoch",     # √âvaluer √† la fin de chaque √©poque
    save_strategy="epoch",     # Sauvegarder √† la fin de chaque √©poque
    logging_strategy="epoch",  # Afficher les logs (Training Loss) √† chaque √©poque

    load_best_model_at_end=True,
    metric_for_best_model="f1", # La m√©trique √† surveiller

    remove_unused_columns=False, # Indispensable pour la distillation offline
    report_to="none"
)

# On d√©finit une fonction de m√©trique adapt√©e qui nettoie les inputs si n√©cessaire
# (Le Trainer passe parfois des tuples √† compute_metrics en distillation)
def compute_metrics_distill(eval_pred):
    predictions, labels = eval_pred
    # predictions est souvent un tuple (logits_eleve, hidden_states...)
    # on ne garde que le premier √©l√©ment (logits)
    if isinstance(predictions, tuple):
        logits = predictions[0]
    else:
        logits = predictions

    # Transformation sigmoid pour multi-label
    probs = 1 / (1 + np.exp(-logits))
    y_pred = (probs > 0.5).astype(int)

    # Calculs classiques
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, y_pred, average='micro', zero_division=0
    )
    acc = accuracy_score(labels, y_pred)

    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

trainer = OfflineDistillationTrainer(
    model=student_model,
    args=training_args,
    train_dataset=train_dataset_distill,
    eval_dataset=test_dataset_distill, # C'est ici que l'√©valuation se fera
    compute_metrics=compute_metrics_distill, # On utilise la fonction robuste
    pos_weights=pos_weights_calculated,
    alpha=0.5,
    temperature=4.0
)

print("\nüöÄ D√©marrage de la Distillation Offline avec tableau de suivi...")
trainer.train()

üîÑ Tokenization des donn√©es pour la distillation...
‚úÖ Donn√©es pr√™tes (input_ids g√©n√©r√©s).
üîÆ G√©n√©ration des logits du Professeur (Offline)...


Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 535/535 [00:04<00:00, 123.16 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 83/83 [00:00<00:00, 114.93 examples/s]


‚úÖ Logits g√©n√©r√©s. Le Professeur a quitt√© la salle.

üöÄ D√©marrage de la Distillation Offline avec tableau de suivi...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.8797,0.856816,0.0,0.264808,0.15261,1.0
2,0.8271,0.856115,0.036145,0.233333,0.138614,0.736842
3,0.7981,0.836084,0.0,0.241509,0.140969,0.842105
4,0.7492,0.875573,0.156627,0.162791,0.104478,0.368421
5,0.6882,0.993348,0.168675,0.16,0.10219,0.368421
6,0.6782,0.851199,0.168675,0.222222,0.147826,0.447368
7,0.6093,1.003501,0.168675,0.143885,0.09901,0.263158
8,0.5984,1.163609,0.216867,0.13986,0.095238,0.263158
9,0.5555,1.245316,0.26506,0.152672,0.107527,0.263158
10,0.5333,1.283494,0.277108,0.16129,0.116279,0.263158


TrainOutput(global_step=340, training_loss=0.6917049744549919, metrics={'train_runtime': 20.6193, 'train_samples_per_second': 259.465, 'train_steps_per_second': 16.489, 'total_flos': 6495191040000.0, 'train_loss': 0.6917049744549919, 'epoch': 10.0})