In [40]:
import pandas as pd

# 1. Load raw CSV
df_raw = pd.read_csv("train_data.csv")

# 2. Clean string fields
df_raw['Formations'] = df_raw['Formations'].apply(lambda x: [f.strip() for f in x.split(',')])
df_raw['Experience_Professionnelle'] = df_raw['Experience_Professionnelle'].map({'Yes': 1, 'No': 0})
df_raw['Sexe'] = df_raw['Sexe'].map({'Homme': 0, 'Femme': 1})

#rename Filiére to 'Filiere' for consistency
df_raw.rename(columns={'Filiére': 'Filiere'}, inplace=True)
# 3. One-hot encode Cellule
df_raw = pd.get_dummies(df_raw, columns=['Cellule'], drop_first=False)

# 4. Drop ID
df_raw = df_raw.drop(columns=['ID_Membre'])

# 5. Save raw cleaned formations before exploding
df_cleaned = df_raw.copy()

# Show shape
print("Cleaned dataframe shape before explosion:", df_cleaned.shape)
# 1. Explode the 'Formations' column into multiple rows
df_exploded = df_cleaned.explode('Formations').reset_index(drop=True)

# 2. Save the formation column separately for clarity
df_exploded['Formation'] = df_exploded['Formations']
df_exploded = df_exploded.drop(columns=['Formations'])

# 3. Check for remaining object (string) columns — we should only have 'Filière' as string
print("Object columns remaining (should only be Filière and Formation):")
print(df_exploded.select_dtypes(include='object').columns.tolist())
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

# Step 1: Define valid Filières
valid_filieres = ['GL', 'RT', 'IIA', 'IMI', 'BIO', 'CH', 'MPI', 'CBA']
mask_valid = df_exploded['Filiere'].isin(valid_filieres)

# Step 2: Build X (drop string columns) and y (Filiére)
X = df_exploded[mask_valid].drop(columns=['Filiere', 'Formation'])
y = df_exploded[mask_valid]['Filiere']

# Step 3: Encode the target labels
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Step 4: Train/test split
X_train, X_val, y_train, y_val = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Step 5: Train model
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Step 6: Evaluate
print("✅ Exploded View Filiere Prediction Report:")
print(classification_report(y_val, clf.predict(X_val), target_names=le.classes_))
# Step 1: Select rows with invalid Filière
mask_invalid = ~df_exploded['Filiere'].isin(valid_filieres)
df_invalid = df_exploded[mask_invalid].copy().reset_index(drop=True)

# Step 2: Prepare X for prediction (drop 'Filiére' and 'Formation')
X_infer = df_invalid.drop(columns=['Filiere', 'Formation'])

# Step 3: Predict
preds_encoded = clf.predict(X_infer)
preds_decoded = le.inverse_transform(preds_encoded)

# Step 4: Show replacements
df_invalid['Filiere_Predicted'] = preds_decoded

print("🔁 Sample replacements:")
print(df_invalid[['Formation', 'Filiere', 'Filiere_Predicted']].head(10))


Cleaned dataframe shape before explosion: (4950, 18)
Object columns remaining (should only be Filière and Formation):
['Filiere', 'Formation']
✅ Exploded View Filiere Prediction Report:
              precision    recall  f1-score   support

         BIO       0.89      0.89      0.89       353
          CH       0.92      0.92      0.92       338
          GL       0.92      0.89      0.91       304
         IIA       0.96      0.90      0.93       315
         IMI       0.91      0.95      0.93       331
         MPI       0.90      0.94      0.92       301
          RT       0.91      0.91      0.91       305

    accuracy                           0.92      2247
   macro avg       0.92      0.92      0.92      2247
weighted avg       0.92      0.92      0.92      2247

🔁 Sample replacements:
          Formation Filiere Filiere_Predicted
0   Risk Management     ART               BIO
1      Negociations     ART               BIO
2         Photoshop     ART               BIO
3   Risk M

In [41]:
# Get mapping Filiere name → ID
filiere_to_id = {name: idx for idx, name in enumerate(le.classes_)}

# Or inverse mapping ID → Filiere
id_to_filiere = {idx: name for idx, name in enumerate(le.classes_)}

# Print nicely
print(le.classes_)
for filiere, idx in filiere_to_id.items():
    print(f"{filiere} --> {idx}")




['BIO' 'CH' 'GL' 'IIA' 'IMI' 'MPI' 'RT']
BIO --> 0
CH --> 1
GL --> 2
IIA --> 3
IMI --> 4
MPI --> 5
RT --> 6


In [42]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

# === STEP 1: Load original data ===
df = pd.read_csv("train_data.csv")
# Load and keep ID_Membre in a separate variable
df_ids = pd.read_csv("train_data.csv", usecols=["ID_Membre"])
df.rename(columns={'Filiére': 'Filiere'}, inplace=True)
# === STEP 2: Preprocessing ===
df['Formations'] = df['Formations'].apply(lambda x: [f.strip() for f in x.split(',')])
df['Experience_Professionnelle'] = df['Experience_Professionnelle'].map({'Yes': 1, 'No': 0})
df['Sexe'] = df['Sexe'].map({'Homme': 0, 'Femme': 1})
df = pd.get_dummies(df, columns=['Cellule'], drop_first=False)
# Save ID into main DataFrame before dropping later
df['ID_Membre'] = df_ids

# === STEP 3: Fix formation spelling errors ===
def clean_formations(formations):
    return ['Negotiation' if f.lower() in ['negociations', 'negotiation'] else f for f in formations]
df['Formations'] = df['Formations'].apply(clean_formations)

# === STEP 4: Inject predicted Filière ===
valid_filieres = ['GL', 'RT', 'IIA', 'IMI', 'BIO', 'CH', 'MPI', 'CBA']
corrected_filiere = df_invalid[['Filiere_Predicted']]
corrected_filiere.index = df_invalid.index
df.loc[corrected_filiere.index, 'Filiere'] = corrected_filiere['Filiere_Predicted']

# === STEP 5: Transform Formations into 3 slots ===
df['Formation_1'] = df['Formations'].apply(lambda x: x[0] if len(x) > 0 else None)
df['Formation_2'] = df['Formations'].apply(lambda x: x[1] if len(x) > 1 else None)
df['Formation_3'] = df['Formations'].apply(lambda x: x[2] if len(x) > 2 else None)
df = df.drop(columns=['Formations'])
from sklearn.preprocessing import LabelEncoder

# Encode 'Filiére' as numeric
le_filiere = LabelEncoder()
df['Filiere_Encoded'] = le_filiere.fit_transform(df['Filiere'])

# Drop original string column (optional, keeps training clean)
df = df.drop(columns=['Filiere'])



print("Final shape:", df.shape)
print(df.head())
# 1. Stack Formation_1, 2, 3 into a single column
df_long = pd.melt(
    df,
    id_vars=[col for col in df.columns if col not in ['Formation_1', 'Formation_2', 'Formation_3']],
    value_vars=['Formation_1', 'Formation_2', 'Formation_3'],
    var_name='Slot',
    value_name='Formation'
)

# 2. Drop rows with missing formations
df_long = df_long[df_long['Formation'].notna()].reset_index(drop=True)
X_with_ids = df_long.copy()

# 3. Preview
print(" Exploded shape:", df_long.shape)
print(df_long[['Formation', 'Slot']].head())
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Step 1: Create feature matrix and label
X_rank = df_long.drop(columns=['Formation', 'Slot'])  # Drop label + slot info
y_rank = df_long['Formation']

# Step 2: Encode the formation label
le_rank = LabelEncoder()
y_rank_encoded = le_rank.fit_transform(y_rank)

# Step 3: Train/test split
X_train_r, X_val_r, y_train_r, y_val_r = train_test_split(X_rank, y_rank_encoded, test_size=0.2, random_state=42)

# Step 4: Train model
clf_rank = RandomForestClassifier(n_estimators=200, random_state=42)
clf_rank.fit(X_train_r, y_train_r)

# Step 5: Evaluate
y_pred_r = clf_rank.predict(X_val_r)
print("🎯 Per-Formation Ranking Model:")
print("Accuracy:", accuracy_score(y_val_r, y_pred_r))
print(classification_report(y_val_r, y_pred_r))  # Let it auto-handle labels

import numpy as np


# Step 1: Keep ID from X_with_ids
person_ids = X_with_ids['ID_Membre'].reset_index(drop=True)

# Step 2: Predict probabilities
probs = clf_rank.predict_proba(X_rank)

# Step 3: Group by real ID and average predictions
probs_df = pd.DataFrame(probs)
probs_df['ID_Membre'] = person_ids
grouped_probs = probs_df.groupby('ID_Membre').mean()

# Step 4: Get top-3 predicted labels
top3_indices = np.argsort(grouped_probs.values, axis=1)[:, -3:][:, ::-1]
top3_labels = [[le_rank.classes_[i] for i in row] for row in top3_indices]

# Step 5: Format and display sample output
df_top3 = pd.DataFrame({
    'ID_Membre': grouped_probs.index,
    'Top_1': [row[0] for row in top3_labels],
    'Top_2': [row[1] for row in top3_labels],
    'Top_3': [row[2] for row in top3_labels]
})

print("📊 Sample Top-3 Formation Recommendations by ID:")
print(df_top3.head())
# Step 1: Get the real formations per person from original df
true_formations = df[['ID_Membre', 'Formation_1', 'Formation_2', 'Formation_3']].copy()
true_formations = true_formations.set_index('ID_Membre')

# Step 2: Compare top-3 predictions to true formations
hits = 0
for i, row in df_top3.iterrows():
    true = set(filter(None, true_formations.loc[row['ID_Membre']].values))
    predicted = {row['Top_1'], row['Top_2'], row['Top_3']}
    if not true.isdisjoint(predicted):
        hits += 1

top3_hit_rate = hits / len(df_top3)
print(f"Top-3 Hit Rate (at least 1 correct): {top3_hit_rate:.2%}")
# Step 1: Get true formations as sets
true_sets = df[['ID_Membre', 'Formation_1', 'Formation_2', 'Formation_3']].copy()
true_sets = true_sets.set_index('ID_Membre')
true_sets = true_sets.apply(lambda row: set(filter(None, row)), axis=1)

# Step 2: Get predicted formations as sets
pred_sets = df_top3.set_index('ID_Membre')[['Top_1', 'Top_2', 'Top_3']].apply(set, axis=1)

# Step 3: Align both series on index
true_sets, pred_sets = true_sets.align(pred_sets, join='inner')

# Step 4: Compare exact 3/3 match
exact_matches = (true_sets == pred_sets).sum()
total = len(pred_sets)
exact_match_accuracy = exact_matches / total

print(f"Exact 3/3 Match Accuracy: {exact_match_accuracy:.2%}")
# Step 1: Get true formations as sets
true_sets = df[['ID_Membre', 'Formation_1', 'Formation_2', 'Formation_3']].copy()
true_sets = true_sets.set_index('ID_Membre')
true_sets = true_sets.apply(lambda row: set(filter(None, row)), axis=1)

# Step 2: Get predicted formations as sets
pred_sets = df_top3.set_index('ID_Membre')[['Top_1', 'Top_2', 'Top_3']].apply(set, axis=1)

# Step 3: Align both series on index
true_sets, pred_sets = true_sets.align(pred_sets, join='inner')

# Step 4: Count how many true formations are in the top-3 predictions
matches_2_or_more = sum([len(true & pred) >= 2 for true, pred in zip(true_sets, pred_sets)])

# Step 5: Compute 2/3 match accuracy
match_2_accuracy = matches_2_or_more / len(pred_sets)

print(f" 2/3 Match Accuracy: {match_2_accuracy:.2%}")


Final shape: (4950, 21)
   ID_Membre  Age  Sexe  Moyenne_Lycée  Autres_Clubs  Projets_Realisés  \
0        337   20     0          13.35             3                 1   
1       2241   25     1          15.55             4                 5   
2       1323   20     1          18.20             2                 9   
3       3011   20     1          13.90             1                 8   
4       4591   22     0          11.75             1                 8   

   Evaluation_Bureau  Soft_Skills  Score_Entretien  \
0             2.9250            9                7   
1             4.2000            4               10   
2             3.4625           10                7   
3             2.9750            5                9   
4             4.3500            2                6   

   Experience_Professionnelle  ...  Cellule_Cellule Developpement Commercial  \
0                           0  ...                                     False   
1                           0  ...            

In [43]:
import json
# Extract mapping
formation_id_to_name = {idx: name for idx, name in enumerate(le_rank.classes_)}


print("=== Formation ID to Name mapping ===")
for id_, name in formation_id_to_name.items():
    print(f"{id_} --> {name}")



with open("formation_mapping.json", "w", encoding="utf-8") as f:
    json.dump(formation_id_to_name, f, ensure_ascii=False, indent=4)

print("Formation mapping saved to formation_mapping.json")


=== Formation ID to Name mapping ===
0 --> AI/ML
1 --> B2B Sales
2 --> CRM
3 --> Communication
4 --> Content Creation
5 --> Data Science
6 --> Digital Marketing
7 --> Docker
8 --> E-Branding
9 --> Flutter
10 --> Git
11 --> ISO9001
12 --> JavaScript
13 --> Leadership
14 --> Lean Management
15 --> Negotiation
16 --> NestJS
17 --> Photoshop
18 --> Pitching
19 --> Public Speaking
20 --> Python
21 --> Quality Assurance
22 --> Quality Control
23 --> RSE
24 --> React Native
25 --> ReactJS
26 --> Risk Management
27 --> Self Improvement
28 --> Teamwork
29 --> Time and stress management
30 --> UI/UX
31 --> Web Development
Formation mapping saved to formation_mapping.json


In [44]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# === STEP 1: Load original data ===
df = pd.read_csv("train_data.csv")
df_ids = df[["ID_Membre"]].copy()

# === STEP 2: Preprocessing ===
df['Formations'] = df['Formations'].apply(lambda x: [f.strip() for f in x.split(',')])
df['Experience_Professionnelle'] = df['Experience_Professionnelle'].map({'Yes': 1, 'No': 0})
df['Sexe'] = df['Sexe'].map({'Homme': 0, 'Femme': 1})
df = pd.get_dummies(df, columns=['Cellule'], drop_first=False)
df['ID_Membre'] = df_ids

# === STEP 3: Fix formation spelling errors ===
def clean_formations(formations):
    return ['Negotiation' if f.lower() in ['negociations', 'negotiation'] else f for f in formations]
df['Formations'] = df['Formations'].apply(clean_formations)

# === STEP 4: Assume predicted Filière was injected externally ===
# Simulate correction step for demonstration (keep existing)
if 'Filiére' in df.columns:
    le_filiere = LabelEncoder()
    df['Filiere_Encoded'] = le_filiere.fit_transform(df['Filiére'])
    df = df.drop(columns=['Filiére'])

# === STEP 5: Transform Formations into 3 slots ===
df['Formation_1'] = df['Formations'].apply(lambda x: x[0] if len(x) > 0 else None)
df['Formation_2'] = df['Formations'].apply(lambda x: x[1] if len(x) > 1 else None)
df['Formation_3'] = df['Formations'].apply(lambda x: x[2] if len(x) > 2 else None)
df = df.drop(columns=['Formations'])

# === STEP 6: Explode ===
df_long = pd.melt(
    df,
    id_vars=[col for col in df.columns if col not in ['Formation_1', 'Formation_2', 'Formation_3']],
    value_vars=['Formation_1', 'Formation_2', 'Formation_3'],
    var_name='Slot',
    value_name='Formation'
)
df_long = df_long[df_long['Formation'].notna()].reset_index(drop=True)

# === STEP 7: Train model ===
X_rank = df_long.drop(columns=['Formation', 'Slot'])
y_rank = df_long['Formation']
le_rank = LabelEncoder()
y_rank_encoded = le_rank.fit_transform(y_rank)

X_train_r, X_val_r, y_train_r, y_val_r = train_test_split(X_rank, y_rank_encoded, test_size=0.2, random_state=42)
clf_rank = RandomForestClassifier(n_estimators=200, random_state=42)
clf_rank.fit(X_train_r, y_train_r)
y_pred_r = clf_rank.predict(X_val_r)



In [45]:
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder

# Encode formation labels
le_rank = LabelEncoder()
y_rank_encoded = le_rank.fit_transform(y_rank)

# Train/Test Split
X_train_r, X_val_r, y_train_r, y_val_r = train_test_split(X_rank, y_rank_encoded, test_size=0.2, random_state=42)

# Train XGBoost model
clf_xgb = XGBClassifier(
    objective='multi:softprob',
    num_class=len(le_rank.classes_),
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    use_label_encoder=False,
    eval_metric='mlogloss',
    random_state=42
)
clf_xgb.fit(X_train_r, y_train_r)

# Evaluate
y_pred_xgb = clf_xgb.predict(X_val_r)
xgb_accuracy = accuracy_score(y_val_r, y_pred_xgb)
xgb_report = classification_report(y_val_r, y_pred_xgb)

print("XGBoost Accuracy:", xgb_accuracy)
print(" XGBoost Classification Report:\n", xgb_report)


Parameters: { "use_label_encoder" } are not used.



XGBoost Accuracy: 0.01292929292929293
 XGBoost Classification Report:
               precision    recall  f1-score   support

           0       0.01      0.02      0.01        66
           1       0.07      0.06      0.06        17
           2       0.00      0.00      0.00        19
           3       0.00      0.00      0.00       188
           4       0.05      0.06      0.05        35
           5       0.00      0.00      0.00        69
           6       0.00      0.00      0.00        30
           7       0.01      0.01      0.01       100
           8       0.00      0.00      0.00        17
           9       0.03      0.05      0.04        20
          10       0.02      0.02      0.02        95
          11       0.02      0.03      0.02        37
          12       0.00      0.00      0.00        70
          13       0.01      0.01      0.01       162
          14       0.03      0.03      0.03        31
          15       0.02      0.02      0.02        45
          

In [46]:
from sklearn.metrics import accuracy_score, classification_report

# Step 1: Predict probabilities on all data
probs_xgb = clf_xgb.predict_proba(X_rank)

# Step 2: Map ID_Membre from df_long
id_list_xgb = df_long['ID_Membre'].values

# Step 3: Create DataFrame with class probabilities and align to IDs
probs_df_xgb = pd.DataFrame(probs_xgb, columns=le_rank.classes_)
probs_df_xgb['ID_Membre'] = id_list_xgb

# Step 4: Average probabilities per ID_Membre
grouped_probs_xgb = probs_df_xgb.groupby('ID_Membre').mean()

# Step 5: Get top 3 predictions (no threshold)
top3_indices = np.argsort(grouped_probs_xgb.values, axis=1)[:, -3:][:, ::-1]
top3_labels_xgb = [[le_rank.classes_[i] for i in row] for row in top3_indices]

# Step 6: Format results
df_top3_xgb = pd.DataFrame({
    'ID_Membre': grouped_probs_xgb.index,
    'Top_1': [row[0] if len(row) > 0 else None for row in top3_labels_xgb],
    'Top_2': [row[1] if len(row) > 1 else None for row in top3_labels_xgb],
    'Top_3': [row[2] if len(row) > 2 else None for row in top3_labels_xgb]
})

# Step 7: Build ground truth sets
true_formations = df[['ID_Membre', 'Formation_1', 'Formation_2', 'Formation_3']].copy()
true_formations = true_formations.set_index('ID_Membre')
true_sets = true_formations.apply(lambda row: set(filter(None, row)), axis=1)

# Step 8: Predicted sets
pred_sets = df_top3_xgb.set_index('ID_Membre')[['Top_1', 'Top_2', 'Top_3']].apply(lambda row: set(filter(None, row)), axis=1)

# Step 9: Align sets
true_sets, pred_sets = true_sets.align(pred_sets, join='inner')

# Step 10: Compute match stats
hit_1_or_more = sum(len(true & pred) >= 1 for true, pred in zip(true_sets, pred_sets))
hit_2_or_more = sum(len(true & pred) >= 2 for true, pred in zip(true_sets, pred_sets))
exact_3_match = sum(true == pred for true, pred in zip(true_sets, pred_sets))
total = len(true_sets)

{
    "Top-1+ Hit Rate": f"{hit_1_or_more / total:.2%}",
    "2/3 Match Accuracy": f"{hit_2_or_more / total:.2%}",
    "Exact 3/3 Match Accuracy": f"{exact_3_match / total:.2%}"
}



{'Top-1+ Hit Rate': '98.51%',
 '2/3 Match Accuracy': '80.04%',
 'Exact 3/3 Match Accuracy': '26.40%'}

In [47]:
import numpy as np
import pandas as pd

# Step 1: Predict probabilities (not just raw classes)
probs = clf_rank.predict_proba(X_rank)

# Step 2: Set confidence threshold 
confidence_threshold = 0.1

# Step 3: Get top-k predictions (top-3 or fewer)
top_k_labels = []
for person_probs in probs:
    # Sort indices by probability (descending)
    sorted_indices = np.argsort(person_probs)[::-1]  # Sort in descending order
    top_k = sorted_indices[person_probs[sorted_indices] > confidence_threshold][:3]  # Only include top-3 above threshold
    top_k_labels.append([le_rank.classes_[i] for i in top_k])

# Step 4: Rebuild DataFrame with top-3 recommendations per user
df_top_k = pd.DataFrame({
    'ID_Membre': df_long['ID_Membre'],
    'Top_1': [row[0] if len(row) > 0 else None for row in top_k_labels],
    'Top_2': [row[1] if len(row) > 1 else None for row in top_k_labels],
    'Top_3': [row[2] if len(row) > 2 else None for row in top_k_labels]
})

# Step 5: Display sample output
print("📊 Sample Top-3 Formation Recommendations (with confidence threshold):")
print(df_top_k.head())


📊 Sample Top-3 Formation Recommendations (with confidence threshold):
   ID_Membre             Top_1            Top_2          Top_3
0        337   Public Speaking       Leadership  Communication
1       2241         Photoshop  Risk Management    Negotiation
2       1323  Self Improvement             None           None
3       3011        JavaScript             None           None
4       4591   Public Speaking             None           None


In [48]:
# Step 1: Get true formations per ID_Membre
true_formations = df[['ID_Membre', 'Formation_1', 'Formation_2', 'Formation_3']].copy()
true_formations = true_formations.set_index('ID_Membre')
true_sets = true_formations.apply(lambda row: set(filter(None, row)), axis=1)

# Step 2: Get predicted formations per ID_Membre
pred_sets = df_top_k.set_index('ID_Membre')[['Top_1', 'Top_2', 'Top_3']].apply(lambda row: set(filter(None, row)), axis=1)

# Step 3: Align true and predicted sets
true_sets, pred_sets = true_sets.align(pred_sets, join='inner')

# Step 4: Compute matches
hit_1_or_more = sum(len(true & pred) >= 1 for true, pred in zip(true_sets, pred_sets))
hit_2_or_more = sum(len(true & pred) >= 2 for true, pred in zip(true_sets, pred_sets))
exact_3_match = sum(true == pred for true, pred in zip(true_sets, pred_sets))

total = len(true_sets)

# Step 5: Display results
print(f"✅ Hit Rate (at least 1 correct): {hit_1_or_more / total:.2%}")
print(f"✅ 2/3 Match Accuracy: {hit_2_or_more / total:.2%}")
print(f"✅ Exact 3/3 Match Accuracy: {exact_3_match / total:.2%}")
from sklearn.metrics import f1_score, hamming_loss, accuracy_score

# Step 6: Prepare y_true and y_pred in multi-label binary format
# Create a complete list of unique formations
all_formations = sorted(set(true_formations.values.flatten()) - {None})

# Helper to convert sets to binary vector
def set_to_binary_vector(s, all_labels):
    return [1 if label in s else 0 for label in all_labels]

# Build binary matrices
y_true_binary = [set_to_binary_vector(true_set, all_formations) for true_set in true_sets]
y_pred_binary = [set_to_binary_vector(pred_set, all_formations) for pred_set in pred_sets]

# Step 7: Compute additional metrics
f1_micro = f1_score(y_true_binary, y_pred_binary, average="micro")
f1_macro = f1_score(y_true_binary, y_pred_binary, average="macro")
strict_subset_accuracy = accuracy_score(y_true_binary, y_pred_binary)  # Strict exact match
ham_loss = hamming_loss(y_true_binary, y_pred_binary)

# Step 8: Display additional results
print(f"🎯 F1 Score (micro): {f1_micro:.2%}")
print(f"🎯 F1 Score (macro): {f1_macro:.2%}")
print(f"⚠️  Subset Accuracy (Strict): {strict_subset_accuracy:.2%}")
print(f"⚠️  Hamming Loss: {ham_loss:.2%}")

✅ Hit Rate (at least 1 correct): 98.91%
✅ 2/3 Match Accuracy: 78.93%
✅ Exact 3/3 Match Accuracy: 56.54%
🎯 F1 Score (micro): 88.43%
🎯 F1 Score (macro): 88.32%
⚠️  Subset Accuracy (Strict): 56.54%
⚠️  Hamming Loss: 1.71%


In [49]:
import joblib

# === Save the trained model ===
joblib.dump(clf_xgb, "xgb_model.joblib")

# === Save the label encoder used to decode/encode formations ===
joblib.dump(le_rank, "formation_label_encoder.joblib")

# === Save the confidence threshold ===
# Save it as a dictionary in case you want to include more  later
config = {
    "confidence_threshold": 0.1
}
joblib.dump(config, "model_config.joblib")

print(" Model, encoder, and config saved successfully.")


 Model, encoder, and config saved successfully.


In [50]:
# === Print features used by model ===
print("=== Features used in the model ===")
if hasattr(clf_xgb, "get_booster"):
    print(clf_xgb.get_booster().feature_names)
elif hasattr(clf_xgb, "feature_names_in_"):
    print(clf_xgb.feature_names_in_)
else:
    print("Unable to detect features automatically.")

=== Features used in the model ===
['ID_Membre', 'Age', 'Sexe', 'Moyenne_Lycée', 'Autres_Clubs', 'Projets_Realisés', 'Evaluation_Bureau', 'Soft_Skills', 'Score_Entretien', 'Experience_Professionnelle', 'Indice_Engagement', 'Cellule_Cellule Developpement Commercial', 'Cellule_Cellule Marketing', 'Cellule_Cellule Mobile', 'Cellule_Cellule Qualité', 'Cellule_Cellule RH', 'Cellule_Cellule Web', 'Filiere_Encoded']
