In [11]:
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import cross_val_score, StratifiedKFold, KFold
from sklearn.linear_model import LinearRegression, LogisticRegression
from lightgbm import LGBMRegressor, LGBMClassifier
from sklearn.metrics import make_scorer, mean_squared_error, r2_score, accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder


In [20]:
import pandas as pd

# Chargement des deux datasets
df_mat = pd.read_csv("data/student-mat.csv", sep=";")
df_por = pd.read_csv("data/student-por.csv", sep=";")

# Ajout d'une colonne source pour garder trace de l'origine
df_mat["source"] = "mat"
df_por["source"] = "por"

# Concaténation verticale
df_all = pd.concat([df_mat, df_por], ignore_index=True)

# Suppression des doublons
df_all_unique = df_all.drop_duplicates()

# Optionnel : affichage de la forme
print(f"Forme initiale concaténée : {df_all.shape}")
print(f"Forme après suppression des doublons : {df_all_unique.shape}")

# Sauvegarde (si besoin)
df_all_unique.to_csv("data/student-merged.csv", sep=";", index=False)


Forme initiale concaténée : (1044, 34)
Forme après suppression des doublons : (1044, 34)


In [21]:
# === 1. Chargement des données ===
df = pd.read_csv("data/student-merged.csv", sep=';')

# === 2. Encodage des variables catégorielles ===
df_encoded = df.copy()
label_encoders = {}
for col in df_encoded.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df_encoded[col] = le.fit_transform(df_encoded[col])
    label_encoders[col] = le

# === 3. Variables sensibles à exclure ===
sensitive_features = [
    'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',
    'Mjob', 'Fjob', 'reason', 'guardian', 'romantic'
]

# === 4. Fonctions d’évaluation ===
def evaluate_regression(X, y):
    models = {
        "LinearRegression": LinearRegression(),
        "LGBMRegressor": LGBMRegressor()
    }
    results = {}
    for name, model in models.items():
        rmse = -cross_val_score(model, X, y, cv=KFold(5, shuffle=True, random_state=42),
                                scoring='neg_root_mean_squared_error').mean()
        r2 = cross_val_score(model, X, y, cv=KFold(5, shuffle=True, random_state=42),
                             scoring='r2').mean()
        results[name] = {"RMSE": round(rmse, 2), "R2": round(r2, 3)}
    return results


def evaluate_classification(X, y):
    models = {
        "LogisticRegression": LogisticRegression(max_iter=1000),
        "LGBMClassifier": LGBMClassifier()
    }
    results = {}
    for name, model in models.items():
        acc = cross_val_score(model, X, y, cv=StratifiedKFold(5, shuffle=True, random_state=42),
                              scoring='accuracy').mean()
        f1 = cross_val_score(model, X, y, cv=StratifiedKFold(5, shuffle=True, random_state=42),
                             scoring='f1').mean()
        results[name] = {"Accuracy": round(acc, 3), "F1": round(f1, 3)}
    return results

# === 5. Définition des scénarios ===
scenarios = {
    "All features": df_encoded.drop(columns=[]),
    "No sensitive": df_encoded.drop(columns=sensitive_features),
    "No sensitive + No G2": df_encoded.drop(columns=sensitive_features + ['G2']),
    "No sensitive + No G1, G2": df_encoded.drop(columns=sensitive_features + ['G1', 'G2'])
}




In [22]:
# === 6. Évaluation ===
regression_results = {}
classification_results = {}

for scenario, data in scenarios.items():
    X = data.drop(columns=['G3'])
    y_reg = data['G3']
    regression_results[scenario] = evaluate_regression(X, y_reg)

    y_clf = (data['G3'] >= 10).astype(int)
    classification_results[scenario] = evaluate_classification(X, y_clf)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000155 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 170
[LightGBM] [Info] Number of data points in the train set: 835, number of used features: 33
[LightGBM] [Info] Start training from score 11.413174
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000162 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 170
[LightGBM] [Info] Number of data points in the train set: 835, number of used features: 33
[LightGBM] [Info] Start training from score 11.407186
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000148 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, y

In [23]:

# === 7. Affichage des résultats ===
print("\n=== Résultats Régression ===")
print(pd.DataFrame(regression_results).T)

print("\n=== Résultats Classification ===")
print(pd.DataFrame(classification_results).T)


=== Résultats Régression ===
                                     LinearRegression  \
All features              {'RMSE': 1.56, 'R2': 0.832}   
No sensitive              {'RMSE': 1.55, 'R2': 0.834}   
No sensitive + No G2      {'RMSE': 2.22, 'R2': 0.665}   
No sensitive + No G1, G2  {'RMSE': 3.42, 'R2': 0.208}   

                                        LGBMRegressor  
All features              {'RMSE': 1.46, 'R2': 0.853}  
No sensitive              {'RMSE': 1.49, 'R2': 0.847}  
No sensitive + No G2      {'RMSE': 2.08, 'R2': 0.709}  
No sensitive + No G1, G2   {'RMSE': 3.3, 'R2': 0.257}  

=== Résultats Classification ===
                                        LogisticRegression  \
All features              {'Accuracy': 0.925, 'F1': 0.952}   
No sensitive              {'Accuracy': 0.923, 'F1': 0.951}   
No sensitive + No G2      {'Accuracy': 0.876, 'F1': 0.922}   
No sensitive + No G1, G2  {'Accuracy': 0.803, 'F1': 0.882}   

                                            LGBMClassifier 