# Modèles Scikit-Learn - Classification et Régression

Ce notebook implémente des modèles classiques avec Scikit-Learn afin de comparer leurs résultats à ceux de l'approche PyTorch:
1. **Classification**: Prédire si un étudiant va compléter le cours (Completed: 0 ou 1)
2. **Régression**: Prédire Quiz_Score_Avg, Project_Grade, Satisfaction_Rating, Time_Spent_Hours

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import mean_squared_error, r2_score

## 1. Chargement des données

In [None]:
# Load the pre-processed feature/target tables for both tasks.
# Classification inputs and target.
X_class, y_class = (
    pd.read_csv('../data/processed/X_classification.csv'),
    pd.read_csv('../data/processed/y_classification.csv'),
)

# Regression inputs and targets.
X_reg, y_reg = (
    pd.read_csv('../data/processed/X_regression.csv'),
    pd.read_csv('../data/processed/y_regression.csv'),
)

# Print the table shapes as a quick sanity check on what was loaded.
print(
    "CLASSIFICATION:",
    f"  X_class: {X_class.shape}",
    f"  y_class: {y_class.shape}",
    sep="\n",
)
print(
    "\nREGRESSION:",
    f"  X_reg: {X_reg.shape}",
    f"  y_reg: {y_reg.shape}",
    sep="\n",
)

---
# PARTIE 1: CLASSIFICATION
---
**Objectif**: Prédire la complétion du cours (0/1)

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# reproducible.
# NOTE(review): the split is not stratified on the label. If the classes are
# imbalanced, stratify=y_class may be preferable — confirm against the split
# used on the PyTorch side before changing it.
X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(
    X_class,
    y_class,
    test_size=0.2,
    random_state=42,
)

# scikit-learn estimators expect a 1-D label vector, so flatten the target
# frames (presumably a single column each — verify against the CSV).
y_train_class = y_train_class.to_numpy().ravel()
y_test_class = y_test_class.to_numpy().ravel()

# Standardise the features. The scaler is fitted on the training split only
# and the same fitted scaler is then applied to the test split.
scaler_class = StandardScaler().fit(X_train_class)
X_train_class_scaled = scaler_class.transform(X_train_class)
X_test_class_scaled = scaler_class.transform(X_test_class)

print(f"Train size: {X_train_class_scaled.shape[0]}, Test size: {X_test_class_scaled.shape[0]}")

### Modèle 1: Régression Logistique

In [None]:
# Logistic-regression baseline, fitted on the standardised training features.
log_reg = LogisticRegression(random_state=42).fit(X_train_class_scaled, y_train_class)

# Score the model on the held-out split.
y_pred_log = log_reg.predict(X_test_class_scaled)
acc_log = accuracy_score(y_test_class, y_pred_log)

print(f"Logistic Regression Accuracy: {acc_log:.4f}")
# Header and per-class report are emitted together, one per line.
print(
    "\nClassification Report:",
    classification_report(y_test_class, y_pred_log),
    sep="\n",
)

### Modèle 2: Random Forest Classifier

In [None]:
# Random-forest classifier with 100 trees, fitted on the standardised
# training features.
# n_jobs=-1 builds the trees on all available cores, matching the
# RandomForestRegressor configured later in this notebook; with a fixed
# random_state the fitted forest is the same regardless of n_jobs.
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_clf.fit(X_train_class_scaled, y_train_class)

# Score the model on the held-out split.
y_pred_rf = rf_clf.predict(X_test_class_scaled)
acc_rf = accuracy_score(y_test_class, y_pred_rf)

print(f"Random Forest Accuracy: {acc_rf:.4f}")

In [None]:
# Confusion matrix of the random-forest predictions on the test split,
# drawn as an annotated heatmap (rows: true labels, columns: predictions).
cm = confusion_matrix(y_test_class, y_pred_rf)

plt.figure(figsize=(6, 5))
# sns.heatmap returns the Axes it drew on; label that Axes directly.
ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
ax.set_title('Confusion Matrix - Random Forest')
ax.set_ylabel('Vrai')
ax.set_xlabel('Prédit')
plt.show()

---
# PARTIE 2: REGRESSION
---
**Objectif**: Prédire 4 variables continues (Quiz_Score_Avg, Project_Grade, Satisfaction_Rating, Time_Spent_Hours)

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# reproducible.
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg,
    y_reg,
    test_size=0.2,
    random_state=42,
)

# Standardise the features: fit on the training split only, then apply the
# same fitted scaler to both splits.
scaler_X_reg = StandardScaler().fit(X_train_reg)
X_train_reg_scaled = scaler_X_reg.transform(X_train_reg)
X_test_reg_scaled = scaler_X_reg.transform(X_test_reg)

# Standardise the targets as well, so the setup matches the PyTorch run
# (which, per the original note, scaled its targets too).
scaler_y_reg = StandardScaler().fit(y_train_reg)
y_train_reg_scaled = scaler_y_reg.transform(y_train_reg)
y_test_reg_scaled = scaler_y_reg.transform(y_test_reg)

print(f"Train size: {X_train_reg_scaled.shape[0]}, Test size: {X_test_reg_scaled.shape[0]}")

### Modèle 1: Régression Linéaire (Multi-output)

In [None]:
# LinearRegression handles multi-output targets natively, so all target
# columns are fitted in a single call.
lin_reg = LinearRegression().fit(X_train_reg_scaled, y_train_reg_scaled)

# Predictions come out in standardised target units.
y_pred_lin_scaled = lin_reg.predict(X_test_reg_scaled)

# Map the ground truth and the predictions back to the original target units
# so the error is reported on the real scale. y_test_reg_inv is also read by
# the later random-forest cells.
y_test_reg_inv = scaler_y_reg.inverse_transform(y_test_reg_scaled)
y_pred_lin = scaler_y_reg.inverse_transform(y_pred_lin_scaled)

# NOTE(review): with default arguments mean_squared_error averages over all
# target columns, so targets with larger numeric ranges would dominate this
# single figure — confirm that this aggregate is the intended metric.
mse_lin = mean_squared_error(y_test_reg_inv, y_pred_lin)
rmse_lin = np.sqrt(mse_lin)

print(f"Linear Regression Global RMSE: {rmse_lin:.4f}")

### Modèle 2: Random Forest Regressor

In [None]:
# RandomForestRegressor also supports multi-output targets directly.
# 50 trees, fixed seed, trees built on all available cores.
rf_reg = RandomForestRegressor(
    n_estimators=50,
    random_state=42,
    n_jobs=-1,
).fit(X_train_reg_scaled, y_train_reg_scaled)

# Predictions come out in standardised target units ...
y_pred_rf_scaled = rf_reg.predict(X_test_reg_scaled)

# ... so map them back to the original target units before scoring.
y_pred_rf_reg = scaler_y_reg.inverse_transform(y_pred_rf_scaled)

# Global error against the un-scaled ground truth (y_test_reg_inv is
# produced by the linear-regression cell).
mse_rf = mean_squared_error(y_test_reg_inv, y_pred_rf_reg)
rmse_rf = np.sqrt(mse_rf)

print(f"Random Forest Global RMSE: {rmse_rf:.4f}")

In [None]:
# Per-target error breakdown for the random-forest regressor.
# Labels are read from the target frame instead of a hard-coded list: the
# columns of y_test_reg_inv / y_pred_rf_reg follow the column order of
# y_test_reg, so each label always names the column actually being scored,
# whatever the order or number of columns in the CSV.
target_names = list(y_test_reg.columns)

print("Détail par variable (Random Forest):")
print("="*60)

for i, name in enumerate(target_names):
    # Ground truth and prediction for this target, in original units.
    y_true_col = y_test_reg_inv[:, i]
    y_pred_col = y_pred_rf_reg[:, i]

    mse = mean_squared_error(y_true_col, y_pred_col)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true_col, y_pred_col)

    print(f"{name}:")
    print(f"  RMSE: {rmse:.4f}")
    print(f"  R²: {r2:.4f}")
    print()

In [None]:
# Predicted-vs-actual scatter for one example target: column index 1
# (expected to be Project_Grade).
idx = 1
name = target_names[idx]

# Slice the chosen target once and reuse it for the points and the diagonal.
y_true_col = y_test_reg_inv[:, idx]
y_pred_col = y_pred_rf_reg[:, idx]
diag = [y_true_col.min(), y_true_col.max()]

plt.figure(figsize=(10, 5))
plt.scatter(y_true_col, y_pred_col, alpha=0.3)
# Red dashed identity line spanning the range of the true values; points on
# it are exact predictions.
plt.plot(diag, diag, 'r--', lw=2)
plt.title(f'Random Forest: Vrai vs Prédit ({name})')
plt.xlabel('Vraies Valeurs')
plt.ylabel('Valeurs Prédites')
plt.grid(True)
plt.show()