In [1]:
import numpy as np
import pandas as pd
import time
from tqdm import tqdm
from torch.optim import Adam
import polars as pl
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
from torch.cuda.amp import autocast, GradScaler
import lightgbm as lgb  
import warnings
warnings.filterwarnings("ignore")
from utils import r2_loss, train_model, plot_r2

# Data

In [2]:
# Directory holding the parquet inputs; every file below is read from it.
subpath = "/home/onyxia/work/AML/data/"

# Each table is loaded into its own polars DataFrame.
weights = pl.read_parquet(f"{subpath}weights.parquet")
target = pl.read_parquet(f"{subpath}target.parquet")
responders = pl.read_parquet(f"{subpath}responders.parquet")
features = pl.read_parquet(f"{subpath}features.parquet")

# Architecture

## MLP Builder

In [7]:
def make_mlp(widths, activation=nn.ReLU, dropout_p=0.0, use_bn=False):
    """
    Creates a Multi-Layer Perceptron (MLP) block with optional Dropout and Batch Normalization.

    Every Linear layer except the last one is followed by, in this order:
    BatchNorm1d (if ``use_bn``), the activation, and Dropout (if ``dropout_p > 0``).
    The last Linear layer is left bare so the block emits raw outputs.

    Args:
        widths (list): [in_dim, h1, h2, ..., out_dim]
        activation (callable): Zero-argument factory returning the activation
            module to insert after each hidden layer (default: nn.ReLU).
        dropout_p (float): Probability of an element to be zeroed (default: 0.0).
        use_bn (bool): Whether to insert BatchNorm1d layers (default: False).

    Returns:
        nn.Sequential: The assembled stack of layers. It is empty when
        ``widths`` has fewer than two entries.
    """
    layers = []
    last_idx = len(widths) - 2  # index of the final (output) Linear layer
    for idx, (in_f, out_f) in enumerate(zip(widths[:-1], widths[1:])):
        layers.append(nn.Linear(in_f, out_f))

        # The output layer is identified by POSITION. Comparing widths
        # (out_f == widths[-1]) would also match any hidden layer that happens
        # to share the output width and silently strip its non-linearity.
        if idx == last_idx:
            continue

        if use_bn:
            layers.append(nn.BatchNorm1d(out_f))
        layers.append(activation())
        if dropout_p > 0.0:
            layers.append(nn.Dropout(p=dropout_p))
    return nn.Sequential(*layers)


## Auto-Encoder supervised & unsupervised

In [9]:
class Encoder(nn.Module):
    """MLP wrapper used as the encoding branch of the model."""

    def __init__(self, widths, dropout_p=0.0, use_bn=False):
        """
        Builds the encoder network through make_mlp.

        Args:
            widths: Layer widths, forwarded unchanged to make_mlp.
            dropout_p: Dropout probability, forwarded to make_mlp.
            use_bn: Batch-normalisation switch, forwarded to make_mlp.
        """
        super().__init__()
        # Regularisation settings are handed over to the MLP builder as-is.
        mlp_options = {"dropout_p": dropout_p, "use_bn": use_bn}
        self.net = make_mlp(widths, **mlp_options)

    def forward(self, x):
        """Runs x through the underlying network and returns the result."""
        encoded = self.net(x)
        return encoded


class Decoder(nn.Module):
    """MLP wrapper used as the decoding branch of the model."""

    def __init__(self, widths, dropout_p=0.0, use_bn=False):
        """
        Builds the decoder network through make_mlp.

        Args:
            widths: Layer widths, forwarded unchanged to make_mlp.
            dropout_p: Dropout probability, forwarded to make_mlp.
            use_bn: Batch-normalisation switch, forwarded to make_mlp.
        """
        super().__init__()
        # Regularisation settings are handed over to the MLP builder as-is.
        mlp_options = {"dropout_p": dropout_p, "use_bn": use_bn}
        self.net = make_mlp(widths, **mlp_options)

    def forward(self, z):
        """Runs z through the underlying network and returns the result."""
        decoded = self.net(z)
        return decoded


class TaskHead(nn.Module):
    """MLP wrapper used as the prediction head of the model."""

    def __init__(self, widths, dropout_p=0.0, use_bn=False):
        """
        Builds the head network through make_mlp.

        Args:
            widths: Layer widths, forwarded unchanged to make_mlp.
            dropout_p: Dropout probability, forwarded to make_mlp.
            use_bn: Batch-normalisation switch, forwarded to make_mlp.
        """
        super().__init__()
        # Regularisation settings are handed over to the MLP builder as-is.
        mlp_options = {"dropout_p": dropout_p, "use_bn": use_bn}
        self.net = make_mlp(widths, **mlp_options)

    def forward(self, z):
        """Runs z through the underlying network and returns the result."""
        prediction = self.net(z)
        return prediction


## Build up

In [10]:
class FullModel(nn.Module):
    """
    Encoder followed by two branches that both read the latent code:
    a Decoder (reconstruction) and a TaskHead (supervised prediction).
    """

    def __init__(self,
                 encoder_widths,   # e.g.: [input_dim, 256, 128, latent_dim]
                 decoder_widths,   # e.g.: [latent_dim, 128, 256, input_dim]
                 head_widths,      # e.g.: [latent_dim, 64, output_dim]
                 dropout_p=0.0,
                 use_bn=False):
        """
        Builds the three sub-modules with one shared regularisation setting.

        Args:
            encoder_widths: Layer widths handed to Encoder.
            decoder_widths: Layer widths handed to Decoder.
            head_widths: Layer widths handed to TaskHead.
            dropout_p: Dropout probability applied to all three sub-modules.
            use_bn: Batch-normalisation switch applied to all three sub-modules.
        """
        super().__init__()

        # The same regularisation options go to every sub-module. The head
        # receives them too; give it its own values here if BN/Dropout on the
        # head turns out to hurt downstream metrics.
        regularisation = dict(dropout_p=dropout_p, use_bn=use_bn)
        self.encoder = Encoder(encoder_widths, **regularisation)
        self.decoder = Decoder(decoder_widths, **regularisation)
        self.head = TaskHead(head_widths, **regularisation)

    def forward(self, x):
        """
        Forward pass: X -> Z -> (X_hat, Y_hat).

        Returns:
            tuple: (z, x_hat, y_hat) where z is the encoder output, x_hat the
            decoder output computed from z and y_hat the head output computed
            from z.
        """
        z = self.encoder(x)
        # The decoder is evaluated before the head, both on the same latent z.
        return z, self.decoder(z), self.head(z)


# Train on real data

In [11]:
# --- 1. ORDERED 80/20 SPLIT (first rows -> train, remaining rows -> test) ---
n = features.height
n_train = int(0.8 * n)

X_train, X_test = features.slice(0, n_train), features.slice(n_train)
y_train, y_test = target.slice(0, n_train), target.slice(n_train)


# --- 2. CONVERSION TO NUMPY FOR THE STATISTICS ---
X_train_np, X_test_np = X_train.to_numpy(), X_test.to_numpy()
# Targets are reshaped into a single column.
y_train_np = y_train.to_numpy().reshape(-1, 1)
y_test_np = y_test.to_numpy().reshape(-1, 1)


# --- 3. STATISTICS COMPUTED ON THE TRAINING SPLIT ONLY ---

# Per-column mean and standard deviation of the features (X).
# keepdims=True keeps a leading axis of size 1 so the arrays broadcast over rows.
X_mean = X_train_np.mean(axis=0, keepdims=True)
X_std = X_train_np.std(axis=0, keepdims=True)

# A zero standard deviation is replaced by 1 so constant columns are only
# centred instead of triggering a division by zero.
X_std[X_std == 0] = 1

# Same statistics for the target (Y).
Y_mean = y_train_np.mean(axis=0, keepdims=True)
Y_std = y_train_np.std(axis=0, keepdims=True)
Y_std[Y_std == 0] = 1


# --- 4. STANDARDISATION ---

# Training split (X and Y).
X_train_scaled = (X_train_np - X_mean) / X_std
y_train_scaled = (y_train_np - Y_mean) / Y_std

# Test split, scaled with the TRAIN statistics (X and Y).
X_test_scaled = (X_test_np - X_mean) / X_std
y_test_scaled = (y_test_np - Y_mean) / Y_std


# --- 5. CONVERSION TO PYTORCH TENSORS (float32) ---
X_train_t, y_train_t, X_test_t, y_test_t = (
    torch.tensor(scaled, dtype=torch.float32)
    for scaled in (X_train_scaled, y_train_scaled, X_test_scaled, y_test_scaled)
)

print("Standardization complete.")
print(f"X_train_t shape after scaling: {X_train_t.shape}")
print(f"Mean of X_train_t (should be ~0): {X_train_t.mean(dim=0)}")
print(f"Std of X_train_t (should be ~1): {X_train_t.std(dim=0)}")

Standardization complete.
X_train_t shape after scaling: torch.Size([1219662, 56])
Mean of X_train_t (should be ~0): tensor([ 8.1820e-09, -9.2579e-10,  3.9096e-09,  3.6093e-09,  0.0000e+00,
         0.0000e+00,  0.0000e+00,  5.6298e-11, -8.7575e-09, -2.0017e-10,
         3.3816e-08,  1.4741e-07,  6.0051e-09, -1.4262e-09,  2.2563e-08,
        -2.6489e-07,  1.3570e-07,  1.3480e-08,  2.6043e-08, -1.0290e-07,
         2.5027e-07, -7.7481e-08,  1.3806e-08,  3.3966e-09,  9.9585e-09,
         4.3787e-10,  3.0463e-09,  1.1084e-08,  1.8516e-09, -1.4562e-08,
        -9.4174e-09, -1.8140e-09,  3.1277e-09,  1.8453e-09,  6.1552e-09,
         7.5564e-09, -2.5021e-11,  5.8800e-09, -2.0267e-09, -7.1874e-09,
        -1.4975e-08,  5.4396e-08,  1.0822e-09, -1.8134e-08,  1.2411e-08,
        -1.4913e-08, -5.8362e-09, -5.1044e-09, -5.8315e-09, -8.1945e-10,
        -7.4814e-09,  5.6235e-09,  6.8058e-08, -7.0085e-08,  2.7561e-08,
        -7.8442e-09])
Std of X_train_t (should be ~1): tensor([1.0000, 1.0000, 1

# Testing various architectures

In [14]:
# --- FIXED DIMENSIONS ---
INPUT_DIM, OUTPUT_DIM, LATENT_DIM = 56, 1, 16

# --- FIXED LAYER WIDTHS (shared by every experiment below) ---
ENCODER_W = [INPUT_DIM, 64, 32, LATENT_DIM]
DECODER_W = [LATENT_DIM, 32, 64, INPUT_DIM]
HEAD_W = [LATENT_DIM, 32, OUTPUT_DIM]

# One entry per experiment. Only the regularisation settings vary; the width
# lists are the same objects in every entry.
ARCHITECTURES_CONFIGS = {
    arch_label: {
        "encoder_widths": ENCODER_W,
        "decoder_widths": DECODER_W,
        "head_widths": HEAD_W,
        "dropout_p": drop_rate,
        "use_bn": batch_norm,
    }
    for arch_label, drop_rate, batch_norm in (
        # 1. Baseline: reference run without any regularisation.
        ("A_Baseline_None", 0.0, False),
        # 2. Dropout only: moderate dropout against over-fitting.
        ("B_Dropout_Only", 0.15, False),
        # 3. Batch norm only: BN to stabilise and speed up training.
        ("C_BatchNorm_Only", 0.0, True),
        # 4. Full regularisation: BN combined with a higher dropout.
        ("D_Full_Regularization", 0.25, True),
    )
}

In [None]:
# --- Fixed training hyper-parameters, identical for every architecture ---
# All of them are forwarded as keyword arguments to train_model (from utils).
FIXED_PARAMS = {
    "n_epochs": 30,
    "batch_size": 512,
    "lr": 1e-3,      # learning rate
    "alpha": 1.0,    # reconstruction weight (currently equal to beta)
    "beta": 1.0,     # supervision weight (currently equal to alpha)
    "optimizer_cls": Adam,
    "loss_rec_cls": nn.MSELoss,
    "loss_sup_cls": nn.MSELoss,
    "weight_decay": 1e-5  # small L2 regularisation
}

# Collects the metric histories of every experiment for the final analysis.
all_experiment_results = {}

print("--- STARTING ARCHITECTURAL EXPERIMENTS ---")
print(f"Total Epochs: {FIXED_PARAMS['n_epochs']} | LR: {FIXED_PARAMS['lr']} | Alpha (Rec): {FIXED_PARAMS['alpha']}")

for arch_name, config in ARCHITECTURES_CONFIGS.items():
    print("\n=======================================================")
    print(f"RUNNING EXPERIMENT: {arch_name}")
    print(f" -> Dropout: {config['dropout_p']}, BatchNorm: {config['use_bn']}")
    print(f" -> Encoder: {config['encoder_widths']}")
    print("=======================================================")

    # 1. Build a fresh model with this experiment's architecture/regularisation.
    try:
        model_instance = FullModel(
            encoder_widths=config['encoder_widths'],
            decoder_widths=config['decoder_widths'],
            head_widths=config['head_widths'],
            dropout_p=config['dropout_p'],
            use_bn=config['use_bn']
        )
    except NameError as exc:
        # Best-effort guard for a kernel where the model-definition cells were
        # not run: report the name that is actually missing and stop the sweep.
        print(f"\nERROR: cannot build the model, a required name is not defined ({exc}).")
        print("--- EXPERIMENTS ABORTED ---")
        break

    # 2. Train and evaluate.
    r2_rec, r2_sup, r2_lgbm, losses = train_model(
        model=model_instance,
        X_train_t=X_train_t, y_train_t=y_train_t,
        X_test_t=X_test_t, y_test_t=y_test_t,
        **FIXED_PARAMS
    )

    # 3. Store the histories plus the last value of each supervised metric
    #    (None when the corresponding history is empty).
    all_experiment_results[arch_name] = {
        'r2_rec': r2_rec,
        'r2_sup': r2_sup,
        'r2_lgbm': r2_lgbm,
        'final_lgbm_r2': r2_lgbm[-1] if r2_lgbm else None,
        'final_sup_r2': r2_sup[-1] if r2_sup else None
    }
else:
    # Only reached when the loop was not interrupted by `break`.
    print("\n--- ALL EXPERIMENTS COMPLETED ---")

# --- FINAL PERFORMANCE ANALYSIS ---

print("\nFINAL R² LGBM PERFORMANCE COMPARISON:")
final_lgbm_results = {name: res['final_lgbm_r2'] for name, res in all_experiment_results.items()}

# Best architecture first; experiments without a score (None) are ranked last.
sorted_results = sorted(
    final_lgbm_results.items(),
    key=lambda item: item[1] if item[1] is not None else -float('inf'),
    reverse=True,
)

for name, r2 in sorted_results:
    # A missing score cannot be formatted with ':.4f' (TypeError on None).
    r2_text = f"{r2:.4f}" if r2 is not None else "n/a"
    print(f"- {name:<20}: R² LGBM = {r2_text}")

# matplotlib can then be used to visualise the per-epoch evolution, e.g.
# plot_comparison(all_experiment_results)  # would need to iterate over this dictionary

--- STARTING ARCHITECTURAL EXPERIMENTS ---
Total Epochs: 30 | LR: 0.001 | Alpha (Rec): 1.0

RUNNING EXPERIMENT: A_Baseline_None
 -> Dropout: 0.0, BatchNorm: False
 -> Encoder: [56, 64, 32, 16]
Epoch 01 | loss=3091.360 | R2_sup_NN=-0.0046 | R2_sup_LGBM=-0.0021 | R2_rec=0.3072
Epoch 02 | loss=2792.084 | R2_sup_NN=-0.0038 | R2_sup_LGBM=-0.0072 | R2_rec=0.3774
Epoch 03 | loss=2732.474 | R2_sup_NN=-0.0106 | R2_sup_LGBM=-0.0061 | R2_rec=0.4420
Epoch 04 | loss=2716.511 | R2_sup_NN=-0.0168 | R2_sup_LGBM=-0.0060 | R2_rec=0.4689
Epoch 05 | loss=2706.618 | R2_sup_NN=-0.0180 | R2_sup_LGBM=-0.0049 | R2_rec=0.4582
Epoch 06 | loss=2700.508 | R2_sup_NN=-0.0252 | R2_sup_LGBM=-0.0073 | R2_rec=0.4608
Epoch 07 | loss=2694.035 | R2_sup_NN=-0.0174 | R2_sup_LGBM=-0.0151 | R2_rec=0.4892
Epoch 08 | loss=2688.460 | R2_sup_NN=-0.0312 | R2_sup_LGBM=-0.0118 | R2_rec=0.4731
Epoch 09 | loss=2684.714 | R2_sup_NN=-0.0170 | R2_sup_LGBM=-0.0117 | R2_rec=0.5037
Epoch 10 | loss=2681.013 | R2_sup_NN=-0.0471 | R2_sup_LGBM=-