# %%
# Notebook: 03_Intermediate_Preprocessing.ipynb
# Goal: provide several alternative pre-processing pipelines covered in STA211,
# then export model-ready datasets so that their F1-scores can be compared.

# %%
# 1. Import des bibliothèques
import os

import numpy as np
import pandas as pd
import prince
from minisom import MiniSom  # requires the optional `minisom` package
from sklearn.cluster import AgglomerativeClustering
# IterativeImputer is experimental: this enabling import must run before
# `from sklearn.impute import IterativeImputer`, otherwise that import fails.
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.feature_extraction import FeatureHasher
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import KBinsDiscretizer

# %%
# 2. Load the transformed data
df = pd.read_csv('data_transformed.csv')

# %%
# 3. Prepare the target and the column groups
# Binary target: 'ad.' is coded 1 and 'noad.' is coded 0.
target_codes = {'ad.': 1, 'noad.': 0}
y = df['outcome'].map(target_codes)

# Continuous variables, listed explicitly.
features_num = ['X1', 'X2', 'X3', 'X4']

# Every other column except the target is treated as a binary feature;
# the original column order of `df` is preserved.
_non_binary = set(features_num) | {'outcome'}
features_bin = [col for col in df.columns if col not in _non_binary]

# %%
# 4. Intermediate pre-processing scenarios
# Every scenario below writes its export into the 'processed' directory.
os.makedirs('processed', exist_ok=True)

# 4.1 Discretization of the continuous variables
# Stand-in for MDLPC: each continuous variable is cut into 5 quantile bins
# and replaced by its ordinal bin index.
# NOTE(review): quantile binning is unsupervised -- no entropy criterion or
# target information is used here, unlike a true MDLPC discretization.
disc = KBinsDiscretizer(strategy='quantile', encode='ordinal', n_bins=5)
binned_values = disc.fit_transform(df[features_num])

# Work on a copy so the original frame stays available to later scenarios.
df_disc = df.copy()
df_disc[features_num] = binned_values
df_disc.to_csv('processed/data_disc.csv', index=False)

# 4.2 Hierarchical clustering of variables (CAH) to group the continuous ones
# Dissimilarity between two variables: 1 - |Pearson correlation|.
corr = df[features_num].corr().abs()
try:
    # scikit-learn >= 1.2 names the distance parameter `metric`
    # (`affinity` was deprecated in 1.2 and removed in 1.4).
    cluster = AgglomerativeClustering(
        n_clusters=2, metric='precomputed', linkage='average'
    )
except TypeError:
    # Older scikit-learn releases only accept the former name `affinity`.
    cluster = AgglomerativeClustering(
        n_clusters=2, affinity='precomputed', linkage='average'
    )
labels = cluster.fit_predict((1 - corr).to_numpy())

# Replace each cluster of variables by the row-wise mean of its members.
df_cah = df.copy()
group_cols = []
for grp in np.unique(labels):
    cols = [name for name, label in zip(features_num, labels) if label == grp]
    group_col = f'group_{grp}'
    df_cah[group_col] = df[cols].mean(axis=1)
    group_cols.append(group_col)

# The export keeps the group means, the binary features and the target;
# the original continuous columns are dropped. The group column names are
# derived from the labels instead of being hard-coded.
cols_keep = group_cols + features_bin + ['outcome']
df_cah[cols_keep].to_csv('processed/data_cah.csv', index=False)

# 4.3 MFA (multiple factor analysis, for mixed groups of variables)
# Two groups of columns: the continuous variables and the binary ones.
mfa_cols = features_num + features_bin
mfa_groups = {'quant': features_num, 'bin': features_bin}
try:
    # Older prince releases take the groups (name -> list of columns) in the
    # constructor. There is no `group_names` parameter.
    am = prince.MFA(groups=mfa_groups, n_components=2, random_state=42)
    mfa_fit_kwargs = {}
except TypeError:
    # Newer prince releases take the groups as an argument of fit() instead.
    # NOTE(review): confirm against the installed prince version.
    am = prince.MFA(n_components=2, random_state=42)
    mfa_fit_kwargs = {'groups': mfa_groups}
am = am.fit(df[mfa_cols], **mfa_fit_kwargs)
coords_mfa = am.transform(df[mfa_cols])

# Convert to a plain array before naming the columns: passing a DataFrame
# together with `columns=` to pd.DataFrame selects columns by label and would
# produce all-NaN 'MFA1'/'MFA2' columns.
coords_values = np.asarray(coords_mfa)[:, :2]
df_mfa = pd.concat(
    [
        pd.DataFrame(coords_values, columns=['MFA1', 'MFA2']),
        # The target is exported too, as in the other scenarios, so the file
        # can be used directly for modelling.
        df[features_bin + ['outcome']].reset_index(drop=True),
    ],
    axis=1,
)
df_mfa.to_csv('processed/data_mfa.csv', index=False)

# 4.4 Kohonen self-organizing map (SOM)
# Standardize the continuous variables (z-score) before training the map.
df_som = df.copy()
X = (df_som[features_num] - df_som[features_num].mean()) / df_som[features_num].std()
samples = X.values

# Grid size is named once so the flat cell index below cannot drift from it.
som_rows, som_cols = 3, 3
som = MiniSom(
    x=som_rows,
    y=som_cols,
    input_len=len(features_num),  # one input dimension per continuous variable
    sigma=1.0,
    learning_rate=0.5,
    random_seed=42,  # reproducible map, same seed as the other scenarios
)
som.random_weights_init(samples)
som.train_random(samples, 100)

# Best-matching unit (grid coordinates) of every row, flattened in row-major
# order into a single cluster id in [0, som_rows * som_cols).
bmus = np.array([som.winner(x) for x in samples])
df_som['SOM_cluster'] = bmus[:, 0] * som_cols + bmus[:, 1]
df_som.to_csv('processed/data_som.csv', index=False)

# 4.5 MICE-style imputation of the continuous variables
# Only the continuous columns are passed to the imputer; the binary features
# and the target are exported unchanged.
# NOTE(review): with these settings IterativeImputer appears to yield a single
# completed dataset rather than several imputed copies -- confirm that this is
# what "multiple imputation" is meant to cover here.
imp = IterativeImputer(random_state=42)
imputed_values = imp.fit_transform(df[features_num])

df_mice = df.copy()
df_mice[features_num] = imputed_values
df_mice.to_csv('processed/data_mice.csv', index=False)

# %%
# 5. Summary of the generated files
# Lists everything currently present in 'processed', in directory order.
print("Fichiers générés :")
generated = os.listdir('processed')
for f in generated:
    print(f"- processed/{f}")

# Fin du notebook 03_Intermediate_Preprocessing.ipynb
