In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import mlflow
import plotly.graph_objects as go
import plotly.express as px
import mlflow.sklearn
import mlflow.xgboost
import xgboost as xgb
import optuna
import os
import pickle

from dotenv import load_dotenv
from sklearn.model_selection import train_test_split
from scipy.stats import norm
from sksurv.metrics import concordance_index_censored

##### Validation du GPU

In [None]:
# GPU smoke test: train a single boosting round on a tiny matrix with CUDA.
try:
    # Two-row dummy matrix, just enough for XGBoost to build one tree.
    data = xgb.DMatrix([[1, 2], [3, 4]], label=[1, 0])

    # 'gpu_hist' is deprecated since XGBoost 2.0. The GPU is now selected with
    # tree_method='hist' + device='cuda', the same pair used by the training
    # parameters later in this notebook.
    params = {'tree_method': 'hist', 'device': 'cuda'}
    xgb.train(params, data, num_boost_round=1)
    print("‚úÖ Succ√®s ! La RTX 4060 est reconnue et configur√©e.")
except Exception as e:
    # Deliberately broad: this cell is only a diagnostic, so any failure is
    # reported instead of stopping the notebook.
    print(f"‚ùå √âchec du GPU : {e}")
    print("Le mod√®le tournera sur CPU par d√©faut.")

##### Visu dataset

In [None]:
# Notebook-wide pandas display settings: show every column, at most 10 rows,
# and floats with 4 decimals.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 10),
    ('display.float_format', '{:.4f}'.format),
):
    pd.set_option(option_name, option_value)

# Load the full dataset and show its size and first rows.
df = pd.read_parquet('dataset_full.parquet')

print(f"Structure du dataset : {df.shape[0]} lignes et {df.shape[1]} colonnes")
display(df.head())

---

##### Préparation du dataset pour modèle

In [None]:
# --- ADJUSTMENTS FOR THE AFT SURVIVAL MODEL ---
# Builds `df_final` from `df`: adds a departmental risk feature, fixes dtypes,
# encodes categoricals, drops identifier/geographic columns and creates the
# AFT interval targets `y_lower` / `y_upper`.

# 1. GEOGRAPHIC CLEANING
# Department code as a zero-padded string, then mean of `fermeture` per department.
# NOTE(review): the mean is computed on the full frame, i.e. before the
# train/test split done later in the notebook, so held-out rows contribute to
# this feature — confirm this target encoding is acceptable.
df['Code du d√©partement de l\'√©tablissement'] = df['Code du d√©partement de l\'√©tablissement'].astype(str).str.zfill(2)
dep_risk_map = df.groupby("Code du d√©partement de l'√©tablissement")["fermeture"].mean()
df['risque_departemental'] = df['Code du d√©partement de l\'√©tablissement'].map(dep_risk_map)

# 2. TYPE HANDLING
# Legal category becomes a string (it is one-hot encoded below); missing
# headcount brackets are replaced by 0.
df['Cat√©gorie juridique de l\'unit√© l√©gale'] = df['Cat√©gorie juridique de l\'unit√© l√©gale'].astype(str)
df['age_estime'] = df['age_estime'].astype(float)
df['Tranche_effectif_num'] = df['Tranche_effectif_num'].fillna(0).astype(float)

# 3. VARIABLE ENCODING

# Binary flag: 'O' -> 1, 'N' -> 0; any other or missing value -> 0.
df['is_ess'] = df['Economie sociale et solidaire unit√© l√©gale'].map({'O': 1, 'N': 0}).fillna(0).astype(int)

# B. One-Hot Encoding
# drop_first=True removes one level per variable; resulting columns are
# prefixed 'APE_' and 'CJ_'.

df_final = pd.get_dummies(
    df, 
    columns=['libelle_section_ape', 'Cat√©gorie juridique de l\'unit√© l√©gale'], 
    prefix=['APE', 'CJ'],
    drop_first=True,
    dtype=int 
)

# 4. FINAL SELECTION
# Columns removed from the modelling frame; names absent from the frame are
# skipped so the drop never fails.

cols_to_drop = [
    'Code postal de l\'√©tablissement', 'Code commune de l\'√©tablissement',
    'Activit√© principale de l\'unit√© l√©gale', 'Date_fermeture_finale', 
    'latitude', 'longitude', 'code_ape',
    'Code du d√©partement de l\'√©tablissement', 'Code de la r√©gion de l\'√©tablissement',
    'Economie sociale et solidaire unit√© l√©gale'
]
df_final = df_final.drop(columns=[c for c in cols_to_drop if c in df_final.columns])

# 5. FINAL ADJUSTMENTS FOR SURVIVAL (crucial for AFT)
# Keep only rows with a strictly positive age (presumably because the AFT
# objective works on the logarithm of time — confirm).

df_final = df_final[df_final['age_estime'] > 0].copy()

# Target columns for the AFT model:
# y_lower: age at the last moment the company is known to be alive
# y_upper: age at closure (or +inf if the company is still open)
df_final['y_lower'] = df_final['age_estime']
df_final['y_upper'] = np.where(df_final['fermeture'] == 1, df_final['age_estime'], np.inf)

print(f"‚úÖ Dataset finalis√© : {df_final.shape[0]} lignes, {df_final.shape[1]} colonnes.")

##### Inspection des valeurs rares pour limiter le bruit

In [None]:
# --- 1. FREQUENCY ANALYSIS ---
# Names of the catch-all columns produced by the merge step below. They are
# excluded from the rarity analysis so that re-running this cell can never
# classify a catch-all bucket as rare, fold it into itself and then drop it.
other_ape_col = 'APE_Autres_Secteurs'
other_cj_col = 'CJ_Autres_Status'

binary_cols = [
    c for c in df_final.columns
    if c.startswith(('APE_', 'CJ_')) and c not in (other_ape_col, other_cj_col)
]
# Share of rows (in percent) where each dummy column equals 1.
frequencies = df_final[binary_cols].mean().sort_values(ascending=False) * 100

# Rarity threshold, in percent of rows (0.1%).
rare_limit = 0.1
rare_cols = frequencies[frequencies < rare_limit]

print(f"--- üîç Analyse des colonnes rares (< {rare_limit}%) ---")
print(f"Il y a {len(rare_cols)} colonnes concern√©es.")

# --- 2. MERGE RARE CATEGORIES ---
# Split the rare columns by family (activity section vs legal category).
rare_ape_cols = [c for c in rare_cols.index if c.startswith('APE_')]
rare_cj_cols = [c for c in rare_cols.index if c.startswith('CJ_')]


def _merge_rare_dummies(frame, rare, other_col):
    """Collapse the dummy columns listed in `rare` into one indicator column.

    Args:
        frame: DataFrame holding the dummy columns.
        rare: list of column names to merge and remove.
        other_col: name of the catch-all indicator to create or extend.

    Returns:
        A new DataFrame without the `rare` columns, where `other_col` is 1 for
        rows that had any of the rare dummies set (or already had `other_col`
        set). `frame` is returned unchanged when `rare` is empty.
    """
    if not rare:
        return frame
    sources = list(rare)
    if other_col in frame.columns:
        # Keep what a previous run already put into the catch-all bucket.
        sources.append(other_col)
    merged = frame[sources].any(axis=1).astype(int)
    return frame.drop(columns=rare).assign(**{other_col: merged})


df_final = _merge_rare_dummies(df_final, rare_ape_cols, other_ape_col)
df_final = _merge_rare_dummies(df_final, rare_cj_cols, other_cj_col)

# --- 3. SURVIVAL TARGETS (AFT) ---
# Same definition as in the preparation cell (lower bound = observed age,
# upper bound = observed age for closed companies, +inf otherwise); recomputed
# here so this cell ends with targets that match the current rows.
df_final['y_lower'] = df_final['age_estime']
df_final['y_upper'] = np.where(df_final['fermeture'] == 1, df_final['age_estime'], np.inf)

print(f"‚úÖ Nettoyage et pr√©paration AFT termin√©s.")
print(f"üìä Nouveau nombre de colonnes : {len(df_final.columns)}")
display(df_final[['age_estime', 'fermeture', 'y_lower', 'y_upper']].head())

In [None]:
# Preview the first rows of the prepared modelling frame.
df_final.head()

In [None]:
# Quick format check: number of columns, and whether both AFT target columns
# ('y_lower' and 'y_upper') are present in df_final.
print(f"Colonnes actuelles : {df_final.shape[1]}")
print(f"Cibles pr√©sentes : {'y_lower' in df_final and 'y_upper' in df_final}")

---

#### Premier train

In [None]:
# --- 1. Data preparation ---

# Age is duplicated under a feature-specific name so that it stays in X while
# 'age_estime' is removed together with the other target columns.
# NOTE(review): 'age_au_diagnostic' is an exact copy of 'age_estime', which is
# also used below as the AFT lower-bound label (and as the upper bound for
# closed companies), so the model receives its own label as a feature.
# Confirm this is intended.
df_final['age_au_diagnostic'] = df_final['age_estime']

# Everything non-numeric plus the target columns is excluded from the features;
# 'age_au_diagnostic' is always kept.
non_numeric_cols = df_final.select_dtypes(exclude=[np.number]).columns.tolist()
targets = ['fermeture', 'age_estime', 'y_lower', 'y_upper']
excluded = set(non_numeric_cols).union(targets)
excluded.discard('age_au_diagnostic')
to_drop = list(excluded)

X = df_final.drop(columns=to_drop)
y_time = df_final['age_estime']
y_event = df_final['fermeture'].astype(int)

print(f"üìä Variables utilis√©es (dont l'√¢ge) : {X.columns.tolist()}")

# 80/20 split with a fixed seed; features, times and event flags are split together.
split = train_test_split(X, y_time, y_event, test_size=0.2, random_state=42)
X_train, X_test, y_train_time, y_test_time, y_train_event, y_test_event = split

# --- 2. AFT formatting (native XGBoost API) ---

# Training matrix; the survival interval is attached as float info fields.
dtrain = xgb.DMatrix(X_train)

# Upper bound: observed age for closed companies, +inf for those still open.
y_upper_train = np.where(y_train_event == 1, y_train_time, np.inf)

dtrain.set_float_info('label_lower_bound', y_train_time.values)
dtrain.set_float_info('label_upper_bound', y_upper_train)

# Test matrix carries features only (used for prediction).
dtest = xgb.DMatrix(X_test)

# --- 3. GPU parameters ---
params = {
    'objective': 'survival:aft',
    'eval_metric': 'aft-nloglik',
    'tree_method': 'hist',
    'device': 'cuda',
    'max_depth': 4,
    'learning_rate': 0.01,
    'aft_loss_distribution': 'logistic',
    'aft_loss_distribution_scale': 1.5,
}

# --- 4. Training ---
mlflow.set_experiment("Survie_Entreprises_XGBoost")


def get_risk_calibrated(t_years, predicted_log_time, sigma, dist='logistic', calibration_factor=2.5):
    """Probability that closure happens within `t_years` under an AFT model.

    The AFT model assumes ln(T) = mu + sigma * Z, so
    P(T <= t) = F_Z((ln(t) - mu) / sigma), with mu = predicted_log_time - calibration_factor.

    Args:
        t_years: horizon in years (scalar or array, must be > 0).
        predicted_log_time: model prediction on the log-time scale.
        sigma: value of `aft_loss_distribution_scale` used for training.
        dist: noise distribution, one of 'normal', 'logistic' or 'extreme'
            (the values accepted by `aft_loss_distribution`).
        calibration_factor: amount subtracted from the predicted log-time
            before computing the probability. The default (2.5) is the shift
            that used to be hard-coded; pass 0.0 for the plain AFT CDF.

    Returns:
        Probability in [0, 1], same shape as the broadcast inputs.

    Raises:
        ValueError: if `dist` is not a supported distribution name.
    """
    z = (np.log(t_years) - (predicted_log_time - calibration_factor)) / sigma

    if dist == 'normal':
        return norm.cdf(z)
    if dist == 'logistic':
        # tanh form of 1 / (1 + exp(-z)): same value, no overflow for large |z|.
        return 0.5 * (1.0 + np.tanh(0.5 * z))
    if dist == 'extreme':
        # CDF of the extreme-value noise used by XGBoost AFT: 1 - exp(-exp(z)).
        return -np.expm1(-np.exp(z))
    raise ValueError(f"Unsupported AFT distribution: {dist!r}")


# Shift applied to the predicted log-time when computing risks.
# NOTE(review): the previous version applied a hand-tuned 2.5 shift to
# predictions that were on the time scale rather than the log-time scale.
# Now that the predictions are real log-times, the shift is set to 0.0 so the
# risks are the model's own CDF. The alert thresholds used downstream should
# be re-checked against the new risk distribution.
calibration_shift = 0.0

with mlflow.start_run(run_name="XGB_AFT_RTX4060"):
    # Record the configuration so the run is not empty in the tracking UI.
    mlflow.log_params(params)

    evals_result = {}
    bst = xgb.train(
        params,
        dtrain,
        num_boost_round=500,
        evals=[(dtrain, 'train')],
        evals_result=evals_result,
        verbose_eval=50,
    )
    mlflow.log_metric("train_aft_nloglik", evals_result['train']['aft-nloglik'][-1])

    # --- 5. Diagnostics and prediction ---
    # For 'survival:aft', Booster.predict() returns the survival time on the
    # original scale (the margin is exponentiated). output_margin=True returns
    # the raw log-time, which is what the code below expects.
    preds_log_time = bst.predict(dtest, output_margin=True)

    print(f"--- ANALYSE DES PR√âDICTIONS ---")
    print(f"Log-time min: {preds_log_time.min():.2f}")
    print(f"Log-time max: {preds_log_time.max():.2f}")
    print(f"Exemple d'√¢ge de survie pr√©dit : {np.exp(preds_log_time[:5])} ans")

    avg_log = preds_log_time.mean()
    print(f"DEBUG - Moyenne log_time : {avg_log:.2f}")
    print(f"DEBUG - Esp√©rance de survie moyenne : {np.exp(avg_log):.1f} ans")

    # --- 6. Risk computation ---
    # Distribution settings are read back from the training parameters so the
    # risk formula always matches the fitted model.
    scale = params['aft_loss_distribution_scale']
    dist_type = params['aft_loss_distribution']

    # Closure probability (in percent) at 1, 2 and 3 years.
    risques_1an = get_risk_calibrated(1, preds_log_time, scale, dist=dist_type, calibration_factor=calibration_shift) * 100
    risques_2ans = get_risk_calibrated(2, preds_log_time, scale, dist=dist_type, calibration_factor=calibration_shift) * 100
    risques_3ans = get_risk_calibrated(3, preds_log_time, scale, dist=dist_type, calibration_factor=calibration_shift) * 100

    # One row per test company, aligned on the test index.
    df_res = pd.DataFrame({
        'Risque_1_an': risques_1an,
        'Risque_2_ans': risques_2ans,
        'Risque_3_ans': risques_3ans
    }, index=X_test.index)

    print("\nüöÄ Nouveaux r√©sultats :")
    display(df_res.head())

In [None]:
# 1. Identification columns (SIREN + company name) for the test rows, taken
#    from the original frame through the test index.
identity_cols = ['SIREN', "D√©nomination de l'unit√© l√©gale"]
df_identite = df.loc[X_test.index, identity_cols].copy()

# 2. Put the computed risks next to the identification columns.
resultat_final = pd.concat([df_identite, df_res], axis=1)

# 3. Cr√©ation du Statut Expert (Logique de scoring)
def alert_level(risk_3y):
    """Map a 3-year risk (in percent) to a status label.

    Thresholds are strict: above 30 is critical, above 10 is watch-list,
    above 1 is stable, anything else (including NaN, for which every
    comparison is false) is solid.
    """
    tiers = (
        (30, "üî¥ CRITIQUE"),
        (10, "üü† SURVEILLANCE"),
        (1, "üü° STABLE"),
    )
    for floor, label in tiers:
        if risk_3y > floor:
            return label
    return "üü¢ SOLIDE"

# Status label derived from the 3-year risk.
resultat_final['Statut_Expert'] = resultat_final['Risque_3_ans'].apply(alert_level)

# 4. Sort so that the riskiest companies come first.
resultat_final = resultat_final.sort_values(by='Risque_3_ans', ascending=False)

# Show the 15 riskiest companies. The frame is sorted by descending risk, so
# they are at the head; tail(15) was showing the 15 safest ones instead.
print("üìã TOP 15 - ANALYSE DE RISQUE √Ä 3 ANS")
display(resultat_final.head(15))

In [None]:
# --- 1. Feature importance plot ---
# plot_importance() creates its own figure when no axes are given, which made
# the previous plt.figure(figsize=...) an empty extra figure and ignored the
# requested size. The axes are now created explicitly and passed in.
fig_importance, ax_importance = plt.subplots(figsize=(10, 8))
xgb.plot_importance(
    bst,
    ax=ax_importance,
    max_num_features=15,
    importance_type='weight',
    title='Facteurs de Risque D√©terminants',
)
plt.show()

# --- 2. Mean age of the companies flagged as critical ---
critical_index = resultat_final[resultat_final['Statut_Expert'] == 'üî¥ CRITIQUE'].index
age_moyen_critique = df_final.loc[critical_index, 'age_au_diagnostic'].mean()
print(f"üí° L'√¢ge moyen des entreprises en statut CRITIQUE est de : {age_moyen_critique:.1f} ans")

In [None]:
# 1. Number of companies per status
repartition = resultat_final['Statut_Expert'].value_counts()

# 2. Colour assigned to each status label
colors_map = {
    'üî¥ CRITIQUE': '#e74c3c',
    'üü† SURVEILLANCE': '#e67e22',
    'üü° STABLE': '#f1c40f',
    'üü¢ SOLIDE': '#2ecc71',
}
colors = list(map(colors_map.__getitem__, repartition.index))

# 3. Pie chart drawn on explicit axes
fig, ax = plt.subplots(figsize=(10, 7), facecolor='white')
ax.pie(
    repartition,
    labels=repartition.index,
    autopct='%1.1f%%',
    startangle=140,
    colors=colors,
    pctdistance=0.85,
    explode=[0.05] * len(repartition),  # pull every slice slightly outwards
)

# A white disc in the centre turns the pie into a donut.
centre_circle = plt.Circle((0, 0), 0.70, fc='white')
ax.add_artist(centre_circle)

ax.set_title("R√©partition de la Sant√© du Portefeuille (Survie 3 ans)", fontsize=15, pad=20)
ax.axis('equal')
fig.tight_layout()
plt.show()

In [None]:
# Mean age of the companies labelled as solid, looked up in df_final through
# the index of the matching result rows.
solid_rows = resultat_final['Statut_Expert'] == 'üü¢ SOLIDE'
solid_index = resultat_final[solid_rows].index
age_solide = df_final.loc[solid_index, 'age_au_diagnostic'].mean()
print(f"üë¥ L'√¢ge moyen des entreprises SOLIDES est de : {age_solide:.1f} ans")

#### Inspection de la métrique

In [None]:
# 1. Test DMatrix carrying the survival interval labels needed by the metric:
#    lower bound = observed age, upper bound = observed age for closed
#    companies and +inf for those still open.
dtest_metrics = xgb.DMatrix(X_test)
y_upper_test = np.where(y_test_event == 1, y_test_time, np.inf)

for info_name, bound_values in (
    ('label_lower_bound', y_test_time.values),
    ('label_upper_bound', y_upper_test),
):
    dtest_metrics.set_float_info(info_name, bound_values)

# 2. Evaluate the trained booster on the labelled test matrix (the metric is
#    the one configured in the training parameters).
results = bst.eval(dtest_metrics)
print(f"üìä Performance du mod√®le (Log-Likelihood) : {results}")

In [None]:
# Final summary. Every figure is read from the variables computed earlier in
# the notebook instead of being typed in by hand, so the report cannot drift
# from the actual run.

# bst.eval() returns a text line of the form "[0]\teval-aft-nloglik:1.385400";
# the score is the number after the last ':'.
test_nloglik = float(results.rsplit(':', 1)[-1])

n_total = len(resultat_final)
# .get(..., 0) keeps the report working when a status has no company at all.
n_solide = int(repartition.get('üü¢ SOLIDE', 0))
n_critique = int(repartition.get('üî¥ CRITIQUE', 0))

print("="*50)
print("üìå RAPPORT FINAL D'ANALYSE DE SURVIE")
print("="*50)
print(f"‚úÖ Performance (AFT-NLogLik) : {test_nloglik:.4f}")
print(f"üè¢ Nombre d'entreprises analys√©es : {n_total}")
print("-"*50)
print(f"üü¢ Portefeuille sain : {n_solide} ({n_solide/n_total*100:.1f}%)")
print(f"üî¥ Portefeuille critique : {n_critique} ({n_critique/n_total*100:.1f}%)")
print("-"*50)
print(f"üí° √Çge moyen (Profil Critique) : {age_moyen_critique:.1f} ans")
print(f"üë¥ √Çge moyen (Profil Solide) : {age_solide:.1f} ans")
print("="*50)

In [None]:
# Look for exceptions: companies older than 10 years that are still flagged
# as critical.
is_critical = resultat_final['Statut_Expert'] == 'üî¥ CRITIQUE'
is_older_than_10 = df_final.loc[resultat_final.index, 'age_au_diagnostic'] > 10
vieilles_et_critiques = resultat_final[is_critical & is_older_than_10]
print(f"üïµÔ∏è Nombre d'entreprises de +10 ans pourtant jug√©es CRITIQUES : {len(vieilles_et_critiques)}")

---

##### Entraînement dans MLFlow via le space

In [None]:
# # 1. CONFIGURATION CONNEXION HUGGING FACE
# load_dotenv()

# mlflow_uri = os.getenv("MLFLOW_TRACKING_URI")
# mlflow_user = os.getenv("MLFLOW_TRACKING_USERNAME")
# mlflow_pass = os.getenv("MLFLOW_TRACKING_PASSWORD")

# mlflow.set_tracking_uri(mlflow_uri)
# os.environ['MLFLOW_TRACKING_USERNAME'] = mlflow_user
# os.environ['MLFLOW_TRACKING_PASSWORD'] = mlflow_pass

# mlflow.set_experiment("XGBoost_Survival_business_risk")

# # --- 1. PR√âPARATION DES DONN√âES DE TEST  --------
# y_upper_test = np.where(y_test_event == 1, y_test_time, np.inf)
# dtest_optuna = xgb.DMatrix(X_test)
# dtest_optuna.set_float_info('label_lower_bound', y_test_time.values)
# dtest_optuna.set_float_info('label_upper_bound', y_upper_test)

# # --- 2. D√âFINITION DE L'OBJECTIF ---
# def objective(trial):
#     params = {
#         'objective': 'survival:aft',
#         'eval_metric': 'aft-nloglik',
#         'tree_method': 'hist',
#         'device': 'cuda',
        
#         # Plages d'optimisation
#         'max_depth': trial.suggest_int('max_depth', 4, 10),
#         'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.05, log=True),
#         'min_child_weight': trial.suggest_int('min_child_weight', 1, 15),
#         'lambda': trial.suggest_float('lambda', 1e-2, 10.0, log=True),
#         'alpha': trial.suggest_float('alpha', 1e-2, 10.0, log=True),
#         'aft_loss_distribution': trial.suggest_categorical('aft_loss_distribution', ['logistic', 'normal']),
#         'aft_loss_distribution_scale': trial.suggest_float('aft_loss_distribution_scale', 1.0, 2.0),
#         'subsample': trial.suggest_float('subsample', 0.6, 0.9),
#         'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 0.9),
#     }

#     with mlflow.start_run(nested=True):
#         bst_trial = xgb.train(
#             params, 
#             dtrain, 
#             num_boost_round=1500,
#             evals=[(dtest_optuna, 'test')],
#             early_stopping_rounds=50,
#             verbose_eval=False
#         )
        
#         score = bst_trial.best_score
#         mlflow.log_params(params)
#         mlflow.log_metric("test_nloglik", score)
        
#         return score

# # --- 3. LANCEMENT MLFLOW + OPTUNA ---
# mlflow.set_experiment("XGBoost_Survival_Final")

# study = optuna.create_study(direction='minimize')

# with mlflow.start_run(run_name="GPU_Final_Optimization_Full_Portefeuille"):
#     study.optimize(objective, n_trials=30) 

# # --- 4. R√âSULTATS ---
# print(f"üî• Meilleure Log-Likelihood : {study.best_value:.4f}")
# print("üèÜ Meilleurs param√®tres :", study.best_params)

---

##### Sauvegarde du modèle avec résultats

In [None]:
# # 1. On r√©cup√®re les meilleurs param√®tres issus de l'√©tude Optuna
# best_params = study.best_params
# best_params.update({
#     'objective': 'survival:aft', 
#     'tree_method': 'hist', 
#     'device': 'cuda'
# })

# # 2. Entra√Ænement final (le "Champion")

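# NOTE(review): objective() never calls trial.set_user_attr('best_iteration', ...),
# so this lookup always falls back to 1000. Store bst_trial.best_iteration in the
# trial's user attributes before re-enabling this cell.
# best_iteration = study.best_trial.user_attrs.get('best_iteration', 1000)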

# final_bst = xgb.train(
#     best_params, 
#     dtrain, 
#     num_boost_round=best_iteration
# )

# # 3. Sauvegardes locales

# final_bst.save_model("xgboost_v2.json")

# # --- Format PKL (Pickle)
# with open("xgboost_v2.pkl", "wb") as f:
#     pickle.dump(final_bst, f)

# # 4. Enregistrement dans MLflow (Hugging Face)
# with mlflow.start_run(run_name="FINAL_MODEL_DEPLOYMENT"):

#     mlflow.xgboost.log_model(final_bst, artifact_path="survival_model")
    

#     mlflow.log_artifact("xgboost_v2.pkl")
    
#     # Log des param√®tres et de la m√©trique finale
#     mlflow.log_params(best_params)
#     mlflow.log_metric("final_nloglik", study.best_value)
    
#     print("="*50)
#     print("üöÄ Mod√®le champion archiv√© !")
#     print(f"üì¶ Fichiers cr√©√©s : xgboost_v2.json, xgboost_v2.pkl")
#     print(f"üìä Performance finale : {study.best_value:.4f}")
#     print("="*50)