In [None]:
# ====================================================================
# NOTEBOOK 3: COMPARAISON ET ÉVALUATION DES MODÈLES ML
# ====================================================================
# Ce notebook compare différents algorithmes de machine learning pour
# prédire les variables cibles du forage pétrolier et sélectionne
# les meilleurs modèles.

In [None]:
# ====================================================================
# IMPORTS ET CONFIGURATION
# ====================================================================

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# ML imports
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.metrics import (
    mean_squared_error, mean_absolute_error, r2_score,
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, classification_report, confusion_matrix
)

# Mod√®les de r√©gression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor

# Mod√®les de classification
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB

# XGBoost et LightGBM si disponibles
try:
    from xgboost import XGBRegressor, XGBClassifier
    XGBOOST_AVAILABLE = True
except ImportError:
    XGBOOST_AVAILABLE = False

try:
    from lightgbm import LGBMRegressor, LGBMClassifier
    LIGHTGBM_AVAILABLE = True
except ImportError:
    LIGHTGBM_AVAILABLE = False

import warnings
# Blanket filter: silences every warning category for the rest of the session.
warnings.filterwarnings('ignore')
from sklearn.exceptions import ConvergenceWarning
# Already covered by the blanket filter above; kept as an explicit statement of intent.
warnings.filterwarnings('ignore', category=ConvergenceWarning)

# Plot configuration: style sheet, colour palette and default figure size
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (12, 8)

print("ü§ñ COMPARAISON ET √âVALUATION DES MOD√àLES ML")
print("=" * 60)

# Make the project-local packages under ../src importable. The path is relative,
# so it resolves against the kernel's working directory.
import sys
sys.path.append('../src')

# Must come after the sys.path change above, otherwise `utils` cannot be resolved.
from utils.metrics import RegressionMetrics, ClassificationMetrics, MetricsReporter

ü§ñ COMPARAISON ET √âVALUATION DES MOD√àLES ML


In [None]:
# ====================================================================
# CHARGEMENT DES DONNÉES PREPROCESSÉES
# ====================================================================

In [None]:
print("üìä Chargement des donn√©es preprocess√©es...")

# Charger les datasets cr√©√©s dans le notebook pr√©c√©dent
datasets = {}
dataset_names = ['formation_pressure', 'rop_prediction', 'kick_detection']

for name in dataset_names:
    try:
        filepath = f'../data/processed/{name}_features.csv'
        df = pd.read_csv(filepath)
        datasets[name] = df
        print(f"‚úÖ {name}: {df.shape}")
    except FileNotFoundError:
        print(f"‚ö†Ô∏è {filepath} non trouv√©")

# Si pas de donn√©es preprocess√©es, utiliser des donn√©es synth√©tiques
if not datasets:
    print("‚ö†Ô∏è Cr√©ation de donn√©es synth√©tiques pour la d√©monstration...")
    sys.path.append('../src')
    from data.data_loader import DataLoader
    
    loader = DataLoader()
    synthetic_data = loader.load_synthetic_drilling_data(n_samples=3000, random_seed=42)
    
    # Cr√©er des datasets simplifi√©s
    formation_cols = ['Depth', 'WOB', 'RPM', 'MudWeight', 'Temperature', 'FormationPressure']
    if all(col in synthetic_data.columns for col in formation_cols):
        datasets['formation_pressure'] = synthetic_data[formation_cols].copy()
    
    rop_cols = ['WOB', 'RPM', 'FlowRateIn', 'MudWeight', 'Torque', 'ROP']  
    if all(col in synthetic_data.columns for col in rop_cols):
        datasets['rop_prediction'] = synthetic_data[rop_cols].copy()
        
    kick_cols = ['FlowRateIn', 'FlowRateOut', 'StandpipePressure', 'CasingPressure', 'WOB', 'RPM', 'Kick']
    if all(col in synthetic_data.columns for col in kick_cols):
        datasets['kick_detection'] = synthetic_data[kick_cols].copy()

print(f"üìã Datasets disponibles: {list(datasets.keys())}")

üìä Chargement des donn√©es preprocess√©es...
‚ö†Ô∏è ../data/processed/formation_pressure_features.csv non trouv√©
‚ö†Ô∏è ../data/processed/rop_prediction_features.csv non trouv√©
‚ö†Ô∏è ../data/processed/kick_detection_features.csv non trouv√©
‚ö†Ô∏è Cr√©ation de donn√©es synth√©tiques pour la d√©monstration...


SyntaxError: unterminated string literal (detected at line 686) (data_loader.py, line 686)

In [None]:
# ====================================================================
# DÉFINITION DES MODÈLES À TESTER
# ====================================================================

In [9]:
print(f"\nü§ñ D√âFINITION DES MOD√àLES")
print("-" * 40)

def get_regression_models():
    """Retourne un dictionnaire des mod√®les de r√©gression √† tester"""
    models = {
        'Linear Regression': LinearRegression(),
        'Ridge': Ridge(alpha=1.0),
        'Lasso': Lasso(alpha=1.0),
        'ElasticNet': ElasticNet(alpha=1.0, l1_ratio=0.5),
        'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
        'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
        'AdaBoost': AdaBoostRegressor(n_estimators=100, random_state=42),
        'SVR': SVR(kernel='rbf'),
        'KNN': KNeighborsRegressor(n_neighbors=5),
        'Decision Tree': DecisionTreeRegressor(random_state=42),
        'MLP': MLPRegressor(hidden_layer_sizes=(100,), max_iter=500, random_state=42)
    }
    
    if XGBOOST_AVAILABLE:
        models['XGBoost'] = XGBRegressor(n_estimators=100, random_state=42)
    
    if LIGHTGBM_AVAILABLE:
        models['LightGBM'] = LGBMRegressor(n_estimators=100, random_state=42, verbose=-1)
    
    return models

def get_classification_models():
    """Build the classification estimators to benchmark.

    Returns:
        dict: display name -> new, unfitted estimator, in the order the models
        are listed. XGBoost and LightGBM entries are appended only when the
        corresponding availability flag is set.
    """
    seed = 42
    n_trees = 100

    entries = [
        ('Logistic Regression', LogisticRegression(random_state=seed, max_iter=1000)),
        ('Random Forest', RandomForestClassifier(n_estimators=n_trees, random_state=seed)),
        ('Gradient Boosting', GradientBoostingClassifier(n_estimators=n_trees, random_state=seed)),
        ('AdaBoost', AdaBoostClassifier(n_estimators=n_trees, random_state=seed)),
        # probability=True so that SVC exposes predict_proba like the other models
        ('SVC', SVC(kernel='rbf', probability=True, random_state=seed)),
        ('KNN', KNeighborsClassifier(n_neighbors=5)),
        ('Decision Tree', DecisionTreeClassifier(random_state=seed)),
        ('MLP', MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=seed)),
        ('Naive Bayes', GaussianNB()),
    ]

    # Optional boosters: their classes exist only if the guarded imports succeeded.
    if XGBOOST_AVAILABLE:
        entries.append(('XGBoost', XGBClassifier(n_estimators=n_trees, random_state=seed, eval_metric='logloss')))
    if LIGHTGBM_AVAILABLE:
        entries.append(('LightGBM', LGBMClassifier(n_estimators=n_trees, random_state=seed, verbose=-1)))

    return dict(entries)

# Shared model catalogues reused by every comparison cell below
regression_models = get_regression_models()
classification_models = get_classification_models()

# Print each catalogue as a header line followed by one bullet per model
_catalogue_listing = (
    (f"üî¢ Mod√®les de r√©gression: {len(regression_models)}", regression_models),
    (f"\nüéØ Mod√®les de classification: {len(classification_models)}", classification_models),
)
for _header, _catalogue in _catalogue_listing:
    print(_header)
    for name in _catalogue:
        print(f"  ‚Ä¢ {name}")


ü§ñ D√âFINITION DES MOD√àLES
----------------------------------------
üî¢ Mod√®les de r√©gression: 13
  ‚Ä¢ Linear Regression
  ‚Ä¢ Ridge
  ‚Ä¢ Lasso
  ‚Ä¢ ElasticNet
  ‚Ä¢ Random Forest
  ‚Ä¢ Gradient Boosting
  ‚Ä¢ AdaBoost
  ‚Ä¢ SVR
  ‚Ä¢ KNN
  ‚Ä¢ Decision Tree
  ‚Ä¢ MLP
  ‚Ä¢ XGBoost
  ‚Ä¢ LightGBM

üéØ Mod√®les de classification: 11
  ‚Ä¢ Logistic Regression
  ‚Ä¢ Random Forest
  ‚Ä¢ Gradient Boosting
  ‚Ä¢ AdaBoost
  ‚Ä¢ SVC
  ‚Ä¢ KNN
  ‚Ä¢ Decision Tree
  ‚Ä¢ MLP
  ‚Ä¢ Naive Bayes
  ‚Ä¢ XGBoost
  ‚Ä¢ LightGBM


In [None]:
# ====================================================================
# FONCTION DE COMPARAISON DES MODÈLES
# ====================================================================

In [10]:
def compare_models(X, y, models, task_type='regression', cv=5, test_size=0.2, random_state=42):
    """
    Benchmark a set of estimators with cross-validation and a hold-out test split.

    Args:
        X: Feature matrix.
        y: Target. Must expose ``nunique()`` when task_type is 'classification'
            (it is used to decide whether the split is stratified).
        models: Mapping of display name -> estimator. Each estimator is fitted
            in place on the scaled training split.
        task_type: 'regression' (scored with RMSE); any other value is scored
            as classification with weighted F1.
        cv: Number of cross-validation folds.
        test_size: Fraction of the samples held out as test set.
        random_state: Seed for the train/test split.

    Returns:
        Tuple ``(results_df, scaler, X_train_scaled, X_test_scaled, y_train, y_test)``.
        ``results_df`` has one row per model, sorted best-first on the test
        metric; models that raised get a row with NaN scores and an 'Error: ...'
        status instead of aborting the whole comparison.
    """
    import time  # local import: `time` is not part of the notebook-level imports

    print(f"üîÑ Comparaison de {len(models)} mod√®les ({task_type})...")

    is_regression = task_type == 'regression'
    # Resolved before the loop so the error branch and the final sort can always
    # build their column names, even if the very first model fails.
    cv_metric = 'RMSE' if is_regression else 'F1'
    cv_scoring = 'neg_mean_squared_error' if is_regression else 'f1_weighted'

    # Train/test split (stratified for classification targets with few classes)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state,
        stratify=y if task_type == 'classification' and y.nunique() < 20 else None
    )

    # Standardise features; the scaler is fitted on the training split only.
    # NOTE(review): it is fitted on the whole training split *before* the CV folds
    # are drawn, so CV scores see scaling statistics from their validation folds.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    results = []

    for name, model in models.items():
        print(f"  Entra√Ænement: {name}...")

        try:
            cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=cv,
                                        scoring=cv_scoring, n_jobs=-1)
            if is_regression:
                # Scores are negated MSE per fold: convert each fold to RMSE first,
                # then aggregate, so mean and std are both expressed in RMSE units.
                fold_scores = np.sqrt(-cv_scores)
            else:
                fold_scores = cv_scores
            cv_score_mean = fold_scores.mean()
            cv_score_std = fold_scores.std()

            # Time only the final fit, not the cross-validation that precedes it
            start_time = time.perf_counter()
            model.fit(X_train_scaled, y_train)
            training_time = time.perf_counter() - start_time

            # Predictions on the hold-out set
            start_pred_time = time.perf_counter()
            y_pred = model.predict(X_test_scaled)
            prediction_time = time.perf_counter() - start_pred_time

            # Test metrics come from the project's metric helpers
            if is_regression:
                test_metrics = RegressionMetrics.calculate_all_metrics(y_test, y_pred)
                main_metric = test_metrics['rmse']
            else:
                # Column 1 is used as the positive-class probability
                y_proba = model.predict_proba(X_test_scaled)[:, 1] if hasattr(model, 'predict_proba') else None
                test_metrics = ClassificationMetrics.calculate_all_metrics(y_test, y_pred, y_proba)
                main_metric = test_metrics['f1_weighted']

            result = {
                'Model': name,
                f'CV_{cv_metric}_Mean': cv_score_mean,
                f'CV_{cv_metric}_Std': cv_score_std,
                f'Test_{cv_metric}': main_metric,
                'Training_Time_s': training_time,
                'Prediction_Time_s': prediction_time,
                'Status': 'Success'
            }

            # Task-specific extra metrics
            if is_regression:
                result.update({
                    'Test_R2': test_metrics['r2'],
                    'Test_MAE': test_metrics['mae'],
                    'Test_MAPE': test_metrics['mape']
                })
            else:
                result.update({
                    'Test_Accuracy': test_metrics['accuracy'],
                    'Test_Precision': test_metrics['precision_weighted'],
                    'Test_Recall': test_metrics['recall_weighted']
                })
                if 'auc_roc' in test_metrics:
                    result['Test_AUC'] = test_metrics['auc_roc']

            results.append(result)

        except Exception as e:
            # Deliberate best-effort: one failing model must not stop the benchmark.
            print(f"    ‚ùå Erreur avec {name}: {str(e)}")
            results.append({
                'Model': name,
                f'CV_{cv_metric}_Mean': np.nan,
                f'CV_{cv_metric}_Std': np.nan,
                f'Test_{cv_metric}': np.nan,
                'Training_Time_s': np.nan,
                'Prediction_Time_s': np.nan,
                'Status': f'Error: {str(e)[:50]}'
            })

    results_df = pd.DataFrame(results)

    # Best model first: lowest RMSE for regression, highest F1 otherwise.
    # The column is absent when `models` is empty, in which case nothing is sorted.
    sort_col = f'Test_{cv_metric}'
    if sort_col in results_df.columns:
        results_df = results_df.sort_values(sort_col, ascending=is_regression)

    return results_df, scaler, X_train_scaled, X_test_scaled, y_train, y_test

In [None]:
# ====================================================================
# COMPARAISON POUR FORMATION PRESSURE PREDICTION
# ====================================================================

In [11]:
if 'formation_pressure' in datasets:
    print(f"\nüéØ FORMATION PRESSURE PREDICTION")
    print("-" * 50)

    df_formation = datasets['formation_pressure']

    # Every column except the target is used as a feature
    target_col = 'FormationPressure'
    feature_cols = [col for col in df_formation.columns if col != target_col]

    # Missing values are replaced by the column median.
    # NOTE(review): this also imputes the *target*; dropping rows with a missing
    # target may be preferable - confirm before relying on these scores.
    X_formation = df_formation[feature_cols].fillna(df_formation[feature_cols].median())
    y_formation = df_formation[target_col].fillna(df_formation[target_col].median())

    print(f"üìä Dataset: {X_formation.shape[0]} √©chantillons, {X_formation.shape[1]} features")
    print(f"üéØ Target: {target_col} (min: {y_formation.min():.2f}, max: {y_formation.max():.2f})")

    # Benchmark all regression models on this task
    formation_results, formation_scaler, X_train_form, X_test_form, y_train_form, y_test_form = compare_models(
        X_formation, y_formation, regression_models, 'regression'
    )

    print(f"\nüèÜ R√âSULTATS - Formation Pressure:")
    display_cols = ['Model', 'CV_RMSE_Mean', 'CV_RMSE_Std', 'Test_RMSE', 'Test_R2', 'Training_Time_s']
    # Columns such as Test_R2 only exist if at least one model succeeded, so keep
    # only those actually present (same guard as the kick-detection cell).
    available_cols = [col for col in display_cols if col in formation_results.columns]
    print(formation_results[available_cols].round(4))

In [None]:
# ====================================================================
# COMPARAISON POUR ROP PREDICTION
# ====================================================================

In [12]:
if 'rop_prediction' in datasets:
    print(f"\nüéØ ROP PREDICTION")
    print("-" * 50)

    df_rop = datasets['rop_prediction']

    # Every column except the target is used as a feature
    target_col = 'ROP'
    feature_cols = [col for col in df_rop.columns if col != target_col]

    # Missing values are replaced by the column median.
    # NOTE(review): this also imputes the *target*; dropping rows with a missing
    # target may be preferable - confirm before relying on these scores.
    X_rop = df_rop[feature_cols].fillna(df_rop[feature_cols].median())
    y_rop = df_rop[target_col].fillna(df_rop[target_col].median())

    print(f"üìä Dataset: {X_rop.shape[0]} √©chantillons, {X_rop.shape[1]} features")
    print(f"üéØ Target: {target_col} (min: {y_rop.min():.2f}, max: {y_rop.max():.2f})")

    # Benchmark all regression models on this task
    rop_results, rop_scaler, X_train_rop, X_test_rop, y_train_rop, y_test_rop = compare_models(
        X_rop, y_rop, regression_models, 'regression'
    )

    print(f"\nüèÜ R√âSULTATS - ROP Prediction:")
    display_cols = ['Model', 'CV_RMSE_Mean', 'CV_RMSE_Std', 'Test_RMSE', 'Test_R2', 'Training_Time_s']
    # Columns such as Test_R2 only exist if at least one model succeeded, so keep
    # only those actually present (same guard as the kick-detection cell).
    available_cols = [col for col in display_cols if col in rop_results.columns]
    print(rop_results[available_cols].round(4))

In [None]:
# ====================================================================
# COMPARAISON POUR KICK DETECTION
# ====================================================================

In [13]:
if 'kick_detection' in datasets:
    print(f"\nüéØ KICK DETECTION")
    print("-" * 50)

    df_kick = datasets['kick_detection']

    # Features are all columns other than the label
    target_col = 'Kick'
    feature_cols = list(filter(lambda column: column != target_col, df_kick.columns))

    # Median-impute the features; a missing label is treated as class 0
    X_kick = df_kick[feature_cols]
    X_kick = X_kick.fillna(X_kick.median())
    y_kick = df_kick[target_col].fillna(0).astype(int)

    n_samples, n_features = X_kick.shape[0], X_kick.shape[1]
    print(f"üìä Dataset: {n_samples} √©chantillons, {n_features} features")
    print(f"üéØ Target: {target_col}")
    print(f"   R√©partition des classes: {y_kick.value_counts().to_dict()}")

    # Benchmark all classifiers on this task
    (kick_results, kick_scaler,
     X_train_kick, X_test_kick,
     y_train_kick, y_test_kick) = compare_models(
        X_kick, y_kick, classification_models, task_type='classification'
    )

    print(f"\nüèÜ R√âSULTATS - Kick Detection:")
    display_cols = ['Model', 'CV_F1_Mean', 'CV_F1_Std', 'Test_F1', 'Test_Accuracy', 'Test_AUC']
    # Some columns (e.g. Test_AUC) are optional, so show only those present
    available_cols = []
    for col in display_cols:
        if col in kick_results.columns:
            available_cols.append(col)
    print(kick_results[available_cols].round(4))

In [None]:
# ====================================================================
# VISUALISATIONS DES PERFORMANCES
# ====================================================================

In [14]:
print(f"\nüìä VISUALISATIONS DES PERFORMANCES")
print("-" * 50)

def plot_model_comparison(results_df, metric_col, title, task_type='regression'):
    """Visualise la comparaison des mod√®les"""
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    
    # 1. Performance par mod√®le (barplot)
    successful_results = results_df[results_df['Status'] == 'Success'].copy()
    
    if len(successful_results) > 0:
        # Trier par performance
        successful_results = successful_results.sort_values(metric_col, 
                                                           ascending=(task_type == 'regression'))
        
        colors = plt.cm.viridis(np.linspace(0, 1, len(successful_results)))
        bars = axes[0, 0].bar(range(len(successful_results)), successful_results[metric_col], 
                             color=colors, alpha=0.7)
        axes[0, 0].set_xticks(range(len(successful_results)))
        axes[0, 0].set_xticklabels(successful_results['Model'], rotation=45, ha='right')
        axes[0, 0].set_ylabel(metric_col)
        axes[0, 0].set_title(f'{title} - Performance')
        axes[0, 0].grid(True, alpha=0.3)
        
        # Ajouter les valeurs sur les barres
        for bar, value in zip(bars, successful_results[metric_col]):
            height = bar.get_height()
            axes[0, 0].text(bar.get_x() + bar.get_width()/2., height,
                           f'{value:.3f}', ha='center', va='bottom', fontsize=8)
    
    # 2. Temps d'entra√Ænement
    if 'Training_Time_s' in successful_results.columns:
        axes[0, 1].barh(range(len(successful_results)), successful_results['Training_Time_s'],
                       color='orange', alpha=0.7)
        axes[0, 1].set_yticks(range(len(successful_results)))
        axes[0, 1].set_yticklabels(successful_results['Model'])
        axes[0, 1].set_xlabel('Temps d\'entra√Ænement (s)')
        axes[0, 1].set_title('Temps d\'entra√Ænement')
        axes[0, 1].grid(True, alpha=0.3)
    
    # 3. Cross-validation vs Test (si disponible)
    cv_col = [col for col in successful_results.columns if col.startswith('CV_') and 'Mean' in col]
    if cv_col:
        cv_col = cv_col[0]
        axes[1, 0].scatter(successful_results[cv_col], successful_results[metric_col], 
                          alpha=0.7, s=100)
        
        # Ligne y=x pour performance parfaite
        min_val = min(successful_results[cv_col].min(), successful_results[metric_col].min())
        max_val = max(successful_results[cv_col].max(), successful_results[metric_col].max())
        axes[1, 0].plot([min_val, max_val], [min_val, max_val], 'r--', alpha=0.7)
        
        axes[1, 0].set_xlabel(f'Cross-Validation {cv_col.split("_")[1]}')
        axes[1, 0].set_ylabel(f'Test {metric_col.split("_")[1]}')
        axes[1, 0].set_title('CV vs Test Performance')
        axes[1, 0].grid(True, alpha=0.3)
        
        # Ajouter les noms des mod√®les
        for i, row in successful_results.iterrows():
            axes[1, 0].annotate(row['Model'], (row[cv_col], row[metric_col]), 
                               xytext=(5, 5), textcoords='offset points', fontsize=8)
    
    # 4. R√©sum√© des m√©triques multiples (radar chart ou heatmap)
    metric_cols = [col for col in successful_results.columns 
                  if col.startswith('Test_') and col != 'Test_Status']
    
    if len(metric_cols) > 1:
        # Normaliser les m√©triques pour la visualisation
        metrics_normalized = successful_results[metric_cols].copy()
        for col in metric_cols:
            if task_type == 'regression' and any(x in col.lower() for x in ['rmse', 'mae', 'mse']):
                # Pour les m√©triques d'erreur, inverser (plus petit = mieux)
                metrics_normalized[col] = 1 - (metrics_normalized[col] - metrics_normalized[col].min()) / (metrics_normalized[col].max() - metrics_normalized[col].min())
            else:
                # Pour les autres m√©triques, normaliser normalement
                metrics_normalized[col] = (metrics_normalized[col] - metrics_normalized[col].min()) / (metrics_normalized[col].max() - metrics_normalized[col].min())
        
        # Cr√©er heatmap
        im = axes[1, 1].imshow(metrics_normalized.values, cmap='RdYlGn', aspect='auto')
        axes[1, 1].set_xticks(range(len(metric_cols)))
        axes[1, 1].set_xticklabels([col.replace('Test_', '') for col in metric_cols], rotation=45)
        axes[1, 1].set_yticks(range(len(successful_results)))
        axes[1, 1].set_yticklabels(successful_results['Model'])
        axes[1, 1].set_title('Performance Normalis√©e (Vert = Mieux)')
        
        # Ajouter les valeurs
        for i in range(len(successful_results)):
            for j in range(len(metric_cols)):
                text = axes[1, 1].text(j, i, f'{successful_results.iloc[i][metric_cols[j]]:.3f}',
                                      ha="center", va="center", color="black", fontsize=8)
        
        plt.colorbar(im, ax=axes[1, 1])
    
    plt.suptitle(f'{title} - Comparaison Compl√®te', fontsize=16, fontweight='bold')
    plt.tight_layout()
    plt.show()

# One dashboard per task; a task is skipped when its results table was never
# created (i.e. its comparison cell did not run).
_comparison_plots = (
    ('formation_results', 'Test_RMSE', 'Formation Pressure Prediction', 'regression'),
    ('rop_results', 'Test_RMSE', 'ROP Prediction', 'regression'),
    ('kick_results', 'Test_F1', 'Kick Detection', 'classification'),
)

for _results_name, _metric_name, _plot_title, _plot_task in _comparison_plots:
    if _results_name in locals():
        plot_model_comparison(locals()[_results_name], _metric_name, _plot_title, _plot_task)


üìä VISUALISATIONS DES PERFORMANCES
--------------------------------------------------


In [None]:
# ====================================================================
# ANALYSE DÉTAILLÉE DES MEILLEURS MODÈLES
# ====================================================================

In [15]:
print(f"\nüîç ANALYSE D√âTAILL√âE DES MEILLEURS MOD√àLES")
print("-" * 50)

def analyze_best_model(results_df, X_test, y_test, models, scaler, task_type='regression'):
    """Analyse d√©taill√©e du meilleur mod√®le"""
    successful_results = results_df[results_df['Status'] == 'Success']
    if len(successful_results) == 0:
        print("‚ö†Ô∏è Aucun mod√®le r√©ussi √† analyser")
        return None
    
    # S√©lectionner le meilleur mod√®le
    if task_type == 'regression':
        metric_col = 'Test_RMSE'
        best_idx = successful_results[metric_col].idxmin()
    else:
        metric_col = 'Test_F1'
        best_idx = successful_results[metric_col].idxmax()
    
    best_model_name = successful_results.loc[best_idx, 'Model']
    print(f"üèÜ Meilleur mod√®le: {best_model_name}")
    
    # R√©cup√©rer et entra√Æner le meilleur mod√®le
    best_model = models[best_model_name]
    X_train_full = scaler.transform(scaler.inverse_transform(X_test))  # Trick pour avoir les bonnes dimensions
    
    # Re-entra√Æner sur toutes les donn√©es d'entra√Ænement
    best_model.fit(scaler.transform(scaler.inverse_transform(X_test)), y_test)  # Approximation pour la d√©mo
    
    # Pr√©dictions
    y_pred = best_model.predict(X_test)
    
    # Visualisations d√©taill√©es
    if task_type == 'regression':
        fig, axes = plt.subplots(2, 2, figsize=(16, 12))
        
        # 1. Pr√©dictions vs R√©alit√©
        axes[0, 0].scatter(y_test, y_pred, alpha=0.6, s=20)
        min_val, max_val = min(y_test.min(), y_pred.min()), max(y_test.max(), y_pred.max())
        axes[0, 0].plot([min_val, max_val], [min_val, max_val], 'r--', lw=2)
        axes[0, 0].set_xlabel('Valeurs R√©elles')
        axes[0, 0].set_ylabel('Pr√©dictions')
        axes[0, 0].set_title(f'{best_model_name} - Pr√©dictions vs R√©alit√©')
        
        # R¬≤ sur le graphique
        r2 = r2_score(y_test, y_pred)
        axes[0, 0].text(0.05, 0.95, f'R¬≤ = {r2:.4f}', transform=axes[0, 0].transAxes,
                       bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))
        axes[0, 0].grid(True, alpha=0.3)
        
        # 2. R√©sidus
        residuals = y_test - y_pred
        axes[0, 1].scatter(y_pred, residuals, alpha=0.6, s=20)
        axes[0, 1].axhline(y=0, color='r', linestyle='--')
        axes[0, 1].set_xlabel('Pr√©dictions')
        axes[0, 1].set_ylabel('R√©sidus')
        axes[0, 1].set_title('Analyse des R√©sidus')
        axes[0, 1].grid(True, alpha=0.3)
        
        # 3. Distribution des r√©sidus
        axes[1, 0].hist(residuals, bins=30, alpha=0.7, edgecolor='black')
        axes[1, 0].axvline(residuals.mean(), color='r', linestyle='--', label=f'Moyenne: {residuals.mean():.3f}')
        axes[1, 0].set_xlabel('R√©sidus')
        axes[1, 0].set_ylabel('Fr√©quence')
        axes[1, 0].set_title('Distribution des R√©sidus')
        axes[1, 0].legend()
        axes[1, 0].grid(True, alpha=0.3)
        
        # 4. M√©triques d√©taill√©es
        metrics = RegressionMetrics.calculate_all_metrics(y_test, y_pred)
        metrics_text = f"""M√©triques D√©taill√©es:
R¬≤ = {metrics['r2']:.4f}
RMSE = {metrics['rmse']:.4f}
MAE = {metrics['mae']:.4f}
MAPE = {metrics['mape']:.2f}%
Max Error = {metrics['max_error']:.4f}
Median AE = {metrics['median_ae']:.4f}"""
        
        axes[1, 1].text(0.1, 0.9, metrics_text, transform=axes[1, 1].transAxes,
                       verticalalignment='top', fontfamily='monospace', fontsize=10,
                       bbox=dict(boxstyle='round', facecolor='lightgray', alpha=0.8))
        axes[1, 1].axis('off')
        
    else:  # Classification
        from sklearn.metrics import roc_curve, precision_recall_curve
        
        fig, axes = plt.subplots(2, 2, figsize=(16, 12))
        
        # 1. Matrice de confusion
        cm = confusion_matrix(y_test, y_pred)
        im = axes[0, 0].imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
        axes[0, 0].figure.colorbar(im, ax=axes[0, 0])
        
        tick_marks = np.arange(len(np.unique(y_test)))
        axes[0, 0].set_xticks(tick_marks)
        axes[0, 0].set_yticks(tick_marks)
        axes[0, 0].set_xticklabels(np.unique(y_test))
        axes[0, 0].set_yticklabels(np.unique(y_test))
        axes[0, 0].set_ylabel('Vraie classe')
        axes[0, 0].set_xlabel('Classe pr√©dite')
        axes[0, 0].set_title('Matrice de Confusion')
        
        # Ajouter les valeurs dans les cellules
        thresh = cm.max() / 2.
        for i, j in np.ndindex(cm.shape):
            axes[0, 0].text(j, i, format(cm[i, j], 'd'),
                           ha="center", va="center",
                           color="white" if cm[i, j] > thresh else "black")
        
        # 2. Courbe ROC (si probabilit√©s disponibles)
        if hasattr(best_model, 'predict_proba') and len(np.unique(y_test)) == 2:
            y_proba = best_model.predict_proba(X_test)[:, 1]
            fpr, tpr, _ = roc_curve(y_test, y_proba)
            auc_score = roc_auc_score(y_test, y_proba)
            
            axes[0, 1].plot(fpr, tpr, color='darkorange', lw=2, 
                           label=f'ROC curve (AUC = {auc_score:.2f})')
            axes[0, 1].plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
            axes[0, 1].set_xlim([0.0, 1.0])
            axes[0, 1].set_ylim([0.0, 1.05])
            axes[0, 1].set_xlabel('Taux de Faux Positifs')
            axes[0, 1].set_ylabel('Taux de Vrais Positifs')
            axes[0, 1].set_title('Courbe ROC')
            axes[0, 1].legend(loc="lower right")
            axes[0, 1].grid(True, alpha=0.3)
            
            # 3. Courbe Pr√©cision-Rappel
            precision, recall, _ = precision_recall_curve(y_test, y_proba)
            pr_auc = auc(recall, precision)
            
            axes[1, 0].plot(recall, precision, color='blue', lw=2,
                           label=f'PR curve (AUC = {pr_auc:.2f})')
            axes[1, 0].set_xlabel('Rappel')
            axes[1, 0].set_ylabel('Pr√©cision')
            axes[1, 0].set_title('Courbe Pr√©cision-Rappel')
            axes[1, 0].legend()
            axes[1, 0].grid(True, alpha=0.3)
        
        # 4. M√©triques d√©taill√©es
        metrics = ClassificationMetrics.calculate_all_metrics(y_test, y_pred)
        metrics_text = f"""M√©triques D√©taill√©es:
Accuracy = {metrics['accuracy']:.4f}
Precision = {metrics['precision_weighted']:.4f}
Recall = {metrics['recall_weighted']:.4f}
F1-Score = {metrics['f1_weighted']:.4f}"""
        
        if 'auc_roc' in metrics:
            metrics_text += f"\nAUC-ROC = {metrics['auc_roc']:.4f}"
        
        axes[1, 1].text(0.1, 0.9, metrics_text, transform=axes[1, 1].transAxes,
                       verticalalignment='top', fontfamily='monospace', fontsize=10,
                       bbox=dict(boxstyle='round', facecolor='lightgray', alpha=0.8))
        axes[1, 1].axis('off')
    
    plt.suptitle(f'Analyse D√©taill√©e - {best_model_name}', fontsize=16, fontweight='bold')
    plt.tight_layout()
    plt.show()
    
    return best_model_name, best_model

# Analyser les meilleurs mod√®les
best_models = {}

if 'formation_results' in locals():
    print("üîç Formation Pressure - Meilleur mod√®le:")
    best_name, best_model = analyze_best_model(
        formation_results, X_test_form, y_test_form, regression_models, 
        formation_scaler, 'regression'
    )
    if best_name:
        best_models['formation_pressure'] = (best_name, best_model)

if 'rop_results' in locals():
    print("\nüîç ROP Prediction - Meilleur mod√®le:")
    best_name, best_model = analyze_best_model(
        rop_results, X_test_rop, y_test_rop, regression_models,
        rop_scaler, 'regression'
    )
    if best_name:
        best_models['rop_prediction'] = (best_name, best_model)

if 'kick_results' in locals():
    print("\nüîç Kick Detection - Meilleur mod√®le:")
    best_name, best_model = analyze_best_model(
        kick_results, X_test_kick, y_test_kick, classification_models,
        kick_scaler, 'classification'
    )
    if best_name:
        best_models['kick_detection'] = (best_name, best_model)


üîç ANALYSE D√âTAILL√âE DES MEILLEURS MOD√àLES
--------------------------------------------------


In [None]:
# ====================================================================
# OPTIMISATION DES HYPERPARAMÈTRES
# ====================================================================

In [16]:
print(f"\n‚öôÔ∏è OPTIMISATION DES HYPERPARAM√àTRES")
print("-" * 50)

def optimize_hyperparameters(X, y, model, param_grid, task_type='regression', cv=3):
    """Tune ``model`` with a grid search on standardized features.

    The scaler and the estimator are chained in a pipeline, so within each
    cross-validation split the scaler is fitted on the training part only.
    Fitting the scaler on the whole ``X`` before the search lets statistics of
    the validation folds leak into training and biases the reported CV score.

    Args:
        X: Feature matrix passed to ``GridSearchCV.fit``.
        y: Target values; must support ``astype`` when ``task_type`` is
            ``'classification'`` (it is then cast to ``int``).
        model: Estimator to tune.
        param_grid: Dict (or list of dicts) mapping parameter names of
            ``model`` to the candidate values to try.
        task_type: ``'regression'`` scores with ``neg_mean_squared_error``;
            any other value scores with ``f1_weighted``.
        cv: Cross-validation setting forwarded to ``GridSearchCV``.

    Returns:
        A tuple ``(best_estimator, best_params, best_score)``:
        ``best_estimator`` is the tuned estimator itself (not the pipeline),
        refitted on all of ``X`` after standardization -- it therefore expects
        standardized inputs, and the fitted scaler is not returned;
        ``best_params`` uses the same keys as ``param_grid``;
        ``best_score`` is the raw ``best_score_`` of the search (negative MSE
        for regression).
    """
    # Imported locally so this cell does not rely on an extra top-level import.
    from sklearn.pipeline import Pipeline

    print(f"üîß Optimisation des hyperparam√®tres...")

    is_regression = task_type == 'regression'
    scoring = 'neg_mean_squared_error' if is_regression else 'f1_weighted'

    # Pipeline parameters are addressed as "<step>__<param>": prefix the
    # caller's grid so it targets the estimator step.
    step_prefix = 'model__'
    grids = [param_grid] if isinstance(param_grid, dict) else list(param_grid)
    pipeline_grid = [
        {step_prefix + name: values for name, values in grid.items()}
        for grid in grids
    ]

    pipeline = Pipeline([('scaler', StandardScaler()), ('model', model)])
    grid_search = GridSearchCV(
        pipeline, pipeline_grid, cv=cv, scoring=scoring,
        n_jobs=-1, verbose=0
    )

    # Classification targets are cast to int, exactly as before.
    y_fit = y.astype(int) if task_type == 'classification' else y

    grid_search.fit(X, y_fit)

    # Strip the pipeline prefix so callers see the keys they passed in.
    best_params = {
        name[len(step_prefix):]: value
        for name, value in grid_search.best_params_.items()
    }
    best_estimator = grid_search.best_estimator_.named_steps['model']

    # The regression score is a negated error: flip the sign for display only.
    displayed_score = -grid_search.best_score_ if is_regression else grid_search.best_score_
    print(f"‚úÖ Meilleurs param√®tres: {best_params}")
    print(f"‚úÖ Meilleur score CV: {displayed_score:.4f}")

    return best_estimator, best_params, grid_search.best_score_

# Hyper-parameter search spaces, keyed by the display name of each model.
param_grids = {
    'Random Forest': dict(
        n_estimators=[100, 200],
        max_depth=[10, 20, None],
        min_samples_split=[2, 5],
        min_samples_leaf=[1, 2],
    ),
    'Gradient Boosting': dict(
        n_estimators=[100, 200],
        learning_rate=[0.05, 0.1, 0.2],
        max_depth=[3, 5, 7],
    ),
    # Stays empty unless XGBoost is available (filled in just below).
    'XGBoost': {},
}

if XGBOOST_AVAILABLE:
    param_grids['XGBoost'] = dict(
        n_estimators=[100, 200],
        learning_rate=[0.05, 0.1, 0.2],
        max_depth=[3, 5, 7],
    )

# Tune the best model of every task for which a parameter grid is defined.
optimized_models = {}

# For each task: names of the feature matrix and target in the notebook
# namespace, and the task type passed to optimize_hyperparameters.
tuning_inputs = {
    'formation_pressure': ('X_formation', 'y_formation', 'regression'),
    'rop_prediction': ('X_rop', 'y_rop', 'regression'),
    'kick_detection': ('X_kick', 'y_kick', 'classification'),
}

# At cell (module) level locals() is the notebook namespace itself; it is bound
# to a name because locals() inside a comprehension would see another scope.
notebook_vars = locals()

for task, (model_name, model) in best_models.items():
    if model_name not in param_grids:
        continue

    print(f"\n‚öôÔ∏è Optimisation pour {task} ({model_name}):")

    if task not in tuning_inputs:
        print(f"{task}: optimisation impossible, aucune source de variables connue")
        continue

    X_name, y_name, task_type = tuning_inputs[task]
    # Check the target as well as the features: previously only X was checked,
    # so a missing y raised NameError outside the try block below.
    missing_inputs = [name for name in (X_name, y_name) if name not in notebook_vars]
    if missing_inputs:
        print(f"{task}: optimisation impossible, variables manquantes: {', '.join(missing_inputs)}")
        continue

    X, y = notebook_vars[X_name], notebook_vars[y_name]

    # Best effort: a failure on one task is reported and the loop moves on.
    try:
        optimized_model, best_params, best_score = optimize_hyperparameters(
            X, y, model, param_grids[model_name], task_type
        )
        optimized_models[task] = (model_name, optimized_model, best_params)
    except Exception as e:
        print(f"‚ùå Erreur lors de l'optimisation: {e}")


⚙️ OPTIMISATION DES HYPERPARAMÈTRES
--------------------------------------------------


In [None]:
# ====================================================================
# ANALYSE DE L'IMPORTANCE DES FEATURES
# ====================================================================


In [17]:
# Section banner: title line followed by a 50-character rule.
print("\nüéØ IMPORTANCE DES FEATURES DANS LES MEILLEURS MOD√àLES")
print(50 * "-")

def analyze_feature_importance(model, feature_names, model_name):
    """Rank, print and plot the feature importances exposed by ``model``.

    Importances come from ``model.feature_importances_`` when that attribute
    exists; otherwise from the absolute values of ``model.coef_`` (averaged
    over the first axis when ``coef_`` has more than one dimension).

    Args:
        model: Fitted estimator to inspect.
        feature_names: One name per feature, in the order used by the model.
        model_name: Label used in the printed header and the plot title.

    Returns:
        A DataFrame with columns ``'feature'`` and ``'importance'`` sorted by
        decreasing importance, or ``None`` when the model exposes neither
        ``feature_importances_`` nor ``coef_``.
    """
    if hasattr(model, 'feature_importances_'):
        # Models exposing importances directly (e.g. tree ensembles).
        scores = model.feature_importances_
    elif hasattr(model, 'coef_'):
        # Coefficient-based models: use coefficient magnitudes.
        coefficients = model.coef_
        magnitudes = np.abs(coefficients)
        scores = np.mean(magnitudes, axis=0) if coefficients.ndim > 1 else magnitudes
    else:
        return None

    ranking = pd.DataFrame({'feature': feature_names, 'importance': scores})
    importance_data = ranking.sort_values('importance', ascending=False)

    # Text report: the ten highest-ranked features.
    print(f"\nüéØ Feature Importance - {model_name}:")
    print("Top 10 features:")
    for rank, (_, entry) in enumerate(importance_data.head(10).iterrows(), start=1):
        print(f"  {rank:2d}. {entry['feature']:<25}: {entry['importance']:.4f}")

    # Horizontal bar chart of the fifteen highest-ranked features.
    top_features = importance_data.head(15)
    positions = range(len(top_features))

    fig, ax = plt.subplots(figsize=(12, 8))
    ax.barh(positions, top_features['importance'])
    ax.set_yticks(positions)
    ax.set_yticklabels(top_features['feature'])
    ax.set_xlabel('Importance')
    ax.set_title(f'Feature Importance - {model_name}')
    ax.invert_yaxis()  # most important feature at the top
    ax.grid(True, alpha=0.3)
    fig.tight_layout()
    plt.show()

    return importance_data

# Report feature importance for every tuned model whose feature matrix is
# present in the notebook namespace; other tasks are skipped.
feature_sources = {
    'formation_pressure': 'X_formation',
    'rop_prediction': 'X_rop',
    'kick_detection': 'X_kick',
}

for task, (model_name, model, params) in optimized_models.items():
    source_name = feature_sources.get(task)
    if source_name is None or source_name not in locals():
        continue

    feature_names = locals()[source_name].columns
    importance_df = analyze_feature_importance(model, feature_names, f"{task} - {model_name}")


🎯 IMPORTANCE DES FEATURES DANS LES MEILLEURS MODÈLES
--------------------------------------------------


In [None]:
# ====================================================================
# RÉSUMÉ FINAL ET RECOMMANDATIONS
# ====================================================================

In [18]:
# Final report banner, followed by each selected model and, when the model
# was tuned, the parameters retained by the search.
print("\nüìã R√âSUM√â FINAL ET RECOMMANDATIONS")
print(60 * "=")

print("üèÜ MEILLEURS MOD√àLES S√âLECTIONN√âS:")
for task, (model_name, model) in best_models.items():
    print(f"  ‚Ä¢ {task}: {model_name}")
    if task not in optimized_models:
        continue
    optimized_name, optimized_model, best_params = optimized_models[task]
    print(f"    Optimis√© avec: {best_params}")

def format_metric(value):
    """Format a metric with four decimals; non-numeric values are shown as-is.

    Applying ``:.4f`` to a placeholder such as ``'N/A'`` raises ValueError,
    so anything that cannot be formatted as a float falls back to ``str``.
    """
    try:
        return f"{value:.4f}"
    except (TypeError, ValueError):
        return str(value)


# Headline metrics of the first row of each results table.
# NOTE(review): presumably the tables are sorted best-first -- confirm upstream.
print(f"\nüìä R√âSULTATS DE PERFORMANCE:")
if 'formation_results' in locals():
    best_formation = formation_results.iloc[0]
    print(f"  ‚Ä¢ Formation Pressure: RMSE = {best_formation['Test_RMSE']:.4f}, R¬≤ = {best_formation['Test_R2']:.4f}")

if 'rop_results' in locals():
    best_rop = rop_results.iloc[0]
    print(f"  ‚Ä¢ ROP Prediction: RMSE = {best_rop['Test_RMSE']:.4f}, R¬≤ = {best_rop['Test_R2']:.4f}")

if 'kick_results' in locals():
    best_kick = kick_results.iloc[0]
    # 'Test_Accuracy' may be absent; the 'N/A' fallback must not go through ':.4f'.
    print(f"  ‚Ä¢ Kick Detection: F1 = {best_kick['Test_F1']:.4f}, Accuracy = {format_metric(best_kick.get('Test_Accuracy', 'N/A'))}")

# Key insights.
print("\nüí° INSIGHTS CL√âS:")
print("\n".join((
    "  ‚úÖ Les mod√®les ensemble (Random Forest, Gradient Boosting) performent g√©n√©ralement mieux",
    "  ‚úÖ L'optimisation des hyperparam√®tres apporte des gains significatifs",
    "  ‚úÖ Le feature engineering am√©liore consid√©rablement les performances",
    "  ‚úÖ La normalisation des donn√©es est cruciale pour certains algorithmes",
)))

# Points to watch.
print("\n‚ö†Ô∏è POINTS D'ATTENTION:")
print("\n".join((
    "  ‚Ä¢ Surveiller l'overfitting avec les mod√®les complexes",
    "  ‚Ä¢ Valider la stabilit√© temporelle des mod√®les",
    "  ‚Ä¢ Consid√©rer l'interpr√©tabilit√© vs performance selon le contexte",
    "  ‚Ä¢ Tester sur de nouvelles donn√©es non vues",
)))

# Next steps.
print("\nüöÄ PROCHAINES √âTAPES:")
print("\n".join((
    "  1. D√©ployer les meilleurs mod√®les en production",
    "  2. Mettre en place le monitoring des performances",
    "  3. Cr√©er un pipeline de r√©entra√Ænement automatique",
    "  4. D√©velopper l'interface utilisateur (dashboard)",
    "  5. Documenter les mod√®les pour l'√©quipe op√©rationnelle",
)))

# Saving the models.
print("\nüìÅ SAUVEGARDE DES MOD√àLES:")
print("Les meilleurs mod√®les peuvent √™tre sauvegard√©s pour d√©ploiement:")

# Example code for saving the models; it is an inert string literal and is
# not executed.
"""
import joblib
import os

os.makedirs('../models', exist_ok=True)

for task, (model_name, model) in best_models.items():
    model_path = f'../models/{task}_{model_name.replace(" ", "_")}.pkl'
    joblib.dump(model, model_path)
    print(f"üíæ Sauvegard√©: {model_path}")
"""

# Closing banner.
print("\nüéâ COMPARAISON DES MOD√àLES TERMIN√âE!")
print("Les mod√®les sont pr√™ts pour le d√©ploiement en production.")
print(60 * "=")


📋 RÉSUMÉ FINAL ET RECOMMANDATIONS
🏆 MEILLEURS MODÈLES SÉLECTIONNÉS:

📊 RÉSULTATS DE PERFORMANCE:

💡 INSIGHTS CLÉS:
  ✅ Les modèles ensemble (Random Forest, Gradient Boosting) performent généralement mieux
  ✅ L'optimisation des hyperparamètres apporte des gains significatifs
  ✅ Le feature engineering améliore considérablement les performances
  ✅ La normalisation des données est cruciale pour certains algorithmes

⚠️ POINTS D'ATTENTION:
  • Surveiller l'overfitting avec les modèles complexes
  • Valider la stabilité temporelle des modèles
  • Considérer l'interprétabilité vs performance selon le contexte
  • Tester sur de nouvelles données non vues

🚀 PROCHAINES ÉTAPES:
  1. Déployer les meilleurs modèles en production
  2. Mettre en place le monitoring des performances
  3. Créer un pipeline de réentraînement automatique
  4. Développer l'interface utilisateur (dashboard)
  5. Documenter les mod√®les pour l'√©quipe op√©r