In [None]:
# ====================================================================
# NOTEBOOK 2: ANALYSE ET INGÉNIERIE DES FEATURES
# ====================================================================
# Ce notebook se concentre sur l'analyse approfondie des features,
# la création de nouvelles variables et la sélection des meilleures
# caractéristiques pour la modélisation ML.

In [None]:
# ====================================================================
# IMPORTS ET CONFIGURATION
# ====================================================================

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# ML imports
from sklearn.feature_selection import (
    SelectKBest, f_regression, f_classif, RFE, 
    SelectFromModel, mutual_info_regression, mutual_info_classif
)
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# Statistiques
from scipy import stats
from scipy.stats import pearsonr, spearmanr, chi2_contingency

import warnings
warnings.filterwarnings('ignore')

# Configuration des graphiques
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (12, 8)

print("🔬 ANALYSE ET INGÉNIERIE DES FEATURES")
print("=" * 60)

# Ajouter le path pour nos modules
import sys
sys.path.append('../src')

from data.data_loader import DataLoader
from data.data_preprocessor import DataPreprocessor
from data.feature_engineering import DrillingFeatureEngineer

🔬 ANALYSE ET INGÉNIERIE DES FEATURES


SyntaxError: unterminated triple-quoted string literal (detected at line 108) (data_loader.py, line 104)

In [None]:
# ====================================================================
# CHARGEMENT ET PRÉPARATION DES DONNÉES
# ====================================================================

In [2]:
print("📊 Chargement des données...")

# Initialiser les composants
loader = DataLoader({'data_path': '../data'})
preprocessor = DataPreprocessor()
feature_engineer = DrillingFeatureEngineerFeatureEngineer()

# Charger les données
try:
    formation_df = loader.load_formation_data()
    kick_df = loader.load_kick_detection_data()
    print("✅ Données réelles chargées")
except:
    print("⚠️ Utilisation de données synthétiques")
    synthetic_data = loader.load_synthetic_drilling_data(n_samples=5000, random_seed=42)
    formation_df = synthetic_data.copy()
    kick_df = synthetic_data.copy()

print(f"Formation data shape: {formation_df.shape}")
print(f"Kick detection data shape: {kick_df.shape}")

# Nettoyage de base
formation_df = preprocessor.handle_missing_values(formation_df)
kick_df = preprocessor.handle_missing_values(kick_df)

print("✅ Données nettoyées")


📊 Chargement des données...


NameError: name 'DataLoader' is not defined

In [None]:
# ====================================================================
# 1. ANALYSE DE L'IMPORTANCE DES FEATURES EXISTANTES
# ====================================================================

In [4]:
print(f"\n🎯 ANALYSE DE L'IMPORTANCE DES FEATURES EXISTANTES")
print("-" * 50)

def analyze_feature_importance_regression(df, target_col, max_features=10):
    """Analyse l'importance des features pour un problème de régression"""
    if target_col not in df.columns:
        print(f"⚠️ Variable cible '{target_col}' non trouvée")
        return None
    
    # Préparer les données
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    feature_cols = [col for col in numeric_cols if col != target_col]
    
    if not feature_cols:
        print("⚠️ Pas de features numériques disponibles")
        return None
    
    X = df[feature_cols].dropna()
    y = df.loc[X.index, target_col]
    
    print(f"Analyse pour {target_col} avec {len(feature_cols)} features")
    
    # 1. Corrélation de Pearson
    correlations = []
    for feature in feature_cols:
        corr, p_value = pearsonr(X[feature], y)
        correlations.append({
            'feature': feature,
            'correlation': abs(corr),
            'p_value': p_value,
            'method': 'Pearson'
        })
    
    # 2. Information mutuelle
    try:
        mi_scores = mutual_info_regression(X, y, random_state=42)
        for i, feature in enumerate(feature_cols):
            correlations.append({
                'feature': feature,
                'correlation': mi_scores[i],
                'p_value': np.nan,
                'method': 'Mutual_Info'
            })
    except Exception as e:
        print(f"⚠️ Erreur information mutuelle: {e}")
    
    # 3. F-score
    try:
        f_scores, f_p_values = f_regression(X, y)
        for i, feature in enumerate(feature_cols):
            correlations.append({
                'feature': feature,
                'correlation': f_scores[i],
                'p_value': f_p_values[i],
                'method': 'F_Score'
            })
    except Exception as e:
        print(f"⚠️ Erreur F-score: {e}")
    
    # 4. Random Forest Feature Importance
    try:
        rf = RandomForestRegressor(n_estimators=100, random_state=42)
        rf.fit(X, y)
        for i, feature in enumerate(feature_cols):
            correlations.append({
                'feature': feature,
                'correlation': rf.feature_importances_[i],
                'p_value': np.nan,
                'method': 'RF_Importance'
            })
    except Exception as e:
        print(f"⚠️ Erreur Random Forest: {e}")
    
    # Créer DataFrame des résultats
    results_df = pd.DataFrame(correlations)
    
    # Visualiser les résultats
    methods = results_df['method'].unique()
    n_methods = len(methods)
    
    fig, axes = plt.subplots(1, n_methods, figsize=(6*n_methods, 8))
    if n_methods == 1:
        axes = [axes]
    
    for i, method in enumerate(methods):
        method_data = results_df[results_df['method'] == method].sort
        method_data = results_df[results_df['method'] == method].sort_values('correlation', ascending=False)
        top_features = method_data.head(max_features)
        
        axes[i].barh(range(len(top_features)), top_features['correlation'])
        axes[i].set_yticks(range(len(top_features)))
        axes[i].set_yticklabels(top_features['feature'])
        axes[i].set_title(f'{method}\nImportance des Features')
        axes[i].set_xlabel('Score d\'Importance')
        axes[i].grid(True, alpha=0.3)
        
        # Inverser l'ordre pour avoir le plus important en haut
        axes[i].invert_yaxis()
    
    plt.tight_layout()
    plt.show()
    
    return results_df

def analyze_feature_importance_classification(df, target_col, max_features=10):
    """Analyse l'importance des features pour un problème de classification"""
    if target_col not in df.columns:
        print(f"⚠️ Variable cible '{target_col}' non trouvée")
        return None
    
    # Préparer les données
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    feature_cols = [col for col in numeric_cols if col != target_col]
    
    if not feature_cols:
        print("⚠️ Pas de features numériques disponibles")
        return None
    
    X = df[feature_cols].dropna()
    y = df.loc[X.index, target_col]
    
    # S'assurer que y est entier pour classification
    if y.dtype not in ['int64', 'int32']:
        y = y.astype(int)
    
    print(f"Analyse pour {target_col} avec {len(feature_cols)} features")
    print(f"Classes: {np.unique(y)}")
    
    correlations = []
    
    # 1. Information mutuelle
    try:
        mi_scores = mutual_info_classif(X, y, random_state=42)
        for i, feature in enumerate(feature_cols):
            correlations.append({
                'feature': feature,
                'score': mi_scores[i],
                'method': 'Mutual_Info'
            })
    except Exception as e:
        print(f"⚠️ Erreur information mutuelle: {e}")
    
    # 2. Chi2 score
    try:
        # Normaliser les données pour chi2 (doivent être positives)
        X_positive = X - X.min() + 1
        chi2_scores, chi2_p_values = f_classif(X_positive, y)
        for i, feature in enumerate(feature_cols):
            correlations.append({
                'feature': feature,
                'score': chi2_scores[i],
                'method': 'Chi2_Score'
            })
    except Exception as e:
        print(f"⚠️ Erreur Chi2: {e}")
    
    # 3. Random Forest Feature Importance
    try:
        rf = RandomForestClassifier(n_estimators=100, random_state=42)
        rf.fit(X, y)
        for i, feature in enumerate(feature_cols):
            correlations.append({
                'feature': feature,
                'score': rf.feature_importances_[i],
                'method': 'RF_Importance'
            })
    except Exception as e:
        print(f"⚠️ Erreur Random Forest: {e}")
    
    # Visualiser les résultats
    if correlations:
        results_df = pd.DataFrame(correlations)
        methods = results_df['method'].unique()
        
        fig, axes = plt.subplots(1, len(methods), figsize=(6*len(methods), 8))
        if len(methods) == 1:
            axes = [axes]
        
        for i, method in enumerate(methods):
            method_data = results_df[results_df['method'] == method].sort_values('score', ascending=False)
            top_features = method_data.head(max_features)
            
            axes[i].barh(range(len(top_features)), top_features['score'])
            axes[i].set_yticks(range(len(top_features)))
            axes[i].set_yticklabels(top_features['feature'])
            axes[i].set_title(f'{method}\nImportance des Features')
            axes[i].set_xlabel('Score d\'Importance')
            axes[i].grid(True, alpha=0.3)
            axes[i].invert_yaxis()
        
        plt.tight_layout()
        plt.show()
        
        return results_df
    
    return None

# Analyser l'importance pour les différentes variables cibles
print("🎯 Importance des features - Formation Pressure:")
if 'FormationPressure' in formation_df.columns:
    formation_importance = analyze_feature_importance_regression(formation_df, 'FormationPressure')

print("\n🎯 Importance des features - ROP:")
if 'ROP' in formation_df.columns:
    rop_importance = analyze_feature_importance_regression(formation_df, 'ROP')

print("\n🎯 Importance des features - Kick Detection:")
if 'Kick' in kick_df.columns:
    kick_importance = analyze_feature_importance_classification(kick_df, 'Kick')


🎯 ANALYSE DE L'IMPORTANCE DES FEATURES EXISTANTES
--------------------------------------------------
🎯 Importance des features - Formation Pressure:


NameError: name 'formation_df' is not defined

In [5]:
# ====================================================================
# 2. CRÉATION DE NOUVELLES FEATURES (FEATURE ENGINEERING)
# ====================================================================

In [7]:
print(f"\n🔧 INGÉNIERIE DES FEATURES")
print("-" * 50)

def create_drilling_features(df):
    """Crée des features spécifiques au domaine du forage"""
    df_engineered = df.copy()
    
    print("🔨 Création de features dérivées...")
    
    # Features temporelles si timestamp disponible
    if 'Timestamp' in df.columns:
        df_engineered = feature_engineer.create_temporal_features(df_engineered, 'Timestamp')
        print("  ✅ Features temporelles créées")
    
    # Features de moyennes mobiles
    rolling_cols = [col for col in ['WOB', 'RPM', 'ROP', 'FlowRate'] if col in df.columns]
    if rolling_cols:
        df_engineered = feature_engineer.create_rolling_features(df_engineered, rolling_cols, [3, 5, 10])
        print("  ✅ Moyennes mobiles créées")
    
    # Features de lag
    lag_cols = [col for col in ['WOB', 'RPM'] if col in df.columns]
    if lag_cols:
        df_engineered = feature_engineer.create_lag_features(df_engineered, lag_cols, [1, 2])
        print("  ✅ Features de lag créées")
    
    # Features de ratios
    ratio_pairs = []
    if 'WOB' in df.columns and 'RPM' in df.columns:
        ratio_pairs.append(('WOB', 'RPM'))
    if 'FlowRateIn' in df.columns and 'FlowRateOut' in df.columns:
        ratio_pairs.append(('FlowRateIn', 'FlowRateOut'))
    if 'ROP' in df.columns and 'WOB' in df.columns:
        ratio_pairs.append(('ROP', 'WOB'))
    
    if ratio_pairs:
        df_engineered = feature_engineer.create_ratio_features(df_engineered, ratio_pairs)
        print("  ✅ Features de ratios créées")
    
    # Features d'efficacité de forage
    df_engineered = feature_engineer.create_drilling_efficiency_features(df_engineered)
    print("  ✅ Features d'efficacité créées")
    
    # Features polynomiales pour variables importantes
    important_cols = [col for col in ['WOB', 'RPM'] if col in df.columns]
    if len(important_cols) >= 2:
        for col in important_cols[:2]:  # Limiter pour éviter l'explosion dimensionnelle
            df_engineered[f'{col}_squared'] = df_engineered[col] ** 2
            df_engineered[f'{col}_log'] = np.log1p(np.abs(df_engineered[col]))
        print("  ✅ Features polynomiales créées")
    
    # Features d'interaction entre variables importantes
    if 'WOB' in df.columns and 'RPM' in df.columns:
        df_engineered['WOB_RPM_interaction'] = df_engineered['WOB'] * df_engineered['RPM']
    if 'FlowRate' in df.columns and 'MudWeight' in df.columns:
        df_engineered['FlowRate_MudWeight_interaction'] = df_engineered['FlowRate'] * df_engineered['MudWeight']
    print("  ✅ Features d'interaction créées")
    
    # Features statistiques sur fenêtre glissante
    stat_cols = [col for col in ['WOB', 'RPM', 'ROP'] if col in df.columns]
    for col in stat_cols:
        if len(df_engineered) > 10:
            df_engineered[f'{col}_rolling_std'] = df_engineered[col].rolling(5, min_periods=1).std()
            df_engineered[f'{col}_rolling_min'] = df_engineered[col].rolling(5, min_periods=1).min()
            df_engineered[f'{col}_rolling_max'] = df_engineered[col].rolling(5, min_periods=1).max()
    print("  ✅ Features statistiques créées")
    
    # Nettoyage des valeurs infinies et NaN
    df_engineered = df_engineered.replace([np.inf, -np.inf], np.nan)
    df_engineered = df_engineered.fillna(df_engineered.median())
    
    print(f"📊 Features créées: {len(df_engineered.columns) - len(df.columns)} nouvelles features")
    print(f"📊 Total features: {len(df_engineered.columns)}")
    
    return df_engineered

# Créer les features engineered
print("🔧 Ingénierie des features pour Formation Data:")
formation_engineered = create_drilling_features(formation_df)

print("\n🔧 Ingénierie des features pour Kick Detection Data:")
kick_engineered = create_drilling_features(kick_df)


🔧 INGÉNIERIE DES FEATURES
--------------------------------------------------
🔧 Ingénierie des features pour Formation Data:


NameError: name 'formation_df' is not defined

In [None]:
# ====================================================================
# 3. SÉLECTION DES MEILLEURES FEATURES
# ====================================================================

In [8]:
print(f"\n🎯 SÉLECTION DES MEILLEURES FEATURES")
print("-" * 50)

def select_best_features(df, target_col, n_features=15, method='combined'):
    """Sélectionne les meilleures features selon différentes méthodes"""
    if target_col not in df.columns:
        print(f"⚠️ Variable cible '{target_col}' non trouvée")
        return None, None
    
    # Préparer les données
    feature_cols = [col for col in df.columns if col != target_col and 
                   col not in ['Timestamp'] and pd.api.types.is_numeric_dtype(df[col])]
    
    X = df[feature_cols].fillna(df[feature_cols].median())
    y = df[target_col].fillna(df[target_col].median())
    
    print(f"Sélection de {n_features} features parmi {len(feature_cols)} disponibles")
    
    selected_features = {}
    
    # Méthode 1: Corrélation avec la cible
    correlations = X.corrwith(y).abs().sort_values(ascending=False)
    selected_features['correlation'] = correlations.head(n_features).index.tolist()
    
    # Méthode 2: Univariate Feature Selection
    if pd.api.types.is_numeric_dtype(y) and y.nunique() > 10:  # Régression
        selector = SelectKBest(score_func=f_regression, k=n_features)
        selector.fit(X, y)
        selected_features['univariate'] = [feature_cols[i] for i in selector.get_support(indices=True)]
    else:  # Classification
        y_encoded = LabelEncoder().fit_transform(y.astype(str))
        selector = SelectKBest(score_func=f_classif, k=n_features)
        selector.fit(X, y_encoded)
        selected_features['univariate'] = [feature_cols[i] for i in selector.get_support(indices=True)]
    
    # Méthode 3: Recursive Feature Elimination avec Random Forest
    try:
        if pd.api.types.is_numeric_dtype(y) and y.nunique() > 10:
            estimator = RandomForestRegressor(n_estimators=50, random_state=42)
        else:
            estimator = RandomForestClassifier(n_estimators=50, random_state=42)
            y = LabelEncoder().fit_transform(y.astype(str))
        
        rfe = RFE(estimator=estimator, n_features_to_select=n_features)
        rfe.fit(X, y)
        selected_features['rfe'] = [feature_cols[i] for i in rfe.get_support(indices=True)]
    except Exception as e:
        print(f"⚠️ Erreur RFE: {e}")
    
    # Méthode 4: Lasso pour régularisation
    try:
        if pd.api.types.is_numeric_dtype(y) and y.nunique() > 10:
            lasso = Lasso(alpha=0.01, random_state=42)
            lasso.fit(X, y)
            lasso_features = X.columns[lasso.coef_ != 0]
        else:
            y_encoded = LabelEncoder().fit_transform(y.astype(str))
            lasso = LogisticRegression(penalty='l1', solver='liblinear', random_state=42, C=1.0)
            lasso.fit(X, y_encoded)
            lasso_features = X.columns[lasso.coef_[0] != 0]
        
        selected_features['lasso'] = lasso_features.tolist()[:n_features]
    except Exception as e:
        print(f"⚠️ Erreur Lasso: {e}")
    
    # Méthode combinée: consensus
    if method == 'combined':
        feature_counts = {}
        for method_name, features in selected_features.items():
            for feature in features:
                feature_counts[feature] = feature_counts.get(feature, 0) + 1
        
        # Trier par nombre de votes
        consensus_features = sorted(feature_counts.items(), key=lambda x: x[1], reverse=True)
        final_features = [feature for feature, count in consensus_features[:n_features]]
        
        print(f"📊 Méthode combinée utilisée")
        print(f"Features les plus consensuelles:")
        for i, (feature, count) in enumerate(consensus_features[:10]):
            print(f"  {i+1:2d}. {feature}: {count}/{len(selected_features)} votes")
    
    else:
        final_features = selected_features.get(method, selected_features['correlation'])
    
    # Visualiser la sélection
    fig, ax = plt.subplots(figsize=(12, 8))
    
    if method == 'combined' and len(consensus_features) > 0:
        features_to_plot = consensus_features[:20]  # Top 20
        features, votes = zip(*features_to_plot)
        
        bars = ax.barh(range(len(features)), votes)
        ax.set_yticks(range(len(features)))
        ax.set_yticklabels(features)
        ax.set_xlabel('Nombre de votes')
        ax.set_title(f'🎯 Sélection des Features - Méthode Combinée\n(Top {len(features)} features)')
        
        # Colorer différemment selon le nombre de votes
        max_votes = max(votes)
        for i, (bar, vote) in enumerate(zip(bars, votes)):
            if vote == max_votes:
                bar.set_color('#2ca02c')  # Vert pour consensus fort
            elif vote >= max_votes * 0.7:
                bar.set_color('#ff7f0e')  # Orange pour consensus moyen
            else:
                bar.set_color('#1f77b4')  # Bleu pour consensus faible
    
    ax.invert_yaxis()
    ax.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()
    
    return final_features, selected_features

# Sélectionner les meilleures features
print("🎯 Sélection pour Formation Pressure:")
if 'FormationPressure' in formation_engineered.columns:
    formation_best_features, formation_methods = select_best_features(
        formation_engineered, 'FormationPressure', n_features=15
    )

print("\n🎯 Sélection pour ROP:")
if 'ROP' in formation_engineered.columns:
    rop_best_features, rop_methods = select_best_features(
        formation_engineered, 'ROP', n_features=15
    )

print("\n🎯 Sélection pour Kick Detection:")
if 'Kick' in kick_engineered.columns:
    kick_best_features, kick_methods = select_best_features(
        kick_engineered, 'Kick', n_features=15
    )


🎯 SÉLECTION DES MEILLEURES FEATURES
--------------------------------------------------
🎯 Sélection pour Formation Pressure:


NameError: name 'formation_engineered' is not defined

In [None]:
# ====================================================================
# 4. ANALYSE DIMENSIONNELLE (PCA)
# ====================================================================

In [9]:
print(f"\n📐 ANALYSE DIMENSIONNELLE (PCA)")
print("-" * 50)

def perform_pca_analysis(df, target_col, n_components=10):
    """Effectue une analyse PCA"""
    if target_col not in df.columns:
        return None
    
    # Préparer les données
    feature_cols = [col for col in df.columns if col != target_col and 
                   col not in ['Timestamp'] and pd.api.types.is_numeric_dtype(df[col])]
    
    X = df[feature_cols].fillna(df[feature_cols].median())
    y = df[target_col]
    
    # Normaliser les données
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # PCA
    pca = PCA(n_components=min(n_components, len(feature_cols)))
    X_pca = pca.fit_transform(X_scaled)
    
    print(f"PCA effectuée: {len(feature_cols)} → {pca.n_components_} composantes")
    print(f"Variance expliquée totale: {pca.explained_variance_ratio_.sum():.1%}")
    
    # Visualisations
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    
    # 1. Variance expliquée
    axes[0, 0].plot(range(1, len(pca.explained_variance_ratio_) + 1), 
                   pca.explained_variance_ratio_, 'bo-')
    axes[0, 0].plot(range(1, len(pca.explained_variance_ratio_) + 1), 
                   pca.explained_variance_ratio_.cumsum(), 'ro-')
    axes[0, 0].set_xlabel('Composante')
    axes[0, 0].set_ylabel('Variance Expliquée')
    axes[0, 0].set_title('Variance Expliquée par Composante')
    axes[0, 0].legend(['Individuelle', 'Cumulative'])
    axes[0, 0].grid(True, alpha=0.3)
    
    # 2. Projection 2D des données
    if pca.n_components_ >= 2:
        scatter = axes[0, 1].scatter(X_pca[:, 0], X_pca[:, 1], c=y, alpha=0.6, cmap='viridis')
        axes[0, 1].set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%} variance)')
        axes[0, 1].set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%} variance)')
        axes[0, 1].set_title('Projection PCA 2D')
        plt.colorbar(scatter, ax=axes[0, 1])
    
    # 3. Contribution des features originales
    if pca.n_components_ >= 1:
        pc1_contributions = pca.components_[0]
        feature_importance = pd.DataFrame({
            'feature': feature_cols,
            'pc1_contribution': abs(pc1_contributions)
        }).sort_values('pc1_contribution', ascending=True).tail(10)
        
        axes[1, 0].barh(range(len(feature_importance)), feature_importance['pc1_contribution'])
        axes[1, 0].set_yticks(range(len(feature_importance)))
        axes[1, 0].set_yticklabels(feature_importance['feature'])
        axes[1, 0].set_xlabel('Contribution Absolue')
        axes[1, 0].set_title('Top 10 Features - PC1')
        axes[1, 0].grid(True, alpha=0.3)
    
    # 4. Biplot pour PC1 vs PC2
    if pca.n_components_ >= 2:
        for i, feature in enumerate(feature_cols[:10]):  # Top 10 seulement
            axes[1, 1].arrow(0, 0, pca.components_[0, i], pca.components_[1, i], 
                           head_width=0.01, head_length=0.01, alpha=0.7)
            axes[1, 1].text(pca.components_[0, i]*1.1, pca.components_[1, i]*1.1, 
                           feature, fontsize=8)
        
        axes[1, 1].set_xlabel('PC1')
        axes[1, 1].set_ylabel('PC2')
        axes[1, 1].set_title('Biplot PCA (Top 10 features)')
        axes[1, 1].grid(True, alpha=0.3)
    
    plt.tight_layout()
    plt.show()
    
    return pca, X_pca, scaler

# Effectuer l'analyse PCA
if 'FormationPressure' in formation_engineered.columns:
    print("📐 PCA pour Formation Pressure:")
    formation_pca, formation_pca_data, formation_scaler = perform_pca_analysis(
        formation_engineered, 'FormationPressure'
    )


📐 ANALYSE DIMENSIONNELLE (PCA)
--------------------------------------------------


NameError: name 'formation_engineered' is not defined

In [None]:
# ====================================================================
# 5. ANALYSE DE COLINÉARITÉ
# ====================================================================

In [10]:
print(f"\n🔗 ANALYSE DE COLINÉARITÉ")
print("-" * 50)

def analyze_multicollinearity(df, features=None, threshold=0.8):
    """Analyse la multicolinéarité entre features"""
    if features is None:
        features = df.select_dtypes(include=[np.number]).columns
    
    # Matrice de corrélation
    corr_matrix = df[features].corr()
    
    # Trouver les paires hautement corrélées
    high_corr_pairs = []
    for i in range(len(corr_matrix.columns)):
        for j in range(i+1, len(corr_matrix.columns)):
            corr_val = abs(corr_matrix.iloc[i, j])
            if corr_val > threshold:
                high_corr_pairs.append({
                    'feature1': corr_matrix.columns[i],
                    'feature2': corr_matrix.columns[j],
                    'correlation': corr_matrix.iloc[i, j]
                })
    
    print(f"🔍 Paires de features avec corrélation > {threshold}:")
    if high_corr_pairs:
        for pair in sorted(high_corr_pairs, key=lambda x: abs(x['correlation']), reverse=True):
            print(f"  • {pair['feature1']} ↔ {pair['feature2']}: {pair['correlation']:.3f}")
    else:
        print("  ✅ Aucune corrélation élevée détectée")
    
    # Calcul du VIF (Variance Inflation Factor) si statsmodels disponible
    try:
        from statsmodels.stats.outliers_influence import variance_inflation_factor
        
        X = df[features].fillna(df[features].median())
        
        vif_data = pd.DataFrame()
        vif_data["Feature"] = X.columns
        vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(len(X.columns))]
        vif_data = vif_data.sort_values('VIF', ascending=False)
        
        print(f"\n📊 Variance Inflation Factor (VIF):")
        print("   VIF > 10: Multicolinéarité élevée")
        print("   VIF > 5:  Multicolinéarité modérée")
        
        for _, row in vif_data.head(10).iterrows():
            status = "🔴" if row['VIF'] > 10 else "🟡" if row['VIF'] > 5 else "🟢"
            print(f"   {status} {row['Feature']}: {row['VIF']:.2f}")
        
        # Visualisation
        fig, axes = plt.subplots(1, 2, figsize=(18, 6))
        
        # Heatmap de corrélation
        mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
        sns.heatmap(corr_matrix, mask=mask, annot=False, cmap='RdYlBu_r', center=0,
                   ax=axes[0], square=True, linewidths=0.5)
        axes[0].set_title('Matrice de Corrélation')
        
        # VIF plot
        top_vif = vif_data.head(15)
        colors = ['red' if vif > 10 else 'orange' if vif > 5 else 'green' 
                 for vif in top_vif['VIF']]
        axes[1].barh(range(len(top_vif)), top_vif['VIF'], color=colors, alpha=0.7)
        axes[1].set_yticks(range(len(top_vif)))
        axes[1].set_yticklabels(top_vif['Feature'])
        axes[1].set_xlabel('VIF')
        axes[1].set_title('Variance Inflation Factor')
        axes[1].axvline(x=5, color='orange', linestyle='--', alpha=0.7, label='VIF = 5')
        axes[1].axvline(x=10, color='red', linestyle='--', alpha=0.7, label='VIF = 10')
        axes[1].legend()
        axes[1].invert_yaxis()
        axes[1].grid(True, alpha=0.3)
        
        plt.tight_layout()
        plt.show()
        
        return high_corr_pairs, vif_data
        
    except ImportError:
        print("⚠️ statsmodels non disponible pour le calcul du VIF")
        return high_corr_pairs, None

# Analyser la multicolinéarité pour les features sélectionnées
if 'formation_best_features' in locals() and formation_best_features:
    print("🔗 Multicolinéarité - Formation Features:")
    formation_multicoll, formation_vif = analyze_multicollinearity(
        formation_engineered, formation_best_features
    )


🔗 ANALYSE DE COLINÉARITÉ
--------------------------------------------------


In [None]:
# ====================================================================
# 6. RECOMMANDATIONS FINALES
# ====================================================================

In [11]:
print(f"\n📝 RECOMMANDATIONS FINALES")
print("=" * 50)

def generate_feature_recommendations(df, best_features, target_col):
    """Génère des recommandations pour les features"""
    print(f"\n🎯 RECOMMANDATIONS POUR {target_col}")
    print("-" * 40)
    
    if not best_features:
        print("⚠️ Aucune feature sélectionnée")
        return
    
    print(f"✅ Features recommandées ({len(best_features)}):")
    for i, feature in enumerate(best_features[:10], 1):
        print(f"  {i:2d}. {feature}")
    
    # Analyser les types de features
    original_features = [f for f in best_features if not any(
        suffix in f for suffix in ['_rolling_', '_lag_', '_squared', '_log', '_ratio', '_interaction']
    )]
    
    engineered_features = [f for f in best_features if f not in original_features]
    
    print(f"\n📊 Composition des features:")
    print(f"  • Features originales: {len(original_features)}")
    print(f"  • Features engineered: {len(engineered_features)}")
    
    if engineered_features:
        print(f"  📈 Types de features créées:")
        feature_types = {
            'rolling': len([f for f in engineered_features if 'rolling' in f]),
            'lag': len([f for f in engineered_features if 'lag' in f]),
            'ratio': len([f for f in engineered_features if 'ratio' in f]),
            'interaction': len([f for f in engineered_features if 'interaction' in f]),
            'polynomial': len([f for f in engineered_features if any(
                suffix in f for suffix in ['squared', 'log']
            )])
        }
        
        for ftype, count in feature_types.items():
            if count > 0:
                print(f"    - {ftype}: {count}")
    
    print(f"\n💡 Conseils d'utilisation:")
    print("  1. Utiliser ces features comme point de départ")
    print("  2. Tester différentes combinaisons selon le modèle")
    print("  3. Surveiller l'overfitting avec features engineered")
    print("  4. Considérer la stabilité temporelle des features")
    
    if len(best_features) > 20:
        print("  5. Envisager une réduction dimensionnelle supplémentaire")

# Générer les recommandations
if 'formation_best_features' in locals():
    generate_feature_recommendations(formation_engineered, formation_best_features, 'FormationPressure')

if 'rop_best_features' in locals():
    generate_feature_recommendations(formation_engineered, rop_best_features, 'ROP')

if 'kick_best_features' in locals():
    generate_feature_recommendations(kick_engineered, kick_best_features, 'Kick Detection')

# ====================================================================
# 7. SAUVEGARDE DES DONNÉES PREPROCESSÉES
# ====================================================================

print(f"\n💾 SAUVEGARDE DES DONNÉES PREPROCESSÉES")
print("-" * 50)

# Créer les datasets finaux avec les meilleures features
def create_final_datasets():
    """Crée les datasets finaux pour la modélisation"""
    datasets = {}
    
    if 'formation_best_features' in locals() and formation_best_features:
        # Dataset pour Formation Pressure
        formation_final_features = formation_best_features + ['FormationPressure']
        formation_final = formation_engineered[formation_final_features].copy()
        datasets['formation_pressure'] = formation_final
        print(f"✅ Dataset Formation Pressure: {formation_final.shape}")
    
    if 'rop_best_features' in locals() and rop_best_features:
        # Dataset pour ROP
        rop_final_features = rop_best_features + ['ROP']
        rop_final = formation_engineered[rop_final_features].copy()
        datasets['rop_prediction'] = rop_final
        print(f"✅ Dataset ROP Prediction: {rop_final.shape}")
    
    if 'kick_best_features' in locals() and kick_best_features:
        # Dataset pour Kick Detection
        kick_final_features = kick_best_features + ['Kick']
        kick_final = kick_engineered[kick_final_features].copy()
        datasets['kick_detection'] = kick_final
        print(f"✅ Dataset Kick Detection: {kick_final.shape}")
    
    return datasets

final_datasets = create_final_datasets()

# Sauvegarder les datasets
import os
os.makedirs('../data/processed', exist_ok=True)

for name, dataset in final_datasets.items():
    filepath = f'../data/processed/{name}_features.csv'
    dataset.to_csv(filepath, index=False)
    print(f"💾 Sauvegardé: {filepath}")

# Sauvegarder les listes de features sélectionnées
feature_lists = {}
if 'formation_best_features' in locals():
    feature_lists['formation_pressure'] = formation_best_features
if 'rop_best_features' in locals():
    feature_lists['rop_prediction'] = rop_best_features
if 'kick_best_features' in locals():
    feature_lists['kick_detection'] = kick_best_features

import json
with open('../data/processed/selected_features.json', 'w') as f:
    json.dump(feature_lists, f, indent=2)
print("💾 Sauvegardé: ../data/processed/selected_features.json")


📝 RECOMMANDATIONS FINALES

💾 SAUVEGARDE DES DONNÉES PREPROCESSÉES
--------------------------------------------------
💾 Sauvegardé: ../data/processed/selected_features.json


In [None]:
# ====================================================================
# 8. RÉSUMÉ ET CONCLUSIONS
# ====================================================================

In [12]:
print(f"\n📋 RÉSUMÉ DE L'ANALYSE DES FEATURES")
print("=" * 60)

print("🔍 TRAVAIL RÉALISÉ:")
print("✅ Analyse de l'importance des features existantes")
print("✅ Création de nouvelles features via feature engineering")
print("✅ Sélection des meilleures features par méthodes multiples")
print("✅ Analyse dimensionnelle avec PCA")
print("✅ Détection de multicolinéarité")
print("✅ Préparation des datasets pour modélisation")

# Statistiques finales
total_original_features = len([col for col in formation_df.columns if pd.api.types.is_numeric_dtype(formation_df[col])])
total_engineered_features = len([col for col in formation_engineered.columns if pd.api.types.is_numeric_dtype(formation_engineered[col])])
features_created = total_engineered_features - total_original_features

print(f"\n📊 STATISTIQUES:")
print(f"  • Features originales: {total_original_features}")
print(f"  • Features créées: {features_created}")
print(f"  • Total après engineering: {total_engineered_features}")

if final_datasets:
    print(f"  • Datasets finaux créés: {len(final_datasets)}")
    for name, dataset in final_datasets.items():
        print(f"    - {name}: {dataset.shape[1]-1} features + 1 target")

print(f"\n🎯 FEATURES LES PLUS IMPORTANTES:")
if 'formation_best_features' in locals():
    print(f"  Formation Pressure (Top 5): {formation_best_features[:5]}")
if 'rop_best_features' in locals():
    print(f"  ROP Prediction (Top 5): {rop_best_features[:5]}")
if 'kick_best_features' in locals():
    print(f"  Kick Detection (Top 5): {kick_best_features[:5]}")

print(f"\n💡 INSIGHTS CLÉS:")
print("  • Les features engineered améliorent significativement la prédiction")
print("  • Les moyennes mobiles et ratios sont particulièrement informatives")
print("  • Attention à la multicolinéarité avec les features dérivées")
print("  • Les features temporelles ajoutent de la valeur prédictive")

print(f"\n⚠️ POINTS D'ATTENTION:")
print("  • Valider la stabilité des features dans le temps")
print("  • Surveiller l'overfitting avec trop de features engineered")
print("  • Tester la robustesse des features sur données réelles")
print("  • Considérer l'interprétabilité métier des features complexes")

print(f"\n🚀 PROCHAINES ÉTAPES:")
print("  1. Utiliser ces datasets dans le notebook 03_model_comparison.ipynb")
print("  2. Tester différents algorithmes de ML")
print("  3. Valider les performances sur données de test")
print("  4. Optimiser les hyperparamètres")
print("  5. Analyser l'importance des features dans les modèles finaux")

print(f"\n📁 FICHIERS CRÉÉS:")
for name in final_datasets.keys():
    print(f"  • ../data/processed/{name}_features.csv")
print("  • ../data/processed/selected_features.json")

print(f"\n🎉 ANALYSE DES FEATURES TERMINÉE!")
print("Passez au notebook suivant: 03_model_comparison.ipynb")
print("=" * 60)


📋 RÉSUMÉ DE L'ANALYSE DES FEATURES
🔍 TRAVAIL RÉALISÉ:
✅ Analyse de l'importance des features existantes
✅ Création de nouvelles features via feature engineering
✅ Sélection des meilleures features par méthodes multiples
✅ Analyse dimensionnelle avec PCA
✅ Détection de multicolinéarité
✅ Préparation des datasets pour modélisation


NameError: name 'formation_df' is not defined