In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

In [19]:
def diagnose_dataframe(df, target_col=None):
    """Print a data-quality report for ``df``; nothing is returned.

    The report covers, in order: shape and memory footprint, dtype counts,
    missing values, infinite values in numeric columns, categorical columns,
    numeric columns with at most 10 distinct values, pairs of identical
    columns and, when ``target_col`` names an existing column, the class
    distribution of that column.

    Args:
        df: DataFrame to inspect. It is only read, never modified.
        target_col: Optional name of the target column. Ignored when falsy
            or absent from ``df``.
    """
    # --- Overview -----------------------------------------------------------
    memory_mb = df.memory_usage(deep=True).sum() / 1024**2
    print("\n📊 DIAGNOSTIC GÉNÉRAL")
    print(f"Shape: {df.shape}")
    print(f"Mémoire utilisée: {memory_mb:.2f} MB")

    # --- Dtypes -------------------------------------------------------------
    print("\n📋 TYPES DE DONNÉES:")
    print(df.dtypes.value_counts())

    # --- Missing values -----------------------------------------------------
    print("\n❓ VALEURS MANQUANTES:")
    null_counts = df.isnull().sum()
    null_report = pd.DataFrame({
        'Colonne': null_counts.index,
        'Manquantes': null_counts.values,
        'Pourcentage': ((null_counts / len(df)) * 100).values,
    })
    # Keep only the columns that actually have gaps, worst first.
    null_report = null_report[null_report['Manquantes'] > 0]
    null_report = null_report.sort_values('Manquantes', ascending=False)
    if null_report.empty:
        print("✅ Aucune valeur manquante")
    else:
        print(null_report)

    # --- Infinite values (numeric columns only) -----------------------------
    print("\n♾️ VALEURS INFINIES:")
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    inf_per_column = {name: np.isinf(df[name]).sum() for name in numeric_cols}
    inf_per_column = {name: n for name, n in inf_per_column.items() if n > 0}
    if not inf_per_column:
        print("✅ Aucune valeur infinie")
    for name, n in inf_per_column.items():
        print(f"   {name}: {n} valeurs infinies")

    # --- Categorical columns ------------------------------------------------
    print("\n🏷️ VARIABLES CATÉGORIELLES:")
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns
    if len(categorical_cols) == 0:
        print("✅ Aucune variable catégorielle")
    for name in categorical_cols:
        n_distinct = df[name].nunique()
        print(f"   {name}: {n_distinct} valeurs uniques")
        # Listing the values is only readable for small cardinalities.
        if n_distinct <= 10:
            print(f"      Valeurs: {df[name].unique()}")

    # --- Numeric columns that look categorical ------------------------------
    print("\n🔢 VARIABLES NUMÉRIQUES SUSPECTES (peu de valeurs uniques):")
    for name in numeric_cols:
        n_distinct = df[name].nunique()
        if n_distinct <= 10:
            print(f"   {name}: {n_distinct} valeurs uniques - {df[name].unique()}")

    # --- Identical columns (pairwise comparison) ----------------------------
    print("\n🔍 COLONNES DUPLIQUÉES:")
    names = df.columns
    n_cols = len(names)
    twin_pairs = [
        (names[i], names[j])
        for i in range(n_cols)
        for j in range(i + 1, n_cols)
        if df[names[i]].equals(df[names[j]])
    ]
    if not twin_pairs:
        print("✅ Aucune colonne dupliquée")
    for left, right in twin_pairs:
        print(f"   {left} = {right}")

    # --- Target distribution ------------------------------------------------
    if not target_col or target_col not in df.columns:
        return
    print(f"\n🎯 ANALYSE DE LA VARIABLE CIBLE '{target_col}':")
    print(df[target_col].value_counts())
    print("Pourcentages:")
    print(df[target_col].value_counts(normalize=True) * 100)

In [20]:
def _is_label_like(series):
    """Return True when ``series`` holds labels (object, string or category dtype)."""
    dtype = series.dtype
    return (
        isinstance(dtype, pd.CategoricalDtype)
        or pd.api.types.is_object_dtype(dtype)
        or pd.api.types.is_string_dtype(dtype)
    )


def _encode_labels(series):
    """Label-encode ``series`` into an integer array, mapping missing values to 'Unknown'."""
    # Casting to object first avoids the error pandas raises when filling a
    # categorical column with a value that is not one of its categories.
    values = series.astype(object).fillna('Unknown')
    # LabelEncoder refuses inputs mixing strings and numbers (e.g. a numeric
    # object column that just received 'Unknown'); fall back to text there.
    if values.map(type).nunique() > 1:
        values = values.astype(str)
    return LabelEncoder().fit_transform(values)


def clean_dataframe(df, target_col=None):
    """Return a numeric, gap-free feature matrix and the (encoded) target.

    Steps, applied to a copy of ``df``:
      1. Split off ``target_col``. A label-like target is encoded: if it
         contains the value 'Not_Canceled' it becomes 1 for 'Not_Canceled'
         and 0 otherwise, else it is label-encoded.
      2. Label-encode every object/string/category feature into
         ``<col>_encoded`` and drop the original column.
      3. Replace +/-inf by NaN, then fill NaN in numeric columns with the
         column median. The median is computed after the infinities are
         removed so it cannot be skewed by them.
      4. Drop numeric columns that hold at most one distinct value.

    Args:
        df: Input DataFrame; it is not modified.
        target_col: Optional name of the target column.

    Returns:
        ``(X, y)``. ``y`` is None when no target was requested, a Series for
        the 'Not_Canceled' encoding or an untouched numeric target, and a
        NumPy array when label-encoded. ``(None, None)`` is returned when
        ``target_col`` is given but missing from ``df``.
    """
    print(f"\n🧹 NETTOYAGE AUTOMATIQUE")
    df_clean = df.copy()

    # 1. Separate (and, if needed, encode) the target.
    if target_col:
        if target_col not in df_clean.columns:
            print(f"❌ Variable cible '{target_col}' non trouvée")
            return None, None
        y = df_clean[target_col]
        if _is_label_like(y):
            print(f"   Encodage de la variable cible '{target_col}'")
            is_not_canceled = y.astype(object) == 'Not_Canceled'
            if is_not_canceled.any():
                y = is_not_canceled.astype(int)
            else:
                y = _encode_labels(y)
        X = df_clean.drop(target_col, axis=1)
    else:
        X = df_clean
        y = None

    # 2. Encode label-like features.
    categorical_cols = [col for col in X.columns if _is_label_like(X[col])]
    if categorical_cols:
        print(f"   Encodage de {len(categorical_cols)} variables catégorielles")
        for col in categorical_cols:
            X[f'{col}_encoded'] = _encode_labels(X[col])
            X = X.drop(col, axis=1)

    # 3. Report missing values as found in the input, before infinities are
    #    converted to NaN, so the count reflects genuinely missing cells.
    numeric_cols = X.select_dtypes(include=[np.number]).columns
    missing_numeric = X[numeric_cols].isnull().sum()
    cols_with_missing = missing_numeric[missing_numeric > 0].index
    if len(cols_with_missing) > 0:
        print(f"   Remplacement des valeurs manquantes dans {len(cols_with_missing)} colonnes")

    # 4. Neutralise infinities first, then impute everything in one pass with
    #    medians that no longer see +/-inf.
    print(f"   Remplacement des valeurs infinies")
    X = X.replace([np.inf, -np.inf], np.nan)
    if len(numeric_cols) > 0:
        X = X.fillna(X[numeric_cols].median())

    # 5. Drop zero-variance numeric columns (this also removes columns that
    #    were entirely missing and therefore could not be imputed).
    constant_cols = [
        col for col in numeric_cols
        if col in X.columns and X[col].nunique() <= 1
    ]
    if constant_cols:
        print(f"   Suppression de {len(constant_cols)} colonnes constantes: {constant_cols}")
        X = X.drop(constant_cols, axis=1)

    # 6. Final sanity report. is_numeric_dtype also understands pandas
    #    extension dtypes, on which np.issubdtype raises.
    all_numeric = all(pd.api.types.is_numeric_dtype(dtype) for dtype in X.dtypes)
    numeric_values = X.select_dtypes(include=[np.number]).to_numpy(dtype='float64', na_value=np.nan)
    print(f"   Shape final: {X.shape}")
    print(f"   Toutes colonnes numériques: {all_numeric}")
    print(f"   Valeurs manquantes: {X.isnull().sum().sum()}")
    print(f"   Valeurs infinies: {np.isinf(numeric_values).sum()}")

    return X, y

In [21]:
def _safe_ratio(numerator, denominator, fill_value=0.0):
    """Divide two Series element-wise, returning ``fill_value`` where the denominator is 0."""
    denominator = denominator.astype('float64')
    nonzero = denominator != 0
    # Mask zeros before dividing so no inf / huge value is ever produced.
    return (numerator / denominator.where(nonzero)).where(nonzero, fill_value)


def create_feature_engineering_safe(df):
    """Return a copy of ``df`` enriched with derived booking features.

    Each feature group is created only when its source columns are present:
      * ``lead_time``: four duration-bucket flags, a log transform and a
        z-score.
      * weekend/week nights: ``total_nights`` and ``weekend_ratio``.
      * adults/children: ``total_guests`` and ``children_ratio``.
      * ``avg_price_per_room``: ``price_per_guest`` / ``price_per_night``
        (when the corresponding totals exist) and ``expensive_booking``
        (price above the 75th percentile).
      * previous bookings/cancellations: ``customer_experience`` and, with
        ``repeated_guest``, ``loyalty_score``.
      * ``arrival_month``: ``peak_season`` and ``low_season`` flags.

    Ratios and per-unit prices are 0.0 when the denominator is 0 (no nights
    or no guests) instead of exploding to very large values. Negative lead
    times are clipped to 0 before the log transform so it stays finite.

    NOTE(review): the z-score and the 75th-percentile threshold use statistics
    of the whole frame; if this runs before a train/test split those two
    features leak information from the test rows.

    Args:
        df: Input DataFrame; it is not modified.

    Returns:
        A new DataFrame containing the original columns plus the new features.
    """
    print(f"\n🔧 FEATURE ENGINEERING SÉCURISÉ")
    df_enhanced = df.copy()

    # Names of the features actually written, so the reported count is exact
    # even when optional features are skipped.
    created = []

    def _add(name, values):
        df_enhanced[name] = values
        created.append(name)

    # Lead time features
    if 'lead_time' in df.columns:
        print("   Création des features lead_time")
        lead = df['lead_time']

        # Duration buckets (days): <=7, 8-30, 31-90, >90.
        _add('lead_time_very_short', (lead <= 7).astype(int))
        _add('lead_time_short', ((lead > 7) & (lead <= 30)).astype(int))
        _add('lead_time_medium', ((lead > 30) & (lead <= 90)).astype(int))
        _add('lead_time_long', (lead > 90).astype(int))

        # log1p is undefined below -1; clip so invalid negatives map to 0.
        _add('lead_time_log', np.log1p(lead.clip(lower=0)))

        # std() is NaN for a single row; treat it as 0 so the result is 0, not NaN.
        lead_std = lead.std()
        if pd.isna(lead_std):
            lead_std = 0.0
        _add('lead_time_std', (lead - lead.mean()) / (lead_std + 1e-8))

    # Stay length features
    if all(col in df.columns for col in ['no_of_weekend_nights', 'no_of_week_nights']):
        print("   Création des features de séjour")
        _add('total_nights', df['no_of_weekend_nights'] + df['no_of_week_nights'])
        _add('weekend_ratio', _safe_ratio(df['no_of_weekend_nights'], df_enhanced['total_nights']))

    # Guest features
    if all(col in df.columns for col in ['no_of_adults', 'no_of_children']):
        print("   Création des features d'invités")
        _add('total_guests', df['no_of_adults'] + df['no_of_children'])
        _add('children_ratio', _safe_ratio(df['no_of_children'], df_enhanced['total_guests']))

    # Price features
    if 'avg_price_per_room' in df.columns:
        print("   Création des features de prix")
        price = df['avg_price_per_room']
        if 'total_guests' in df_enhanced.columns:
            _add('price_per_guest', _safe_ratio(price, df_enhanced['total_guests']))
        if 'total_nights' in df_enhanced.columns:
            _add('price_per_night', _safe_ratio(price, df_enhanced['total_nights']))

        # Flag bookings in the top price quartile.
        price_q75 = price.quantile(0.75)
        _add('expensive_booking', (price > price_q75).astype(int))

    # Customer history features
    if all(col in df.columns for col in ['no_of_previous_bookings_not_canceled', 'no_of_previous_cancellations']):
        print("   Création des features d'expérience")
        _add('customer_experience',
             df['no_of_previous_bookings_not_canceled'] - df['no_of_previous_cancellations'])

        if 'repeated_guest' in df.columns:
            _add('loyalty_score', df['repeated_guest'] * df_enhanced['customer_experience'])

    # Seasonal features
    if 'arrival_month' in df.columns:
        print("   Création des features temporelles")
        _add('peak_season', df['arrival_month'].isin([6, 7, 8, 12]).astype(int))
        _add('low_season', df['arrival_month'].isin([1, 2, 11]).astype(int))

    feature_created = len(created)
    print(f"   ✅ {feature_created} nouvelles features créées")
    print(f"   Shape final: {df_enhanced.shape}")

    return df_enhanced

In [22]:
# RUN THE DIAGNOSTIC AND CLEANING PIPELINE
print("🚀 DÉMARRAGE DU DIAGNOSTIC")

# Single place to edit when the project folder moves.
DATA_DIR = 'C:/Users/tneron2023/PycharmProjects/Python_IA/project_hotel/datas'

# Candidate input files, tried in order; the first one that exists is used.
file_paths = [
    f'{DATA_DIR}/Hotel_enhanced.csv',
    f'{DATA_DIR}/Hotel_clean.csv',
    f'{DATA_DIR}/Hotel_Reservations.csv'
]

df = None
for file_path in file_paths:
    try:
        df = pd.read_csv(file_path)
        print(f"✅ Fichier chargé: {file_path}")
        break
    except FileNotFoundError:
        # Missing candidate: fall through to the next path.
        continue

if df is None:
    print("❌ Aucun fichier de données trouvé")
else:
    # Pick the first known target column name present in the data.
    target_candidates = ['booking_status_Not_Canceled', 'booking_status']
    target = next((name for name in target_candidates if name in df.columns), None)

    if target is None:
        print("⚠️ Variable cible non identifiée automatiquement")
        print(f"Colonnes disponibles: {df.columns.tolist()}")

    # Diagnostic (target may be None; the target section is then skipped)
    diagnose_dataframe(df, target)

    # Cleaning
    X_clean, y_clean = clean_dataframe(df, target)

    if X_clean is not None:
        # Feature engineering on the cleaned features
        df_enhanced = create_feature_engineering_safe(X_clean)

        # Re-attach the target (if any) under the fixed name 'target'
        if y_clean is not None:
            df_final = df_enhanced.copy()
            df_final['target'] = y_clean
        else:
            df_final = df_enhanced

        output_path = f'{DATA_DIR}/Hotel_cleaned_safe.csv'
        df_final.to_csv(output_path, index=False)
        print(f"\n💾 Dataset nettoyé sauvegardé: {output_path}")

        # Final summary
        print(f"\n✅ NETTOYAGE TERMINÉ")
        print(f"Shape original: {df.shape}")
        print(f"Shape final: {df_final.shape}")
        print(f"Features ajoutées: {df_final.shape[1] - df.shape[1]}")

        # Validate the frame that was actually written to disk (df_final),
        # not the intermediate X_clean: engineered features and the target
        # are part of what a model will consume.
        all_numeric = all(pd.api.types.is_numeric_dtype(dtype) for dtype in df_final.dtypes)
        no_missing = not df_final.isnull().to_numpy().any()
        if all_numeric and no_missing:
            print("✅ Dataset prêt pour Machine Learning")
        else:
            print("❌ Il reste des problèmes dans le dataset")

print("\n🎉 DIAGNOSTIC TERMINÉ!")

🚀 DÉMARRAGE DU DIAGNOSTIC
✅ Fichier chargé: C:/Users/tneron2023/PycharmProjects/Python_IA/project_hotel/datas/Hotel_clean.csv

📊 DIAGNOSTIC GÉNÉRAL
Shape: (29999, 31)
Mémoire utilisée: 7.10 MB

📋 TYPES DE DONNÉES:
float64    18
int64      13
Name: count, dtype: int64

❓ VALEURS MANQUANTES:
✅ Aucune valeur manquante

♾️ VALEURS INFINIES:
✅ Aucune valeur infinie

🏷️ VARIABLES CATÉGORIELLES:
✅ Aucune variable catégorielle

🔢 VARIABLES NUMÉRIQUES SUSPECTES (peu de valeurs uniques):
   no_of_adults: 5 valeurs uniques - [2 1 3 0 4]
   no_of_children: 6 valeurs uniques - [ 0  2  1  3 10  9]
   no_of_weekend_nights: 8 valeurs uniques - [1 2 0 4 3 6 5 7]
   required_car_parking_space: 2 valeurs uniques - [0 1]
   arrival_year: 2 valeurs uniques - [2017 2018]
   repeated_guest: 2 valeurs uniques - [0 1]
   no_of_previous_cancellations: 9 valeurs uniques - [ 0  3  1  2 11  4  5 13  6]
   no_of_special_requests: 6 valeurs uniques - [0 1 3 2 4 5]
   type_of_meal_plan_Meal Plan 1: 2 valeurs uniques 