# Importation des packages nécessaires

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.decomposition import PCA
import plotly.express as px
from statsmodels.tsa.seasonal import seasonal_decompose

## Chargement et description des données

In [None]:
# Load the smart-grid dataset, parsing the 'Timestamp' column as datetimes.
data = pd.read_csv('Data/smart_grid_dataset.csv', parse_dates=['Timestamp'])

# Initial overview: table size, column dtypes and summary statistics
# (statistics are transposed so that each variable is one row).
summary_stats = data.describe().T
print("=== Informations de base ===")
print(f"Nombre d'observations: {len(data)}")
print(f"Nombre de variables: {len(data.columns)}")
print("\n=== Types de données ===")
print(data.dtypes)
print("\n=== Statistiques descriptives ===")
print(summary_stats)

# Number of missing values in each column.
missing_per_column = data.isna().sum()
print("\n=== Valeurs manquantes ===")
print(missing_per_column)

## Analyse exploratoire des données (EDA)

In [None]:
# Time-series view of the power consumption.
plt.figure(figsize=(15, 6))
data.set_index('Timestamp')['Power Consumption (kW)'].plot(title='Consommation électrique au fil du temps')
plt.ylabel('Consommation (kW)')
plt.show()

# Distribution of the key variables: one histogram (with KDE) per subplot,
# filled row by row in the 3x2 grid.
key_columns = [
    'Power Consumption (kW)',
    'Voltage (V)',
    'Current (A)',
    'Temperature (°C)',
    'Solar Power (kW)',
    'Wind Power (kW)',
]
fig, axes = plt.subplots(3, 2, figsize=(15, 12))
for ax, column in zip(axes.flat, key_columns):
    sns.histplot(data[column], kde=True, ax=ax)
plt.tight_layout()
plt.show()

# Correlation matrix. Only numeric columns are kept: the frame still holds the
# datetime 'Timestamp' column, which recent pandas versions refuse to
# correlate instead of silently dropping it.
corr_matrix = data.select_dtypes(include='number').corr()
plt.figure(figsize=(15, 10))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0)
plt.title('Matrice de corrélation')
plt.show()

# Class balance of the two categorical indicator columns.
print("\n=== Analyse des conditions de surcharge ===")
print(data['Overload Condition'].value_counts())
print("\n=== Analyse des défauts du transformateur ===")
print(data['Transformer Fault'].value_counts())

## Prétraitement des données

In [None]:
# Drop duplicated timestamps, then order the rows chronologically so that the
# unshuffled train/test split below really is "past vs. future" even if the
# CSV is not already sorted.
data = (
    data.drop_duplicates(subset=['Timestamp'])
    .sort_values('Timestamp')
    .reset_index(drop=True)
)

# Feature engineering: calendar features derived from the timestamp.
data['Hour'] = data['Timestamp'].dt.hour
data['DayOfWeek'] = data['Timestamp'].dt.dayofweek
data['Month'] = data['Timestamp'].dt.month

# Split features and target.
X = data.drop(['Timestamp', 'Predicted Load (kW)'], axis=1)
y = data['Predicted Load (kW)']

# Chronological train/test split (last 20% of the rows form the test set).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=False)

# Preprocessing pipeline.
# Select every numeric dtype rather than only float64/int64: the `.dt`
# accessors above can yield int32 columns, which would otherwise be silently
# excluded from the model.
numeric_features = X.select_dtypes(include='number').columns
# SelectKBest raises if k exceeds the number of available features.
n_selected_features = min(10, len(numeric_features))
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('feature_selection', SelectKBest(score_func=f_regression, k=n_selected_features))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features)])

# Add PCA for dimensionality reduction.
full_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=0.95))  # Keep 95% of the variance
])

## Division des données

In [None]:
# The data has already been split into train/test sets (80/20).
# Cross-validation is handled later on through GridSearchCV.

# Sanity check: report the shape of every split.
shape_report = [
    f"Dimensions X_train: {X_train.shape}",
    f"Dimensions X_test: {X_test.shape}",
    f"Dimensions y_train: {y_train.shape}",
    f"Dimensions y_test: {y_test.shape}",
]
for report_line in shape_report:
    print(report_line)

## Construction des modèles

In [None]:
# Helper used to fit and evaluate each candidate model.
def evaluate_model(model, X_train, y_train, X_test, y_test):
    """Fit *model*, print its test metrics and plot predictions vs. actuals.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is
            fitted in place on the training data.
        X_train: Training features.
        y_train: Training target.
        X_test: Test features.
        y_test: Test target.

    Returns:
        dict with keys ``'model'`` (the fitted estimator), ``'rmse'``,
        ``'mae'`` and ``'r2'``, all computed on the test set.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"RMSE: {rmse:.4f}")
    print(f"MAE: {mae:.4f}")
    print(f"R²: {r2:.4f}")

    # Scatter plot of predictions against actual values.
    plt.figure(figsize=(10, 6))
    plt.scatter(y_test, y_pred, alpha=0.3)
    # Identity line (perfect prediction). Its range comes from the data that
    # is actually plotted, not from a module-level variable.
    lower = min(np.min(y_test), np.min(y_pred))
    upper = max(np.max(y_test), np.max(y_pred))
    plt.plot([lower, upper], [lower, upper], 'k--', lw=2)
    plt.xlabel('Valeurs Réelles')
    plt.ylabel('Prédictions')
    plt.title('Prédictions vs Valeurs Réelles')
    plt.show()

    return {'model': model, 'rmse': rmse, 'mae': mae, 'r2': r2}

from sklearn.base import clone

# Each pipeline gets its own unfitted copy of the preprocessing steps.
# Pipeline.fit fits its steps in place, so sharing one preprocessing object
# between pipelines would make fitting one model overwrite the fitted
# preprocessing state used by the others.

# Model 1: Random Forest
rf_pipeline = Pipeline([
    ('preprocessing', clone(full_pipeline)),
    ('regressor', RandomForestRegressor(random_state=42))
])

# Model 2: Gradient Boosting
gb_pipeline = Pipeline([
    ('preprocessing', clone(full_pipeline)),
    ('regressor', GradientBoostingRegressor(random_state=42))
])

# Model 3: SVM (uses the scaled, selected features directly)
svm_pipeline = Pipeline([
    ('preprocessing', clone(preprocessor)),  # No PCA for the SVM
    ('regressor', SVR(kernel='rbf', C=100, gamma=0.1, epsilon=0.1))
])

# Evaluate every model on the same train/test split.
print("=== Random Forest ===")
rf_results = evaluate_model(rf_pipeline, X_train, y_train, X_test, y_test)

print("\n=== Gradient Boosting ===")
gb_results = evaluate_model(gb_pipeline, X_train, y_train, X_test, y_test)

print("\n=== SVM ===")
svm_results = evaluate_model(svm_pipeline, X_train, y_train, X_test, y_test)

## Visualisation des résultats

In [None]:
# Side-by-side comparison of the three models.
results_df = pd.DataFrame({
    'Model': ['Random Forest', 'Gradient Boosting', 'SVM'],
    'RMSE': [rf_results['rmse'], gb_results['rmse'], svm_results['rmse']],
    'R²': [rf_results['r2'], gb_results['r2'], svm_results['r2']]
})

# Comparison bar charts.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
sns.barplot(x='Model', y='RMSE', data=results_df, ax=axes[0])
axes[0].set_title('Comparaison des RMSE')
sns.barplot(x='Model', y='R²', data=results_df, ax=axes[1])
axes[1].set_title('Comparaison des R²')
plt.tight_layout()
plt.show()

# Feature importances (Random Forest).
rf_model = rf_pipeline.named_steps['regressor']
feature_importances = rf_model.feature_importances_

# Names of the original features kept by SelectKBest. The 'preprocessing' step
# is itself a Pipeline (ColumnTransformer followed by PCA), so the
# ColumnTransformer has to be reached through its 'preprocessor' step.
processed_features = numeric_features[rf_pipeline.named_steps['preprocessing']
                                    .named_steps['preprocessor']
                                    .named_transformers_['num']
                                    .named_steps['feature_selection']
                                    .get_support()]

# The forest is trained on the PCA output, so it has one importance per
# principal component, not per original feature. Label them accordingly
# instead of pairing them with `processed_features`, whose length generally
# differs.
component_labels = [f'PC{i + 1}' for i in range(len(feature_importances))]
importance_df = pd.DataFrame({'Feature': component_labels, 'Importance': feature_importances})
importance_df = importance_df.sort_values('Importance', ascending=False)

plt.figure(figsize=(12, 8))
sns.barplot(x='Importance', y='Feature', data=importance_df)
plt.title('Importance des caractéristiques (Random Forest)')
plt.show()

# Residual analysis for the Random Forest.
y_pred_rf = rf_pipeline.predict(X_test)
residuals = y_test - y_pred_rf

plt.figure(figsize=(10, 6))
sns.scatterplot(x=y_pred_rf, y=residuals)
plt.axhline(y=0, color='r', linestyle='--')
plt.xlabel('Prédictions')
plt.ylabel('Résidus')
plt.title('Analyse des résidus')
plt.show()

## Évaluation et optimisation du modèle

In [None]:
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit

# Hyper-parameter grid for the Random Forest pipeline.
param_grid = {
    'regressor__n_estimators': [100, 200, 300],
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__min_samples_split': [2, 5, 10],
    'regressor__min_samples_leaf': [1, 2, 4]
}

# The train/test split is chronological, so the cross-validation must be too:
# TimeSeriesSplit always validates on observations that come after the ones
# used for training, whereas plain K-fold would train on future data.
time_series_cv = TimeSeriesSplit(n_splits=5)
grid_search = GridSearchCV(rf_pipeline, param_grid, cv=time_series_cv,
                          scoring='neg_mean_squared_error',
                          n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

# Best hyper-parameters; best_estimator_ is refitted on the whole training set.
print("Meilleurs paramètres:", grid_search.best_params_)
best_model = grid_search.best_estimator_

# Final evaluation on the held-out test set.
y_pred_final = best_model.predict(X_test)
final_rmse = np.sqrt(mean_squared_error(y_test, y_pred_final))
final_r2 = r2_score(y_test, y_pred_final)

print(f"RMSE final: {final_rmse:.4f}")
print(f"R² final: {final_r2:.4f}")

# Plot final predictions against actual values over time. Timestamps are
# looked up through the test-set index so they are guaranteed to match the
# test rows (a positional tail slice would not, e.g. for an empty test set).
test_timestamps = data.loc[y_test.index, 'Timestamp']
plt.figure(figsize=(15, 6))
plt.plot(test_timestamps, y_test.values, label='Réel', alpha=0.7)
plt.plot(test_timestamps, y_pred_final, label='Prédiction', alpha=0.7)
plt.title('Comparaison des prédictions finales avec les valeurs réelles')
plt.xlabel('Date')
plt.ylabel('Charge prédite (kW)')
plt.legend()
plt.show()

# Persist the tuned pipeline (preprocessing + regressor) to disk.
import joblib
joblib.dump(best_model, 'smart_grid_load_predictor.pkl')

# To reuse the model later:
# loaded_model = joblib.load('smart_grid_load_predictor.pkl')