In [3]:
# =="""
# ==============================================================================
# @title 1. Instalasi dan Impor Library
# ==============================================================================
# Jalankan sel ini terlebih dahulu untuk mengimpor semua library yang dibutuhkan.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.preprocessing import MinMaxScaler
import os
import joblib
from collections import Counter
import warnings

warnings.filterwarnings('ignore')
print("✅ Library berhasil diimpor.")

# ==============================================================================
# @title 2. Konfigurasi Utama
# ==============================================================================
# Sel ini mendefinisikan variabel-variabel penting seperti lokasi folder
# dan daftar kolom yang akan digunakan dalam model.

# Folder holding the uploaded CSV data, and folder receiving every result.
SOURCE_DATA_DIR = 'sumber_data'
RESULTS_DIR = 'hasil_model_aggregate_pergedung'

# Create both folders up front so later cells can read from / write to them.
os.makedirs(RESULTS_DIR, exist_ok=True)
os.makedirs(SOURCE_DATA_DIR, exist_ok=True)

# Column that every model predicts.
TARGET_VARIABLE = 'Konsumsi Energi'

# Columns kept from each CSV: the target followed by the weather features
# ('Apparent Temperature' is deliberately left out).
RELEVANT_COLUMNS = [
    TARGET_VARIABLE,
    'Temperature',
    'Showers',
    'Cloud Cover',
    'Weather Code',
    'Relative Humidity',
    'Dew Point',
    'Precipitation',
    'Pressure MSL',
    'Surface Pressure',
    'Evapotranspiration',
    'Vapour Pressure Deficit',
    'Wind Speed',
    'Wind Direction',
    'Wind Gusts',
    'Soil Temperature',
    'Sunshine Duration',
    'UV Index',
    'Direct Radiation',
]

# Minimum number of usable rows a group needs before any model is trained.
# Lowered to 500 for testing; the earlier setting was 3000.
MINIMUM_ROWS = 500

print(f"📁 Folder sumber data diatur ke: '{SOURCE_DATA_DIR}'")
print(f"📁 Folder hasil akan disimpan di: '{RESULTS_DIR}'")
print(f"📊 Batas minimum data untuk pelatihan: {MINIMUM_ROWS} baris")

# ==============================================================================
# @title 3. Persiapan Folder dan Unggah Data
# ==============================================================================
# PENTING: Sebelum menjalankan sel-sel berikutnya, unggah data Anda.
#
# 1. Di panel file sebelah kiri Google Colab, Anda akan melihat folder 'sumber_data'.
# 2. Klik kanan pada folder 'sumber_data' tersebut dan pilih 'Upload'.
# 3. Unggah folder 'witel' dan 'opmc' Anda yang berisi semua data CSV
#    ke dalam folder 'sumber_data'.
#
# Setelah selesai, Anda bisa melanjutkan menjalankan sel-sel berikutnya.

print("✅ Sel ini siap.")
print(f"Pastikan Anda telah mengunggah data Anda ke dalam folder '{SOURCE_DATA_DIR}'.")

# ==============================================================================
# @title 4. Definisi Fungsi-Fungsi Pembantu
# ==============================================================================
# Sel ini berisi fungsi-fungsi utama untuk melatih model dan membuat visualisasi.
# Jalankan sel ini untuk mendefinisikan fungsi agar bisa digunakan nanti.

def train_and_evaluate_models(X_train, y_train, X_val, y_val, X_test, y_test):
    """Train the three regressors (RF, GB, LSTM) and score each on the test split.

    The two tree ensembles are fitted on the raw features. The LSTM is fitted
    on min-max scaled features and target (both scalers are fitted on the
    training split only), with each row presented as a sequence of length
    one; the validation split is handed to Keras as ``validation_data``.

    Args:
        X_train, X_val, X_test: Feature tables of the three splits.
        y_train, y_val, y_test: Matching targets. ``y_train`` and ``y_val``
            must expose ``.values`` (e.g. pandas Series).

    Returns:
        dict: Keyed by ``'RandomForest'``, ``'GradientBoosting'`` and
        ``'LSTM'``. Each value holds ``'model'``, ``'predictions'`` (test-set
        predictions in the target's original units, flattened for the LSTM),
        ``'mae'``, ``'rmse'`` and ``'r2'`` computed against ``y_test``.
    """

    def _score(y_pred):
        # Same three metrics for every model, always against the test target.
        return {
            'mae': mean_absolute_error(y_test, y_pred),
            'rmse': np.sqrt(mean_squared_error(y_test, y_pred)),
            'r2': r2_score(y_test, y_pred),
        }

    def _to_sequences(matrix):
        # (rows, features) -> (rows, 1 timestep, features), as the LSTM expects.
        return matrix[:, np.newaxis, :]

    results = {}

    # --- Model 1: Random Forest Regressor ---
    print("   - Melatih Random Forest...")
    forest = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
    forest.fit(X_train, y_train)
    forest_pred = forest.predict(X_test)
    results['RandomForest'] = {'model': forest, 'predictions': forest_pred, **_score(forest_pred)}

    # --- Model 2: Gradient Boosting Regressor ---
    print("   - Melatih Gradient Boosting...")
    booster = GradientBoostingRegressor(n_estimators=100, random_state=42)
    booster.fit(X_train, y_train)
    booster_pred = booster.predict(X_test)
    results['GradientBoosting'] = {'model': booster, 'predictions': booster_pred, **_score(booster_pred)}

    # --- Model 3: LSTM ---
    print("   - Melatih LSTM...")
    feature_scaler = MinMaxScaler(feature_range=(0, 1))
    target_scaler = MinMaxScaler(feature_range=(0, 1))

    train_seq = _to_sequences(feature_scaler.fit_transform(X_train))
    train_target = target_scaler.fit_transform(y_train.values.reshape(-1, 1))
    val_seq = _to_sequences(feature_scaler.transform(X_val))
    val_target = target_scaler.transform(y_val.values.reshape(-1, 1))
    test_seq = _to_sequences(feature_scaler.transform(X_test))

    network = Sequential([
        LSTM(50, activation='relu', input_shape=(train_seq.shape[1], train_seq.shape[2])),
        Dense(1),
    ])
    network.compile(optimizer='adam', loss='mean_squared_error')
    network.fit(
        train_seq,
        train_target,
        epochs=50,
        batch_size=32,
        validation_data=(val_seq, val_target),
        verbose=0,
        shuffle=False,
    )

    # Map the scaled network output back to the target's original units.
    network_pred = target_scaler.inverse_transform(network.predict(test_seq))
    results['LSTM'] = {'model': network, 'predictions': network_pred.flatten(), **_score(network_pred)}
    return results

# Name of the column that holds the measured values in the plotting frame.
_ACTUAL_COLUMN = 'Aktual (kWh)'


def _save_scatter_panels(frame, title, file_path):
    """Save one actual-vs-predicted scatter panel per model column of *frame*."""
    palette = ['green', 'red', 'orange']
    actual = frame[_ACTUAL_COLUMN]
    model_names = [col for col in frame.columns if col != _ACTUAL_COLUMN]
    # Keep the three-panel layout; widen only when more models are given.
    n_cols = max(3, len(model_names))
    diagonal = [actual.min(), actual.max()]

    plt.figure(figsize=(20, 6))
    for position, model_name in enumerate(model_names):
        predicted = frame[model_name]
        mae_kwh = mean_absolute_error(actual, predicted)
        rmse_kwh = np.sqrt(mean_squared_error(actual, predicted))
        r2 = r2_score(actual, predicted)
        plt.subplot(1, n_cols, position + 1)
        plt.scatter(actual, predicted, alpha=0.6, edgecolors='k', color=palette[position % len(palette)])
        # Dashed reference line where prediction equals the actual value.
        plt.plot(diagonal, diagonal, '--r', linewidth=2)
        plt.title(f'{model_name}\nR2: {r2:.2f} | RMSE: {rmse_kwh:.2f} | MAE: {mae_kwh:.2f} kWh')
        plt.xlabel('Nilai Aktual (kWh)')
        plt.ylabel('Nilai Prediksi (kWh)')
        plt.grid(True)
    plt.suptitle(title, fontsize=16)
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.savefig(file_path)
    plt.close()


def _save_time_points(frame, title, file_path):
    """Save a marker-only plot of actual and predicted values against the index."""
    known_styles = {
        'RandomForest': ('Prediksi RF', 'green'),
        'GradientBoosting': ('Prediksi GB', 'red'),
        'LSTM': ('Prediksi LSTM', 'orange'),
    }
    fallback_colors = ['green', 'red', 'orange']
    model_names = [col for col in frame.columns if col != _ACTUAL_COLUMN]

    plt.figure(figsize=(20, 8))
    plt.plot(frame.index, frame[_ACTUAL_COLUMN], label='Nilai Aktual', color='blue',
             marker='o', linestyle='None', markersize=5, alpha=0.8)
    for position, model_name in enumerate(model_names):
        label, color = known_styles.get(
            model_name,
            (f'Prediksi {model_name}', fallback_colors[position % len(fallback_colors)]),
        )
        plt.plot(frame.index, frame[model_name], label=label, color=color,
                 marker='x', linestyle='None', markersize=4, alpha=0.8)
    plt.title(title, fontsize=16)
    plt.xlabel('Waktu')
    plt.ylabel('Konsumsi Energi (kWh)')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(file_path)
    plt.close()


def create_prediction_plots(y_test, predictions, plot_suffix, output_dir, zoom_rows=500):
    """Create and save scatter and time plots of predictions vs. actual values.

    Four PNG files are written to ``output_dir``: a scatter plot and a time
    plot over the whole test set, and "zoom" variants over the last
    ``zoom_rows`` rows. Values are divided by 1000 before plotting and the
    axes are labelled kWh.

    The zoom subset is taken after sorting by index, so it really contains
    the most recent rows. Previously it was taken from the unsorted frame,
    which for a shuffled test split is an arbitrary slice. The zoom titles
    now state the actual number of rows requested.

    Args:
        y_test: Actual target values of the test set, indexed like the source
            data (a pandas Series in the visible callers).
        predictions: Mapping of model name to an array of predictions in the
            same row order as ``y_test``. Any set of model names is accepted.
        plot_suffix: Text appended to the plot titles and file names.
        output_dir: Existing folder that receives the PNG files.
        zoom_rows: Number of trailing rows shown in the zoom plots.

    Returns:
        None. Nothing is written when ``y_test`` is empty.
    """
    if len(y_test) == 0:
        print("   - Tidak ada data uji untuk diplot.")
        return

    # --- Convert everything to kWh and collect it in one frame ---
    plot_df_full = pd.DataFrame({_ACTUAL_COLUMN: y_test / 1000})
    for model_name, pred in predictions.items():
        # Assign positionally: predictions carry no index of their own.
        plot_df_full[model_name] = np.asarray(pred) / 1000

    # --- Scatter plot (whole test set) ---
    _save_scatter_panels(
        plot_df_full,
        f'Scatter Plot (Semua Data Test) - {plot_suffix}',
        os.path.join(output_dir, f'scatter_plot_{plot_suffix}.png'),
    )

    # Sort before taking the tail so "last rows" follows index order; a
    # stable sort keeps rows with equal index values in their given order.
    plot_df_full = plot_df_full.sort_index(kind='mergesort')
    plot_df_zoom = plot_df_full.tail(zoom_rows)

    # --- Scatter plot zoom (last rows) ---
    if not plot_df_zoom.empty:
        _save_scatter_panels(
            plot_df_zoom,
            f'Scatter Plot Zoom ({zoom_rows} Data Terakhir) - {plot_suffix}',
            os.path.join(output_dir, f'scatter_plot_zoom_{plot_suffix}.png'),
        )

    # --- Time plot (whole test set) ---
    _save_time_points(
        plot_df_full,
        f'Grafik Waktu: Prediksi vs Aktual - {plot_suffix}\n(Menampilkan titik data dari Test Set yang acak)',
        os.path.join(output_dir, f'time_series_plot_{plot_suffix}.png'),
    )

    # --- Time plot zoom (last rows) ---
    if not plot_df_zoom.empty:
        _save_time_points(
            plot_df_zoom,
            f'Grafik Waktu Zoom ({zoom_rows} Data Terakhir) - {plot_suffix}',
            os.path.join(output_dir, f'time_series_plot_zoom_{plot_suffix}.png'),
        )


def create_combined_heatmap(performance_data, title_suffix, output_dir):
    """Create and save a three-panel heatmap (R2, RMSE, MAE) of model performance.

    Args:
        performance_data: Records convertible to a DataFrame with the columns
            'Gedung', 'Perangkat', 'Model', 'MAE', 'RMSE' and 'R2'.
        title_suffix: Text appended to the figure title and the file name.
        output_dir: Folder that receives ``heatmap_<title_suffix>.png``.

    Returns:
        None. Nothing is drawn when ``performance_data`` is empty or when the
        pivot tables cannot be built; a message is printed instead.
    """
    if not performance_data:
        print("Tidak ada data kinerja untuk membuat heatmap.")
        return

    df = pd.DataFrame(performance_data)
    df['Label Perangkat'] = df['Gedung'] + ' - ' + df['Perangkat']

    # One pivot per metric: models as rows, "building - device" labels as columns.
    try:
        df.sort_values(by=['Gedung', 'Label Perangkat'], inplace=True)
        pivots = {
            metric: df.pivot_table(index='Model', columns='Label Perangkat', values=metric)
            for metric in ('MAE', 'RMSE', 'R2')
        }
    except Exception as err:
        print(f"Error saat membuat pivot table untuk {title_suffix}: {err}\nData: {df}")
        return

    # Widen the figure with the number of device labels so cells stay readable.
    device_count = len(df['Label Perangkat'].unique())
    fig, axes = plt.subplots(3, 1, figsize=(max(18, device_count * 1.5), 21))
    fig.suptitle(f'Heatmap Kinerja Model - {title_suffix}', fontsize=20)

    # (metric, colormap, panel title, x label, extra x-label options) from top
    # to bottom. The reversed colormap is used for the two error metrics.
    panels = [
        ('R2', 'viridis', 'R2 Score - Lebih Tinggi Lebih Baik', '', {}),
        ('RMSE', 'viridis_r', 'RMSE (kWh) - Lebih Rendah Lebih Baik', '', {}),
        ('MAE', 'viridis_r', 'MAE (kWh) - Lebih Rendah Lebih Baik',
         'Gedung - Perangkat / Lokasi', {'fontsize': 12}),
    ]
    for ax, (metric, cmap, heading, x_label, x_label_options) in zip(axes, panels):
        sns.heatmap(pivots[metric], annot=True, fmt=".2f", cmap=cmap, ax=ax, linewidths=.5)
        ax.set_title(heading, fontsize=16)
        ax.set_xlabel(x_label, **x_label_options)
        ax.set_ylabel('Model', fontsize=12)
        ax.tick_params(axis='x', rotation=45)

    plt.tight_layout(rect=[0, 0.03, 1, 0.97])
    heatmap_path = os.path.join(output_dir, f'heatmap_{title_suffix}.png')
    plt.savefig(heatmap_path, bbox_inches='tight')
    plt.close()
    print(f"\nHeatmap gabungan disimpan di: {heatmap_path}")

print("✅ Fungsi-fungsi pembantu berhasil didefinisikan.")

# ==============================================================================
# @title 5. Memuat, Membersihkan, dan Menggabungkan Data
# ==============================================================================
# Sel ini akan memuat semua data, membersihkannya, dan menggabungkannya
# berdasarkan jenis perangkat di setiap gedung.

# Maps "<building>_<device_type>" to the list of raw DataFrames read for that
# group; each list is replaced by one concatenated DataFrame further below.
grouped_data = {}
# One entry per CSV that has positive consumption: label and mean value / 1000.
consumption_ranking = []

print("Memulai proses pemuatan dan penggabungan data...")
for root, dirs, files in os.walk(SOURCE_DATA_DIR):
    # os.walk yields entries in arbitrary, filesystem-dependent order. Sorting
    # both lists makes the traversal (and thus the row order of the
    # concatenated groups and the seeded splits built on it) reproducible.
    dirs.sort()
    for file in sorted(files):
        if not file.endswith('.csv'):
            continue
        file_path = os.path.join(root, file)
        try:
            df = pd.read_csv(file_path, index_col='id_time', parse_dates=True)

            # Skip files that carry no usable target values at all.
            if TARGET_VARIABLE not in df.columns or df[TARGET_VARIABLE].isnull().all():
                continue

            # First folder below SOURCE_DATA_DIR is the building; the folder
            # that contains the CSV identifies the device.
            path_parts = os.path.relpath(root, SOURCE_DATA_DIR).split(os.sep)
            building = path_parts[0]

            # The ranking uses only positive, non-missing consumption values.
            df_cleaned = df[[TARGET_VARIABLE]].dropna()
            df_cleaned = df_cleaned[df_cleaned[TARGET_VARIABLE] > 0]
            if not df_cleaned.empty:
                device_label = "_".join(path_parts[1:])
                avg_consumption = df_cleaned[TARGET_VARIABLE].mean()
                consumption_ranking.append({
                    'label': f"{building} - {device_label}",
                    'avg_kwh': avg_consumption / 1000
                })

            # Device type is the part of the folder name before the first '_',
            # lower-cased, so e.g. several "*_SDP" folders share one group.
            device_type = path_parts[-1].split('_')[0].lower()
            group_key = f"{building}_{device_type}"

            # The raw frame (not df_cleaned) is stored because the feature
            # columns are still needed; the training loop filters rows again.
            grouped_data.setdefault(group_key, []).append(df)

        except Exception as e:
            # Best effort: report the failing file and continue with the rest.
            print(f"   - Gagal memproses {file_path}: {e}")

# Collapse every group's list of frames into one frame, keeping the time index.
for key, df_list in grouped_data.items():
    grouped_data[key] = pd.concat(df_list, ignore_index=False)
    print(f"✅ Data untuk '{key}' berhasil digabungkan, total {len(grouped_data[key])} baris.")

print("\n--- Peringkat Konsumsi Energi Rata-Rata Terbesar ---")
consumption_ranking.sort(key=lambda x: x['avg_kwh'], reverse=True)
for rank, item in enumerate(consumption_ranking, start=1):
    print(f"{rank}. {item['label']}: {item['avg_kwh']:.2f} kWh")

# ==============================================================================
# @title 6. Proses Utama: Melatih Model per Jenis Perangkat
# ==============================================================================
# Sel ini akan melakukan loop melalui data yang sudah digabungkan,
# melatih model, dan menyimpan hasilnya.

# How often each model had the lowest MAE across the trained groups.
best_model_counter = Counter()
# One record per (group, model) with mae/rmse divided by 1000 and the r2 score.
all_model_stats = []
total_models_trained = 0
# Per building: test targets and per-model predictions of all its groups,
# collected for the combined per-building plots.
building_predictions_tracker = {}


for group_name, df_group in grouped_data.items():
    print(f"\n{'='*50}\nMemproses Grup: {group_name.upper()}\n{'='*50}")

    # Group keys are built as "<building>_<device_type>" and the device type
    # never contains '_', so split once from the right. Splitting on every
    # '_' would cut a building name such as "gedung_a" into pieces and yield
    # a wrong output folder and a wrong building key.
    group_parts = group_name.rsplit('_', 1)
    building_name = group_parts[0]

    existing_cols = [col for col in RELEVANT_COLUMNS if col in df_group.columns]
    df_processed = df_group.reindex(columns=existing_cols).copy()
    df_processed.dropna(subset=[TARGET_VARIABLE], inplace=True)

    # Lag features: the target value 1, 2 and 3 rows earlier.
    # NOTE(review): the shift follows row order of the concatenated group, so
    # where two source files meet, the lags come from the previous file.
    # Confirm this is acceptable.
    for lag in range(1, 4):
        df_processed[f'Konsumsi_Energi_Lag_{lag}'] = df_processed[TARGET_VARIABLE].shift(lag)

    df_processed.dropna(inplace=True)
    df_final = df_processed[df_processed[TARGET_VARIABLE] > 0].copy()

    if len(df_final) < MINIMUM_ROWS:
        print(f"   - ⚠️ PERINGATAN: Data tidak cukup ({len(df_final)} baris). Melewati pelatihan untuk grup ini.")
        continue

    print("   - Menghitung matriks korelasi...")
    correlation_matrix = df_final.corr()

    output_dir = os.path.join(RESULTS_DIR, *group_parts)
    os.makedirs(output_dir, exist_ok=True)

    plt.figure(figsize=(22, 18))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5)
    plt.title(f'Correlation Matrix - {group_name}', fontsize=16)
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, f'correlation_matrix_{group_name}.png'))
    plt.close()
    print(f"   - Heatmap korelasi disimpan di: {output_dir}")

    # Keep only features whose absolute correlation with the target is >= 0.4.
    # NOTE(review): the correlation is computed on all rows, i.e. before the
    # train/validation/test split below.
    correlations = correlation_matrix[TARGET_VARIABLE].abs()
    selected_features = correlations[correlations >= 0.4].index.tolist()
    features_for_model = [f for f in selected_features if f != TARGET_VARIABLE]

    print(f"   - Fitur terpilih dengan korelasi >= 0.4: {features_for_model}")
    if not features_for_model:
        print("   - Tidak ada fitur yang memenuhi ambang korelasi.")
        continue

    X = df_final[features_for_model]
    y = df_final[TARGET_VARIABLE]

    # Random 70 / 15 / 15 split into train, validation and test.
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

    if len(X_train) == 0 or len(X_test) == 0:
        continue

    print(f"   - Ukuran Data: Latih={len(X_train)}, Validasi={len(X_val)}, Uji={len(X_test)} (Acak)")
    model_evaluations = train_and_evaluate_models(X_train, y_train, X_val, y_val, X_test, y_test)

    device_predictions = {name: data['predictions'] for name, data in model_evaluations.items()}
    create_prediction_plots(y_test, device_predictions, group_name, output_dir)
    print(f"   - Plot prediksi untuk grup '{group_name}' disimpan.")

    # Collect this group's test results under its building. Model entries are
    # created on demand, so any set of model names can be tracked.
    building_entry = building_predictions_tracker.setdefault(
        building_name, {'y_true': [], 'preds': {}}
    )
    building_entry['y_true'].append(y_test)
    for model_name, preds in device_predictions.items():
        building_entry['preds'].setdefault(model_name, []).append(preds)

    # The best model of the group is the one with the lowest MAE.
    best_model_name, best_model_mae = '', float('inf')
    for model_name, eval_data in model_evaluations.items():
        if eval_data['mae'] < best_model_mae:
            best_model_mae = eval_data['mae']
            best_model_name = model_name

        all_model_stats.append({
            'group': group_name,
            'model': model_name,
            'mae': eval_data['mae'] / 1000,
            'rmse': eval_data['rmse'] / 1000,
            'r2': eval_data['r2']
        })

    best_model_counter[best_model_name] += 1
    total_models_trained += 1
    print(f"   ==> Model terbaik untuk grup ini adalah {best_model_name} (berdasarkan MAE).")

# ==============================================================================
# @title 7. Ringkasan dan Analisis Final
# ==============================================================================
# Sel ini akan menampilkan ringkasan model terbaik dan statistik evaluasi.

if total_models_trained == 0:
    print("\nTidak ada model yang dilatih, ringkasan tidak dapat dibuat.")
else:
    # One row per (group, model) with the metrics gathered during training.
    stats_df = pd.DataFrame(all_model_stats)

    # --- 1. How often each model was the best one ---
    print(f"\n{'='*50}\nRingkasan Model Terbaik (dari {total_models_trained} grup perangkat)\n{'='*50}")
    for winner_name, win_count in best_model_counter.most_common():
        percentage = (win_count / total_models_trained) * 100
        print(f"🏆 {winner_name}: Model terbaik sebanyak {win_count}/{total_models_trained} kali ({percentage:.1f}%)")

    # --- 2. Metrics averaged over every trained model ---
    print(f"\n{'='*50}\nStatistik Evaluasi Keseluruhan (Rata-rata dari semua model)\n{'='*50}")
    overall_avg = stats_df[['r2', 'rmse', 'mae']].mean()
    print(f"Rata-rata R2 Score : {overall_avg['r2']:.2f}")
    print(f"Rata-rata RMSE      : {overall_avg['rmse']:.2f} kWh")
    print(f"Rata-rata MAE       : {overall_avg['mae']:.2f} kWh")

    # --- 3. Metrics averaged per model type ---
    print(f"\n{'='*50}\nRata-Rata Metrik per Model\n{'='*50}")
    per_model_avg = stats_df.groupby('model')[['r2', 'rmse', 'mae']].mean()
    print(per_model_avg.round(2))

    # --- 4. Per-group detail; the lowest-MAE model gets a trophy ---
    print(f"\n{'='*50}\nDetail Metrik Evaluasi per Grup Perangkat\n{'='*50}")
    for group, group_df in stats_df.groupby('group'):
        print(f"\n--- Grup: {group.upper()} ---")
        winning_model = group_df.loc[group_df['mae'].idxmin(), 'model']
        for row in group_df.itertuples(index=False):
            is_best = "🏆" if row.model == winning_model else ""
            print(f"  - Model: {row.model:<17} | R2: {row.r2:.2f} | RMSE: {row.rmse:.2f} kWh | MAE: {row.mae:.2f} kWh {is_best}")

    # --- 5. Combined plots per building across all of its device groups ---
    print(f"\n{'='*50}\nMembuat Plot Gabungan per Gedung\n{'='*50}")
    for building_name, tracked in building_predictions_tracker.items():
        y_true_combined = pd.concat(tracked['y_true'])
        preds_combined = {
            model_name: np.concatenate(chunks)
            for model_name, chunks in tracked['preds'].items()
        }

        building_output_dir = os.path.join(RESULTS_DIR, building_name)
        os.makedirs(building_output_dir, exist_ok=True)

        create_prediction_plots(
            y_true_combined,
            preds_combined,
            f"Gabungan_{building_name.upper()}",
            building_output_dir,
        )
        print(f"✅ Plot prediksi gabungan untuk gedung '{building_name}' disimpan di: {building_output_dir}")

print(f"\n\n🏁 Proses Selesai. Semua hasil telah disimpan di folder '{RESULTS_DIR}'.")

✅ Library berhasil diimpor.
📁 Folder sumber data diatur ke: 'sumber_data'
📁 Folder hasil akan disimpan di: 'hasil_model_aggregate_pergedung'
📊 Batas minimum data untuk pelatihan: 500 baris
✅ Sel ini siap.
Pastikan Anda telah mengunggah data Anda ke dalam folder 'sumber_data'.
✅ Fungsi-fungsi pembantu berhasil didefinisikan.
Memulai proses pemuatan dan penggabungan data...
✅ Data untuk 'opmc_sdp' berhasil digabungkan, total 52674 baris.
✅ Data untuk 'opmc_ahu' berhasil digabungkan, total 43893 baris.
✅ Data untuk 'opmc_lift' berhasil digabungkan, total 1121 baris.
✅ Data untuk 'witel_ahu' berhasil digabungkan, total 96391 baris.
✅ Data untuk 'witel_sdp' berhasil digabungkan, total 96598 baris.
✅ Data untuk 'witel_lift' berhasil digabungkan, total 17554 baris.
✅ Data untuk 'witel_chiller' berhasil digabungkan, total 1121 baris.

--- Peringkat Konsumsi Energi Rata-Rata Terbesar ---
1. opmc - LIFT: 326.63 kWh
2. witel - CHILLER: 326.63 kWh
3. witel - Lantai7_SDP: 57.95 kWh
4. witel - Lanta

In [7]:
# ==============================================================================
# @title 1. Instalasi dan Impor Library
# ==============================================================================
# Jalankan sel ini terlebih dahulu untuk mengimpor semua library yang dibutuhkan.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.preprocessing import MinMaxScaler
import os
import joblib
from collections import Counter
import warnings

warnings.filterwarnings('ignore')
print("✅ Library berhasil diimpor.")

# ==============================================================================
# @title 2. Konfigurasi Utama
# ==============================================================================
# Sel ini mendefinisikan variabel-variabel penting seperti lokasi folder
# dan daftar kolom yang akan digunakan dalam model.

# Folder holding the uploaded CSV data, and folder receiving every result.
SOURCE_DATA_DIR = 'sumber_data'
RESULTS_DIR = 'hasil_model_aggregate_pergedung_revisikorelasimatrix'

# Create both folders up front so later cells can read from / write to them.
os.makedirs(RESULTS_DIR, exist_ok=True)
os.makedirs(SOURCE_DATA_DIR, exist_ok=True)

# Column that every model predicts.
TARGET_VARIABLE = 'Konsumsi Energi'

# Columns kept from each CSV: the target followed by the weather features
# ('Apparent Temperature' is deliberately left out).
RELEVANT_COLUMNS = [
    TARGET_VARIABLE,
    'Temperature',
    'Showers',
    'Cloud Cover',
    'Weather Code',
    'Relative Humidity',
    'Dew Point',
    'Precipitation',
    'Pressure MSL',
    'Surface Pressure',
    'Evapotranspiration',
    'Vapour Pressure Deficit',
    'Wind Speed',
    'Wind Direction',
    'Wind Gusts',
    'Soil Temperature',
    'Sunshine Duration',
    'UV Index',
    'Direct Radiation',
]

# Minimum number of usable rows a group needs before any model is trained.
# Lowered to 500 for testing; the earlier setting was 3000.
MINIMUM_ROWS = 500

print(f"📁 Folder sumber data diatur ke: '{SOURCE_DATA_DIR}'")
print(f"📁 Folder hasil akan disimpan di: '{RESULTS_DIR}'")
print(f"📊 Batas minimum data untuk pelatihan: {MINIMUM_ROWS} baris")

# ==============================================================================
# @title 3. Persiapan Folder dan Unggah Data
# ==============================================================================
# PENTING: Sebelum menjalankan sel-sel berikutnya, unggah data Anda.
#
# 1. Di panel file sebelah kiri Google Colab, Anda akan melihat folder 'sumber_data'.
# 2. Klik kanan pada folder 'sumber_data' tersebut dan pilih 'Upload'.
# 3. Unggah folder 'witel' dan 'opmc' Anda yang berisi semua data CSV
#    ke dalam folder 'sumber_data'.
#
# Setelah selesai, Anda bisa melanjutkan menjalankan sel-sel berikutnya.

print("✅ Sel ini siap.")
print(f"Pastikan Anda telah mengunggah data Anda ke dalam folder '{SOURCE_DATA_DIR}'.")

# ==============================================================================
# @title 4. Definisi Fungsi-Fungsi Pembantu
# ==============================================================================
# Sel ini berisi fungsi-fungsi utama untuk melatih model dan membuat visualisasi.
# Jalankan sel ini untuk mendefinisikan fungsi agar bisa digunakan nanti.

def train_and_evaluate_models(X_train, y_train, X_val, y_val, X_test, y_test):
    """Train the three regressors (RF, GB, LSTM) and score each on the test split.

    The two tree ensembles are fitted on the raw features. The LSTM is fitted
    on min-max scaled features and target (both scalers are fitted on the
    training split only), with each row presented as a sequence of length
    one; the validation split is handed to Keras as ``validation_data``.

    Args:
        X_train, X_val, X_test: Feature tables of the three splits.
        y_train, y_val, y_test: Matching targets. ``y_train`` and ``y_val``
            must expose ``.values`` (e.g. pandas Series).

    Returns:
        dict: Keyed by ``'RandomForest'``, ``'GradientBoosting'`` and
        ``'LSTM'``. Each value holds ``'model'``, ``'predictions'`` (test-set
        predictions in the target's original units, flattened for the LSTM),
        ``'mae'``, ``'rmse'`` and ``'r2'`` computed against ``y_test``.
    """

    def _score(y_pred):
        # Same three metrics for every model, always against the test target.
        return {
            'mae': mean_absolute_error(y_test, y_pred),
            'rmse': np.sqrt(mean_squared_error(y_test, y_pred)),
            'r2': r2_score(y_test, y_pred),
        }

    def _to_sequences(matrix):
        # (rows, features) -> (rows, 1 timestep, features), as the LSTM expects.
        return matrix[:, np.newaxis, :]

    results = {}

    # --- Model 1: Random Forest Regressor ---
    print("   - Melatih Random Forest...")
    forest = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
    forest.fit(X_train, y_train)
    forest_pred = forest.predict(X_test)
    results['RandomForest'] = {'model': forest, 'predictions': forest_pred, **_score(forest_pred)}

    # --- Model 2: Gradient Boosting Regressor ---
    print("   - Melatih Gradient Boosting...")
    booster = GradientBoostingRegressor(n_estimators=100, random_state=42)
    booster.fit(X_train, y_train)
    booster_pred = booster.predict(X_test)
    results['GradientBoosting'] = {'model': booster, 'predictions': booster_pred, **_score(booster_pred)}

    # --- Model 3: LSTM ---
    print("   - Melatih LSTM...")
    feature_scaler = MinMaxScaler(feature_range=(0, 1))
    target_scaler = MinMaxScaler(feature_range=(0, 1))

    train_seq = _to_sequences(feature_scaler.fit_transform(X_train))
    train_target = target_scaler.fit_transform(y_train.values.reshape(-1, 1))
    val_seq = _to_sequences(feature_scaler.transform(X_val))
    val_target = target_scaler.transform(y_val.values.reshape(-1, 1))
    test_seq = _to_sequences(feature_scaler.transform(X_test))

    network = Sequential([
        LSTM(50, activation='relu', input_shape=(train_seq.shape[1], train_seq.shape[2])),
        Dense(1),
    ])
    network.compile(optimizer='adam', loss='mean_squared_error')
    network.fit(
        train_seq,
        train_target,
        epochs=50,
        batch_size=32,
        validation_data=(val_seq, val_target),
        verbose=0,
        shuffle=False,
    )

    # Map the scaled network output back to the target's original units.
    network_pred = target_scaler.inverse_transform(network.predict(test_seq))
    results['LSTM'] = {'model': network, 'predictions': network_pred.flatten(), **_score(network_pred)}
    return results

# Name of the column that holds the measured values in the plotting frame.
_ACTUAL_COLUMN = 'Aktual (kWh)'


def _save_scatter_panels(frame, title, file_path):
    """Save one actual-vs-predicted scatter panel per model column of *frame*."""
    palette = ['green', 'red', 'orange']
    actual = frame[_ACTUAL_COLUMN]
    model_names = [col for col in frame.columns if col != _ACTUAL_COLUMN]
    # Keep the three-panel layout; widen only when more models are given.
    n_cols = max(3, len(model_names))
    diagonal = [actual.min(), actual.max()]

    plt.figure(figsize=(20, 6))
    for position, model_name in enumerate(model_names):
        predicted = frame[model_name]
        mae_kwh = mean_absolute_error(actual, predicted)
        rmse_kwh = np.sqrt(mean_squared_error(actual, predicted))
        r2 = r2_score(actual, predicted)
        plt.subplot(1, n_cols, position + 1)
        plt.scatter(actual, predicted, alpha=0.6, edgecolors='k', color=palette[position % len(palette)])
        # Dashed reference line where prediction equals the actual value.
        plt.plot(diagonal, diagonal, '--r', linewidth=2)
        plt.title(f'{model_name}\nR2: {r2:.2f} | RMSE: {rmse_kwh:.2f} | MAE: {mae_kwh:.2f} kWh')
        plt.xlabel('Nilai Aktual (kWh)')
        plt.ylabel('Nilai Prediksi (kWh)')
        plt.grid(True)
    plt.suptitle(title, fontsize=16)
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.savefig(file_path)
    plt.close()


def _save_time_points(frame, title, file_path):
    """Save a marker-only plot of actual and predicted values against the index."""
    known_styles = {
        'RandomForest': ('Prediksi RF', 'green'),
        'GradientBoosting': ('Prediksi GB', 'red'),
        'LSTM': ('Prediksi LSTM', 'orange'),
    }
    fallback_colors = ['green', 'red', 'orange']
    model_names = [col for col in frame.columns if col != _ACTUAL_COLUMN]

    plt.figure(figsize=(20, 8))
    plt.plot(frame.index, frame[_ACTUAL_COLUMN], label='Nilai Aktual', color='blue',
             marker='o', linestyle='None', markersize=5, alpha=0.8)
    for position, model_name in enumerate(model_names):
        label, color = known_styles.get(
            model_name,
            (f'Prediksi {model_name}', fallback_colors[position % len(fallback_colors)]),
        )
        plt.plot(frame.index, frame[model_name], label=label, color=color,
                 marker='x', linestyle='None', markersize=4, alpha=0.8)
    plt.title(title, fontsize=16)
    plt.xlabel('Waktu')
    plt.ylabel('Konsumsi Energi (kWh)')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(file_path)
    plt.close()


def create_prediction_plots(y_test, predictions, plot_suffix, output_dir, zoom_rows=500):
    """Create and save scatter and time plots of predictions vs. actual values.

    Four PNG files are written to ``output_dir``: a scatter plot and a time
    plot over the whole test set, and "zoom" variants over the last
    ``zoom_rows`` rows. Values are divided by 1000 before plotting and the
    axes are labelled kWh.

    The zoom subset is taken after sorting by index, so it really contains
    the most recent rows. Previously it was taken from the unsorted frame,
    which for a shuffled test split is an arbitrary slice.

    Args:
        y_test: Actual target values of the test set, indexed like the source
            data (a pandas Series in the visible callers).
        predictions: Mapping of model name to an array of predictions in the
            same row order as ``y_test``. Any set of model names is accepted.
        plot_suffix: Text appended to the plot titles and file names.
        output_dir: Existing folder that receives the PNG files.
        zoom_rows: Number of trailing rows shown in the zoom plots.

    Returns:
        None. Nothing is written when ``y_test`` is empty.
    """
    if len(y_test) == 0:
        print("   - Tidak ada data uji untuk diplot.")
        return

    # --- Convert everything to kWh and collect it in one frame ---
    plot_df_full = pd.DataFrame({_ACTUAL_COLUMN: y_test / 1000})
    for model_name, pred in predictions.items():
        # Assign positionally: predictions carry no index of their own.
        plot_df_full[model_name] = np.asarray(pred) / 1000

    # --- Scatter plot (whole test set) ---
    _save_scatter_panels(
        plot_df_full,
        f'Scatter Plot (Semua Data Test) - {plot_suffix}',
        os.path.join(output_dir, f'scatter_plot_{plot_suffix}.png'),
    )

    # Sort before taking the tail so "last rows" follows index order; a
    # stable sort keeps rows with equal index values in their given order.
    plot_df_full = plot_df_full.sort_index(kind='mergesort')
    plot_df_zoom = plot_df_full.tail(zoom_rows)

    # --- Scatter plot zoom (last rows) ---
    if not plot_df_zoom.empty:
        _save_scatter_panels(
            plot_df_zoom,
            f'Scatter Plot Zoom ({zoom_rows} Data Terakhir) - {plot_suffix}',
            os.path.join(output_dir, f'scatter_plot_zoom_{plot_suffix}.png'),
        )

    # --- Time plot (whole test set) ---
    _save_time_points(
        plot_df_full,
        f'Grafik Waktu: Prediksi vs Aktual - {plot_suffix}\n(Menampilkan titik data dari Test Set yang acak)',
        os.path.join(output_dir, f'time_series_plot_{plot_suffix}.png'),
    )

    # --- Time plot zoom (last rows) ---
    if not plot_df_zoom.empty:
        _save_time_points(
            plot_df_zoom,
            f'Grafik Waktu Zoom ({zoom_rows} Data Terakhir) - {plot_suffix}',
            os.path.join(output_dir, f'time_series_plot_zoom_{plot_suffix}.png'),
        )


def create_combined_heatmap(performance_data, title_suffix, output_dir):
    """Draw the R2 / RMSE / MAE scores of every model as three stacked heatmaps.

    Args:
        performance_data: Records convertible to a DataFrame that carry the
            keys 'Gedung', 'Perangkat', 'Model', 'MAE', 'RMSE' and 'R2'.
        title_suffix: Text appended to the figure title and embedded in the
            output file name ``heatmap_<title_suffix>.png``.
        output_dir: Directory the PNG is written to.

    Returns:
        None. A message is printed and the function returns early when
        ``performance_data`` is falsy or when building a pivot table raises.
    """
    if not performance_data:
        print("Tidak ada data kinerja untuk membuat heatmap.")
        return

    stats = pd.DataFrame(performance_data)
    stats['Label Perangkat'] = stats['Gedung'] + ' - ' + stats['Perangkat']

    try:
        stats.sort_values(by=['Gedung', 'Label Perangkat'], inplace=True)
        # One Model x device-label table per metric.
        pivots = {
            metric: stats.pivot_table(index='Model', columns='Label Perangkat', values=metric)
            for metric in ('MAE', 'RMSE', 'R2')
        }
    except Exception as e:
        print(f"Error saat membuat pivot table untuk {title_suffix}: {e}\nData: {stats}")
        return

    # Widen the figure with the number of device columns, never below 18 inches.
    device_count = stats['Label Perangkat'].unique().size
    fig, axes = plt.subplots(3, 1, figsize=(max(18, device_count * 1.5), 21))
    fig.suptitle(f'Heatmap Kinerja Model - {title_suffix}', fontsize=20)

    # (metric, colormap, panel title, x-label, x-label kwargs), top to bottom.
    # The reversed colormap is used for the error metrics.
    panels = (
        ('R2', 'viridis', 'R2 Score - Lebih Tinggi Lebih Baik', '', {}),
        ('RMSE', 'viridis_r', 'RMSE (kWh) - Lebih Rendah Lebih Baik', '', {}),
        ('MAE', 'viridis_r', 'MAE (kWh) - Lebih Rendah Lebih Baik',
         'Gedung - Perangkat / Lokasi', {'fontsize': 12}),
    )
    for ax, (metric, cmap, panel_title, xlabel, xlabel_kwargs) in zip(axes, panels):
        sns.heatmap(pivots[metric], annot=True, fmt=".2f", cmap=cmap, ax=ax, linewidths=.5)
        ax.set_title(panel_title, fontsize=16)
        ax.set_xlabel(xlabel, **xlabel_kwargs)
        ax.set_ylabel('Model', fontsize=12)
        ax.tick_params(axis='x', rotation=45)

    # Leave room at the top for the suptitle.
    plt.tight_layout(rect=[0, 0.03, 1, 0.97])
    heatmap_path = os.path.join(output_dir, f'heatmap_{title_suffix}.png')
    plt.savefig(heatmap_path, bbox_inches='tight')
    plt.close()
    print(f"\nHeatmap gabungan disimpan di: {heatmap_path}")

print("✅ Fungsi-fungsi pembantu berhasil didefinisikan.")

# ==============================================================================
# @title 5. Load, Clean, and Combine the Data
# ==============================================================================
# This cell reads every CSV under the source folder and stacks the frames
# per device type within each building.

grouped_data = {}
consumption_ranking = []

print("Memulai proses pemuatan dan penggabungan data...")
for root, dirs, files in os.walk(SOURCE_DATA_DIR):
    for file in files:
        if not file.endswith('.csv'):
            continue

        file_path = os.path.join(root, file)
        try:
            df = pd.read_csv(file_path, index_col='id_time', parse_dates=True)

            # Skip files that have no usable target column at all.
            if TARGET_VARIABLE not in df.columns or df[TARGET_VARIABLE].isnull().all():
                continue

            # Folder layout relative to the source dir: <building>/<...>/<device folder>.
            path_parts = os.path.relpath(root, SOURCE_DATA_DIR).split(os.sep)
            building = path_parts[0]

            # The ranking only looks at non-missing, strictly positive readings.
            df_cleaned = df[[TARGET_VARIABLE]].dropna().loc[lambda frame: frame[TARGET_VARIABLE] > 0]
            if not df_cleaned.empty:
                device_label = "_".join(path_parts[1:])
                consumption_ranking.append({
                    'label': f"{building} - {device_label}",
                    'avg_kwh': df_cleaned[TARGET_VARIABLE].mean() / 1000,
                })

            # Device type is the lower-cased text before the first '_' of the
            # innermost folder name; the raw (uncleaned) frame is what gets grouped.
            device_type = path_parts[-1].split('_')[0].lower()
            group_key = f"{building}_{device_type}"
            grouped_data.setdefault(group_key, []).append(df)

        except Exception as e:
            print(f"   - Gagal memproses {file_path}: {e}")

# Replace each list of frames with one row-wise concatenation (index kept).
for key in grouped_data:
    grouped_data[key] = pd.concat(grouped_data[key], ignore_index=False)
    print(f"✅ Data untuk '{key}' berhasil digabungkan, total {len(grouped_data[key])} baris.")

print("\n--- Peringkat Konsumsi Energi Rata-Rata Terbesar ---")
consumption_ranking.sort(key=lambda entry: entry['avg_kwh'], reverse=True)
for rank, item in enumerate(consumption_ranking, start=1):
    print(f"{rank}. {item['label']}: {item['avg_kwh']:.2f} kWh")

# ==============================================================================
# @title 6. Proses Utama: Melatih Model per Jenis Perangkat
# ==============================================================================
# Sel ini akan melakukan loop melalui data yang sudah digabungkan,
# melatih model, dan menyimpan hasilnya.

best_model_counter = Counter()
all_model_stats = []
total_models_trained = 0
building_predictions_tracker = {}


for group_name, df_group in grouped_data.items():
    print(f"\n{'='*50}\nMemproses Grup: {group_name.upper()}\n{'='*50}")

    # Group keys are "<building>_<device_type>", and the device type is the
    # text before the first '_' of a folder name, so it never contains an
    # underscore itself. Splitting on the LAST underscore therefore recovers
    # the building correctly even when the building folder name contains '_'.
    building_name, device_type = group_name.rsplit('_', 1)

    existing_cols = [col for col in RELEVANT_COLUMNS if col in df_group.columns]
    df_processed = df_group.reindex(columns=existing_cols).copy()
    df_processed.dropna(subset=[TARGET_VARIABLE], inplace=True)

    # --- Feature engineering ---
    # 1. Lag feature: the target value of the previous row (always used).
    # NOTE(review): df_group is a row-wise concatenation of several CSV files,
    # so at every file boundary the "previous row" belongs to a different
    # file. Confirm whether the lag should be computed per source file instead.
    df_processed['Konsumsi_Energi_Lag_1'] = df_processed[TARGET_VARIABLE].shift(1)

    # 2. Encode the wind direction (degrees) as sine / cosine components.
    if 'Wind Direction' in df_processed.columns:
        wind_radians = np.deg2rad(df_processed['Wind Direction'])
        df_processed['Wind_Direction_sin'] = np.sin(wind_radians)
        df_processed['Wind_Direction_cos'] = np.cos(wind_radians)
        df_processed.drop('Wind Direction', axis=1, inplace=True)

    # 3. One-hot encode the weather code.
    if 'Weather Code' in df_processed.columns:
        df_processed = pd.get_dummies(df_processed, columns=['Weather Code'], prefix='WeatherCode')

    # Drop rows with any NaN (e.g. the first row, whose lag is undefined) and
    # keep only strictly positive target readings.
    df_processed.dropna(inplace=True)
    df_final = df_processed[df_processed[TARGET_VARIABLE] > 0].copy()

    if len(df_final) < MINIMUM_ROWS:
        print(f"   - ⚠️ PERINGATAN: Data tidak cukup ({len(df_final)} baris). Melewati pelatihan untuk grup ini.")
        continue

    # --- Feature selection ---
    print("\n   --- Memulai Seleksi Fitur ---")
    correlation_matrix = df_final.corr(method='spearman')

    # One folder per building, one sub-folder per device type. This matches
    # the per-building folder (RESULTS_DIR/<building>) used by the summary cell.
    output_dir = os.path.join(RESULTS_DIR, building_name, device_type)
    os.makedirs(output_dir, exist_ok=True)

    # Save the correlation heatmap; cell annotations are only drawn when the
    # matrix has fewer than 40 columns.
    show_annotations = len(correlation_matrix.columns) < 40
    plt.figure(figsize=(22, 18))
    sns.heatmap(correlation_matrix, annot=show_annotations, cmap='coolwarm', fmt=".2f")
    plt.title(f'Correlation Matrix (Spearman) - {group_name}', fontsize=16)
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, f'correlation_matrix_{group_name}.png'))
    plt.close()
    print(f"   - Heatmap korelasi disimpan di: {output_dir}")

    # STAGE 1: multicollinearity among the candidate features (target and lag
    # excluded). For every pair with |r| >= 0.8 the later column is dropped.
    # NOTE(review): this matrix uses pandas' default (Pearson) correlation
    # while the target correlations in stage 2 are Spearman - confirm intent.
    potential_features = [col for col in df_final.columns if col not in [TARGET_VARIABLE, 'Konsumsi_Energi_Lag_1']]
    feature_corr_matrix = df_final[potential_features].corr().abs()

    # Keep only the strict upper triangle so each pair is inspected once.
    upper_tri = feature_corr_matrix.where(np.triu(np.ones(feature_corr_matrix.shape), k=1).astype(bool))

    to_drop = [column for column in upper_tri.columns if (upper_tri[column] >= 0.8).any()]

    independent_features = [f for f in potential_features if f not in to_drop]
    print(f"   - Tahap 1: Ditemukan {len(to_drop)} fitur dengan multikolinearitas (dihapus): {to_drop}")
    print(f"   - Tahap 1: Terdapat {len(independent_features)} fitur independen yang tersisa.")

    # STAGE 2: correlation of the remaining features with the target, using a
    # dynamic threshold: |r| >= 0.4 first, relaxed to 0.3 if nothing qualifies.
    highly_correlated_features = []
    if independent_features:
        target_correlations = correlation_matrix.loc[independent_features, TARGET_VARIABLE]
        abs_correlations = target_correlations.abs()
        highly_correlated_features = target_correlations[abs_correlations >= 0.4].index.tolist()

        if not highly_correlated_features:
            print(f"   - Tahap 2: Tidak ditemukan fitur dengan korelasi |r| >= 0.4. Menurunkan threshold ke 0.3...")
            highly_correlated_features = target_correlations[abs_correlations >= 0.3].index.tolist()
            print(f"   - Tahap 2 (Revisi): Ditemukan {len(highly_correlated_features)} fitur dengan korelasi |r| >= 0.3: {highly_correlated_features}")
        else:
            print(f"   - Tahap 2: Ditemukan {len(highly_correlated_features)} fitur independen yang berkorelasi kuat dengan target (|r| >= 0.4): {highly_correlated_features}")

    # STAGE 3: final feature list - the lag feature plus the stage-2 survivors,
    # de-duplicated while preserving order.
    features_for_model = ['Konsumsi_Energi_Lag_1'] + highly_correlated_features
    features_for_model = list(dict.fromkeys(features_for_model))

    print(f"   - Tahap 3: Fitur final untuk model: {features_for_model}\n")

    # Persist the selected features next to the other group artefacts.
    pd.DataFrame({'fitur_terpilih': features_for_model}).to_csv(os.path.join(output_dir, f'fitur_terpilih_{group_name}.csv'), index=False)

    # Training proceeds even with a single feature; the lag feature is always
    # present, so this guard is purely defensive.
    if not features_for_model:
        print("   - Tidak ada fitur sama sekali untuk model. Melewati grup ini.")
        continue

    X = df_final[features_for_model]
    y = df_final[TARGET_VARIABLE]

    # Random 70 / 15 / 15 split into train / validation / test.
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

    if len(X_train) == 0 or len(X_test) == 0:
        continue

    print(f"   - Ukuran Data: Latih={len(X_train)}, Validasi={len(X_val)}, Uji={len(X_test)} (Acak)")
    model_evaluations = train_and_evaluate_models(X_train, y_train, X_val, y_val, X_test, y_test)

    device_predictions = {name: data['predictions'] for name, data in model_evaluations.items()}
    create_prediction_plots(y_test, device_predictions, group_name, output_dir)
    print(f"   - Plot prediksi untuk grup '{group_name}' disimpan.")

    # Collect the test targets and predictions per building for the combined
    # plots produced by the summary cell.
    if building_name not in building_predictions_tracker:
        building_predictions_tracker[building_name] = {'y_true': [], 'preds': {'RandomForest': [], 'GradientBoosting': [], 'LSTM': []}}

    building_predictions_tracker[building_name]['y_true'].append(y_test)
    for model_name, preds in device_predictions.items():
        building_predictions_tracker[building_name]['preds'][model_name].append(preds)

    # Record every model's metrics and pick the one with the lowest MAE.
    best_model_name, best_model_mae = '', float('inf')
    for model_name, eval_data in model_evaluations.items():
        if eval_data['mae'] < best_model_mae:
            best_model_mae = eval_data['mae']
            best_model_name = model_name

        # Error metrics are divided by 1000 before being reported as kWh.
        all_model_stats.append({
            'Gedung': building_name,
            'Perangkat': device_type,
            'Model': model_name,
            'MAE': eval_data['mae'] / 1000,
            'RMSE': eval_data['rmse'] / 1000,
            'R2': eval_data['r2']
        })

    best_model_counter[best_model_name] += 1
    total_models_trained += 1
    print(f"   ==> Model terbaik untuk grup ini adalah {best_model_name} (berdasarkan MAE).")

# ==============================================================================
# @title 7. Ringkasan dan Analisis Final
# ==============================================================================
# Sel ini akan menampilkan ringkasan model terbaik dan statistik evaluasi.

if total_models_trained <= 0:
    print("\nTidak ada model yang dilatih, ringkasan tidak dapat dibuat.")
else:
    rule = '=' * 50

    # Persist the full per-model performance table.
    stats_df = pd.DataFrame(all_model_stats)
    report_path = os.path.join(RESULTS_DIR, 'laporan_kinerja_semua_model.csv')
    stats_df.to_csv(report_path, index=False)
    print(f"\n✅ Laporan kinerja lengkap disimpan di: {report_path}")

    # --- 1. How often each model was the best one ---
    print(f"\n{rule}\nRingkasan Model Terbaik (dari {total_models_trained} grup perangkat)\n{rule}")
    for model, count in best_model_counter.most_common():
        percentage = (count / total_models_trained) * 100
        print(f"🏆 {model}: Model terbaik sebanyak {count}/{total_models_trained} kali ({percentage:.1f}%)")

    # --- 2. Overall evaluation statistics (mean over every model row) ---
    print(f"\n{rule}\nStatistik Evaluasi Keseluruhan (Rata-rata dari semua model)\n{rule}")
    overall_avg = stats_df[['R2', 'RMSE', 'MAE']].mean()
    print(f"Rata-rata R2 Score : {overall_avg['R2']:.2f}")
    print(f"Rata-rata RMSE     : {overall_avg['RMSE']:.2f} kWh")
    print(f"Rata-rata MAE      : {overall_avg['MAE']:.2f} kWh")

    # --- 3. Mean metrics per model ---
    print(f"\n{rule}\nRata-Rata Metrik per Model\n{rule}")
    per_model_avg = stats_df.groupby('Model')[['R2', 'RMSE', 'MAE']].mean()
    print(per_model_avg.round(2))

    # --- 4. Metric details per device group; the lowest-MAE model gets a trophy ---
    print(f"\n{rule}\nDetail Metrik Evaluasi per Grup Perangkat\n{rule}")
    for (gedung, perangkat), group_df in stats_df.groupby(['Gedung', 'Perangkat']):
        print(f"\n--- Grup: {gedung.upper()} - {perangkat.upper()} ---")
        best_label = group_df.loc[group_df['MAE'].idxmin(), 'Model']
        for row in group_df.itertuples(index=False):
            is_best = "🏆" if row.Model == best_label else ""
            print(f"  - Model: {row.Model:<17} | R2: {row.R2:.2f} | RMSE: {row.RMSE:.2f} kWh | MAE: {row.MAE:.2f} kWh {is_best}")

    # --- 5. Combined prediction plots per building ---
    print(f"\n{rule}\nMembuat Plot Gabungan per Gedung\n{rule}")
    for building_name, data in building_predictions_tracker.items():
        building_output_dir = os.path.join(RESULTS_DIR, building_name)
        os.makedirs(building_output_dir, exist_ok=True)

        # Stack the test targets and each model's predictions of every group
        # trained for this building.
        y_true_combined = pd.concat(data['y_true'])
        preds_combined = {}
        for model, preds in data['preds'].items():
            preds_combined[model] = np.concatenate(preds)

        create_prediction_plots(y_true_combined, preds_combined, f"Gabungan_{building_name.upper()}", building_output_dir)
        print(f"✅ Plot prediksi gabungan untuk gedung '{building_name}' disimpan di: {building_output_dir}")

    # --- 6. Combined performance heatmap ---
    print(f"\n{rule}\nMembuat Heatmap Kinerja Gabungan\n{rule}")
    create_combined_heatmap(all_model_stats, "Kinerja_Gabungan_Semua_Gedung", RESULTS_DIR)

print(f"\n\n🏁 Proses Selesai. Semua hasil telah disimpan di folder '{RESULTS_DIR}'.")


✅ Library berhasil diimpor.
📁 Folder sumber data diatur ke: 'sumber_data'
📁 Folder hasil akan disimpan di: 'hasil_model_aggregate_pergedung_revisikorelasimatrix'
📊 Batas minimum data untuk pelatihan: 500 baris
✅ Sel ini siap.
Pastikan Anda telah mengunggah data Anda ke dalam folder 'sumber_data'.
✅ Fungsi-fungsi pembantu berhasil didefinisikan.
Memulai proses pemuatan dan penggabungan data...
✅ Data untuk 'opmc_sdp' berhasil digabungkan, total 52674 baris.
✅ Data untuk 'opmc_ahu' berhasil digabungkan, total 43893 baris.
✅ Data untuk 'opmc_lift' berhasil digabungkan, total 1121 baris.
✅ Data untuk 'witel_ahu' berhasil digabungkan, total 96391 baris.
✅ Data untuk 'witel_sdp' berhasil digabungkan, total 96598 baris.
✅ Data untuk 'witel_lift' berhasil digabungkan, total 17554 baris.
✅ Data untuk 'witel_chiller' berhasil digabungkan, total 1121 baris.

--- Peringkat Konsumsi Energi Rata-Rata Terbesar ---
1. opmc - LIFT: 326.63 kWh
2. witel - CHILLER: 326.63 kWh
3. witel - Lantai7_SDP: 57.95

In [5]:
# ==============================================================================
# @title 1. Instalasi dan Impor Library
# ==============================================================================
# Jalankan sel ini terlebih dahulu untuk mengimpor semua library yang dibutuhkan.
# PEMBARUAN: Menambahkan library 'holidays' untuk data hari libur nasional
!pip install holidays -q
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.preprocessing import MinMaxScaler
import os
import joblib
from collections import Counter
import warnings
import holidays

warnings.filterwarnings('ignore')
print("✅ Library berhasil diimpor.")

# ==============================================================================
# @title 2. Konfigurasi Utama
# ==============================================================================
# Sel ini mendefinisikan variabel-variabel penting seperti lokasi folder
# dan daftar kolom yang akan digunakan dalam model.

# Tentukan path folder sumber data dan folder untuk menyimpan hasil
# Folder the input CSVs are read from, and folder every result is written to.
SOURCE_DATA_DIR = 'sumber_data'
# UPDATE: the results folder was renamed for this run.
RESULTS_DIR = 'hasil_model_aggregate_pergedung_revisikorelasimatrix_denganharilibur'

# Make sure the main results folder and the source folder both exist.
for _folder in (RESULTS_DIR, SOURCE_DATA_DIR):
    os.makedirs(_folder, exist_ok=True)

# Column the models are trained to predict.
TARGET_VARIABLE = 'Konsumsi Energi'

# Columns used by the pipeline: the target first, then the weather features
# ('Apparent Temperature' is intentionally left out).
RELEVANT_COLUMNS = [
    TARGET_VARIABLE,
    'Temperature',
    'Showers',
    'Cloud Cover',
    'Weather Code',
    'Relative Humidity',
    'Dew Point',
    'Precipitation',
    'Pressure MSL',
    'Surface Pressure',
    'Evapotranspiration',
    'Vapour Pressure Deficit',
    'Wind Speed',
    'Wind Direction',
    'Wind Gusts',
    'Soil Temperature',
    'Sunshine Duration',
    'UV Index',
    'Direct Radiation',
]

# Minimum number of rows a group needs before a model is trained
# (changed to 500 for testing; the previous value was 3000).
MINIMUM_ROWS = 500

print(f"📁 Folder sumber data diatur ke: '{SOURCE_DATA_DIR}'")
print(f"📁 Folder hasil akan disimpan di: '{RESULTS_DIR}'")
print(f"📊 Batas minimum data untuk pelatihan: {MINIMUM_ROWS} baris")

# ==============================================================================
# @title 3. Folder Preparation and Data Upload
# ==============================================================================
# IMPORTANT: upload your data before running the following cells.
#
# 1. In the file panel on the left of Google Colab you will see the
#    'sumber_data' folder.
# 2. Right-click that 'sumber_data' folder and choose 'Upload'.
# 3. Upload your 'witel' and 'opmc' folders, containing all the CSV data,
#    into the 'sumber_data' folder.
#
# Once that is done you can continue with the next cells.

print("✅ Sel ini siap.")
print(f"Pastikan Anda telah mengunggah data Anda ke dalam folder '{SOURCE_DATA_DIR}'.")

# ==============================================================================
# @title 4. Definisi Fungsi-Fungsi Pembantu
# ==============================================================================
# Sel ini berisi fungsi-fungsi utama untuk melatih model dan membuat visualisasi.
# Jalankan sel ini untuk mendefinisikan fungsi agar bisa digunakan nanti.

def train_and_evaluate_models(X_train, y_train, X_val, y_val, X_test, y_test):
    """Fit the Random Forest, Gradient Boosting and LSTM regressors and score each on the test split.

    Args:
        X_train, y_train: Training features and target.
        X_val, y_val: Validation split; only passed to the LSTM as
            ``validation_data``.
        X_test, y_test: Held-out split every model is scored on.

    Returns:
        dict keyed by 'RandomForest', 'GradientBoosting' and 'LSTM'. Each
        value is a dict with the fitted 'model', its test 'predictions', and
        the 'mae', 'rmse' and 'r2' scores against ``y_test``.
    """
    def _score(y_pred):
        # The three test-set metrics reported for every model.
        return {
            'mae': mean_absolute_error(y_test, y_pred),
            'rmse': np.sqrt(mean_squared_error(y_test, y_pred)),
            'r2': r2_score(y_test, y_pred),
        }

    def _as_single_step(matrix):
        # (samples, features) -> (samples, 1, features): one timestep per sample.
        return matrix.reshape((matrix.shape[0], 1, matrix.shape[1]))

    results = {}

    # --- Models 1 & 2: tree ensembles, trained on the unscaled features ---
    tree_models = (
        ("   - Melatih Random Forest...", 'RandomForest',
         RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)),
        ("   - Melatih Gradient Boosting...", 'GradientBoosting',
         GradientBoostingRegressor(n_estimators=100, random_state=42)),
    )
    for banner, key, estimator in tree_models:
        print(banner)
        estimator.fit(X_train, y_train)
        test_pred = estimator.predict(X_test)
        results[key] = {'model': estimator, 'predictions': test_pred, **_score(test_pred)}

    # --- Model 3: LSTM on min-max scaled data (scalers are fit on train only) ---
    print("   - Melatih LSTM...")
    scaler_X = MinMaxScaler(feature_range=(0, 1))
    scaler_y = MinMaxScaler(feature_range=(0, 1))

    X_train_scaled = scaler_X.fit_transform(X_train)
    y_train_scaled = scaler_y.fit_transform(y_train.values.reshape(-1, 1))
    X_val_scaled = scaler_X.transform(X_val)
    y_val_scaled = scaler_y.transform(y_val.values.reshape(-1, 1))
    X_test_scaled = scaler_X.transform(X_test)

    X_train_lstm = _as_single_step(X_train_scaled)
    X_val_lstm = _as_single_step(X_val_scaled)
    X_test_lstm = _as_single_step(X_test_scaled)

    lstm_model = Sequential([
        LSTM(50, activation='relu', input_shape=(X_train_lstm.shape[1], X_train_lstm.shape[2])),
        Dense(1),
    ])
    lstm_model.compile(optimizer='adam', loss='mean_squared_error')
    lstm_model.fit(
        X_train_lstm,
        y_train_scaled,
        epochs=50,
        batch_size=32,
        validation_data=(X_val_lstm, y_val_scaled),
        verbose=0,
        shuffle=False,
    )

    # Predict in scaled space, then map back to the original target units.
    y_pred_lstm = scaler_y.inverse_transform(lstm_model.predict(X_test_lstm))
    results['LSTM'] = {'model': lstm_model, 'predictions': y_pred_lstm.flatten(), **_score(y_pred_lstm)}
    return results

def create_prediction_plots(y_test, predictions, plot_suffix, output_dir):
    """Save scatter and time plots comparing each model's predictions with the actual values.

    Four PNG files are written to ``output_dir``: a scatter plot and a time
    plot over the whole test set, plus "zoom" versions of both restricted to
    the 500 latest rows by index.

    The zoom window is taken AFTER sorting by index. The test set comes from a
    random split, so taking ``tail(500)`` on the unsorted frame (as was done
    before) yields an arbitrary 500 rows rather than the 500 latest ones the
    plot titles promise.

    Any number of models is supported: one scatter panel is drawn per entry
    of ``predictions``, colours cycle through green / red / orange, and the
    three standard models keep their legend labels and colours.

    Args:
        y_test: Actual target values; divided by 1000 for plotting as kWh.
            The index is used as the x axis of the time plots.
        predictions: Mapping of model name to an array of predictions in the
            same row order as ``y_test``.
        plot_suffix: Text appended to plot titles and output file names.
        output_dir: Directory the PNG files are written to.

    Returns:
        None.
    """
    palette = ['green', 'red', 'orange']
    known_labels = {'RandomForest': 'Prediksi RF', 'GradientBoosting': 'Prediksi GB', 'LSTM': 'Prediksi LSTM'}
    known_colors = {'RandomForest': 'green', 'GradientBoosting': 'red', 'LSTM': 'orange'}

    # --- Convert everything to kWh ---
    y_test_kwh = y_test / 1000
    predictions_kwh = {name: pred / 1000 for name, pred in predictions.items()}
    model_names = list(predictions_kwh)

    # One frame holding the actuals and every model's predictions, row-aligned.
    plot_df_full = pd.DataFrame({'Aktual (kWh)': y_test_kwh})
    for model_name, pred_kwh in predictions_kwh.items():
        plot_df_full[model_name] = pred_kwh

    def _scatter_figure(frame, title, filename):
        # One actual-vs-predicted panel per model, with the identity line.
        actual = frame['Aktual (kWh)']
        plt.figure(figsize=(20, 6))
        for i, model_name in enumerate(model_names):
            pred = frame[model_name]
            mae_kwh = mean_absolute_error(actual, pred)
            rmse_kwh = np.sqrt(mean_squared_error(actual, pred))
            r2 = r2_score(actual, pred)
            plt.subplot(1, len(model_names), i + 1)
            plt.scatter(actual, pred, alpha=0.6, edgecolors='k', color=palette[i % len(palette)])
            plt.plot([actual.min(), actual.max()], [actual.min(), actual.max()], '--r', linewidth=2)
            plt.title(f'{model_name}\nR2: {r2:.2f} | RMSE: {rmse_kwh:.2f} | MAE: {mae_kwh:.2f} kWh')
            plt.xlabel('Nilai Aktual (kWh)')
            plt.ylabel('Nilai Prediksi (kWh)')
            plt.grid(True)
        plt.suptitle(title, fontsize=16)
        plt.tight_layout(rect=[0, 0.03, 1, 0.95])
        plt.savefig(os.path.join(output_dir, filename))
        plt.close()

    def _time_figure(frame, title, filename):
        # Marker-only plot over the index: actuals as dots, predictions as crosses.
        plt.figure(figsize=(20, 8))
        plt.plot(frame.index, frame['Aktual (kWh)'], label='Nilai Aktual', color='blue', marker='o', linestyle='None', markersize=5, alpha=0.8)
        for i, model_name in enumerate(model_names):
            plt.plot(
                frame.index,
                frame[model_name],
                label=known_labels.get(model_name, f'Prediksi {model_name}'),
                color=known_colors.get(model_name, palette[i % len(palette)]),
                marker='x',
                linestyle='None',
                markersize=4,
                alpha=0.8,
            )
        plt.title(title, fontsize=16)
        plt.xlabel('Waktu')
        plt.ylabel('Konsumsi Energi (kWh)')
        plt.legend()
        plt.grid(True)
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, filename))
        plt.close()

    # --- Scatter plot over the whole test set ---
    _scatter_figure(plot_df_full, f'Scatter Plot (Semua Data Test) - {plot_suffix}', f'scatter_plot_{plot_suffix}.png')

    # Sort by index first so that tail(500) really is the 500 latest rows.
    plot_df_full.sort_index(inplace=True)
    plot_df_zoom = plot_df_full.tail(500)

    # --- Scatter plot zoom (latest 500 rows) ---
    if not plot_df_zoom.empty:
        _scatter_figure(plot_df_zoom, f'Scatter Plot Zoom (500 Data Terakhir) - {plot_suffix}', f'scatter_plot_zoom_{plot_suffix}.png')

    # --- Time plot over the whole (randomly sampled) test set ---
    _time_figure(
        plot_df_full,
        f'Grafik Waktu: Prediksi vs Aktual - {plot_suffix}\n(Menampilkan titik data dari Test Set yang acak)',
        f'time_series_plot_{plot_suffix}.png',
    )

    # --- Time plot zoom (latest 500 rows) ---
    if not plot_df_zoom.empty:
        _time_figure(plot_df_zoom, f'Grafik Waktu Zoom (500 Data Terakhir) - {plot_suffix}', f'time_series_plot_zoom_{plot_suffix}.png')


def create_combined_heatmap(performance_data, title_suffix, output_dir):
    """Render the collected model metrics as one stacked heatmap image.

    Three heatmaps (R2, RMSE, MAE) are drawn one below the other, with the
    model names as rows and "<Gedung> - <Perangkat>" labels as columns. The
    figure is written to ``heatmap_<title_suffix>.png`` inside ``output_dir``.

    Args:
        performance_data: Records accepted by ``pd.DataFrame``; the code reads
            the columns 'Gedung', 'Perangkat', 'Model', 'MAE', 'RMSE', 'R2'.
        title_suffix: Text used in the figure title and in the file name.
        output_dir: Directory that receives the PNG file.

    Returns:
        None. When ``performance_data`` is empty, or the pivot tables cannot
        be built, a message is printed and nothing is drawn.
    """
    if not performance_data:
        print(f"Tidak ada data kinerja untuk membuat heatmap.")
        return

    label_col = 'Label Perangkat'
    df = pd.DataFrame(performance_data)
    df[label_col] = df['Gedung'] + ' - ' + df['Perangkat']

    try:
        df.sort_values(by=['Gedung', label_col], inplace=True)
        # One Model x device-label table per metric.
        pivots = {
            metric: df.pivot_table(index='Model', columns=label_col, values=metric)
            for metric in ('MAE', 'RMSE', 'R2')
        }
    except Exception as e:
        print(f"Error saat membuat pivot table untuk {title_suffix}: {e}\nData: {df}")
        return

    # Widen the figure with the number of device columns so labels stay legible.
    device_count = len(df[label_col].unique())
    fig, axes = plt.subplots(3, 1, figsize=(max(18, device_count * 1.5), 21))
    fig.suptitle(f'Heatmap Kinerja Model - {title_suffix}', fontsize=20)

    # (metric, colormap, panel title, x label, x-label kwargs), top to bottom.
    # The reversed colormap is used for the error metrics, where lower is better.
    panels = (
        ('R2', "viridis", 'R2 Score - Lebih Tinggi Lebih Baik', '', {}),
        ('RMSE', "viridis_r", 'RMSE (kWh) - Lebih Rendah Lebih Baik', '', {}),
        ('MAE', "viridis_r", 'MAE (kWh) - Lebih Rendah Lebih Baik',
         'Gedung - Perangkat / Lokasi', {'fontsize': 12}),
    )
    for ax, (metric, cmap, panel_title, x_label, x_label_kwargs) in zip(axes, panels):
        sns.heatmap(pivots[metric], annot=True, fmt=".2f", cmap=cmap, ax=ax, linewidths=.5)
        ax.set_title(panel_title, fontsize=16)
        ax.set_xlabel(x_label, **x_label_kwargs)
        ax.set_ylabel('Model', fontsize=12)
        ax.tick_params(axis='x', rotation=45)

    # Leave room at the top for the suptitle.
    plt.tight_layout(rect=[0, 0.03, 1, 0.97])
    heatmap_path = os.path.join(output_dir, f'heatmap_{title_suffix}.png')
    plt.savefig(heatmap_path, bbox_inches='tight')
    plt.close()
    print(f"\nHeatmap gabungan disimpan di: {heatmap_path}")

# Confirmation message emitted once the helper definitions above have executed.
print("✅ Fungsi-fungsi pembantu berhasil didefinisikan.")

# ==============================================================================
# @title 5. Memuat, Membersihkan, dan Menggabungkan Data
# ==============================================================================
# Sel ini akan memuat semua data, membersihkannya, dan menggabungkannya
# berdasarkan jenis perangkat di setiap gedung.

# Maps "<building>_<device type>" to the list of raw DataFrames read for that
# group; each list is replaced by a single concatenated DataFrame further below.
grouped_data = {}
# One entry per CSV that contains positive target values: a display label and
# the mean of the target column.
consumption_ranking = []

print("Memulai proses pemuatan dan penggabungan data...")
for root, dirs, files in os.walk(SOURCE_DATA_DIR):
    # os.walk yields entries in arbitrary filesystem order. Sorting the
    # sub-directories in place and the file names fixes the traversal order,
    # so the row order of every concatenated group is reproducible.
    dirs.sort()
    csv_files = sorted(name for name in files if name.endswith('.csv'))
    if not csv_files:
        continue

    # The folder layout encodes the grouping: the first path component below
    # SOURCE_DATA_DIR is the building, and the part of the last component
    # before its first underscore (lower-cased) is the device type. This is
    # identical for every file in the directory, so it is derived once here.
    path_parts = os.path.relpath(root, SOURCE_DATA_DIR).split(os.sep)
    building = path_parts[0]
    device_label = "_".join(path_parts[1:])
    device_type = path_parts[-1].split('_')[0].lower()
    group_key = f"{building}_{device_type}"

    for file in csv_files:
        file_path = os.path.join(root, file)
        try:
            df = pd.read_csv(file_path, index_col='id_time', parse_dates=True)

            # Files without any usable target value are ignored entirely.
            if TARGET_VARIABLE not in df.columns or df[TARGET_VARIABLE].isnull().all():
                continue

            # The ranking only considers strictly positive, non-missing readings.
            df_cleaned = df[[TARGET_VARIABLE]].dropna()
            df_cleaned = df_cleaned[df_cleaned[TARGET_VARIABLE] > 0]

            if not df_cleaned.empty:
                avg_consumption = df_cleaned[TARGET_VARIABLE].mean()
                consumption_ranking.append({
                    'label': f"{building} - {device_label}",
                    # Reported as kWh in the printout below; presumably the raw
                    # column is in Wh -- TODO confirm the source unit.
                    'avg_kwh': avg_consumption / 1000
                })

            # The unfiltered frame is stored; cleaning for training happens later.
            grouped_data.setdefault(group_key, []).append(df)

        except Exception as e:
            # Best effort: one unreadable file must not abort the whole load.
            print(f"   - Gagal memproses {file_path}: {e}")

# Stack the per-file frames of each group, keeping the original time index.
for key, df_list in grouped_data.items():
    grouped_data[key] = pd.concat(df_list, ignore_index=False)
    print(f"✅ Data untuk '{key}' berhasil digabungkan, total {len(grouped_data[key])} baris.")

print("\n--- Peringkat Konsumsi Energi Rata-Rata Terbesar ---")
consumption_ranking.sort(key=lambda x: x['avg_kwh'], reverse=True)
for rank, item in enumerate(consumption_ranking, start=1):
    print(f"{rank}. {item['label']}: {item['avg_kwh']:.2f} kWh")

# ==============================================================================
# @title 6. Proses Utama: Melatih Model per Jenis Perangkat
# ==============================================================================
# Sel ini akan melakukan loop melalui data yang sudah digabungkan,
# melatih model, dan menyimpan hasilnya.

# Accumulators that are read again by the summary section after the loop.
best_model_counter = Counter()       # model name -> number of groups it won (lowest MAE)
all_model_stats = []                 # one metrics record per (group, model)
total_models_trained = 0             # number of groups that were actually trained
building_predictions_tracker = {}    # building -> {'y_true': [...], 'preds': {model: [...]}}

# Collect every calendar year present in the data so that only the relevant
# Indonesian national holidays are generated.
all_years = []
for df_group in grouped_data.values():
    all_years.extend(df_group.index.year.unique())
unique_years = sorted(list(set(all_years)))
id_holidays = holidays.Indonesia(years=unique_years)
# Holiday calendar dates as midnight timestamps, built once for all groups.
holiday_dates = pd.to_datetime(list(id_holidays.keys()))
print(f"\n📅 Mengambil data hari libur nasional Indonesia untuk tahun: {unique_years}")


for group_name, df_group in grouped_data.items():
    print(f"\n{'='*50}\nMemproses Grup: {group_name.upper()}\n{'='*50}")

    # Keep only the configured columns that exist in this group.
    existing_cols = [col for col in RELEVANT_COLUMNS if col in df_group.columns]
    df_processed = df_group.reindex(columns=existing_cols).copy()
    df_processed.dropna(subset=[TARGET_VARIABLE], inplace=True)

    # --- Feature engineering ---
    # 1. Lag feature: the target value of the previous row (described by the
    #    original author as a 1-hour lag). It is always part of the model.
    df_processed['Konsumsi_Energi_Lag_1'] = df_processed[TARGET_VARIABLE].shift(1)

    # Weekend flag: Saturday (5) or Sunday (6).
    df_processed['is_weekend'] = (df_processed.index.dayofweek >= 5).astype(int)
    # Holiday flag. The comparison must be made on calendar dates: matching the
    # raw timestamps against the holiday dates would only flag rows stamped
    # exactly 00:00:00 and miss every other reading of the same holiday.
    row_dates = df_processed.index.normalize()
    if row_dates.tz is not None:
        # Holiday dates are timezone-naive; compare on local wall-clock dates.
        row_dates = row_dates.tz_localize(None)
    df_processed['isHoliday'] = row_dates.isin(holiday_dates).astype(int)
    print(f"   - Menambahkan fitur 'is_weekend' dan 'isHoliday'.")

    # 2. Encode the circular 'Wind Direction' (degrees) as sine/cosine parts.
    if 'Wind Direction' in df_processed.columns:
        wind_rad = np.deg2rad(df_processed['Wind Direction'])
        df_processed['Wind_Direction_sin'] = np.sin(wind_rad)
        df_processed['Wind_Direction_cos'] = np.cos(wind_rad)
        df_processed.drop('Wind Direction', axis=1, inplace=True)

    # 3. One-hot encode the categorical 'Weather Code'.
    if 'Weather Code' in df_processed.columns:
        df_processed = pd.get_dummies(df_processed, columns=['Weather Code'], prefix='WeatherCode')

    # Drop rows with missing values (e.g. the first row, which has no lag),
    # then keep only rows with a strictly positive target.
    df_processed.dropna(inplace=True)
    df_final = df_processed[df_processed[TARGET_VARIABLE] > 0].copy()

    if len(df_final) < MINIMUM_ROWS:
        print(f"   - ⚠️ PERINGATAN: Data tidak cukup ({len(df_final)} baris). Melewati pelatihan untuk grup ini.")
        continue

    # --- Feature selection ---
    print("\n   --- Memulai Seleksi Fitur ---")
    correlation_matrix = df_final.corr(method='spearman')

    # The group key is "<building>_<device type>" and the device type never
    # contains an underscore, so splitting at the LAST underscore recovers both
    # parts even when the building name itself contains underscores.
    building_name, device_type = group_name.rsplit('_', 1)
    output_dir = os.path.join(RESULTS_DIR, building_name, device_type)
    os.makedirs(output_dir, exist_ok=True)

    # Save the correlation heatmap; cell annotations are only readable for
    # moderately sized matrices.
    show_annotations = len(correlation_matrix.columns) < 40
    plt.figure(figsize=(22, 18))
    sns.heatmap(correlation_matrix, annot=show_annotations, cmap='coolwarm', fmt=".2f")
    plt.title(f'Correlation Matrix (Spearman) - {group_name}', fontsize=16)
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, f'correlation_matrix_{group_name}.png'))
    plt.close()
    print(f"   - Heatmap korelasi disimpan di: {output_dir}")

    # STAGE 1: multicollinearity among candidate features (target and lag
    # excluded). Uses the default (Pearson) correlation of DataFrame.corr.
    potential_features = [col for col in df_final.columns if col not in [TARGET_VARIABLE, 'Konsumsi_Energi_Lag_1']]
    feature_corr_matrix = df_final[potential_features].corr().abs()

    # Only the strict upper triangle is inspected, so of two highly correlated
    # features the one appearing later in column order is dropped.
    upper_tri = feature_corr_matrix.where(np.triu(np.ones(feature_corr_matrix.shape), k=1).astype(bool))

    to_drop = [column for column in upper_tri.columns if any(upper_tri[column] >= 0.8)]

    independent_features = [f for f in potential_features if f not in to_drop]
    print(f"   - Tahap 1: Ditemukan {len(to_drop)} fitur dengan multikolinearitas (dihapus): {to_drop}")
    print(f"   - Tahap 1: Terdapat {len(independent_features)} fitur independen yang tersisa.")

    # STAGE 2: Spearman correlation of the remaining features with the target,
    # using a dynamic threshold (|r| >= 0.4, relaxed to 0.3 if nothing passes).
    highly_correlated_features = []
    if independent_features:
        target_correlations = correlation_matrix.loc[independent_features, TARGET_VARIABLE]
        highly_correlated_features = target_correlations[
            (target_correlations >= 0.4) | (target_correlations <= -0.4)
        ].index.tolist()

        if not highly_correlated_features:
            print(f"   - Tahap 2: Tidak ditemukan fitur dengan korelasi |r| >= 0.4. Menurunkan threshold ke 0.3...")
            highly_correlated_features = target_correlations[
                (target_correlations >= 0.3) | (target_correlations <= -0.3)
            ].index.tolist()
            print(f"   - Tahap 2 (Revisi): Ditemukan {len(highly_correlated_features)} fitur dengan korelasi |r| >= 0.3: {highly_correlated_features}")
        else:
            print(f"   - Tahap 2: Ditemukan {len(highly_correlated_features)} fitur independen yang berkorelasi kuat dengan target (|r| >= 0.4): {highly_correlated_features}")

    # STAGE 3: final feature list, starting with the lag feature plus the
    # features that passed stage 2.
    final_features = ['Konsumsi_Energi_Lag_1'] + highly_correlated_features

    # The calendar flags get their own fixed 0.3 threshold. target_correlations
    # is guaranteed to exist here because membership in independent_features
    # implies that list is non-empty.
    if 'is_weekend' in independent_features:
        weekend_corr = abs(target_correlations.get('is_weekend', 0))
        if weekend_corr >= 0.3:
            final_features.append('is_weekend')
            print(f"   - Fitur 'is_weekend' ditambahkan (korelasi: {weekend_corr:.2f}).")
        else:
            print(f"   - Fitur 'is_weekend' dilewati (korelasi: {weekend_corr:.2f} < 0.3).")

    if 'isHoliday' in independent_features:
        holiday_corr = abs(target_correlations.get('isHoliday', 0))
        if holiday_corr >= 0.3:
            final_features.append('isHoliday')
            print(f"   - Fitur 'isHoliday' ditambahkan (korelasi: {holiday_corr:.2f}).")
        else:
            print(f"   - Fitur 'isHoliday' dilewati (korelasi: {holiday_corr:.2f} < 0.3).")

    # Remove duplicates while preserving order (a flag may have been added twice).
    features_for_model = list(dict.fromkeys(final_features))

    print(f"   - Tahap 3: Fitur final untuk model: {features_for_model}\n")

    # Persist the selected feature names next to the other group artefacts.
    pd.DataFrame({'fitur_terpilih': features_for_model}).to_csv(os.path.join(output_dir, f'fitur_terpilih_{group_name}.csv'), index=False)

    if not features_for_model:
        print("   - Tidak ada fitur sama sekali untuk model. Melewati grup ini.")
        continue

    X = df_final[features_for_model]
    y = df_final[TARGET_VARIABLE]

    # Random 70 / 15 / 15 split into train / validation / test.
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

    if len(X_train) == 0 or len(X_test) == 0:
        continue

    print(f"   - Ukuran Data: Latih={len(X_train)}, Validasi={len(X_val)}, Uji={len(X_test)} (Acak)")
    model_evaluations = train_and_evaluate_models(X_train, y_train, X_val, y_val, X_test, y_test)

    device_predictions = {name: data['predictions'] for name, data in model_evaluations.items()}
    create_prediction_plots(y_test, device_predictions, group_name, output_dir)
    print(f"   - Plot prediksi untuk grup '{group_name}' disimpan.")

    # Collect the test targets and predictions per building for the combined plots.
    if building_name not in building_predictions_tracker:
        building_predictions_tracker[building_name] = {'y_true': [], 'preds': {'RandomForest': [], 'GradientBoosting': [], 'LSTM': []}}

    building_predictions_tracker[building_name]['y_true'].append(y_test)
    for model_name, preds in device_predictions.items():
        building_predictions_tracker[building_name]['preds'][model_name].append(preds)

    # Pick the model with the lowest MAE and record the metrics of every model.
    best_model_name, best_model_mae = '', float('inf')
    for model_name, eval_data in model_evaluations.items():
        if eval_data['mae'] < best_model_mae:
            best_model_mae = eval_data['mae']
            best_model_name = model_name

        all_model_stats.append({
            'Gedung': building_name,
            'Perangkat': device_type,
            'Model': model_name,
            # Error metrics are scaled by 1/1000 and reported as kWh downstream;
            # presumably the raw target is in Wh -- TODO confirm the source unit.
            'MAE': eval_data['mae'] / 1000,
            'RMSE': eval_data['rmse'] / 1000,
            'R2': eval_data['r2']
        })

    best_model_counter[best_model_name] += 1
    total_models_trained += 1
    print(f"   ==> Model terbaik untuk grup ini adalah {best_model_name} (berdasarkan MAE).")

# ==============================================================================
# @title 7. Ringkasan dan Analisis Final
# ==============================================================================
# Sel ini akan menampilkan ringkasan model terbaik dan statistik evaluasi.

if total_models_trained <= 0:
    print("\nTidak ada model yang dilatih, ringkasan tidak dapat dibuat.")
else:
    rule = '=' * 50  # horizontal separator used by every section banner

    # Persist the full per-(group, model) metrics table.
    report_path = os.path.join(RESULTS_DIR, 'laporan_kinerja_semua_model.csv')
    stats_df = pd.DataFrame(all_model_stats)
    stats_df.to_csv(report_path, index=False)
    print(f"\n✅ Laporan kinerja lengkap disimpan di: {report_path}")

    # --- 1. How often each model achieved the lowest MAE ---
    print(f"\n{rule}\nRingkasan Model Terbaik (dari {total_models_trained} grup perangkat)\n{rule}")
    for model, count in best_model_counter.most_common():
        percentage = (count / total_models_trained) * 100
        print(f"🏆 {model}: Model terbaik sebanyak {count}/{total_models_trained} kali ({percentage:.1f}%)")

    # --- 2. Metrics averaged over every trained model ---
    print(f"\n{rule}\nStatistik Evaluasi Keseluruhan (Rata-rata dari semua model)\n{rule}")
    overall_avg = stats_df[['R2', 'RMSE', 'MAE']].mean()
    print(f"Rata-rata R2 Score : {overall_avg['R2']:.2f}")
    print(f"Rata-rata RMSE     : {overall_avg['RMSE']:.2f} kWh")
    print(f"Rata-rata MAE      : {overall_avg['MAE']:.2f} kWh")

    # --- 3. Metrics averaged per model type ---
    print(f"\n{rule}\nRata-Rata Metrik per Model\n{rule}")
    per_model_avg = stats_df.groupby('Model')[['R2', 'RMSE', 'MAE']].mean()
    print(per_model_avg.round(2))

    # --- 4. Per-group detail; the model with the lowest MAE gets a trophy ---
    print(f"\n{rule}\nDetail Metrik Evaluasi per Grup Perangkat\n{rule}")
    for (gedung, perangkat), group_df in stats_df.groupby(['Gedung', 'Perangkat']):
        print(f"\n--- Grup: {gedung.upper()} - {perangkat.upper()} ---")
        winner = group_df.loc[group_df['MAE'].idxmin(), 'Model']
        for _, row in group_df.iterrows():
            is_best = "🏆" if row['Model'] == winner else ""
            print(f"  - Model: {row['Model']:<17} | R2: {row['R2']:.2f} | RMSE: {row['RMSE']:.2f} kWh | MAE: {row['MAE']:.2f} kWh {is_best}")

    # --- 5. Combined prediction plots per building ---
    print(f"\n{rule}\nMembuat Plot Gabungan per Gedung\n{rule}")
    for building_name, data in building_predictions_tracker.items():
        # Stack the test targets and each model's predictions of all device
        # groups that belong to this building.
        y_true_combined = pd.concat(data['y_true'])
        preds_combined = {}
        for model, preds in data['preds'].items():
            preds_combined[model] = np.concatenate(preds)

        building_output_dir = os.path.join(RESULTS_DIR, building_name)
        os.makedirs(building_output_dir, exist_ok=True)

        create_prediction_plots(y_true_combined, preds_combined, f"Gabungan_{building_name.upper()}", building_output_dir)
        print(f"✅ Plot prediksi gabungan untuk gedung '{building_name}' disimpan di: {building_output_dir}")

    # --- 6. Combined performance heatmap across all buildings ---
    print(f"\n{rule}\nMembuat Heatmap Kinerja Gabungan\n{rule}")
    create_combined_heatmap(all_model_stats, "Kinerja_Gabungan_Semua_Gedung", RESULTS_DIR)

print(f"\n\n🏁 Proses Selesai. Semua hasil telah disimpan di folder '{RESULTS_DIR}'.")

✅ Library berhasil diimpor.
📁 Folder sumber data diatur ke: 'sumber_data'
📁 Folder hasil akan disimpan di: 'hasil_model_aggregate_pergedung_revisikorelasimatrix_denganharilibur'
📊 Batas minimum data untuk pelatihan: 500 baris
✅ Sel ini siap.
Pastikan Anda telah mengunggah data Anda ke dalam folder 'sumber_data'.
✅ Fungsi-fungsi pembantu berhasil didefinisikan.
Memulai proses pemuatan dan penggabungan data...
✅ Data untuk 'opmc_sdp' berhasil digabungkan, total 52674 baris.
✅ Data untuk 'opmc_ahu' berhasil digabungkan, total 43893 baris.
✅ Data untuk 'opmc_lift' berhasil digabungkan, total 1121 baris.
✅ Data untuk 'witel_sdp' berhasil digabungkan, total 70259 baris.
✅ Data untuk 'witel_ahu' berhasil digabungkan, total 70039 baris.
✅ Data untuk 'witel_lift' berhasil digabungkan, total 8777 baris.
✅ Data untuk 'witel_chiller' berhasil digabungkan, total 1121 baris.

--- Peringkat Konsumsi Energi Rata-Rata Terbesar ---
1. opmc - LIFT: 326.63 kWh
2. witel - CHILLER: 326.63 kWh
3. witel - Lan