In [25]:
# IMPORT & LOAD LIBRARY + DATA
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

# Load Data
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
sample_submission_df = pd.read_csv("submission.csv")
print("Data Berhasil Terload")

# Simpan ID test_df untuk submission
test_ids = test_df['ID'] 

# Menambahkan kolos is_test di masing masing data
train_df['is_test'] = 0
test_df['is_test'] = 1

# Rename Electricity Consumption di train_df
train_df.rename(columns={'electricity_consumption_gwh': 'electricity_consumption'}, inplace=True)

# MERGING & RAPIHIN DATA TRAIN DAN TEST 

# Menggabungkan train_df dan test_df
test_df['electricity_consumption'] = np.nan
all_df = pd.concat([train_df.drop(columns=['ID']), test_df.drop(columns=['ID'])], ignore_index=True)

# Menampilkan informasi DataFrame
print("\n--- Kombinasi dua data (5 Baris Pertama) ---")
print(all_df.head())
print("\n--- Info Kombinasi dua data ---")
print(all_df.info())

# Mengconvert kolom 'date' ke tipe datetime
all_df['date'] = pd.to_datetime(all_df['date'])

# Sort & Tampilkan data berdasarkan tahunnya 
all_df = all_df.sort_values(by=['cluster_id', 'date']).reset_index(drop=True)
print("\n--- Data setelah diurutkan ---")
print(all_df.head())

Data Berhasil Terload

--- Kombinasi dua data (5 Baris Pertama) ---
         date cluster_id  electricity_consumption  temperature_2m_max  \
0  2014-01-01  cluster_1                  358.032                10.8   
1  2014-01-01  cluster_2                  548.247                12.2   
2  2014-01-01  cluster_3                  758.303                12.9   
3  2014-01-01  cluster_4                 1072.077                10.8   
4  2014-01-02  cluster_1                  386.908                10.7   

   temperature_2m_min  apparent_temperature_max  apparent_temperature_min  \
0                 4.2                       5.5                       0.4   
1                 4.3                       8.6                      -0.4   
2                -0.8                      10.1                      -4.9   
3                 4.7                       6.7                       0.6   
4                 7.0                       6.6                       3.3   

   sunshine_duration  daylight

In [26]:
# FEATURE ENGINEERING 

# --- Time Based Features --- 
all_df['year'] = all_df['date'].dt.year
all_df['month'] = all_df['date'].dt.month
all_df['day'] = all_df['date'].dt.day
all_df['day_of_week'] = all_df['date'].dt.dayofweek
all_df['day_of_year'] = all_df['date'].dt.dayofyear
all_df['quarter'] = all_df['date'].dt.quarter
all_df['week_of_year'] = all_df['date'].dt.isocalendar().week.astype(int)

# Fitur Siklis untuk Waktu
# Menggunakan sin/cos untuk bulan dan hari dalam seminggu/tahun agar model memahami siklus
all_df['month_sin'] = np.sin(2 * np.pi * all_df['month'] / 12)
all_df['month_cos'] = np.cos(2 * np.pi * all_df['month'] / 12)
all_df['day_of_week_sin'] = np.sin(2 * np.pi * all_df['day_of_week'] / 7)
all_df['day_of_week_cos'] = np.cos(2 * np.pi * all_df['day_of_week'] / 7)
all_df['day_of_year_sin'] = np.sin(2 * np.pi * all_df['day_of_year'] / 365.25)
all_df['day_of_year_cos'] = np.cos(2 * np.pi * all_df['day_of_year'] / 365.25)

# --- Lag Features ---
print("\n--- Membuat Lag Features ---")
lag_periods = [1, 2, 3, 7, 14, 21, 28, 365]

for lag in lag_periods:
    all_df[f'consumption_lag_{lag}'] = all_df.groupby('cluster_id')['electricity_consumption'].shift(lag)

# --- Rolling Mean Features ---
print("--- Generating Rolling Mean Features for Weather Data ---")
rolling_features = [    'temperature_2m_max',
    'temperature_2m_min',
    'apparent_temperature_max',
    'apparent_temperature_min',
    'sunshine_duration',
    'daylight_duration',
    'wind_speed_10m_max',
    'wind_gusts_10m_max',
    'wind_direction_10m_dominant',
    'shortwave_radiation_sum',
    'et0_fao_evapotranspiration']
rolling_periods = [3, 7]

for feature in rolling_features:
    for period in rolling_periods:
        all_df[f'{feature}_rollmean_{period}'] = all_df.groupby('cluster_id')[feature].transform(
            lambda x: x.rolling(window=period, min_periods=1).mean()
        )

        all_df[f'{feature}_rollmean_{period}'].fillna(all_df[feature], inplace=True)



print("\nFeatures after engineering (5 Baris Pertama):")
print(all_df.head())
print("\nInfo after Feature Engineering:")
all_df.info()


--- Membuat Lag Features ---
--- Generating Rolling Mean Features for Weather Data ---

Features after engineering (5 Baris Pertama):
        date cluster_id  electricity_consumption  temperature_2m_max  \
0 2014-01-01  cluster_1                  358.032                10.8   
1 2014-01-02  cluster_1                  386.908                10.7   
2 2014-01-03  cluster_1                  395.319                11.7   
3 2014-01-04  cluster_1                  378.062                10.6   
4 2014-01-05  cluster_1                  372.324                11.4   

   temperature_2m_min  apparent_temperature_max  apparent_temperature_min  \
0                 4.2                       5.5                       0.4   
1                 7.0                       6.6                       3.3   
2                 7.1                       7.7                       2.5   
3                 5.9                       7.5                       1.5   
4                 2.4                       7.3

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  all_df[f'{feature}_rollmean_{period}'].fillna(all_df[feature], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  all_df[f'{feature}_rollmean_{period}'].fillna(all_df[feature], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work be

In [27]:
# SPLIT BACK DATA TRAIN DAN TEST
train_processed_df = all_df[all_df['is_test'] == 0].copy()
test_processed_df = all_df[all_df['is_test'] == 1].copy()

train_processed_df.drop(columns=['is_test'], inplace=True)
test_processed_df.drop(columns=['is_test'], inplace=True)

print(f"\nTrain processed shape: {train_processed_df.shape}")
print(f"Test processed shape: {test_processed_df.shape}")

for lag in lag_periods:
    col_name = f'consumption_lag_{lag}'
    mean_val = train_processed_df[col_name].mean()
    train_processed_df[col_name].fillna(mean_val, inplace=True)
    test_processed_df[col_name].fillna(mean_val, inplace=True)

print("\nMissing values di processed train data setelah lag imputation:")
print(train_processed_df.isnull().sum())
print("\nMissing values di processed test data setelah lag imputation (target should be NaN):")
print(test_processed_df.isnull().sum())

# Handle `cluster_id`
print("\n--- Handling Cluster ID ---")

le = LabelEncoder()
# Fit encoder on all unique cluster IDs from combined data to ensure consistency
all_cluster_ids = pd.concat([train_processed_df['cluster_id'], test_processed_df['cluster_id']]).unique()
le.fit(all_cluster_ids)

train_processed_df['cluster_id_encoded'] = le.transform(train_processed_df['cluster_id'])
test_processed_df['cluster_id_encoded'] = le.transform(test_processed_df['cluster_id'])

# Buang kolom 'cluster_id' asli (string), dan 'date' karena sudah diekstrak fiturnya
train_processed_df.drop(columns=['cluster_id', 'date'], inplace=True)
test_processed_df.drop(columns=['cluster_id', 'date'], inplace=True)

print("\nProcessed Train Data Head (after encoding and drop):")
print(train_processed_df.head())
print("\nProcessed Test Data Head (after encoding and drop):")
print(test_processed_df.head())


Train processed shape: (11688, 57)
Test processed shape: (3400, 57)

Missing values di processed train data setelah lag imputation:
date                                      0
cluster_id                                0
electricity_consumption                   0
temperature_2m_max                        0
temperature_2m_min                        0
apparent_temperature_max                  0
apparent_temperature_min                  0
sunshine_duration                         0
daylight_duration                         0
wind_speed_10m_max                        0
wind_gusts_10m_max                        0
wind_direction_10m_dominant               0
shortwave_radiation_sum                   0
et0_fao_evapotranspiration                0
year                                      0
month                                     0
day                                       0
day_of_week                               0
day_of_year                               0
quarter                        

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_processed_df[col_name].fillna(mean_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_processed_df[col_name].fillna(mean_val, inplace=True)
