In [51]:
#pip install tensorflow

In [52]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import TimeSeriesSplit

from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

In [53]:
# df = pd.read_excel('updated_dataset_final.xlsx')

In [54]:
#df.info()

In [55]:
df = pd.read_excel("updated_dataset_final.xlsx", parse_dates=['Tanggal']) #parse = untuk ngubah kolom Tanggal jadi datetime
df = df.sort_values('Tanggal')
df = df.reset_index(drop=True)

In [56]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4020 entries, 0 to 4019
Data columns (total 11 columns):
 #   Column                              Non-Null Count  Dtype         
---  ------                              --------------  -----         
 0   Tanggal                             4020 non-null   datetime64[ns]
 1   Temperatur Minimum                  4020 non-null   float64       
 2   Temperatur Maksimum                 4020 non-null   float64       
 3   Temperatur Rata-rata                4020 non-null   float64       
 4   Kelembapan Rata-rata                4020 non-null   float64       
 5   Curah Hujan (mm)                    4020 non-null   float64       
 6   Lamanya Penyinaran Matahari         4020 non-null   float64       
 7   Kecepatan Angin Maksimum            4020 non-null   int64         
 8   Arah Angin Saat Kecepatan Maksimum  4020 non-null   int64         
 9   Kecepatan Angin Rata-rata           4020 non-null   int64         
 10  Arah Angin Terbanyak (°)

In [57]:
def add_safe_features(df):
    df = df.copy()

    # Extract month and day in Tanggal column
    df['month'] = df['Tanggal'].dt.month
    df['day']   = df['Tanggal'].dt.day

    # Create coordinat cos dan sin menggunakan rumus  
    df['month_sin'] = np.sin(2*np.pi*df['month']/12)
    df['month_cos'] = np.cos(2*np.pi*df['month']/12)
    df['day_sin']   = np.sin(2*np.pi*df['day']/31)
    df['day_cos']   = np.cos(2*np.pi*df['day']/31)

    df['temp_range'] = df['Temperatur Maksimum'] - df['Temperatur Minimum']

    return df

df = add_safe_features(df)


In [58]:
train = df[df['Tanggal'] < '2024-01-01']
val   = df[(df['Tanggal'] >= '2024-01-01') & (df['Tanggal'] < '2025-01-01')]
test  = df[df['Tanggal'] >= '2025-01-01']


In [59]:
def add_leakage_safe_lags(train, val, test):
    # Gabung supaya index tetap utuh (kemudian akan di-slice kembali)
    full = pd.concat([train, val, test], axis=0).sort_index()

    # --- LAG features (aman) ---
    full['Curah Hujan (mm)_lag1']  = full['Curah Hujan (mm)'].shift(1)
    full['Curah Hujan (mm)_lag2']  = full['Curah Hujan (mm)'].shift(2)
    full['Curah Hujan (mm)_lag3']  = full['Curah Hujan (mm)'].shift(3)
    full['Curah Hujan (mm)_lag7']  = full['Curah Hujan (mm)'].shift(7)
    full['Temperatur Rata-rata_lag1'] = full['Temperatur Rata-rata'].shift(1)

    # --- Differencing (aman) ---
    full['Curah Hujan (mm)_diff1'] = full['Curah Hujan (mm)'] - full['Curah Hujan (mm)_lag1']
    full['RH_diff1'] = full['Kelembapan Rata-rata'] - full['Kelembapan Rata-rata'].shift(1)

    # --- Rolling features (SAFE: shift first, then rolling on past values) ---
    # buat seri hujan yang hanya berisi nilai sampai t-1 (tidak termasuk t)
    rain_past = full['Curah Hujan (mm)'].shift(1)

    full['Curah Hujan (mm)_7d']  = rain_past.rolling(window=7, min_periods=1).mean()
    full['Curah Hujan (mm)_14d'] = rain_past.rolling(window=14, min_periods=1).mean()
    full['Curah Hujan (mm)_30d'] = rain_past.rolling(window=30, min_periods=1).mean()

    # Rain count: ubah ke indikator lalu shift lalu rolling sum
    rain_indicator_past = (full['Curah Hujan (mm)'] > 0).astype(int).shift(1)
    full['Rain_7d_count'] = rain_indicator_past.rolling(window=7, min_periods=1).sum()

    # Jika ingin, bisa juga tambahkan rolling std, max, dll dengan pola yang sama:
    # full['Curah Hujan (mm)_7d_std'] = rain_past.rolling(7, min_periods=1).std()

    # --- Kembalikan ke split semula ---
    train2 = full.loc[train.index].copy()
    val2   = full.loc[val.index].copy()
    test2  = full.loc[test.index].copy()

    return train2, val2, test2


In [60]:
train2, val2, test2 = add_leakage_safe_lags(train, val, test)

train2 = train2.dropna()
val2   = val2.dropna()
test2  = test2.dropna()


In [61]:
features = [
    'Temperatur Minimum','Temperatur Maksimum','Temperatur Rata-rata','Kelembapan Rata-rata','Lamanya Penyinaran Matahari','Kecepatan Angin Maksimum','Arah Angin Saat Kecepatan Maksimum','Kecepatan Angin Rata-rata','Arah Angin Terbanyak (°)',
    'Curah Hujan (mm)_lag1','Curah Hujan (mm)_lag2','Curah Hujan (mm)_lag3', 'Curah Hujan (mm)_lag7','Temperatur Rata-rata_lag1','Curah Hujan (mm)_diff1','RH_diff1',
    'Curah Hujan (mm)_7d', 'Curah Hujan (mm)_14d','Curah Hujan (mm)_30d','Rain_7d_count',
    'month_sin','month_cos','day_sin','day_cos','temp_range'
]


X_train = train2[features]
y_train = np.log1p(train2['Curah Hujan (mm)'])

X_val   = val2[features]
y_val   = np.log1p(val2['Curah Hujan (mm)'])

X_test  = test2[features]
y_test  = np.log1p(test2['Curah Hujan (mm)'])
# X_train = train2[features]
# #y_train = train2['Curah Hujan (mm)']

# y_train = np.log1p(train['Curah Hujan (mm)'])



# X_val = val2[features]
# #y_val = val2['Curah Hujan (mm)']
# y_val   = np.log1p(val['Curah Hujan (mm)'])


# X_test = test2[features]
# #y_test = test2['Curah Hujan (mm)']
# y_test  = np.log1p(test['Curah Hujan (mm)'])





scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled   = scaler.transform(X_val)
X_test_scaled  = scaler.transform(X_test)



In [62]:
for f in features:
    print(f)

Temperatur Minimum
Temperatur Maksimum
Temperatur Rata-rata
Kelembapan Rata-rata
Lamanya Penyinaran Matahari
Kecepatan Angin Maksimum
Arah Angin Saat Kecepatan Maksimum
Kecepatan Angin Rata-rata
Arah Angin Terbanyak (°)
Curah Hujan (mm)_lag1
Curah Hujan (mm)_lag2
Curah Hujan (mm)_lag3
Curah Hujan (mm)_lag7
Temperatur Rata-rata_lag1
Curah Hujan (mm)_diff1
RH_diff1
Curah Hujan (mm)_7d
Curah Hujan (mm)_14d
Curah Hujan (mm)_30d
Rain_7d_count
month_sin
month_cos
day_sin
day_cos
temp_range


In [64]:
rf = RandomForestRegressor(
    n_estimators=500,
    max_depth=20,
    random_state=42
)
rf.fit(X_train_scaled, y_train)

pred_rf = rf.predict(X_val_scaled)


y_val_real = np.expm1(y_val)

# Perbaikan di sini: mean_squared_error
print("RF MAE:", mean_absolute_error(y_val_real, pred_rf))
print("RF RMSE:", np.sqrt(mean_squared_error(y_val_real, pred_rf)))

# Output Anda:
# RF MAE: 0.5942667449238894
# RF RMSE: 1.500483658715626

RF MAE: 5.9242906063350915
RF RMSE: 15.956580251503658


In [None]:
xgb = XGBRegressor(
    n_estimators=500,
    learning_rate=0.03,
    max_depth=8,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    objective='reg:squarederror'
)
xgb.fit(X_train_scaled, y_train)

pred_xgb = xgb.predict(X_val_scaled)

print("XGB MAE:", mean_absolute_error(y_val, pred_xgb))

#XGB MAE: 0.7026166778040924


XGB MAE: 0.8336703617967682


In [None]:
# reshape → (samples, timesteps, features)
# X_train_lstm = X_train_scaled.reshape((X_train_scaled.shape[0], 1, X_train_scaled.shape[1]))
# X_val_lstm   = X_val_scaled.reshape((X_val_scaled.shape[0], 1, X_val_scaled.shape[1]))


In [None]:
# model = Sequential([
#     LSTM(64, return_sequences=False, input_shape=(1, X_train_scaled.shape[1])),
#     Dropout(0.2),
#     Dense(32, activation='relu'),
#     Dense(1)
# ])

# # Perbaikan: parameter 'loss'
# model.compile(optimizer='adam', loss='mse')

# history = model.fit(
#     X_train_lstm, y_train,
#     validation_data=(X_val_lstm, y_val),
#     epochs=50,
#     batch_size=32,
#     verbose=1
# )

# pred_lstm = model.predict(X_val_lstm)

# # Perbaikan: fungsi mean_absolute_error
# print("LSTM MAE:", mean_absolute_error(y_val, pred_lstm))

# # LSTM MAE: 0.550815572085602

Epoch 1/50


  super().__init__(**kwargs)


[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - loss: 297.0454 - val_loss: 221.7646
Epoch 2/50
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 189.5955 - val_loss: 116.0461
Epoch 3/50
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 102.6487 - val_loss: 49.1876
Epoch 4/50
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 48.1571 - val_loss: 18.9748
Epoch 5/50
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 29.0752 - val_loss: 11.7287
Epoch 6/50
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 23.0323 - val_loss: 9.3131
Epoch 7/50
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 21.6186 - val_loss: 6.7704
Epoch 8/50
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 20.3243 - val_loss: 5.4069
Epoch 9/50
[1m103/103[0m [32m━

In [None]:
# import seaborn as sns
# import matplotlib.pyplot as plt

# plt.figure(figsize=(12,6))
# sns.kdeplot(y_train, label='Train Curah Hujan (mm)', fill=True)
# sns.kdeplot(y_test, label='Test Curah Hujan (mm)', fill=True)
# plt.title("Distribusi Target Train vs Test")
# plt.legend()
# plt.show()


In [None]:
# import pandas as pd
# import numpy as np

# # Perbaikan: variable 'corr' dan function '.corr()'
# corr = pd.concat([X_train, y_train], axis=1).corr()

# # Perbaikan: variable 'target_corr'
# target_corr = corr['Curah Hujan (mm)'].sort_values(ascending=False)

# print("Korelasi Fitur dengan Target:")
# print(target_corr)

In [None]:
# from sklearn.metrics import mean_squared_error, mean_absolute_error
# import numpy as np

# pred_train_rf = rf.predict(X_train_scaled)
# pred_test_rf = rf.predict(X_test_scaled)

# # Perbaikan fungsi mean_squared_error
# train_rmse = np.sqrt(mean_squared_error(y_train, pred_train_rf))
# test_rmse  = np.sqrt(mean_squared_error(y_test, pred_test_rf))

# # Perbaikan fungsi mean_absolute_error
# train_mae = mean_absolute_error(y_train, pred_train_rf)
# test_mae  = mean_absolute_error(y_test, pred_test_rf)

# print("RF Train RMSE:", train_rmse)
# print("RF Test RMSE :", test_rmse)
# print("RF Train MAE:", train_mae)
# print("RF Test MAE :", test_mae)

In [None]:
# plt.figure(figsize=(10,5))
# plt.scatter(y_test, y_test - pred_test_rf, alpha=0.5, label="Test Residual")
# plt.scatter(y_train, y_train - pred_train_rf, alpha=0.5, label="Train Residual")
# plt.axhline(0, color='red', linestyle='--')
# plt.xlabel("Actual Curah Hujan (mm)")
# plt.ylabel("Residual")
# plt.legend()
# plt.title("Residual Train vs Test (RF)")
# plt.show()


# Predict Future (Forecast)

In [74]:
def build_future_features(history_df, future_date):
    history_df = history_df.copy()
    future_date = pd.to_datetime(future_date)

    # Gunakan max window = 30 hari
    last_30 = history_df.tail(30).copy()

    base = history_df.iloc[-1].copy()
    base['Tanggal'] = future_date

    # --- Date features ---
    base['month'] = future_date.month
    base['day']   = future_date.day

    base['month_sin'] = np.sin(2*np.pi*base['month']/12)
    base['month_cos'] = np.cos(2*np.pi*base['month']/12)
    base['day_sin']   = np.sin(2*np.pi*base['day']/31)
    base['day_cos']   = np.cos(2*np.pi*base['day']/31)

    base['temp_range'] = (
        base['Temperatur Maksimum']
        - base['Temperatur Minimum']
    )

    # Gabungkan histori + baris future (tanpa target future)
    temp = pd.concat(
        [last_30, pd.DataFrame([base])],
        ignore_index=True
    )

    # --- LAG FEATURES (aman) ---
    temp['Curah Hujan (mm)_lag1'] = temp['Curah Hujan (mm)'].shift(1)
    temp['Curah Hujan (mm)_lag2'] = temp['Curah Hujan (mm)'].shift(2)
    temp['Curah Hujan (mm)_lag3'] = temp['Curah Hujan (mm)'].shift(3)
    temp['Curah Hujan (mm)_lag7'] = temp['Curah Hujan (mm)'].shift(7)

    temp['Temperatur Rata-rata_lag1'] = temp['Temperatur Rata-rata'].shift(1)

    temp['RH_diff1'] = (
        temp['Kelembapan Rata-rata']
        - temp['Kelembapan Rata-rata'].shift(1)
    )

    # --- ROLLING AMAN (SHIFT DULU!) ---
    rain_past = temp['Curah Hujan (mm)'].shift(1)

    temp['Curah Hujan (mm)_7d']  = rain_past.rolling(7).mean()
    temp['Curah Hujan (mm)_14d'] = rain_past.rolling(14).mean()
    temp['Curah Hujan (mm)_30d'] = rain_past.rolling(30).mean()

    rain_ind_past = (temp['Curah Hujan (mm)'] > 0).astype(int).shift(1)
    temp['Rain_7d_count'] = rain_ind_past.rolling(7).sum()

    # ✅ Jangan buat diff curah hujan untuk future
    temp['Curah Hujan (mm)_diff1'] = (
    temp['Curah Hujan (mm)'].shift(1)
    - temp['Curah Hujan (mm)'].shift(2)
    )

    future = temp.iloc[-1:].copy()

    # Isi NaN / inf pakai nilai terakhir yg masuk akal
    future = future.replace([np.inf, -np.inf], np.nan)

    for col in future.columns:
        if future[col].isna().any():
            future[col] = future[col].fillna(
                history_df[col].iloc[-1]
                if col in history_df.columns
                else 0
            )

    return future   

In [76]:
def forecast_7_days(
    df,
    start_date,
    features,
    scaler,
    rf,
    xgb,
    lstm_model
):
    history = df.copy()
    results = []

    # Perbaikan: variable 'current_date'
    current_date = pd.to_datetime(start_date)

    for i in range(3):
        # Perbaikan: variable 'current_date'
        next_date = current_date + pd.Timedelta(days=1)

        # Pastikan fungsi build_future_features sudah didefinisikan sebelumnya
        future_row = build_future_features(history, next_date) 
        future_scaled = scaler.transform(future_row[features])

        # Predictions
        pred_rf  = np.expm1(rf.predict(future_scaled)[0])
        pred_xgb = np.expm1(xgb.predict(future_scaled)[0])
        pred_lstm = lstm_model.predict(
            future_scaled.reshape(1,1,-1)
        )[0][0]

        # Ensemble simple (Rata-rata)
        #pred_mean = np.mean([pred_rf, pred_xgb, pred_lstm])
        pred_mean = np.mean([pred_rf, pred_xgb])
        

        results.append({
            "Tanggal": next_date,
            "RF": pred_rf,
            "XGB": pred_xgb,
            "LSTM": pred_lstm,
            "Ensemble": pred_mean
        })

        # Append prediction untuk dipakai prediksi hari berikutnya (Recursive)
        new_row = future_row.copy()
        
        # Bagian ini TETAP, karena ini nama kolom di Dataframe
        new_row['Curah Hujan (mm)'] = pred_mean   
        
        history = pd.concat([history, new_row], ignore_index=True)

        # Perbaikan: variable 'current_date'
        current_date = next_date

    return pd.DataFrame(results)

In [77]:
future_7d = forecast_7_days(
    df=df,
    start_date="2025-12-07",
    features=features,
    scaler=scaler,
    rf=rf,
    xgb=xgb,
    lstm_model=model
)

print(future_7d)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
     Tanggal        RF       XGB      LSTM  Ensemble
0 2025-12-08  0.000000  0.018043  0.360467  0.009021
1 2025-12-09  0.016999  0.209904  0.418548  0.113452
2 2025-12-10  0.175447  0.538851  0.432853  0.357149
