In [1]:
pip install openpyxl

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_excel("dataset_final.xlsx")

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3983 entries, 0 to 3982
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   TANGGAL  3983 non-null   datetime64[ns]
 1   TN       3983 non-null   float64       
 2   TX       3983 non-null   float64       
 3   TAVG     3983 non-null   float64       
 4   RH_AVG   3983 non-null   float64       
 5   RR       3983 non-null   float64       
 6   SS       3983 non-null   float64       
 7   FF_X     3983 non-null   int64         
 8   DDD_X    3983 non-null   int64         
 9   FF_AVG   3983 non-null   int64         
 10  DDD_CAR  3983 non-null   int64         
dtypes: datetime64[ns](1), float64(6), int64(4)
memory usage: 342.4 KB


# Feature Engineering

In [5]:
df.index = pd.to_datetime(df.index)

df['Bulan_Angka'] = df.index.month

print(df[['Bulan_Angka']].head())


                               Bulan_Angka
1970-01-01 00:00:00.000000000            1
1970-01-01 00:00:00.000000001            1
1970-01-01 00:00:00.000000002            1
1970-01-01 00:00:00.000000003            1
1970-01-01 00:00:00.000000004            1


In [6]:
# Buat Fitur Sinus (Sumbu Y)
df['Bulan_Sin'] = np.sin(2 * np.pi * df['Bulan_Angka'] / 12)

# Buat Fitur Cosinus (Sumbu X)
df['Bulan_Cos'] = np.cos(2 * np.pi * df['Bulan_Angka'] / 12)

# Lihat hasilnya
print(df[['Bulan_Angka', 'Bulan_Sin', 'Bulan_Cos']].head(12))

                               Bulan_Angka  Bulan_Sin  Bulan_Cos
1970-01-01 00:00:00.000000000            1        0.5   0.866025
1970-01-01 00:00:00.000000001            1        0.5   0.866025
1970-01-01 00:00:00.000000002            1        0.5   0.866025
1970-01-01 00:00:00.000000003            1        0.5   0.866025
1970-01-01 00:00:00.000000004            1        0.5   0.866025
1970-01-01 00:00:00.000000005            1        0.5   0.866025
1970-01-01 00:00:00.000000006            1        0.5   0.866025
1970-01-01 00:00:00.000000007            1        0.5   0.866025
1970-01-01 00:00:00.000000008            1        0.5   0.866025
1970-01-01 00:00:00.000000009            1        0.5   0.866025
1970-01-01 00:00:00.000000010            1        0.5   0.866025
1970-01-01 00:00:00.000000011            1        0.5   0.866025


In [7]:
# Pastikan data urut berdasarkan tanggal dulu! (Wajib)
df = df.sort_index()

# --- MEMBUAT LAG FEATURES ---

# 1. Fitur: Curah Hujan Kemarin (H-1)
df['RR_lag1'] = df['RR'].shift(1)

# 2. Fitur: Curah Hujan 2 Hari Lalu (H-2)
df['RR_lag2'] = df['RR'].shift(2)

# 3. Fitur: Kelembapan Kemarin (H-1)
# Karena kelembapan kemarin sangat mempengaruhi jenuhnya awan hari ini
df['RH_lag1'] = df['RH_AVG'].shift(1)

# --- CEK HASILNYA ---
# Kita lihat 5 baris pertama
# Perhatikan ada nilai NaN (Not a Number)
print(df[['RR', 'RR_lag1', 'RR_lag2']].head())

                                 RR  RR_lag1  RR_lag2
1970-01-01 00:00:00.000000000  14.5      NaN      NaN
1970-01-01 00:00:00.000000001  31.5     14.5      NaN
1970-01-01 00:00:00.000000002   0.5     31.5     14.5
1970-01-01 00:00:00.000000003   2.4      0.5     31.5
1970-01-01 00:00:00.000000004  35.3      2.4      0.5


In [8]:
# Hapus baris yang mengandung NaN akibat shift
df_clean = df.dropna()

print(f"Data awal: {len(df)} baris")
print(f"Data setelah dropna: {len(df_clean)} baris")
# Biasanya berkurang 2 baris (karena lag terjauh kita adalah 2)

Data awal: 3983 baris
Data setelah dropna: 3981 baris


In [9]:
# Pastikan data urut waktu
df = df.sort_index()

# --- FITUR ROLLING WINDOW ---

# 1. Rata-rata Hujan 7 Hari Terakhir
# .rolling(7).mean() -> Hitung rata-rata per blok 7 hari
# .shift(1) -> GESER ke bawah, supaya data hari ini tidak ikut kehitung
df['RR_mean_7d'] = df['RR'].rolling(window=7).mean().shift(1)

# 2. Rata-rata Kelembapan 3 Hari Terakhir
df['RH_mean_3d'] = df['RH_AVG'].rolling(window=3).mean().shift(1)

# 3. Maksimum Angin 3 Hari Terakhir (Mendeteksi badai yg baru lewat)
df['ff_max_3d'] = df['FF_X'].rolling(window=3).max().shift(1)

# --- CEK HASILNYA ---
print(df[['RR', 'RR_mean_7d']].head(10))

                                   RR  RR_mean_7d
1970-01-01 00:00:00.000000000  14.500         NaN
1970-01-01 00:00:00.000000001  31.500         NaN
1970-01-01 00:00:00.000000002   0.500         NaN
1970-01-01 00:00:00.000000003   2.400         NaN
1970-01-01 00:00:00.000000004  35.300         NaN
1970-01-01 00:00:00.000000005   2.300         NaN
1970-01-01 00:00:00.000000006   0.000         NaN
1970-01-01 00:00:00.000000007   0.329   12.357143
1970-01-01 00:00:00.000000008   0.275   10.332714
1970-01-01 00:00:00.000000009   0.154    5.872000


In [10]:
# Hapus baris NaN di awal data
df_final = df.dropna()

print(f"Data siap pakai: {len(df_final)} baris")
# Data Anda sekarang sudah bersih, padat, dan kaya fitur!

Data siap pakai: 3976 baris


In [11]:
# --- 1. FITUR SELISIH SUHU (DETEKTOR AWAN) ---
# Tx = Suhu Maksimum, Tn = Suhu Minimum
df['Temp_Range'] = df['TX'] - df['TN']


# --- 2. FITUR VEKTOR ANGIN ---
# Kita butuh kecepatan (ff_x) dan arah (ddd_x)

# Langkah A: Ubah derajat ke Radian (Komputer maunya Radian)
# Rumus: Derajat * PI / 180
wd_rad = df['DDD_X'] * np.pi / 180

# Langkah B: Hitung Komponen X dan Y
# Angin X (Barat <-> Timur)
df['Wind_x'] = df['FF_X'] * np.cos(wd_rad)

# Angin Y (Selatan <-> Utara)
df['Wind_y'] = df['FF_X'] * np.sin(wd_rad)

# --- FINAL CHECK ---
# Hapus kolom asli yang sudah tidak dipakai (opsional, biar rapi)
# Kita drop 'ddd_x' karena sudah diganti Wind_x dan Wind_y
# df = df.drop(columns=['ddd_x']) 

print("Engineering Fisika Selesai.")
print(df[['TX', 'TN', 'Temp_Range', 'Wind_x', 'Wind_y']].head())

Engineering Fisika Selesai.
                                 TX    TN  Temp_Range    Wind_x    Wind_y
1970-01-01 00:00:00.000000000  29.8  23.6         6.2  3.064178 -2.571150
1970-01-01 00:00:00.000000001  29.4  24.2         5.2  3.064178 -2.571150
1970-01-01 00:00:00.000000002  29.6  25.0         4.6  4.316039 -4.167950
1970-01-01 00:00:00.000000003  30.0  24.4         5.6  3.596699 -3.473292
1970-01-01 00:00:00.000000004  32.6  24.0         8.6  2.925415 -2.727993


In [12]:
df_final = df.dropna()

In [13]:
target_col = "RR"   # variabel yang mau diprediksi

# Hanya ambil kolom numerik
df_num = df_final.select_dtypes(include=[np.number])

X = df_num.drop(columns=[target_col])   # fitur
y = df_num[target_col]                  # target


In [14]:
train_end = '2022-12-31'
val_end   = '2023-12-31'

X_train = X.loc[:train_end]
y_train = y.loc[:train_end]

X_val = X.loc['2023-01-01':val_end]
y_val = y.loc['2023-01-01':val_end]

X_test = X.loc['2024-01-01':]
y_test = y.loc['2024-01-01':]


In [16]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

# fit hanya pada train
scaler.fit(X_train)

cols = X_train.columns

X_train_scaled = pd.DataFrame(scaler.transform(X_train), columns=cols, index=X_train.index)
X_val_scaled   = pd.DataFrame(scaler.transform(X_val),   columns=cols, index=X_val.index)
X_test_scaled  = pd.DataFrame(scaler.transform(X_test),  columns=cols, index=X_test.index)

print("SUKSES! Split & scaling selesai tanpa data leakage.")


ValueError: Found array with 0 sample(s) (shape=(0, 21)) while a minimum of 1 is required by MinMaxScaler.

KeyError: 'TANGGAL'