# Pelatihan Model Prediksi Cuaca (Hourly Data)

Notebook ini melatih model machine learning untuk memprediksi parameter cuaca menggunakan data historis per jam dari Kota Semarang.

**Dataset:** `historical_data_hourly.csv` (Timezone: Asia/Jakarta)

## 1. Persiapan Lingkungan dan Pemuatan Pustaka

In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation notices raised by the libraries
# used below — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Global plotting defaults: seaborn "whitegrid" theme and a (12, 6) default
# figure size.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

print('✓ Semua pustaka berhasil dimuat!')

✓ Semua pustaka berhasil dimuat!


## 2. Pengumpulan dan Pemuatan Data

In [20]:
# Load the hourly historical dataset.
# The Windows path is written as a raw string: in a regular literal the
# sequences "\l", "\w", "\e", "\d" and "\h" are invalid escapes, which Python
# reports with a DeprecationWarning/SyntaxWarning and plans to reject in the
# future. The resulting path value is unchanged.
df = pd.read_csv(r'D:\laragon\www\weather-iot\examples\data_collections\datasets\historical_data_hourly.csv')

print(f'Total records: {len(df):,}')
print(f'\nKolom dataset: {df.columns.tolist()}')
print('\nBeberapa baris pertama:')
# Trailing expression so the notebook renders the first 10 rows.
df.head(10)

Total records: 227,280

Kolom dataset: ['id', 'hour', 'day', 'month', 'year', 'temp', 'humidity', 'windspeed', 'sealevelpressure', 'weather_code', 'conditions']

Beberapa baris pertama:


Unnamed: 0,id,hour,day,month,year,temp,humidity,windspeed,sealevelpressure,weather_code,conditions
0,0,0,1,1,2000,21.8,98,4.0,1008.4,3,Overcast
1,1,1,1,1,2000,21.4,99,4.0,1007.9,3,Overcast
2,2,2,1,1,2000,21.4,98,3.2,1007.4,3,Overcast
3,3,3,1,1,2000,21.2,99,4.6,1007.0,3,Overcast
4,4,4,1,1,2000,21.0,99,3.6,1006.9,3,Overcast
5,5,5,1,1,2000,20.8,98,2.7,1006.6,3,Overcast
6,6,6,1,1,2000,21.1,98,2.7,1007.5,3,Overcast
7,7,7,1,1,2000,23.5,94,1.4,1008.6,3,Overcast
8,8,8,1,1,2000,24.8,87,4.7,1009.0,3,Overcast
9,9,9,1,1,2000,26.0,82,8.2,1009.0,51,Rain


In [None]:
# Dataset overview: structural summary first, then descriptive statistics.
print('Informasi Dataset:\n' + '=' * 50)
df.info()

print('\nStatistik Deskriptif:')
# Trailing expression so the notebook renders the statistics table.
df.describe()

## 3. Pra-pemrosesan Data

In [None]:
# Count missing values per column once and reuse the result for the total.
missing_per_column = df.isnull().sum()

print('Missing values per kolom:')
print(missing_per_column)
print(f'\nTotal missing values: {missing_per_column.sum()}')

In [None]:
# Work on a copy so the raw dataframe `df` stays untouched.
df_clean = df.copy()

# Drop rows with nulls in the important columns.
df_clean = df_clean.dropna(subset=['conditions', 'windspeed', 'humidity'])

# Impute missing sea-level pressure with the column median.
# The filled Series is assigned back to the column instead of calling
# `fillna(..., inplace=True)` on the selected column: that chained in-place
# form is deprecated in pandas 2.x and no longer updates the dataframe once
# Copy-on-Write is enabled.
if df_clean['sealevelpressure'].isnull().any():
    median_pressure = df_clean['sealevelpressure'].median()
    df_clean['sealevelpressure'] = df_clean['sealevelpressure'].fillna(median_pressure)

print(f'Jumlah data setelah cleaning: {len(df_clean):,}')

In [None]:
# Integer codes for the text labels in the `conditions` column.
conditions_mapping = {
    'Clear': 0,
    'Overcast': 1,
    'Partially cloudy': 2,
    'Rain': 3,
    'Rain, Overcast': 4,
    'Rain, Partially cloudy': 5,
    'Unknown': 6
}

# Labels missing from the mapping become NaN after `map`; they are folded into
# the 'Unknown' code. The code is looked up from the mapping rather than
# repeated as a literal, and the result is assigned back to the column because
# the chained `fillna(..., inplace=True)` form is deprecated in pandas 2.x and
# has no effect under Copy-on-Write.
df_clean['conditions_encoded'] = (
    df_clean['conditions']
    .map(conditions_mapping)
    .fillna(conditions_mapping['Unknown'])
)

print('Distribusi kondisi cuaca:')
print(df_clean['conditions'].value_counts())
print('\nDataset siap untuk training!')
# Trailing expression so the notebook renders a preview of the cleaned data.
df_clean.head()

## 4. Pelatihan dan Perbandingan Model

In [None]:
# Model inputs and prediction targets.
# NOTE(review): 'humidity', 'windspeed' and 'sealevelpressure' appear in both
# lists, so for those targets the model receives the very value it is asked to
# predict — confirm this is intended before relying on their scores.
features = [
    'hour', 'day', 'month', 'year',
    'sealevelpressure', 'conditions_encoded', 'windspeed', 'humidity',
]
targets = ['temp', 'humidity', 'windspeed', 'sealevelpressure']

X, Y = df_clean[features], df_clean[targets]

# 80% train / 20% test, split in a single call with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

print(f'Training set: {len(X_train):,} samples')
print(f'Test set: {len(X_test):,} samples')

In [None]:
# Candidate regressors to compare.
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'K-Nearest Neighbors': KNeighborsRegressor(),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
}

# Fit every candidate on the temperature target and score it on the test set.
target_var = 'temp'
results = []
trained_models = {}

print(f'Evaluasi model untuk prediksi {target_var}...')
print('=' * 70)

y_train_target = Y_train[target_var]
y_test_target = Y_test[target_var]

for name, model in models.items():
    print(f'Training {name}...')
    model.fit(X_train, y_train_target)
    trained_models[name] = model

    Y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test_target, Y_pred)
    r2 = r2_score(y_test_target, Y_pred)

    results.append({
        'Model': name,
        'MSE': round(mse, 2),
        'MAE': round(mean_absolute_error(y_test_target, Y_pred), 2),
        'RMSE': round(np.sqrt(mse), 2),
        'R² Score': round(r2, 4),
        # "Accuracy" here is simply R² expressed as a percentage.
        'Accuracy (%)': round(r2 * 100, 2),
    })

print('\n✓ Training selesai!')

In [None]:
# Metric comparison table (analogous to Table 4).
print('Tabel 4: Perbandingan Metrik Kinerja Model\n' + '=' * 70)

results_df = pd.DataFrame(results)
highlight_columns = ['R² Score', 'Accuracy (%)']
# Trailing expression: styled table with the maximum of each listed column
# shaded light green.
results_df.style.highlight_max(subset=highlight_columns, color='lightgreen')

## 5. Analisis Hasil dan Kinerja Individual Parameter

In [None]:
# Train one Random Forest per target and collect its test-set metrics.
# Each fitted model is kept in `rf_models`, keyed by target name.
param_results = []
rf_models = {}

print('Evaluasi Random Forest untuk setiap parameter:')
print('=' * 50)

for target in targets:
    y_true = Y_test[target]

    rf_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
    rf_model.fit(X_train, Y_train[target])
    rf_models[target] = rf_model

    Y_pred = rf_model.predict(X_test)
    r2 = r2_score(y_true, Y_pred)

    param_results.append({
        'Parameter': target.capitalize(),
        'MAE': round(mean_absolute_error(y_true, Y_pred), 2),
        'RMSE': round(np.sqrt(mean_squared_error(y_true, Y_pred)), 2),
        'R² Score': round(r2, 2),
    })
    print(f'  {target}: R² = {r2:.2f}')

print('\n✓ Evaluasi selesai!')

In [None]:
# Per-parameter evaluation table (analogous to Table 5).
print('Tabel 5: Evaluasi Kinerja Parameter Prediksi Cuaca\n' + '=' * 50)

param_df = pd.DataFrame(param_results)
# Trailing expression so the notebook renders the table.
param_df

In [None]:
# Bar chart comparing the R² score of each predicted parameter.
scores = param_df['R² Score']

fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(
    param_df['Parameter'],
    scores,
    color=sns.color_palette('viridis', len(param_df)),
)

# Write each score just above its bar, centred horizontally.
for bar, score in zip(bars, scores):
    x_center = bar.get_x() + bar.get_width() / 2
    ax.text(x_center, bar.get_height() + 0.01, f'{score:.2f}',
            ha='center', va='bottom', fontweight='bold')

# Dashed reference line at R² = 0.9.
ax.axhline(y=0.9, color='red', linestyle='--', alpha=0.5, label='Target R²=0.9')

ax.set_title('Perbandingan R² Score untuk Setiap Parameter', fontsize=14, fontweight='bold')
ax.set_xlabel('Parameter', fontsize=12)
ax.set_ylabel('R² Score', fontsize=12)
ax.set_ylim(0, 1.1)
ax.legend()

plt.tight_layout()
plt.show()

## 6. Penyimpanan Model Terbaik

In [None]:
import os

# Persist every per-target Random Forest together with the feature list,
# target list and condition mapping in a single joblib file.
model_path = '../models/weather_model_hourly.pkl'

# Create the output folder if it does not exist yet.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

model_bundle = dict(
    models=rf_models,
    features=features,
    targets=targets,
    conditions_mapping=conditions_mapping,
)
joblib.dump(model_bundle, model_path)

print(f'✓ Model berhasil disimpan ke: {model_path}')
print('\nModel bundle berisi:')
print(f'  - Models: {list(rf_models.keys())}')
print(f'  - Features: {features}')
print(f'  - Targets: {targets}')

In [None]:
import json

# Describe the saved models (type, inputs, outputs, test-set metrics) in a
# JSON file.
metadata_path = '../models/model_metadata_hourly.json'

metadata = {
    'model_type': 'RandomForestRegressor',
    'n_estimators': 100,
    'features': features,
    'targets': targets,
    'conditions_mapping': conditions_mapping,
    'data_type': 'hourly',
    'dataset': 'historical_data_hourly.csv',
    'performance': param_results,
}

serialized = json.dumps(metadata, indent=2)
with open(metadata_path, 'w') as f:
    f.write(serialized)

print(f'✓ Metadata model disimpan ke: {metadata_path}')

## 7. Visualisasi Perbandingan Aktual vs. Prediksi (Januari 2020)

In [None]:
# Select the January 2020 rows from the cleaned data.
is_jan_2020 = (df_clean['year'] == 2020) & (df_clean['month'] == 1)
jan_2020 = df_clean[is_jan_2020].copy()

# Aggregate per day: mean of each weather variable, first condition code.
daily_aggregations = {
    column: 'mean'
    for column in ('temp', 'humidity', 'windspeed', 'sealevelpressure')
}
daily_aggregations['conditions_encoded'] = 'first'
jan_2020_daily = jan_2020.groupby('day').agg(daily_aggregations).reset_index()

# Add the remaining calendar features; noon is used as the reference hour.
jan_2020_daily = jan_2020_daily.assign(hour=12, month=1, year=2020)

# NOTE(review): the feature matrix is built from the same daily means that
# serve as the "actual" series, for every column present in `features` —
# confirm this comparison is intended.
X_jan = jan_2020_daily[features]

# One prediction array per target, from that target's Random Forest.
predictions_jan = {
    target: rf_models[target].predict(X_jan)
    for target in targets
}

print(f'Data Januari 2020: {len(jan_2020_daily)} hari')

In [None]:
# Build "<day> Jan" labels and keep every fourth one for the x axis.
dates = [f'{day_of_month} Jan' for day_of_month in jan_2020_daily['day']]
x_ticks = range(0, len(dates), 4)  # one tick every 4 days
x_labels = dates[::4]

In [None]:
# Chart 1: actual vs predicted temperature for January 2020 (Figure 15).
days = jan_2020_daily['day']
tick_days = range(1, 32, 4)  # label every fourth day of the month

plt.figure(figsize=(14, 6))
plt.plot(days, jan_2020_daily['temp'],
         color='blue', linewidth=2, marker='o', markersize=4,
         label='Actual Temperature')
plt.plot(days, predictions_jan['temp'],
         color='red', linestyle='--', linewidth=2, marker='s', markersize=4,
         label='Predicted Temperature')

plt.title('Actual vs Predicted Temperature (Januari 2020)', fontsize=14, fontweight='bold')
plt.xlabel('Tanggal', fontsize=12)
plt.ylabel('Temperature (°C)', fontsize=12)
plt.xticks(tick_days, [f'{d} Jan' for d in tick_days])
plt.grid(True, alpha=0.3)
plt.legend(fontsize=10)
plt.tight_layout()
plt.show()

In [None]:
# Chart 2: actual vs predicted humidity for January 2020 (Figure 16).
days = jan_2020_daily['day']
tick_days = range(1, 32, 4)  # label every fourth day of the month

plt.figure(figsize=(14, 6))
plt.plot(days, jan_2020_daily['humidity'],
         color='blue', linewidth=2, marker='o', markersize=4,
         label='Actual Humidity')
plt.plot(days, predictions_jan['humidity'],
         color='red', linestyle='--', linewidth=2, marker='s', markersize=4,
         label='Predicted Humidity')

plt.title('Actual vs Predicted Humidity (Januari 2020)', fontsize=14, fontweight='bold')
plt.xlabel('Tanggal', fontsize=12)
plt.ylabel('Humidity (%)', fontsize=12)
plt.xticks(tick_days, [f'{d} Jan' for d in tick_days])
plt.grid(True, alpha=0.3)
plt.legend(fontsize=10)
plt.tight_layout()
plt.show()

In [None]:
# Chart 3: actual vs predicted wind speed for January 2020 (Figure 17).
days = jan_2020_daily['day']
tick_days = range(1, 32, 4)  # label every fourth day of the month

plt.figure(figsize=(14, 6))
plt.plot(days, jan_2020_daily['windspeed'],
         color='blue', linewidth=2, marker='o', markersize=4,
         label='Actual Wind Speed')
plt.plot(days, predictions_jan['windspeed'],
         color='red', linestyle='--', linewidth=2, marker='s', markersize=4,
         label='Predicted Wind Speed')

plt.title('Actual vs Predicted Wind Speed (Januari 2020)', fontsize=14, fontweight='bold')
plt.xlabel('Tanggal', fontsize=12)
plt.ylabel('Wind Speed (km/h)', fontsize=12)
plt.xticks(tick_days, [f'{d} Jan' for d in tick_days])
plt.grid(True, alpha=0.3)
plt.legend(fontsize=10)
plt.tight_layout()
plt.show()

In [None]:
# Chart 4: actual vs predicted sea-level pressure for January 2020 (Figure 18).
days = jan_2020_daily['day']
tick_days = range(1, 32, 4)  # label every fourth day of the month

plt.figure(figsize=(14, 6))
plt.plot(days, jan_2020_daily['sealevelpressure'],
         color='blue', linewidth=2, marker='o', markersize=4,
         label='Actual Pressure')
plt.plot(days, predictions_jan['sealevelpressure'],
         color='red', linestyle='--', linewidth=2, marker='s', markersize=4,
         label='Predicted Pressure')

plt.title('Actual vs Predicted Pressure (Januari 2020)', fontsize=14, fontweight='bold')
plt.xlabel('Tanggal', fontsize=12)
plt.ylabel('Pressure (hPa)', fontsize=12)
plt.xticks(tick_days, [f'{d} Jan' for d in tick_days])
plt.grid(True, alpha=0.3)
plt.legend(fontsize=10)
plt.tight_layout()
plt.show()

## 8. Visualisasi Dampak Data Inkremental

In [None]:
# Simulate the effect of more data: retrain the temperature model from scratch
# on growing prefixes of the cleaned data and record the test-set R² (as a
# percentage) for each prefix size.
total_rows = len(df_clean)

# Cap every requested size at the number of rows actually available and drop
# duplicates. `iloc[:n]` silently truncates, so without the cap a size larger
# than the dataset would be trained on fewer rows than the count printed and
# plotted for it. With more than 200,000 rows this yields the same sizes as
# the plain list.
requested_sizes = (10000, 50000, 100000, 150000, 200000, total_rows)
data_points = sorted({min(size, total_rows) for size in requested_sizes})
r2_scores = []

print('Melatih model dengan peningkatan jumlah data...')
print('='*50)

for n_points in data_points:
    # First n_points rows of the cleaned data.
    df_subset = df_clean.iloc[:n_points]
    X_subset = df_subset[features]
    Y_subset = df_subset['temp']

    # Fresh 80/20 split and a fresh model for every subset size.
    X_tr, X_te, Y_tr, Y_te = train_test_split(X_subset, Y_subset,
                                               test_size=0.2, random_state=42)
    rf_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
    rf_model.fit(X_tr, Y_tr)
    Y_pr = rf_model.predict(X_te)

    r2 = r2_score(Y_te, Y_pr) * 100
    r2_scores.append(r2)
    print(f'  {n_points:>7,} data points: R² = {r2:.2f}%')

print('\n✓ Simulasi selesai!')

In [None]:
# Chart 5: R² (in percent) against the number of data points used (Figure 21).
def _thousands(value, _position):
    """Format an axis tick value as an integer with thousands separators."""
    return f'{int(value):,}'


plt.figure(figsize=(12, 6))
plt.plot(data_points, r2_scores, color='green', linewidth=2.5,
         marker='o', markersize=10, markerfacecolor='white', markeredgewidth=2)

# Label every point with its score, 10 points above the marker.
for n_rows, score in zip(data_points, r2_scores):
    plt.annotate(f'{score:.1f}%', (n_rows, score), xytext=(0, 10),
                 textcoords='offset points', ha='center', fontweight='bold')

plt.title('Impact of Incremental Data on Model Performance (R²)',
          fontsize=14, fontweight='bold')
plt.xlabel('Number of Data Points', fontsize=12)
plt.ylabel('R² Score (%)', fontsize=12)
plt.grid(True, alpha=0.3)

# Show x-axis ticks with thousands separators.
plt.gca().xaxis.set_major_formatter(plt.FuncFormatter(_thousands))

plt.tight_layout()
plt.show()

## Ringkasan

Notebook ini telah menyelesaikan:
1. ✅ Memuat dan pre-processing dataset hourly
2. ✅ Melatih 4 model regresi dan membandingkannya
3. ✅ Menganalisis kinerja Random Forest pada setiap parameter
4. ✅ Menyimpan model terbaik ke file `.pkl`
5. ✅ Visualisasi Actual vs Predicted untuk Januari 2020
6. ✅ Visualisasi dampak incremental learning