In [None]:
import pandas as pd
import numpy as np

# Baseline

In [None]:
# Rainfall Forecasting using Recurrent Neural Networks (RNN-LSTM)
# Berdasarkan paper: Prasetya & Djamal (2019)

# ============================================================================
# 1. IMPORT LIBRARIES
# ============================================================================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import warnings
warnings.filterwarnings('ignore')

# Set random seed untuk reproducibility
np.random.seed(42)
keras.utils.set_random_seed(42)

print("Library berhasil diimport!")

# ============================================================================
# 2. LOAD DATA
# ============================================================================
# Ganti dengan path file Anda
df = pd.read_excel('updated_dataset_final - Copy.xlsx')


# UNCOMMENT baris berikut dan sesuaikan path file:
df['Tanggal'] = pd.to_datetime(df['Tanggal'])
df = df.sort_values('Tanggal').reset_index(drop=True)

print("Data berhasil dimuat!")
# print(f"Jumlah data: {len(df)}")
# print(f"\nInfo data:\n{df.info()}")
# print(f"\nSample data:\n{df.head()}")

# ============================================================================
# 3. PREPROCESSING DATA
# ============================================================================

def interpolate_missing_data(df):
    """
    Menangani missing data dengan interpolasi linear
    """
    df_clean = df.copy()
    
    # Interpolasi untuk kolom numerik
    numeric_cols = df_clean.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        df_clean[col] = df_clean[col].interpolate(method='linear', limit_direction='both')
    
    return df_clean

def extract_weekly_features(df):
    """
    Mengekstrak fitur mingguan dengan mengambil nilai maksimum setiap minggu
    Sesuai dengan formula di paper: x = Σ(n/k) * max(xi, x_i+1+7)
    """
    df_weekly = df.copy()
    df_weekly['Week'] = df_weekly['Tanggal'].dt.isocalendar().week
    df_weekly['Year'] = df_weekly['Tanggal'].dt.year
    
    # Agregasi mingguan - ambil nilai maksimum
    weekly_data = df_weekly.groupby(['Year', 'Week']).agg({
        'Temperatur Minimum': 'max',
        'Temperatur Maksimum': 'max',
        'Temperatur Rata-rata': 'max',
        'Kelembapan Rata-rata': 'max',
        'Curah Hujan (mm)': 'max',
        'Lamanya Penyinaran Matahari': 'max',
        'Kecepatan Angin Maksimum': 'max',
        'Kecepatan Angin Rata-rata': 'max',
        'Tanggal': 'last'
    }).reset_index()
    
    return weekly_data

def normalize_data(df, feature_cols):
    """
    Normalisasi data ke rentang [0, 1]
    """
    scaler = MinMaxScaler(feature_range=(0, 1))
    df_normalized = df.copy()
    df_normalized[feature_cols] = scaler.fit_transform(df[feature_cols])
    
    return df_normalized, scaler

def classify_rainfall(rainfall):
    """
    Klasifikasi curah hujan menjadi 5 kelas:
    0: Very Low (<5mm)
    1: Low (5-20mm)
    2: Medium (20-50mm)
    3: High (50-100mm)
    4: Very High (>100mm)
    """
    if rainfall < 5:
        return 0
    elif rainfall < 20:
        return 1
    elif rainfall < 50:
        return 2
    elif rainfall < 100:
        return 3
    else:
        return 4

# Uncomment untuk menjalankan preprocessing:

# 3.1 Interpolasi missing data
df = interpolate_missing_data(df)
print("Missing data berhasil diinterpolasi!")

# 3.2 Ekstraksi fitur mingguan
df_weekly = extract_weekly_features(df)
print(f"Data mingguan berhasil diekstrak! Total: {len(df_weekly)} minggu")

# 3.3 Klasifikasi curah hujan
df_weekly['Rainfall_Class'] = df_weekly['Curah Hujan (mm)'].apply(classify_rainfall)

# 3.4 Normalisasi
feature_cols = ['Temperatur Minimum', 'Temperatur Maksimum', 'Temperatur Rata-rata',
                'Kelembapan Rata-rata', 'Curah Hujan (mm)', 'Lamanya Penyinaran Matahari',
                'Kecepatan Angin Maksimum', 'Kecepatan Angin Rata-rata']

df_normalized, scaler = normalize_data(df_weekly, feature_cols)
print("Data berhasil dinormalisasi!")


# ============================================================================
# 4. PREPARE SEQUENCES (OVERLAPPING WINDOWS)
# ============================================================================

def create_sequences(data, feature_cols, target_col, sequence_length=52):

    X, y = [], []
    
    for i in range(len(data) - sequence_length):
        # Ambil sequence 52 minggu
        sequence = data[feature_cols].iloc[i:i+sequence_length].values
        # Target adalah minggu ke-53 (prediksi minggu depan)
        target = data[target_col].iloc[i+sequence_length]
        
        X.append(sequence)
        y.append(target)
    
    return np.array(X), np.array(y)

# Uncomment untuk membuat sequences:

sequence_length = 52  # 1 tahun = 52 minggu

X, y = create_sequences(df_normalized, feature_cols, 'Rainfall_Class', sequence_length)

print(f"Shape X: {X.shape}")  # (samples, 52, n_features)
print(f"Shape y: {y.shape}")  # (samples,)
print(f"Total sequences: {len(X)}")

# Visualisasi distribusi kelas
plt.figure(figsize=(10, 5))
plt.subplot(1, 2, 1)
plt.hist(y, bins=5, edgecolor='black')
plt.xlabel('Rainfall Class')
plt.ylabel('Frequency')
plt.title('Distribution of Rainfall Classes')
plt.xticks([0, 1, 2, 3, 4], ['Very Low', 'Low', 'Medium', 'High', 'Very High'])

plt.subplot(1, 2, 2)
class_names = ['Very Low\n(<5mm)', 'Low\n(5-20mm)', 'Medium\n(20-50mm)', 
               'High\n(50-100mm)', 'Very High\n(>100mm)']
unique, counts = np.unique(y, return_counts=True)
plt.bar(unique, counts, edgecolor='black')
plt.xlabel('Rainfall Class')
plt.ylabel('Count')
plt.title('Rainfall Class Distribution')
plt.xticks(unique, class_names, rotation=45, ha='right')
plt.tight_layout()
plt.show()


# ============================================================================
# 5. SPLIT DATA (80% TRAINING, 20% TESTING)
# ============================================================================

# Uncomment untuk split data:

# Split data sesuai paper: 80% training, 20% testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

# Convert target ke categorical (one-hot encoding)
from tensorflow.keras.utils import to_categorical
y_train_cat = to_categorical(y_train, num_classes=5)
y_test_cat = to_categorical(y_test, num_classes=5)


# ============================================================================
# 6. BUILD RNN-LSTM MODEL
# ============================================================================

def build_lstm_model(input_shape, num_classes=5, lstm_units=64, dropout_rate=0.2):
    model = Sequential([
        # Input + LSTM Layer (64 units)
        LSTM(lstm_units, input_shape=input_shape, return_sequences=False),
        
        # Dropout untuk mencegah overfitting
        Dropout(dropout_rate),
        
        # Dense Layer (13 neurons dengan ReLU activation)
        Dense(13, activation='relu'),
        
        # Output Layer (5 classes dengan Sigmoid activation)
        Dense(num_classes, activation='softmax')
    ])
    
    return model

# Uncomment untuk build model:
# Input shape: (sequence_length, n_features)
input_shape = (X_train.shape[1], X_train.shape[2])

model = build_lstm_model(input_shape, num_classes=5, lstm_units=64, dropout_rate=0.2)

# Print model summary
model.summary()


# ============================================================================
# 7. COMPILE MODEL
# ============================================================================

# Uncomment untuk compile model:

# Sesuai paper: menggunakan Adam optimizer dengan learning rate 0.001
optimizer = Adam(learning_rate=0.001)

model.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

print("Model berhasil dikompilasi!")


# ============================================================================
# 8. TRAINING MODEL
# ============================================================================

# Uncomment untuk training:

# Callbacks
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=50,
    restore_best_weights=True,
    verbose=1
)

checkpoint = ModelCheckpoint(
    'best_rainfall_model.h5',
    monitor='val_accuracy',
    save_best_only=True,
    verbose=1
)

# Training dengan 500 epochs sesuai paper
history = model.fit(
    X_train, y_train_cat,
    epochs=500,
    batch_size=32,
    validation_data=(X_test, y_test_cat),
    callbacks=[early_stopping, checkpoint],
    verbose=1
)

print("Training selesai!")

# ============================================================================
# 9. VISUALISASI HASIL TRAINING
# ============================================================================

def plot_training_history(history):
    fig, axes = plt.subplots(1, 2, figsize=(15, 5))
    
    # Plot Accuracy
    axes[0].plot(history.history['accuracy'], label='Training Accuracy', linewidth=2)
    axes[0].plot(history.history['val_accuracy'], label='Validation Accuracy', linewidth=2)
    axes[0].set_xlabel('Epoch', fontsize=12)
    axes[0].set_ylabel('Accuracy', fontsize=12)
    axes[0].set_title('Model Accuracy', fontsize=14, fontweight='bold')
    axes[0].legend(fontsize=10)
    axes[0].grid(True, alpha=0.3)
    
    # Plot Loss
    axes[1].plot(history.history['loss'], label='Training Loss', linewidth=2)
    axes[1].plot(history.history['val_loss'], label='Validation Loss', linewidth=2)
    axes[1].set_xlabel('Epoch', fontsize=12)
    axes[1].set_ylabel('Loss', fontsize=12)
    axes[1].set_title('Model Loss', fontsize=14, fontweight='bold')
    axes[1].legend(fontsize=10)
    axes[1].grid(True, alpha=0.3)
    
    plt.tight_layout()
    plt.show()

# Uncomment untuk visualisasi:

plot_training_history(history)


# ============================================================================
# 10. EVALUASI MODEL
# ============================================================================

# Uncomment untuk evaluasi:

# Evaluasi pada training data
train_loss, train_accuracy = model.evaluate(X_train, y_train_cat, verbose=0)
print(f"Training Accuracy: {train_accuracy*100:.2f}%")
print(f"Training Loss: {train_loss:.4f}")

# Evaluasi pada test data
test_loss, test_accuracy = model.evaluate(X_test, y_test_cat, verbose=0)
print(f"Test Accuracy: {test_accuracy*100:.2f}%")
print(f"Test Loss: {test_loss:.4f}")

# Prediksi
y_pred_probs = model.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)

# Classification Report
from sklearn.metrics import classification_report, confusion_matrix

print("\nClassification Report:")
class_names = ['Very Low', 'Low', 'Medium', 'High', 'Very High']
print(classification_report(y_test, y_pred, target_names=class_names))

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted', fontsize=12)
plt.ylabel('Actual', fontsize=12)
plt.title('Confusion Matrix', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

# ============================================================================
# 11. COMPARISON: RNN vs CNN (OPSIONAL)
# ============================================================================

def build_cnn_model(input_shape, num_classes=5):
    from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten
    
    model = Sequential([
        Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=input_shape),
        MaxPooling1D(pool_size=2),
        Conv1D(filters=32, kernel_size=3, activation='relu'),
        MaxPooling1D(pool_size=2),
        Flatten(),
        Dense(64, activation='relu'),
        Dropout(0.2),
        Dense(num_classes, activation='softmax')
    ])
    
    return model

# Uncomment untuk perbandingan RNN vs CNN:
print("\n" + "="*60)
print("PERBANDINGAN RNN vs CNN")
print("="*60)

# Build CNN model
cnn_model = build_cnn_model(input_shape, num_classes=5)
cnn_model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

# Train CNN
history_cnn = cnn_model.fit(
    X_train, y_train_cat,
    epochs=500,
    batch_size=32,
    validation_data=(X_test, y_test_cat),
    callbacks=[EarlyStopping(patience=50, restore_best_weights=True)],
    verbose=0
)

# Evaluate CNN
cnn_train_loss, cnn_train_acc = cnn_model.evaluate(X_train, y_train_cat, verbose=0)
cnn_test_loss, cnn_test_acc = cnn_model.evaluate(X_test, y_test_cat, verbose=0)

# Comparison Table
comparison_df = pd.DataFrame({
    'Method': ['RNN (LSTM)', 'CNN (1D)'],
    'Training Accuracy (%)': [train_accuracy*100, cnn_train_acc*100],
    'Training Loss': [train_loss, cnn_train_loss],
    'Test Accuracy (%)': [test_accuracy*100, cnn_test_acc*100],
    'Test Loss': [test_loss, cnn_test_loss]
})

print(comparison_df.to_string(index=False))


# ============================================================================
# 12. COMPARISON: SGD vs ADAM OPTIMIZER (OPSIONAL)
# ============================================================================

# Uncomment untuk perbandingan SGD vs Adam:
print("\n" + "="*60)
print("PERBANDINGAN SGD vs ADAM OPTIMIZER")
print("="*60)

# Build model dengan SGD
model_sgd = build_lstm_model(input_shape, num_classes=5)
model_sgd.compile(
    optimizer=SGD(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

# Train dengan SGD
history_sgd = model_sgd.fit(
    X_train, y_train_cat,
    epochs=500,
    batch_size=32,
    validation_data=(X_test, y_test_cat),
    callbacks=[EarlyStopping(patience=50, restore_best_weights=True)],
    verbose=0
)

# Evaluate SGD
sgd_train_loss, sgd_train_acc = model_sgd.evaluate(X_train, y_train_cat, verbose=0)
sgd_test_loss, sgd_test_acc = model_sgd.evaluate(X_test, y_test_cat, verbose=0)

# Comparison Table
optimizer_comparison = pd.DataFrame({
    'Optimizer': ['Adam', 'SGD'],
    'Training Accuracy (%)': [train_accuracy*100, sgd_train_acc*100],
    'Training Loss': [train_loss, sgd_train_loss],
    'Test Accuracy (%)': [test_accuracy*100, sgd_test_acc*100],
    'Test Loss': [test_loss, sgd_test_loss]
})

print(optimizer_comparison.to_string(index=False))


# ============================================================================
# 13. TESTING DIFFERENT LEARNING RATES (OPSIONAL)
# ============================================================================

# Uncomment untuk testing learning rates:

print("\n" + "="*60)
print("TESTING DIFFERENT LEARNING RATES")
print("="*60)

learning_rates = [0.001, 0.002, 0.010, 0.040, 0.100, 0.400, 0.600, 0.800]
lr_results = []

for lr in learning_rates:
    print(f"\nTesting learning rate: {lr}")
    
    # Build and compile model
    model_lr = build_lstm_model(input_shape, num_classes=5)
    model_lr.compile(
        optimizer=Adam(learning_rate=lr),
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )
    
    # Train
    history_lr = model_lr.fit(
        X_train, y_train_cat,
        epochs=500,
        batch_size=32,
        validation_data=(X_test, y_test_cat),
        callbacks=[EarlyStopping(patience=30, restore_best_weights=True)],
        verbose=0
    )
    
    # Evaluate
    train_loss_lr, train_acc_lr = model_lr.evaluate(X_train, y_train_cat, verbose=0)
    test_loss_lr, test_acc_lr = model_lr.evaluate(X_test, y_test_cat, verbose=0)
    
    lr_results.append({
        'Learning Rate': lr,
        'Training Accuracy (%)': train_acc_lr*100,
        'Training Loss': train_loss_lr,
        'Test Accuracy (%)': test_acc_lr*100,
        'Test Loss': test_loss_lr
    })

lr_df = pd.DataFrame(lr_results)
print("\n" + lr_df.to_string(index=False))

# Visualize learning rate comparison
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

axes[0].plot(lr_df['Learning Rate'], lr_df['Training Accuracy (%)'], 
             marker='o', label='Training', linewidth=2)
axes[0].plot(lr_df['Learning Rate'], lr_df['Test Accuracy (%)'], 
             marker='s', label='Test', linewidth=2)
axes[0].set_xlabel('Learning Rate', fontsize=12)
axes[0].set_ylabel('Accuracy (%)', fontsize=12)
axes[0].set_title('Accuracy vs Learning Rate', fontsize=14, fontweight='bold')
axes[0].legend()
axes[0].grid(True, alpha=0.3)
axes[0].set_xscale('log')

axes[1].plot(lr_df['Learning Rate'], lr_df['Training Loss'], 
             marker='o', label='Training', linewidth=2)
axes[1].plot(lr_df['Learning Rate'], lr_df['Test Loss'], 
             marker='s', label='Test', linewidth=2)
axes[1].set_xlabel('Learning Rate', fontsize=12)
axes[1].set_ylabel('Loss', fontsize=12)
axes[1].set_title('Loss vs Learning Rate', fontsize=14, fontweight='bold')
axes[1].legend()
axes[1].grid(True, alpha=0.3)
axes[1].set_xscale('log')

plt.tight_layout()
plt.show()

# ============================================================================
# 14. PREDICTION FUNCTION
# ============================================================================

def predict_rainfall(model, input_sequence, scaler):
    # Reshape untuk prediksi
    input_seq = input_sequence.reshape(1, input_sequence.shape[0], input_sequence.shape[1])
    
    # Prediksi
    prediction_probs = model.predict(input_seq, verbose=0)
    predicted_class = np.argmax(prediction_probs, axis=1)[0]
    confidence = prediction_probs[0][predicted_class] * 100
    
    # Mapping kelas ke nama
    class_mapping = {
        0: 'Very Low (<5mm)',
        1: 'Low (5-20mm)',
        2: 'Medium (20-50mm)',
        3: 'High (50-100mm)',
        4: 'Very High (>100mm)'
    }
    
    class_name = class_mapping[predicted_class]
    
    return predicted_class, class_name, confidence

# Uncomment untuk testing prediksi
# Contoh prediksi
sample_sequence = X_test[0]
pred_class, pred_name, pred_confidence = predict_rainfall(model, sample_sequence, scaler)

print(f"\nPrediksi Curah Hujan Minggu Depan:")
print(f"Kelas: {pred_name}")
print(f"Confidence: {pred_confidence:.2f}%")
print(f"Actual: {class_mapping[y_test[0]]}")


# ============================================================================
# 15. SAVE AND LOAD MODEL
# ============================================================================

# Uncomment untuk save model:

# Save model
model.save('rainfall_forecast_model.h5')
print("Model berhasil disimpan!")

# Save scaler
import joblib
joblib.dump(scaler, 'scaler.pkl')
print("Scaler berhasil disimpan!")


# Uncomment untuk load model:

# Load model
from tensorflow.keras.models import load_model
loaded_model = load_model('rainfall_forecast_model.h5')

# Load scaler
loaded_scaler = joblib.load('scaler.pkl')

print("Model dan scaler berhasil dimuat!")


print("\n" + "="*60)
print("SCRIPT SELESAI!")
print("="*60)
print("\nCatatan:")
print("- Uncomment bagian-bagian kode sesuai kebutuhan")
print("- Pastikan data sudah dimuat dengan benar sebelum menjalankan")
print("- Sesuaikan path file untuk load dan save data/model")
print("="*60)

In [None]:
# Rainfall Forecasting using Recurrent Neural Networks (RNN-LSTM)
# Berdasarkan paper: Prasetya & Djamal (2019)

# ============================================================================
# 1. IMPORT LIBRARIES
# ============================================================================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import warnings
warnings.filterwarnings('ignore')

# Set random seed untuk reproducibility
np.random.seed(42)
keras.utils.set_random_seed(42)

print("Library berhasil diimport!")

# ============================================================================
# 2. LOAD DATA
# ============================================================================
# Ganti dengan path file Anda
df = pd.read_excel('updated_dataset_final - Copy.xlsx')

# Contoh struktur data yang diharapkan:


# UNCOMMENT baris berikut dan sesuaikan path file:
# df = pd.read_csv('data_iklim.csv')
df['Tanggal'] = pd.to_datetime(df['Tanggal'])
df = df.sort_values('Tanggal').reset_index(drop=True)

print("Data berhasil dimuat!")
# print(f"Jumlah data: {len(df)}")
# print(f"\nInfo data:\n{df.info()}")
# print(f"\nSample data:\n{df.head()}")

# ============================================================================
# 3. PREPROCESSING DATA
# ============================================================================

def interpolate_missing_data(df):

    df_clean = df.copy()
    
    # Interpolasi untuk kolom numerik
    numeric_cols = df_clean.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        df_clean[col] = df_clean[col].interpolate(method='linear', limit_direction='both')
    
    return df_clean

def extract_weekly_features(df):
    df_weekly = df.copy()
    df_weekly['Week'] = df_weekly['Tanggal'].dt.isocalendar().week
    df_weekly['Year'] = df_weekly['Tanggal'].dt.year
    
    # Agregasi mingguan - ambil nilai maksimum (HANYA 3 KOLOM)
    weekly_data = df_weekly.groupby(['Year', 'Week']).agg({
        'Temperatur Rata-rata': 'max',
        'Kelembapan Rata-rata': 'max',
        'Curah Hujan (mm)': 'max',
        'Tanggal': 'last'
    }).reset_index()
    
    return weekly_data

def normalize_data(df, feature_cols):
    scaler = MinMaxScaler(feature_range=(0, 1))
    df_normalized = df.copy()
    df_normalized[feature_cols] = scaler.fit_transform(df[feature_cols])
    
    return df_normalized, scaler

def classify_rainfall(rainfall):
    if rainfall < 5:
        return 0
    elif rainfall < 20:
        return 1
    elif rainfall < 50:
        return 2
    elif rainfall < 100:
        return 3
    else:
        return 4

# Uncomment untuk menjalankan preprocessing:

# 3.1 Interpolasi missing data
df = interpolate_missing_data(df)
print("Missing data berhasil diinterpolasi!")

# 3.2 Ekstraksi fitur mingguan
df_weekly = extract_weekly_features(df)
print(f"Data mingguan berhasil diekstrak! Total: {len(df_weekly)} minggu")

# 3.3 Klasifikasi curah hujan
df_weekly['Rainfall_Class'] = df_weekly['Curah Hujan (mm)'].apply(classify_rainfall)

# 3.4 Normalisasi - HANYA 3 KOLOM SESUAI PAPER
feature_cols = ['Temperatur Rata-rata', 'Kelembapan Rata-rata', 'Curah Hujan (mm)']

df_normalized, scaler = normalize_data(df_weekly, feature_cols)
print("Data berhasil dinormalisasi!")
print(f"Jumlah fitur yang digunakan: {len(feature_cols)}")


# ============================================================================
# 4. PREPARE SEQUENCES (OVERLAPPING WINDOWS)
# ============================================================================

def create_sequences(data, feature_cols, target_col, sequence_length=52):
    X, y = [], []
    
    for i in range(len(data) - sequence_length):
        # Ambil sequence 52 minggu
        sequence = data[feature_cols].iloc[i:i+sequence_length].values
        # Target adalah minggu ke-53 (prediksi minggu depan)
        target = data[target_col].iloc[i+sequence_length]
        
        X.append(sequence)
        y.append(target)
    
    return np.array(X), np.array(y)

# Uncomment untuk membuat sequences:

sequence_length = 52  # 1 tahun = 52 minggu

X, y = create_sequences(df_normalized, feature_cols, 'Rainfall_Class', sequence_length)

print(f"Shape X: {X.shape}")  # (samples, 52, n_features)
print(f"Shape y: {y.shape}")  # (samples,)
print(f"Total sequences: {len(X)}")

# Visualisasi distribusi kelas
plt.figure(figsize=(10, 5))
plt.subplot(1, 2, 1)
plt.hist(y, bins=5, edgecolor='black')
plt.xlabel('Rainfall Class')
plt.ylabel('Frequency')
plt.title('Distribution of Rainfall Classes')
plt.xticks([0, 1, 2, 3, 4], ['Very Low', 'Low', 'Medium', 'High', 'Very High'])

plt.subplot(1, 2, 2)
class_names = ['Very Low\n(<5mm)', 'Low\n(5-20mm)', 'Medium\n(20-50mm)', 
               'High\n(50-100mm)', 'Very High\n(>100mm)']
unique, counts = np.unique(y, return_counts=True)
plt.bar(unique, counts, edgecolor='black')
plt.xlabel('Rainfall Class')
plt.ylabel('Count')
plt.title('Rainfall Class Distribution')
plt.xticks(unique, class_names, rotation=45, ha='right')
plt.tight_layout()
plt.show()


# ============================================================================
# 5. SPLIT DATA (80% TRAINING, 20% TESTING)
# ============================================================================

# Uncomment untuk split data:

# Split data sesuai paper: 80% training, 20% testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

# Convert target ke categorical (one-hot encoding)
from tensorflow.keras.utils import to_categorical
y_train_cat = to_categorical(y_train, num_classes=5)
y_test_cat = to_categorical(y_test, num_classes=5)


# ============================================================================
# 6. BUILD RNN-LSTM MODEL
# ============================================================================

def build_lstm_model(input_shape, num_classes=5, lstm_units=64, dropout_rate=0.2):
    model = Sequential([
        # Input + LSTM Layer (64 units)
        LSTM(lstm_units, input_shape=input_shape, return_sequences=False),
        
        # Dropout untuk mencegah overfitting
        Dropout(dropout_rate),
        
        # Dense Layer (13 neurons dengan ReLU activation)
        Dense(13, activation='relu'),
        
        # Output Layer (5 classes dengan Sigmoid activation)
        Dense(num_classes, activation='softmax')
    ])
    
    return model

# Uncomment untuk build model:

# Input shape: (sequence_length, n_features)
input_shape = (X_train.shape[1], X_train.shape[2])

model = build_lstm_model(input_shape, num_classes=5, lstm_units=64, dropout_rate=0.2)

# Print model summary
model.summary()


# ============================================================================
# 7. COMPILE MODEL
# ============================================================================

# Uncomment untuk compile model:

# Sesuai paper: menggunakan Adam optimizer dengan learning rate 0.001
optimizer = Adam(learning_rate=0.001)

model.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

print("Model berhasil dikompilasi!")


# ============================================================================
# 8. TRAINING MODEL
# ============================================================================

# Uncomment untuk training:

# Callbacks
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=50,
    restore_best_weights=True,
    verbose=1
)

checkpoint = ModelCheckpoint(
    'best_rainfall_model.h5',
    monitor='val_accuracy',
    save_best_only=True,
    verbose=1
)

# Training dengan 500 epochs sesuai paper
history = model.fit(
    X_train, y_train_cat,
    epochs=500,
    batch_size=32,
    validation_data=(X_test, y_test_cat),
    callbacks=[early_stopping, checkpoint],
    verbose=1
)

print("Training selesai!")


# ============================================================================
# 9. VISUALISASI HASIL TRAINING
# ============================================================================

def plot_training_history(history):

    fig, axes = plt.subplots(1, 2, figsize=(15, 5))
    
    # Plot Accuracy
    axes[0].plot(history.history['accuracy'], label='Training Accuracy', linewidth=2)
    axes[0].plot(history.history['val_accuracy'], label='Validation Accuracy', linewidth=2)
    axes[0].set_xlabel('Epoch', fontsize=12)
    axes[0].set_ylabel('Accuracy', fontsize=12)
    axes[0].set_title('Model Accuracy', fontsize=14, fontweight='bold')
    axes[0].legend(fontsize=10)
    axes[0].grid(True, alpha=0.3)
    
    # Plot Loss
    axes[1].plot(history.history['loss'], label='Training Loss', linewidth=2)
    axes[1].plot(history.history['val_loss'], label='Validation Loss', linewidth=2)
    axes[1].set_xlabel('Epoch', fontsize=12)
    axes[1].set_ylabel('Loss', fontsize=12)
    axes[1].set_title('Model Loss', fontsize=14, fontweight='bold')
    axes[1].legend(fontsize=10)
    axes[1].grid(True, alpha=0.3)
    
    plt.tight_layout()
    plt.show()

# Uncomment untuk visualisasi:
plot_training_history(history)


# ============================================================================
# 10. EVALUASI MODEL
# ============================================================================

# Uncomment untuk evaluasi:

# Evaluasi pada training data
train_loss, train_accuracy = model.evaluate(X_train, y_train_cat, verbose=0)
print(f"Training Accuracy: {train_accuracy*100:.2f}%")
print(f"Training Loss: {train_loss:.4f}")

# Evaluasi pada test data
test_loss, test_accuracy = model.evaluate(X_test, y_test_cat, verbose=0)
print(f"Test Accuracy: {test_accuracy*100:.2f}%")
print(f"Test Loss: {test_loss:.4f}")

# Prediksi
y_pred_probs = model.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)

# Classification Report
from sklearn.metrics import classification_report, confusion_matrix

print("\nClassification Report:")
class_names = ['Very Low', 'Low', 'Medium', 'High', 'Very High']
print(classification_report(y_test, y_pred, target_names=class_names))

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted', fontsize=12)
plt.ylabel('Actual', fontsize=12)
plt.title('Confusion Matrix', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()


# ============================================================================
# 11. COMPARISON: RNN vs CNN (OPSIONAL)
# ============================================================================

def build_cnn_model(input_shape, num_classes=5):
    from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten
    
    model = Sequential([
        Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=input_shape),
        MaxPooling1D(pool_size=2),
        Conv1D(filters=32, kernel_size=3, activation='relu'),
        MaxPooling1D(pool_size=2),
        Flatten(),
        Dense(64, activation='relu'),
        Dropout(0.2),
        Dense(num_classes, activation='softmax')
    ])
    
    return model

# Uncomment untuk perbandingan RNN vs CNN:
print("\n" + "="*60)
print("PERBANDINGAN RNN vs CNN")
print("="*60)

# Build CNN model
cnn_model = build_cnn_model(input_shape, num_classes=5)
cnn_model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

# Train CNN
history_cnn = cnn_model.fit(
    X_train, y_train_cat,
    epochs=500,
    batch_size=32,
    validation_data=(X_test, y_test_cat),
    callbacks=[EarlyStopping(patience=50, restore_best_weights=True)],
    verbose=0
)

# Evaluate CNN
cnn_train_loss, cnn_train_acc = cnn_model.evaluate(X_train, y_train_cat, verbose=0)
cnn_test_loss, cnn_test_acc = cnn_model.evaluate(X_test, y_test_cat, verbose=0)

# Comparison Table
comparison_df = pd.DataFrame({
    'Method': ['RNN (LSTM)', 'CNN (1D)'],
    'Training Accuracy (%)': [train_accuracy*100, cnn_train_acc*100],
    'Training Loss': [train_loss, cnn_train_loss],
    'Test Accuracy (%)': [test_accuracy*100, cnn_test_acc*100],
    'Test Loss': [test_loss, cnn_test_loss]
})

print(comparison_df.to_string(index=False))

# ============================================================================
# 12. COMPARISON: SGD vs ADAM OPTIMIZER (OPSIONAL)
# ============================================================================

# Uncomment untuk perbandingan SGD vs Adam:

print("\n" + "="*60)
print("PERBANDINGAN SGD vs ADAM OPTIMIZER")
print("="*60)

# Build model dengan SGD
model_sgd = build_lstm_model(input_shape, num_classes=5)
model_sgd.compile(
    optimizer=SGD(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

# Train dengan SGD
history_sgd = model_sgd.fit(
    X_train, y_train_cat,
    epochs=500,
    batch_size=32,
    validation_data=(X_test, y_test_cat),
    callbacks=[EarlyStopping(patience=50, restore_best_weights=True)],
    verbose=0
)

# Evaluate SGD
sgd_train_loss, sgd_train_acc = model_sgd.evaluate(X_train, y_train_cat, verbose=0)
sgd_test_loss, sgd_test_acc = model_sgd.evaluate(X_test, y_test_cat, verbose=0)

# Comparison Table
optimizer_comparison = pd.DataFrame({
    'Optimizer': ['Adam', 'SGD'],
    'Training Accuracy (%)': [train_accuracy*100, sgd_train_acc*100],
    'Training Loss': [train_loss, sgd_train_loss],
    'Test Accuracy (%)': [test_accuracy*100, sgd_test_acc*100],
    'Test Loss': [test_loss, sgd_test_loss]
})

print(optimizer_comparison.to_string(index=False))


# ============================================================================
# 13. TESTING DIFFERENT LEARNING RATES (OPSIONAL)
# ============================================================================

# Uncomment untuk testing learning rates:

print("\n" + "="*60)
print("TESTING DIFFERENT LEARNING RATES")
print("="*60)

learning_rates = [0.001, 0.002, 0.010, 0.040, 0.100, 0.400, 0.600, 0.800]
lr_results = []

for lr in learning_rates:
    print(f"\nTesting learning rate: {lr}")
    
    # Build and compile model
    model_lr = build_lstm_model(input_shape, num_classes=5)
    model_lr.compile(
        optimizer=Adam(learning_rate=lr),
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )
    
    # Train
    history_lr = model_lr.fit(
        X_train, y_train_cat,
        epochs=500,
        batch_size=32,
        validation_data=(X_test, y_test_cat),
        callbacks=[EarlyStopping(patience=30, restore_best_weights=True)],
        verbose=0
    )
    
    # Evaluate
    train_loss_lr, train_acc_lr = model_lr.evaluate(X_train, y_train_cat, verbose=0)
    test_loss_lr, test_acc_lr = model_lr.evaluate(X_test, y_test_cat, verbose=0)
    
    lr_results.append({
        'Learning Rate': lr,
        'Training Accuracy (%)': train_acc_lr*100,
        'Training Loss': train_loss_lr,
        'Test Accuracy (%)': test_acc_lr*100,
        'Test Loss': test_loss_lr
    })

lr_df = pd.DataFrame(lr_results)
print("\n" + lr_df.to_string(index=False))

# Visualize learning rate comparison
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

axes[0].plot(lr_df['Learning Rate'], lr_df['Training Accuracy (%)'], 
             marker='o', label='Training', linewidth=2)
axes[0].plot(lr_df['Learning Rate'], lr_df['Test Accuracy (%)'], 
             marker='s', label='Test', linewidth=2)
axes[0].set_xlabel('Learning Rate', fontsize=12)
axes[0].set_ylabel('Accuracy (%)', fontsize=12)
axes[0].set_title('Accuracy vs Learning Rate', fontsize=14, fontweight='bold')
axes[0].legend()
axes[0].grid(True, alpha=0.3)
axes[0].set_xscale('log')

axes[1].plot(lr_df['Learning Rate'], lr_df['Training Loss'], 
             marker='o', label='Training', linewidth=2)
axes[1].plot(lr_df['Learning Rate'], lr_df['Test Loss'], 
             marker='s', label='Test', linewidth=2)
axes[1].set_xlabel('Learning Rate', fontsize=12)
axes[1].set_ylabel('Loss', fontsize=12)
axes[1].set_title('Loss vs Learning Rate', fontsize=14, fontweight='bold')
axes[1].legend()
axes[1].grid(True, alpha=0.3)
axes[1].set_xscale('log')

plt.tight_layout()
plt.show()


# ============================================================================
# 14. PREDICTION FUNCTION
# ============================================================================

def predict_rainfall(model, input_sequence, scaler):
    # Reshape untuk prediksi
    input_seq = input_sequence.reshape(1, input_sequence.shape[0], input_sequence.shape[1])
    
    # Prediksi
    prediction_probs = model.predict(input_seq, verbose=0)
    predicted_class = np.argmax(prediction_probs, axis=1)[0]
    confidence = prediction_probs[0][predicted_class] * 100
    
    # Mapping kelas ke nama
    class_mapping = {
        0: 'Very Low (<5mm)',
        1: 'Low (5-20mm)',
        2: 'Medium (20-50mm)',
        3: 'High (50-100mm)',
        4: 'Very High (>100mm)'
    }
    
    class_name = class_mapping[predicted_class]
    
    return predicted_class, class_name, confidence

# Uncomment untuk testing prediksi:
# Contoh prediksi
sample_sequence = X_test[0]
pred_class, pred_name, pred_confidence = predict_rainfall(model, sample_sequence, scaler)

print(f"\nPrediksi Curah Hujan Minggu Depan:")
print(f"Kelas: {pred_name}")
print(f"Confidence: {pred_confidence:.2f}%")
print(f"Actual: {class_mapping[y_test[0]]}")


# ============================================================================
# 15. SAVE AND LOAD MODEL
# ============================================================================

# Uncomment untuk save model:
# Save model
model.save('rainfall_forecast_model.h5')
print("Model berhasil disimpan!")

# Save scaler
import joblib
joblib.dump(scaler, 'scaler.pkl')
print("Scaler berhasil disimpan!")

# Uncomment untuk load model:

# Load model
from tensorflow.keras.models import load_model
loaded_model = load_model('rainfall_forecast_model.h5')

# Load scaler
loaded_scaler = joblib.load('scaler.pkl')

print("Model dan scaler berhasil dimuat!")

print("\n" + "="*60)
print("SCRIPT SELESAI!")
print("="*60)
print("\nCatatan:")
print("- Uncomment bagian-bagian kode sesuai kebutuhan")
print("- Pastikan data sudah dimuat dengan benar sebelum menjalankan")
print("- Sesuaikan path file untuk load dan save data/model")
print("="*60)

In [17]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    roc_auc_score, classification_report
)

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping


In [18]:
# Load data
df = pd.read_excel("updated_dataset_final.xlsx")

# Pastikan format tanggal
df["Tanggal"] = pd.to_datetime(df["Tanggal"])

# Sort time (WAJIB)
df = df.sort_values("Tanggal").reset_index(drop=True)

In [19]:
MISSING_CODE = 8888
df.replace(MISSING_CODE, np.nan, inplace=True)

df["RainTomorrow"] = (df["Curah Hujan (mm)"].shift(-1) >= 1.0).astype(int)
df = df.iloc[:-1]  # drop baris terakhir (target NaN)

df = df.drop(columns=["Curah Hujan (mm)"])

In [20]:
df["Rain_lag1"] = df["RainTomorrow"].shift(1)
df["Rain_lag3"] = df["RainTomorrow"].shift(3)

df["wind_sin"] = np.sin(np.deg2rad(df["Arah Angin Terbanyak (°)"]))
df["wind_cos"] = np.cos(np.deg2rad(df["Arah Angin Terbanyak (°)"]))

df = df.drop(columns=["Arah Angin Terbanyak (°)"])


In [None]:
feature_cols = df.columns.drop(["Tanggal", "RainTomorrow"])

for col in feature_cols:
    df[col + "_missing"] = df[col].isna().astype(int)


In [None]:
# =========================
# 0. IMPORT LIBRARY
# =========================
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    roc_auc_score, classification_report
)

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping


# =========================
# 1. LOAD & SORT DATA
# =========================
df = pd.read_excel("updated_dataset_final.xlsx")

df["Tanggal"] = pd.to_datetime(df["Tanggal"])
df = df.sort_values("Tanggal").reset_index(drop=True)


# =========================
# 2. HANDLE 8888 → NaN
# =========================
MISSING_CODE = 8888
df.replace(MISSING_CODE, np.nan, inplace=True)


# =========================
# 3. DEFINE TARGET (RAIN TOMORROW)
# =========================
df["RainTomorrow"] = (df["Curah Hujan (mm)"].shift(-1) >= 1.0).astype(int)
df = df.iloc[:-1]   # drop baris terakhir


# =========================
# 4. DROP LEAKAGE FEATURE
# =========================
df = df.drop(columns=["Curah Hujan (mm)"])


# =========================
# 5. FEATURE ENGINEERING
# =========================

# Lag hujan (masa lalu saja)
df["Rain_lag1"] = df["RainTomorrow"].shift(1)
df["Rain_lag3"] = df["RainTomorrow"].shift(3)

# Arah angin → circular encoding
df["wind_sin"] = np.sin(np.deg2rad(df["Arah Angin Terbanyak (°)"]))
df["wind_cos"] = np.cos(np.deg2rad(df["Arah Angin Terbanyak (°)"]))

df = df.drop(columns=["Arah Angin Terbanyak (°)"])


# =========================
# 6. MISSING INDICATOR
# =========================
feature_cols = df.columns.drop(["Tanggal", "RainTomorrow"])

for col in feature_cols:
    df[col + "_missing"] = df[col].isna().astype(int)


# =========================
# 7. TIME-BASED SPLIT
# =========================
n = len(df)

train_end = int(0.7 * n)
val_end   = int(0.85 * n)

train = df.iloc[:train_end]
val   = df.iloc[train_end:val_end]
test  = df.iloc[val_end:]


# =========================
# 8. IMPUTATION (NO LEAKAGE)
# =========================
train = train.ffill()
val   = val.ffill()
test  = test.ffill()

median_vals = train.median(numeric_only=True)

train = train.fillna(median_vals)
val   = val.fillna(median_vals)
test  = test.fillna(median_vals)


# =========================
# 9. PREPARE X & y
# =========================
X_train = train.drop(columns=["Tanggal", "RainTomorrow"])
y_train = train["RainTomorrow"]

X_val = val.drop(columns=["Tanggal", "RainTomorrow"])
y_val = val["RainTomorrow"]

X_test = test.drop(columns=["Tanggal", "RainTomorrow"])
y_test = test["RainTomorrow"]


# =========================
# 10. SCALING (TRAIN ONLY)
# =========================
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_val   = scaler.transform(X_val)
X_test  = scaler.transform(X_test)


# =========================
# 11. WINDOWING FOR RNN
# =========================
def make_sequences(X, y, window=7):
    X_seq, y_seq = [], []
    for i in range(window, len(X)):
        X_seq.append(X[i-window:i])
        y_seq.append(y.iloc[i])
    return np.array(X_seq), np.array(y_seq)

WINDOW = 7

X_train_seq, y_train_seq = make_sequences(X_train, y_train, WINDOW)
X_val_seq, y_val_seq     = make_sequences(X_val, y_val, WINDOW)
X_test_seq, y_test_seq   = make_sequences(X_test, y_test, WINDOW)


# =========================
# 12. BUILD RNN MODEL
# =========================
model = Sequential([
    LSTM(32, input_shape=(X_train_seq.shape[1], X_train_seq.shape[2])),
    Dropout(0.2),
    Dense(1, activation="sigmoid")
])

model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"]
)


# =========================
# 13. TRAIN MODEL
# =========================
early_stop = EarlyStopping(
    monitor="val_loss",
    patience=5,
    restore_best_weights=True
)

history = model.fit(
    X_train_seq, y_train_seq,
    validation_data=(X_val_seq, y_val_seq),
    epochs=50,
    batch_size=32,
    callbacks=[early_stop],
    verbose=1
)


# =========================
# 14. EVALUATION (TEST SET)
# =========================
y_prob = model.predict(X_test_seq).ravel()
y_pred = (y_prob >= 0.5).astype(int)

print("\n===== TEST PERFORMANCE =====")
print("Accuracy :", accuracy_score(y_test_seq, y_pred))
print("Precision:", precision_score(y_test_seq, y_pred))
print("Recall   :", recall_score(y_test_seq, y_pred))
print("ROC AUC  :", roc_auc_score(y_test_seq, y_prob))

print("\nClassification Report:\n")
print(classification_report(y_test_seq, y_pred))


# =========================
# 15. DAILY WEATHER REPORT
# =========================
latest_prob = y_prob[-1] * 100

print("\n===== LAPORAN CUACA BESOK =====")
print("Prediksi Hujan :", "YA" if latest_prob >= 50 else "TIDAK")
print(f"Probabilitas   : {latest_prob:.2f}%")


Epoch 1/50
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - accuracy: 0.6625 - loss: 0.6072 - val_accuracy: 0.7030 - val_loss: 0.5945
Epoch 2/50
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.6739 - loss: 0.5818 - val_accuracy: 0.7047 - val_loss: 0.5923
Epoch 3/50
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.6764 - loss: 0.5789 - val_accuracy: 0.7097 - val_loss: 0.5928
Epoch 4/50
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.6825 - loss: 0.5716 - val_accuracy: 0.6997 - val_loss: 0.5915
Epoch 5/50
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.6842 - loss: 0.5715 - val_accuracy: 0.6930 - val_loss: 0.5923
Epoch 6/50
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.6907 - loss: 0.5693 - val_accuracy: 0.6980 - val_loss: 0.5912
Epoch 7/50
[1m88/88[0m [32m━━━━━━━━━━

In [26]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test_seq, y_pred)

print("\nConfusion Matrix:")
print(cm)


Confusion Matrix:
[[293  72]
 [143  88]]


In [4]:
# =====================================================================
#               WEATHER RAIN PREDICTION — FULL SCRIPT
# =====================================================================

import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    roc_auc_score, classification_report, confusion_matrix
)
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping


# ===========================================================
# 1. LOAD DATA
# ===========================================================
df = pd.read_excel("Updated_Data_Historis_2015_2025.xlsx")
df["Tanggal"] = pd.to_datetime(df["Tanggal"])
df = df.sort_values("Tanggal").reset_index(drop=True)

# ===========================================================
# 2. HANDLE MISSING 8888 → NaN
# ===========================================================
df.replace(8888, np.nan, inplace=True)



# FIX: pastikan Curah Hujan (mm) numerik
df["Curah Hujan (mm)"] = (
    df["Curah Hujan (mm)"]
    .astype(str)
    .str.replace(",", ".", regex=False)
    .str.strip()
)

df["Curah Hujan (mm)"] = pd.to_numeric(df["Curah Hujan (mm)"], errors="coerce")

# TARGET: hujan besok
df["RainTomorrow"] = (df["Curah Hujan (mm)"].shift(-1) >= 1.0).astype(int)
df = df.iloc[:-1]

# ===========================================================
# 3. TARGET: HUJAN BESOK
# ===========================================================


# ===========================================================
# 4. LAG CURAH HUJAN (PALING PENTING)
# ===========================================================
df["Rain_mm_lag1"] = df["Curah Hujan (mm)"].shift(1)
df["Rain_mm_lag3"] = df["Curah Hujan (mm)"].shift(3)
df["Rain_mm_lag7"] = df["Curah Hujan (mm)"].shift(7)
df.drop(columns=["Curah Hujan (mm)"], inplace=True)

# ===========================================================
# 5. MUSIM (SIN-COS)
# ===========================================================
df["month"] = df["Tanggal"].dt.month
df["month_sin"] = np.sin(2 * np.pi * df["month"] / 12)
df["month_cos"] = np.cos(2 * np.pi * df["month"] / 12)
df.drop(columns=["month"], inplace=True)

# ===========================================================
# 6. CIRCULAR WIND DIRECTION
# ===========================================================

df["Arah Angin Terbanyak (°)"] = (
    df["Arah Angin Terbanyak (°)"]
    .astype(str)
    .str.replace(",", ".", regex=False)
    .str.strip()
)

df["Arah Angin Terbanyak (°)"] = pd.to_numeric(
    df["Arah Angin Terbanyak (°)"], errors="coerce"
)


df["wind_sin"] = np.sin(np.deg2rad(df["Arah Angin Terbanyak (°)"]))
df["wind_cos"] = np.cos(np.deg2rad(df["Arah Angin Terbanyak (°)"]))
df.drop(columns=["Arah Angin Terbanyak (°)"], inplace=True)

# ===========================================================
# 7. MISSING INDICATOR
# ===========================================================
feature_cols = df.columns.drop(["Tanggal", "RainTomorrow"])
for col in feature_cols:
    df[col + "_missing"] = df[col].isna().astype(int)

# ===========================================================
# 8. TIME SERIES SPLIT
# ===========================================================
n = len(df)
train_end = int(0.7 * n)
val_end   = int(0.85 * n)

train = df.iloc[:train_end]
val   = df.iloc[train_end:val_end]
test  = df.iloc[val_end:]

# ===========================================================
# 9. IMPUTATION (ANTI LEAKAGE)
# ===========================================================
train = train.ffill()
val   = val.ffill()
test  = test.ffill()

median_vals = train.median(numeric_only=True)
train = train.fillna(median_vals)
val   = val.fillna(median_vals)
test  = test.fillna(median_vals)

# ===========================================================
# 10. SCALING
# ===========================================================
X_train = train.drop(columns=["Tanggal", "RainTomorrow"])
y_train = train["RainTomorrow"]

X_val = val.drop(columns=["Tanggal", "RainTomorrow"])
y_val = val["RainTomorrow"]

X_test = test.drop(columns=["Tanggal", "RainTomorrow"])
y_test = test["RainTomorrow"]

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val   = scaler.transform(X_val)
X_test  = scaler.transform(X_test)

# ===========================================================
# 11. WINDOWING
# ===========================================================
def make_sequences(X, y, window=30):
    X_seq, y_seq = [], []
    for i in range(window, len(X)):
        X_seq.append(X[i-window:i])
        y_seq.append(y.iloc[i])
    return np.array(X_seq), np.array(y_seq)

WINDOW = 30

X_train_seq, y_train_seq = make_sequences(X_train, y_train)
X_val_seq, y_val_seq     = make_sequences(X_val, y_val)
X_test_seq, y_test_seq   = make_sequences(X_test, y_test)

# ===========================================================
# 12. CLASS WEIGHTS
# ===========================================================
weights = compute_class_weight(
    class_weight="balanced",
    classes=np.array([0,1]),
    y=y_train_seq
)
class_weights = {0: weights[0], 1: weights[1]}

# ===========================================================
# 13. MODEL LSTM
# ===========================================================
model = Sequential([
    LSTM(64, return_sequences=True,
         input_shape=(X_train_seq.shape[1], X_train_seq.shape[2])),
    LSTM(32),
    Dropout(0.3),
    Dense(1, activation="sigmoid")
])

model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=[tf.keras.metrics.AUC(name="auc")]
)

# ===========================================================
# 14. TRAIN
# ===========================================================
early_stop = EarlyStopping(
    monitor="val_auc",
    patience=7,
    mode="max",
    restore_best_weights=True
)

history = model.fit(
    X_train_seq, y_train_seq,
    validation_data=(X_val_seq, y_val_seq),
    epochs=50,
    batch_size=32,
    class_weight=class_weights,
    callbacks=[early_stop],
    verbose=1
)

# ===========================================================
# 15. EVALUATION
# ===========================================================
y_prob = model.predict(X_test_seq).ravel()
threshold = 0.35   # lebih sensitif terhadap hujan
y_pred = (y_prob >= threshold).astype(int)

print("\n===== TEST RESULTS =====")
print("Accuracy :", accuracy_score(y_test_seq, y_pred))
print("Precision:", precision_score(y_test_seq, y_pred))
print("Recall   :", recall_score(y_test_seq, y_pred))
print("ROC AUC  :", roc_auc_score(y_test_seq, y_prob))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test_seq, y_pred))

print("\nClassification Report:")
print(classification_report(y_test_seq, y_pred))

# ===========================================================
# 16. WEATHER FORECAST FOR NEXT DAY
# ===========================================================
prob = y_prob[-1] * 100
print("\n===== LAPORAN CUACA BESOK =====")
print("Prediksi Hujan :", "YA" if prob >= 50 else "TIDAK")
print(f"Probabilitas   : {prob:.2f}%")


ValueError: could not convert string to float: '-'

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4019 entries, 0 to 4018
Data columns (total 32 columns):
 #   Column                                      Non-Null Count  Dtype         
---  ------                                      --------------  -----         
 0   Tanggal                                     4019 non-null   datetime64[ns]
 1   Temperatur Minimum                          4019 non-null   object        
 2   Temperatur Maksimum                         4019 non-null   object        
 3   Temperatur Rata-rata                        4019 non-null   object        
 4   Kelembapan Rata-rata                        4019 non-null   object        
 5   Lamanya Penyinaran Matahari                 4019 non-null   object        
 6   Kecepatan Angin Maksimum                    4019 non-null   int64         
 7   Arah Angin Saat Kecepatan Maksimum          4019 non-null   int64         
 8   Kecepatan Angin Rata-rata                   4019 non-null   int64         
 9   RainTomo