In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np

In [3]:
df = pd.read_csv('/content/drive/MyDrive/Google Collab/Dataset/disaster_risk_indonesia.csv')

In [8]:
df

Unnamed: 0,Category,Luas Bahaya(Ha),Jiwa Terpapar,Fisik (Rp. Miliyar),Ekonomi (Rp. Miliyar),Lingkungan (Ha),provinsi,tahun
0,Kota Solok,32256,449740,2932.423,5601.36,1374,Sumatera Barat,2016
1,Pesisir Selatan,2141994,413603,4049.368,68521.98,389402,Sumatera Barat,2016
2,Kota Payakumbuh,38624,702702,3825.734,10985.22,204,Sumatera Barat,2016
3,Dharmasraya,1343003,855747,5034.535,56937.33,260077,Sumatera Barat,2016
4,Pasaman,875267,1554126,7872.659,65001.44,148117,Sumatera Barat,2016
...,...,...,...,...,...,...,...,...
3316,Bireuen,1142678,3112975,11274.349,128776.76,71785,Aceh,2024
3317,Kota Langsa,84568,1138359,3764.776,17972.62,4,Aceh,2024
3318,Aceh Selatan,712774,471731,4947.781,33171.33,73238,Aceh,2024
3319,Bener Meriah,495686,563005,1739.156,32516.15,172811,Aceh,2024


In [11]:
import pandas as pd
import numpy as np
import time
import pickle
import os
from sklearn.model_selection import train_test_split, cross_val_score, TimeSeriesSplit
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, make_scorer
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.ensemble import HistGradientBoostingRegressor

# For downloading files
import base64
from IPython.display import HTML

# Install and import the boosting libraries
import subprocess
import sys

def install_package(package):
    """Install a package using pip"""
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])

# Install missing packages
required_packages = {
    'catboost': 'catboost',
    'lightgbm': 'lightgbm',
    'xgboost': 'xgboost',
    'ngboost': 'ngboost'
}

for package_name, pip_name in required_packages.items():
    try:
        __import__(package_name)
        print(f"{package_name} is already installed")
    except ImportError:
        print(f"Installing {package_name}...")
        install_package(pip_name)
        print(f"{package_name} installed successfully!")

# Now import all the libraries
import catboost as cb
import lightgbm as lgbm
import xgboost as xgb
from ngboost import NGBRegressor

# Set random seed for reproducibility
SEED = 42

# ===============================
# DATA PREPARATION
# ===============================

# You need to specify which column you want to predict
# Based on your data, common targets might be:
# - 'Luas Bahaya(Ha)' (hazard area)
# - 'Jiwa Terpapar' (exposed population)
# - 'Fisik (Rp. Miliyar)' (physical damage)
# - 'Ekonomi (Rp. Miliyar)' (economic damage)
# - 'Lingkungan (Ha)' (environmental impact)

# TARGET COLUMN - PREDICTING HAZARD AREA
target_col = 'Luas Bahaya(Ha)'

# Check if target column exists
if target_col not in df.columns:
    print(f"Target column '{target_col}' not found in dataframe.")
    print(f"Available columns: {df.columns.tolist()}")
    raise ValueError(f"Please specify a valid target column from: {df.columns.tolist()}")

# Remove rows with missing target values
dataset = df.dropna(subset=[target_col]).copy()
print(f"Dataset shape after removing missing targets: {dataset.shape}")

# Split features and target
X = dataset.drop(target_col, axis=1)
y = dataset[target_col]

# ===============================
# TRAIN-TEST SPLIT (80-20)
# ===============================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED, stratify=None
)

print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")

# ===============================
# PREPROCESSING PIPELINE
# ===============================

# Identify numerical and categorical features
num_features = X.select_dtypes(include=np.number).columns.tolist()
cat_features = [col for col in X.columns if col not in num_features]

print(f"Numerical features: {num_features}")
print(f"Categorical features: {cat_features}")

# Create preprocessing pipelines
numerical_pipeline = Pipeline([
    ('imputer', KNNImputer(n_neighbors=5))
])

categorical_pipeline = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer([
    ('num', numerical_pipeline, num_features),
    ('cat', categorical_pipeline, cat_features)
])

# ===============================
# MODELS DEFINITION
# ===============================

# ===============================
# MODELS DEFINITION
# ===============================

models = {
    'CatBoostRegressor': cb.CatBoostRegressor(random_state=SEED, verbose=0),
    'LightGBMRegressor': lgbm.LGBMRegressor(random_state=SEED, verbose=-1),
    'XGBoostRegressor': xgb.XGBRegressor(random_state=SEED, enable_categorical=True, verbosity=0),
    'NGBoostRegressor': NGBRegressor(random_state=SEED, verbose=0),
    'HistGradientBoosting': HistGradientBoostingRegressor(random_state=SEED),
    'LinearRegression': LinearRegression(),
    'ElasticNet': ElasticNet(random_state=SEED)
}

print(f"Available models: {list(models.keys())}")

# ===============================
# VALIDATION SETUP
# ===============================

# For time series data, use TimeSeriesSplit
# For regular data, you might want to use regular cross-validation
tscv = TimeSeriesSplit(n_splits=5)
mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

# ===============================
# PIPELINE CREATION
# ===============================

def make_pipeline(model):
    return Pipeline([
        ('preprocessing', preprocessor),
        ('regressor', model)
    ])

# ===============================
# MODEL TRAINING AND EVALUATION
# ===============================

print("Starting model training and evaluation...")
results = []

for name, model in models.items():
    print(f"Training {name}...")
    pipe = make_pipeline(model)

    start = time.perf_counter()

    # Cross-validation on training data
    mae_folds = cross_val_score(pipe, X_train, y_train, cv=tscv, scoring=mae_scorer)

    # Train on full training set and evaluate on test set
    pipe.fit(X_train, y_train)
    test_pred = pipe.predict(X_test)
    test_mae = mean_absolute_error(y_test, test_pred)

    end = time.perf_counter()

    results.append({
        'Model': name,
        'CV Mean MAE': -np.mean(mae_folds),
        'CV Std MAE': np.std(mae_folds),
        'CV Max MAE': -np.min(mae_folds),
        'CV Min MAE': -np.max(mae_folds),
        'Test MAE': test_mae,
        'Time (secs)': end - start
    })

# ===============================
# RESULTS DISPLAY
# ===============================

results_df = pd.DataFrame(results)
float_cols = ['CV Mean MAE', 'CV Std MAE', 'CV Max MAE', 'CV Min MAE', 'Test MAE', 'Time (secs)']
results_df[float_cols] = results_df[float_cols].round(4)
results_df = results_df.sort_values(by='CV Mean MAE', ascending=True)

# Display results in clean Python format
print("\n" + "="*120)
print("🏆 MODEL EVALUATION RESULTS - RANKED BY CROSS-VALIDATION PERFORMANCE")
print("="*120)

# Add ranking
results_df.insert(0, 'Rank', range(1, len(results_df) + 1))

# Configure pandas display options for clean output
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.float_format', '{:.4f}'.format)

# Display the table
print(results_df.to_string(index=False))

print("\n" + "="*120)
print("📊 PERFORMANCE SUMMARY")
print("="*120)

# Performance summary
best_cv_model = results_df.iloc[0]['Model']
best_cv_mae = results_df.iloc[0]['CV Mean MAE']
best_test_model = results_df.loc[results_df['Test MAE'].idxmin(), 'Model']
best_test_mae = results_df.loc[results_df['Test MAE'].idxmin(), 'Test MAE']
fastest_model = results_df.loc[results_df['Time (secs)'].idxmin(), 'Model']
fastest_time = results_df.loc[results_df['Time (secs)'].idxmin(), 'Time (secs)']

print(f"🥇 Best Cross-Validation Performance: {best_cv_model} (MAE: {best_cv_mae:.4f})")
print(f"🎯 Best Test Set Performance: {best_test_model} (MAE: {best_test_mae:.4f})")
print(f"⚡ Fastest Training Time: {fastest_model} ({fastest_time:.2f} seconds)")

# Performance difference analysis
if best_cv_model != best_test_model:
    print(f"⚠️  Note: Best CV model differs from best test model - consider overfitting")
else:
    print(f"✅ Consistent performance: Same model performs best on both CV and test set")

print("\n" + "="*120)
print("📈 DETAILED METRICS EXPLANATION")
print("="*120)
print("• CV Mean MAE: Average Mean Absolute Error across cross-validation folds")
print("• CV Std MAE: Standard deviation of MAE across folds (lower = more consistent)")
print("• CV Max/Min MAE: Best and worst performance across folds")
print("• Test MAE: Performance on held-out test set (most important metric)")
print("• Time: Training time in seconds")
print("="*120)

# ===============================
# ADDITIONAL ANALYSIS
# ===============================

print(f"\n📊 TARGET VARIABLE ANALYSIS: {target_col}")
print("="*60)
print(f"Mean: {y.mean():,.2f}")
print(f"Median: {y.median():,.2f}")
print(f"Std: {y.std():,.2f}")
print(f"Min: {y.min():,.2f}")
print(f"Max: {y.max():,.2f}")
print(f"Range: {y.max() - y.min():,.2f}")

# Performance context
print(f"\n🎯 PERFORMANCE CONTEXT")
print("="*60)
mean_target = y.mean()
best_mae = results_df.iloc[0]['Test MAE']
baseline_mae = y.std()  # Simple baseline using standard deviation

print(f"Target Mean: {mean_target:,.2f}")
print(f"Best Model MAE: {best_mae:,.2f}")
print(f"Baseline MAE (std): {baseline_mae:,.2f}")
print(f"Improvement over baseline: {((baseline_mae - best_mae) / baseline_mae * 100):.1f}%")
print(f"Relative error: {(best_mae / mean_target * 100):.1f}% of target mean")

# Reset pandas display options
pd.reset_option('display.max_columns')
pd.reset_option('display.width')
pd.reset_option('display.max_colwidth')

# ===============================
# EXPORT BEST MODEL TO PICKLE
# ===============================

print(f"\n💾 EXPORTING BEST MODEL TO PICKLE FORMAT")
print("="*60)

# Get the best model based on cross-validation performance
best_model_name = results_df.iloc[0]['Model']
best_model = models[best_model_name]

print(f"Best Model: {best_model_name}")
print(f"Cross-Validation MAE: {results_df.iloc[0]['CV Mean MAE']:.4f}")
print(f"Test Set MAE: {results_df.iloc[0]['Test MAE']:.4f}")

# Create and train the best model pipeline
best_pipeline = make_pipeline(best_model)
print(f"Training final model on full training dataset...")
best_pipeline.fit(X_train, y_train)

# Create model info dictionary
model_info = {
    'model_name': best_model_name,
    'pipeline': best_pipeline,
    'target_column': target_col,
    'feature_columns': X.columns.tolist(),
    'numerical_features': num_features,
    'categorical_features': cat_features,
    'cv_mae': results_df.iloc[0]['CV Mean MAE'],
    'test_mae': results_df.iloc[0]['Test MAE'],
    'training_samples': len(X_train),
    'test_samples': len(X_test),
    'target_stats': {
        'mean': y.mean(),
        'std': y.std(),
        'min': y.min(),
        'max': y.max()
    },
    'training_date': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S'),
    'random_seed': SEED
}

# Save the model
model_filename = f"best_model_{best_model_name.lower().replace(' ', '_')}.pkl"
try:
    with open(model_filename, 'wb') as f:
        pickle.dump(model_info, f)

    print(f"✅ Model successfully exported to: {model_filename}")
    print(f"📁 File size: {os.path.getsize(model_filename) / 1024:.2f} KB")

    # Create download link for Jupyter notebooks
    def create_download_link(filename):
        """Create a download link for the file"""
        try:
            with open(filename, 'rb') as f:
                data = f.read()
            b64 = base64.b64encode(data).decode()
            href = f'<a href="data:application/octet-stream;base64,{b64}" download="{filename}">Click here to download {filename}</a>'
            return HTML(href)
        except:
            return None

    # Try to create download link
    download_link = create_download_link(model_filename)
    if download_link:
        print(f"\n📥 DOWNLOAD LINK:")
        print("="*40)
        display(download_link)
    else:
        print(f"\n📥 FILE READY FOR DOWNLOAD:")
        print("="*40)
        print(f"File saved as: {model_filename}")
        print(f"You can find it in your current directory or use:")
        print(f"Right-click the file in your file explorer → Save As")

    # Test loading the model
    print(f"\n🔍 Testing model loading...")
    with open(model_filename, 'rb') as f:
        loaded_model_info = pickle.load(f)

    loaded_pipeline = loaded_model_info['pipeline']
    test_prediction = loaded_pipeline.predict(X_test[:5])  # Test with first 5 samples

    print(f"✅ Model loaded successfully!")
    print(f"🎯 Test prediction shape: {test_prediction.shape}")
    print(f"📊 Sample predictions: {test_prediction[:3]}")

except Exception as e:
    print(f"❌ Error saving model: {str(e)}")

print(f"\n📋 MODEL EXPORT SUMMARY")
print("="*60)
print(f"• Model Type: {model_info['model_name']}")
print(f"• Target Variable: {model_info['target_column']}")
print(f"• Number of Features: {len(model_info['feature_columns'])}")
print(f"• Training Samples: {model_info['training_samples']:,}")
print(f"• Test Samples: {model_info['test_samples']:,}")
print(f"• Cross-Validation MAE: {model_info['cv_mae']:.4f}")
print(f"• Test Set MAE: {model_info['test_mae']:.4f}")
print(f"• Export Date: {model_info['training_date']}")
print(f"• File Location: {os.path.abspath(model_filename)}")

print(f"\n🚀 HOW TO USE THE EXPORTED MODEL:")
print("="*60)
print("```python")
print("import pickle")
print("import pandas as pd")
print("")
print("# Load the model")
print(f"with open('{model_filename}', 'rb') as f:")
print("    model_info = pickle.load(f)")
print("")
print("# Get the trained pipeline")
print("pipeline = model_info['pipeline']")
print("")
print("# Make predictions on new data")
print("# new_data should have the same columns as training data")
print("predictions = pipeline.predict(new_data)")
print("")
print("# Access model information")
print("print(f\"Model: {model_info['model_name']}\")")
print("print(f\"Target: {model_info['target_column']}\")")
print("print(f\"Features: {model_info['feature_columns']}\")")
print("```")
print("="*60)

catboost is already installed
lightgbm is already installed
xgboost is already installed
ngboost is already installed
Dataset shape after removing missing targets: (3321, 8)
Training set shape: (2656, 7)
Test set shape: (665, 7)
Numerical features: ['Jiwa Terpapar', 'Fisik (Rp. Miliyar)', 'Ekonomi (Rp. Miliyar)', 'Lingkungan (Ha)', 'tahun']
Categorical features: ['Category', 'provinsi']
Available models: ['CatBoostRegressor', 'LightGBMRegressor', 'XGBoostRegressor', 'NGBoostRegressor', 'HistGradientBoosting', 'LinearRegression', 'ElasticNet']
Starting model training and evaluation...
Training CatBoostRegressor...
Training LightGBMRegressor...
Training XGBoostRegressor...
Training NGBoostRegressor...
Training HistGradientBoosting...
Training LinearRegression...
Training ElasticNet...

🏆 MODEL EVALUATION RESULTS - RANKED BY CROSS-VALIDATION PERFORMANCE
 Rank                Model  CV Mean MAE  CV Std MAE  CV Max MAE  CV Min MAE    Test MAE  Time (secs)
    1     LinearRegression   15985.0


🔍 Testing model loading...
✅ Model loaded successfully!
🎯 Test prediction shape: (5,)
📊 Sample predictions: [ 181844.00209443 1173173.00052152 2217090.99956733]

📋 MODEL EXPORT SUMMARY
• Model Type: LinearRegression
• Target Variable: Luas Bahaya(Ha)
• Number of Features: 7
• Training Samples: 2,656
• Test Samples: 665
• Cross-Validation MAE: 15985.0491
• Test Set MAE: 0.0048
• Export Date: 2025-07-10 14:59:16
• File Location: /content/best_model_linearregression.pkl

🚀 HOW TO USE THE EXPORTED MODEL:
```python
import pickle
import pandas as pd

# Load the model
with open('best_model_linearregression.pkl', 'rb') as f:
    model_info = pickle.load(f)

# Get the trained pipeline
pipeline = model_info['pipeline']

# Make predictions on new data
# new_data should have the same columns as training data
predictions = pipeline.predict(new_data)

# Access model information
print(f"Model: {model_info['model_name']}")
print(f"Target: {model_info['target_column']}")
print(f"Features: {model_info[