In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score, mean_squared_error
import sys
import os

# --- UPDATE 1: Import the Model ---
from sklearn.linear_model import LinearRegression # (Kept for comparison)
from sklearn.ensemble import RandomForestRegressor # <-- NEW MODEL

# --- System Integration ---
# Make the parent of the current working directory importable so that the
# project-local 'src' package can be resolved from inside the notebook.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
# Guarded append: re-running this cell must not keep adding duplicate
# entries to sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)
try:
    from src.processing.features import DamageDataPreprocessor
    print("SUCCESS: 'src.processing.features.DamageDataPreprocessor' imported.")
except ImportError:
    # Report the failure, then re-raise so the notebook stops here instead of
    # failing later with a NameError on DamageDataPreprocessor.
    print("ERROR: Could not import from 'src' folder.")
    raise

# --- 1. Data Loading ---
# The disasters CSV is read from one level above the working directory.
file_path = "../1900_2021_DISASTERS.xlsx - emdat data.csv"
df_raw = pd.read_csv(file_path)

# --- 2. Data Preprocessing ---
# The project preprocessor is fitted on the raw frame and returns the cleaned
# frame used for modelling; df_raw itself is left under its own name.
preprocessor = DamageDataPreprocessor(target_col="Total Damages ('000 US$)")
df_clean = preprocessor.fit_transform(df_raw)
n_clean_rows = len(df_clean)
print(f"Data processed. Available rows for the model: {n_clean_rows}")

# --- 3. Preparing Data for Model Training ---
# Column names of the regression target and of the two feature groups; the
# groups are reused later to route columns to the matching transformer.
target = 'Log_Total_Damages'
categorical_features = ['Disaster Subgroup', 'Continent', 'Disaster Group']
numerical_features = [
    'Total Deaths',
    'No Injured',
    'No Affected',
    'Dis Mag Value',
    'Start Year',
]

# Feature matrix (categorical columns first, then numerical) and target vector.
feature_columns = categorical_features + numerical_features
X, y = df_clean[feature_columns], df_clean[target]

# 80/20 hold-out split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(f"Training set size: {len(X_train)}, Test set size: {len(X_test)}")

# --- 4. Advanced Model Pipeline ---
# The preprocessing stage is independent of the estimator, so it stays the
# same when the final model step is swapped.

# Numerical columns: median imputation followed by standardization.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: missing values become the literal 'Missing', then the
# column is one-hot encoded. handle_unknown='ignore' keeps transform from
# raising on categories that were not present during fit.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Route each feature group to its transformer.
preprocessor_pipeline = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

# --- UPDATE 2: Change the model in the Pipeline ---
# Only the final 'model' step differs from the earlier version, which used
# LinearRegression() in that position.
model_pipeline_v3 = Pipeline([
    ('preprocessor', preprocessor_pipeline),
    (
        'model',
        RandomForestRegressor(
            n_estimators=100,
            random_state=42,  # fixed seed for reproducible results
            n_jobs=-1,        # use all available CPU cores
        ),
    ),
])

# --- 5. Training the Model ---
# Fitting the forest is expected to take longer than the linear baseline.
print("Training model pipeline v3 (with RandomForest)...")
model_pipeline_v3.fit(X_train, y_train)
print("Model v3 trained.")

# --- 6. Evaluating the Model ---
# Score on the held-out split. Both metrics are on the scale of the target
# column the pipeline was trained on, not on raw damage values.
y_pred = model_pipeline_v3.predict(X_test)
r2_v3, mse_v3 = r2_score(y_test, y_pred), mean_squared_error(y_test, y_pred)

report_lines = [
    "\n--- Model v3 Evaluation Results (Random Forest) ---",
    f"R-squared (R2) Score: {r2_v3:.4f}",
    f"Mean Squared Error (MSE): {mse_v3:.4f}",
    "-----------------------------------",
]
print(*report_lines, sep="\n")

SUCCESS: 'src.processing.features.DamageDataPreprocessor' imported.
Data processed. Available rows for the model: 5245
Training set size: 4196, Test set size: 1049
Training model pipeline v3 (with RandomForest)...
Model v3 trained.

--- Model v3 Evaluation Results (Random Forest) ---
R-squared (R2) Score: 0.2270
Mean Squared Error (MSE): 0.9599
-----------------------------------


In [2]:
import joblib
import os

# --- 1. Create Save Path and Directory ---
# The artifact is written to a 'models' folder one level above the working
# directory. exist_ok=True makes re-running this cell safe when the folder
# already exists.
model_directory = "../models"
os.makedirs(model_directory, exist_ok=True)

# The full file path for the model to be saved.
# NOTE(review): the file is named 'v1' although the object being saved is
# 'model_pipeline_v3' - confirm which name downstream consumers expect
# before renaming anything.
model_file_path = os.path.join(model_directory, "damage_model_v1.joblib")

# --- 2. Save the Model (The Entire Pipeline) ---
# The whole pipeline (preprocessing + RandomForest) is serialized into a
# single file, so consumers can feed it the raw feature columns directly.
print(f"Saving model pipeline to the path: {model_file_path}")
joblib.dump(model_pipeline_v3, model_file_path)

# The reported name is derived from the path so the message cannot drift
# from the file that was actually written.
print(f"SUCCESS: Model successfully saved as '{os.path.basename(model_file_path)}'.")

# --- 3. Check That the Model is Loaded Back ---
print("\nLet's test by loading the model back...")
# joblib.load unpickles the file: only load artifacts from a trusted source.
loaded_model = joblib.load(model_file_path)

# Slice 0:1 (rather than .iloc[0]) so the sample stays a one-row DataFrame.
test_sample = X_test.iloc[0:1]
prediction = loaded_model.predict(test_sample)

# Actual round-trip verification: the reloaded pipeline must reproduce the
# in-memory pipeline's prediction. allclose is used instead of == to tolerate
# floating-point summation-order differences between runs.
in_memory_prediction = model_pipeline_v3.predict(test_sample)
assert np.allclose(prediction, in_memory_prediction), (
    "Reloaded pipeline disagrees with the in-memory pipeline; "
    "the saved artifact should not be trusted."
)

# Reminder: the prediction is in the logarithmic space of the target.
print(f"Logarithmic Prediction for the test data: {prediction[0]:.4f}")

# Back-transform to the original damage scale.
# NOTE(review): 10**x assumes the preprocessor built the target with a
# base-10 log - confirm against DamageDataPreprocessor (a natural-log or
# log1p target would need exp / expm1 instead).
real_prediction = 10**prediction[0]
print(f"The 'Real Damage' equivalent of this prediction: {real_prediction:,.2f} ('000 US$)")

Saving model pipeline to the path: ../models/damage_model_v1.joblib
SUCCESS: Model successfully saved as 'damage_model_v1.joblib'.

Let's test by loading the model back...
Logarithmic Prediction for the test data: 4.8667
The 'Real Damage' equivalent of this prediction: 73,573.12 ('000 US$)
