In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
import sys
import os

# --- System Integration ---
# To allow this 'notebook' file to see the 'src' folder,
# we need to tell Python that 'src' is one directory up.
# This enables us to import our clean code from 'src' into the 'notebook'.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

try:
    # THIS IS OUR CLEAN CODE!
    from src.processing.features import DamageDataPreprocessor
    print("SUCCESS: 'src.processing.features.DamageDataPreprocessor' imported.")
except ImportError:
    print("ERROR: Could not import from 'src' folder.")
    print("Please verify that the file 'src/processing/features.py' exists.")
    # raise

# --- 1. Data Loading ---
file_path = "../1900_2021_DISASTERS.xlsx - emdat data.csv"
df_raw = pd.read_csv(file_path)
print(f"Raw data loaded. Number of rows: {len(df_raw)}")

# --- 2. Data Preprocessing (Using Our 'System' Component) ---
# Instantiating our clean class
preprocessor = DamageDataPreprocessor(
    features_to_use=['Disaster Subgroup', 'Continent', 'Disaster Group'],
    target_col="Total Damages ('000 US$)"
)

# By running 'fit_transform', we perform all those messy steps
# (dropna, log, fillna) from the notebook in a single line.
df_clean = preprocessor.fit_transform(df_raw)

print(f"Data processed. Available rows for the model: {len(df_clean)}")

# --- 3. Preparing Data for Model Training ---
# 'df_clean' now only contains the columns needed for the model
target = 'Log_Total_Damages'
features = preprocessor.features_to_use # ['Disaster Subgroup', 'Continent', 'Disaster Group']

X = df_clean[features]
y = df_clean[target]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training set size: {len(X_train)}, Test set size: {len(X_test)}")

# --- 4. Creating the Model Pipeline ---
# This is a pipeline for the model itself.
# Its purpose: Convert categorical features (text) into numbers (OneHotEncoder)
# and then train the model (LinearRegression).

model_pipeline = Pipeline(steps=[
    ('encoder', OneHotEncoder(handle_unknown='ignore')), # 'handle_unknown' is important to prevent system crashes
    ('model', LinearRegression())
])

# --- 5. Training the Model ---
print("Training the model pipeline...")
model_pipeline.fit(X_train, y_train)
print("Model trained.")

# --- 6. Evaluating the Model ---
# Make predictions on the test set
y_pred = model_pipeline.predict(X_test)

# Calculate metrics (in the Logarithmic space)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print("\n--- Model Evaluation Results ---")
print(f"R-squared (R2) Score: {r2:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print("-----------------------------------")

SUCCESS: 'src.processing.features.DamageDataPreprocessor' imported.
Raw data loaded. Number of rows: 16126


TypeError: DamageDataPreprocessor.__init__() got an unexpected keyword argument 'features_to_use'

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
import sys
import os

# --- System Integration (Same) ---
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
try:
    from src.processing.features import DamageDataPreprocessor
    print("SUCCESS: 'src.processing.features.DamageDataPreprocessor' imported.")
except ImportError:
    print("ERROR: Could not import from 'src' folder.")
    raise

# --- 1. Data Loading (Same) ---
file_path = "../1900_2021_DISASTERS.xlsx - emdat data.csv"
df_raw = pd.read_csv(file_path)

# --- 2. Data Preprocessing (Our Class Changed) ---

# UPDATE: Our class no longer takes 'features_to_use'.
# It only takes 'target_col'.
preprocessor = DamageDataPreprocessor(
    target_col="Total Damages ('000 US$)"
)

# df_clean NOW CONTAINS ALL COLUMNS
# (but only for the 5000+ rows where the target is valid)
df_clean = preprocessor.fit_transform(df_raw)

print(f"Data processed. Available rows for the model: {len(df_clean)}")

# --- 3. Preparing Data for Model Training (Critical Change) ---
target = 'Log_Total_Damages'

# We must manually define our features
categorical_features = ['Disaster Subgroup', 'Continent', 'Disaster Group']
numerical_features = ['Total Deaths', 'No Injured', 'No Affected', 'Dis Mag Value', 'Start Year']

# UPDATE: We must CAREFULLY select X and y from 'df_clean'
# X should contain only the features we will use
X = df_clean[categorical_features + numerical_features]
# y should contain only the new logarithmic target
y = df_clean[target]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training set size: {len(X_train)}, Test set size: {len(X_test)}")

# --- 4. Advanced Model Pipeline (ColumnTransformer) ---
# THERE IS NO CHANGE IN THIS PART.
# BECAUSE WE FIXED THE FLAWED COMPONENT (features.py).
# NOW 'SimpleImputer(strategy='median')' WILL ONLY RECEIVE NUMERICAL DATA.

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor_pipeline = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

model_pipeline_v2 = Pipeline(steps=[
    ('preprocessor', preprocessor_pipeline),
    ('model', LinearRegression())
])

# --- 5. Training the Model ---
print("Training model pipeline v2 (Refactored)...")
model_pipeline_v2.fit(X_train, y_train) # <-- The place where the error occurred, should work CORRECTLY now
print("Model v2 trained.")

# --- 6. Evaluating the Model ---
y_pred = model_pipeline_v2.predict(X_test)

r2_v2 = r2_score(y_test, y_pred)
mse_v2 = mean_squared_error(y_test, y_pred)

print("\n--- Model v2 Evaluation Results (Numerical Features Added) ---")
print(f"R-squared (R2) Score: {r2_v2:.4f}")
print(f"Mean Squared Error (MSE): {mse_v2:.4f}")
print("-----------------------------------")

SUCCESS: 'src.processing.features.DamageDataPreprocessor' imported.
Data processed. Available rows for the model: 5245
Training set size: 4196, Test set size: 1049
Training model pipeline v2 (Refactored)...
Model v2 trained.

--- Model v2 Evaluation Results (Numerical Features Added) ---
R-squared (R2) Score: 0.0915
Mean Squared Error (MSE): 1.1282
-----------------------------------
