In [1]:
# Cell 0: Setup
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
import scipy.sparse
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Paths
# Every location is derived from BASE, the parent of the working directory
# (the notebook is expected to run from notebooks/).
BASE = Path("..")  # relative to notebooks/
DATA_PROCESSED = BASE.joinpath("data", "processed")
OUT_DIR = BASE.joinpath("data", "interim")

# Create the output directory (and any missing parents); a no-op when it already exists.
OUT_DIR.mkdir(exist_ok=True, parents=True)

print("✅ Paths set up")


✅ Paths set up


In [2]:
# Cell 1: Load the cleaned dataset
# Prefer the parquet export; fall back to the CSV export when no parquet file is present.
clean_file = list(DATA_PROCESSED.glob("cleaned.parquet")) or list(DATA_PROCESSED.glob("cleaned.csv"))
if not clean_file:
    # Raise explicitly rather than `assert`: assertions are removed under `python -O`,
    # which would turn a missing file into an unexplained IndexError on the lookup below.
    raise FileNotFoundError(
        f"No cleaned.parquet or cleaned.csv found in {DATA_PROCESSED}. "
        "Run preprocessing first to generate cleaned.csv or cleaned.parquet"
    )

# Pick the reader from the file extension of the first match.
clean_path = clean_file[0]
df = pd.read_parquet(clean_path) if clean_path.suffix == ".parquet" else pd.read_csv(clean_path)

df.head()


Unnamed: 0,event_type,state,month,season,magnitude,magnitude_type,begin_lat,begin_lon,damage_property_num
0,Thunderstorm Wind,GEORGIA,3,MAM,52.0,EG,33.4757,-85.238,1000.0
1,Tornado,MICHIGAN,3,MAM,50.0,Unknown,41.79,-86.1,100000.0
2,Flash Flood,TENNESSEE,4,MAM,50.0,Unknown,36.03,-89.33,0.0
3,Thunderstorm Wind,TENNESSEE,4,MAM,52.0,EG,36.18,-88.16,0.0
4,Flash Flood,TENNESSEE,4,MAM,50.0,Unknown,36.3,-88.71,0.0


In [3]:
# Cell 2: Define target and features
# The column to predict.
target_col = "damage_property_num"

# Model inputs, grouped by how they are preprocessed later on.
cat_cols = ["event_type", "state", "season", "magnitude_type"]
num_cols = ["month", "magnitude", "begin_lat", "begin_lon"]

# Numeric columns first, then categorical ones; the target is appended last.
features = [*num_cols, *cat_cols]
selected_columns = [*features, target_col]

# Work on an independent copy so later edits never touch the loaded frame.
df_model = df.loc[:, selected_columns].copy()

print("✅ Feature columns selected:", features)
df_model.head()


✅ Feature columns selected: ['month', 'magnitude', 'begin_lat', 'begin_lon', 'event_type', 'state', 'season', 'magnitude_type']


Unnamed: 0,month,magnitude,begin_lat,begin_lon,event_type,state,season,magnitude_type,damage_property_num
0,3,52.0,33.4757,-85.238,Thunderstorm Wind,GEORGIA,MAM,EG,1000.0
1,3,50.0,41.79,-86.1,Tornado,MICHIGAN,MAM,Unknown,100000.0
2,4,50.0,36.03,-89.33,Flash Flood,TENNESSEE,MAM,Unknown,0.0
3,4,52.0,36.18,-88.16,Thunderstorm Wind,TENNESSEE,MAM,EG,0.0
4,4,50.0,36.3,-88.71,Flash Flood,TENNESSEE,MAM,Unknown,0.0


In [4]:
# Cell 3: Handle missing values
# Numeric columns: coerce to numbers (unparseable entries become NaN), then fill gaps
# with that column's median.
for col in num_cols:
    as_number = pd.to_numeric(df_model[col], errors="coerce")
    df_model[col] = as_number.fillna(as_number.median())

# Categorical columns: cast to pandas' string dtype, then label gaps as "Unknown".
for col in cat_cols:
    as_text = df_model[col].astype("string")
    df_model[col] = as_text.fillna("Unknown")

print("✅ Missing values handled")

# Remaining missing-value count per column.
df_model.isna().sum()


✅ Missing values handled


month                  0
magnitude              0
begin_lat              0
begin_lon              0
event_type             0
state                  0
season                 0
magnitude_type         0
damage_property_num    0
dtype: int64

In [5]:
# Cell 4: Build preprocessing pipeline (encoding + imputation)
# Numeric columns   -> median imputation.
# Categorical columns -> fill gaps with the literal "Unknown", then one-hot encode.
#
# The categorical imputer uses a constant "Unknown" (not the most frequent category) so
# that the saved preprocessor treats missing categories the same way Cell 3 does when
# it is reused on new data. If "Unknown" was never seen in a column during fit,
# handle_unknown="ignore" encodes it as an all-zero group instead of raising.
preprocessor = ColumnTransformer([
    ("num", SimpleImputer(strategy="median"), num_cols),
    ("cat", Pipeline([
        ("imp", SimpleImputer(strategy="constant", fill_value="Unknown")),
        ("oh", OneHotEncoder(handle_unknown="ignore"))
    ]), cat_cols)
])

# Fit the preprocessor
# NOTE(review): this fits on every row of df_model; if a train/test split happens
# downstream, confirm whether the preprocessor should be fitted on the training part only.
preprocessor.fit(df_model[features])

# Save for reuse
preprocessor_path = OUT_DIR / "feature_preprocessor.joblib"
joblib.dump(preprocessor, preprocessor_path)

print("✅ Preprocessor fitted and saved ->", preprocessor_path)


✅ Preprocessor fitted and saved -> ..\data\interim\feature_preprocessor.joblib


In [6]:
# Cell 5: Transform dataset into numeric matrix
# Target values as an array, taken straight from the modelling frame.
y = df_model[target_col].values

# Feature matrix produced by the fitted preprocessor.
X = preprocessor.transform(df_model.loc[:, features])

# Report both shapes so a row-count mismatch is easy to spot.
for label, array in (("X shape:", X), ("y shape:", y)):
    print(label, array.shape)


X shape: (19098, 86)
y shape: (19098,)


In [7]:
# Cell 6: Save processed feature set (optional, sparse matrix may be large)
# `scipy.sparse` is imported together with the other dependencies in the setup cell.
features_path = OUT_DIR / "X_features.npz"
target_path = OUT_DIR / "y_target.npy"

# save_npz only accepts SciPy sparse containers. ColumnTransformer returns a dense
# ndarray when the combined output's density reaches its sparse_threshold, so dense
# output is wrapped in CSR here; an already-sparse X is written unchanged.
X_to_save = X if scipy.sparse.issparse(X) else scipy.sparse.csr_matrix(X)

# Save X (features) and y (target) separately
scipy.sparse.save_npz(features_path, X_to_save)
np.save(target_path, y)

print("✅ Saved features ->", features_path)
print("✅ Saved target   ->", target_path)


✅ Saved features -> ..\data\interim\X_features.npz
✅ Saved target   -> ..\data\interim\y_target.npy


In [8]:
# Cell 7: Quick sanity check
# Print the leading slice of the target array as a quick visual check.
target_preview = y[:10]
print("First 10 target values:", target_preview)


First 10 target values: [  1000. 100000.      0.      0.      0.      0.      0.      0.   5000.
   5000.]
