In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler


In [2]:
# Load the cleaned dataset (from preprocessing stage)
df = pd.read_csv("../data/interim/StormEvents_cleaned.csv")

print("Dataset Shape:", df.shape)
df.head()


Dataset Shape: (33904, 42)


Unnamed: 0,BEGIN_YEARMONTH,BEGIN_DAY,BEGIN_TIME,END_YEARMONTH,END_DAY,END_TIME,STATE,YEAR,MONTH_NAME,EVENT_TYPE,...,BEGIN_AZIMUTH,BEGIN_LOCATION,END_RANGE,END_AZIMUTH,END_LOCATION,BEGIN_LAT,BEGIN_LON,END_LAT,END_LON,DATA_SOURCE
0,202503,31,1104,202503,31,1106,14,2025,3,35,...,13,8875,2.0,13,8928,33.4757,-85.238,33.4757,-85.238,0
1,202503,30,1552,202503,30,1555,34,2025,3,36,...,12,2815,1.0,5,2820,41.79,-86.1,41.82,-86.07,0
2,202501,5,1800,202501,6,2227,59,2025,2,39,...,3,3212,1.0,3,3206,36.7106,-88.08,36.71405,-88.07145,0
3,202501,3,1300,202501,3,1900,32,2025,2,40,...,3,3212,1.0,3,3206,36.7106,-88.08,36.71405,-88.07145,0
4,202501,3,1300,202501,3,1900,32,2025,2,40,...,3,3212,1.0,3,3206,36.7106,-88.08,36.71405,-88.07145,0


In [3]:
# Extract season from month
month_to_season = {
    12: "Winter", 1: "Winter", 2: "Winter",
    3: "Spring", 4: "Spring", 5: "Spring",
    6: "Summer", 7: "Summer", 8: "Summer",
    9: "Fall", 10: "Fall", 11: "Fall"
}

df["SEASON"] = df["BEGIN_DAY"].map(lambda m: month_to_season.get(m, "Unknown"))


In [4]:
# Create a severity score combining injuries and fatalities
df["SEVERITY_SCORE"] = (
    df["MAGNITUDE"].fillna(0) * 10 +
    df["INJURIES_DIRECT"].fillna(0) * 5 +
    df["INJURIES_INDIRECT"].fillna(0) * 3 +
    df["DEATHS_DIRECT"].fillna(0) * 20 +
    df["DEATHS_INDIRECT"].fillna(0) * 15
)


In [5]:
# Label encode categorical columns
label_encoders = {}
for col in df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le

print("Categorical columns encoded.")


Categorical columns encoded.


In [6]:
# Drop features with no variance (constant columns)
constant_cols = [col for col in df.columns if df[col].nunique() <= 1]
if constant_cols:
    print("Dropping constant columns:", constant_cols)
    df = df.drop(columns=constant_cols)


Dropping constant columns: ['YEAR', 'DATA_SOURCE']


In [7]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Exclude target columns from scaling
numeric_cols = df.select_dtypes(include=[np.number]).columns.drop(
    ["DAMAGE_PROPERTY", "DAMAGE_CROPS"], errors="ignore"
)

df[numeric_cols] = scaler.fit_transform(df[numeric_cols])
print("Numeric features scaled.")


Numeric features scaled.


In [8]:
print("Final Shape:", df.shape)
print("Columns:", df.columns.tolist())
df.head()


Final Shape: (33904, 42)
Columns: ['BEGIN_YEARMONTH', 'BEGIN_DAY', 'BEGIN_TIME', 'END_YEARMONTH', 'END_DAY', 'END_TIME', 'STATE', 'MONTH_NAME', 'EVENT_TYPE', 'CZ_TYPE', 'CZ_NAME', 'WFO', 'CZ_TIMEZONE', 'INJURIES_DIRECT', 'INJURIES_INDIRECT', 'DEATHS_DIRECT', 'DEATHS_INDIRECT', 'DAMAGE_PROPERTY', 'DAMAGE_CROPS', 'SOURCE', 'MAGNITUDE', 'MAGNITUDE_TYPE', 'FLOOD_CAUSE', 'TOR_F_SCALE', 'TOR_LENGTH', 'TOR_WIDTH', 'TOR_OTHER_WFO', 'TOR_OTHER_CZ_STATE', 'TOR_OTHER_CZ_FIPS', 'TOR_OTHER_CZ_NAME', 'BEGIN_RANGE', 'BEGIN_AZIMUTH', 'BEGIN_LOCATION', 'END_RANGE', 'END_AZIMUTH', 'END_LOCATION', 'BEGIN_LAT', 'BEGIN_LON', 'END_LAT', 'END_LON', 'SEASON', 'SEVERITY_SCORE']


Unnamed: 0,BEGIN_YEARMONTH,BEGIN_DAY,BEGIN_TIME,END_YEARMONTH,END_DAY,END_TIME,STATE,MONTH_NAME,EVENT_TYPE,CZ_TYPE,...,BEGIN_LOCATION,END_RANGE,END_AZIMUTH,END_LOCATION,BEGIN_LAT,BEGIN_LON,END_LAT,END_LON,SEASON,SEVERITY_SCORE
0,-0.115915,1.920117,-0.225947,-0.115915,1.776167,-0.412825,-1.208504,0.637075,0.876759,-0.926974,...,2.119477,0.113216,1.949402,2.127779,-0.882837,0.466457,-0.883438,0.465605,0.439303,0.440712
1,-0.115915,1.803019,0.425306,-0.115915,1.659319,0.289067,-0.141277,0.637075,0.961442,-0.926974,...,-0.576246,-0.211831,-0.052424,-0.575328,1.444893,0.353899,1.452682,0.356961,0.439303,0.334989
2,-1.558638,-1.124428,0.785822,-1.558638,-1.145023,1.339561,1.192755,-0.044832,1.21549,1.078778,...,-0.399645,-0.211831,-0.55288,-0.404503,0.022828,0.095354,0.02319,0.095608,-1.391679,0.334989
3,-1.558638,-1.358624,0.058977,-1.558638,-1.495566,0.828384,-0.248,-0.044832,1.300173,1.078778,...,-0.399645,-0.211831,-0.55288,-0.404503,0.022828,0.095354,0.02319,0.095608,-1.391679,0.334989
4,-1.558638,-1.358624,0.058977,-1.558638,-1.495566,0.828384,-0.248,-0.044832,1.300173,1.078778,...,-0.399645,-0.211831,-0.55288,-0.404503,0.022828,0.095354,0.02319,0.095608,-1.391679,0.334989


In [9]:
# Save processed dataset
df.to_csv("../data/processed/StormEvents_features.csv", index=False)
print("✅ Processed dataset saved at ../data/processed/StormEvents_features.csv")


✅ Processed dataset saved at ../data/processed/StormEvents_features.csv
