In [1]:
# Basic imports
from pathlib import Path

import numpy as np
import pandas as pd


In [2]:
# Read the raw storm-events export; the damage columns still hold
# suffixed text values (K, M, B) at this point.
raw_csv_path = "../data/raw/StormEvents_rawDATA.csv"
df = pd.read_csv(raw_csv_path)

initial_shape = df.shape
print("Initial Shape:", initial_shape)
df.head(5)


Initial Shape: (33904, 51)


Unnamed: 0,BEGIN_YEARMONTH,BEGIN_DAY,BEGIN_TIME,END_YEARMONTH,END_DAY,END_TIME,EPISODE_ID,EVENT_ID,STATE,STATE_FIPS,...,END_RANGE,END_AZIMUTH,END_LOCATION,BEGIN_LAT,BEGIN_LON,END_LAT,END_LON,EPISODE_NARRATIVE,EVENT_NARRATIVE,DATA_SOURCE
0,202503,31,1104,202503,31,1106,201366,1252415,GEORGIA,13,...,2.0,W,TYUS,33.4757,-85.238,33.4757,-85.238,A cold-front initiated a line of thunderstorms...,Tree down at the intersection of highway 5 and...,CSV
1,202503,30,1552,202503,30,1555,200337,1241136,MICHIGAN,26,...,1.0,NNE,EDWARDSBURG,41.79,-86.1,41.82,-86.07,A cold front pushed into the area during the a...,A brief EF-1 tornado was confirmed in Edwardsb...,CSV
2,202501,5,1800,202501,6,2227,197733,1222851,VIRGINIA,51,...,,,,,,,,An area of low pressure tracked across souther...,,CSV
3,202501,3,1300,202501,3,1900,197761,1223112,MARYLAND,24,...,,,,,,,,An area of low pressure moved off into New Eng...,,CSV
4,202501,3,1300,202501,3,1900,197761,1223113,MARYLAND,24,...,,,,,,,,An area of low pressure moved off into New Eng...,,CSV


In [3]:
# Columns removed before any cleaning, listed one per line.
drop_cols = [
    # ID columns
    "EPISODE_ID",
    "EVENT_ID",
    # narrative text columns
    "EPISODE_NARRATIVE",
    "EVENT_NARRATIVE",
    # date-time columns
    "BEGIN_DATE_TIME",
    "END_DATE_TIME",
    # FIPS code columns
    "STATE_FIPS",
    "CZ_FIPS",
]

# errors="ignore" means a listed column that is absent is skipped, not raised on.
df = df.drop(drop_cols, axis="columns", errors="ignore")

shape_after_drop = df.shape
print("Shape after dropping irrelevant columns:", shape_after_drop)


Shape after dropping irrelevant columns: (33904, 43)


In [4]:
# Tally missing entries per column, then show the 15 columns with the most gaps
missing_per_column = df.isna().sum()
missing_per_column.sort_values(ascending=False).head(15)


CATEGORY              33904
TOR_OTHER_CZ_FIPS     33689
TOR_OTHER_CZ_STATE    33689
TOR_OTHER_WFO         33689
TOR_OTHER_CZ_NAME     33689
TOR_F_SCALE           32912
TOR_LENGTH            32912
TOR_WIDTH             32912
FLOOD_CAUSE           30918
MAGNITUDE_TYPE        20733
MAGNITUDE             15520
BEGIN_LON             14806
END_LAT               14806
END_LOCATION          14806
BEGIN_LAT             14806
dtype: int64

In [5]:
def convert_damage(value):
    """Turn a damage entry such as "10K", "2.5M" or "1B" into a float.

    Parameters
    ----------
    value : any scalar
        Raw cell content. It is stringified, so numbers are accepted too.

    Returns
    -------
    float
        * ``0.0`` when ``value`` is missing (``pd.isna`` is true);
        * the number scaled by 1e3 / 1e6 / 1e9 for a trailing K / M / B
          (case-insensitive, spaces ignored);
        * the plain number when there is no recognised suffix;
        * ``np.nan`` when the text cannot be parsed as a number, so that
          unparseable entries can be counted afterwards.
    """
    if pd.isna(value):
        return 0.0

    # Normalise case and drop surrounding whitespace plus any inner
    # spaces (e.g. "10 K" -> "10K").
    text = str(value).strip().upper().replace(" ", "")

    suffix_scale = {"K": 1_000, "M": 1_000_000, "B": 1_000_000_000}
    scale = suffix_scale.get(text[-1:])

    try:
        if scale is None:
            # No known suffix: treat the whole text as a plain number
            return float(text)
        return float(text[:-1]) * scale
    except ValueError:
        # Flag unparseable text as NaN rather than silently returning 0
        return np.nan
# Convert both damage columns from suffixed text to floats
damage_cols = ["DAMAGE_PROPERTY", "DAMAGE_CROPS"]
for damage_col in damage_cols:
    df[damage_col] = df[damage_col].apply(convert_damage)

property_damage = df["DAMAGE_PROPERTY"]
crop_damage = df["DAMAGE_CROPS"]

# Report how many zeros (missing inputs) and NaNs (unparseable inputs) resulted
print("Zeros in DAMAGE_PROPERTY:", property_damage.eq(0).sum())
print("Zeros in DAMAGE_CROPS:", crop_damage.eq(0).sum())
print("NaNs in DAMAGE_PROPERTY:", property_damage.isna().sum())
print("NaNs in DAMAGE_CROPS:", crop_damage.isna().sum())

df[damage_cols].describe()


Zeros in DAMAGE_PROPERTY: 34
Zeros in DAMAGE_CROPS: 0
NaNs in DAMAGE_PROPERTY: 0
NaNs in DAMAGE_CROPS: 0


Unnamed: 0,DAMAGE_PROPERTY,DAMAGE_CROPS
count,33904.0,33904.0
mean,254474.9,84818.338013
std,309977.5,103351.103213
min,0.0,4e-06
25%,11992.08,3998.458744
50%,12072.73,4013.935459
75%,600064.6,200012.564817
max,1715994.0,572001.791319


In [6]:
# Impute missing values in two passes: numeric columns receive their median,
# then object columns receive their most frequent value. A column with no
# observed values at all cannot be imputed, so it is reported and dropped.
fill_rules = (
    ([np.number], lambda values: values.median()),
    ([object], lambda values: values.mode()[0]),
)

for dtype_filter, pick_fill_value in fill_rules:
    # The column list is taken once, from the frame as it is when the pass starts
    for col in df.select_dtypes(include=dtype_filter).columns:
        values = df[col]
        if values.notna().any():
            df[col] = values.fillna(pick_fill_value(values))
            continue
        print(f"⚠️ Column '{col}' is entirely NaN -> dropping it.")
        df = df.drop(columns=[col])

print("✅ Missing values handled safely.")



⚠️ Column 'CATEGORY' is entirely NaN -> dropping it.
✅ Missing values handled safely.


In [7]:
from sklearn.preprocessing import LabelEncoder

# One LabelEncoder per object column, kept by column name so the integer
# codes can be mapped back to the original labels later.
categorical_cols = df.select_dtypes(include=['object']).columns
label_encoders = {col: LabelEncoder() for col in categorical_cols}

for col, encoder in label_encoders.items():
    # Values are cast to str before fitting, as the encoder sees text only
    text_values = df[col].astype(str)
    df[col] = encoder.fit_transform(text_values)

print("Categorical columns encoded.")


Categorical columns encoded.


In [8]:
# Save the cleaned frame (imputed and label-encoded, written before scaling).
# DataFrame.to_csv does not create missing folders, so create the target
# directory first; otherwise a checkout without data/interim fails with OSError.
cleaned_path = Path("../data/interim/StormEvents_cleaned.csv")
cleaned_path.parent.mkdir(parents=True, exist_ok=True)

df.to_csv(cleaned_path, index=False)
print("✅ Cleaned dataset saved at ../data/interim/StormEvents_cleaned.csv")


✅ Cleaned dataset saved at ../data/interim/StormEvents_cleaned.csv


In [9]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# The two damage columns are the targets, so they are left unscaled
target_cols = ["DAMAGE_PROPERTY", "DAMAGE_CROPS"]
all_numeric_cols = df.select_dtypes(include=[np.number]).columns
numeric_cols = all_numeric_cols.drop(target_cols, errors="ignore")

# Fit on, and transform, every remaining numeric column in one call
scaled_values = scaler.fit_transform(df[numeric_cols])
df[numeric_cols] = scaled_values

print("Numeric features scaled.")


Numeric features scaled.


In [10]:
# Save the processed frame (written after feature scaling).
# DataFrame.to_csv does not create missing folders, so create the target
# directory first; otherwise a checkout without data/processed fails with OSError.
features_path = Path("../data/processed/StormEvents_features.csv")
features_path.parent.mkdir(parents=True, exist_ok=True)

df.to_csv(features_path, index=False)
print("✅ Processed dataset saved at ../data/processed/StormEvents_features.csv")


✅ Processed dataset saved at ../data/processed/StormEvents_features.csv


In [11]:
# Sanity check of the two target columns: summary statistics, the ten most
# frequent values, and the number of exact zeros.
property_damage = df["DAMAGE_PROPERTY"]
crop_damage = df["DAMAGE_CROPS"]

print("DAMAGE_PROPERTY summary:\n", property_damage.describe())
print("\nDAMAGE_CROPS summary:\n", crop_damage.describe())

property_top_values = property_damage.value_counts().head(10)
crop_top_values = crop_damage.value_counts().head(10)
print("\nUnique values in property (sample):", property_top_values)
print("\nUnique values in crops (sample):", crop_top_values)

print("\nNumber of zeros in property:", property_damage.eq(0).sum())
print("Number of zeros in crops:", crop_damage.eq(0).sum())


DAMAGE_PROPERTY summary:
 count    3.390400e+04
mean     2.544749e+05
std      3.099775e+05
min      0.000000e+00
25%      1.199208e+04
50%      1.207273e+04
75%      6.000646e+05
max      1.715994e+06
Name: DAMAGE_PROPERTY, dtype: float64

DAMAGE_CROPS summary:
 count     33904.000000
mean      84818.338013
std      103351.103213
min           0.000004
25%        3998.458744
50%        4013.935459
75%      200012.564817
max      572001.791319
Name: DAMAGE_CROPS, dtype: float64

Unique values in property (sample): DAMAGE_PROPERTY
0.000000         34
12012.098114      1
11904.335988      1
11913.754108      1
11984.087754      1
599969.914669     1
12007.592383      1
11961.357329      1
8971.885624       1
12063.616035      1
Name: count, dtype: int64

Unique values in crops (sample): DAMAGE_CROPS
207982.396695    1
3982.285606      1
4004.350395      1
3987.631238      1
3994.990350      1
4000.631729      1
4006.753008      1
3997.789179      1
4017.791718      1
3999.995610      1
N