### <span style="color:orange">Importing Libraries</span>

In [108]:
import pandas as pd
import sklearn
import warnings
from pathlib import Path
from feature_engine.encoding import RareLabelEncoder
from feature_engine.datetime import DatetimeFeatures
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
# Show every column when a wide frame is displayed (no truncation).
pd.set_option("display.max_columns", None)
# Have scikit-learn transformers return pandas DataFrames with named columns.
sklearn.set_config(transform_output="pandas")
# NOTE(review): this silences every warning globally for the whole kernel,
# including any raised by the encoders below — consider narrowing the filter.
warnings.filterwarnings("ignore")

### <span style="color:orange">Reading Data from Source</span>

In [109]:
def load_train_test_csv(base_dir=None):
    """Load the cleaned train/test feature and target CSV files.

    Parameters
    ----------
    base_dir : str or pathlib.Path, optional
        Project root that contains ``data/cleaned/train`` and
        ``data/cleaned/test``. When omitted, the parent of the current
        working directory is used, which is the original behaviour.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train, X_test, y_train, y_test)``, in that order. The targets are
        returned as DataFrames, exactly as ``pd.read_csv`` produces them.

    Raises
    ------
    FileNotFoundError
        Propagated from ``pd.read_csv`` if any of the four files is missing.
    """
    # Resolving the root from the working directory is fragile (it depends on
    # where the kernel was started), so callers may pass it explicitly.
    project_root = Path.cwd().parent if base_dir is None else Path(base_dir)
    cleaned_dir = project_root / "data" / "cleaned"

    train_dir = cleaned_dir / "train"
    test_dir = cleaned_dir / "test"

    X_train = pd.read_csv(train_dir / "X_train.csv")
    X_test = pd.read_csv(test_dir / "X_test.csv")
    y_train = pd.read_csv(train_dir / "y_train.csv")
    y_test = pd.read_csv(test_dir / "y_test.csv")

    return X_train, X_test, y_train, y_test

In [110]:
# Load all four splits once; the cells below reuse these notebook-level frames.
X_train, X_test, y_train, y_test = load_train_test_csv()

In [111]:
# Shapes of the four splits, one per line: X_train, X_test, y_train, y_test
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep="\n")

(629286, 22)
(157322, 22)
(629286, 1)
(157322, 1)


In [112]:
# Preview the first rows to see which columns are available for encoding.
X_train.head(5)

Unnamed: 0,region,disasterNumber,sriaDisaster,declarationTitle,disasterType,incidentType,declarationDate,stateAbbreviation,state,county,applicantId,applicantName,pnpStatus,damageCategoryCode,dateObligated,pwNumber,projectTitle,versionNumber,eligibilityStatus,fundingStatus,paCloseoutStatus,id
0,4,4259,1,SEVERE STORMS AND FLOODING,DR,Severe Storm,2016-02-26,GA,Georgia,Meriwether,199-99199-00,MERIWETHER (COUNTY),0.0,C,2016-07-15,174,MERI04C - ROAD WASHOUTS/CULVERTS,0.0,E,O,C,1dc91cc9-96a7-4d05-b55f-47b5e6dee9ec
1,6,1603,0,HURRICANE KATRINA,DR,Tropical Cyclone,2005-08-29,LA,Louisiana,Statewide,000-UXL4N-00,"FACILITY PLANNING AND CONTROL, STATE OF LOUISIANA",0.0,E,2007-06-18,5800,"LO1152, #42 PUMP HOUSE, N.O. CITY PARK",1.0,E,O,O,c8ab2153-1f14-47bb-bdeb-d907b85b1434
2,3,4030,0,TROPICAL STORM LEE,DR,Flood,2011-09-12,PA,Pennsylvania,Berks,011-0D8AD-00,CONRAD WEISER SCHOOL,0.0,E,2012-11-09,5523,JDP-088 Gym Floors,0.0,E,O,C,6132d152-0d2a-4d8e-b879-0fce9c1fefed
3,8,1334,0,"SEVERE STORMS, FLOODING AND GROUND SATURATION",DR,Severe Storm,2000-06-27,ND,North Dakota,Traill,097-51500-00,"MAYVILLE, CITY OF",0.0,F,2000-09-22,867,DRAIN REPAIR,0.0,E,O,C,e70f71e5-adb0-4737-87dd-bbd36712330c
4,2,3173,0,SNOWSTORMS,EM,Winter Storm,2003-02-25,NY,New York,Delaware,025-20126-00,DELHI,0.0,B,2003-04-08,300,EMERGENCY PROTECTIVE MEASURES,0.0,E,O,C,3e310e76-4d93-4a1b-866a-9205ec023d72


# <span style="color:orange">Transformation Operations</span>

### <span style="color:orange">1. incidentType</span>

In [113]:
# Category frequencies: a few incident types dominate and many are very rare.
X_train.incidentType.value_counts()

incidentType
Severe Storm              277202
Tropical Cyclone          232838
Winter Storm               49706
Flood                      49343
Fire                       11781
Earthquake                  3636
Coastal Storm               2726
Other                        942
Biological                   484
Tsunami                      214
Landslide                    153
Terrorist                     96
Infrastructure Failure        83
Chemical                      47
Volcanic Eruption             33
Drought                        2
Name: count, dtype: int64

In [114]:
# Group incident types seen in fewer than 1% of rows (tol=0.01) under "Other",
# then one-hot encode the remaining labels as dense columns.
incidentType_transformer = Pipeline(
    [
        ("rare", RareLabelEncoder(replace_with="Other", tol=0.01)),
        ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ]
)

In [115]:
# Fit on the training column and preview the encoded result.
incidentType_transformer.fit_transform(X_train.loc[:, ['incidentType']])

Unnamed: 0,incidentType_Fire,incidentType_Flood,incidentType_Other,incidentType_Severe Storm,incidentType_Tropical Cyclone,incidentType_Winter Storm
0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...
629281,0.0,0.0,0.0,0.0,0.0,1.0
629282,0.0,0.0,0.0,1.0,0.0,0.0
629283,0.0,0.0,0.0,1.0,0.0,0.0
629284,0.0,0.0,0.0,0.0,1.0,0.0


### <span style="color:orange">2. state</span>

In [116]:
# Ten most frequent states; these are the ones the encoder below keeps as-is.
X_train.state.value_counts().nlargest(10)

state
Florida         61217
Louisiana       48288
New York        44548
Texas           40621
North Dakota    22621
Iowa            22027
Pennsylvania    21398
New Jersey      21022
Mississippi     19815
Oklahoma        18988
Name: count, dtype: int64

In [117]:
# Keep at most the 10 most frequent states (each must also exceed the 1%
# tolerance), fold the rest into "Other", then one-hot encode.
state_transformer = Pipeline(
    [
        (
            "rare",
            RareLabelEncoder(max_n_categories=10, replace_with="Other", tol=0.01),
        ),
        ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ]
)

In [118]:
# Fit on the training column and preview the encoded result.
state_transformer.fit_transform(X_train.loc[:, ['state']])

Unnamed: 0,state_Florida,state_Iowa,state_Louisiana,state_Mississippi,state_New Jersey,state_New York,state_North Dakota,state_Oklahoma,state_Other,state_Pennsylvania,state_Texas
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
629281,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
629282,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
629283,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
629284,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


### <span style="color:orange">3. county</span>

In [119]:
# Ten most frequent county values; note the "Statewide" and "missing" labels.
X_train.county.value_counts().nlargest(10)

county
Statewide     108682
Orleans        10147
Jefferson       9673
Miami-Dade      8456
missing         7421
Palm Beach      5930
Broward         5776
Harris          5757
Washington      5416
Orange          5187
Name: count, dtype: int64

In [120]:
# Same recipe as for state: counties below the 1% tolerance (and anything past
# the 10 most frequent) become "Other" before one-hot encoding.
county_transformer = Pipeline(
    [
        (
            "rare",
            RareLabelEncoder(max_n_categories=10, replace_with="Other", tol=0.01),
        ),
        ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ]
)

In [121]:
# Fit on the training column and preview the encoded result.
county_transformer.fit_transform(X_train.loc[:, ['county']])

Unnamed: 0,county_Jefferson,county_Miami-Dade,county_Orleans,county_Other,county_Statewide,county_missing
0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...
629281,0.0,0.0,0.0,1.0,0.0,0.0
629282,0.0,0.0,0.0,1.0,0.0,0.0
629283,1.0,0.0,0.0,0.0,0.0,0.0
629284,0.0,0.0,0.0,1.0,0.0,0.0


### <span style="color:orange">4. declarationDate & dateObligated</span>

In [122]:
# Check the raw format: the dates are read from CSV as strings (object dtype).
X_train.declarationDate.head(5)

0    2016-02-26
1    2005-08-29
2    2011-09-12
3    2000-06-27
4    2003-02-25
Name: declarationDate, dtype: object

In [123]:
# Same check for the obligation date, also stored as strings (object dtype).
X_train.dateObligated.head(5)

0    2016-07-15
1    2007-06-18
2    2012-11-09
3    2000-09-22
4    2003-04-08
Name: dateObligated, dtype: object

In [124]:
# Derive month, quarter, weekend flag and day of week from each date column;
# drop_original=True leaves only the derived features in the output.
datetime_transformer = Pipeline(
    [
        (
            "dt",
            DatetimeFeatures(
                drop_original=True,
                features_to_extract=['month', 'quarter', 'weekend', 'day_of_week'],
            ),
        )
    ]
)

In [125]:
# Fit on both date columns and preview the extracted calendar features.
datetime_transformer.fit_transform(X_train.loc[:, ['declarationDate', 'dateObligated']])

Unnamed: 0,declarationDate_month,declarationDate_quarter,declarationDate_weekend,declarationDate_day_of_week,dateObligated_month,dateObligated_quarter,dateObligated_weekend,dateObligated_day_of_week
0,2,1,0,4,7,3,0,4
1,8,3,0,0,6,2,0,0
2,9,3,0,0,11,4,0,4
3,6,2,0,1,9,3,0,4
4,2,1,0,1,4,2,0,1
...,...,...,...,...,...,...,...,...
629281,12,4,0,4,4,2,0,3
629282,8,3,1,6,11,4,1,5
629283,6,2,0,3,11,4,0,2
629284,9,3,1,5,2,1,0,0


### <span style="color:orange">5. eligibilityStatus & fundingStatus</span>

In [126]:
# Category frequencies: almost every row has the same eligibility status.
X_train.eligibilityStatus.value_counts()

eligibilityStatus
E    628078
V       693
I       503
P         7
X         5
Name: count, dtype: int64

In [127]:
# Category frequencies: almost every row has the same funding status.
X_train.fundingStatus.value_counts()

fundingStatus
O    628081
N      1205
Name: count, dtype: int64

Handling **eligibilityStatus** and **fundingStatus**

The variables eligibilityStatus and fundingStatus are excluded from the modeling dataset because they carry almost no predictive information. Both features exhibit extreme class imbalance: more than 99.8% of observations belong to a single category (`E` for eligibilityStatus, `O` for fundingStatus), resulting in near-zero variance and negligible information content. As a consequence, these variables effectively behave as constants and cannot meaningfully explain variation in the target variable, federalShareObligated. Both variables are therefore dropped prior to model training.

### <span style="color:orange">6. disasterType & damageCategoryCode</span>

In [128]:
# Only three disaster types, so no rare-label grouping is applied below.
X_train.disasterType.value_counts()

disasterType
DR    600250
EM     24495
FM      4541
Name: count, dtype: int64

In [129]:
# Nine damage category codes; all are kept as separate one-hot columns below.
X_train.damageCategoryCode.value_counts()

damageCategoryCode
C    196137
B    152400
E    109210
A     70725
G     38630
F     37714
D     11708
Z     10529
H      2233
Name: count, dtype: int64

In [130]:
# Plain one-hot encoding with dense output; categories unseen at fit time are
# encoded as all zeros instead of raising an error.
disasterType_damageCategoryCode_transformer = Pipeline(
    [("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False))]
)

In [131]:
# Fit on both columns and preview the encoded result.
disasterType_damageCategoryCode_transformer.fit_transform(X_train.loc[:, ['disasterType', 'damageCategoryCode']])

Unnamed: 0,disasterType_DR,disasterType_EM,disasterType_FM,damageCategoryCode_A,damageCategoryCode_B,damageCategoryCode_C,damageCategoryCode_D,damageCategoryCode_E,damageCategoryCode_F,damageCategoryCode_G,damageCategoryCode_H,damageCategoryCode_Z
0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
629281,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
629282,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
629283,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
629284,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


### <span style="color:orange">7. paCloseoutStatus</span>

In [132]:
# Binary feature: only the values C and O occur in the training data.
X_train.paCloseoutStatus.value_counts()

paCloseoutStatus
C    498957
O    130329
Name: count, dtype: int64

In [133]:
# Binary feature: drop the first category so a single indicator column remains.
# NOTE(review): with drop='first' and handle_unknown="ignore", a category unseen
# at fit time is encoded as all zeros, i.e. the same as the dropped category —
# confirm this is acceptable for new data.
paCloseoutStatus_transformer = Pipeline(
    [
        (
            "encoder",
            OneHotEncoder(drop='first', handle_unknown="ignore", sparse_output=False),
        )
    ]
)

In [134]:
# Fit on the training column and preview the single indicator column.
paCloseoutStatus_transformer.fit_transform(X_train.loc[:, ['paCloseoutStatus']])

Unnamed: 0,paCloseoutStatus_O
0,0.0
1,1.0
2,0.0
3,0.0
4,0.0
...,...
629281,0.0
629282,0.0
629283,0.0
629284,1.0
