# Create Metadata for Subgroup Evaluation

In [18]:
import numpy as np
import pandas as pd
import os
import pathlib

# sklearn
from sklearn.model_selection import train_test_split

Steps 1 and 2 follow the same approach as the `02-choose-trgt-and-preprocess-..-sa.ipynb` notebooks.

## Step 1: Data Ingestion

In [19]:
# Resolve the raw CSV relative to the current working directory:
# <cwd>/../data/external/FPA_FOD_Plus.csv
notebook_dir = pathlib.Path.cwd()
csvdir_path = notebook_dir.parent.joinpath("data", "external")
file_name = "FPA_FOD_Plus.csv"
# Kept as a plain string (not a Path) so downstream consumers see the same type.
file_path = f"{csvdir_path}/{file_name}"

In [20]:
# Load the complete raw dataset into memory.
# NOTE(review): the saved output of this cell echoes the read_csv line, which
# looks like a pandas warning (possibly mixed-dtype columns) — confirm, and
# consider passing explicit dtypes if so.
df_init = pd.read_csv(file_path)

# Drop every column whose number of missing values exceeds 80% of the rows.
cutoff_missing_values = 0.8 * df_init.shape[0]
missing_vals = df_init.isna().sum().sort_values(ascending=False)
too_sparse = missing_vals > cutoff_missing_values
cols_missing_gt80pct = missing_vals.loc[too_sparse].index.tolist()
df = df_init.drop(columns=cols_missing_gt80pct)

  df_init = pd.read_csv(file_path)


`Choose prediction task`

In [21]:
# Binning approach 4 (from Nedim's code): four right-closed intervals, the last
# one ending at the largest FIRE_SIZE present in the data.
bins_04 = [0, 100, 4999, 29000, df["FIRE_SIZE"].max()]
fire_cats_04 = pd.cut(df["FIRE_SIZE"], bins=bins_04)

# Absolute counts per bin, a blank line, then the share of rows per bin.
print(fire_cats_04.value_counts())
print()
print(fire_cats_04.value_counts(normalize=True))

FIRE_SIZE
(0.0, 100.0]           2241807
(100.0, 4999.0]          55930
(4999.0, 29000.0]         3682
(29000.0, 662700.0]       1102
Name: count, dtype: int64

FIRE_SIZE
(0.0, 100.0]           0.973632
(100.0, 4999.0]        0.024291
(4999.0, 29000.0]      0.001599
(29000.0, 662700.0]    0.000479
Name: proportion, dtype: float64


In [22]:
# One label per interval of bins_04, in ascending FIRE_SIZE order.
group_names = [
    'level_1',
    'level_2',
    'level_3',
    'level_4',
]
# Same bin edges as the preview above, but storing the label instead of the interval.
df['FIRE_SIZE_LABEL'] = pd.cut(df['FIRE_SIZE'], bins=bins_04, labels=group_names)

## Step 2: Data Preprocess

`Recode target variable`

In [23]:
# Recode FIRE_SIZE_LABEL from string levels to integer class ids.
firelabel_mapping = {
    'level_1': 0,
    'level_2': 1,
    'level_3': 2,
    'level_4': 3
}

# FIRE_SIZE_LABEL is categorical (it comes from pd.cut), so rename its categories
# rather than calling Series.map. Categories that are not keys of the mapping are
# passed through unchanged, which makes this cell safe to re-run: a second
# .map(firelabel_mapping) would look up the already-recoded 0..3 in the dict and
# turn every label into NaN. The first-run result is the same ordered categorical
# with categories 0..3.
df['FIRE_SIZE_LABEL'] = df['FIRE_SIZE_LABEL'].cat.rename_categories(firelabel_mapping)

`Shuffle the dataset`

In [24]:
# Fix the global NumPy seed so the row order below is reproducible.
np.random.seed(207)

# Row labels 0..n-1, permuted at random; reindexing by label then reorders df.
n_rows = len(df)
indices = [*range(n_rows)]
shuffled_indices = np.random.permutation(indices)
df = df.reindex(shuffled_indices)

`Create training, validation, and test datasets`

Using a 60/20/20 split

Approach: stratified random sampling, strata = FIRE_YEAR and FIRE_SIZE_LABEL

In [25]:
# The test set takes 20% of all rows; validation is then sized relative to the
# remaining rows so the overall proportions are 60/20/20.
val_size = 0.2/(0.2 + 0.6)

# First split: hold out the test set, stratified on (FIRE_YEAR, FIRE_SIZE_LABEL) pairs.
strata_full = list(zip(df['FIRE_YEAR'], df['FIRE_SIZE_LABEL']))
df_train_main, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=207,
    stratify=strata_full,
)

# Second split: divide the remainder into train and validation, same strata definition.
strata_train_main = list(zip(df_train_main['FIRE_YEAR'], df_train_main['FIRE_SIZE_LABEL']))
df_train, df_val = train_test_split(
    df_train_main,
    test_size=val_size,
    random_state=207,
    stratify=strata_train_main,
)

# Report the resulting split sizes.
print(f"Shape of df_train: {df_train.shape}")
print(f"Shape of df_val: {df_val.shape}")
print(f"Shape of df_test: {df_test.shape}")

Shape of df_train: (1381512, 287)
Shape of df_val: (460504, 287)
Shape of df_test: (460505, 287)


## Step 3: Create Metadata of Subgroups

The metadata will be used to create masks during subgroup evaluations.

`Subgroup: Whether the county is low income`

In [26]:
# Frequency of each FPL200S value in the training split, with NaN counted too.
df_train.loc[:, "FPL200S"].value_counts(dropna=False)

FPL200S
0.0    754324
1.0    627176
NaN        12
Name: count, dtype: int64

In [27]:
# Pull the FPL200S column out of each split, in train / val / test order.
# NaN entries are carried over as-is.
fpl_train, fpl_val, fpl_test = (
    split["FPL200S"] for split in (df_train, df_val, df_test)
)

`Subgroup: State`

In [28]:
# Pull the STATE column out of each split, in train / val / test order.
state_train, state_val, state_test = (
    split["STATE"] for split in (df_train, df_val, df_test)
)

`Subgroup: Total Population`

In [29]:
# Quartile subgroup of TPF for every split.
#
# The quartile edges are learned on the training split only and then reused for
# validation and test. Calling pd.qcut separately on each split would give each
# split its own edges, so the same TPF value could be "Q2" in one split and "Q3"
# in another, making subgroup metrics incomparable across splits.
TOTPOP_LABELS = ["Q1", "Q2", "Q3", "Q4"]

q_totpop_train, totpop_edges = pd.qcut(df_train['TPF'],
        q=4,
        labels=TOTPOP_LABELS,
        retbins=True)

# Open up the outermost edges so validation/test values that fall below the
# training minimum or above the training maximum still land in Q1 / Q4 instead
# of becoming NaN. Interior edges stay right-closed, matching pd.qcut.
totpop_edges = totpop_edges.copy()
totpop_edges[0] = -np.inf
totpop_edges[-1] = np.inf

q_totpop_val = pd.cut(df_val['TPF'],
        bins=totpop_edges,
        labels=TOTPOP_LABELS)

q_totpop_test = pd.cut(df_test['TPF'],
        bins=totpop_edges,
        labels=TOTPOP_LABELS)

`Subgroup: Pollution Factor (whether the county is considered a disadvantaged community)`

In [30]:
# Frequency of each M_PLN value in the training split, with NaN counted too.
df_train.loc[:, "M_PLN"].value_counts(dropna=False)

M_PLN
0.0    1353146
1.0      28354
NaN         12
Name: count, dtype: int64

In [31]:
# Pull the M_PLN column out of each split, in train / val / test order.
# NaN entries are carried over as-is.
pln_train, pln_val, pln_test = (
    split["M_PLN"] for split in (df_train, df_val, df_test)
)

`Subgroup: Transportation Factor (whether the county is considered a disadvantaged community)`

In [32]:
# Frequency of each M_TRN value in the training split, with NaN counted too.
df_train.loc[:, "M_TRN"].value_counts(dropna=False)

M_TRN
0.0    1372132
1.0       9368
NaN         12
Name: count, dtype: int64

In [33]:
# Pull the M_TRN column out of each split, in train / val / test order.
# NaN entries are carried over as-is.
trn_train, trn_val, trn_test = (
    split["M_TRN"] for split in (df_train, df_val, df_test)
)