In [None]:
import pandas as pd
import os
import numpy as np
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from sklearn.metrics import make_scorer,f1_score

# Project layout is resolved relative to the working directory: the parent of
# the current directory is treated as the project root.
current_path = os.getcwd()
root_path = os.path.dirname(current_path)

# Both directories keep their trailing slash because file names are appended
# to them by plain string concatenation further down.
data_path = f'{root_path}/data/'
result_path = f'{root_path}/results/'


In [None]:
# Name of the input pickle, looked up inside `data_path`.
filename = "step2_consequence_fire_spread.pkl"

In [None]:
# Load the input frame. NOTE: unpickling executes arbitrary code, so this file
# must come from a trusted source.
df = pd.read_pickle(os.path.join(data_path, filename))

In [None]:
# Discard rows carrying category codes that are excluded from the analysis
# (presumably "unknown"/"other" buckets -- TODO confirm against the codebook).
# The codes are compared as strings, exactly as stored in the frame.
df = df.loc[
    lambda d: d['CAUSE_IGN'].ne('0')
    & d['HEAT_SOURCE_new'].ne('6')
    & d['FACT_IGN1_new'].ne('7')
    & ~d['AREA_ORIG_new'].isin(['8', '9'])
]

def remove_outliers_z(df, columns, threshold=3):
    """Drop rows that are z-score outliers in any of the given columns.

    Columns are processed one after another; the mean and standard deviation
    of each column are computed on the rows that survived the previous
    columns, so the result can depend on the order of ``columns``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; a filtered frame is returned.
    columns : iterable of str
        Numeric columns to screen. An empty iterable returns ``df`` unchanged.
    threshold : float, default 3
        A row is kept only if ``abs(z) < threshold`` for every screened column.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that are not outliers. Rows with a missing value in
        a screened column are always dropped (a NaN z-score never passes the
        comparison).
    """
    for col in columns:
        values = df[col]
        spread = values.std()
        if not (spread > 0):
            # Zero or undefined standard deviation (constant column, or fewer
            # than two non-null values): every z-score would be NaN and the
            # comparison below would silently discard ALL rows. Nothing can be
            # an outlier here, so only the missing values are dropped, which
            # matches what the regular path does with NaNs.
            df = df[values.notna()]
            continue
        z_scores = (values - values.mean()) / spread
        df = df[z_scores.abs() < threshold]
    return df

# Keep incidents below the hard caps on clear time (< 600) and floor area
# (< 50000) -- units are not visible here, TODO confirm -- then renumber the
# rows from 0 and discard the old index that reset_index() adds as a column.
df = (
    df.loc[lambda d: (d['last_unit_clear_time'] < 600) & (d['TOT_SQ_FT'] < 50000)]
    .reset_index()
    .drop(columns=['index'])
)


In [None]:
# Price-level factor per accident year, with 2012 as the reference (= 1).
# Presumably a CPI series -- TODO confirm the source. Keys are year strings.
cpi_map = {
    '2012': 1,
    '2013': 1.02,
    '2014': 1.02,
    '2015': 1.03,
    '2016': 1.05,
    '2017': 1.07,
    '2018': 1.09,
    '2019': 1.12,
    '2020': 1.13,
    '2021': 1.21,
    '2022': 1.29,
}

# Re-express monetary columns in 2022 terms: value / factor(year) * factor(2022).
# NOTE: this cell is not idempotent -- re-running it rescales the columns again.
base_factor = cpi_map['2022']

# Vectorised lookup instead of a row-wise apply. Years are cast to str so that
# integer-typed years still hit the string keys instead of silently falling
# back to the default factor; years missing from the map use a factor of 1.
year_factor = df['accident_year'].astype(str).map(cpi_map).fillna(1)

for money_col in ('median_income_list', 'median_rent_list'):
    df[money_col] = df[money_col] / year_factor * base_factor


In [None]:
# Columns removed before modelling.
columns_drop = [
    'DET_OPERAT',
    'build_time_1939_and_earlier_list',
    'build_time_1940_to_1979_list',
    'total_population_list',
    'SUP_APP', 'EMS_APP', 'OTH_APP',
    'Pct_GRAPI_35_pct_or_more_list',
    'Pct_GRAPI_Less_than_15_pct_list',
    'Pct_EDU_Less_than_9th_grade_list',
    'SUP_PER', 'EMS_PER', 'OTH_PER',
    'AID',
    'pop_density',
]
# Raises KeyError if any listed column is absent (e.g. when the cell is re-run).
df = df.drop(columns=columns_drop)

In [None]:
# Sanity check: (rows, columns) left after the filters and column drops above.
df.shape

In [None]:
# Number of rows per CBSA (rows with a missing 'CBSA Title' are not counted).
counts = df.groupby(['CBSA Title']).size().to_frame('count').reset_index()

# Keep only CBSAs with at least 5 rows.
valid_pairs = counts.loc[counts['count'] >= 5, ['CBSA Title', 'count']]

# The inner join keeps only rows of the retained CBSAs. It also adds a 'count'
# column and gives the result a fresh 0..n-1 index.
df_filtered = pd.merge(df, valid_pairs, how='inner', on='CBSA Title')

In [None]:
# Columns stored as Python objects (typically strings).
object_columns = df.select_dtypes(include='object').columns
print("Object columns:", object_columns, sep="\n")

# Columns stored as floats.
float_columns = df.select_dtypes(include='float').columns
print("Float columns:", float_columns, sep="\n")

In [None]:
# Continue with the CBSA-filtered frame; the copy keeps df_filtered untouched.
df = df_filtered.copy()

# Features: everything except the label (FIRE_SPRD), other outcome-like columns
# (presumably alternative targets -- TODO confirm), coarse geography codes and
# the helper 'count' column. 'CBSA Title' stays for now because it drives the
# per-CBSA split below; it is removed before training.
X = df.drop(columns=['last_unit_clear_time',
                     'FLAME_SPRD', 'FIRE_SPRD', 'HHS_Region', 'Census_Region',
                     'Census_Division', 'CBSA Code', 'count'])
y = df['FIRE_SPRD']

# Per-CBSA pieces, concatenated after the loop.
X_train_list, X_test_list = [], []
y_train_list, y_test_list = [], []

# Split 70/30 separately inside every CBSA so that each one is represented in
# both the train and the test set.
# NOTE: the split is random within a CBSA and is NOT stratified on y (no
# `stratify=` is passed), so rare classes may be missing from one side.
for region_name, group_df in X.groupby('CBSA Title'):
    # Label-based lookup keeps y aligned with the rows of the current group.
    y_group = y.loc[group_df.index]

    X_train_group, X_test_group, y_train_group, y_test_group = train_test_split(
        group_df, y_group, test_size=0.3, random_state=2042
    )

    X_train_list.append(X_train_group)
    X_test_list.append(X_test_group)
    y_train_list.append(y_train_group)
    y_test_list.append(y_test_group)

# Stitch the per-CBSA pieces together; the grouping column is not a feature.
X_train = pd.concat(X_train_list).drop(columns=['CBSA Title'])
X_test = pd.concat(X_test_list).drop(columns=['CBSA Title'])
y_train = pd.concat(y_train_list)
y_test = pd.concat(y_test_list)

# Categorical features are determined only after 'CBSA Title' has been
# dropped, so the list matches the columns actually handed to CatBoost.
categorical_features = [col for col in X_test.columns if X_test[col].dtype == 'object']

train_data = Pool(data=X_train, label=y_train, cat_features=categorical_features)
test_data = Pool(data=X_test, label=y_test, cat_features=categorical_features)


In [None]:
# Multi-class CatBoost classifier trained on the GPU with a fixed seed.
model = CatBoostClassifier(
    loss_function='MultiClass',
    iterations=1500,
    learning_rate=0.1,
    depth=6,
    l2_leaf_reg=5,
    # One weight per class; four entries imply four target classes.
    # TODO confirm the order matches the label order CatBoost uses.
    class_weights=[1, 3, 1, 4],
    task_type='GPU',
    random_state=2042,
)

# Log training progress every 100 iterations.
model.fit(train_data, verbose=100)

In [None]:
# Predict class labels for the held-out rows and print per-class
# precision / recall / F1.
y_pred = model.predict(X_test)
report_text = classification_report(y_test, y_pred)
print(report_text)

In [None]:
# Make sure the output directory exists so that a fresh checkout does not fail
# here after the (expensive) training run has already finished.
os.makedirs(result_path, exist_ok=True)

# Trained model.
joblib.dump(model, os.path.join(result_path, 'model_cont_fs_cat_best_gpu_city_v1.pkl'))

# Exact train/test split used for this model. File names (including the
# lower-case 'x_train') are kept unchanged so existing consumers still find them.
X_test.to_pickle(os.path.join(result_path, 'X_test_fs_cat_best_gpu_city_v1.pkl'))
y_test.to_pickle(os.path.join(result_path, 'y_test_fs_cat_best_gpu_city_v1.pkl'))
X_train.to_pickle(os.path.join(result_path, 'x_train_fs_cat_best_gpu_city_v1.pkl'))
y_train.to_pickle(os.path.join(result_path, 'y_train_fs_cat_best_gpu_city_v1.pkl'))

# Per-class predicted probabilities for the test rows.
probs = model.predict_proba(X_test)
joblib.dump(probs, os.path.join(result_path, 'y_prob_fs_cat_best_gpu_city_v1.pkl'))
