In [None]:
import pandas as pd
import os
import numpy as np
from catboost import CatBoostClassifier, Pool,CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix, classification_report,mean_squared_error,r2_score
from sklearn.multioutput import MultiOutputRegressor
import sys

import matplotlib.pyplot as plt
import seaborn as sns
import joblib

# Project directories are resolved relative to the working directory:
# the project root is the parent of the directory the notebook runs in.
current_path = os.getcwd()
root_path = os.path.dirname(current_path)

# Both folders keep a trailing slash; later cells concatenate file names onto them.
data_path, result_path = (f"{root_path}/{folder}/" for folder in ('data', 'results'))





In [None]:
# Load the two input dataframes from the data folder.
# NOTE(review): unpickling executes code embedded in the file — only load pickles from a trusted source.
df, df_new2 = (
    pd.read_pickle(f"{data_path}/{pickle_name}")
    for pickle_name in (
        'step2_consequence_injury_severity.pkl',
        'raw_data_with_city_info.pkl',
    )
)


In [None]:
# Convert dollar amounts to 2022 values.
# cpi_map holds a price index per accident year (2012 == 1); keys are year strings.
cpi_map = {
    '2012': 1,
    '2013': 1.02,
    '2014': 1.02,
    '2015': 1.03,
    '2016': 1.05,
    '2017': 1.07,
    '2018': 1.09,
    '2019': 1.12,
    '2020': 1.13,
    '2021': 1.21,
    '2022': 1.29,
}
CPI_TARGET = 1.29  # index of the target year (same value as cpi_map['2022'])


def _cpi_for_year(year):
    """Return the CPI index for `year`, or 1 when the year is unknown.

    `cpi_map` is keyed by strings, so a numeric year (2015, 2015.0) would never
    match a plain `cpi_map.get(year)` and every row would silently fall back
    to 1. The year is therefore normalised to its 'YYYY' string form first.
    """
    if year in cpi_map:
        return cpi_map[year]
    try:
        numeric_year = float(year)
    except (TypeError, ValueError):
        return 1
    if numeric_year.is_integer():  # False for NaN / inf, which fall through to 1
        return cpi_map.get(str(int(numeric_year)), 1)
    return 1


# One factor per row, computed once and shared by both monetary columns.
cpi_factor = df['accident_year'].map(_cpi_for_year)
for money_col in ('median_income_list', 'median_rent_list'):
    # Same arithmetic order as before: divide by the year's index, then scale to 2022.
    df[money_col] = df[money_col] / cpi_factor * CPI_TARGET


In [None]:
# Drop fires that spread from other fires, etc.: each column below lists the
# coded values whose rows are excluded.
excluded_codes = {
    'CAUSE_IGN': ['0'],
    'HEAT_SOURCE_new': ['6'],
    'FACT_IGN1_new': ['7'],
    'AREA_ORIG_new': ['8', '9'],
}
keep_mask = np.ones(len(df), dtype=bool)
for code_col, bad_values in excluded_codes.items():
    keep_mask &= ~df[code_col].isin(bad_values).to_numpy()
df = df[keep_mask]

# Remove outliers: keep rows below both limits, then renumber the index from 0.
within_limits = (df['last_unit_clear_time'] < 600) & (df['TOT_SQ_FT'] < 50000)
df = df[within_limits].reset_index().drop(columns=['index'])


In [None]:
# Drop features that are not used.
columns_drop = [
    'DET_OPERAT',
    'build_time_1939_and_earlier_list',
    'build_time_1940_to_1979_list',
    'total_population_list',
    'SUP_APP', 'EMS_APP', 'OTH_APP',
    'Pct_GRAPI_35_pct_or_more_list',
    'Pct_GRAPI_Less_than_15_pct_list',
    'Pct_EDU_Less_than_9th_grade_list',
    'SUP_PER', 'EMS_PER', 'OTH_PER',
    'AID',
]
df = df.drop(columns=columns_drop)

In [None]:
# List the object-typed columns.
object_columns = df.select_dtypes(include=['object']).columns
print("Object columns:", object_columns, sep="\n")

# List the numeric columns; despite the variable name, the selection covers
# both float and int dtypes.
float_columns = df.select_dtypes(include=['float', 'int']).columns
print("Float columns:", float_columns, sep="\n")

In [None]:
# Attach city / region attributes to every incident with a left join on INCIDENT_KEY.
geo_columns = [
    'INCIDENT_KEY', 'CBSA Code', 'CBSA Title', 'State_Abbrs',
    'HHS_Region', 'Census_Region', 'Census_Division',
]
df_new = df_new2[geo_columns]
# NOTE(review): if INCIDENT_KEY is duplicated in df_new2 this join multiplies rows — confirm the key is unique.
df_filtered = pd.merge(df, df_new, on='INCIDENT_KEY', how='left').copy()

In [None]:
# Drop cities (CBSA titles) with very few fire events.
MIN_EVENTS_PER_CBSA = 5  # a CBSA title needs at least this many rows to be kept

# Row count per CBSA title (rows with a missing title form no group).
counts = df_filtered.groupby(['CBSA Title']).size().reset_index(name='count')

# Titles that reach the threshold.
valid_pairs = counts[counts['count'] >= MIN_EVENTS_PER_CBSA][['CBSA Title', 'count']]

# Keep only rows of those titles. Filtering with isin avoids merging a
# temporary 'count' column in and out (which would clash with an existing
# 'count' column); row order is preserved and the index is renumbered from 0,
# matching what the former inner merge produced.
df_filtered = (
    df_filtered[df_filtered['CBSA Title'].isin(valid_pairs['CBSA Title'])]
    .reset_index(drop=True)
)

In [None]:
# Show the column names and the (rows, columns) shape of the filtered frame.
df_filtered.columns, df_filtered.shape

In [None]:
# Category prediction: build a 3-class risk label and per-city train/test splits.

df_label = df_filtered.copy()

# Weight applied to each severity-count column SEV_1..SEV_5 (fractions scaled by 1000).
# Alternative weightings that were tried:
#vsl_proportion = [x * 1000 for x in [0.003, 0.047, 0.105, 0.266, 1]]
#vsl_proportion = [1,4,9,16,25]
vsl_proportion = [x * 1000 for x in [0.003, 0.047, 0.266, 0.593, 1]]

severity_columns = ['SEV_1', 'SEV_2', 'SEV_3', 'SEV_4', 'SEV_5']

# Weighted severity score per row. Terms are accumulated left to right
# (SEV_1 first), the same order as writing the five products out by hand.
df_label['vsl_proportion'] = sum(
    df_label[sev_col] * weight
    for sev_col, weight in zip(severity_columns, vsl_proportion)
)

# Score intervals -> class label: [0, 6] -> '0', (6, 593] -> '1', (593, inf] -> '2'.
bins = [0, 6, 593, float('inf')]
labels = ['0', '1', '2']

# Assign risk groups based on the thresholds
df_label['Risk_Group'] = pd.cut(df_label['vsl_proportion'], bins=bins, labels=labels, include_lowest=True)

# Columns removed from the feature matrix: the severity counts and derived
# targets, plus the city/region descriptors. 'CBSA Code' stays for now because
# it drives the per-city split below.
# NOTE(review): INCIDENT_KEY (the merge key) is not dropped here, so it remains
# in X as a feature — confirm that is intended.
non_feature_columns = severity_columns + [
    'vsl_proportion', 'FIRE_SPRD', 'Risk_Group', 'last_unit_clear_time',
    'CBSA Title', 'State_Abbrs', 'HHS_Region',
    'Census_Region', 'Census_Division',
]
X = df_label.drop(columns=non_feature_columns)
y = df_label['Risk_Group']


X_train_list, X_test_list = [], []
y_train_list, y_test_list = [], []

# Split every CBSA separately so each city contributes to both train and test.
# The split inside a city is a plain random 70/30 split; it is NOT stratified on y.
for region_name, group_df in X.groupby('CBSA Code'):
    # Explicit label-based lookup keeps y aligned with the rows of this group.
    y_group = y.loc[group_df.index]

    X_train_group, X_test_group, y_train_group, y_test_group = train_test_split(
        group_df, y_group, test_size=0.3, random_state=2042
    )

    # Collect the splits
    X_train_list.append(X_train_group)
    X_test_list.append(X_test_group)
    y_train_list.append(y_train_group)
    y_test_list.append(y_test_group)

# Concatenate all cities; the grouping column is not a model feature, so drop it.
X_train = pd.concat(X_train_list).drop(columns=['CBSA Code'])
X_test = pd.concat(X_test_list).drop(columns=['CBSA Code'])
y_train = pd.concat(y_train_list)
y_test = pd.concat(y_test_list)

# Every object-typed column is passed to CatBoost as a categorical feature.
categorical_features = [col for col in X_test.columns if X_test[col].dtype == 'object']

train_data = Pool(data=X_train, label=y_train, cat_features=categorical_features)
test_data = Pool(data=X_test, label=y_test, cat_features=categorical_features)



In [None]:
# Share of each risk group in the training labels (fractions sum to 1).
y_train.value_counts(normalize=True)

In [None]:
# Hyperparameters of the multi-class CatBoost classifier.
# NOTE(review): task_type='GPU' presumably requires a GPU-enabled CatBoost setup — confirm before running elsewhere.
catboost_params = dict(
    iterations=1500,
    depth=4,
    learning_rate=0.1,
    l2_leaf_reg=6,
    class_weights=[1, 1, 1],  # all three classes weighted equally
    task_type='GPU',
    loss_function='MultiClass',
    random_state=2042,
)

# Fit on the training Pool, logging every 100 iterations.
model = CatBoostClassifier(**catboost_params)
model.fit(train_data, verbose=100)

# Predicted class for every test row.
y_pred = model.predict(X_test)

# Overall accuracy and the per-class precision / recall report.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy:.2f}')
print(f'Classification Report:\n{report}')

In [None]:
# Confusion matrix of actual (rows) versus predicted (columns) classes.
cm = confusion_matrix(y_test, y_pred)

# Row-normalise so every actual-class row is expressed as proportions (0-1).
row_totals = cm.sum(axis=1, keepdims=True)
cm_normalized = cm.astype('float') / row_totals

# Display names used for the tick labels on both axes.
labels = ['Low Risk', 'Moderate Risk', 'High Risk']

# Heatmap with a fixed 0-1 colour scale.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm_normalized,
    annot=True,
    fmt='.2f',
    cmap='Blues',
    xticklabels=labels,
    yticklabels=labels,
    vmin=0,
    vmax=1,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix for Injury')
plt.show()



In [None]:
import shap



In [None]:
# Explain the fitted model on the test features with SHAP.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

# For a multi-class model the return type depends on the shap version: either
# a list with one (samples, features) array per class, or a single
# (samples, features, classes) array. Pick class index 0 in both layouts so
# the plot does not fail on the list form.
if isinstance(shap_values, list):
    class0_shap_values = shap_values[0]
else:
    class0_shap_values = shap_values[:, :, 0]

# Bar chart of feature importance for class index 0.
shap.summary_plot(class0_shap_values, X_test, plot_type='bar')

In [None]:
# Persist the model, the train/test splits, the SHAP values and the class
# probabilities under result_path.

# joblib.dump / to_pickle do not create missing folders, so make sure the
# results directory exists before writing anything.
os.makedirs(result_path, exist_ok=True)

joblib.dump(model, os.path.join(result_path, 'model_cont_inj_cat_best_gpu_city_v1.pkl'))

# File names are kept exactly as before (including the lowercase 'x_train')
# so anything that reads these artefacts keeps working.
split_files = [
    (X_test, 'X_test_inj_cat_best_gpu_city_v1.pkl'),
    (y_test, 'y_test_inj_cat_best_gpu_city_v1.pkl'),
    (X_train, 'x_train_inj_cat_best_gpu_city_v1.pkl'),
    (y_train, 'y_train_inj_cat_best_gpu_city_v1.pkl'),
]
for split_obj, file_name in split_files:
    split_obj.to_pickle(os.path.join(result_path, file_name))

joblib.dump(shap_values, os.path.join(result_path, 'shap_values_inj_cat_best_gpu_city_v1.pkl'))

# Per-class predicted probabilities for the test rows.
probs = model.predict_proba(X_test)
joblib.dump(probs, os.path.join(result_path, 'y_prob_inj_cat_best_gpu_city_v1.pkl'))

## Note: in V7, I deleted all fires that are too small (det_operate=1) or that imply the fire was ignited by another fire: df=df[df['CAUSE_IGN']!='0'], df.drop(columns=['DET_OPERAT'],inplace=True), df=df[df['HEAT_SOURCE_new']!='8']
