In [None]:
# The purpose of this notebook was to explore whether or not pulling ACS 5-year averages on a yearly basis would 
# improve model performance

# Early results indicated "no" but only an average of ACS features across years was tried

In [12]:
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pylab as pl
from matplotlib.lines import Line2D
from shapely.geometry import Point
import geopandas
import seaborn as sns
import glob
from scipy import stats
from imblearn import under_sampling, over_sampling 
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import log_loss
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import pairwise
from sklearn.model_selection import cross_val_score
from scipy.spatial.distance import squareform
from scipy.spatial.distance import pdist
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA


%matplotlib inline
plt.style.use('dark_background')
pd.set_option('display.max_columns',500)
sns.set()

import random

SEED = 111
random.seed(SEED)
np.random.seed(SEED)


In [13]:
p = Path.cwd()
data_path = p.parent.parent / 'data' / 'Master Project Data' 
nfirs_path =  data_path / 'NFIRS Fire Incident Data.csv'

# List the columns you want to download from the NFIRS csv
cols_to_use = ['state','fdid','inc_date','oth_inj','oth_death','prop_loss',
               'cont_loss','tot_loss','geoid']

# Specify particular data type for geoid column
col_dtypes = {'geoid':str}

# Read in NFIRS dataframe
nfirs = pd.read_csv(nfirs_path,
                    dtype = col_dtypes,
                    usecols = cols_to_use,
                    encoding='latin-1')

# Convert inc_date column values to python datetime type
nfirs['inc_date'] = pd.to_datetime(nfirs['inc_date'], infer_datetime_format=True)


#Read in recent ACS dataframes by year and state
data_path_1 = data_path / 'Recent_ACS_Data' / 'Tennessee2013.csv'
data_path_2 = data_path / 'Recent_ACS_Data' / 'Tennessee2014.csv'
data_path_3 = data_path / 'Recent_ACS_Data' / 'Tennessee2015.csv'
data_path_4 = data_path / 'Recent_ACS_Data' / 'Tennessee2016.csv'
ACS_TN_13 = pd.read_csv(data_path_1, dtype = {'GEOID':'object'}, index_col = 2)
ACS_TN_14 = pd.read_csv(data_path_2, dtype = {'GEOID':'object'}, index_col = 2)
ACS_TN_15 = pd.read_csv(data_path_3, dtype = {'GEOID':'object'}, index_col = 2)
ACS_TN_16 = pd.read_csv(data_path_4, dtype = {'GEOID':'object'}, index_col = 2)
ACS_TN_13['year']='2013'
ACS_TN_14['year']='2014'
ACS_TN_15['year']='2015'
ACS_TN_16['year']='2016'
ACS_TN_13['state']='TN'
ACS_TN_14['state']='TN'
ACS_TN_15['state']='TN'
ACS_TN_16['state']='TN'
ACS_TN=pd.concat([ACS_TN_13, ACS_TN_14, ACS_TN_15, ACS_TN_16])


data_path_1 = data_path / 'Recent_ACS_Data' / 'California2013.csv'
data_path_2 = data_path / 'Recent_ACS_Data' / 'California2014.csv'
data_path_3 = data_path / 'Recent_ACS_Data' / 'California2015.csv'
data_path_4 = data_path / 'Recent_ACS_Data' / 'California2016.csv'
ACS_CA_13 = pd.read_csv(data_path_1, dtype = {'GEOID':'object'}, index_col = 2)
ACS_CA_14 = pd.read_csv(data_path_2, dtype = {'GEOID':'object'}, index_col = 2)
ACS_CA_15 = pd.read_csv(data_path_3, dtype = {'GEOID':'object'}, index_col = 2)
ACS_CA_16 = pd.read_csv(data_path_4, dtype = {'GEOID':'object'}, index_col = 2)
ACS_CA_13['year']='2013'
ACS_CA_14['year']='2014'
ACS_CA_15['year']='2015'
ACS_CA_16['year']='2016'
ACS_CA_13['state']='CA'
ACS_CA_14['state']='CA'
ACS_CA_15['state']='CA'
ACS_CA_16['state']='CA'
ACS_CA=pd.concat([ACS_CA_13, ACS_CA_14, ACS_CA_15, ACS_CA_16])


data_path_1 = data_path / 'Recent_ACS_Data' / 'Minnesota2013.csv'
data_path_2 = data_path / 'Recent_ACS_Data' / 'Minnesota2014.csv'
data_path_3 = data_path / 'Recent_ACS_Data' / 'Minnesota2015.csv'
data_path_4 = data_path / 'Recent_ACS_Data' / 'Minnesota2016.csv'
ACS_MN_13 = pd.read_csv(data_path_1, dtype = {'GEOID':'object'}, index_col = 2)
ACS_MN_14 = pd.read_csv(data_path_2, dtype = {'GEOID':'object'}, index_col = 2)
ACS_MN_15 = pd.read_csv(data_path_3, dtype = {'GEOID':'object'}, index_col = 2)
ACS_MN_16 = pd.read_csv(data_path_4, dtype = {'GEOID':'object'}, index_col = 2)
ACS_MN_13['year']='2013'
ACS_MN_14['year']='2014'
ACS_MN_15['year']='2015'
ACS_MN_16['year']='2016'
ACS_MN_13['state']='MN'
ACS_MN_14['state']='MN'
ACS_MN_15['state']='MN'
ACS_MN_16['state']='MN'
ACS_MN=pd.concat([ACS_MN_13, ACS_MN_14, ACS_MN_15, ACS_MN_16])

# Create ACS dataframe for each year 13-16
ACS_13=pd.concat([ACS_TN_13, ACS_CA_13, ACS_MN_13])
ACS_14=pd.concat([ACS_TN_14, ACS_CA_14, ACS_MN_14])
ACS_15=pd.concat([ACS_TN_15, ACS_CA_15, ACS_MN_15])
ACS_16=pd.concat([ACS_TN_16, ACS_CA_16, ACS_MN_16])

# Create ACS dataframe for all states and all years
ACS=pd.concat([ACS_TN, ACS_CA, ACS_MN])


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


(125792, 126)
31448
4    31448
Name: NAME, dtype: int64


In [14]:
# ACS Munging

# Ensures GEOID variable is in the correct format and sets it as the dataframe index  
ACS_13.set_index(['GEOID'],inplace = True)
ACS_14.set_index(['GEOID'],inplace = True)
ACS_15.set_index(['GEOID'],inplace = True)
ACS_16.set_index(['GEOID'],inplace = True)
ACS_13=ACS_13.dropna(axis=0)

# Captures state properies of GEOIDs for later use before filtering dataframe to be numeric features only
States_13 = ACS_13[['state']]
States_14 = ACS_14[['state']]
States_15 = ACS_15[['state']]
States_16 = ACS_16[['state']]

# Removes extraneous features (i.e. non-numeric) in the dataframe
if 'Unnamed: 0' in ACS_13.columns:
    ACS_13.drop('Unnamed: 0','columns',inplace= True)
if 'Unnamed: 0' in ACS_14.columns:
    ACS_14.drop('Unnamed: 0','columns',inplace= True)
if 'Unnamed: 0' in ACS_15.columns:
    ACS_15.drop('Unnamed: 0','columns',inplace= True)
if 'Unnamed: 0' in ACS_16.columns:
    ACS_16.drop('Unnamed: 0','columns',inplace= True)


if 'NAME' in ACS_13.columns:
    ACS_13.drop('NAME','columns',inplace= True)
if 'NAME' in ACS_14.columns:
    ACS_14.drop('NAME','columns',inplace= True)
if 'NAME' in ACS_15.columns:
    ACS_15.drop('NAME','columns',inplace= True)
if 'NAME' in ACS_16.columns:
    ACS_16.drop('NAME','columns',inplace= True)


if 'inc_pcincome' in ACS_13.columns:
    ACS_13.drop('inc_pcincome','columns',inplace= True)
if 'inc_pcincome' in ACS_14.columns:
    ACS_14.drop('inc_pcincome','columns',inplace= True)
if 'inc_pcincome' in ACS_15.columns:
    ACS_15.drop('inc_pcincome','columns',inplace= True)
if 'inc_pcincome' in ACS_16.columns:
    ACS_16.drop('inc_pcincome','columns',inplace= True)

    
if 'in_poverty' in ACS_13.columns:
    ACS_13.drop('in_poverty','columns',inplace= True)
if 'in_poverty' in ACS_14.columns:
    ACS_14.drop('in_poverty','columns',inplace= True)
if 'in_poverty' in ACS_15.columns:
    ACS_15.drop('in_poverty','columns',inplace= True)
if 'in_poverty' in ACS_16.columns:
    ACS_16.drop('in_poverty','columns',inplace= True)


# Creates vector of total populations for each census block to be used to normalize total fires per year variable
tot_pop_13 = ACS_13[['tot_population']]
tot_pop_13_14 = pd.concat([tot_pop_13, ACS_14[['tot_population']]]).groupby(level=0).mean()
tot_pop_13_14_15 = pd.concat([tot_pop_13, ACS_14[['tot_population']], ACS_15[['tot_population']]]).groupby(level=0).mean()


# Drop all total count columns in ACS and keeps all percentage columns
cols = ACS_13.columns.to_list()
for col in cols:
    if  col.find('tot') != -1 : 
        ACS_13.drop(col,'columns', inplace = True)
        
cols = ACS_14.columns.to_list()
for col in cols:
    if  col.find('tot') != -1 : 
        ACS_14.drop(col,'columns', inplace = True)
        
cols = ACS_15.columns.to_list()
for col in cols:
    if  col.find('tot') != -1 : 
        ACS_15.drop(col,'columns', inplace = True)
        
cols = ACS_16.columns.to_list()
for col in cols:
    if  col.find('tot') != -1 : 
        ACS_16.drop(col,'columns', inplace = True)

ACS_13.drop('state','columns',inplace= True)
ACS_14.drop('state','columns',inplace= True)
ACS_15.drop('state','columns',inplace= True)
ACS_16.drop('state','columns',inplace= True)


In [15]:
# Find correlated features in ACS dataset and identify the highly correlated relationships

# Create ACS correlation matrix
corr = ACS.corr()
 
# Filtering out lower/upper triangular duplicates 
corr_high = corr[abs(corr) > 0.75].stack().reset_index()
corr_high = corr_high[corr_high['level_0'].astype(str)!=corr_high['level_1'].astype(str)]
corr_high['ordered-cols'] = corr_high.apply(lambda x: '-'.join(sorted([x['level_0'],x['level_1']])),axis=1)
corr_high = corr_high.drop_duplicates(['ordered-cols'])
corr_high.drop(['ordered-cols'], axis=1, inplace=True)
corr_high.columns = ['Pair Var 1', 'Pair Var 2', 'Corr Value']

# Display highly correlated pairs
corr_high.sort_values(by=['Corr Value'], ascending=False)

# From highly correlated pairs, remove one of the Pair Vars from the ACS dataset except for the 'mort' variables
ACS_13 = ACS_13.drop(['house_pct_vacant', 'house_pct_non_family', 'house_pct_rent_occupied',
                           'race_pct_nonwhite', 'race_pct_nonwhitenh', 'house_pct_incomplete_plumb',
                           'house_pct_incomplete_kitchen', 'race_pct_whitenh', 'year'], axis=1) 
ACS_14 = ACS_14.drop(['house_pct_vacant', 'did_not_work_past_12_mo', 'house_pct_non_family', 'house_pct_rent_occupied',
                           'race_pct_nonwhite', 'race_pct_nonwhitenh', 'house_pct_incomplete_plumb',
                           'house_pct_incomplete_kitchen', 'race_pct_whitenh', 'year'], axis=1) 
ACS_15 = ACS_15.drop(['house_pct_vacant', 'did_not_work_past_12_mo', 'house_pct_non_family', 'house_pct_rent_occupied',
                           'race_pct_nonwhite', 'race_pct_nonwhitenh', 'house_pct_incomplete_plumb',
                           'house_pct_incomplete_kitchen', 'race_pct_whitenh', 'year'], axis=1) 
ACS_16 = ACS_16.drop(['house_pct_vacant', 'did_not_work_past_12_mo', 'house_pct_non_family', 'house_pct_rent_occupied',
                           'race_pct_nonwhite', 'race_pct_nonwhitenh', 'house_pct_incomplete_plumb',
                           'house_pct_incomplete_kitchen', 'race_pct_whitenh', 'year'], axis=1) 

# Based on feature importance experiments, select features with consistence importance across annual predictions
ACS_13 = ACS_13[['house_yr_pct_earlier_1939', 'house_pct_occupied', 'house_pct_family_married', 'race_pct_black',
          'heat_pct_fueloil_kerosene', 'educ_bachelors', 'house_pct_live_alone', 
          'educ_some_col_no_grad', 'house_pct_ownd_occupied', 'house_w_home_equity_loan', 'house_val_175K_200K',
           'house_val_200K_250K']]
ACS_14 = ACS_14[['house_yr_pct_earlier_1939', 'house_pct_occupied', 'house_pct_family_married', 'race_pct_black',
          'worked_past_12_mo', 'heat_pct_fueloil_kerosene', 'educ_bachelors', 'house_pct_live_alone', 
          'educ_some_col_no_grad', 'house_pct_ownd_occupied', 'house_w_home_equity_loan', 'house_val_175K_200K',
           'house_val_200K_250K']]
ACS_15 = ACS_15[['house_yr_pct_earlier_1939', 'house_pct_occupied', 'house_pct_family_married', 'race_pct_black',
          'worked_past_12_mo', 'heat_pct_fueloil_kerosene', 'educ_bachelors', 'house_pct_live_alone', 
          'educ_some_col_no_grad', 'house_pct_ownd_occupied', 'house_w_home_equity_loan', 'house_val_175K_200K',
           'house_val_200K_250K']]
ACS_16 = ACS_16[['house_yr_pct_earlier_1939', 'house_pct_occupied', 'house_pct_family_married', 'race_pct_black',
          'worked_past_12_mo', 'heat_pct_fueloil_kerosene', 'educ_bachelors', 'house_pct_live_alone', 
          'educ_some_col_no_grad', 'house_pct_ownd_occupied', 'house_w_home_equity_loan', 'house_val_175K_200K',
           'house_val_200K_250K']]

# Segment ACS dataframes with moving average for successive year predictions
ACS_13_14 = pd.concat([ACS_13, ACS_14]).groupby(level=0).mean()
ACS_13_14_15 = pd.concat([ACS_13, ACS_14, ACS_15]).groupby(level=0).mean()





(30509, 12)
(31448, 13)
(31448, 13)


In [16]:
#NFIRS Munging

# Ensure correct calculation of tot_loss column 
nfirs['tot_loss'] = nfirs['prop_loss'] + nfirs['cont_loss']

# Create mask for new severe fire variable
sev_fire_mask = (nfirs['oth_death'] > 0) | (nfirs['oth_inj'] > 0) | (nfirs['tot_loss'] >= 10000)

# By default assigns values of severe fire column as not severe
nfirs['severe_fire'] = 'not_sev_fire'

# Applies filter to severe fire column to label the severe fire instances correctly
nfirs.loc[sev_fire_mask,'severe_fire'] = 'sev_fire'

# Create new NFIRS variables based on specified thresholds of existing variables in dataframe
nfirs['had_inj'] = np.where(nfirs['oth_inj']>0,'had_inj','no_inj')
nfirs['had_death'] = np.where(nfirs['oth_death']>0,'had_death','no_death')
nfirs['10k_loss'] = np.where(nfirs['tot_loss']>=10000,'had_10k_loss','no_10k_loss')

# Extract just the numeric portion of the geoid
nfirs['geoid'] =  nfirs['geoid'].str.strip('#_')

# Add a year column to be used to groupby in addition to geoid
nfirs['year'] = nfirs['inc_date'].dt.year.astype('str')
nfirs.set_index('geoid',inplace = True)
nfirs_CA = nfirs[nfirs['state']=='CA']
nfirs_TN = nfirs[nfirs['state']=='TN']
nfirs_MN = nfirs[nfirs['state']=='MN']
nfirs=pd.concat([nfirs_CA, nfirs_TN, nfirs_MN])
nfirs.drop('state','columns',inplace= True)


In [17]:
## Adjust total fires per year by the population counts

# Creates dataframe that shows the number of fires in each census block each year
fires =  pd.crosstab(nfirs.index,nfirs['year'])
fires.index.rename('GEOID',inplace = True)

# Grab total population values pulled from ACS dataframe and assign to each census block in NFIRS dataframe
fires1 = fires.merge(tot_pop_13, how = 'left', left_index = True, right_index = True)
fires2 = fires.merge(tot_pop_13_14, how = 'left', left_index = True, right_index = True)
fires3 = fires.merge(tot_pop_13_14_15, how = 'left', left_index = True, right_index = True)

# Remove resulting NaN/infinity values following merge
fires1.replace([np.inf, -np.inf], np.nan,inplace = True)
fires1.dropna(inplace = True)
fires2.replace([np.inf, -np.inf], np.nan,inplace = True)
fires2.dropna(inplace = True)
fires3.replace([np.inf, -np.inf], np.nan,inplace = True)
fires3.dropna(inplace = True)

# drop rows with no population count
fires1 = fires1[fires1['tot_population'] != 0 ] 
fires2 = fires2[fires2['tot_population'] != 0 ] 
fires3 = fires3[fires3['tot_population'] != 0 ] 

# population adjustment
fires1.loc[:,'2010':'2014'] = fires1.loc[:,'2010':'2014'].div(fires1['tot_population'], axis = 'index') * 1000
fires1 = fires1.loc[:,'2010':'2014']
fires2.loc[:,'2011':'2015'] = fires2.loc[:,'2011':'2015'].div(fires2['tot_population'], axis = 'index') * 1000
fires2 = fires2.loc[:,'2011':'2015']
fires3.loc[:,'2012':'2016'] = fires3.loc[:,'2012':'2016'].div(fires3['tot_population'], axis = 'index') * 1000
fires3 = fires3.loc[:,'2012':'2016']


# view fires by year across geoids; displays additional information regarding # of fires in higher percentile categories
fires1.describe(percentiles=[.75, .85, .9 ,.95, .99])
fires2.describe(percentiles=[.75, .85, .9 ,.95, .99])
fires3.describe(percentiles=[.75, .85, .9 ,.95, .99])

# define variables to indicate census blocks in the top 10% percent of fire risk scores
top10_1 = fires1 > fires1.quantile(.9)
top10_2 = fires2 > fires2.quantile(.9)
top10_3 = fires3 > fires3.quantile(.9)


In [18]:
# Function to upsample or downsample our dataframe features if we have unbalanced classes

def resample_df(X,y,upsample=True,seed = SEED):
    from sklearn.utils import resample
    # check which of our two classes is overly represented 
    if np.mean(y) > .5:
        major,minor = 1,0
    else:
        major,minor = 0, 1
    
    # Add Class feature to dataframe equal to our existing dependent variable
    X['Class'] = y
    
    df_major = X[X.Class == major ]
    df_minor = X[X.Class == minor ]
    

    if upsample:      
    
        df_minor_resampled = resample(df_minor,
                                     replace = True,
                                     n_samples = df_major.shape[0], 
                                     random_state = seed)
    
    
   
        combined = pd.concat([df_major,df_minor_resampled])
        
        # Debug
        #print('minor class {}, major class {}'.format(df_minor_resampled.shape[0],
                                                       #df_major.shape[0]))
    
        
    else: # downsample
         
        df_major_resampled = resample(df_major,
                                     replace = False,
                                     n_samples = df_minor.shape[0],
                                     random_state = seed)
        
        
        combined = pd.concat([df_major_resampled,df_minor])
        
        #print('minor class {}, major class {}'.format(df_minor.shape[0],
                                                      #df_major_resampled.shape[0]))


    
    
    y_out = combined['Class']
    X_out = combined.drop('Class', axis =1)
    return X_out , y_out

In [19]:
# Function to train model that predicts whether each census block is in the top 10% percent of fire risk scores
def train_model(top10=pd.DataFrame(),fires=pd.DataFrame(), ACS = pd.DataFrame(), modeltype='LogisticRegression', seed = SEED):
    from scipy.stats import zscore
    
    
    # Define model types & parameters 
    
    if modeltype =='LogisticRegression':
        from sklearn.linear_model import LogisticRegression
        model = LogisticRegression(warm_start=True,
                                   class_weight = 'balanced',
                                   max_iter = 1000)

        
    elif modeltype =='BalBagged':
        from imblearn.ensemble import BalancedBaggingClassifier
        from sklearn.tree import DecisionTreeClassifier
        model = BalancedBaggingClassifier(base_estimator=DecisionTreeClassifier(),
                                          n_estimators=80, sampling_strategy='auto',
                                          random_state=0)
        
    elif modeltype =='BalRF':
        from imblearn.ensemble import BalancedRandomForestClassifier
        model = BalancedRandomForestClassifier(n_estimators=80, sampling_strategy='auto',
                                               max_depth=10, random_state=0,
                                              max_features=None, min_samples_leaf=40)

    elif modeltype =='Bagged':
        from sklearn.ensemble import BaggingClassifier
        from sklearn.tree import DecisionTreeClassifier
        model = BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                                    n_estimators=40,
                                    random_state=0)
    
    elif modeltype =='RF':
        from imblearn.ensemble import BalancedRandomForestClassifier
        model = BalancedRandomForestClassifier(n_estimators=60,
                                          warm_start = False,
                                          max_depth = 10,
                                            random_state = 0)
    
    
    # Create framework to predict whether a given census block has a fire risk score in the 90th percentile 
            # based on the specific number of previous years' data

    # Should alter which "fires#" you use based on the model year   
    X = fires1.iloc[:,0:4].copy()
        
    sm = np.sum(X, axis = 1 )
    mu = np.mean(X, axis = 1)
    mx = np.max(X, axis =1)
    X['Sum']  = sm
    X['Mean'] = mu
    X['Max']  = mx
    y = top10_1.iloc[:,4]
    
    # merge in ACS Data into X unless NFIRS-Only model
    X=X[['Sum','Mean','Max']] # drop all other NFIRS columns that have low feature importance scores
    # Should alter the ACS dataframe use based on the model year you want to run
    X = X.merge(ACS_13, how ='left',left_index = True, right_index = True)
    X = X.dropna()
    y = y.filter(X.index)
        
        
        
    # Create 80/20 training/testing set split
    X_train, X_test, y_train, y_test = train_test_split(X, y,test_size = .2 )
        
    # Perform resampling if data classes are unbalanced
    X_train, y_train = resample_df(X_train,y_train)
    
    
    # Perform cross-validation 
        
    #scaler = preprocessing.StandardScaler().fit(X)
    #scaler.transform(X)
    #print ('Cross Val Score:')
    #print(cross_val_score(model, X, y))
        
        
    # Standardize features by removing the mean and scaling to unit variance
        
    scaler = preprocessing.StandardScaler().fit(X_train)
    scaler.transform(X_train)
    scaler.transform(X_test)
        
        
    # Fit model to training set
    model = model.fit(X_train,y_train)

        
    # Calculate training set performance
        
    train_prediction_probs = model.predict_proba(X_train)
    train_predictions = model.predict(X_train)
    print (confusion_matrix(y_train, train_predictions))
    print (roc_auc_score(y_train, train_prediction_probs[:,1]))
        
        
    # Calculate test set performance
        
    test_prediction_probs = model.predict_proba(X_test)
    test_predictions = model.predict(X_test)
    print (confusion_matrix(y_test, test_predictions))
    print (roc_auc_score(y_test, test_prediction_probs[:,1]))
    print (classification_report(y_test,test_predictions))
    print (log_loss(y_test,test_predictions))
        
        
    #Calculate feature importance for each model
        
    if modeltype=="LogisticRegression":
        feature_importance = {}
        for coef, feat in zip(abs(model.coef_[0]),X_test.columns.tolist()):
            feature_importance[feat] = coef
        print("Feature ranking:")
        print (feature_importance)
    else:
        if modeltype=="RF" or modeltype=="BalRF":
            importances = model.feature_importances_
        elif modeltype=="Bagged":
            importances = np.mean([model.estimators_[i].feature_importances_ for i 
                            in range(len(model.estimators_))], axis=0)
        elif modeltype=="BalBagged":
            importances = np.mean([model.estimators_[i].steps[1][1].feature_importances_ for i 
                            in range(len(model.estimators_))], axis=0)
        
        indices = np.argsort(importances)[::-1]
        print("Feature ranking:")
        for f in range(len(X_test.columns)):
            print("%d. %s (%f)" % (f + 1, X_test.columns[indices[f]], importances[indices[f]]))
    
    
            
    return model,X_test,y_test
        


In [20]:
# Train NFIRS + ACS Model and output prediction performance metrics for each year
# To predict 2014 use top10_1 and ACS_13
# To predict 2015 use top10_2 and ACS_13_14
# To predict 2016 use top10_3 and ACS_13_14_15
mdl,X_test,y_test =train_model(top10_1,fires1,ACS = ACS_13, modeltype='BalRF')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Class'] = y


[[16388  2983]
 [ 2962 16409]]
0.9313336789348324
[[3903  920]
 [ 228  326]]
0.7883958558980696
              precision    recall  f1-score   support

       False       0.94      0.81      0.87      4823
        True       0.26      0.59      0.36       554

    accuracy                           0.79      5377
   macro avg       0.60      0.70      0.62      5377
weighted avg       0.87      0.79      0.82      5377

7.374233016736703
Feature ranking:
1. Max (0.429868)
2. house_pct_live_alone (0.072803)
3. Mean (0.060505)
4. Sum (0.058982)
5. house_yr_pct_earlier_1939 (0.052781)
6. race_pct_black (0.051319)
7. house_pct_family_married (0.050737)
8. house_pct_occupied (0.047937)
9. educ_bachelors (0.040676)
10. house_pct_ownd_occupied (0.033498)
11. educ_some_col_no_grad (0.028150)
12. house_w_home_equity_loan (0.026294)
13. house_val_200K_250K (0.022427)
14. house_val_175K_200K (0.016241)
15. heat_pct_fueloil_kerosene (0.007782)
