# Tree Models for Project 3

## Setup

### Import Libraries

In [60]:
import pandas as pd
import os

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from sklearn import tree
from sklearn.ensemble import RandomForestClassifier

from operator import itemgetter

### Set File Locations

In [61]:
# Note: some of the raw data files are very large; those very large files are
# kept in a gitignored directory.

# Relative path to the cleaned, merged dataset that this notebook reads.
merged_data_csv = "../00_Data/cleaned_data/cleaned_merged_data.csv"

## Import Data

In [62]:
# Load the cleaned, merged dataset into a DataFrame.
data_df = pd.read_csv(merged_data_csv)

# Summarise every column (name, non-null count, dtype).
# `show_counts` is the current name of this option: the older `null_counts`
# keyword was deprecated in pandas 1.2 and removed in pandas 2.0.
data_df.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 220423 entries, 0 to 220422
Data columns (total 240 columns):
 #   Column                            Non-Null Count   Dtype  
---  ------                            --------------   -----  
 0   fips_block_group                  220423 non-null  int64  
 1   state                             220423 non-null  float64
 2   state_name                        220423 non-null  object 
 3   county                            220423 non-null  float64
 4   county_name                       220423 non-null  object 
 5   tract                             220423 non-null  float64
 6   block_group                       220423 non-null  float64
 7   flag                              220423 non-null  float64
 8   land_area                         220423 non-null  float64
 9   aian_land                         220423 non-null  float64
 10  urbanized_area_pop_cen_2010       220423 non-null  float64
 11  urban_cluster_pop_cen_2010        220423 non-null  

## Prep Data

In [4]:
# Human-readable class labels.
# NOTE(review): presumably 0.0 -> "negative" and 1.0 -> "positive"; the list is
# not passed to any report in the visible cells - confirm before relying on it.
target_names = ["negative", "positive"]

# The column every model in this notebook is trained to predict.
target = data_df["has_superfund"]

In [71]:
# Prepare the features
# Drop all the columns that came in from the site data. This prevents 'trailing indicators' from getting into the model.
# Also drop any column that shouldn't mathematically matter, such as FIPS, tract, etc.
# The target column ('has_superfund') is excluded as well so it cannot be used as a feature.
exclusion_list = ['fips_block_group',
            'state',
            'state_name',
            'county',
            'county_name',
            'tract',
            'block_group',
            'has_superfund',
            'fips_full',
            'address',
            'city',
            'date_added',
            'federal_facility_ind',
            'federal_register_url',
            'geocode_source',
            'latitude',
            'longitude',
            'site_epa_id',
            'site_name',
            'site_narrative_url',
            'site_progress_url',
            'site_score',
            'site_text'
            ]

# Build the feature frame in one non-mutating step. `drop(columns=...)` returns a
# new DataFrame and leaves `data_df` untouched, so no explicit copy, no in-place
# mutation and no intermediate sub-frame of the excluded columns is needed.
feature_df = data_df.drop(columns=exclusion_list)
feature_names = feature_df.columns

# `show_counts` replaces the `null_counts` keyword, which was deprecated in
# pandas 1.2 and removed in pandas 2.0.
feature_df.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 220423 entries, 0 to 220422
Data columns (total 217 columns):
 #   Column                            Non-Null Count   Dtype  
---  ------                            --------------   -----  
 0   flag                              220423 non-null  float64
 1   land_area                         220423 non-null  float64
 2   aian_land                         220423 non-null  float64
 3   urbanized_area_pop_cen_2010       220423 non-null  float64
 4   urban_cluster_pop_cen_2010        220423 non-null  float64
 5   rural_pop_cen_2010                220423 non-null  float64
 6   tot_population_cen_2010           220423 non-null  float64
 7   tot_population_acs_09_13          220423 non-null  float64
 8   males_cen_2010                    220423 non-null  float64
 9   males_acs_09_13                   220423 non-null  float64
 10  females_cen_2010                  220423 non-null  float64
 11  females_acs_09_13                 220423 non-null  

### Train/Test Split

In [72]:
# Split features and target into train and test partitions.
# No test size is given, so scikit-learn's default applies; the fixed seed keeps
# the partition repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    feature_df,
    target,
    random_state=42,
)

## Tree Classifier

In [73]:
# Baseline model: a single decision tree with default hyper-parameters.
# random_state pins the estimator's internal randomness so that re-running the
# notebook reproduces the same tree (and therefore the same reports and
# feature importances); the value matches the seed used for the split.
clf = tree.DecisionTreeClassifier(random_state=42)
clf = clf.fit(X_train, y_train)
# Mean accuracy on the held-out split (shown as the cell output).
clf.score(X_test, y_test)

0.9854280840561827

In [74]:
# Break the score down per class with a classification report
# (precision, recall and F1 for each label) on the held-out split.
clf_predictions = clf.predict(X_test)
clf_report = classification_report(y_true=y_test, y_pred=clf_predictions)
print(clf_report)

              precision    recall  f1-score   support

         0.0       0.99      0.99      0.99     54790
         1.0       0.05      0.09      0.07       316

    accuracy                           0.99     55106
   macro avg       0.52      0.54      0.53     55106
weighted avg       0.99      0.99      0.99     55106



In [42]:
# Map each feature name to its importance in the fitted decision tree.
clf_feature_importance = dict(zip(feature_names, clf.feature_importances_))
# Cell output: the same mapping ordered from most to least important.
dict(sorted(clf_feature_importance.items(), key=itemgetter(1), reverse=True))

{'pct_diff_hu_1yr_ago_acs_09_13': 0.01711118035654353,
 'pct_pop_5_17_acs_09_13': 0.014829741085691066,
 'pct_no_ph_srvc_acs_09_13': 0.014135845152171552,
 'land_area': 0.014011705022237387,
 'med_hhd_inc_tr_acs_09_13': 0.013814118071819063,
 'pct_pop_65plus_cen_2010': 0.013242544874239927,
 'pct_pop_45_64_acs_09_13': 0.011218805438181911,
 'mail_return_rate_cen_2010': 0.010531776637884894,
 'pct_males_acs_09_13': 0.010361247626389526,
 'med_house_value_tr_acs_09_13': 0.010279968355049482,
 'pct_pop_25_44_acs_09_13': 0.010218134647674203,
 'pct_not_hs_grad_acs_09_13': 0.010009337125180316,
 'pct_pop_under_5_cen_2010': 0.009977396913903723,
 'pct_rel_under_6_cen_2010': 0.009954927137558833,
 'pct_hispanic_cen_2010': 0.009920473286211349,
 'low_response_score': 0.00988568431319622,
 'pct_othr_lang_acs_09_13': 0.009656690165463458,
 'pct_nh_sor_alone_acs_09_13': 0.009646757147122187,
 'pct_census_mail_returns_cen_2010': 0.009502483886838588,
 'female_no_hb_cen_2010': 0.00934470802272645,


## Random Forest Classifier

In [10]:
# Random forest of 200 trees with otherwise default hyper-parameters.
# random_state pins the forest's internal randomness so that re-running the
# notebook reproduces the same model; the value matches the seed used for the split.
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf = rf.fit(X_train, y_train)
# Mean accuracy on the held-out split (shown as the cell output).
rf.score(X_test, y_test)

0.9947192683192393

In [25]:
# Per-class precision / recall / F1 for the random forest on the held-out split.
rf_predictions = rf.predict(X_test)
rf_report = classification_report(y_true=y_test, y_pred=rf_predictions)
print(rf_report)

              precision    recall  f1-score   support

         0.0       0.99      1.00      1.00     54790
         1.0       1.00      0.08      0.15       316

    accuracy                           0.99     55106
   macro avg       1.00      0.54      0.57     55106
weighted avg       0.99      0.99      0.99     55106



In [43]:
# Map each feature name to its importance in the fitted random forest.
rf_feature_importance = dict(zip(feature_names, rf.feature_importances_))
# Cell output: the same mapping ordered from most to least important.
dict(sorted(rf_feature_importance.items(), key=itemgetter(1), reverse=True))

{'land_area': 0.012964393563242731,
 'med_hhd_inc_tr_acs_09_13': 0.007681368430074696,
 'pct_females_acs_09_13': 0.007574385060384785,
 'pct_females_cen_2010': 0.007335122259670448,
 'pct_males_cen_2010': 0.0073065230317976414,
 'pct_nh_white_alone_cen_2010': 0.007297228054261189,
 'med_house_value_tr_acs_09_13': 0.00727303298458742,
 'pct_nh_white_alone_acs_09_13': 0.007097865594986473,
 'pct_males_acs_09_13': 0.006963919351585136,
 'pct_diff_hu_1yr_ago_acs_09_13': 0.006868056707325203,
 'pct_hispanic_cen_2010': 0.006364686090385099,
 'pct_pop_45_64_acs_09_13': 0.00626341974841458,
 'pct_census_mail_returns_cen_2010': 0.006250638916719831,
 'pct_female_no_hb_cen_2010': 0.006227837438286827,
 'pct_nh_asian_alone_cen_2010': 0.006142142650498974,
 'pct_pop_5_17_acs_09_13': 0.006130918714626346,
 'pct_mailback_count_cen_2010': 0.0060784390239122,
 'pct_one_health_ins_acs_09_13': 0.006050462618810506,
 'pct_college_acs_09_13': 0.005960504995439208,
 'pct_frst_frms_cen_2010': 0.005949272702

## Balanced Tree Classifier

In [13]:
# Decision tree with class_weight='balanced', which asks scikit-learn to weight
# the classes inversely to their frequency in the training labels.
# random_state pins the estimator's internal randomness so that re-running the
# notebook reproduces the same tree; the value matches the seed used for the split.
bclf = tree.DecisionTreeClassifier(class_weight='balanced', random_state=42)
bclf = bclf.fit(X_train, y_train)
# Mean accuracy on the held-out split (shown as the cell output).
bclf.score(X_test, y_test)

0.986897978441549

In [22]:
# Per-class precision / recall / F1 for the class-balanced tree on the held-out split.
bclf_predictions = bclf.predict(X_test)
bclf_report = classification_report(y_true=y_test, y_pred=bclf_predictions)
print(bclf_report)

              precision    recall  f1-score   support

         0.0       0.99      0.99      0.99     54790
         1.0       0.06      0.09      0.07       316

    accuracy                           0.99     55106
   macro avg       0.53      0.54      0.53     55106
weighted avg       0.99      0.99      0.99     55106



In [44]:
# Map each feature name to its importance in the class-balanced decision tree.
bclf_feature_importance = dict(zip(feature_names, bclf.feature_importances_))
# Cell output: the same mapping ordered from most to least important.
dict(sorted(bclf_feature_importance.items(), key=itemgetter(1), reverse=True))

{'land_area': 0.11753774131889692,
 'low_response_score': 0.02858976258031158,
 'mlt_u2_9_strc_acs_09_13': 0.019225915441889792,
 'pct_female_no_hb_cen_2010': 0.014987255260355986,
 'pct_nh_white_alone_acs_09_13': 0.014936926823905515,
 'urbanized_area_pop_cen_2010': 0.014444867236379371,
 'pct_hhd_moved_in_acs_09_13': 0.013268682994058313,
 'pct_pop_5_17_cen_2010': 0.012705735096406566,
 'mrdcple_fmly_hhd_acs_09_13': 0.01245412378109287,
 'pct_pop_18_24_acs_09_13': 0.012363741592317044,
 'pct_hhd_ppl_und_18_acs_09_13': 0.012248810424885197,
 'pct_rel_under_6_acs_09_13': 0.011953887186106614,
 'pct_pop_45_64_acs_09_13': 0.01194876101532314,
 'pop_under_5_acs_09_13': 0.011803322292735518,
 'pct_renter_occp_hu_cen_2010': 0.011484078516518002,
 'pct_not_hs_grad_acs_09_13': 0.01134172852057825,
 'occp_u_no_ph_srvc_acs_09_13': 0.011235459612806456,
 'pct_rel_under_6_cen_2010': 0.010696752911103478,
 'pct_college_acs_09_13': 0.010674044719388471,
 'pct_sngl_prns_hhd_acs_09_13': 0.01050511418

## Balanced Random Forest Classifier

In [16]:
# Random forest of 200 trees with class_weight='balanced', which asks
# scikit-learn to weight the classes inversely to their frequency.
# random_state pins the forest's internal randomness so that re-running the
# notebook reproduces the same model; the value matches the seed used for the split.
brf = RandomForestClassifier(n_estimators=200, class_weight='balanced', random_state=42)
brf = brf.fit(X_train, y_train)
# Mean accuracy on the held-out split (shown as the cell output).
brf.score(X_test, y_test)

0.9947192683192393

In [20]:
# Per-class precision / recall / F1 for the class-balanced forest on the held-out split.
brf_predictions = brf.predict(X_test)
brf_report = classification_report(y_true=y_test, y_pred=brf_predictions)
print(brf_report)

              precision    recall  f1-score   support

         0.0       0.99      1.00      1.00     54790
         1.0       1.00      0.08      0.15       316

    accuracy                           0.99     55106
   macro avg       1.00      0.54      0.57     55106
weighted avg       0.99      0.99      0.99     55106



In [45]:
# Map each feature name to its importance in the class-balanced random forest.
brf_feature_importance = dict(zip(feature_names, brf.feature_importances_))
# Cell output: the same mapping ordered from most to least important.
dict(sorted(brf_feature_importance.items(), key=itemgetter(1), reverse=True))

{'land_area': 0.054753621744842024,
 'pct_females_cen_2010': 0.01033304129296016,
 'pct_males_cen_2010': 0.009957144692437919,
 'pct_college_acs_09_13': 0.009120403546170278,
 'pct_mailback_count_cen_2010': 0.008647887184056246,
 'pct_single_unit_acs_09_13': 0.008508420637432065,
 'pct_females_acs_09_13': 0.008205311714530586,
 'pct_males_acs_09_13': 0.007743926536512286,
 'med_house_value_tr_acs_09_13': 0.007702319824757898,
 'pct_pop_65plus_cen_2010': 0.007668892838845825,
 'pct_nh_white_alone_cen_2010': 0.007586018262994815,
 'mail_return_rate_cen_2010': 0.007322187388995538,
 'pct_pop_65plus_acs_09_13': 0.007208743136713535,
 'pct_mlt_u2_9_strc_acs_09_13': 0.007172390876984231,
 'med_house_value_bg_acs_09_13': 0.006959843959113943,
 'pct_hispanic_cen_2010': 0.006926890796919194,
 'pct_pop_18_24_acs_09_13': 0.006877756252085353,
 'pct_vacant_cen_2010': 0.0067917757596097205,
 'pct_pop_45_64_acs_09_13': 0.006776653624140947,
 'pct_pop_45_64_cen_2010': 0.006539328101815171,
 'pct_one_

## Model Summaries

In [30]:
# Print the four stored classification reports one after another so the models
# can be compared in a single output.
# The heading names what is actually printed: these are classification reports
# (precision / recall / F1 per class), not confusion matrices.
separator = '-----------------------------------------------------'
model_reports = [
    ('Decision Tree', clf_report),
    ('Random Forest', rf_report),
    ('Decision Tree with Class Balancing', bclf_report),
    ('Random Forest with Class Balancing', brf_report),
]

print('Classification Reports')
print(separator)
for model_label, report_text in model_reports:
    print(model_label)
    print(report_text)
    print(separator)

Confusion Matrices
-----------------------------------------------------
Decision Tree
              precision    recall  f1-score   support

         0.0       0.99      0.99      0.99     54790
         1.0       0.06      0.09      0.07       316

    accuracy                           0.99     55106
   macro avg       0.53      0.54      0.53     55106
weighted avg       0.99      0.99      0.99     55106

-----------------------------------------------------
Random Forest
              precision    recall  f1-score   support

         0.0       0.99      1.00      1.00     54790
         1.0       1.00      0.08      0.15       316

    accuracy                           0.99     55106
   macro avg       1.00      0.54      0.57     55106
weighted avg       0.99      0.99      0.99     55106

-----------------------------------------------------
Decision Tree with Class Balancing
              precision    recall  f1-score   support

         0.0       0.99      0.99      0.99    

## Reduce Features

In [68]:
# How many of the least-important features to discard.
K = 20
# Rank the decision tree's features by ascending importance and keep the first K.
ranked_ascending = sorted(clf_feature_importance.items(), key=itemgetter(1))
clf_bottom_features = dict(ranked_ascending[:K])
# Just the feature names, in the same order.
bottom_list = list(clf_bottom_features)

In [69]:
# Extend the original exclusion list with the K least-important features and
# build a reduced feature frame from the full dataset.
new_exclusion_list = exclusion_list + bottom_list

# `drop(columns=...)` returns a new DataFrame and leaves `data_df` untouched,
# so no explicit copy or in-place mutation is needed.
new_feature_df = data_df.drop(columns=new_exclusion_list)
# NOTE: this rebinds `feature_names` to the reduced column set. Re-running the
# earlier feature-importance cells after this point would pair importances with
# the wrong names; restart the kernel and run top to bottom instead.
feature_names = new_feature_df.columns

# `show_counts` replaces the `null_counts` keyword, which was deprecated in
# pandas 1.2 and removed in pandas 2.0.
new_feature_df.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 220423 entries, 0 to 220422
Data columns (total 197 columns):
 #   Column                            Non-Null Count   Dtype  
---  ------                            --------------   -----  
 0   land_area                         220423 non-null  float64
 1   urbanized_area_pop_cen_2010       220423 non-null  float64
 2   urban_cluster_pop_cen_2010        220423 non-null  float64
 3   rural_pop_cen_2010                220423 non-null  float64
 4   tot_population_cen_2010           220423 non-null  float64
 5   tot_population_acs_09_13          220423 non-null  float64
 6   females_cen_2010                  220423 non-null  float64
 7   females_acs_09_13                 220423 non-null  float64
 8   pop_under_5_acs_09_13             220423 non-null  float64
 9   pop_5_17_cen_2010                 220423 non-null  float64
 10  pop_5_17_acs_09_13                220423 non-null  float64
 11  pop_18_24_cen_2010                220423 non-null  

In [70]:
# Re-split using the reduced feature set, with the same seed as the first split.
# NOTE: this rebinds X_train / X_test / y_train / y_test, so the earlier model
# cells must not be re-run after this one without restarting the kernel.
X_train, X_test, y_train, y_test = train_test_split(new_feature_df, target, random_state=42)

# Same default decision tree as the baseline, fitted on the reduced features.
# random_state pins the estimator's internal randomness so the comparison with
# the baseline is repeatable.
clf_2 = tree.DecisionTreeClassifier(random_state=42)
clf_2 = clf_2.fit(X_train, y_train)

# The classification report already includes accuracy, so no separate
# score() call is made (its result was previously computed and discarded).
clf_2_predictions = clf_2.predict(X_test)
clf_2_report = classification_report(y_test, clf_2_predictions)
print(clf_2_report)

              precision    recall  f1-score   support

         0.0       0.99      0.99      0.99     54790
         1.0       0.05      0.09      0.07       316

    accuracy                           0.99     55106
   macro avg       0.52      0.54      0.53     55106
weighted avg       0.99      0.99      0.99     55106

