# Tree Models for Project 3

## Setup

### Import Libraries

In [1]:
import pandas as pd
import os

from sklearn.model_selection import train_test_split

from sklearn import tree
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report

from operator import itemgetter

### Set File Locations

In [2]:
# File locations used by this notebook.
# Note that some of the raw data files are very large;
# these very large files are located in a gitignored directory.

# Relative path of the cleaned, merged data CSV. It is the only file this
# notebook reads (via pd.read_csv in the "Import Data" cell).
merged_data_csv = "../00_Data/cleaned_data/cleaned_merged_data.csv"

## Import Data

In [3]:
# Load the cleaned, merged data set from the path configured above.
data_df = pd.read_csv(merged_data_csv)

# Print every column with its non-null count and dtype.
# `show_counts` is the current name of this option: the older `null_counts`
# keyword was deprecated in pandas 1.2 and removed in pandas 2.0, where passing
# it raises a TypeError.
data_df.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 220423 entries, 0 to 220422
Data columns (total 240 columns):
 #   Column                            Non-Null Count   Dtype  
---  ------                            --------------   -----  
 0   fips_block_group                  220423 non-null  int64  
 1   state                             220423 non-null  float64
 2   state_name                        220423 non-null  object 
 3   county                            220423 non-null  float64
 4   county_name                       220423 non-null  object 
 5   tract                             220423 non-null  float64
 6   block_group                       220423 non-null  float64
 7   flag                              220423 non-null  float64
 8   land_area                         220423 non-null  float64
 9   aian_land                         220423 non-null  float64
 10  urbanized_area_pop_cen_2010       220423 non-null  float64
 11  urban_cluster_pop_cen_2010        220423 non-null  

## Prep Data

In [4]:
# Prepare the target
# `has_superfund` is the label column the models are trained to predict; the
# classification reports further down show it takes the values 0.0 and 1.0.
target = data_df["has_superfund"]
# Human-readable names for the two classes.
# NOTE(review): presumably ordered as [label 0.0, label 1.0]. This list is not
# passed to any call in the cells visible here — confirm it is still needed.
target_names = ["negative", "positive"]

In [5]:
# Prepare the features
# Drop all the columns that came in from the site data. This prevents 'trailing indicators' from getting into the model.
# Also drop any column that shouldn't mathematically matter, such as FIPS, tract, etc.
exclusion_list = [
    # geographic identifiers
    'fips_block_group',
    'state',
    'state_name',
    'county',
    'county_name',
    'tract',
    'block_group',
    # the target itself must never be a feature
    'has_superfund',
    # remaining entries: presumably the columns merged in from the site data
    'fips_full',
    'address',
    'city',
    'date_added',
    'federal_facility_ind',
    'federal_register_url',
    'geocode_source',
    'latitude',
    'longitude',
    'site_epa_id',
    'site_name',
    'site_narrative_url',
    'site_progress_url',
    'site_score',
    'site_text',
]

# DataFrame.drop returns a new frame, so data_df is left untouched and no
# explicit copy / inplace mutation is needed. Column labels are passed directly
# instead of a sub-DataFrame (which pandas would have to materialise first and
# then iterate just to recover the same labels). A label missing from data_df
# still raises KeyError, as before.
feature_df = data_df.drop(columns=exclusion_list)
feature_names = feature_df.columns

# `show_counts` replaces the `null_counts` keyword that pandas removed in 2.0.
feature_df.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 220423 entries, 0 to 220422
Data columns (total 217 columns):
 #   Column                            Non-Null Count   Dtype  
---  ------                            --------------   -----  
 0   flag                              220423 non-null  float64
 1   land_area                         220423 non-null  float64
 2   aian_land                         220423 non-null  float64
 3   urbanized_area_pop_cen_2010       220423 non-null  float64
 4   urban_cluster_pop_cen_2010        220423 non-null  float64
 5   rural_pop_cen_2010                220423 non-null  float64
 6   tot_population_cen_2010           220423 non-null  float64
 7   tot_population_acs_09_13          220423 non-null  float64
 8   males_cen_2010                    220423 non-null  float64
 9   males_acs_09_13                   220423 non-null  float64
 10  females_cen_2010                  220423 non-null  float64
 11  females_acs_09_13                 220423 non-null  

### train/test split

In [6]:
# Hold out 25% of the rows for evaluation. 0.25 is train_test_split's default
# test size, written out here so the proportion is visible; the fixed seed makes
# the split repeatable.
# NOTE(review): the split is not stratified. The reports below show only 316
# positives among 55,106 test rows, so consider stratifying on `target`.
X_train, X_test, y_train, y_test = train_test_split(
    feature_df,
    target,
    test_size=0.25,
    random_state=42,
)

## Build Models

### Tree Classifier

In [7]:
%%time
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)

Wall time: 56.8 s


In [8]:
# Score the decision tree on the held-out rows.
# The result is a classification report (per-class precision, recall, F1 and
# support) rendered as text; it is stored so it can be printed again in the
# model-comparison cell.
clf_predictions = clf.predict(X_test)
clf_report = classification_report(y_true=y_test, y_pred=clf_predictions)
print(clf_report)

              precision    recall  f1-score   support

         0.0       0.99      0.99      0.99     54790
         1.0       0.05      0.09      0.07       316

    accuracy                           0.99     55106
   macro avg       0.52      0.54      0.53     55106
weighted avg       0.99      0.99      0.99     55106



In [9]:
# Map each feature name to its importance in the fitted decision tree.
# The names come from X_train (the frame `clf` was fitted on) rather than from
# the notebook-global `feature_names`: a later cell rebinds `feature_names` to
# the reduced column set, and re-running this cell after that would silently
# pair the importances with the wrong names.
assert len(X_train.columns) == len(clf.feature_importances_)
clf_feature_importance = dict(zip(X_train.columns, clf.feature_importances_))
# The dictionary is left unsorted. To view the ranking, most important first:
# sorted(clf_feature_importance.items(), key=itemgetter(1), reverse=True)

### Random Forest Classifier

In [10]:
%%time
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train, y_train)

Wall time: 6min 42s


In [11]:
# Score the random forest on the held-out rows and keep the text report
# (per-class precision, recall, F1 and support) for the comparison cell.
rf_predictions = rf.predict(X_test)
rf_report = classification_report(y_true=y_test, y_pred=rf_predictions)
print(rf_report)

              precision    recall  f1-score   support

         0.0       0.99      1.00      1.00     54790
         1.0       1.00      0.08      0.15       316

    accuracy                           0.99     55106
   macro avg       1.00      0.54      0.57     55106
weighted avg       0.99      0.99      0.99     55106



In [12]:
# Map each feature name to its importance in the fitted random forest.
# The names come from X_train (the frame `rf` was fitted on) rather than from
# the notebook-global `feature_names`: a later cell rebinds `feature_names` to
# the reduced column set, and re-running this cell after that would silently
# pair the importances with the wrong names.
assert len(X_train.columns) == len(rf.feature_importances_)
rf_feature_importance = dict(zip(X_train.columns, rf.feature_importances_))
# The dictionary is left unsorted. To view the ranking, most important first:
# sorted(rf_feature_importance.items(), key=itemgetter(1), reverse=True)

### Balanced Tree Classifier

In [13]:
%%time
bclf = tree.DecisionTreeClassifier(class_weight='balanced')
bclf = bclf.fit(X_train, y_train)

Wall time: 30.5 s


In [14]:
# Score the class-balanced decision tree on the held-out rows and keep the text
# report (per-class precision, recall, F1 and support) for the comparison cell.
bclf_predictions = bclf.predict(X_test)
bclf_report = classification_report(y_true=y_test, y_pred=bclf_predictions)
print(bclf_report)

              precision    recall  f1-score   support

         0.0       0.99      0.99      0.99     54790
         1.0       0.06      0.09      0.07       316

    accuracy                           0.99     55106
   macro avg       0.53      0.54      0.53     55106
weighted avg       0.99      0.99      0.99     55106



In [15]:
# Map each feature name to its importance in the class-balanced decision tree.
# The names come from X_train (the frame `bclf` was fitted on) rather than from
# the notebook-global `feature_names`: a later cell rebinds `feature_names` to
# the reduced column set, and re-running this cell after that would silently
# pair the importances with the wrong names.
assert len(X_train.columns) == len(bclf.feature_importances_)
bclf_feature_importance = dict(zip(X_train.columns, bclf.feature_importances_))
# The dictionary is left unsorted. To view the ranking, most important first:
# sorted(bclf_feature_importance.items(), key=itemgetter(1), reverse=True)

### Balanced Random Forest Classifier

In [16]:
%%time
brf = RandomForestClassifier(n_estimators=200, class_weight='balanced')
brf = brf.fit(X_train, y_train)

Wall time: 3min 58s


In [17]:
# Score the class-balanced random forest on the held-out rows and keep the text
# report (per-class precision, recall, F1 and support) for the comparison cell.
brf_predictions = brf.predict(X_test)
brf_report = classification_report(y_true=y_test, y_pred=brf_predictions)
print(brf_report)

              precision    recall  f1-score   support

         0.0       0.99      1.00      1.00     54790
         1.0       1.00      0.08      0.15       316

    accuracy                           0.99     55106
   macro avg       1.00      0.54      0.57     55106
weighted avg       0.99      0.99      0.99     55106



In [18]:
# Map each feature name to its importance in the class-balanced random forest.
# This dictionary is what the feature-reduction cells rank to pick the least
# important features.
# The names come from X_train (the frame `brf` was fitted on) rather than from
# the notebook-global `feature_names`: a later cell rebinds `feature_names` to
# the reduced column set, and re-running this cell after that would silently
# pair the importances with the wrong names.
assert len(X_train.columns) == len(brf.feature_importances_)
brf_feature_importance = dict(zip(X_train.columns, brf.feature_importances_))
# The dictionary is left unsorted. To view the ranking, most important first:
# sorted(brf_feature_importance.items(), key=itemgetter(1), reverse=True)

## Evaluate Models

### Classification Reports

In [19]:
# Print the four stored reports one after another so the models can be compared.
# The banner reads "Classification Reports" because the stored strings are the
# output of classification_report(), not confusion matrices.
separator = '-----------------------------------------------------'

print('Classification Reports')
print(separator)
for title, report in (
    ('Decision Tree', clf_report),
    ('Random Forest', rf_report),
    ('Decision Tree with Class Balancing', bclf_report),
    ('Random Forest with Class Balancing', brf_report),
):
    print(title)
    print(report)
    print(separator)

Confusion Matrices
-----------------------------------------------------
Decision Tree
              precision    recall  f1-score   support

         0.0       0.99      0.99      0.99     54790
         1.0       0.05      0.09      0.07       316

    accuracy                           0.99     55106
   macro avg       0.52      0.54      0.53     55106
weighted avg       0.99      0.99      0.99     55106

-----------------------------------------------------
Random Forest
              precision    recall  f1-score   support

         0.0       0.99      1.00      1.00     54790
         1.0       1.00      0.08      0.15       316

    accuracy                           0.99     55106
   macro avg       1.00      0.54      0.57     55106
weighted avg       0.99      0.99      0.99     55106

-----------------------------------------------------
Decision Tree with Class Balancing
              precision    recall  f1-score   support

         0.0       0.99      0.99      0.99    

### ROC-AUC Score

In [20]:
from sklearn.metrics import roc_auc_score

# Class-probability estimates on the test rows for every fitted model.
# predict_proba returns one column per class; the ROC-AUC cell below scores
# column 1 of each array.

# single trees
clf_prob_predictions = clf.predict_proba(X=X_test)
bclf_prob_predictions = bclf.predict_proba(X=X_test)

# forests
rf_prob_predictions = rf.predict_proba(X=X_test)
brf_prob_predictions = brf.predict_proba(X=X_test)

In [21]:
# Report ROC-AUC for each model, computed from column 1 of its predict_proba
# output (the probability assigned to the second class, 1.0).
for model_label, class_probabilities in (
    ("Decision Tree", clf_prob_predictions),
    ("Balanced Decision Tree", bclf_prob_predictions),
    ("Random Forest", rf_prob_predictions),
    ("Balanced Random Forest", brf_prob_predictions),
):
    auc = roc_auc_score(y_test, class_probabilities[:, 1])
    print(f"{model_label} ROC-AUC score: {auc}")

Decision Tree ROC-AUC score: 0.539677040760926
Balanced Decision Tree ROC-AUC score: 0.5402884662035251
Random Forest ROC-AUC score: 0.6331686751024048
Balanced Random Forest ROC-AUC score: 0.663080293918552


## Reduce Features and Iterate

In [28]:
# Pick the K least important features (by the balanced random forest's
# importances) so they can be excluded from the next model.
# NOTE(review): K = 28 leaves 189 of the 217 features; presumably chosen from
# the 189-feature entry of the sweep's score log further down — confirm.
K = 28
ranked_ascending = sorted(brf_feature_importance.items(), key=itemgetter(1))
bottom_features = dict(ranked_ascending[:K])
# just the names, in ascending order of importance
bottom_list = list(bottom_features)

In [29]:
# Extend the original exclusion list with the least important features and
# build the reduced feature frame from it.
new_exclusion_list = exclusion_list + bottom_list

# DataFrame.drop returns a new frame, so data_df is left untouched and no
# explicit copy / inplace mutation is needed. Column labels are passed directly
# rather than a sub-DataFrame that pandas would have to build and then iterate
# just to recover the same labels.
new_feature_df = data_df.drop(columns=new_exclusion_list)
# NOTE: this rebinds the notebook-wide `feature_names` to the reduced column
# set. Any earlier cell that reads `feature_names` will see these names if it is
# re-run after this one.
feature_names = new_feature_df.columns

# `show_counts` replaces the `null_counts` keyword that pandas removed in 2.0.
new_feature_df.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 220423 entries, 0 to 220422
Data columns (total 189 columns):
 #   Column                            Non-Null Count   Dtype  
---  ------                            --------------   -----  
 0   land_area                         220423 non-null  float64
 1   urbanized_area_pop_cen_2010       220423 non-null  float64
 2   urban_cluster_pop_cen_2010        220423 non-null  float64
 3   rural_pop_cen_2010                220423 non-null  float64
 4   tot_population_cen_2010           220423 non-null  float64
 5   tot_population_acs_09_13          220423 non-null  float64
 6   males_cen_2010                    220423 non-null  float64
 7   males_acs_09_13                   220423 non-null  float64
 8   females_cen_2010                  220423 non-null  float64
 9   females_acs_09_13                 220423 non-null  float64
 10  pop_under_5_cen_2010              220423 non-null  float64
 11  pop_under_5_acs_09_13             220423 non-null  

In [31]:
%%time
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(new_feature_df, target, random_state=42)

brf_2 = RandomForestClassifier(n_estimators=200, class_weight='balanced')
brf_2 = brf_2.fit(X_train_2, y_train_2)

brf_2_predictions = brf_2.predict(X_test_2)
brf_2_prob_predictions = brf_2.predict_proba(X_test_2)


print(classification_report(y_test_2, brf_2_predictions))

print(f"ROC-AUC score: {roc_auc_score(y_test_2, brf_2_prob_predictions[:,1])}")

              precision    recall  f1-score   support

         0.0       0.99      1.00      1.00     54790
         1.0       1.00      0.08      0.15       316

    accuracy                           0.99     55106
   macro avg       1.00      0.54      0.57     55106
weighted avg       0.99      0.99      0.99     55106

ROC-AUC score: 0.6849853641406429
Wall time: 4min 2s


In [25]:
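# NOTE: this feature-elimination sweep is kept commented out. Every iteration
# retrains a 200-tree forest, and the saved output below this cell comes from an
# earlier run that ended in a KeyboardInterrupt. If you uncomment it, `%%time`
# must be the first line of the cell.
# %%time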
# best_features = 0
# best_roc_auc_score = 0
# score_log = {}

# for i in range(0, len(feature_df.columns)-1):
    
#     print('-----------------------------------------------------')
#     print(f"Top {len(feature_df.columns)-i} features")
    
#     # trim off a bunch of features
#     K = i
#     bottom_features = dict(sorted(brf_feature_importance.items(), key = itemgetter(1))[:K])
    
#     # make a list of the feature names
#     bottom_list = list(bottom_features.keys())
#     new_exclusion_list = exclusion_list + bottom_list

#     # create the new feature list
#     new_feature_df = data_df.copy()
#     new_feature_df.drop(new_feature_df[new_exclusion_list],axis=1,inplace=True)
#     feature_names = new_feature_df.columns

#     # create the train/test split
#     X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(new_feature_df, target, random_state=42)

#     # create and fit the model
#     brf_2 = RandomForestClassifier(n_estimators=200, class_weight='balanced')
#     brf_2 = brf_2.fit(X_train_2, y_train_2)

#     # run some predictions
#     brf_2_predictions = brf_2.predict(X_test_2)
#     brf_2_prob_predictions = brf_2.predict_proba(X_test_2)
    
#     new_score = roc_auc_score(y_test_2, brf_2_prob_predictions[:,1])
    
#     score_log[len(feature_df.columns)-i] = new_score
    
#     if new_score > best_roc_auc_score:
#         best_roc_auc_score = new_score
#         best_features = len(feature_df.columns)-i
        
#     # print results
#     print(classification_report(y_test_2, brf_2_predictions))
#     print(f"ROC-AUC score: {new_score}")
#     print('-----------------------------------------------------')

-----------------------------------------------------
Top 217 features
              precision    recall  f1-score   support

         0.0       0.99      1.00      1.00     54790
         1.0       1.00      0.08      0.15       316

    accuracy                           0.99     55106
   macro avg       1.00      0.54      0.57     55106
weighted avg       0.99      0.99      0.99     55106

ROC-AUC score: 0.682662311333723
-----------------------------------------------------
-----------------------------------------------------
Top 216 features
              precision    recall  f1-score   support

         0.0       0.99      1.00      1.00     54790
         1.0       1.00      0.08      0.15       316

    accuracy                           0.99     55106
   macro avg       1.00      0.54      0.57     55106
weighted avg       0.99      0.99      0.99     55106

ROC-AUC score: 0.6945982185144197
-----------------------------------------------------
-----------------------------

              precision    recall  f1-score   support

         0.0       0.99      1.00      1.00     54790
         1.0       1.00      0.08      0.15       316

    accuracy                           0.99     55106
   macro avg       1.00      0.54      0.57     55106
weighted avg       0.99      0.99      0.99     55106

ROC-AUC score: 0.6618069048449661
-----------------------------------------------------
-----------------------------------------------------
Top 199 features
              precision    recall  f1-score   support

         0.0       0.99      1.00      1.00     54790
         1.0       1.00      0.08      0.15       316

    accuracy                           0.99     55106
   macro avg       1.00      0.54      0.57     55106
weighted avg       0.99      0.99      0.99     55106

ROC-AUC score: 0.6804974574959397
-----------------------------------------------------
-----------------------------------------------------
Top 198 features
              precision    r

              precision    recall  f1-score   support

         0.0       0.99      1.00      1.00     54790
         1.0       1.00      0.08      0.15       316

    accuracy                           0.99     55106
   macro avg       1.00      0.54      0.57     55106
weighted avg       0.99      0.99      0.99     55106

ROC-AUC score: 0.6448241964139256
-----------------------------------------------------
-----------------------------------------------------
Top 182 features
              precision    recall  f1-score   support

         0.0       0.99      1.00      1.00     54790
         1.0       1.00      0.08      0.15       316

    accuracy                           0.99     55106
   macro avg       1.00      0.54      0.57     55106
weighted avg       0.99      0.99      0.99     55106

ROC-AUC score: 0.668192939208624
-----------------------------------------------------
-----------------------------------------------------
Top 181 features
              precision    re

KeyboardInterrupt: 

In [26]:
# print(f"Best results was a ROC-AUC score of {best_roc_auc_score} for the top {best_features} features.")

Best results was a ROC-AUC score of 0.6945982185144197 for the top 216 features.


In [27]:
# print(score_log)

{217: 0.682662311333723, 216: 0.6945982185144197, 215: 0.6587708303973052, 214: 0.6694884495692414, 213: 0.6755307087360024, 212: 0.6710464985987927, 211: 0.6719152067387332, 210: 0.6746527304483634, 209: 0.6711177430049371, 208: 0.6699417338006335, 207: 0.6640255601941591, 206: 0.6709021037748273, 205: 0.6642787998364295, 204: 0.6670819076751047, 203: 0.6796729919300621, 202: 0.666171642704827, 201: 0.6849155925617028, 200: 0.6618069048449661, 199: 0.6804974574959397, 198: 0.669501473982363, 197: 0.6513311181242073, 196: 0.6684626976187562, 195: 0.6668808234432506, 194: 0.6616671595343324, 193: 0.6881074690244223, 192: 0.6810843069394998, 191: 0.6749669624642768, 190: 0.6651509445731805, 189: 0.6913636300627712, 188: 0.6695431463285594, 187: 0.6665783162870431, 186: 0.6754973535316664, 185: 0.6754638828114712, 184: 0.6525807398097685, 183: 0.6448241964139256, 182: 0.668192939208624, 181: 0.656886622339381, 180: 0.6605865375507404, 179: 0.6566199539784817, 178: 0.6689709385201494, 177: