In [1]:
import os
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier
from sklearn.metrics import precision_recall_curve, roc_curve, roc_auc_score

from sklearn.model_selection import cross_validate, KFold

In [3]:
# Seed NumPy's global random number generator with a fixed value
np.random.seed(0)

## Assessment of the initial RandomForestClassifier

In [4]:
def classifier_assessment(X, y):
    """
    Compare a RandomForestClassifier against a dummy baseline on one dataset.

    args: X dataset of features (must expose `.columns`, e.g. a DataFrame;
            the column names are used to label feature importances)
          y target set of values for classification

    returns a 4-tuple:
        - the score of a dummy classifier using "most_frequent" value
          assignment, fit and scored on the full dataset
        - the mean of the RandomForestClassifier test scores taken from a
          5 fold cross validation on the dataset
        - a dataframe with one column per cross-validation estimator that
          lists (up to) the 10 most important features used by that
          classifier, most important first
        - a function that, when called, fits the classifier on a train/test
          split and plots its precision-recall and ROC curves
    """

    # A single classifier definition serves both uses below: cross_validate
    # fits clones of it, while the plotting helper fits this instance itself.
    clf_rf = RandomForestClassifier(max_depth=4, random_state=0)

    def performance_graphics(X=X, y=y, clf_rf=clf_rf):
        """
        Fit clf_rf on a train/test split of X, y and plot the precision-recall
        and ROC curves for the held-out part, marking on each curve the point
        whose threshold is closest to 0.5.
        """
        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
        clf_rf.fit(X_train, y_train)

        # scores are the predicted probabilities of the second class
        rf_proba = clf_rf.predict_proba(X_test)[:, 1]

        precision, recall, thresholds = precision_recall_curve(y_test, rf_proba)
        fp, tp, thresholds_roc = roc_curve(y_test, rf_proba)

        auc_score = np.round(roc_auc_score(y_test, rf_proba), 4)

        # The scores are probabilities, so the default decision threshold is
        # 0.5 (not 0, which applies to decision_function outputs).  Mark that
        # same operating point on both curves.
        close_default = np.argmin(np.abs(thresholds - 0.5))
        close_default_roc = np.argmin(np.abs(thresholds_roc - 0.5))

        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 4))

        ax1.plot(precision, recall, label="Precision Recall Curve")
        ax1.plot(precision[close_default],
                 recall[close_default], 'o',
                 c='r', markersize=10,
                 label='threshold 0.5',
                 fillstyle="none", mew=2)
        ax1.set_title("RF performance")
        ax1.set_xlabel("Precision")
        ax1.set_ylabel("Recall")
        ax1.legend(loc='best')

        ax2.plot(fp, tp, label="ROC curve")
        ax2.plot(fp[close_default_roc],
                 tp[close_default_roc], 'o',
                 c='r', markersize=10,
                 label='threshold 0.5',
                 fillstyle="none", mew=2)
        ax2.set_title(f"ROC performance: AUC Score {auc_score}")
        ax2.set_xlabel("False Positive Rate")
        ax2.set_ylabel("True Positive (Recall)")
        ax2.legend(loc='best')

        plt.show()

    # 5 fold cross validation; keep the fitted estimators so their feature
    # importances can be inspected afterwards
    kfold = KFold(n_splits=5)
    cross_val = cross_validate(clf_rf, X, y, cv=kfold, return_estimator=True)
    mean_score = cross_val['test_score'].mean()

    # For every fold's estimator, rank the feature names by importance
    # (descending) and keep the first 10
    cols = X.columns
    ranked_features = {}
    for clf_no, clf in enumerate(cross_val['estimator'], start=1):
        by_importance = sorted(zip(cols, clf.feature_importances_),
                               key=lambda pair: pair[1], reverse=True)
        ranked_features[f"Estimator: {clf_no}"] = [name for name, _ in by_importance[:10]]

    # Baseline: always predict the most frequent class
    dummy_clf = DummyClassifier(strategy="most_frequent")
    dummy_clf.fit(X, y)
    d_score = dummy_clf.score(X, y)

    df = pd.DataFrame(ranked_features)

    # the plotting helper is returned uncalled so the caller decides when to draw
    graphic = performance_graphics

    return d_score, mean_score, df, graphic



# Compare Datasets

Leave the cells below commented out until the rest of the notebook has run and placed these values in scope. Then uncomment them to view the comparisons.

In [5]:
# # # # # # # # The results of the combinations: 

# obj_ = [('testing dataset', dummy_single,  single_score, (83, 2)), 
#        ('testing and acs datasets', dummy_double, score_two_datasets, (83, 119)),\
#        ]

# condition = [c[0] for c in obj_]
# dummy_results = [d[1] for d in obj_]
# results = [r[2] for r in obj_]
# shape = [s[3] for s in obj_]

# pd.DataFrame({'dummy classifier': dummy_results, 'cross val score': results, 'data set shape': shape}, index=condition)

In [6]:
# df_single_set_output
# df_two_datasets_output


In [7]:
# graphic_single()
# graphic_two()


## Initial testing dataset of blood lead levels

In [8]:
# Load the blood lead level dataset from the local data directory.
# dataset prepared with this notebook: 
# https://github.com/Cameron-Grams/ComparisonCollection/blob/main/ebll_exploration.ipynb

df_ebll = pd.read_csv('./data/ebll_classifier_2014.csv')

In [9]:
# standardize the county names

# Keep only the first 83 rows (presumably one per county -- TODO confirm what
# the remaining rows hold).  .copy() makes the result an independent frame so
# that adding the 'county' column below does not write into a slice of the
# loaded frame (which raises pandas' SettingWithCopyWarning).
df_ebll = df_ebll.iloc[:83, :].copy()

# Drop the last word of each county name and lowercase what is left, so the
# names can be used as a join key.
df_ebll['county'] = df_ebll['County Name'].apply(
    lambda name: ' '.join(name.split()[:-1]).lower()
)
# df_ebll.tail(30)

In [10]:
# Index by the standardized county name, then discard the first two of the
# remaining columns.
df_ebll = df_ebll.set_index('county').iloc[:, 2:]

# The last column is the classification target; every column before it is a
# feature.  The features get 'county' back as a regular column.
target = df_ebll.iloc[:, -1]
starting_df = df_ebll.iloc[:, :-1].reset_index()

In [11]:
# starting_df.head()

### Assess single dataset

In [12]:
# Features for the single-dataset run, indexed by county
df_single_set = starting_df.set_index('county')
X, y = df_single_set, target
X.shape

(83, 2)

In [13]:
# Baseline score, cross-validated score, ranked features and plotting helper
# for the single dataset
(dummy_single,
 single_score,
 df_single_set_output,
 graphic_single) = classifier_assessment(X, y)

dummy_single, single_score

(0.7349397590361446, 0.8433823529411765)

In [14]:
# Top-ranked features of each cross-validation estimator (single dataset)
df_single_set_output

Unnamed: 0,Estimator: 1,Estimator: 2,Estimator: 3,Estimator: 4,Estimator: 5
0,Number of Children Tested < 72 Months of Age,Number of Children Tested < 72 Months of Age,Number of Children Tested < 72 Months of Age,Number of Children Tested < 72 Months of Age,Number of Children Tested < 72 Months of Age
1,Total Population of Children < 72 Months of Age,Total Population of Children < 72 Months of Age,Total Population of Children < 72 Months of Age,Total Population of Children < 72 Months of Age,Total Population of Children < 72 Months of Age


In [15]:
# graphic_single()

## Add American Community Survey dataset

In [31]:
# Load the cleaned American Community Survey table from the local data directory.
# dataset prepared with this notebook:
# https://github.com/Cameron-Grams/ComparisonCollection/blob/main/acs_2014.ipynb

df_acs = pd.read_csv('./data/cleaned_2014_acs.csv')

In [17]:
# Preview the first rows of the ACS table as loaded
df_acs.head()

Unnamed: 0,County Name,Population 16 years and over,In labor force,Civilian labor force,Employed,Unemployed,Armed Forces,Not in labor force,Civilian labor force.1,Females 16 years and over,...,Unemployed:,With health insurance coverage.2,With private health insurance.2,With public coverage.2,No health insurance coverage.3,Not in labor force:,With health insurance coverage.3,With private health insurance.3,With public coverage.3,No health insurance coverage.4
0,Alcona County,9424,3796,3796,3294,502,0,5628,3796,4668,...,478,290,177,128,188,2168,1849,1168,988,319
1,Alger County,8221,3602,3602,3175,427,0,4619,3602,3618,...,394,226,110,118,168,1589,1364,992,548,225
2,Allegan County,87238,54617,54614,50406,4208,3,32621,54614,44131,...,3868,2105,1432,761,1763,15925,13143,8706,5896,2782
3,Alpena County,24077,13731,13659,12227,1432,72,10346,13659,12369,...,1333,736,351,425,597,4421,3890,2051,2344,531
4,Antrim County,19340,10762,10759,9462,1297,3,8578,10759,9760,...,1148,583,388,208,565,3190,2716,1886,1103,474


In [18]:
# format county names
# Strip the last word from each name, lowercase the remainder, and leave out
# the final row of the table.
county = [
    ' '.join(name.split()[:-1]).lower()
    for name in df_acs['County Name']
][:-1]

In [19]:
# need to replace the strings with integers
# get columns needed for conversion 
# Everything after the first column holds the values to convert
df_acs_1 = df_acs.iloc[:, 1:]
cols = df_acs_1.columns

In [20]:
# df contains a mixed collection of strings and numbers;
# pull the cell values out as a 2-D array so they can be converted row by row
string_values = df_acs_1.values

In [21]:
# Convert every cell to a number.  Values that are already numeric pass
# through unchanged (isinstance also accepts NumPy scalar types, which an exact
# type(x) == int check would reject); strings such as "1,234" have their
# thousands separators removed before being parsed as integers.
new_vals = [
    [
        x if isinstance(x, (int, float, np.integer, np.floating))
        else int(x.replace(',', ''))
        for x in arr
    ]
    for arr in string_values
]

In [22]:
# last sample is the summary for Michigan as a state
# The array is built from all rows first and the last row is dropped afterwards.
number_array = np.array(new_vals)[:-1]

In [23]:
# Rebuild a DataFrame from the converted values under the original column names
df_acs_2 = pd.DataFrame(number_array, columns=cols)

In [24]:
# Attach the standardized county names by position; both `county` and the
# value array had their last row dropped, so the lengths line up.
df_acs_2['county'] = county
df_acs_2.head()

Unnamed: 0,Population 16 years and over,In labor force,Civilian labor force,Employed,Unemployed,Armed Forces,Not in labor force,Civilian labor force.1,Females 16 years and over,In labor force.1,...,With health insurance coverage.2,With private health insurance.2,With public coverage.2,No health insurance coverage.3,Not in labor force:,With health insurance coverage.3,With private health insurance.3,With public coverage.3,No health insurance coverage.4,county
0,9424.0,3796.0,3796.0,3294.0,502.0,0.0,5628.0,3796.0,4668.0,1830.0,...,290.0,177.0,128.0,188.0,2168.0,1849.0,1168.0,988.0,319.0,alcona
1,8221.0,3602.0,3602.0,3175.0,427.0,0.0,4619.0,3602.0,3618.0,1720.0,...,226.0,110.0,118.0,168.0,1589.0,1364.0,992.0,548.0,225.0,alger
2,87238.0,54617.0,54614.0,50406.0,4208.0,3.0,32621.0,54614.0,44131.0,25248.0,...,2105.0,1432.0,761.0,1763.0,15925.0,13143.0,8706.0,5896.0,2782.0,allegan
3,24077.0,13731.0,13659.0,12227.0,1432.0,72.0,10346.0,13659.0,12369.0,6583.0,...,736.0,351.0,425.0,597.0,4421.0,3890.0,2051.0,2344.0,531.0,alpena
4,19340.0,10762.0,10759.0,9462.0,1297.0,3.0,8578.0,10759.0,9760.0,5076.0,...,583.0,388.0,208.0,565.0,3190.0,2716.0,1886.0,1103.0,474.0,antrim


In [25]:
# Join the testing features with the ACS features on the shared county key,
# then index the combined frame by county.
df_two_datasets_ = starting_df.merge(df_acs_2, on='county')
df_two_datasets = df_two_datasets_.set_index('county')
df_two_datasets.shape

(83, 119)

In [26]:
# Features now come from the combined frame; the target is unchanged
X, y = df_two_datasets, target

In [27]:
# Baseline score, cross-validated score, ranked features and plotting helper
# for the combined datasets
(dummy_double,
 score_two_datasets,
 df_two_datasets_output,
 graphic_two) = classifier_assessment(X, y)

dummy_double, score_two_datasets

(0.7349397590361446, 0.8911764705882353)

In [28]:
# graphic_two()

In [29]:
# Top-ranked features of each cross-validation estimator (combined datasets)
df_two_datasets_output

Unnamed: 0,Estimator: 1,Estimator: 2,Estimator: 3,Estimator: 4,Estimator: 5
0,Walked,"$10,000 to $14,999","Educational services, and health care and soci...","Transportation and warehousing, and utilities",Walked
1,"Transportation and warehousing, and utilities",Public administration,Government workers,With public coverage.1,With Food Stamp/SNAP benefits in the past 12 m...
2,"Educational services, and health care and soci...","Transportation and warehousing, and utilities",With Food Stamp/SNAP benefits in the past 12 m...,"$10,000 to $14,999.1","$10,000 to $14,999.1"
3,Public administration,With Food Stamp/SNAP benefits in the past 12 m...,With private health insurance.3,With Food Stamp/SNAP benefits in the past 12 m...,With public coverage.1
4,With Supplemental Security Income,"Less than $10,000.1","Transportation and warehousing, and utilities","Educational services, and health care and soci...","Educational services, and health care and soci..."
5,Government workers,"Natural resources, construction, and maintenan...",Walked,"Less than $10,000.1",No health insurance coverage.3
6,Employed.1,"Less than $10,000",Public administration,Walked,"Transportation and warehousing, and utilities"
7,Wholesale trade,No health insurance coverage.2,Civilian employed population 16 years and over.2,Public administration,"Less than $10,000"
8,With Food Stamp/SNAP benefits in the past 12 m...,With cash public assistance income,"$25,000 to $34,999",With Supplemental Security Income,"$15,000 to $24,999"
9,"Less than $10,000.1",Nonfamily households,"Less than $10,000.1","$10,000 to $14,999","Less than $10,000.1"


In [30]:
# Collapse the per-estimator rankings into the set of distinct feature names
two_ds_key_values = df_two_datasets_output.values
two_ds_key_features = set(two_ds_key_values.ravel())
# two_ds_key_features