In [66]:
import warnings
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import Image
from time import sleep
import os
import numpy as np
import sklearn as sk
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.utils import resample
import time

from amb_sdk.sdk import DarwinSdk

In [67]:
# Login
ds = DarwinSdk()
ds.set_url('https://amb-demo-api.sparkcognition.com/v1/')
ds.auth_login_user('vargasbri2@gmail.com', 'qNuNeX5JGY')

(True,
 'Bearer eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE1NTYwNzkxMjcsImlhdCI6MTU1NjA3MTkyNywibmJmIjoxNTU2MDcxOTI3LCJqdGkiOiI5ZDAyZDNjNi0yNjY5LTRlZGQtOWRkZS01MWZlNTYzYzM2MTQiLCJpZGVudGl0eSI6Ijc2NmRiM2JjLTRmMzUtMTFlOS1iMzEwLTc3ZWRkN2EzNzNhOSIsImZyZXNoIjpmYWxzZSwidHlwZSI6ImFjY2VzcyJ9.CtvUqUnAqXJxmlcQ5K_iG7n49H5VcLUY8FE5WnQMHGc')

In [68]:
"""Removing Meas_Rpt, Meas_Exp, CRS_Rpt, CRS_Exp because not a lot of varying data
Removing for too many missing values: High_Blood_Pres (0.515441), Few_Fruit_Veg (0.393824)
Removed state and county names because not numerical"""

df_list = []
labels_col = 'HPSA_Ind'
newdata_path = 'balanced/'
sampled_filename = newdata_path + 'BALANCEDSAMPLEDDATASET.csv'
base_path = 'chsi_dataset/'


def create_new_df(dataset_name, column_names):
    df = pd.read_csv(dataset_name)
    return df[column_names]

#Upscales minority class (in our case HPSA_Ind = 1) to perform class balancing
def upscale(df):
    ones_df = df.loc[df['HPSA_Ind'] == 1]
    twos_df = df.loc[df['HPSA_Ind'] == 2]
    print("Original class ratio: ")
    print(df.HPSA_Ind.value_counts())
    
    df_minority_upsampled = resample(ones_df, replace=True, n_samples=len(twos_df))
    df_upsampled = pd.concat([twos_df, df_minority_upsampled])
    
    return df_upsampled

"""Creates new column to act as unique key to merge on. 
This new column, called State_And_County_FIPS_Code
is formatted as <State_FIPS_Code>-<County_FIPS_Code>.
The original columns for State_FIPS_Code and County_FIPS_Code are then removed."""

def merge_state_and_county_codes(df):
    df["State_And_County_FIPS_Code"] = df["State_FIPS_Code"].map(str) + "-" + df["County_FIPS_Code"].map(str)
    df = df.drop(columns = ['State_FIPS_Code', 'County_FIPS_Code'])
    return df

#Creates aggregated dataset from original six datsets in chsi
def sample_help():
    global df_list
    col_names = ['State_FIPS_Code', 'County_FIPS_Code', 'Population_Size','Population_Density','Poverty','Age_19_Under','Age_19_64','Age_65_84','Age_85_and_Over','White','Black','Native_American','Asian','Hispanic']
    demographics_df = create_new_df(base_path + "DEMOGRAPHICS.csv", col_names)
    df_list.append(demographics_df)

    col_names = ['State_FIPS_Code', 'County_FIPS_Code', 'ALE', 'Health_Status']
    summ_meas_health_df = create_new_df(base_path + "SUMMARYMEASURESOFHEALTH.csv", col_names)
    df_list.append(summ_meas_health_df)

    col_names = ['State_FIPS_Code', 'County_FIPS_Code', 'LBW','VLBW','Premature','Under_18','Over_40','Late_Care','Infant_Mortality','Unmarried','Brst_Cancer','Col_Cancer','CHD','Lung_Cancer','Suicide']
    meas_birth_and_death_df = create_new_df(base_path + "MEASURESOFBIRTHANDDEATH.csv", col_names)
    df_list.append(meas_birth_and_death_df)

    col_names = ['State_FIPS_Code', 'County_FIPS_Code', 'No_HS_Diploma','Unemployed','Sev_Work_Disabled','Major_Depression','Recent_Drug_Use','Ecol_Rpt','Ecol_Exp','Salm_Rpt','Salm_Exp','Shig_Rpt','Shig_Exp']
    vuln_pop_and_env_health_df = create_new_df(base_path + "VUNERABLEPOPSANDENVHEALTH.csv", col_names)
    df_list.append(vuln_pop_and_env_health_df)

    col_names = ['State_FIPS_Code', 'County_FIPS_Code', 'FluB_Rpt','FluB_Exp','HepA_Rpt','HepA_Exp','HepB_Rpt','HepB_Exp','Pert_Rpt','Pert_Exp','Syphilis_Rpt','Syphilis_Exp']
    prev_services_df = create_new_df(base_path + "PREVENTIVESERVICESUSE.csv", col_names)
    df_list.append(prev_services_df)

    col_names = ['State_FIPS_Code', 'County_FIPS_Code', 'No_Exercise','Obesity','Smoker','Diabetes','Uninsured', labels_col]
    risk_factors_df = create_new_df(base_path + "RISKFACTORSANDACCESSTOCARE.csv", col_names)
    df_list.append(risk_factors_df)
    
    #Merges State and County Codes for each df
    for df_idx in range(len(df_list)):
        df_list[df_idx] = merge_state_and_county_codes(df_list[df_idx])

#Creates a sampled dataset on size n from aggregate dataset        
def my_sample(n):
    global df_list
    global sampled_filename
    sample_help()
    df_list = [df.set_index("State_And_County_FIPS_Code") for df in df_list]
    merged_df = df_list[0].join(df_list[1:])
    upscaled_df = upscale(merged_df)
    smaller_sample = upscaled_df.sample(n)
    print("Balanced class ratio:")
    print(smaller_sample.HPSA_Ind.value_counts())
    smaller_sample.head()
    smaller_sample.to_csv(sampled_filename)
    
#This call was to create the sampled dataset. It is commented out for subsequent runs.
# my_sample(2000)

In [69]:
#Read aggregated data into DataFrame
merged_df = pd.read_csv(sampled_filename, index_col='State_And_County_FIPS_Code')

In [70]:
#Replace error values with NaN for imputation handling
merged_df = merged_df.replace([-9999, -9998.9, -2222.2, -2222, -1111.1, -1111, -2, -1], np.nan)
merged_df.head()

Unnamed: 0_level_0,Population_Size,Population_Density,Poverty,Age_19_Under,Age_19_64,Age_65_84,Age_85_and_Over,White,Black,Native_American,...,Pert_Rpt,Pert_Exp,Syphilis_Rpt,Syphilis_Exp,No_Exercise,Obesity,Smoker,Diabetes,Uninsured,HPSA_Ind
State_And_County_FIPS_Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2-282,722,0.0,10.8,22.1,70.2,7.2,0.4,57.8,0.1,40.6,...,0.0,0.0,0.0,0.0,,,,,114.0,1
48-501,7408,9.0,15.5,31.1,57.3,10.3,1.4,96.7,1.8,0.9,...,1.0,2.0,0.0,2.0,,,,,1694.0,1
37-75,8085,28.0,16.9,22.7,59.0,16.1,2.2,91.6,0.5,7.2,...,2.0,2.0,1.0,3.0,,,,7.5,1206.0,1
47-121,11657,60.0,15.9,23.9,63.2,11.8,1.1,97.4,1.8,0.3,...,0.0,1.0,2.0,2.0,,,,2.6,1537.0,1
40-73,14302,16.0,10.5,24.8,60.1,13.1,1.9,93.5,1.6,2.8,...,3.0,4.0,3.0,0.0,29.1,27.0,22.0,5.0,2205.0,2


In [71]:
#Imputation to handle missing values (represented as NaN) in data
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
filled_data = imp.fit_transform(merged_df)
filled_df = pd.DataFrame(filled_data, columns=merged_df.columns)

#Save features and labels separately for feature engineering
df_feats = filled_df.loc[:, filled_df.columns != labels_col]
df_labels = merged_df.iloc[:, merged_df.columns == labels_col]
df_feats.head()

Unnamed: 0,Population_Size,Population_Density,Poverty,Age_19_Under,Age_19_64,Age_65_84,Age_85_and_Over,White,Black,Native_American,...,HepB_Exp,Pert_Rpt,Pert_Exp,Syphilis_Rpt,Syphilis_Exp,No_Exercise,Obesity,Smoker,Diabetes,Uninsured
0,722.0,0.0,10.8,22.1,70.2,7.2,0.4,57.8,0.1,40.6,...,0.0,0.0,0.0,0.0,0.0,27.670359,24.627618,23.11139,7.948778,114.0
1,7408.0,9.0,15.5,31.1,57.3,10.3,1.4,96.7,1.8,0.9,...,2.0,1.0,2.0,0.0,2.0,27.670359,24.627618,23.11139,7.948778,1694.0
2,8085.0,28.0,16.9,22.7,59.0,16.1,2.2,91.6,0.5,7.2,...,2.0,2.0,2.0,1.0,3.0,27.670359,24.627618,23.11139,7.5,1206.0
3,11657.0,60.0,15.9,23.9,63.2,11.8,1.1,97.4,1.8,0.3,...,3.0,0.0,1.0,2.0,2.0,27.670359,24.627618,23.11139,2.6,1537.0
4,14302.0,16.0,10.5,24.8,60.1,13.1,1.9,93.5,1.6,2.8,...,1.0,3.0,4.0,3.0,0.0,29.1,27.0,22.0,5.0,2205.0


In [72]:
#Perform standardization
scaler = sk.preprocessing.StandardScaler(with_mean=0, with_std=1)
standardized_data = scaler.fit_transform(df_feats)
standardized_df = pd.DataFrame(standardized_data, columns=df_feats.columns, index=merged_df.index)
standardized_df.head()

Unnamed: 0_level_0,Population_Size,Population_Density,Poverty,Age_19_Under,Age_19_64,Age_65_84,Age_85_and_Over,White,Black,Native_American,...,HepB_Exp,Pert_Rpt,Pert_Exp,Syphilis_Rpt,Syphilis_Exp,No_Exercise,Obesity,Smoker,Diabetes,Uninsured
State_And_County_FIPS_Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2-282,0.003906,0.0,2.01812,6.259339,19.979299,2.086531,0.407467,3.207656,0.006185,4.246292,...,0.0,0.0,0.0,0.0,0.0,5.057749,5.948761,4.818145,3.143179,0.003907
48-501,0.040077,0.007026,2.896377,8.808391,16.30789,2.984898,1.426133,5.366442,0.111338,0.09413,...,0.122509,0.032344,0.120479,0.0,0.093214,5.057749,5.948761,4.818145,3.143179,0.058055
37-75,0.04374,0.021857,3.157985,6.429276,16.791719,4.665714,2.241067,5.083414,0.030927,0.753037,...,0.122509,0.064688,0.120479,0.035044,0.139821,5.057749,5.948761,4.818145,2.965719,0.041331
47-121,0.063064,0.046837,2.971122,6.76915,17.987061,3.419592,1.120533,5.405289,0.111338,0.031377,...,0.183764,0.0,0.06024,0.070089,0.093214,5.057749,5.948761,4.818145,1.028116,0.052674
40-73,0.077373,0.01249,1.962062,7.024055,17.104785,3.796327,1.935467,5.188856,0.098967,0.292848,...,0.061255,0.097032,0.240959,0.105133,0.0,5.319068,6.521806,4.586448,1.977146,0.075567


In [73]:
#Perform PCA to reduce dimensionality
pca = PCA()

pca_data = pca.fit_transform(standardized_df)
pca_df = pd.DataFrame(pca_data, columns=standardized_df.columns, index=merged_df.index)
print(len(pca_df))
pca_df.head()

2000


Unnamed: 0_level_0,Population_Size,Population_Density,Poverty,Age_19_Under,Age_19_64,Age_65_84,Age_85_and_Over,White,Black,Native_American,...,HepB_Exp,Pert_Rpt,Pert_Exp,Syphilis_Rpt,Syphilis_Exp,No_Exercise,Obesity,Smoker,Diabetes,Uninsured
State_And_County_FIPS_Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2-282,-1.077315,0.053476,3.254411,-2.354948,1.1244,3.294606,0.789759,1.568256,-1.717914,-1.488122,...,0.03821,-0.026179,0.020502,-0.022406,0.039239,-0.048094,-0.049195,0.000666,-0.043346,-0.014125
48-501,-1.223772,0.833122,1.870643,2.163266,1.23542,-1.99923,0.832994,0.994178,2.96052,-0.574326,...,-0.041487,-0.077207,-0.073975,-0.052644,-0.0383,0.012412,-0.034154,0.004892,-0.010568,0.017196
37-75,-1.657667,-0.554341,-1.087842,0.11969,0.536578,-0.0965,0.084551,0.097275,-0.220619,0.777194,...,0.002758,-0.074111,0.015364,-0.033788,0.005091,0.007308,0.009076,0.015412,-0.008132,0.001402
47-121,-1.178313,0.194277,0.08804,-1.204085,1.229355,-1.892059,0.820515,0.146754,-1.121749,1.578236,...,-0.053814,-0.04498,-0.034694,0.002646,-0.026171,0.016932,-0.085394,-0.018404,-0.021656,-0.002854
40-73,-0.997916,-1.570042,-0.163419,-0.111482,1.270342,-0.396692,0.593937,-0.733054,-0.601158,0.863157,...,0.052383,0.08446,-0.006165,0.074183,0.028334,-0.024281,0.027635,0.004159,0.0444,-0.01844


In [74]:
#Concatenate original labels that were removed for standardization and PCA
pca_df[labels_col] = df_labels
pca_df.head()

Unnamed: 0_level_0,Population_Size,Population_Density,Poverty,Age_19_Under,Age_19_64,Age_65_84,Age_85_and_Over,White,Black,Native_American,...,Pert_Rpt,Pert_Exp,Syphilis_Rpt,Syphilis_Exp,No_Exercise,Obesity,Smoker,Diabetes,Uninsured,HPSA_Ind
State_And_County_FIPS_Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2-282,-1.077315,0.053476,3.254411,-2.354948,1.1244,3.294606,0.789759,1.568256,-1.717914,-1.488122,...,-0.026179,0.020502,-0.022406,0.039239,-0.048094,-0.049195,0.000666,-0.043346,-0.014125,1
48-501,-1.223772,0.833122,1.870643,2.163266,1.23542,-1.99923,0.832994,0.994178,2.96052,-0.574326,...,-0.077207,-0.073975,-0.052644,-0.0383,0.012412,-0.034154,0.004892,-0.010568,0.017196,1
37-75,-1.657667,-0.554341,-1.087842,0.11969,0.536578,-0.0965,0.084551,0.097275,-0.220619,0.777194,...,-0.074111,0.015364,-0.033788,0.005091,0.007308,0.009076,0.015412,-0.008132,0.001402,1
47-121,-1.178313,0.194277,0.08804,-1.204085,1.229355,-1.892059,0.820515,0.146754,-1.121749,1.578236,...,-0.04498,-0.034694,0.002646,-0.026171,0.016932,-0.085394,-0.018404,-0.021656,-0.002854,1
40-73,-0.997916,-1.570042,-0.163419,-0.111482,1.270342,-0.396692,0.593937,-0.733054,-0.601158,0.863157,...,0.08446,-0.006165,0.074183,0.028334,-0.024281,0.027635,0.004159,0.0444,-0.01844,2


In [76]:
#Split Data
data_feats = pca_df.loc[:, pca_df.columns != labels_col]
data_labels = pca_df.iloc[:, pca_df.columns == labels_col]

#Using 80-20 split
train_feats,test_feats, train_labels, test_labels = sk.model_selection.train_test_split(data_feats, data_labels, test_size=0.2)

train_data = train_feats
test_data = test_feats

train_data[labels_col] = train_labels
test_data[labels_col] = test_labels

train_dataset_filename = newdata_path + "BALANCEDTRAINDATA.csv"
test_dataset_filename = newdata_path + "BALANCEDTESTDATA.csv" 

train_data.to_csv(train_dataset_filename)
test_data.to_csv(test_dataset_filename)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


In [77]:
#Upload training dataset to Darwin (delete first if a dataset with this name already exists)
train_dataset_name = "balanced_healthcare-training-data"

status, dataset = ds.delete_dataset(train_dataset_name)
print(dataset)
status, dataset = ds.upload_dataset(train_dataset_filename, train_dataset_name)
if not status:
    print(dataset)

None


In [78]:
#Use Darwin to clean training data
status, job_id = ds.clean_data(train_dataset_name, target=labels_col)
if status:
    ds.wait_for_job(job_id['job_name'])
print(job_id)

{'status': 'Requested', 'starttime': '2019-04-23T21:14:23.919902', 'endtime': None, 'percent_complete': 0, 'job_type': 'CleanDataTiny', 'loss': None, 'generations': None, 'dataset_names': ['balanced_healthcare-training-data'], 'artifact_names': ['a8aa5ee320984f5fab13ec4a31bd75a4'], 'model_name': None, 'job_error': None}
{'status': 'Complete', 'starttime': '2019-04-23T21:14:23.919902', 'endtime': '2019-04-23T21:14:27.822694', 'percent_complete': 100, 'job_type': 'CleanDataTiny', 'loss': None, 'generations': None, 'dataset_names': ['balanced_healthcare-training-data'], 'artifact_names': ['a8aa5ee320984f5fab13ec4a31bd75a4'], 'model_name': None, 'job_error': ''}
{'job_name': '68fe8ea212534fbd98cfb8ae418df161', 'artifact_name': 'a8aa5ee320984f5fab13ec4a31bd75a4'}


In [79]:
#Delete existing model under this name
model_name= labels_col + "_model1-balanced"
status = ds.delete_model(model_name)
print(status)

(True, None)


In [80]:
#Create model using cleaned training set
status, job_id = ds.create_model(dataset_names=train_dataset_name, model_name =  model_name, max_train_time = '00:02')
if status:
    ds.wait_for_job(job_id['job_name'])
else:
    print(job_id)

{'status': 'Requested', 'starttime': '2019-04-23T21:15:06.706781', 'endtime': None, 'percent_complete': 0, 'job_type': 'TrainModel', 'loss': None, 'generations': 0, 'dataset_names': ['balanced_healthcare-training-data'], 'artifact_names': None, 'model_name': 'HPSA_Ind_model1-balanced', 'job_error': None}
{'status': 'Running', 'starttime': '2019-04-23T21:15:06.706781', 'endtime': None, 'percent_complete': 8, 'job_type': 'TrainModel', 'loss': 0.5071820020675659, 'generations': 5, 'dataset_names': ['balanced_healthcare-training-data'], 'artifact_names': None, 'model_name': 'HPSA_Ind_model1-balanced', 'job_error': ''}
{'status': 'Running', 'starttime': '2019-04-23T21:15:06.706781', 'endtime': None, 'percent_complete': 16, 'job_type': 'TrainModel', 'loss': 0.4966951906681061, 'generations': 8, 'dataset_names': ['balanced_healthcare-training-data'], 'artifact_names': None, 'model_name': 'HPSA_Ind_model1-balanced', 'job_error': ''}
{'status': 'Running', 'starttime': '2019-04-23T21:15:06.70678

In [81]:
#Upload our held-out test dataset to Darwin (first delete any existing dataset under same name)
test_dataset_name = "balanced-healthcare-test-data"

ds.delete_dataset(test_dataset_name)
status, dataset = ds.upload_dataset(test_dataset_filename, test_dataset_name)
if not status:
    print(dataset)

In [82]:
#Use Darwin to clean our held-out test dataset
status, job_id = ds.clean_data(test_dataset_name, target=labels_col)
if status:
    ds.wait_for_job(job_id['job_name'])
print(job_id)

{'status': 'Requested', 'starttime': '2019-04-23T21:18:06.549883', 'endtime': None, 'percent_complete': 0, 'job_type': 'CleanDataTiny', 'loss': None, 'generations': None, 'dataset_names': ['balanced-healthcare-test-data'], 'artifact_names': ['7c3c6976f3a04c68a8b8c32c93848abc'], 'model_name': None, 'job_error': None}
{'status': 'Complete', 'starttime': '2019-04-23T21:18:06.549883', 'endtime': '2019-04-23T21:18:09.569518', 'percent_complete': 100, 'job_type': 'CleanDataTiny', 'loss': None, 'generations': None, 'dataset_names': ['balanced-healthcare-test-data'], 'artifact_names': ['7c3c6976f3a04c68a8b8c32c93848abc'], 'model_name': None, 'job_error': ''}
{'job_name': '17ecaf9c24d141b78363d9751dae9fa3', 'artifact_name': '7c3c6976f3a04c68a8b8c32c93848abc'}


In [83]:
#Run model against our held-out test dataset
status, job_id = ds.run_model(test_dataset_name, model_name)
if status:
    ds.wait_for_job(job_id['job_name'])
print(job_id)

{'status': 'Running', 'starttime': '2019-04-23T21:18:34.91486', 'endtime': None, 'percent_complete': 0, 'job_type': 'RunModel', 'loss': 0.49523404240608215, 'generations': 9, 'dataset_names': ['balanced-healthcare-test-data'], 'artifact_names': ['1607922b980347e3ade3857fd01c7d52'], 'model_name': 'HPSA_Ind_model1-balanced', 'job_error': ''}
{'status': 'Complete', 'starttime': '2019-04-23T21:18:34.91486', 'endtime': '2019-04-23T21:18:35.928893', 'percent_complete': 100, 'job_type': 'RunModel', 'loss': 0.49523404240608215, 'generations': 9, 'dataset_names': ['balanced-healthcare-test-data'], 'artifact_names': ['1607922b980347e3ade3857fd01c7d52'], 'model_name': 'HPSA_Ind_model1-balanced', 'job_error': ''}
{'job_name': '1f1ef266470a4c29a142fee2a2e3a919', 'artifact_name': '1607922b980347e3ade3857fd01c7d52'}


In [84]:
#Prints results of run_model which include the predictions of test data labels
#as well as the respective probabilities of each class
status, predictions_table = ds.download_artifact(job_id['artifact_name'])
print(predictions_table)

     HPSA_Ind        prob_1    prob_2
0           1  8.553236e-01  0.144676
1           2  1.751242e-01  0.824876
2           1  8.085928e-01  0.191407
3           1  8.859578e-01  0.114042
4           2  4.717572e-02  0.952824
5           2  9.224440e-05  0.999908
6           1  8.954995e-01  0.104500
7           1  9.479465e-01  0.052053
8           2  4.212109e-01  0.578789
9           2  1.616088e-01  0.838391
10          1  9.534643e-01  0.046536
11          1  5.223508e-01  0.477649
12          2  4.685535e-01  0.531446
13          2  4.210941e-01  0.578906
14          1  7.239013e-01  0.276099
15          2  1.054793e-01  0.894521
16          1  8.313764e-01  0.168624
17          1  8.427774e-01  0.157223
18          1  9.479465e-01  0.052053
19          2  9.000000e-10  1.000000
20          1  7.467537e-01  0.253246
21          2  4.380000e-07  1.000000
22          2  1.953880e-05  0.999980
23          1  8.184193e-01  0.181581
24          1  6.792305e-01  0.320770
25          

In [85]:
#Write predicted labels to CSV
predicted_labels = pd.DataFrame(predictions_table)
predicted_labels.to_csv(newdata_path + 'BALANCEDPREDICTIONS.csv')

In [86]:
#Print accuracy score of run_model results on test data
accuracy_score = sk.metrics.accuracy_score(test_labels,predicted_labels[labels_col])
print(accuracy_score)

0.785


In [87]:
#Use Darwin to analyze the model we created, trained, and tested
status, job_id = ds.analyze_model(model_name)
if status:
    ds.wait_for_job(job_id['job_name'])
print(job_id)

{'status': 'Running', 'starttime': '2019-04-23T21:19:12.30925', 'endtime': None, 'percent_complete': 0, 'job_type': 'AnalyzeModel', 'loss': 0.49523404240608215, 'generations': 9, 'dataset_names': None, 'artifact_names': ['051e35d6be4f4aba8e96d8048499137e'], 'model_name': 'HPSA_Ind_model1-balanced', 'job_error': ''}
{'status': 'Complete', 'starttime': '2019-04-23T21:19:12.30925', 'endtime': '2019-04-23T21:19:13.467858', 'percent_complete': 100, 'job_type': 'AnalyzeModel', 'loss': 0.49523404240608215, 'generations': 9, 'dataset_names': None, 'artifact_names': ['051e35d6be4f4aba8e96d8048499137e'], 'model_name': 'HPSA_Ind_model1-balanced', 'job_error': ''}
{'job_name': '4f7cbf11b4c644248eb2cdacd1d9113c', 'artifact_name': '051e35d6be4f4aba8e96d8048499137e'}


In [88]:
#Print results of analyze_model showing the importance of each feature in our dataset
status, artifact = ds.download_artifact(job_id['artifact_name'])
print(artifact)

Population_Size       0.132073
Population_Density    0.056081
HepB_Exp              0.046798
Smoker                0.042580
White                 0.040623
No_Exercise           0.032478
Age_19_Under          0.025626
Ecol_Rpt              0.023349
Poverty               0.021954
HepB_Rpt              0.021653
Obesity               0.021035
Syphilis_Exp          0.019601
HepA_Rpt              0.019077
ALE                   0.018988
Pert_Rpt              0.018543
Pert_Exp              0.017404
No_HS_Diploma         0.017172
HepA_Exp              0.017155
Brst_Cancer           0.016995
Age_19_64             0.016860
Over_40               0.015968
Age_85_and_Over       0.015538
Hispanic              0.014805
Suicide               0.014749
Ecol_Exp              0.014730
Age_65_84             0.014184
Syphilis_Rpt          0.013957
Asian                 0.013556
Under_18              0.013547
Uninsured             0.013322
Unemployed            0.013206
Salm_Rpt              0.012529
Diabetes

In [89]:
#Print confusion matrix and classification report of our model results
print("Confusion Matrix")
print(confusion_matrix(test_labels,predicted_labels[labels_col]))
print("\nClassification Report")
print(classification_report(test_labels, predicted_labels[labels_col]))

Confusion Matrix
[[180  31]
 [ 55 134]]

Classification Report
              precision    recall  f1-score   support

           1       0.77      0.85      0.81       211
           2       0.81      0.71      0.76       189

   micro avg       0.79      0.79      0.79       400
   macro avg       0.79      0.78      0.78       400
weighted avg       0.79      0.79      0.78       400

