In [1]:
import pandas as pd
import numpy as np
from sklearn.externals import joblib
import warnings
warnings.filterwarnings('ignore')

### Survey patients characteristics

In [2]:
file_path = './data/nys-patient-characteristics-survey-pcs-2015/patient-characteristics-survey-pcs-2015.csv'
labels = ['Diabetes','Obesity','Heart Attack','Stroke','Other Cardiac']

check_df = pd.read_csv(file_path)
print(f'shape: {check_df.shape}')
n_cols_unknown = len([col for col in check_df.columns if "UNKNOWN" in check_df[col].values])

print(f'{n_cols_unknown} out of {len(check_df.columns)} columns contain the UNKNOWN value:\n')

for col in check_df.columns:
    col_vals = check_df[col].values
    if 'UNKNOWN' in col_vals:
        perc_unknown = len(col_vals[col_vals=='UNKNOWN'])/len(col_vals)
        print(f'the percentage of unknown values in {col} is {perc_unknown}')
        if perc_unknown>0.1:
            #rimuovi la colonna
            check_df.drop(col,1,inplace=True)
        else:
            #rimuovi le righe
            indexes_unknown = check_df[check_df[col]=="UNKNOWN"].index.values
            check_df.drop(indexes_unknown,inplace=True)


shape: (179096, 67)
58 out of 67 columns contain the UNKNOWN value:

the percentage of unknown values in Age Group is 0.00010050475722517533
the percentage of unknown values in Sex is 0.00209964373066485
the percentage of unknown values in Transgender is 0.02722409374265537
the percentage of unknown values in Sexual Orientation is 0.07731955797672532
the percentage of unknown values in Hispanic Ethnicity is 0.014563954213322027
the percentage of unknown values in Living Situation is 0.00764266734151588
the percentage of unknown values in Household Composition is 0.018565271721112896
the percentage of unknown values in Preferred Language is 0.0023970378069377678
the percentage of unknown values in Veteran Status is 0.015582369067076466
the percentage of unknown values in Education Status is 0.053268332693910486
the percentage of unknown values in Special Education Services is 0.006029694323144105
the percentage of unknown values in Mental Illness is 0.006319326313421715
the percentage o

In [3]:
dummified_check_df = pd.get_dummies(check_df.drop('Survey Year',1),prefix_sep='=')
dummified_check_df = pd.concat([dummified_check_df,check_df['Survey Year']],1)
dummified_check_df.to_csv('./data/preprocessed_patient-characteristics.csv',index=False)

In [5]:
numerical_vars= ['Survey Year']
labels_name = [col for col in dummified_check_df.columns if ('Diabetes' in col) or ('Obesity' in col) or ('Heart Attack' in col)]
categorical_vars = [col for col in dummified_check_df if col not in labels_name and col not in numerical_vars]

np.save('./data/numerical_vars.npy',numerical_vars)
np.save('./data/labels_name.npy',labels_name)
np.save('./data/categorical_vars.npy',categorical_vars)

In [6]:
from sklearn.ensemble import RandomForestClassifier

dummified_check_df = dummified_check_df.reset_index(drop=True)

bb = RandomForestClassifier()
bb.fit(dummified_check_df[numerical_vars+categorical_vars].values,dummified_check_df[labels_name].values)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [8]:
from sklearn.externals import joblib
joblib.dump(bb, './black_boxes/RandomForest.pkl') 

['./black_boxes/RandomForest.pkl']