In [2]:
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split 
from scipy.stats.stats import pearsonr
from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import GridSearchCV
import sklearn.metrics 

import matplotlib.pyplot as plt

def roc_auc(pred, act, plot=True, label = "curve"):
    prob = pred/pred.max() #normalize
    fpr, tpr, threshold = sklearn.metrics.roc_curve(act, prob, drop_intermediate=True)    
    auc = sklearn.metrics.auc(fpr, tpr)

    if plot:
        plt.scatter(x=fpr, y=tpr, color='navy')
        rcolor = tuple(np.random.rand(3,1)[:,0])
        plt.plot(fpr, tpr, c=rcolor, lw=2, label=label + ' (AUC = %0.3f)' % auc)
        plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC Curve')
        plt.legend(loc="lower right")
        plt.show()

    return auc

In [3]:
df_raw_all = pd.read_csv('diabetic_data.csv') 
df_raw = df_raw_all.sample(10000)
df_raw = df_raw.replace('?', np.nan) 
df_raw.shape

(10000, 50)

In [11]:
col_data = df_raw.apply(lambda s: set(s.unique()), axis=0).to_frame('uni_val')
col_data['nan_rat'] = df_raw.isnull().sum(axis=0)/len(df_raw)
col_data['n_uni_vals'] = col_data.uni_val.apply(len)
col_data['uni_vals_str'] = col_data[col_data.n_uni_vals<2000].uni_val.astype(str)
col_data = col_data.drop('uni_val', axis=1)
col_data['var_type'] = np.nan
col_data.to_csv("columns_raw.csv")

In [12]:
col_data = pd.read_csv( "columns.csv", index_col=0)
col_data.sample(10)

Unnamed: 0,nan_rat,n_uni_vals,uni_vals_str,var_type,comment
chlorpropamide,0.0,4,"{'Steady', 'Up', 'No', 'Down'}",cat,
diag_1,0.0003,474,"{nan, '644', '226', '568', '225', '637', '388'...",drop,eventually we will include top diagnoses
payer_code,0.3962,17,"{nan, 'HM', 'OG', 'MC', 'DM', 'CM', 'SI', 'MD'...",cat,
glimepiride-pioglitazone,0.0,1,{'No'},drop,no information here!
metformin,0.0,4,"{'Steady', 'Up', 'No', 'Down'}",cat,
acetohexamide,0.0,1,{'No'},drop,no information here!
num_procedures,0.0,7,"{0, 1, 2, 3, 4, 5, 6}",cont,
medical_specialty,0.4941,55,"{nan, 'Surgery-Plastic', 'Pulmonology', 'Osteo...",drop,eventually we will include top specialties
glimepiride,0.0,4,"{'Steady', 'Up', 'No', 'Down'}",cat,
admission_source_id,0.0,13,"{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 17, 20, 22}",cat,


In [5]:
#TODO recapture medical specialty
spec_counts = df_raw.medical_specialty.value_counts()
spec_counts.head(5).to_frame('num patients')
spec_thresh = 5
for (spec, count) in spec_counts.head(spec_thresh).iteritems():
    new_col = 'spec_' + str(spec)
    df_raw[new_col] = (df_raw.medical_specialty == spec)
    
df_raw.filter(regex='spec').sample(10)

Unnamed: 0,medical_specialty,spec_InternalMedicine,spec_Family/GeneralPractice,spec_Emergency/Trauma,spec_Cardiology,spec_Surgery-General
7859,,False,False,False,False,False
50357,Orthopedics-Reconstructive,False,False,False,False,False
54427,,False,False,False,False,False
13934,InternalMedicine,True,False,False,False,False
6090,,False,False,False,False,False
6166,InternalMedicine,True,False,False,False,False
34611,Surgery-Neuro,False,False,False,False,False
31066,Cardiology,False,False,False,True,False
85150,,False,False,False,False,False
42970,InternalMedicine,True,False,False,False,False


In [6]:
diag_counts = (df_raw.diag_1.value_counts() + df_raw.diag_2.value_counts() + df_raw.diag_3.value_counts()).sort_values(ascending=False)
diag_counts.head(10).to_frame('num patients w diag')

Unnamed: 0,num patients w diag
250,1787.0
428,1774.0
276,1345.0
401,1292.0
414,1216.0
427,1163.0
599,703.0
496,597.0
486,594.0
403,545.0


In [7]:
diag_thresh = 10
for (icd9, count) in diag_counts.head(diag_thresh).iteritems():
    new_col = 'diag_' + str(icd9)
    df_raw[new_col] = (df_raw.diag_1 == icd9)|(df_raw.diag_2 == icd9)|(df_raw.diag_3 == icd9)
    
df_raw.filter(regex='diag_').sample(10)

Unnamed: 0,diag_1,diag_2,diag_3,diag_250,diag_428,diag_276,diag_401,diag_414,diag_427,diag_599,diag_496,diag_486,diag_403
2921,486,571.0,38,False,False,False,False,False,False,False,False,True,False
11835,590,413.0,458,False,False,False,False,False,False,False,False,False,False
7650,250.6,707.0,41,False,False,False,False,False,False,False,False,False,False
59780,414,427.0,70,False,False,False,False,True,True,False,False,False,False
46121,V57,396.0,799,False,False,False,False,False,False,False,False,False,False
95865,996,403.0,585,False,False,False,False,False,False,False,False,False,True
33716,780,250.0,278,True,False,False,False,False,False,False,False,False,False
82970,789,424.0,428,False,True,False,False,False,False,False,False,False,False
77972,721,250.02,724,False,False,False,False,False,False,False,False,False,False
63520,493,493.0,250,True,False,False,False,False,False,False,False,False,False


In [13]:
df_raw2 = pd.DataFrame(df_raw, copy=True) #preserve df_raw so I can rerun this step
df_raw2['age'] = df_raw2.age.str.extract('(\d+)-\d+')

to_drop = col_data[col_data.var_type.str.contains('drop')].index
df_raw2.drop(to_drop, axis=1, inplace=True)

#break out categorical variables into binaries
cat_cols = col_data[col_data.var_type.str.contains('cat')].index
df_raw2 = pd.get_dummies(df_raw2, columns=cat_cols)

#dropping these leaves up with one binary variable, ideal for simplicity
df_raw2.drop(['readmitted_<30','readmitted_>30'], axis=1, inplace=True)
#cleaning up outcome variable
df_raw2['is_readmitted'] = (df_raw2.readmitted_NO == 0)
df_raw2.drop('readmitted_NO', axis=1, inplace=True)

#ta daaaaaah, the data is ready to go
df = pd.DataFrame(df_raw2)
df.shape

(10000, 158)

In [14]:
df.is_readmitted.value_counts()

False    5408
True     4592
Name: is_readmitted, dtype: int64

In [15]:
#partition training and test data, one balanced training set, all remaining for testing 
outcome_column = 'is_readmitted' 

#Imputing with outlying value since we are focusing on tree based methods
dff = df.fillna(df.mean) # using mean rather than -9999 which I use for tree methods 

#%% Split data for validation
X = dff.drop(outcome_column, axis=1) 
y = dff[outcome_column] 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0) 

In [17]:
##### SINGULAR LOGISTIC REGRESSION
def apply_pearsonr(col):
    isnan = col.isnull()
    xi = col[isnan==False].astype(float)
    yi = y[isnan==False].astype(float)
    (r,p) = pearsonr(xi,yi)
    return (r,p)

res = X.apply(apply_pearsonr).to_frame()
res['r'] = res[0].apply(lambda x: x[0])
res['p'] = res[0].apply(lambda x: x[1])

res.drop(0, axis=1).sort_values('p').head(15)



AttributeError: 'DataFrame' object has no attribute 'to_frame'