In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl
import sklearn.model_selection as ms
import sklearn.metrics as sklm
import sklearn.decomposition as skde
from sklearn import preprocessing

In [15]:
# Load the diabetes data into a pandas DataFrame, using the encounter_id
# column as the row index.
diabetes = pd.read_csv(
    "diabetes_cleaned.csv",
    index_col='encounter_id',
)

In [16]:
# The scikit-learn model needs the label values as a NumPy array,
# so copy the 'label' column out of the DataFrame.
labels = np.array(diabetes.loc[:, 'label'])

In [17]:
# Persist the label values to a CSV file so they can be reused later.
np.savetxt(
    'Labels.csv',
    labels,
    delimiter=',',
)

In [18]:
# Drop the columns we don't need for our model, then drop the label column
# as well: its values were already saved separately (Labels.csv).
diabetes = (
    diabetes
    .drop(['diag_1', 'diag_2', 'diag_3', 'admission_type_id', 'patient_nbr', 'readmitted', 'age'], axis=1)
    .drop('label', axis=1)
)

In [19]:
# Convert the id-coded columns to strings (for more details see the
# IDs_mapping.csv file); as strings they get the object dtype.
for id_column in ('discharge_disposition_id', 'admission_source_id'):
    diabetes[id_column] = diabetes[id_column].astype(str)

In [20]:
# Names of all columns stored with the object dtype, in DataFrame column order.
categorical_columns = [
    col for col in diabetes.columns
    if diabetes[col].dtype == object
]

# bare expression so the notebook displays the list
categorical_columns


['race',
 'gender',
 'discharge_disposition_id',
 'admission_source_id',
 'max_glu_serum',
 'A1Cresult',
 'metformin',
 'repaglinide',
 'nateglinide',
 'chlorpropamide',
 'glimepiride',
 'acetohexamide',
 'glipizide',
 'glyburide',
 'tolbutamide',
 'pioglitazone',
 'rosiglitazone',
 'acarbose',
 'miglitol',
 'troglitazone',
 'tolazamide',
 'insulin',
 'glyburide-metformin',
 'glipizide-metformin',
 'glimepiride-pioglitazone',
 'metformin-rosiglitazone',
 'metformin-pioglitazone',
 'change',
 'diabetesMed',
 'new_age',
 'primary diagnosis',
 'second diagnosis',
 'third diagnosis',
 'admission_type']

In [21]:
# Recode categorical variables as binary dummy variables.
# Collect the object-dtype column names again, then take 'race' out of the
# list: it is encoded on its own to start the feature matrix.
categorical_columns = [
    col for col in diabetes.columns
    if diabetes[col].dtype == object
]
categorical_columns.remove('race')

def encode_string(cat_features):
    """One-hot encode the values of a single categorical feature.

    The values are first mapped to integer codes with a ``LabelEncoder``;
    those codes are then expanded with a ``OneHotEncoder``.

    Parameters
    ----------
    cat_features : array-like
        The category values of one feature, one entry per row.

    Returns
    -------
    numpy.ndarray
        Dense 2-D array (the ``toarray()`` of the one-hot result) with one
        row per input entry.
    """
    # strings -> integer category codes
    label_codes = preprocessing.LabelEncoder().fit_transform(cat_features)
    # the one-hot encoder is fed the codes as a single column
    code_column = label_codes.reshape(-1, 1)
    one_hot = preprocessing.OneHotEncoder().fit_transform(code_column)
    return one_hot.toarray()

# One-hot encode 'race' first, then every column left in categorical_columns,
# and join all encoded blocks side by side with a single concatenate call.
# Concatenating once avoids re-copying the growing matrix on every loop
# iteration; the resulting array is the same.
encoded_blocks = [encode_string(diabetes['race'])]
encoded_blocks.extend(encode_string(diabetes[col]) for col in categorical_columns)
Features = np.concatenate(encoded_blocks, axis=1)

# show the matrix size and the first two encoded rows
print(Features.shape)
print(Features[:2, :])

(71518, 169)
[[ 0.  0.  1.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.
   0.  0.  0.  1.  0.  0.  1.  0.  0.  0.  1.  0.  0.  0.  1.  0.  0.  0.
   1.  0.  0.  0.  1.  0.  0.  1.  0.  0.  1.  0.  0.  0.  1.  0.  0.  1.
   0.  0.  1.  0.  0.  0.  1.  0.  0.  1.  0.  0.  0.  1.  0.  0.  1.  0.
   1.  0.  0.  1.  0.  0.  0.  1.  0.  0.  1.  0.  1.  1.  0.  1.  0.  0.
   1.  1.  0.  1.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.
   0.  0.  0.  0.  0.  1.  0.]
 [ 0.  0.  1.  0.  0.  1.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  1.
   0.  0.  0.  1.  0.  0.  1.  0.  0.  0.  1.  0.  0.  0.  1.  0.  0

In [22]:
# Names of all columns stored as int64, in DataFrame column order.
nomeric_columns = [
    col for col in diabetes.columns
    if diabetes[col].dtype == 'int64'
]
# bare expression so the notebook displays the list
nomeric_columns

['time_in_hospital',
 'num_lab_procedures',
 'num_procedures',
 'num_medications',
 'number_outpatient',
 'number_emergency',
 'number_inpatient',
 'number_diagnoses']

In [23]:
# Append the numeric (int64) columns to the right-hand side of the NumPy
# feature array.
nomeric_columns = [
    col for col in diabetes.columns
    if diabetes[col].dtype == 'int64'
]

numeric_block = np.array(diabetes[nomeric_columns])
# NOTE: Features is rebuilt from itself, so running this cell again appends
# the numeric columns a second time.
Features = np.concatenate([Features, numeric_block], axis=1)
print(Features.shape)
print(Features[:2, :])

(71518, 177)
[[  0.   0.   1.   0.   0.   1.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   1.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   1.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   1.   0.   0.
    0.   1.   0.   0.   1.   0.   0.   0.   1.   0.   0.   0.   1.   0.
    0.   0.   1.   0.   0.   0.   1.   0.   0.   1.   0.   0.   1.   0.
    0.   0.   1.   0.   0.   1.   0.   0.   1.   0.   0.   0.   1.   0.
    0.   1.   0.   0.   0.   1.   0.   0.   1.   0.   1.   0.   0.   1.
    0.   0.   0.   1.   0.   0.   1.   0.   1.   1.   0.   1.   0.   0.
    1.   1.   0.   1.   0.   0.   0.   0.   0.   1.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   1.   0.   0.   0.
    0.   0.   0.   0.   0.   1.   0.   0.   0.   0.   0.   0.   0.   1.
    0.   1.  41.   0.   1.   0.   0.   0.   1.]
 [  0.   0.   1.   0.   0.   1.   0.   0.   1.   0.   0.   0.   0.   0.
   

In [24]:
# Use StandardScaler to z-score scale the numeric features only.
# The numeric columns were appended after the one-hot columns, so they are the
# last len(nomeric_columns) columns of Features. Derive the start index from
# that instead of hard-coding it: with 169 one-hot columns and 8 numeric ones
# the numeric block starts at index 169, and a literal 168 would also
# standardise the last one-hot indicator column.
numeric_start = Features.shape[1] - len(nomeric_columns)
scaler = preprocessing.StandardScaler().fit(Features[:, numeric_start:])
Features[:, numeric_start:] = scaler.transform(Features[:, numeric_start:])
print(Features.shape)
print(Features[:2, :])

(71518, 177)
[[ 0.          0.          1.          0.          0.          1.          0.
   0.          0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.          0.
   0.          0.          0.          1.          0.          0.          0.
   0.          0.          0.          0.          0.          0.          1.
   0.          0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          1.          0.          0.
   0.          1.          0.          0.          1.          0.          0.
   0.          1.          0.          0.          0.          1.          0.
   0.          0.          1.          0.          0.          0.          1.
   0.          0.          1.          0.          0.          1.          0.
   0.          0.          1.          0.          

In [25]:
# Persist the feature matrix to a CSV file so it can be reused later.
np.savetxt(
    'Features.csv',
    Features,
    delimiter=',',
)