In [3]:
"""
Get the packages we need to manipulate the dataset 
"""
import numpy as np 
import pandas as pd 
import os 


In [4]:
"""
Specify the directories for data. These are universal for our repository -
this is the only thing you should not change.
"""

# Build the data path with os.path.join so the separator is always right for
# the host OS. DATA_NAME is a bare file name: a leading slash would make
# os.path.join discard DATA_DIR and point at the filesystem root.
DATA_DIR = os.path.join('.', 'raw_data')

DATA_NAME = 'diabetic_data.csv'

DATA_PATH = os.path.join(DATA_DIR, DATA_NAME)

# Fail fast with a single, predictable error before any later cell tries to
# read the file. isfile() is False both when the directory is missing and when
# the file is missing, so no separate directory listing is needed.
if os.path.isfile(DATA_PATH):
    print('DATA_PATH is good path')
else:
    raise ValueError('DATA_PATH is not valid path')


ValueError: DATA_PATH is not valid path

In [None]:
"""
Define the size of the data sample to read in. Use to_skip to define the row-selection logic. 

Import the data as a pandas DataFrame. 
"""

# Number of data rows to load from the CSV.
SAMPLE_SIZE = 10**4

def to_skip(index):
    """
    Row predicate for the ``skiprows`` argument of ``pandas.read_csv``.

    pandas evaluates the callable against each row index of the file and
    skips the row when the result is True, so this returns True only for
    rows that lie beyond the sample. Indices 0 through SAMPLE_SIZE are kept
    (index 0 is the header line when the file has one, which leaves
    SAMPLE_SIZE data rows).

    Parameters
    ----------
    index : int
        Zero-based row index within the file.

    Returns
    -------
    bool
        True if the row must be skipped, False if it must be read.
    """
    return index > SAMPLE_SIZE

# Load only the first SAMPLE_SIZE data rows of the CSV into a DataFrame.
raw_data = pd.read_csv(DATA_PATH, nrows=SAMPLE_SIZE)

# Report the dimensions of the sample, then its column labels.
# NOTE: a notebook renders only the final expression of a cell, so a row
# preview such as raw_data.head(10) belongs at the end of its own cell.
print(raw_data.shape)
print(raw_data.columns)

# Process raw data according to the values which are missing. Define the
# features to keep based on initial report of completeness and information.
to_keep = [
    'race',
    'gender',
    'age',
    'weight',
    'admission_type_id',
    'discharge_disposition_id',
    'time_in_hospital',
    'num_lab_procedures',
    'num_procedures',
    'num_medications',
    'number_outpatient',
    'number_emergency',
    'number_inpatient',
    'diag_1',
    'diag_2',
]
           
           
           

In [None]:
"""
Define the features desired to keep based on the initial summary of data completeness provided here: 

https://www.hindawi.com/journals/bmri/2014/781670/tab1/

This cell is intended only for list manipulation - we actually slice the data in the next cell 
""" 

# Column labels of the loaded sample; the keep/toss lists are drawn from these.
features = raw_data.columns
print('Here are all possible features: \n\n', features, '\n')

# The 'simple' features identified by Will - we roughly know how to interpret these.
ft_basic = [
    'race',
    'gender',
    'age',
    'admission_type_id',
    'discharge_disposition_id',
    'time_in_hospital',
    'num_lab_procedures',
    'num_procedures',
    'num_medications',
    'number_outpatient',
    'number_emergency',
    'number_inpatient',
    'number_diagnoses',
    'change',
    'diabetesMed',
    'readmitted',
]

# Separate list object, so the keep-list can be edited without altering ft_basic.
ft_basic_keep = list(ft_basic)

# Complex medical features we do not understand yet: the diagnosis codes,
# the two lab results, and one column per medication.
ft_med = [
    'diag_1', 'diag_2', 'diag_3',
    'max_glu_serum', 'A1Cresult',
    'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
    'glimepiride', 'acetohexamide', 'glipizide', 'glyburide',
    'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose',
    'miglitol', 'troglitazone', 'tolazamide', 'examide',
    'citoglipton', 'insulin',
    'glyburide-metformin',
    'glipizide-metformin',
    'glimepiride-pioglitazone',
    'metformin-rosiglitazone',
    'metformin-pioglitazone',
]

# Medical features chosen for retention - none have been selected.
ft_med_keep = []

# The final keep-list, and every other column of the sample as the toss-list.
ft_keep = [*ft_basic_keep, *ft_med_keep]
ft_toss = [column for column in features if column not in ft_keep]

print('Here are the features we decided to keep:\n\n', ft_keep, '\n')
print('Here are the features we decided to eliminate: \n\n', ft_toss)



In [None]:
"""
Subset the Raw Data: Here, we refine data according to the desired features to keep defined in the previous cell. Then 
we type-convert the columns in order to actually apply clustering to them. 
"""
# Restrict the sample to the columns listed in ft_keep (all rows retained).
data = raw_data.loc[:, ft_keep]

# Labels of the retained columns, iterated over by the cleaning cell.
ft_kept = data.columns

# Final expression of the cell: renders a preview of the first ten rows.
data.head(10)

In [None]:
"""
Clean the Data: Here we identify rows with missing fields in the desired categories. First, nan/None type is filtered and 
the row number is added to a list of rows which will be skipped when reading in the data for processing
"""

# Walk the retained features one at a time and report, per feature, the row
# labels where that feature is missing. Doing it per column shows which
# features are responsible, which a single .dropna() call would hide.
# NOTE(review): isnull() only detects NaN/None. If the CSV marks missing fields
# with a placeholder string, pass it to read_csv via na_values - confirm
# against the data.
for feature in ft_kept:
    feature_series = data[feature]
    missing_bool = feature_series.isnull()
    nan_indices = feature_series.index[missing_bool]
    print(nan_indices)

# Row labels with at least one missing field, each listed once and in row
# order. Computed across the whole frame at once; ft_kept is data.columns, so
# this covers exactly the features reported by the loop.
rows_nan = data.index[data.isnull().any(axis=1)].tolist()

# Distinct values of every retained feature, in order of first appearance.
# Series.unique() replaces the quadratic list-membership scan and reports a
# missing value once rather than once per occurrence.
value_dictionary = {
    feature: data[feature].unique().tolist()
    for feature in ft_kept
}

# Here are the available values in each feature that are stored
print(value_dictionary)