In [1]:
#Loading relevant modules 
import numpy as np
import matplotlib as plt 
import pandas as pd 
from scipy import stats 

In [2]:
#Importing the data 
df = pd.read_csv("diabetic_data.csv")


In [3]:
'''Data Cleaning'''

#The shape of the dataframe 
print(df.shape) 

#Replace all missing values
df.replace('?',np.nan,inplace=True) #Replace ? with NaN 
df.replace(r'^\s*$', np.nan, regex=True, inplace=True) #Replace empty space with NaN

min50= float(0.5*(df.shape[0]+1)) #create object with value of 50% row total
df= df.dropna(axis=1,thresh=min50) #drop columns with values missing from 50% of rows 
df.dropna(axis=0,how='any') #drop any rows with missing values 



#drop columns with 95% same values 
col_heads=list(df.columns)
df['age'].value_counts(dropna=False) #returns each value with list of counts 
min95=float(0.95*(df.shape[0]+1)) #determine 95% of values 
for column in col_heads: #for each column in col_heads 
    mylist= df[column].value_counts(dropna=False) #make a list of counts for each value found in the column
    for x in mylist: #for each value (count no. of each value in col)
        if x >= min95: #if the value is present in 95% of rows 
            df.drop(axis=1,columns=column,inplace=True)
            statement= 'dropping column: {}'.format(column)
            print(f"Droppinf column {column}")

#Shape of dataframe after column removal 
print(df.shape)

(101766, 50)
Droppinf column repaglinide
Droppinf column nateglinide
Droppinf column chlorpropamide
Droppinf column acetohexamide
Droppinf column tolbutamide
Droppinf column acarbose
Droppinf column miglitol
Droppinf column troglitazone
Droppinf column tolazamide
Droppinf column examide
Droppinf column citoglipton
Droppinf column glyburide-metformin
Droppinf column glipizide-metformin
Droppinf column glimepiride-pioglitazone
Droppinf column metformin-rosiglitazone
Droppinf column metformin-pioglitazone
(101766, 33)


In [4]:
#Transforming the age column to middle value
ages = df["age"]
middle_age_value = [int(age.split("-")[1].strip(")"))-5 for age in ages]
series = pd.Series(middle_age_value)
df["age"] = series 
df.head(5)

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,...,metformin,glimepiride,glipizide,glyburide,pioglitazone,rosiglitazone,insulin,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,5,6,25,1,1,,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,15,1,1,7,3,,...,No,No,No,No,No,No,Up,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,25,1,1,7,2,,...,No,No,Steady,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,35,1,1,7,2,,...,No,No,No,No,No,No,Up,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,45,1,1,7,1,,...,No,No,Steady,No,No,No,Steady,Ch,Yes,NO


In [5]:
#Replacing missing values in columns diag_1/2/3 
df["diag_1"].fillna(0, inplace=True)
df["diag_2"].fillna(0, inplace=True)
df["diag_3"].fillna(0, inplace=True)

In [6]:
#List of numerical and categorical features  
col_heads=list(df.columns)
catlist= df.select_dtypes(object).columns.values.tolist()
numlist= df.select_dtypes(np.number).columns.values.tolist()

for col in numlist:
    if col[-2:]=='id':
        numlist.remove(col)
        catlist.append(col)

print(f'Numerical columns are: {numlist}')       

Numerical columns are: ['patient_nbr', 'age', 'discharge_disposition_id', 'time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient', 'number_emergency', 'number_inpatient', 'number_diagnoses']


In [7]:
print(f'Categorical columns are: {catlist}')

Categorical columns are: ['race', 'gender', 'payer_code', 'medical_specialty', 'diag_1', 'diag_2', 'diag_3', 'max_glu_serum', 'A1Cresult', 'metformin', 'glimepiride', 'glipizide', 'glyburide', 'pioglitazone', 'rosiglitazone', 'insulin', 'change', 'diabetesMed', 'readmitted', 'encounter_id', 'admission_type_id', 'admission_source_id']


In [8]:
#Removing duplicates in the column patient_nbr 
df = df.drop_duplicates(subset='patient_nbr',keep='first')
df.shape

(71518, 33)

In [9]:
#Identifying and removing outliers in the numerical columns
df2=df.copy() #create a copy to identify outliers 
for col in numlist: #for each numerical column... 
    z= np.abs(stats.zscore(df[col])) #calculate a z score 
    df2[col]=z #define a new column with z scores 
    myindex=df2[(df2[col]>3)].index #index depending on whether or not values deviate > 3 sds from mean 
    df.drop(myindex,inplace=True) #drop all rows with numerical values > 3 sd from mean 
df.shape


ValueError: Length of values (70831) does not match length of index (71518)

In [None]:
df.loc[4]

In [None]:
'''PART 2 -- DATA EXPLORATION'''