In [52]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [53]:
# Read data/train.csv (path relative to the notebook's working directory) into `df`.
df = pd.read_csv('data/train.csv')

In [54]:
# Peek at the first rows to check the column names and how values are formatted.
df.head()

Unnamed: 0,enc_id,patient_id,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmission_id
0,88346340,2488608,Caucasian,Male,[60-70),,1,2,6,3,...,No,Steady,No,No,No,No,No,Ch,Yes,2
1,92001408,52133202,Caucasian,Male,[70-80),[100-125),2,6,1,7,...,No,No,No,No,No,No,No,No,Yes,1
2,169424316,40945509,Caucasian,Female,[70-80),,3,2,1,7,...,No,Up,No,No,No,No,No,Ch,Yes,1
3,272987082,38850777,Caucasian,Female,[50-60),,1,1,7,1,...,No,No,No,No,No,No,No,No,Yes,2
4,150600612,72738225,Caucasian,Female,[80-90),,1,6,7,6,...,No,Down,No,No,No,No,No,Ch,Yes,2


In [55]:
# (rows, columns) of the frame as loaded, before any cleaning.
df.shape

(71236, 50)

In [56]:
# Frequency of each gender label; the column holds strings, so it must be encoded later.
df['gender'].value_counts()

gender
Female             38235
Male               32998
Unknown/Invalid        3
Name: count, dtype: int64

## Filling Missing Values

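- `payer_code` and `medical_specialty` each have more than 40% missing values.
- `weight`, `max_glu_serum` and `A1Cresult` each have more than 80% missing values.
- Because so much of each column is missing, these columns are dropped rather than imputed.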

In [57]:
# Remove the columns that are too sparse to be useful. Reassigning the result
# (instead of mutating with inplace=True) makes it explicit that `df` now
# refers to the reduced frame, and `columns=` states the axis by name.
df = df.drop(columns=['payer_code', 'medical_specialty', 'weight'])

In [58]:
# Count the rows whose gender is the 'Unknown/Invalid' placeholder; there are few enough to drop.
df['gender'].eq('Unknown/Invalid').sum()

3

In [59]:
# Keep only the rows whose gender is not the 'Unknown/Invalid' placeholder.
df = df.loc[df['gender'].ne('Unknown/Invalid')]

In [60]:
# Both columns contain the same single value in every row, so they carry no
# information and can be dropped.
for constant_col in ('citoglipton', 'examide'):
    print(df[constant_col].value_counts())

citoglipton
No    71233
Name: count, dtype: int64
examide
No    71233
Name: count, dtype: int64


In [61]:
df['max_glu_serum'].value_counts() # ! NaN is not counted here: only 3,721 of the 71,233 rows have a value, so ~67,512 (~95%) are missing

max_glu_serum
Norm    1790
>200    1034
>300     897
Name: count, dtype: int64

In [62]:
# citoglipton and examide are constant; max_glu_serum and A1Cresult are mostly
# missing. Reassign the result instead of using inplace=True so the cell
# states explicitly that `df` is replaced by the reduced frame.
colsToDrop = ['citoglipton', 'examide', 'max_glu_serum', 'A1Cresult']
df = df.drop(columns=colsToDrop)

In [63]:
def missing_values(df):
    """Print, for every column of ``df``, the percentage of rows that are missing.

    One line is printed per column, in column order, in the form
    ``<column name> <percentage>``. Nothing is returned.
    """
    n_rows = len(df)
    for column_name in df.columns:
        n_missing = df[column_name].isna().sum()
        print(column_name, n_missing / n_rows * 100)

In [64]:
# Drop every row that is missing a value in any of diag_1, diag_2, diag_3 or
# race. One dropna(subset=...) call removes the same rows as four chained
# notna() filters, in a single pass.
df = df.dropna(subset=['diag_1', 'diag_2', 'diag_3', 'race'])

In [65]:
# Re-check the per-column missing percentage now that sparse columns and incomplete rows are gone.
missing_values(df)

enc_id 0.0
patient_id 0.0
race 0.0
gender 0.0
age 0.0
admission_type_id 0.0
discharge_disposition_id 0.0
admission_source_id 0.0
time_in_hospital 0.0
num_lab_procedures 0.0
num_procedures 0.0
num_medications 0.0
number_outpatient 0.0
number_emergency 0.0
number_inpatient 0.0
diag_1 0.0
diag_2 0.0
diag_3 0.0
number_diagnoses 0.0
metformin 0.0
repaglinide 0.0
nateglinide 0.0
chlorpropamide 0.0
glimepiride 0.0
acetohexamide 0.0
glipizide 0.0
glyburide 0.0
tolbutamide 0.0
pioglitazone 0.0
rosiglitazone 0.0
acarbose 0.0
miglitol 0.0
troglitazone 0.0
tolazamide 0.0
insulin 0.0
glyburide-metformin 0.0
glipizide-metformin 0.0
glimepiride-pioglitazone 0.0
metformin-rosiglitazone 0.0
metformin-pioglitazone 0.0
change 0.0
diabetesMed 0.0
readmission_id 0.0


## Feature Engineering

In [70]:
# Inspect which values one drug column takes (here: No / Steady / Up) before encoding them.
df['chlorpropamide'].value_counts()

chlorpropamide
No        68550
Steady       60
Up            4
Name: count, dtype: int64

* **Number of medication changes:** The dataset contains 23 features, one per drug (or drug combination), each indicating whether that medication was changed during the patient's current hospital stay. Two of them (`citoglipton` and `examide`) were dropped above because they are constant, so 21 remain. Previous research has associated medication changes for diabetic patients upon admission with lower readmission rates. We therefore count how many changes were made in total for each patient and store that count as a new feature; a drug counts as changed when its value is anything other than `No` or `Steady`. The reasoning is both to simplify the model and to possibly discover a relationship with the number of changes, regardless of which drug was changed.

In [73]:
# The 21 drug columns still present in `df` (citoglipton and examide were
# dropped earlier). The order is preserved because it determines the order of
# the new '<drug>temp' columns.
drugs = [
    'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
    'glimepiride', 'glipizide', 'glyburide', 'pioglitazone',
    'rosiglitazone', 'acarbose', 'miglitol', 'insulin',
    'glyburide-metformin', 'tolazamide', 'metformin-pioglitazone',
    'metformin-rosiglitazone', 'glimepiride-pioglitazone',
    'glipizide-metformin', 'troglitazone', 'tolbutamide', 'acetohexamide',
]
# For each drug add a '<drug>temp' indicator: 0 when the value is 'No' or
# 'Steady', 1 for anything else. isin() evaluates the whole column at once
# instead of calling a Python lambda per row; the result is still int64 0/1.
for drug in drugs:
    new_col = drug + 'temp'
    df[new_col] = (~df[drug].isin(['No', 'Steady'])).astype('int64')

In [74]:
# List the columns to confirm the '<drug>temp' indicator columns were appended.
df.columns

Index(['enc_id', 'patient_id', 'race', 'gender', 'age', 'admission_type_id',
       'discharge_disposition_id', 'admission_source_id', 'time_in_hospital',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'metformin', 'repaglinide',
       'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide',
       'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone',
       'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide',
       'insulin', 'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmission_id',
       'metformintemp', 'repaglinidetemp', 'nateglinidetemp',
       'chlorpropamidetemp', 'glimepiridetemp', 'glipizidetemp',
       'glyburidetemp', 'pioglitazonetemp', 'rosiglitazonetemp',
       'acarbosete

### 