In [1]:
import numpy as np
import pandas as pd
df = pd.read_csv("diabetic_data.csv", delimiter = ",")
df.dtypes
df.replace("?", np.nan, inplace = True)

In [39]:
### Checking Missing Values:
for i in df.columns:
    print(str(i) + ":::" + str(df[i].isna().sum()))

encounter_id:::0
patient_nbr:::0
race:::2273
gender:::0
age:::0
weight:::98569
admission_type_id:::0
discharge_disposition_id:::0
admission_source_id:::0
time_in_hospital:::0
payer_code:::40256
medical_specialty:::49949
num_lab_procedures:::0
num_procedures:::0
num_medications:::0
number_outpatient:::0
number_emergency:::0
number_inpatient:::0
diag_1:::21
diag_2:::358
diag_3:::1423
number_diagnoses:::0
max_glu_serum:::0
A1Cresult:::0
metformin:::0
repaglinide:::0
nateglinide:::0
chlorpropamide:::0
glimepiride:::0
acetohexamide:::0
glipizide:::0
glyburide:::0
tolbutamide:::0
pioglitazone:::0
rosiglitazone:::0
acarbose:::0
miglitol:::0
troglitazone:::0
tolazamide:::0
examide:::0
citoglipton:::0
insulin:::0
glyburide-metformin:::0
glipizide-metformin:::0
glimepiride-pioglitazone:::0
metformin-rosiglitazone:::0
metformin-pioglitazone:::0
change:::0
diabetesMed:::0
readmitted:::0


In [40]:
### Seems like most of the NA's are in the columns that are strings. So we should focus on getting those to be encoded
### as ordinalencoder from sklearn does not do well with NA values.
nan_cols = ['race','weight','medical_specialty', 'diag_1', 'diag_2', 'diag_3']

In [41]:
df[nan_cols].dtypes

race                 object
weight               object
medical_specialty    object
diag_1               object
diag_2               object
diag_3               object
dtype: object

### Let's do the race column first

Pseudocode:

1. Replace the NA values with "NA_RACE" string.
2. Then, create the ranking such that NA_RACE is at the front of the sorted list.
3. Now, race columns will have value 0.
4. Call the replace function to replace 0 with np.NaN
5. For now, we will use the regular ordinal encoder for all out columns to simplify the data cleaning

In [42]:
X = df.iloc[:,:-1]
t = df['readmitted']

In [43]:
X['race'] = X['race'].replace(np.nan, "NA_RACE")

In [44]:
### Create the ranking
race_ranking = list(X['race'].unique())
race_ranking.remove("NA_RACE")
race_ranking.insert(0, "NA_RACE")
race_ranking

['NA_RACE', 'Caucasian', 'AfricanAmerican', 'Other', 'Asian', 'Hispanic']

In [45]:
X['race']

0               Caucasian
1               Caucasian
2         AfricanAmerican
3               Caucasian
4               Caucasian
               ...       
101761    AfricanAmerican
101762    AfricanAmerican
101763          Caucasian
101764          Caucasian
101765          Caucasian
Name: race, Length: 101766, dtype: object

In [46]:
from sklearn.preprocessing import OrdinalEncoder
enc = OrdinalEncoder(categories = [race_ranking])
X['race'] = enc.fit_transform(X[['race']])
X['race'] = X['race'].replace(0, np.nan)
X['race'].unique()

array([ 1.,  2., nan,  3.,  4.,  5.])

In [47]:
### Now let's do the weight.
### Now that is A LOT OF missing values.
### However, the paper says we have to impute them so here we go.
### We have to be careful here. We actually do care about the ordinal levels. So we should sort them
print(X['weight'].isna().sum())

98569


In [48]:
X['weight'] = X['weight'].replace(np.nan, "NA_WEIGHT")

In [49]:
weight_ranking = list(X['weight'].unique())
weight_ranking

['NA_WEIGHT',
 '[75-100)',
 '[50-75)',
 '[0-25)',
 '[100-125)',
 '[25-50)',
 '[125-150)',
 '[175-200)',
 '[150-175)',
 '>200']

In [50]:
### The order goes as follows:
weight_ranking = ['NA_WEIGHT','[0-25)','[25-50)','[50-75)','[75-100)','[100-125)','[125-150)','[150-175)','[175-200)','>200']

In [51]:
### Fit the encoder and replace 0's with np.nan again.
from sklearn.preprocessing import OrdinalEncoder
enc = OrdinalEncoder(categories = [weight_ranking])
X['weight'] = enc.fit_transform(X[['weight']])
X['weight'] = X['weight'].replace(0, np.nan)
X['weight'].unique()

array([nan,  4.,  3.,  1.,  5.,  2.,  6.,  8.,  7.,  9.])

In [52]:
### DO the same for medical specialty
X['medical_specialty'] = X['medical_specialty'].replace(np.nan,"NA_MEDICAL")
medical_ranking = list(X['medical_specialty'].unique())
medical_ranking.remove("NA_MEDICAL")
medical_ranking.insert(0, "NA_MEDICAL")
medical_ranking

['NA_MEDICAL',
 'Pediatrics-Endocrinology',
 'InternalMedicine',
 'Family/GeneralPractice',
 'Cardiology',
 'Surgery-General',
 'Orthopedics',
 'Gastroenterology',
 'Surgery-Cardiovascular/Thoracic',
 'Nephrology',
 'Orthopedics-Reconstructive',
 'Psychiatry',
 'Emergency/Trauma',
 'Pulmonology',
 'Surgery-Neuro',
 'Obsterics&Gynecology-GynecologicOnco',
 'ObstetricsandGynecology',
 'Pediatrics',
 'Hematology/Oncology',
 'Otolaryngology',
 'Surgery-Colon&Rectal',
 'Pediatrics-CriticalCare',
 'Endocrinology',
 'Urology',
 'Psychiatry-Child/Adolescent',
 'Pediatrics-Pulmonology',
 'Neurology',
 'Anesthesiology-Pediatric',
 'Radiology',
 'Pediatrics-Hematology-Oncology',
 'Psychology',
 'Podiatry',
 'Gynecology',
 'Oncology',
 'Pediatrics-Neurology',
 'Surgery-Plastic',
 'Surgery-Thoracic',
 'Surgery-PlasticwithinHeadandNeck',
 'Ophthalmology',
 'Surgery-Pediatric',
 'Pediatrics-EmergencyMedicine',
 'PhysicalMedicineandRehabilitation',
 'InfectiousDiseases',
 'Anesthesiology',
 'Rheumatol

In [53]:
enc = OrdinalEncoder(categories = [medical_ranking])
X['medical_specialty'] = enc.fit_transform(X[['medical_specialty']])
X['medical_specialty'] = X['medical_specialty'].replace(0, np.nan)
X['medical_specialty'].unique()

array([ 1., nan,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12.,
       13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
       26., 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38.,
       39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50., 51.,
       52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64.,
       65., 66., 67., 68., 69., 70., 71., 72.])

In [54]:
### Here, for diagnosis 1, 2, and 3 we should probably sort them.
X['diag_1'] = X['diag_1'].replace(np.nan, "NA_DIAG_1")
diag_1_ranking = sorted(list(X['diag_1'].unique()))
### Fortunately, sorting them actually gives us the sorted ranking, Which is GOOD
diag_1_ranking.remove("NA_DIAG_1")
diag_1_ranking.insert(0,"NA_DIAG_1")
diag_1_ranking

['NA_DIAG_1',
 '10',
 '11',
 '110',
 '112',
 '114',
 '115',
 '117',
 '131',
 '133',
 '135',
 '136',
 '141',
 '142',
 '143',
 '145',
 '146',
 '147',
 '148',
 '149',
 '150',
 '151',
 '152',
 '153',
 '154',
 '155',
 '156',
 '157',
 '158',
 '160',
 '161',
 '162',
 '163',
 '164',
 '170',
 '171',
 '172',
 '173',
 '174',
 '175',
 '179',
 '180',
 '182',
 '183',
 '184',
 '185',
 '187',
 '188',
 '189',
 '191',
 '192',
 '193',
 '194',
 '195',
 '196',
 '197',
 '198',
 '199',
 '200',
 '201',
 '202',
 '203',
 '204',
 '205',
 '207',
 '208',
 '210',
 '211',
 '212',
 '214',
 '215',
 '216',
 '217',
 '218',
 '219',
 '220',
 '223',
 '225',
 '226',
 '227',
 '228',
 '229',
 '23',
 '230',
 '233',
 '235',
 '236',
 '237',
 '238',
 '239',
 '240',
 '241',
 '242',
 '244',
 '245',
 '246',
 '250',
 '250.01',
 '250.02',
 '250.03',
 '250.1',
 '250.11',
 '250.12',
 '250.13',
 '250.2',
 '250.21',
 '250.22',
 '250.23',
 '250.3',
 '250.31',
 '250.32',
 '250.33',
 '250.4',
 '250.41',
 '250.42',
 '250.43',
 '250.5',
 '250.

In [55]:
enc = OrdinalEncoder(categories = [diag_1_ranking])
X['diag_1'] = enc.fit_transform(X[['diag_1']])
X['diag_1'] = X['diag_1'].replace(0, np.nan)
X['diag_1'].unique()

array([125., 144., 455., 555.,  55., 264., 277., 253., 283., 121.,  27.,
       349., 697., 260., 476., 255., 524., 393., 709.,  48., 541., 276.,
       694., 145., 406., 307., 315., 261.,  38., 326., 696., 343., 281.,
       441., 163.,  54., 120., 434.,  42., 595., 272., 564., 112., 509.,
       256., 101., 539., 496., 288.,  21., 502., 695.,  56., 384., 566.,
       237., 410., 379., 399., 110., 282., 710., 389.,  45., 367., 133.,
       103., 418., 381., 395., 330., 382.,  93.,  99., 398., 517.,  47.,
       577., 122., 191., 383., 159., 164., 342., 254., 136., 287., 490.,
        98., 332., 449., 440., 392., 525., 413., 114., 563., 302., 294.,
       396., 113., 573., 347., 535., 106., 693.,  85., 124., 508., 542.,
        31., 511., 150., 346., 707., 149., 111., 361., 311., 284., 102.,
       705., 544., 386., 575.,  49., 380., 520., 301., 500., 323.,  60.,
       148., 377.,  77.,  24., 289., 123., 207.,  nan, 664., 412., 340.,
       244.,  26.,  58., 515., 206., 305., 275., 24

In [56]:
### diag_2
### Here, for diagnosis 1, 2, and 3 we should probably sort them.
X['diag_2'] = X['diag_2'].replace(np.nan, "NA_DIAG_2")
diag_2_ranking = sorted(list(X['diag_2'].unique()))
### Fortunately, sorting them actually gives us the sorted ranking, Which is GOOD
diag_2_ranking.remove("NA_DIAG_2")
diag_2_ranking.insert(0,"NA_DIAG_2")
diag_2_ranking

['NA_DIAG_2',
 '11',
 '110',
 '111',
 '112',
 '114',
 '115',
 '117',
 '123',
 '130',
 '131',
 '135',
 '136',
 '137',
 '138',
 '140',
 '141',
 '145',
 '150',
 '151',
 '152',
 '153',
 '154',
 '155',
 '156',
 '157',
 '162',
 '163',
 '164',
 '171',
 '172',
 '173',
 '174',
 '179',
 '180',
 '182',
 '183',
 '185',
 '186',
 '188',
 '189',
 '191',
 '192',
 '193',
 '195',
 '196',
 '197',
 '198',
 '199',
 '200',
 '201',
 '202',
 '203',
 '204',
 '205',
 '208',
 '211',
 '212',
 '214',
 '215',
 '217',
 '218',
 '220',
 '223',
 '225',
 '226',
 '227',
 '228',
 '232',
 '233',
 '235',
 '238',
 '239',
 '240',
 '241',
 '242',
 '244',
 '245',
 '246',
 '250',
 '250.01',
 '250.02',
 '250.03',
 '250.1',
 '250.11',
 '250.12',
 '250.13',
 '250.2',
 '250.21',
 '250.22',
 '250.23',
 '250.3',
 '250.31',
 '250.32',
 '250.33',
 '250.4',
 '250.41',
 '250.42',
 '250.43',
 '250.5',
 '250.51',
 '250.52',
 '250.53',
 '250.6',
 '250.7',
 '250.8',
 '250.81',
 '250.82',
 '250.83',
 '250.9',
 '250.91',
 '250.92',
 '250.93',
 

In [57]:
enc = OrdinalEncoder(categories = [diag_2_ranking])
X['diag_2'] = enc.fit_transform(X[['diag_2']])
X['diag_2'] = X['diag_2'].replace(0, np.nan)
X['diag_2'].unique()

array([ nan,  80.,  79.,  98.,  25., 247., 315., 261.,  47., 242., 146.,
       649., 324.,  32., 259., 286., 240., 472., 319., 262., 387.,  81.,
       246., 650., 647.,  11.,  76., 245., 374., 134., 648., 398., 258.,
       314., 360., 465., 144., 273., 316.,  75., 458., 728.,  82., 209.,
       327.,  45., 236.,  46., 250., 101., 380., 351., 249., 143., 345.,
       510., 334.,  18., 369., 103., 575., 311., 719., 530., 518., 191.,
       377., 384.,  67., 346., 107., 516., 152., 370., 515., 328., 163.,
       485., 100., 138., 429., 365., 413., 277., 226., 714., 381., 135.,
       511.,  97., 136., 260., 386., 293., 241.,  21., 130., 488., 190.,
       585.,  52.,  96.,  86., 151.,  77.,  85., 364., 517., 193., 376.,
       420., 161., 102., 288., 467., 251., 725., 142., 372., 508., 445.,
         4., 150., 269., 148., 112., 424., 352., 238., 176., 468., 668.,
       278., 116., 727., 104., 514., 158., 366.,  26., 145., 279., 519.,
       521., 391.,  49.,  22., 162.,   7., 563., 56

In [58]:
### diag_3
### Here, for diagnosis 1, 2, and 3 we should probably sort them.
X['diag_3'] = X['diag_3'].replace(np.nan, "NA_DIAG_3")
diag_3_ranking = sorted(list(X['diag_3'].unique()))
### Fortunately, sorting them actually gives us the sorted ranking, Which is GOOD
diag_3_ranking.remove("NA_DIAG_3")
diag_3_ranking.insert(0,"NA_DIAG_3")
diag_3_ranking

['NA_DIAG_3',
 '11',
 '110',
 '111',
 '112',
 '115',
 '117',
 '122',
 '123',
 '131',
 '132',
 '135',
 '136',
 '138',
 '139',
 '14',
 '141',
 '146',
 '148',
 '150',
 '151',
 '152',
 '153',
 '154',
 '155',
 '156',
 '157',
 '158',
 '161',
 '162',
 '163',
 '164',
 '17',
 '170',
 '171',
 '172',
 '173',
 '174',
 '175',
 '179',
 '180',
 '182',
 '183',
 '185',
 '186',
 '188',
 '189',
 '191',
 '192',
 '193',
 '195',
 '196',
 '197',
 '198',
 '199',
 '200',
 '201',
 '202',
 '203',
 '204',
 '205',
 '208',
 '211',
 '214',
 '215',
 '216',
 '217',
 '218',
 '220',
 '223',
 '225',
 '226',
 '227',
 '228',
 '230',
 '233',
 '235',
 '236',
 '238',
 '239',
 '240',
 '241',
 '242',
 '243',
 '244',
 '245',
 '246',
 '250',
 '250.01',
 '250.02',
 '250.03',
 '250.1',
 '250.11',
 '250.12',
 '250.13',
 '250.2',
 '250.21',
 '250.22',
 '250.23',
 '250.3',
 '250.31',
 '250.4',
 '250.41',
 '250.42',
 '250.43',
 '250.5',
 '250.51',
 '250.52',
 '250.53',
 '250.6',
 '250.7',
 '250.8',
 '250.81',
 '250.82',
 '250.83',
 '25

In [59]:
enc = OrdinalEncoder(categories = [diag_3_ranking])
X['diag_3'] = enc.fit_transform(X[['diag_3']])
X['diag_3'] = X['diag_3'].replace(0, np.nan)
X['diag_3'].unique()

array([ nan, 122., 767., 249.,  87., 771., 230., 318., 667.,  52., 109.,
       268., 431., 257., 259., 487., 269., 393., 769.,  88., 129., 103.,
       140., 314., 247., 102., 396., 531., 142., 669., 378., 460., 422.,
        89., 169., 480., 327., 408., 488., 265., 341., 367., 545., 254.,
       768., 353., 335., 321., 373., 111., 110., 107., 534., 322., 392.,
       262., 550., 497., 503., 539.,   9., 113., 670., 252., 324.,  90.,
       522., 536., 350., 704., 266., 404., 167., 372., 484., 323., 186.,
       160., 279., 216., 101., 442.,  84., 786., 510., 429., 459., 106.,
       250., 755., 557., 144., 280., 535., 398., 379., 136., 668., 104.,
       628., 395., 361., 538., 267., 495., 117.,  51., 299., 357., 537.,
       612., 148., 296., 438., 530.,  41., 149., 402., 256., 452., 374.,
        56., 210., 760., 156., 532., 308., 293., 725., 211., 202., 158.,
        98., 297., 694., 278., 506., 332., 347., 108., 245., 383., 554.,
       291., 185., 509., 248., 400., 387., 303., 35

In [60]:
### Okay. So we are done converting the categorical classes.

### Let's see what else needs to be fixed:

### Checking Missing Values:
for i in X.columns:
    print(str(i) + ":::" + str(X[i].isna().sum()))

encounter_id:::0
patient_nbr:::0
race:::2273
gender:::0
age:::0
weight:::98569
admission_type_id:::0
discharge_disposition_id:::0
admission_source_id:::0
time_in_hospital:::0
payer_code:::40256
medical_specialty:::49949
num_lab_procedures:::0
num_procedures:::0
num_medications:::0
number_outpatient:::0
number_emergency:::0
number_inpatient:::0
diag_1:::21
diag_2:::358
diag_3:::1423
number_diagnoses:::0
max_glu_serum:::0
A1Cresult:::0
metformin:::0
repaglinide:::0
nateglinide:::0
chlorpropamide:::0
glimepiride:::0
acetohexamide:::0
glipizide:::0
glyburide:::0
tolbutamide:::0
pioglitazone:::0
rosiglitazone:::0
acarbose:::0
miglitol:::0
troglitazone:::0
tolazamide:::0
examide:::0
citoglipton:::0
insulin:::0
glyburide-metformin:::0
glipizide-metformin:::0
glimepiride-pioglitazone:::0
metformin-rosiglitazone:::0
metformin-pioglitazone:::0
change:::0
diabetesMed:::0


In [61]:
X.dtypes

encounter_id                  int64
patient_nbr                   int64
race                        float64
gender                       object
age                          object
weight                      float64
admission_type_id             int64
discharge_disposition_id      int64
admission_source_id           int64
time_in_hospital              int64
payer_code                   object
medical_specialty           float64
num_lab_procedures            int64
num_procedures                int64
num_medications               int64
number_outpatient             int64
number_emergency              int64
number_inpatient              int64
diag_1                      float64
diag_2                      float64
diag_3                      float64
number_diagnoses              int64
max_glu_serum                object
A1Cresult                    object
metformin                    object
repaglinide                  object
nateglinide                  object
chlorpropamide              

### Next Steps:

Create encodings for the rest of the objects. Fortunately, the rest of them don' have any NA variables.

The encodings we have to account for:

age, payer_code, columns index 22 -> the end

Notice: columns 22 to the last have little NA values. We should create a function to automate the task of encoding to make
things easier. 

Firstly, let's perform the ordinal encoding for age, gender, and payer_code.

In [62]:
age_ranking = X['age'].unique()
sorted(age_ranking)

['[0-10)',
 '[10-20)',
 '[20-30)',
 '[30-40)',
 '[40-50)',
 '[50-60)',
 '[60-70)',
 '[70-80)',
 '[80-90)',
 '[90-100)']

In [63]:
### Perfect. They're already sorted for us!
enc = OrdinalEncoder(categories = [age_ranking])
X['age'] = enc.fit_transform(X[['age']])
X['age'].unique()

array([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.])

In [64]:
### Now lets encode gender, It should be easy as it is just 0, 1 , or 2
gender_ranking = X['gender'].unique()
sorted(gender_ranking)

['Female', 'Male', 'Unknown/Invalid']

In [65]:
enc = OrdinalEncoder(categories = [gender_ranking])
X['gender'] = enc.fit_transform(X[['gender']])
X['gender'].unique()

array([0., 1., 2.])

In [66]:
### Now let's do it for the payer code. Here, let's just form NAN for the payer code. As it may not matter too much.
X['payer_code'] = X['payer_code'].replace(np.nan, "NAN_PAYER")
payer_code_ranking = X['payer_code'].unique()

In [67]:
enc = OrdinalEncoder(categories = [payer_code_ranking])
X['payer_code'] = enc.fit_transform(X[['payer_code']])
X['payer_code'].unique()

array([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12.,
       13., 14., 15., 16., 17.])

In [68]:
X.iloc[:,22:]

Unnamed: 0,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,...,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed
0,,,No,No,No,No,No,No,No,No,...,No,No,No,No,No,No,No,No,No,No
1,,,No,No,No,No,No,No,No,No,...,No,No,Up,No,No,No,No,No,Ch,Yes
2,,,No,No,No,No,No,No,Steady,No,...,No,No,No,No,No,No,No,No,No,Yes
3,,,No,No,No,No,No,No,No,No,...,No,No,Up,No,No,No,No,No,Ch,Yes
4,,,No,No,No,No,No,No,Steady,No,...,No,No,Steady,No,No,No,No,No,Ch,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,,>8,Steady,No,No,No,No,No,No,No,...,No,No,Down,No,No,No,No,No,Ch,Yes
101762,,,No,No,No,No,No,No,No,No,...,No,No,Steady,No,No,No,No,No,No,Yes
101763,,,Steady,No,No,No,No,No,No,No,...,No,No,Down,No,No,No,No,No,Ch,Yes
101764,,,No,No,No,No,No,No,Steady,No,...,No,No,Up,No,No,No,No,No,Ch,Yes


## The code below will automatically create the ordinal mapping for us. It is important to note that the last 22 columns are categorical data. We have NOT messed with any of the data that was originally numerical

In [69]:
dict_mapping = dict()
for i in X.iloc[:,22:].columns:
    curr = sorted(X[i].unique())
    dict_mapping[i] = curr
for i in dict_mapping.values():
    print(i)
    
for col_name in dict_mapping.keys():
    enc = OrdinalEncoder(categories = [dict_mapping[col_name]])
    X[col_name] = enc.fit_transform(X[[col_name]])
dict_mapping = dict()
for i in X.iloc[:,22:len(X)].columns:
    curr = sorted(X[i].unique())
    dict_mapping[i] = curr
for i in dict_mapping.values():
    print(i)

['>200', '>300', 'None', 'Norm']
['>7', '>8', 'None', 'Norm']
['Down', 'No', 'Steady', 'Up']
['Down', 'No', 'Steady', 'Up']
['Down', 'No', 'Steady', 'Up']
['Down', 'No', 'Steady', 'Up']
['Down', 'No', 'Steady', 'Up']
['No', 'Steady']
['Down', 'No', 'Steady', 'Up']
['Down', 'No', 'Steady', 'Up']
['No', 'Steady']
['Down', 'No', 'Steady', 'Up']
['Down', 'No', 'Steady', 'Up']
['Down', 'No', 'Steady', 'Up']
['Down', 'No', 'Steady', 'Up']
['No', 'Steady']
['No', 'Steady', 'Up']
['No']
['No']
['Down', 'No', 'Steady', 'Up']
['Down', 'No', 'Steady', 'Up']
['No', 'Steady']
['No', 'Steady']
['No', 'Steady']
['No', 'Steady']
['Ch', 'No']
['No', 'Yes']
[0.0, 1.0, 2.0, 3.0]
[0.0, 1.0, 2.0, 3.0]
[0.0, 1.0, 2.0, 3.0]
[0.0, 1.0, 2.0, 3.0]
[0.0, 1.0, 2.0, 3.0]
[0.0, 1.0, 2.0, 3.0]
[0.0, 1.0, 2.0, 3.0]
[0.0, 1.0]
[0.0, 1.0, 2.0, 3.0]
[0.0, 1.0, 2.0, 3.0]
[0.0, 1.0]
[0.0, 1.0, 2.0, 3.0]
[0.0, 1.0, 2.0, 3.0]
[0.0, 1.0, 2.0, 3.0]
[0.0, 1.0, 2.0, 3.0]
[0.0, 1.0]
[0.0, 1.0, 2.0]
[0.0]
[0.0]
[0.0, 1.0, 2.0, 3.

In [70]:
X.dtypes

encounter_id                  int64
patient_nbr                   int64
race                        float64
gender                      float64
age                         float64
weight                      float64
admission_type_id             int64
discharge_disposition_id      int64
admission_source_id           int64
time_in_hospital              int64
payer_code                  float64
medical_specialty           float64
num_lab_procedures            int64
num_procedures                int64
num_medications               int64
number_outpatient             int64
number_emergency              int64
number_inpatient              int64
diag_1                      float64
diag_2                      float64
diag_3                      float64
number_diagnoses              int64
max_glu_serum               float64
A1Cresult                   float64
metformin                   float64
repaglinide                 float64
nateglinide                 float64
chlorpropamide              

In [71]:
for i in X.columns:
    print(str(i) + ": " + str(X[i].isna().sum()))

encounter_id: 0
patient_nbr: 0
race: 2273
gender: 0
age: 0
weight: 98569
admission_type_id: 0
discharge_disposition_id: 0
admission_source_id: 0
time_in_hospital: 0
payer_code: 0
medical_specialty: 49949
num_lab_procedures: 0
num_procedures: 0
num_medications: 0
number_outpatient: 0
number_emergency: 0
number_inpatient: 0
diag_1: 21
diag_2: 358
diag_3: 1423
number_diagnoses: 0
max_glu_serum: 0
A1Cresult: 0
metformin: 0
repaglinide: 0
nateglinide: 0
chlorpropamide: 0
glimepiride: 0
acetohexamide: 0
glipizide: 0
glyburide: 0
tolbutamide: 0
pioglitazone: 0
rosiglitazone: 0
acarbose: 0
miglitol: 0
troglitazone: 0
tolazamide: 0
examide: 0
citoglipton: 0
insulin: 0
glyburide-metformin: 0
glipizide-metformin: 0
glimepiride-pioglitazone: 0
metformin-rosiglitazone: 0
metformin-pioglitazone: 0
change: 0
diabetesMed: 0


In [2]:
df['readmitted'].value_counts()

NO     54864
>30    35545
<30    11357
Name: readmitted, dtype: int64

## That is a HUGE imbalance.

However, regardless of the data imbalance my goal is to prevent hospital readmissions in general.

Doing so can actually tell us more about some underlying conditions that are more serious and should be treated.

### Implementing the methods from the paper:

Using the paper, we will perform some feaature imputation and outlier removal.

We actually end up turning the problem into a binary classification task.

I.e, our goal is to prevent future hospital readmissions whether it be years later or less than 30 days.

In [72]:
# Split the data into train and testing
from sklearn.model_selection import train_test_split
X_train, X_test, t_train, t_test = train_test_split(X, t, test_size=0.1)

### Let us tranform the data.
t_train.replace("NO", 0, inplace = True)
t_train.replace(">30", 1, inplace = True)
t_train.replace("<30", 1, inplace = True)

### Let us tranform the data.
t_test.replace("NO", 0, inplace = True)
t_test.replace(">30", 1, inplace = True)
t_test.replace("<30", 1, inplace = True)

In [73]:
X_train.to_csv("X_train", sep=',', encoding='utf-8', index = False)
X_test.to_csv("X_test", sep=',', encoding='utf-8', index = False)
t_train.to_csv("t_train", sep=',', encoding='utf-8', index = False)
t_test.to_csv("t_test", sep=',', encoding='utf-8', index = False)