# Machine Learning-Based Prediction of 30-Day Hospital Readmission in Diabetic Patients

## Data Cleaning

In [56]:
import pandas as pd

### Handling Missing Values

In [57]:
df = pd.read_csv('./data/raw/day-25.csv', low_memory=False)

In [58]:
df.sample(5)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
73723,73723,73723,218429766,58097331,AfricanAmerican,Male,[50-60),,6,1,...,No,Steady,No,No,No,No,No,Ch,Yes,>30
65888,65888,65888,183932922,45305631,AfricanAmerican,Male,[40-50),,3,1,...,No,Steady,No,No,No,No,No,Ch,Yes,>30
32282,32282,32282,102998598,102850371,Caucasian,Female,[70-80),,1,1,...,No,Steady,No,No,No,No,No,No,Yes,NO
91973,91973,91973,316123388,41798907,Caucasian,Male,[70-80),,1,1,...,No,Down,No,No,No,No,No,Ch,Yes,<30
12731,12731,12731,51446850,5010138,Caucasian,Male,[60-70),,1,1,...,No,No,No,No,No,No,No,No,No,>30


In [59]:
# Cheking for Datatypes

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101766 entries, 0 to 101765
Data columns (total 52 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   Unnamed: 0.1              101766 non-null  int64 
 1   Unnamed: 0                101766 non-null  int64 
 2   encounter_id              101766 non-null  int64 
 3   patient_nbr               101766 non-null  int64 
 4   race                      99493 non-null   object
 5   gender                    101766 non-null  object
 6   age                       101766 non-null  object
 7   weight                    3197 non-null    object
 8   admission_type_id         101766 non-null  int64 
 9   discharge_disposition_id  101766 non-null  int64 
 10  admission_source_id       101766 non-null  int64 
 11  time_in_hospital          101766 non-null  int64 
 12  payer_code                61510 non-null   object
 13  medical_specialty         51817 non-null   object
 14  num_

In [60]:
# Checing for missing values

df.isna().sum()

Unnamed: 0.1                    0
Unnamed: 0                      0
encounter_id                    0
patient_nbr                     0
race                         2273
gender                          0
age                             0
weight                      98569
admission_type_id               0
discharge_disposition_id        0
admission_source_id             0
time_in_hospital                0
payer_code                  40256
medical_specialty           49949
num_lab_procedures              0
num_procedures                  0
num_medications                 0
number_outpatient               0
number_emergency                0
number_inpatient                0
diag_1                         21
diag_2                        358
diag_3                       1423
number_diagnoses                0
max_glu_serum               96420
A1Cresult                   84748
metformin                       0
repaglinide                     0
nateglinide                     0
chlorpropamide

In [61]:
# Checing for missing value percentage

df.isnull().mean() * 100

Unnamed: 0.1                 0.000000
Unnamed: 0                   0.000000
encounter_id                 0.000000
patient_nbr                  0.000000
race                         2.233555
gender                       0.000000
age                          0.000000
weight                      96.858479
admission_type_id            0.000000
discharge_disposition_id     0.000000
admission_source_id          0.000000
time_in_hospital             0.000000
payer_code                  39.557416
medical_specialty           49.082208
num_lab_procedures           0.000000
num_procedures               0.000000
num_medications              0.000000
number_outpatient            0.000000
number_emergency             0.000000
number_inpatient             0.000000
diag_1                       0.020636
diag_2                       0.351787
diag_3                       1.398306
number_diagnoses             0.000000
max_glu_serum               94.746772
A1Cresult                   83.277322
metformin   

In [62]:
# Working on race column

df['race'].unique()

array(['Caucasian', 'AfricanAmerican', nan, 'Other', 'Asian', 'Hispanic'],
      dtype=object)

In [63]:
df['race'].value_counts()

race
Caucasian          76099
AfricanAmerican    19210
Hispanic            2037
Other               1506
Asian                641
Name: count, dtype: int64

In [64]:
# Replace missing values with "Unknown" (create a new category) to avoid artificial bias 
# Mode imputation is acceptable but not ideal for such a small number of missing values

df['race'] = df['race'].fillna('Unknown')

In [65]:
# Working on weight column

df['weight'].unique()

array([nan, '[75-100)', '[50-75)', '[0-25)', '[100-125)', '[25-50)',
       '[125-150)', '[175-200)', '[150-175)', '>200'], dtype=object)

In [66]:
# Droping weight column because it has so much missing value that can no be filled

df = df.drop(columns='weight')

In [67]:
# Working on payer_code column

display(
  df['payer_code'].nunique(),
  df['payer_code'].unique()
)

17

array([nan, 'MC', 'MD', 'HM', 'UN', 'BC', 'SP', 'CP', 'SI', 'DM', 'CM',
       'CH', 'PO', 'WC', 'OT', 'OG', 'MP', 'FR'], dtype=object)

In [68]:
df['payer_code'].value_counts()

payer_code
MC    32439
HM     6274
SP     5007
BC     4655
MD     3532
CP     2533
UN     2448
CM     1937
OG     1033
PO      592
DM      549
CH      146
WC      135
OT       95
MP       79
SI       55
FR        1
Name: count, dtype: int64

In [69]:
# payer_code column can be useful so can not drop it immediately 
# Replace missing values with "Unknown" (create a new category) to avoid artificial bias in single column

df['payer_code'] = df['payer_code'].fillna("Unknown")

In [70]:
# Working on medical_specialty column

display(
  df['medical_specialty'].nunique(),
  df['medical_specialty'].unique()
)

72

array(['Pediatrics-Endocrinology', nan, 'InternalMedicine',
       'Family/GeneralPractice', 'Cardiology', 'Surgery-General',
       'Orthopedics', 'Gastroenterology',
       'Surgery-Cardiovascular/Thoracic', 'Nephrology',
       'Orthopedics-Reconstructive', 'Psychiatry', 'Emergency/Trauma',
       'Pulmonology', 'Surgery-Neuro',
       'Obsterics&Gynecology-GynecologicOnco', 'ObstetricsandGynecology',
       'Pediatrics', 'Hematology/Oncology', 'Otolaryngology',
       'Surgery-Colon&Rectal', 'Pediatrics-CriticalCare', 'Endocrinology',
       'Urology', 'Psychiatry-Child/Adolescent', 'Pediatrics-Pulmonology',
       'Neurology', 'Anesthesiology-Pediatric', 'Radiology',
       'Pediatrics-Hematology-Oncology', 'Psychology', 'Podiatry',
       'Gynecology', 'Oncology', 'Pediatrics-Neurology',
       'Surgery-Plastic', 'Surgery-Thoracic',
       'Surgery-PlasticwithinHeadandNeck', 'Ophthalmology',
       'Surgery-Pediatric', 'Pediatrics-EmergencyMedicine',
       'PhysicalMedicineandRe

In [71]:
# medical_specialty indicates 
#   - Type of admitting doctor
#   - Likely medical condition
#   - Risk pattern for readmission

# Can not drop it , and 50% of data is missing so can not use Mode imputation
# Fill missing data with Unknown and Reduce Cardinality

In [72]:
# Filling Missing data

df['medical_specialty'] = df['medical_specialty'].fillna('Unknown')

In [73]:
# Reducing Cardinality

top_specialties = df['medical_specialty'].value_counts().nlargest(15).index

df['medical_specialty'] = df['medical_specialty'].apply(
    lambda x: x if x in top_specialties else 'Other'
)

In [74]:
display(
  df['medical_specialty'].nunique(),
  df['medical_specialty'].unique()
)

16

array(['Other', 'Unknown', 'InternalMedicine', 'Family/GeneralPractice',
       'Cardiology', 'Surgery-General', 'Orthopedics',
       'Surgery-Cardiovascular/Thoracic', 'Nephrology',
       'Orthopedics-Reconstructive', 'Psychiatry', 'Emergency/Trauma',
       'Pulmonology', 'ObstetricsandGynecology', 'Urology', 'Radiologist'],
      dtype=object)

In [75]:
# Working with columns diag_1, diag_2, diag_3
# ICD-9 codes represent diseases
# Those columns will highly affect readmitted (Target feature)
# Can not Drop it, Filling missing values with Unknown because missing data is to low

In [76]:
df['diag_1'] = df['diag_1'].fillna('Unknown')
df['diag_2'] = df['diag_2'].fillna('Unknown')
df['diag_3'] = df['diag_3'].fillna('Unknown')

In [77]:
# Distinct Values are real problem in this features

display(
  df['diag_1'].nunique(),
  df['diag_2'].nunique(),
  df['diag_3'].nunique(),
)

717

749

790

In [78]:
# To reduce cardinality 
# Convert ICD Codes Into Disease Groups

# Range	    Disease Category
# ---------------------------
# 390–459	  Circulatory
# 460–519	  Respiratory
# 520–579	  Digestive
# 580–629	  Genitourinary
# 250	      Diabetes
# 800–999	  Injury
# 710–739	  Musculoskeletal
# 140–239	  Neoplasms (Cancer)

In [79]:
def categorize_diagnosis(code):
    
  if pd.isna(code):
    return 'Unknown'
  
  code = str(code)
  
  if code.startswith('V'):
    return 'Supplementary'
  elif code.startswith('E'):
    return 'External Injury'
  
  try:
      code = float(code)
  except:
    return 'Other'
  
  if 250 <= code < 251:
    return 'Diabetes'
  elif 140 <= code <= 239:
    return 'Neoplasms'
  elif 390 <= code <= 459:
    return 'Circulatory'
  elif 460 <= code <= 519:
    return 'Respiratory'
  elif 520 <= code <= 579:
    return 'Digestive'
  elif 580 <= code <= 629:
    return 'Genitourinary'
  elif 710 <= code <= 739:
    return 'Musculoskeletal'
  elif 800 <= code <= 999:
    return 'Injury'
  else:
    return 'Other'


In [80]:
df['diag_1'] = df['diag_1'].apply(categorize_diagnosis)
df['diag_2'] = df['diag_2'].apply(categorize_diagnosis)
df['diag_3'] = df['diag_3'].apply(categorize_diagnosis)

In [81]:
display(
  df['diag_1'].nunique(),
  df['diag_2'].nunique(),
  df['diag_3'].nunique(),
)

11

11

11

In [82]:
# Working with max_glu_serum feature
# 95% fo data is missing
# droping the column is only option here

In [83]:
df['max_glu_serum'].value_counts(dropna=False)


max_glu_serum
NaN     96420
Norm     2597
>200     1485
>300     1264
Name: count, dtype: int64

In [84]:
df = df.drop(columns=['max_glu_serum'])

In [85]:
# Working with A1Cresult feature
# It represent - > A blood test that measures average blood glucose over the last 2–3 months

# Value	          Meaning
# ----------------------------------------
# normal (<7%)	  Good diabetes control
# >7%	            Poor control
# >8%	            Very poor control
# none	          Test not done

In [86]:
df['A1Cresult'].value_counts(dropna=False)

A1Cresult
NaN     84748
>8       8216
Norm     4990
>7       3812
Name: count, dtype: int64

In [87]:
# in this column Value "none" is represented as NaN
# We fill NaN with None

df['A1Cresult'] = df['A1Cresult'].fillna('NotTested')

In [88]:
df['A1Cresult'].value_counts()

A1Cresult
NotTested    84748
>8            8216
Norm          4990
>7            3812
Name: count, dtype: int64

In [89]:
# Handled all missing values of dataset

df.isna().sum()

Unnamed: 0.1                0
Unnamed: 0                  0
encounter_id                0
patient_nbr                 0
race                        0
gender                      0
age                         0
admission_type_id           0
discharge_disposition_id    0
admission_source_id         0
time_in_hospital            0
payer_code                  0
medical_specialty           0
num_lab_procedures          0
num_procedures              0
num_medications             0
number_outpatient           0
number_emergency            0
number_inpatient            0
diag_1                      0
diag_2                      0
diag_3                      0
number_diagnoses            0
A1Cresult                   0
metformin                   0
repaglinide                 0
nateglinide                 0
chlorpropamide              0
glimepiride                 0
acetohexamide               0
glipizide                   0
glyburide                   0
tolbutamide                 0
pioglitazo

### Data Preparation

In [90]:
# Droping unwanted columns

df.drop(columns=['Unnamed: 0.1', 'Unnamed: 0'], inplace=True) # index columns
df.drop(columns=['encounter_id', 'patient_nbr'], inplace=True) # id columns

In [91]:
# Cheking datatypes

df.dtypes

race                        object
gender                      object
age                         object
admission_type_id            int64
discharge_disposition_id     int64
admission_source_id          int64
time_in_hospital             int64
payer_code                  object
medical_specialty           object
num_lab_procedures           int64
num_procedures               int64
num_medications              int64
number_outpatient            int64
number_emergency             int64
number_inpatient             int64
diag_1                      object
diag_2                      object
diag_3                      object
number_diagnoses             int64
A1Cresult                   object
metformin                   object
repaglinide                 object
nateglinide                 object
chlorpropamide              object
glimepiride                 object
acetohexamide               object
glipizide                   object
glyburide                   object
tolbutamide         

In [92]:
# Replace the numeric codes with their actual category names.

# columns 
# admission_type_id
# discharge_disposition_id
# admission_source_id

In [93]:
admission_type_map = {
  1: 'Emergency',
  2: 'Urgent',
  3: 'Elective',
  4: 'Newborn',
  5: 'Not Available',
  6: 'NULL',
  7: 'Trauma Center',
  8: 'Not Mapped'
}

discharge_disposition_map = {
  1: 'Discharged to home',
  2: 'Transferred to short term hospital',
  3: 'Transferred to SNF',
  4: 'Transferred to ICF',
  5: 'Transferred to inpatient care institution',
  6: 'Home with home health service',
  7: 'Left AMA',
  8: 'Home under care of Home IV provider',
  9: 'Admitted as inpatient to this hospital',
  10: 'Neonate discharged to another hospital',
  11: 'Expired',
  12: 'Still patient',
  13: 'Hospice home',
  14: 'Hospice medical facility',
  15: 'Transferred to swing bed',
  16: 'Transferred outpatient institution',
  17: 'Referred outpatient institution',
  18: 'NULL',
  19: 'Expired at home hospice',
  20: 'Expired in medical facility hospice',
  21: 'Expired place unknown',
  22: 'Transferred to rehab facility',
  23: 'Transferred to long term care hospital',
  24: 'Transferred to nursing facility Medicaid only',
  25: 'Not Mapped',
  26: 'Unknown/Invalid',
  27: 'Transferred to federal health care facility',
  28: 'Transferred to psychiatric hospital',
  29: 'Transferred to Critical Access Hospital',
  30: 'Transferred to other health care institution'
}


admission_source_map = {
  1: 'Physician Referral',
  2: 'Clinic Referral',
  3: 'HMO Referral',
  4: 'Transfer from hospital',
  5: 'Transfer from SNF',
  6: 'Transfer from health care facility',
  7: 'Emergency Room',
  8: 'Court/Law Enforcement',
  9: 'Not Available',
  10: 'Transfer from critical access hospital',
  11: 'Normal Delivery',
  12: 'Premature Delivery',
  13: 'Sick Baby',
  14: 'Extramural Birth',
  15: 'Not Available',
  17: 'NULL',
  18: 'Transfer from Home Health Agency',
  19: 'Readmission to Same Home Health Agency',
  20: 'Not Mapped',
  21: 'Unknown/Invalid',
  22: 'Transfer within same facility',
  23: 'Born inside this hospital',
  24: 'Born outside this hospital',
  25: 'Transfer from Ambulatory Surgery Center',
  26: 'Transfer from Hospice'
}

In [94]:
def apply_mapping(df, column_name, mapping_dict):
  df[column_name] = df[column_name].map(mapping_dict)
  df[column_name] = df[column_name].fillna('Unknown')
  return df

In [95]:
df = apply_mapping(df, 'admission_type_id', admission_type_map)
df = apply_mapping(df, 'discharge_disposition_id', discharge_disposition_map)
df = apply_mapping(df, 'admission_source_id', admission_source_map)

In [96]:
# Renaming Columns

df.rename(columns={
    'admission_type_id': 'admission_type',
    'discharge_disposition_id': 'discharge_disposition',
    'admission_source_id': 'admission_source'
}, inplace=True)


In [97]:
# Cheking for missing values in new columns

In [98]:
df[['admission_type', 
    'discharge_disposition', 
    'admission_source']].isna().sum()

admission_type           0
discharge_disposition    0
admission_source         0
dtype: int64

In [99]:
df['admission_type'].value_counts()

admission_type
Emergency        53990
Elective         18869
Urgent           18480
NULL              5291
Not Available     4785
Not Mapped         320
Trauma Center       21
Newborn             10
Name: count, dtype: int64

In [100]:
df['discharge_disposition'].value_counts()

discharge_disposition
Discharged to home                               60234
Transferred to SNF                               13954
Home with home health service                    12902
NULL                                              3691
Transferred to short term hospital                2128
Transferred to rehab facility                     1993
Expired                                           1642
Transferred to inpatient care institution         1184
Not Mapped                                         989
Transferred to ICF                                 815
Left AMA                                           623
Transferred to long term care hospital             412
Hospice home                                       399
Hospice medical facility                           372
Transferred to psychiatric hospital                139
Home under care of Home IV provider                108
Transferred to swing bed                            63
Transferred to nursing facility Medicaid on

In [101]:
df['admission_source'].value_counts()

admission_source
Emergency Room                             57494
Physician Referral                         29565
NULL                                        6781
Transfer from hospital                      3187
Transfer from health care facility          2264
Clinic Referral                             1104
Transfer from SNF                            855
HMO Referral                                 187
Not Mapped                                   161
Not Available                                125
Court/Law Enforcement                         16
Transfer within same facility                 12
Transfer from critical access hospital         8
Extramural Birth                               2
Normal Delivery                                2
Transfer from Ambulatory Surgery Center        2
Sick Baby                                      1
Name: count, dtype: int64

In [102]:
# Merging Categories like NULL, Not Available, Not Mapped, Unknown/Invalid into single category "Unknown"

def clean_unknowns(column):
  return column.replace(
      ['NULL', 'Not Available', 'Not Mapped', 'Unknown/Invalid'],
      'Unknown'
  )

df['admission_type'] = clean_unknowns(df['admission_type'])
df['discharge_disposition'] = clean_unknowns(df['discharge_disposition'])
df['admission_source'] = clean_unknowns(df['admission_source'])


In [103]:
# Combining Expired categories in discharge_disposition feature

df['discharge_disposition'] = df['discharge_disposition'].replace(
    ['Expired at home hospice', 
     'Expired in medical facility hospice'],
    'Expired'
)


In [104]:
# Combining rare categories into "Others"

def combine_rare_categories(df, column, threshold=100):
  counts = df[column].value_counts()
  rare_categories = counts[counts < threshold].index
  df[column] = df[column].replace(rare_categories, 'Other')
  return df

In [105]:
df = combine_rare_categories(df, 'admission_type')
df = combine_rare_categories(df, 'discharge_disposition')
df = combine_rare_categories(df, 'admission_source')

In [106]:
# Cheking gender feature

df['gender'].value_counts()

gender
Female             54708
Male               47055
Unknown/Invalid        3
Name: count, dtype: int64

In [107]:
# droping 3 rows with Unknown gender value

df = df[df['gender'] != 'Unknown/Invalid']

In [108]:
df['readmitted'].value_counts()

readmitted
NO     54861
>30    35545
<30    11357
Name: count, dtype: int64

### Saving Clean File of Day-26

In [112]:
# Saving cleaned csv from Day-28

df.to_csv("./data/clean/day-26.csv", index=False)