# Hospital Readmission Prediction

This project uses the **Diabetic Hospital Readmission Dataset** to predict whether a patient is likely to be readmitted within 30 days of discharge.

The goal is to:
- Perform basic data exploration and preprocessing
- Handle missing values and categorical data
- Build a simple machine learning model using **scikit-learn**
- Evaluate model performance

This notebook is part of an end-to-end machine learning pipeline structured using **best practices** (modular code, reproducibility, clean folder structure).


In [1]:
#importing necessary libraries for data exploration
import pandas as pd
import numpy as np
from pathlib import Path

In [2]:
#reading the dataset
data_path = Path("../data/raw/diabetic_data.csv")
df = pd.read_csv(data_path)


In [3]:
df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [4]:
#checking the shape and is there any null values
df.info()
#we can see there is no null valuse in the dataset 
# but there are null values in the dataset its not showing bcause 
# of the datatype is object and the null valuse are represented as '?'

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101766 entries, 0 to 101765
Data columns (total 50 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   encounter_id              101766 non-null  int64 
 1   patient_nbr               101766 non-null  int64 
 2   race                      101766 non-null  object
 3   gender                    101766 non-null  object
 4   age                       101766 non-null  object
 5   weight                    101766 non-null  object
 6   admission_type_id         101766 non-null  int64 
 7   discharge_disposition_id  101766 non-null  int64 
 8   admission_source_id       101766 non-null  int64 
 9   time_in_hospital          101766 non-null  int64 
 10  payer_code                101766 non-null  object
 11  medical_specialty         101766 non-null  object
 12  num_lab_procedures        101766 non-null  int64 
 13  num_procedures            101766 non-null  int64 
 14  num_

# Cleaning the data and adding some usefull columns


In [5]:
# Convert diag_1 to numeric, coerce errors to NaN
df['diag_1'] = pd.to_numeric(df['diag_1'], errors='coerce')

# Fill missing values with the median
df['diag_1'] = df['diag_1'].fillna(df['diag_1'].median())
df['diag_1'] = pd.to_numeric(df['diag_1'], errors='coerce')

# Convert diag_2 to numeric, coerce errors to NaN
df['diag_2'] = pd.to_numeric(df['diag_2'], errors='coerce')

# Fill missing values with the median
df['diag_2'] = df['diag_2'].fillna(df['diag_2'].median())
df['diag_3'] = pd.to_numeric(df['diag_3'], errors='coerce')

# Fill missing values with the median
df['diag_3'] = df['diag_3'].fillna(df['diag_3'].median())

In [6]:
def map_icd9(code):
    try:
        code=float(code)
        if 390 <= code <= 459 or code == 785:
            return 'Circulatory'
        elif 460 <= code <= 519 or code == 786:
            return 'Respiratory'
        elif 520 <= code <= 579 or code == 787:
            return 'Digestive'
        elif 250 <= code < 251:
            return 'Diabetes'
        elif 800 <= code <= 999:
            return 'Injury'
        elif 710 <= code <= 739:
            return 'Musculoskeletal'
        elif 580 <= code <= 629 or code == 788:
            return 'Genitourinary'
        else:
            return 'Other'
    except:
        return 'Unknown'

In [7]:
df['diag-goup_1'] = df['diag_1'].apply(map_icd9)
df['diag-goup_2'] = df['diag_2'].apply(map_icd9)
df['diag-goup_3'] = df['diag_3'].apply(map_icd9)

In [8]:
df

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,diag-goup_1,diag-goup_2,diag-goup_3
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,NO,Diabetes,Circulatory,Circulatory
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,No,No,No,Ch,Yes,>30,Other,Diabetes,Other
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,Yes,NO,Other,Diabetes,Circulatory
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,No,No,No,Ch,Yes,NO,Other,Diabetes,Circulatory
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,No,No,No,Ch,Yes,NO,Other,Other,Diabetes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,443847548,100162476,AfricanAmerican,Male,[70-80),?,1,3,7,3,...,No,No,No,No,Ch,Yes,>30,Diabetes,Other,Circulatory
101762,443847782,74694222,AfricanAmerican,Female,[80-90),?,1,4,5,5,...,No,No,No,No,No,Yes,NO,Digestive,Other,Digestive
101763,443854148,41088789,Caucasian,Male,[70-80),?,1,1,7,1,...,No,No,No,No,Ch,Yes,NO,Other,Genitourinary,Other
101764,443857166,31693671,Caucasian,Female,[80-90),?,2,3,7,10,...,No,No,No,No,Ch,Yes,NO,Injury,Other,Injury


In [9]:
df['race'].isnull().sum()

np.int64(0)

In [10]:
df['race'].nunique()


6

In [11]:
df['race'].replace('?',np.nan,inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['race'].replace('?',np.nan,inplace=True)


In [12]:
df.race.isnull().sum()

np.int64(2273)

In [13]:
df.race.fillna(df.race.mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df.race.fillna(df.race.mode()[0], inplace=True)


In [14]:
df.race.isnull().sum()

np.int64(0)

I calculated the weight null percentage it was around 95% so i decided to drop the column

In [15]:
df.weight.head(1000)

0      ?
1      ?
2      ?
3      ?
4      ?
      ..
995    ?
996    ?
997    ?
998    ?
999    ?
Name: weight, Length: 1000, dtype: object

In [16]:
df.drop(columns=['weight'], inplace=True)

In [17]:
df['medical_specialty'].unique()

array(['Pediatrics-Endocrinology', '?', 'InternalMedicine',
       'Family/GeneralPractice', 'Cardiology', 'Surgery-General',
       'Orthopedics', 'Gastroenterology',
       'Surgery-Cardiovascular/Thoracic', 'Nephrology',
       'Orthopedics-Reconstructive', 'Psychiatry', 'Emergency/Trauma',
       'Pulmonology', 'Surgery-Neuro',
       'Obsterics&Gynecology-GynecologicOnco', 'ObstetricsandGynecology',
       'Pediatrics', 'Hematology/Oncology', 'Otolaryngology',
       'Surgery-Colon&Rectal', 'Pediatrics-CriticalCare', 'Endocrinology',
       'Urology', 'Psychiatry-Child/Adolescent', 'Pediatrics-Pulmonology',
       'Neurology', 'Anesthesiology-Pediatric', 'Radiology',
       'Pediatrics-Hematology-Oncology', 'Psychology', 'Podiatry',
       'Gynecology', 'Oncology', 'Pediatrics-Neurology',
       'Surgery-Plastic', 'Surgery-Thoracic',
       'Surgery-PlasticwithinHeadandNeck', 'Ophthalmology',
       'Surgery-Pediatric', 'Pediatrics-EmergencyMedicine',
       'PhysicalMedicineandRe

In [18]:
df['medical_specialty'].replace('?',np.nan,inplace=True)
df.medical_specialty.isnull().sum()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['medical_specialty'].replace('?',np.nan,inplace=True)


np.int64(49949)

In [19]:
#checking the null percentage of medical_specialty
df.medical_specialty.isnull().mean()*100

np.float64(49.08220820313268)

In [20]:
df['medical_specialty'].fillna(df['medical_specialty'].mode()[0],inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['medical_specialty'].fillna(df['medical_specialty'].mode()[0],inplace=True)


In [21]:
df.medical_specialty.isnull().sum()

np.int64(0)

In [22]:
df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,...,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,diag-goup_1,diag-goup_2,diag-goup_3
0,2278392,8222157,Caucasian,Female,[0-10),6,25,1,1,?,...,No,No,No,No,No,No,NO,Diabetes,Circulatory,Circulatory
1,149190,55629189,Caucasian,Female,[10-20),1,1,7,3,?,...,No,No,No,No,Ch,Yes,>30,Other,Diabetes,Other
2,64410,86047875,AfricanAmerican,Female,[20-30),1,1,7,2,?,...,No,No,No,No,No,Yes,NO,Other,Diabetes,Circulatory
3,500364,82442376,Caucasian,Male,[30-40),1,1,7,2,?,...,No,No,No,No,Ch,Yes,NO,Other,Diabetes,Circulatory
4,16680,42519267,Caucasian,Male,[40-50),1,1,7,1,?,...,No,No,No,No,Ch,Yes,NO,Other,Other,Diabetes


In [23]:
#few column are not useful for our analysis so we can drop it
df = df.drop(columns=['payer_code','max_glu_serum','A1Cresult'])


In [24]:
(df == '?').sum().sort_values(ascending=False)


encounter_id                0
patient_nbr                 0
race                        0
gender                      0
age                         0
admission_type_id           0
discharge_disposition_id    0
admission_source_id         0
time_in_hospital            0
medical_specialty           0
num_lab_procedures          0
num_procedures              0
num_medications             0
number_outpatient           0
number_emergency            0
number_inpatient            0
diag_1                      0
diag_2                      0
diag_3                      0
number_diagnoses            0
metformin                   0
repaglinide                 0
nateglinide                 0
chlorpropamide              0
glimepiride                 0
acetohexamide               0
glipizide                   0
glyburide                   0
tolbutamide                 0
pioglitazone                0
rosiglitazone               0
acarbose                    0
miglitol                    0
troglitazo

In [25]:
#finally we can save the cleaned dataset
df.to_parquet(Path("../data/processed/cleaned_data.parquet"), index=False)
#i have saved the cleaned dataset as parquet to reduce the size and for faster loading in future