In [19]:
import pandas as pd

df = pd.read_csv("diabetic_data.csv")

# Drop irrelevant columns
df.drop(["encounter_id", "patient_nbr", "weight", "payer_code", "medical_specialty"], axis=1, inplace=True)

# Convert target variable
df["readmitted_30"] = df["readmitted"].apply(lambda x: 1 if x == "<30" else 0)

df.head()


Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,...,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,readmitted_30
0,Caucasian,Female,[0-10),6,25,1,1,41,0,1,...,No,No,No,No,No,No,No,No,NO,0
1,Caucasian,Female,[10-20),1,1,7,3,59,0,18,...,Up,No,No,No,No,No,Ch,Yes,>30,0
2,AfricanAmerican,Female,[20-30),1,1,7,2,11,5,13,...,No,No,No,No,No,No,No,Yes,NO,0
3,Caucasian,Male,[30-40),1,1,7,2,44,1,16,...,Up,No,No,No,No,No,Ch,Yes,NO,0
4,Caucasian,Male,[40-50),1,1,7,1,51,0,8,...,Steady,No,No,No,No,No,Ch,Yes,NO,0


In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101766 entries, 0 to 101765
Data columns (total 46 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   race                      101766 non-null  object
 1   gender                    101766 non-null  object
 2   age                       101766 non-null  object
 3   admission_type_id         101766 non-null  int64 
 4   discharge_disposition_id  101766 non-null  int64 
 5   admission_source_id       101766 non-null  int64 
 6   time_in_hospital          101766 non-null  int64 
 7   num_lab_procedures        101766 non-null  int64 
 8   num_procedures            101766 non-null  int64 
 9   num_medications           101766 non-null  int64 
 10  number_outpatient         101766 non-null  int64 
 11  number_emergency          101766 non-null  int64 
 12  number_inpatient          101766 non-null  int64 
 13  diag_1                    101766 non-null  object
 14  diag

In [21]:
# Drop columns with too many unique IDs
df.drop(["diag_1", "diag_2", "diag_3"], axis=1, inplace=True)

# Encode categorical variables
categorical_cols = df.select_dtypes(include='object').columns
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

In [22]:
df.head()

Unnamed: 0,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,...,glyburide-metformin_Steady,glyburide-metformin_Up,glipizide-metformin_Steady,glimepiride-pioglitazone_Steady,metformin-rosiglitazone_Steady,metformin-pioglitazone_Steady,change_No,diabetesMed_Yes,readmitted_>30,readmitted_NO
0,6,25,1,1,41,0,1,0,0,0,...,False,False,False,False,False,False,True,False,False,True
1,1,1,7,3,59,0,18,0,0,0,...,False,False,False,False,False,False,False,True,True,False
2,1,1,7,2,11,5,13,2,0,1,...,False,False,False,False,False,False,True,True,False,True
3,1,1,7,2,44,1,16,0,0,0,...,False,False,False,False,False,False,False,True,False,True
4,1,1,7,1,51,0,8,0,0,0,...,False,False,False,False,False,False,False,True,False,True


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X = df.drop("readmitted_30", axis=1)
y = df["readmitted_30"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)


In [7]:
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("AUC Score:", roc_auc_score(y_test, y_pred))


Confusion Matrix:
 [[18069     0]
 [    0  2285]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     18069
           1       1.00      1.00      1.00      2285

    accuracy                           1.00     20354
   macro avg       1.00      1.00      1.00     20354
weighted avg       1.00      1.00      1.00     20354

AUC Score: 1.0


In [8]:
# Perform feature engineering for hospital readmission classification using a synthetic subset of the UCI dataset

import pandas as pd
import numpy as np

# Sample data (mimicking the structure of the original UCI diabetes readmission dataset)
data = {
    'age': ['[70-80)', '[60-70)', '[50-60)', '[80-90)', '[30-40)'],
    'gender': ['Female', 'Male', 'Female', 'Male', 'Female'],
    'race': ['Caucasian', 'AfricanAmerican', 'Hispanic', 'Asian', 'Other'],
    'admission_type_id': [1, 3, 1, 2, 4],
    'discharge_disposition_id': [1, 2, 1, 3, 1],
    'admission_source_id': [7, 1, 4, 6, 1],
    'num_lab_procedures': [41, 59, 33, 49, 35],
    'num_procedures': [0, 1, 0, 3, 2],
    'num_medications': [1, 13, 6, 8, 10],
    'number_outpatient': [0, 2, 1, 0, 3],
    'number_emergency': [0, 0, 0, 1, 0],
    'number_inpatient': [0, 1, 0, 0, 2],
    'readmitted': ['NO', '<30', '>30', '<30', 'NO']
}

df = pd.DataFrame(data)

# Convert target: 1 if readmitted <30 days, else 0
df['readmitted_30'] = df['readmitted'].apply(lambda x: 1 if x == '<30' else 0)

# Drop original target column
df.drop(columns=['readmitted'], inplace=True)

# Convert age group to ordinal feature
age_order = {'[0-10)': 0, '[10-20)': 1, '[20-30)': 2, '[30-40)': 3, '[40-50)': 4,
             '[50-60)': 5, '[60-70)': 6, '[70-80)': 7, '[80-90)': 8, '[90-100)': 9}
df['age'] = df['age'].map(age_order)

# One-hot encode categorical variables
df_encoded = pd.get_dummies(df, columns=['gender', 'race'], drop_first=True)

df_encoded.head()



Unnamed: 0,age,admission_type_id,discharge_disposition_id,admission_source_id,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,readmitted_30,gender_Male,race_Asian,race_Caucasian,race_Hispanic,race_Other
0,7,1,1,7,41,0,1,0,0,0,0,False,False,True,False,False
1,6,3,2,1,59,1,13,2,0,1,1,True,False,False,False,False
2,5,1,1,4,33,0,6,1,0,0,0,False,False,False,True,False
3,8,2,3,6,49,3,8,0,1,0,1,True,True,False,False,False
4,3,4,1,1,35,2,10,3,0,2,0,False,False,False,False,True


In [9]:
df.head()

Unnamed: 0,age,gender,race,admission_type_id,discharge_disposition_id,admission_source_id,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,readmitted_30
0,7,Female,Caucasian,1,1,7,41,0,1,0,0,0,0
1,6,Male,AfricanAmerican,3,2,1,59,1,13,2,0,1,1
2,5,Female,Hispanic,1,1,4,33,0,6,1,0,0,0
3,8,Male,Asian,2,3,6,49,3,8,0,1,0,1
4,3,Female,Other,4,1,1,35,2,10,3,0,2,0


In [10]:
age_order = {
    '[0-10)': 0, '[10-20)': 1, '[20-30)': 2, '[30-40)': 3,
    '[40-50)': 4, '[50-60)': 5, '[60-70)': 6,
    '[70-80)': 7, '[80-90)': 8, '[90-100)': 9
}
df['age'] = df['age'].map(age_order)


In [11]:
df.head()

Unnamed: 0,age,gender,race,admission_type_id,discharge_disposition_id,admission_source_id,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,readmitted_30
0,,Female,Caucasian,1,1,7,41,0,1,0,0,0,0
1,,Male,AfricanAmerican,3,2,1,59,1,13,2,0,1,1
2,,Female,Hispanic,1,1,4,33,0,6,1,0,0,0
3,,Male,Asian,2,3,6,49,3,8,0,1,0,1
4,,Female,Other,4,1,1,35,2,10,3,0,2,0


In [17]:
print(df['age'])

0   NaN
1   NaN
2   NaN
3   NaN
4   NaN
Name: age, dtype: float64


In [12]:
df = pd.get_dummies(df, columns=['gender', 'race'], drop_first=True)


In [13]:
df.head()

Unnamed: 0,age,admission_type_id,discharge_disposition_id,admission_source_id,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,readmitted_30,gender_Male,race_Asian,race_Caucasian,race_Hispanic,race_Other
0,,1,1,7,41,0,1,0,0,0,0,False,False,True,False,False
1,,3,2,1,59,1,13,2,0,1,1,True,False,False,False,False
2,,1,1,4,33,0,6,1,0,0,0,False,False,False,True,False
3,,2,3,6,49,3,8,0,1,0,1,True,True,False,False,False
4,,4,1,1,35,2,10,3,0,2,0,False,False,False,False,True


In [14]:
df.drop(['encounter_id', 'patient_nbr'], axis=1, errors='ignore', inplace=True)


In [15]:
df.fillna('Unknown', inplace=True)  # or use median/mode for numeric


In [16]:
df.head()

Unnamed: 0,age,admission_type_id,discharge_disposition_id,admission_source_id,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,readmitted_30,gender_Male,race_Asian,race_Caucasian,race_Hispanic,race_Other
0,Unknown,1,1,7,41,0,1,0,0,0,0,False,False,True,False,False
1,Unknown,3,2,1,59,1,13,2,0,1,1,True,False,False,False,False
2,Unknown,1,1,4,33,0,6,1,0,0,0,False,False,False,True,False
3,Unknown,2,3,6,49,3,8,0,1,0,1,True,True,False,False,False
4,Unknown,4,1,1,35,2,10,3,0,2,0,False,False,False,False,True
