In [1]:
!pip install ucimlrepo optuna

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [21]:
import warnings
import numpy as np
import pandas as pd
from ucimlrepo import fetch_ucirepo
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder,StandardScaler
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import optuna

warnings.filterwarnings('ignore')

In [3]:
diabetes_130_us_hospitals_for_years_1999_2008 = fetch_ucirepo(id=296)

X = diabetes_130_us_hospitals_for_years_1999_2008.data.features
y = diabetes_130_us_hospitals_for_years_1999_2008.data.targets

In [4]:
X.head()

Unnamed: 0,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,...,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed
0,Caucasian,Female,[0-10),,6,25,1,1,,Pediatrics-Endocrinology,...,No,No,No,No,No,No,No,No,No,No
1,Caucasian,Female,[10-20),,1,1,7,3,,,...,No,No,Up,No,No,No,No,No,Ch,Yes
2,AfricanAmerican,Female,[20-30),,1,1,7,2,,,...,No,No,No,No,No,No,No,No,No,Yes
3,Caucasian,Male,[30-40),,1,1,7,2,,,...,No,No,Up,No,No,No,No,No,Ch,Yes
4,Caucasian,Male,[40-50),,1,1,7,1,,,...,No,No,Steady,No,No,No,No,No,Ch,Yes


In [5]:
X.shape

(101766, 47)

In [6]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101766 entries, 0 to 101765
Data columns (total 47 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   race                      99493 non-null   object
 1   gender                    101766 non-null  object
 2   age                       101766 non-null  object
 3   weight                    3197 non-null    object
 4   admission_type_id         101766 non-null  int64 
 5   discharge_disposition_id  101766 non-null  int64 
 6   admission_source_id       101766 non-null  int64 
 7   time_in_hospital          101766 non-null  int64 
 8   payer_code                61510 non-null   object
 9   medical_specialty         51817 non-null   object
 10  num_lab_procedures        101766 non-null  int64 
 11  num_procedures            101766 non-null  int64 
 12  num_medications           101766 non-null  int64 
 13  number_outpatient         101766 non-null  int64 
 14  numb

In [7]:
X.describe()

Unnamed: 0,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses
count,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0
mean,2.024006,3.715642,5.754437,4.395987,43.095641,1.33973,16.021844,0.369357,0.197836,0.635566,7.422607
std,1.445403,5.280166,4.064081,2.985108,19.674362,1.705807,8.127566,1.267265,0.930472,1.262863,1.9336
min,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
25%,1.0,1.0,1.0,2.0,31.0,0.0,10.0,0.0,0.0,0.0,6.0
50%,1.0,1.0,7.0,4.0,44.0,1.0,15.0,0.0,0.0,0.0,8.0
75%,3.0,4.0,7.0,6.0,57.0,2.0,20.0,0.0,0.0,1.0,9.0
max,8.0,28.0,25.0,14.0,132.0,6.0,81.0,42.0,76.0,21.0,16.0


In [8]:
mask = (
        (X[["diag_1", "diag_2", "diag_3"]] == "?").any(axis=1)
        | (X["race"] == "?")
        | (X["discharge_disposition_id"] == 11)  # Expired
        | (X["gender"] == "Unknown/Invalid")
    )
X = X[~mask]

In [9]:
X["admission_type_id"] = X["admission_type_id"].replace({2: 1, 7: 1, 6: 5, 8: 5})

discharge_mappings = {
        6: 1,
        8: 1,
        9: 1,
        13: 1,
        3: 2,
        4: 2,
        5: 2,
        14: 2,
        22: 2,
        23: 2,
        24: 2,
        12: 10,
        15: 10,
        16: 10,
        17: 10,
        25: 18,
        26: 18,
    }
X["discharge_disposition_id"] = X["discharge_disposition_id"].replace(
        discharge_mappings)

admission_mappings = {
        2: 1,
        3: 1,
        5: 4,
        6: 4,
        10: 4,
        22: 4,
        25: 4,
        15: 9,
        17: 9,
        20: 9,
        21: 9,
        13: 11,
        14: 11,
    }
X["admission_source_id"] = X["admission_source_id"].replace(admission_mappings)

categorical_mappings = {
        "change": {"Ch": 1, "No": 0},
        "gender": {"Male": 1, "Female": 0},
        "diabetesMed": {"Yes": 1, "No": 0},
        "A1Cresult": {">7": 1, ">8": 1, "Norm": 0, "None": -99},
        "max_glu_serum": {">200": 1, ">300": 1, "Norm": 0, "None": -99},}
for col, mapping in categorical_mappings.items():
        X[col] = X[col].replace(mapping)

age_mapping = {f"[{i*10}-{(i+1)*10})": i + 1 for i in range(10)}
X["age"] = X["age"].replace(age_mapping)

y = y.replace({">30":"YES","<30":"YES"})

In [10]:
null_vals = X.isnull().sum().sort_values(ascending=False)
null_vals = null_vals[null_vals > 0]

In [11]:
drop_cols = null_vals[null_vals/X.shape[0] > 0.3].index

In [12]:
X = X.drop(drop_cols,axis=1)

In [13]:
object_cols = X.select_dtypes(include='object').columns
object_cols

Index(['race', 'diag_1', 'diag_2', 'diag_3', 'metformin', 'repaglinide',
       'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide',
       'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone',
       'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide',
       'examide', 'citoglipton', 'insulin', 'glyburide-metformin',
       'glipizide-metformin', 'glimepiride-pioglitazone',
       'metformin-rosiglitazone', 'metformin-pioglitazone'],
      dtype='object')

In [14]:
null_cols = null_vals.index.difference(drop_cols)

imputer = SimpleImputer(strategy='most_frequent')
X[null_cols] = imputer.fit_transform(X[null_cols])

In [15]:
enc = OrdinalEncoder()
X[object_cols] = enc.fit_transform(X[object_cols])
y = y.loc[X.index]
y = enc.fit_transform(y)

In [17]:
df = X
df['readmitted'] = y

maj = df[y == 0]
mino = df[y == 1]

df_min = resample(mino, replace=True, n_samples=len(maj),random_state=20)

X = pd.concat([maj,df_min])
y = pd.Series([0] * len(maj) + [1] * len(df_min))

In [18]:
X = X.drop('readmitted',axis=1)

In [19]:
sc = StandardScaler()
Xsc = sc.fit_transform(X)

X_train,X_test,y_train,y_test = train_test_split(Xsc,y,test_size=0.2,random_state=42)

lr = LogisticRegression()
lr.fit(X_train,y_train)
lr.score(X_test,y_test)

0.612880496054115

In [22]:
coef_df = pd.DataFrame({
    'feature': X.columns,
    'coef_abs': np.abs(lr.coef_[0])
}).sort_values(by='coef_abs', ascending=False)

imp_features = coef_df[coef_df['coef_abs'] > 0.01]['feature']
len(imp_features)

29

In [23]:
def get_metrics(model,X_test,y_test):
  preds = model.predict(X_test)
  print(accuracy_score(preds,y_test))
  print()
  print(confusion_matrix(preds,y_test))
  print()
  print(classification_report(preds,y_test))

In [24]:
sc = StandardScaler()
Xsc = sc.fit_transform(X[imp_features])

X_train,X_test,y_train,y_test = train_test_split(Xsc,y,test_size=0.2,random_state=42)

lr = LogisticRegression(fit_intercept=True, penalty="l2", solver="liblinear")
lr.fit(X_train,y_train)
lr.score(X_test,y_test)

0.6141018414130026

In [25]:
get_metrics(lr,X_test,y_test)

0.6141018414130026

[[7521 5016]
 [3199 5552]]

              precision    recall  f1-score   support

           0       0.70      0.60      0.65     12537
           1       0.53      0.63      0.57      8751

    accuracy                           0.61     21288
   macro avg       0.61      0.62      0.61     21288
weighted avg       0.63      0.61      0.62     21288



In [26]:
dtree = DecisionTreeClassifier(random_state=42)
dtree.fit(X_train,y_train)
get_metrics(dtree,X_test,y_test)

0.7057966929725668

[[6760 2303]
 [3960 8265]]

              precision    recall  f1-score   support

           0       0.63      0.75      0.68      9063
           1       0.78      0.68      0.73     12225

    accuracy                           0.71     21288
   macro avg       0.71      0.71      0.70     21288
weighted avg       0.72      0.71      0.71     21288



In [27]:
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train,y_train)
get_metrics(rf,X_test,y_test)

0.7785137166478767

[[8381 2376]
 [2339 8192]]

              precision    recall  f1-score   support

           0       0.78      0.78      0.78     10757
           1       0.78      0.78      0.78     10531

    accuracy                           0.78     21288
   macro avg       0.78      0.78      0.78     21288
weighted avg       0.78      0.78      0.78     21288



In [31]:
def objective(trial):
  classifier_name = trial.suggest_categorical('classifier', ['DTree', 'RF'])

  if classifier_name == "DTree":
    params = {
        'max_depth': trial.suggest_int('max_depth', 2, 30),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
        'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy']),
        'random_state': 42
    }
    dt = DecisionTreeClassifier(**params)
    dt.fit(X_train,y_train)
    return dt.score(X_test,y_test)
  else:
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'max_depth': trial.suggest_int('max_depth', 2, 50),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
        'random_state': 42
    }
    rf = RandomForestClassifier(**params)
    rf.fit(X_train,y_train)
    return rf.score(X_test,y_test)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)

[I 2025-11-08 04:25:30,030] A new study created in memory with name: no-name-b521fcc3-475b-4f98-bcdc-b2503c59e3f5
[I 2025-11-08 04:25:34,365] Trial 0 finished with value: 0.620913190529876 and parameters: {'classifier': 'DTree', 'max_depth': 16, 'min_samples_split': 12, 'min_samples_leaf': 13, 'criterion': 'gini'}. Best is trial 0 with value: 0.620913190529876.
[I 2025-11-08 04:25:37,625] Trial 1 finished with value: 0.6163096580232995 and parameters: {'classifier': 'DTree', 'max_depth': 9, 'min_samples_split': 12, 'min_samples_leaf': 6, 'criterion': 'entropy'}. Best is trial 0 with value: 0.620913190529876.
[I 2025-11-08 04:25:40,290] Trial 2 finished with value: 0.6213359639233371 and parameters: {'classifier': 'DTree', 'max_depth': 14, 'min_samples_split': 16, 'min_samples_leaf': 11, 'criterion': 'entropy'}. Best is trial 2 with value: 0.6213359639233371.
[I 2025-11-08 04:25:43,962] Trial 3 finished with value: 0.618564449455092 and parameters: {'classifier': 'DTree', 'max_depth': 1

In [33]:
param_grid = {"n_estimators":315,"max_depth":33,"min_samples_split":6,"max_features":"sqrt","bootstrap":False,"random_state":42}

rf = RandomForestClassifier(**param_grid)
rf.fit(X_train,y_train)
get_metrics(rf,X_test,y_test)

0.7870161593385945

[[8762 2576]
 [1958 7992]]

              precision    recall  f1-score   support

           0       0.82      0.77      0.79     11338
           1       0.76      0.80      0.78      9950

    accuracy                           0.79     21288
   macro avg       0.79      0.79      0.79     21288
weighted avg       0.79      0.79      0.79     21288

