In [27]:
import pandas as pd
import numpy as np

df = pd.read_csv('/content/diabetic_data.csv')
n_rows,n_cols = df.shape
df = df.replace({"?":np.nan,"None":np.nan})
target_counts = df['readmitted'].value_counts()
target_pct = ((target_counts/target_counts.sum())*100).round(2)

missing = df.isna().sum().sort_values(ascending=False)
top_missing = (missing/len(df)*100).round(2)

cardinality = {
    "medical_speciality": int(df["medical_specialty"].nunique(dropna=True)),
    "diag1": int(df["diag_1"].nunique(dropna=True)),
    "diag2": int(df["diag_2"].nunique(dropna=True)),
    "diag3": int(df["diag_3"].nunique(dropna=True))
}

print("Rows , Cols",n_rows,n_cols)
print("target counts",target_counts.to_dict())
print("top_missing")
print(top_missing)
print("cardinality",cardinality)



Rows , Cols 101766 50
target counts {'NO': 54864, '>30': 35545, '<30': 11357}
top_missing
weight                      96.86
max_glu_serum               94.75
A1Cresult                   83.28
medical_specialty           49.08
payer_code                  39.56
race                         2.23
diag_3                       1.40
diag_2                       0.35
diag_1                       0.02
patient_nbr                  0.00
time_in_hospital             0.00
admission_source_id          0.00
num_lab_procedures           0.00
encounter_id                 0.00
admission_type_id            0.00
discharge_disposition_id     0.00
gender                       0.00
age                          0.00
number_inpatient             0.00
number_emergency             0.00
number_outpatient            0.00
num_medications              0.00
num_procedures               0.00
number_diagnoses             0.00
metformin                    0.00
repaglinide                  0.00
nateglinide               

# Data Cleaning and Modeliing

In [49]:
from sqlalchemy import create_engine
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupShuffleSplit
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.metrics import (roc_auc_score,average_precision_score,accuracy_score,f1_score,precision_score,recall_score)
from joblib import dump
import matplotlib.pyplot as plt



In [53]:
df = pd.read_csv('/content/diabetic_data.csv')
df = df.replace({"?":np.nan,"None":np.nan})

EXCLUDE_DDISP = {11,19,20,21}
if "discharge_disposition_id" in df.columns:
    if "discharge_disposition_id" in df.columns:
        df = df[~df["discharge_disposition_id"].isin(EXCLUDE_DDISP)]
for col in['weight','payer_code']:
  if col in df.columns:
    df = df.drop(columns = [col])

engine = create_engine("sqlite:///diabetic_data.db")
df.to_sql("encounters", con=engine,if_exists="replace",index=False)
print('Successfully created sqlite database')


y = (df['readmitted'] == '<30').astype(int)

keep_cols = [
    "race","gender","age",
    "admission_type_id","discharge_disposition_id","admission_source_id",
    "time_in_hospital",
    "num_lab_procedures","num_procedures","num_medications",
    "number_outpatient","number_emergency","number_inpatient",
    "number_diagnoses",
    "A1Cresult","max_glu_serum",
    "change","diabetesMed","insulin"
]
X = df[keep_cols].copy()


cat_cols = X.select_dtypes(include="object").columns.tolist()
num_cols = X.select_dtypes(exclude="object").columns.tolist()

preprocess = ColumnTransformer([
    ("cat", Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ohe", OneHotEncoder(handle_unknown="ignore"))
    ]), cat_cols),
    ("num", Pipeline([("imputer", SimpleImputer(strategy="median"))]), num_cols)
])


groups = df["patient_nbr"].astype(str).values
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(gss.split(X, y, groups=groups))
X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

model = {
    "LogistcRegression": LogisticRegression(max_iter=400, class_weight = "balanced",random_state=42,n_jobs=1),
    "RandomForestClassifier": RandomForestClassifier(n_estimators=200,class_weight="balanced",random_state=42,n_jobs=1),
    "GradientBoostingClassifier": GradientBoostingClassifier(random_state=
                                                             42)
}

train_pipes = {}
results = []

for name, est in model.items():
    pipe = Pipeline([
        ("preprocess", preprocess),
        ("est", est)
    ])
    pipe.fit(X_train, y_train)
    proba = pipe.predict_proba(X_test)[:, 1]
    preds  = pipe.predict(X_test)

    roc = roc_auc_score(y_test, proba)
    ap = average_precision_score(y_test, proba)
    acc = accuracy_score(y_test, preds)
    f1 = f1_score(y_test, preds)
    prec = precision_score(y_test, preds)
    rec = recall_score(y_test, preds)

    results.append({
        "model": name,
        "roc": roc,
        "ap": ap,
        "acc": acc,
        "f1": f1,
        "prec": prec,
        "rec": rec
    })
train_pipes[name] = pipe

result_df = pd.DataFrame(results).round(3)
results_df = result_df.sort_values(by="roc", ascending=False)

print("modelComaprison")
print(results_df.to_string(index=False))


best_name = results_df.iloc[0]['model']
best_pipe = train_pipes[best_name]
path = f"/content/{best_name}_best_model.jolib"
dump(best_pipe, path)
print(f"Best model saved to {path}")


Successfully created sqlite database


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


modelComaprison
                     model   roc    ap   acc    f1  prec   rec
GradientBoostingClassifier 0.672 0.211 0.890 0.012 0.500 0.006
         LogistcRegression 0.646 0.195 0.669 0.253 0.169 0.510
    RandomForestClassifier 0.627 0.171 0.890 0.005 0.357 0.002
Best model saved to /content/GradientBoostingClassifier_best_model.jolib
