In [1]:
!pip install xgboost

Defaulting to user installation because normal site-packages is not writeable


In [2]:
!pip install lightgbm

Defaulting to user installation because normal site-packages is not writeable


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, precision_recall_curve, auc
from imblearn.over_sampling import ADASYN
from sklearn.feature_selection import VarianceThreshold, RFE
import warnings

In [4]:
df = pd.read_csv(r"C:\Users\Asfand\Downloads\diabetic_data.csv")

In [5]:
df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [6]:
df.shape

(101766, 50)

In [7]:
df.dtypes

encounter_id                 int64
patient_nbr                  int64
race                        object
gender                      object
age                         object
weight                      object
admission_type_id            int64
discharge_disposition_id     int64
admission_source_id          int64
time_in_hospital             int64
payer_code                  object
medical_specialty           object
num_lab_procedures           int64
num_procedures               int64
num_medications              int64
number_outpatient            int64
number_emergency             int64
number_inpatient             int64
diag_1                      object
diag_2                      object
diag_3                      object
number_diagnoses             int64
max_glu_serum               object
A1Cresult                   object
metformin                   object
repaglinide                 object
nateglinide                 object
chlorpropamide              object
glimepiride         

In [8]:
df.isnull().sum()

encounter_id                    0
patient_nbr                     0
race                            0
gender                          0
age                             0
weight                          0
admission_type_id               0
discharge_disposition_id        0
admission_source_id             0
time_in_hospital                0
payer_code                      0
medical_specialty               0
num_lab_procedures              0
num_procedures                  0
num_medications                 0
number_outpatient               0
number_emergency                0
number_inpatient                0
diag_1                          0
diag_2                          0
diag_3                          0
number_diagnoses                0
max_glu_serum               96420
A1Cresult                   84748
metformin                       0
repaglinide                     0
nateglinide                     0
chlorpropamide                  0
glimepiride                     0
acetohexamide 

In [9]:
df.max_glu_serum.unique()

array([nan, '>300', 'Norm', '>200'], dtype=object)

In [10]:
df.A1Cresult.unique()

array([nan, '>7', '>8', 'Norm'], dtype=object)

In [11]:
df.gender.unique()

array(['Female', 'Male', 'Unknown/Invalid'], dtype=object)

In [12]:
df.race.unique()

array(['Caucasian', 'AfricanAmerican', '?', 'Other', 'Asian', 'Hispanic'],
      dtype=object)

In [13]:
for col in df.columns:
    if df[col].dtype == object:
         print(col,df[col][df[col] == '?'].count())

race 2273
gender 0
age 0
weight 98569
payer_code 40256
medical_specialty 49949
diag_1 21
diag_2 358
diag_3 1423
max_glu_serum 0
A1Cresult 0
metformin 0
repaglinide 0
nateglinide 0
chlorpropamide 0
glimepiride 0
acetohexamide 0
glipizide 0
glyburide 0
tolbutamide 0
pioglitazone 0
rosiglitazone 0
acarbose 0
miglitol 0
troglitazone 0
tolazamide 0
examide 0
citoglipton 0
insulin 0
glyburide-metformin 0
glipizide-metformin 0
glimepiride-pioglitazone 0
metformin-rosiglitazone 0
metformin-pioglitazone 0
change 0
diabetesMed 0
readmitted 0


In [14]:
print('gender', df['gender'][df['gender'] == 'Unknown/Invalid'].count()) 

gender 3


In [15]:
medication_cols = [
    'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 
    'glipizide', 'glyburide', 'pioglitazone', 'rosiglitazone', 'acarbose', 
    'miglitol', 'insulin', 'glyburide-metformin', 'tolazamide', 
    'metformin-pioglitazone', 'metformin-rosiglitazone', 'glimepiride-pioglitazone', 
    'glipizide-metformin', 'troglitazone', 'tolbutamide', 'acetohexamide'
]
for col in medication_cols:
    df[col] = df[col].map({'No': 0, 'Steady': 1, 'Up': 1, 'Down': 1})

In [16]:
df = df[df['discharge_disposition_id'] != 11]

In [17]:
selector = VarianceThreshold(threshold=0.01)
selector.fit(df[medication_cols])
medication_cols = [medication_cols[i] for i in range(len(medication_cols)) if selector.get_support()[i]]

In [18]:
df_agg = df.groupby('patient_nbr').agg({
    'time_in_hospital': 'mean',
    'num_lab_procedures': 'mean',
    'num_procedures': 'mean',
    'num_medications': 'mean',
    'number_outpatient': 'sum',
    'number_emergency': 'sum',
    'number_inpatient': 'sum',
    'number_diagnoses': 'mean',
    'readmitted': 'max',  # 1 if any encounter has readmitted <30 days
    'gender': 'first',
    'age': 'first',
    'max_glu_serum': lambda x: x.mode()[0] if not x.mode().empty else 'Not Tested',
    'A1Cresult': lambda x: x.mode()[0] if not x.mode().empty else 'Not Tested',
    'change': 'sum',  # Sum of changes (numeric 0/1)
    'diabetesMed': 'max',
    **{col: 'max' for col in medication_cols}  # Max usage per medication
}).reset_index()

In [19]:
encounter_counts = df.groupby('patient_nbr').size().reset_index(name='encounter_count')
df_agg = df_agg.merge(encounter_counts, on='patient_nbr')

In [20]:
df.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'payer_code', 'medical_specialty',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')

In [21]:
df.isnull().sum()

encounter_id                    0
patient_nbr                     0
race                            0
gender                          0
age                             0
weight                          0
admission_type_id               0
discharge_disposition_id        0
admission_source_id             0
time_in_hospital                0
payer_code                      0
medical_specialty               0
num_lab_procedures              0
num_procedures                  0
num_medications                 0
number_outpatient               0
number_emergency                0
number_inpatient                0
diag_1                          0
diag_2                          0
diag_3                          0
number_diagnoses                0
max_glu_serum               94899
A1Cresult                   83247
metformin                       0
repaglinide                     0
nateglinide                     0
chlorpropamide                  0
glimepiride                     0
acetohexamide 

In [22]:
def group_icd9(code):
    if code.startswith('250'): return 'Diabetes'
    elif code.startswith('390') or code.startswith('410') or code.startswith('428'): return 'Circulatory'
    elif code.startswith('460') or code.startswith('786'): return 'Respiratory'
    elif code == 'Missing': return 'Missing'
    else: return 'Other'
for col in ['diag_1', 'diag_2', 'diag_3']:
    df_agg[col + '_group'] = df.groupby('patient_nbr')[col].apply(lambda x: group_icd9(x.iloc[0])).reset_index(drop=True)

In [23]:
df_agg.sample(5).T

Unnamed: 0,32694,25473,24729,24441,22706
patient_nbr,43393878,32404005,30324744,29759931,25300170
time_in_hospital,4.5,7.5,1.0,10.0,5.666667
num_lab_procedures,42.5,61.5,17.0,8.0,40.666667
num_procedures,0.5,2.0,0.0,0.0,1.0
num_medications,17.0,33.0,9.0,14.0,15.333333
number_outpatient,22,1,0,0,0
number_emergency,0,0,0,0,0
number_inpatient,3,1,0,0,3
number_diagnoses,9.0,9.0,9.0,9.0,8.333333
readmitted,NO,>30,NO,NO,NO


In [24]:
df_agg['readmitted'].value_counts()

readmitted
NO     53180
>30    14810
<30     2456
Name: count, dtype: int64

In [25]:
df_agg['change'].unique()

array(['ChCh', 'No', 'Ch', 'NoChNoNoCh', 'NoNoCh', 'NoNoNoNo', 'NoCh',
       'NoNo', 'ChChNo', 'ChChNoNo', 'ChNoChChNoChNoNo', 'ChNo', 'NoNoNo',
       'NoChNo', 'ChNoNoNoChChNoNoNoNo', 'NoNoNoNoNoChNo', 'NoChCh',
       'ChChCh', 'NoNoNoChChNo', 'NoNoNoNoChNoNoNoNo', 'NoNoNoNoNo',
       'ChNoChCh', 'NoNoChNoNo', 'NoNoNoNoNoNoCh', 'ChChChNoChChCh',
       'NoNoChChChNo', 'NoChNoCh', 'ChChChNoNo', 'ChNoChNo', 'ChNoNoNo',
       'NoNoNoNoCh', 'NoNoNoNoNoNoNoNoNoNoNoNoNo', 'NoChChNoNoNoChNo',
       'NoChChNo', 'ChNoNo', 'ChNoChChCh', 'ChChChNoNoNoNoCh',
       'ChNoNoNoNo', 'NoNoNoCh', 'NoChNoChChCh', 'NoNoNoNoNoNo',
       'ChChChChNoCh', 'ChChNoChNoChNoChNoNo', 'NoNoNoChNo',
       'ChChChChNoNoNo', 'ChChChChChNo', 'ChChChChCh', 'NoNoChCh',
       'NoChNoNo', 'ChNoCh', 'ChNoChNoChChNo', 'NoNoChNo',
       'NoNoNoNoNoNoNoNo', 'NoNoNoNoNoNoNo', 'ChNoNoNoNoNoNo',
       'ChChNoChChNoNoNoNoChNoChChNoCh', 'NoNoChChCh', 'ChNoNoCh',
       'ChChChCh', 'NoNoNoChNoChNoNoNoNoNoNoNoNoNoNo', 'No

In [26]:
feature_set = df_agg.columns.drop('readmitted')


In [27]:
X = df_agg[feature_set].copy()
y = df_agg['readmitted']


In [28]:
# Numeric columns
numeric_cols = df_agg.select_dtypes(include=['int64', 'float64']).columns.tolist()
numeric_cols = [col for col in numeric_cols if col != 'readmitted']

# Categorical columns
categorical_cols = ['gender', 'change', 'diabetesMed']  # add others if needed

# Combine features
feature_set = numeric_cols + [col for col in df_agg.columns if col.startswith((
    'medical_specialty_group_', 'max_glu_serum_', 'A1Cresult_', 
    'diag_1_group_', 'diag_2_group_', 'diag_3_group_'
))] + categorical_cols + medication_cols  # make sure medication_cols is defined

# Select X and y
X = df_agg[feature_set].copy()
y = df_agg['readmitted']

# Encode categorical variables
X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)


In [29]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=0
)


In [30]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [31]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=0
)


In [32]:
from sklearn.tree import DecisionTreeClassifier

dt_model = DecisionTreeClassifier(
    criterion='gini',
    max_depth=7,
    min_samples_split=5,
    min_samples_leaf=2,
    class_weight='balanced',  # helps with imbalance
    random_state=0
)

dt_model.fit(X_train_scaled, y_train)  # or X_train if not scaling


0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,7
,min_samples_split,5
,min_samples_leaf,2
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,0
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [33]:
y_pred = dt_model.predict(X_test_scaled)  # or X_test if not scaling

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.4653655074520937

Confusion Matrix:
 [[ 298   53  132]
 [1328  863  804]
 [3651 1565 5396]]

Classification Report:
               precision    recall  f1-score   support

         <30       0.06      0.62      0.10       483
         >30       0.35      0.29      0.32      2995
          NO       0.85      0.51      0.64     10612

    accuracy                           0.47     14090
   macro avg       0.42      0.47      0.35     14090
weighted avg       0.72      0.47      0.55     14090



In [34]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)



In [35]:
from sklearn.ensemble import RandomForestClassifier

# Train model
rf_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=7,
    random_state=42
)
rf_model.fit(X_train, y_train)


0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,7
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [36]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, precision_recall_curve, auc

# Predictions
y_pred_rf = rf_model.predict(X_test)
y_proba_rf = rf_model.predict_proba(X_test)[:, 1]  # probability for positive class

# Metrics
print("=== Random Forest ===")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))
print("\nClassification Report:\n", classification_report(y_test, y_pred_rf))




=== Random Forest ===
Accuracy: 0.7576295244854506

Confusion Matrix:
 [[    0     0   498]
 [    0     0  2917]
 [    0     0 10675]]

Classification Report:
               precision    recall  f1-score   support

         <30       0.00      0.00      0.00       498
         >30       0.00      0.00      0.00      2917
          NO       0.76      1.00      0.86     10675

    accuracy                           0.76     14090
   macro avg       0.25      0.33      0.29     14090
weighted avg       0.57      0.76      0.65     14090



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [43]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# 1️⃣ Load your dataset
# df = pd.read_csv("your_dataset.csv")  # Uncomment if reading from CSV
# Make sure your target column name is correct. Replace 'target' with your actual target column
target_col = 'target'  

# Check columns
print("Columns in dataset:", df.columns)

# 2️⃣ Split features and target
X = df.drop(target_col, axis=1)
y = df[target_col]

# 3️⃣ Convert categorical columns to numeric (Label Encoding)
X_encoded = X.copy()
for col in X_encoded.select_dtypes(include=['object', 'category']).columns:
    X_encoded[col] = LabelEncoder().fit_transform(X_encoded[col])

# Encode target if it is categorical
if y.dtype == 'object' or str(y.dtype).startswith('category'):
    y = LabelEncoder().fit_transform(y)

# 4️⃣ Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42
)

# 5️⃣ Train XGBoost model
xgb_model = XGBClassifier(
    n_estimators=200,
    max_depth=7,
    learning_rate=0.1,
    random_state=42,
    use_label_encoder=False,
    eval_metric='mlogloss'
)
xgb_model.fit(X_train, y_train)

# 6️⃣ Predict & evaluate
y_pred = xgb_model.predict(X_test)

print("=== XGBoost ===")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Columns in dataset: Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'payer_code', 'medical_specialty',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')


KeyError: "['target'] not found in axis"

In [44]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Target column
target_col = 'readmitted'

# Drop identifier columns
drop_cols = ['encounter_id', 'patient_nbr']
X = df.drop(columns=drop_cols + [target_col])
y = df[target_col]

# Encode categorical columns
X_encoded = X.copy()
for col in X_encoded.select_dtypes(include=['object', 'category']).columns:
    X_encoded[col] = LabelEncoder().fit_transform(X_encoded[col].astype(str))

# Encode target
if y.dtype == 'object' or str(y.dtype).startswith('category'):
    y = LabelEncoder().fit_transform(y.astype(str))

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42
)

# Train XGBoost
xgb_model = XGBClassifier(
    n_estimators=200,
    max_depth=7,
    learning_rate=0.1,
    random_state=42,
    use_label_encoder=False,
    eval_metric='mlogloss'
)
xgb_model.fit(X_train, y_train)

# Predict & evaluate
y_pred = xgb_model.predict(X_test)

print("=== XGBoost ===")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


=== XGBoost ===
Accuracy: 0.5865168539325842

Confusion Matrix:
 [[ 110  943 1215]
 [  66 2907 4077]
 [  49 1930 8728]]

Classification Report:
               precision    recall  f1-score   support

           0       0.49      0.05      0.09      2268
           1       0.50      0.41      0.45      7050
           2       0.62      0.82      0.71     10707

    accuracy                           0.59     20025
   macro avg       0.54      0.43      0.42     20025
weighted avg       0.57      0.59      0.55     20025



In [45]:
import joblib

# Save the trained XGBoost model
joblib.dump(xgb_model, "xgboost_model.pkl")
print("XGBoost model saved successfully!")


XGBoost model saved successfully!


In [46]:
joblib.dump(le, "label_encoder.pkl")
print("Label encoder saved successfully!")


Label encoder saved successfully!


In [50]:
import json

# Save list of training columns for consistency
with open("columns.json", "w") as f:
    json.dump(list(X_train_encoded.columns), f)
print("Training columns saved successfully!")


Training columns saved successfully!


In [52]:
import pickle
from xgboost import XGBClassifier

# Save model
with open("xgboost_model.pkl", "wb") as f:
    pickle.dump(xgb_model, f)

# Load model
with open("xgboost_model.pkl", "rb") as f:
    xgb_model = pickle.load(f)



In [53]:
from sklearn.preprocessing import LabelEncoder
import pickle

# Save encoder
with open("label_encoder.pkl", "wb") as f:
    pickle.dump(le, f)

# Load encoder
with open("label_encoder.pkl", "rb") as f:
    le = pickle.load(f)
