In [0]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

#### **Load data**

In [0]:
df = pd.read_csv("/Workspace/Users/skmdsali200@gmail.com/Patient_Risk_Prediction/training_data_sets/patient_readmission_data.csv")

#### **Before Encoding (Raw Data Sample)**

In [0]:
print("🔹 Raw Data Sample:")
display(df.head(10))


🔹 Raw Data Sample:


patient_name,gender,age,medical_condition,hospital,insurance_provider,stay_duration_days,billing_amount,days_to_next_admit,readmitted_30d
Aaron Adams,FEMALE,38,Cancer,Hart Llc,Unitedhealthcare,19,26052.10640426882,,0
Aaron Aguirre,MALE,36,Diabetes,Murray-shelton,Unitedhealthcare,15,27087.56055287028,,0
Aaron Anderson,FEMALE,50,Asthma,Tanner-cox,Cigna,28,39804.6586244873,,0
Aaron Anderson Md,FEMALE,20,Hypertension,Ritter Llc,Unitedhealthcare,12,16846.415799072645,,0
Aaron Archer,FEMALE,47,Cancer,"Montes Case And Mendez,",Medicare,7,10602.077185418815,-7.0,0
Aaron Archer,FEMALE,49,Cancer,"Montes Case And Mendez,",Medicare,7,10602.077185418815,,0
Aaron Baker,MALE,73,Cancer,"Wise And Todd, Parker",Medicare,22,10135.88544212296,1060.0,0
Aaron Baker,FEMALE,84,Asthma,"Carter, Abbott And Fuentes",Medicare,6,6826.677362608583,,0
Aaron Baldwin Jr.,MALE,20,Hypertension,"Flores Friedman And White,",Medicare,26,29740.96019872634,,0
Aaron Barnes,MALE,85,Obesity,Taylor Llc,Unitedhealthcare,21,4783.444339176547,,0


#### **Encode categoricals**

In [0]:
for col in ['gender','medical_condition','hospital','insurance_provider']:
    df[col] = LabelEncoder().fit_transform(df[col])

X = df[['age','gender','medical_condition','hospital','insurance_provider',
        'billing_amount','stay_duration_days']]
y = df['readmitted_30d']

#### **After Encoding (Numeric Features)**

In [0]:
print("🔹 Encoded Data Sample:")
display(df[['age','gender','medical_condition','hospital','insurance_provider','billing_amount','stay_duration_days','readmitted_30d']].head(10))


🔹 Encoded Data Sample:


age,gender,medical_condition,hospital,insurance_provider,billing_amount,stay_duration_days,readmitted_30d
38,0,2,16665,4,26052.10640426882,19,0
36,1,3,26919,4,27087.56055287028,15,0
50,0,1,35638,2,39804.6586244873,28,0
20,0,4,31041,4,16846.415799072645,12,0
47,0,2,25935,3,10602.077185418815,7,0
49,0,2,25935,3,10602.077185418815,7,0
73,1,2,39214,3,10135.88544212296,22,0
84,0,1,9043,3,6826.677362608583,6,0
20,1,4,12893,3,29740.96019872634,26,0
85,1,5,35705,4,4783.444339176547,21,0


#### **Check Class Distribution (Before Training)**

In [0]:
print("🔹 Class Distribution:")
print(df['readmitted_30d'].value_counts(normalize=True))

🔹 Class Distribution:
readmitted_30d
0    0.989514
1    0.010486
Name: proportion, dtype: float64


#### **Split**

In [0]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

#### **After Train-Test Split**

In [0]:
print("🔹 Train/Test Split Shape:")
print("Train:", X_train.shape, " Test:", X_test.shape)

print("🔹 Training set class distribution:")
print(pd.Series(y_train).value_counts(normalize=True))
print("🔹 Test set class distribution:")
print(pd.Series(y_test).value_counts(normalize=True))

🔹 Train/Test Split Shape:
Train: (44400, 7)  Test: (11100, 7)
🔹 Training set class distribution:
readmitted_30d
0    0.989685
1    0.010315
Name: proportion, dtype: float64
🔹 Test set class distribution:
readmitted_30d
0    0.988829
1    0.011171
Name: proportion, dtype: float64


#### **Scale**

In [0]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test  = scaler.transform(X_test)

#### **Before & After Scaling (See Difference)**

In [0]:
# Convert scaled arrays back to DataFrames for inspection
X_train_scaled_df = pd.DataFrame(X_train, columns=['age','gender','medical_condition','hospital','insurance_provider','billing_amount','stay_duration_days'])
X_test_scaled_df = pd.DataFrame(X_test, columns=['age','gender','medical_condition','hospital','insurance_provider','billing_amount','stay_duration_days'])

print("🔹 Original X sample (before scaling):")
display(df[['age','billing_amount','stay_duration_days']].head(10))

print("🔹 Scaled X_train sample (after scaling):")
display(X_train_scaled_df[['age','billing_amount','stay_duration_days']].head(10))


🔹 Original X sample (before scaling):


age,billing_amount,stay_duration_days
38,26052.10640426882,19
36,27087.56055287028,15
50,39804.6586244873,28
20,16846.415799072645,12
47,10602.077185418815,7
49,10602.077185418815,7
73,10135.88544212296,22
84,6826.677362608583,6
20,29740.96019872634,26
85,4783.444339176547,21


🔹 Scaled X_train sample (after scaling):


age,billing_amount,stay_duration_days
-1.0942356798894104,-0.055849171669415,0.7507142143145666
-0.4310841261671542,-1.8065268618923915,0.0580136705410333
0.3851024014910072,1.1341213453809245,-1.442837507634955
0.3340907435123721,0.114411314646905,-0.2883366013457333
-0.1760258362739787,-0.1990934878049816,1.558864848717022
0.9462306392559932,-1.2861626256987122,0.1734637611699555
1.354323903085074,-1.639942272739389,-0.0574364200878888
-1.451317285739856,-0.7715716812201436,-0.2883366013457333
1.0992656131918983,-0.5688025962488682,-0.8655870544903443
0.5381373754269125,-0.1688856492880871,0.2889138517988778


#### **Train**

In [0]:
model = LogisticRegression()
model.fit(X_train,y_train)

#### **After Training (Model Coefficients)**

In [0]:
feature_names = ['age','gender','medical_condition','hospital','insurance_provider','billing_amount','stay_duration_days']
coef_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': model.coef_[0]})
print("🔹 Model Feature Importance:")
display(coef_df.sort_values(by='Coefficient', ascending=False))

🔹 Model Feature Importance:


Feature,Coefficient
billing_amount,0.0850931594279323
hospital,0.0528262652382539
medical_condition,0.0027483430863176
stay_duration_days,-0.0004442683244947912
gender,-0.0265768715723209
insurance_provider,-0.0540827872256337
age,-0.1171552278611282


#### **Compare Before vs After Scaling Side-by-Side**

In [0]:
comparison = pd.concat([
    df[['age','billing_amount','stay_duration_days']].reset_index(drop=True).head(10),
    X_train_scaled_df[['age','billing_amount','stay_duration_days']].reset_index(drop=True).head(10)
], axis=1)

comparison.columns = ['Age_Original','Billing_Original','Stay_Original','Age_Scaled','Billing_Scaled','Stay_Scaled']
display(comparison)


Age_Original,Billing_Original,Stay_Original,Age_Scaled,Billing_Scaled,Stay_Scaled
38,26052.10640426882,19,-1.0942356798894104,-0.055849171669415,0.7507142143145666
36,27087.56055287028,15,-0.4310841261671542,-1.8065268618923915,0.0580136705410333
50,39804.6586244873,28,0.3851024014910072,1.1341213453809245,-1.442837507634955
20,16846.415799072645,12,0.3340907435123721,0.114411314646905,-0.2883366013457333
47,10602.077185418815,7,-0.1760258362739787,-0.1990934878049816,1.558864848717022
49,10602.077185418815,7,0.9462306392559932,-1.2861626256987122,0.1734637611699555
73,10135.88544212296,22,1.354323903085074,-1.639942272739389,-0.0574364200878888
84,6826.677362608583,6,-1.451317285739856,-0.7715716812201436,-0.2883366013457333
20,29740.96019872634,26,1.0992656131918983,-0.5688025962488682,-0.8655870544903443
85,4783.444339176547,21,0.5381373754269125,-0.1688856492880871,0.2889138517988778


#### **Evaluate**

In [0]:
preds = model.predict(X_test)
print(classification_report(y_test,preds))
print("ROC-AUC:", roc_auc_score(y_test,preds))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99     10976
           1       0.00      0.00      0.00       124

    accuracy                           0.99     11100
   macro avg       0.49      0.50      0.50     11100
weighted avg       0.98      0.99      0.98     11100

ROC-AUC: 0.5


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
