In [0]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

#### **Load data**

In [0]:
df = pd.read_csv("/Workspace/Users/skmdsali200@gmail.com/Patient_Risk_Prediction/training_data_sets/patient_readmission_30d (1).csv")

#### **Before Encoding (Raw Data Sample)**

In [0]:
print("🔹 Raw Data Sample:")
display(df.head(10))


🔹 Raw Data Sample:


patient_sk,patient_name,gender,medical_condition,hospital,insurance_provider,age,stay_duration_days,billing_amount,risk_level,readmitted_30d
1,Joseph Snyder,FEMALE,Hypertension,Mills-brown,Unitedhealthcare,20,28,7834.36,High,0
3,Jeremy Clark,FEMALE,Diabetes,"And Richardson Smith Gibson,",Blue Cross,18,29,34218.36,High,0
5,Tammy Ortega,MALE,Arthritis,Esparza-stewart,Aetna,49,27,43262.84,High,0
10,Gregory Thomas,FEMALE,Cancer,"Davis Maddox Harris, And",Aetna,72,25,11064.03,High,0
12,Barry Wise,MALE,Asthma,Gordon-turner,Unitedhealthcare,33,26,42562.35,High,0
16,Paula Williams,FEMALE,Hypertension,Clements-ortega,Cigna,76,21,38080.93,High,0
18,Shawn Acosta,MALE,Cancer,Jackson Ltd,Blue Cross,50,15,18439.2,High,0
20,Natasha Pearson,FEMALE,Diabetes,"Rodriguez Glover And Brown,",Aetna,71,20,26625.5,High,0
27,Russell Smith,FEMALE,Diabetes,Inc Harris,Blue Cross,50,7,26338.07,High,0
29,Russell Smith,FEMALE,Hypertension,Plc Smith,Blue Cross,29,12,19011.44,High,0


#### **Encode categoricals**

In [0]:
for col in ['gender','medical_condition','hospital','insurance_provider']:
    df[col] = LabelEncoder().fit_transform(df[col])

X = df[['age','gender','medical_condition','hospital','insurance_provider',
        'billing_amount','stay_duration_days']]
y = df['readmitted_30d']

#### **After Encoding (Numeric Features)**

In [0]:
print("🔹 Encoded Data Sample:")
display(df[['age','gender','medical_condition','hospital','insurance_provider','billing_amount','stay_duration_days','readmitted_30d']].head(10))


🔹 Encoded Data Sample:


age,gender,medical_condition,hospital,insurance_provider,billing_amount,stay_duration_days,readmitted_30d
20,0,4,25766,4,7834.36,28,0
18,0,3,3821,1,34218.36,29,0
49,1,0,12276,0,43262.84,27,0
72,0,2,11098,0,11064.03,25,0
33,1,1,14720,4,42562.35,26,0
76,0,4,9828,2,38080.93,21,0
50,1,2,19136,1,18439.2,15,0
71,0,3,31548,0,26625.5,20,0
50,0,3,18718,1,26338.07,7,0
29,0,4,29679,1,19011.44,12,0


#### **Check Class Distribution (Before Training)**

In [0]:
print("🔹 Class Distribution:")
print(df['readmitted_30d'].value_counts(normalize=True))

🔹 Class Distribution:
readmitted_30d
0    0.999584
1    0.000416
Name: proportion, dtype: float64


#### **Split**

In [0]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

#### **After Train-Test Split**

In [0]:
print("🔹 Train/Test Split Shape:")
print("Train:", X_train.shape, " Test:", X_test.shape)

print("🔹 Training set class distribution:")
print(pd.Series(y_train).value_counts(normalize=True))
print("🔹 Test set class distribution:")
print(pd.Series(y_test).value_counts(normalize=True))

🔹 Train/Test Split Shape:
Train: (40429, 7)  Test: (10108, 7)
🔹 Training set class distribution:
readmitted_30d
0    0.999555
1    0.000445
Name: proportion, dtype: float64
🔹 Test set class distribution:
readmitted_30d
0    0.999703
1    0.000297
Name: proportion, dtype: float64


#### **Scale**

In [0]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test  = scaler.transform(X_test)

#### **Before & After Scaling (See Difference)**

In [0]:
# Convert scaled arrays back to DataFrames for inspection
X_train_scaled_df = pd.DataFrame(X_train, columns=['age','gender','medical_condition','hospital','insurance_provider','billing_amount','stay_duration_days'])
X_test_scaled_df = pd.DataFrame(X_test, columns=['age','gender','medical_condition','hospital','insurance_provider','billing_amount','stay_duration_days'])

print("🔹 Original X sample (before scaling):")
display(df[['age','billing_amount','stay_duration_days']].head(10))

print("🔹 Scaled X_train sample (after scaling):")
display(X_train_scaled_df[['age','billing_amount','stay_duration_days']].head(10))


🔹 Original X sample (before scaling):


age,billing_amount,stay_duration_days
20,7834.36,28
18,34218.36,29
49,43262.84,27
72,11064.03,25
33,42562.35,26
76,38080.93,21
50,18439.2,15
71,26625.5,20
50,26338.07,7
29,19011.44,12


🔹 Scaled X_train sample (after scaling):


age,billing_amount,stay_duration_days
1.7094328520725044,-0.2104187352484808,0.0584696794312924
1.4023646605268807,-0.0869372657563809,-1.095313349526012
1.35118662860261,-0.8324813935577929,-0.6338001379430901
-1.6683172549293566,-0.8789170112023651,-1.095313349526012
1.658254820148234,0.8735384200959443,-0.7491784408388206
-1.617139223005086,0.4723852820987451,1.0968744054928663
-0.542400552595403,0.306254636399024,-1.210691652421742
-0.2353323610497792,-1.493638954172928,-0.6338001379430901
-0.4912225206711323,1.0197519054314723,-0.4030435321516292
-0.8494687441410267,-1.2291909856397596,0.750739496805675


#### **Train**

In [0]:
model = LogisticRegression()
model.fit(X_train,y_train)

#### **After Training (Model Coefficients)**

In [0]:
feature_names = ['age','gender','medical_condition','hospital','insurance_provider','billing_amount','stay_duration_days']
coef_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': model.coef_[0]})
print("🔹 Model Feature Importance:")
display(coef_df.sort_values(by='Coefficient', ascending=False))

🔹 Model Feature Importance:


Feature,Coefficient
medical_condition,0.2581487707247624
hospital,-0.0081429237492057
stay_duration_days,-0.0562765315705914
age,-0.1284421937081026
billing_amount,-0.1560582389556995
insurance_provider,-0.2256314088355228
gender,-0.3194247493662373


#### **Compare Before vs After Scaling Side-by-Side**

In [0]:
comparison = pd.concat([
    df[['age','billing_amount','stay_duration_days']].reset_index(drop=True).head(10),
    X_train_scaled_df[['age','billing_amount','stay_duration_days']].reset_index(drop=True).head(10)
], axis=1)

comparison.columns = ['Age_Original','Billing_Original','Stay_Original','Age_Scaled','Billing_Scaled','Stay_Scaled']
display(comparison)


Age_Original,Billing_Original,Stay_Original,Age_Scaled,Billing_Scaled,Stay_Scaled
20,7834.36,28,1.7094328520725044,-0.2104187352484808,0.0584696794312924
18,34218.36,29,1.4023646605268807,-0.0869372657563809,-1.095313349526012
49,43262.84,27,1.35118662860261,-0.8324813935577929,-0.6338001379430901
72,11064.03,25,-1.6683172549293566,-0.8789170112023651,-1.095313349526012
33,42562.35,26,1.658254820148234,0.8735384200959443,-0.7491784408388206
76,38080.93,21,-1.617139223005086,0.4723852820987451,1.0968744054928663
50,18439.2,15,-0.542400552595403,0.306254636399024,-1.210691652421742
71,26625.5,20,-0.2353323610497792,-1.493638954172928,-0.6338001379430901
50,26338.07,7,-0.4912225206711323,1.0197519054314723,-0.4030435321516292
29,19011.44,12,-0.8494687441410267,-1.2291909856397596,0.750739496805675


#### **Evaluate**

In [0]:
preds = model.predict(X_test)
print(classification_report(y_test,preds))
print("ROC-AUC:", roc_auc_score(y_test,preds))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     10105
           1       0.00      0.00      0.00         3

    accuracy                           1.00     10108
   macro avg       0.50      0.50      0.50     10108
weighted avg       1.00      1.00      1.00     10108

ROC-AUC: 0.5


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
