#### **Notebook: Regression_StayDuration_Model.ipynb**
#### **Goal: Predict hospital stay duration**
#### **Input Table: patient_risk_prediction.ml.stay_duration**

In [0]:
from math import sqrt

import numpy as np
import pandas as pd
from pyspark.sql import SparkSession
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

#### **Load Delta table into pandas DataFrame**

In [0]:
# Reuse the active Spark session (or start one), look up the source table,
# and convert it to a pandas DataFrame for the scikit-learn steps below.
spark = SparkSession.builder.getOrCreate()
stay_duration_sdf = spark.table("patient_risk_prediction.ml.stay_duration")
df = stay_duration_sdf.toPandas()

#### **Before Encoding (Raw Data Sample)**

In [0]:
# Report the (rows, columns) shape, then show the first five raw rows.
# NOTE(review): the preview includes the patient_name column — confirm it is
# acceptable to keep that in saved notebook output before sharing.
loaded_shape = df.shape
print("✅ Loaded data:", loaded_shape)
display(df.head(n=5))

✅ Loaded data: (55500, 12)


patient_name,gender,age,blood_type,medical_condition,hospital,insurance_provider,billing_amount,medication,test_results,risk_level,target_stay_duration
Bobby Jackson,MALE,30,B-,Cancer,Sons And Miller,Blue Cross,18856.281305978155,Paracetamol,Normal,High,2
Leslie Terry,MALE,62,A+,Obesity,Kim Inc,Medicare,33643.327286577885,Ibuprofen,Inconclusive,Normal,6
Danny Smith,FEMALE,76,A-,Obesity,Cook Plc,Aetna,27955.096078842456,Aspirin,Normal,Normal,15
Andrew Watts,FEMALE,28,O+,Diabetes,"Hernandez Rogers And Vang,",Medicare,37909.78240987528,Ibuprofen,Abnormal,High,30
Adrienne Bell,FEMALE,43,AB+,Cancer,White-white,Aetna,14238.317813937623,Penicillin,Abnormal,High,20


#### **Encode categorical variables**

In [0]:
# Integer-encode each categorical column in place, one LabelEncoder per column.
# Values are cast to str first, so every entry is encoded via its string form.
#
# Columns that are already numeric are skipped. Without this guard, re-running
# the cell would cast the integer codes back to strings and re-encode them in
# lexicographic order ("10" sorts before "2"), silently permuting the codes of
# any column with more than ten classes.
#
# NOTE(review): 'risk_level' is encoded here but is not among the columns
# selected as model features further down — confirm whether that is intended.
categorical_cols = ['gender', 'medical_condition', 'hospital', 'insurance_provider', 'risk_level']
for col in categorical_cols:
    if pd.api.types.is_numeric_dtype(df[col]):
        continue  # already encoded by a previous run of this cell
    df[col] = LabelEncoder().fit_transform(df[col].astype(str))

#### **Drop nulls in target**

In [0]:
# Keep only the rows that have a target value; rows whose
# target_stay_duration is missing are discarded.
has_target = df['target_stay_duration'].notna()
df = df[has_target]

#### **Define features and target**

In [0]:
# Columns used as model inputs, in the order they are passed to the model.
feature_cols = [
    'age',
    'gender',
    'medical_condition',
    'hospital',
    'insurance_provider',
    'billing_amount',
]
X = df[feature_cols]

# Regression target.
y = df['target_stay_duration']

#### **Display before split**

In [0]:
# Show the first three feature rows side by side with their target values.
print("\n🔹 Sample before split:")
feature_preview = X.head(3)
target_preview = y.head(3)
display(pd.concat([feature_preview, target_preview], axis=1))


🔹 Sample before split:


age,gender,medical_condition,hospital,insurance_provider,billing_amount,target_stay_duration
30,1,2,34469,1,18856.281305978155,2
62,1,5,20829,3,33643.327286577885,6
76,0,5,10300,0,27955.096078842456,15


#### **Train-Test Split**

In [0]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

#### **Standardize all features (scaler fit on the training split only)**

In [0]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits so no test-set information is used.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

#### **Train Regression Model**

In [0]:
# Linear regression with default settings, fitted on the scaled training data.
# The fit call is left as the cell's last expression so the estimator is
# still rendered as the cell output.
model = LinearRegression()
model.fit(X=X_train_scaled, y=y_train)

#### **Evaluate performance**

In [0]:
# Score the fitted model on the held-out split.
# `sqrt` and the sklearn metric functions come from the notebook's import
# cell, so they are not re-imported here.
preds = model.predict(X_test_scaled)

mae = mean_absolute_error(y_test, preds)  # mean absolute error
mse = mean_squared_error(y_test, preds)   # mean squared error
rmse = sqrt(mse)                          # root of the MSE
r2 = r2_score(y_test, preds)              # coefficient of determination

In [0]:
# Print the first ten predictions followed by the error metrics.
# `np` comes from the notebook's import cell.
# Reading the result: an R² at or below zero means the model explains no more
# variance than always predicting the mean of the target, which is consistent
# with the sample predictions all falling in a very narrow band.
print("\n================ MODEL PERFORMANCE ================")
print("Sample Predictions:", np.round(preds[:10], 2))  # show first 10 predicted values
print(f"MAE  : {mae:.3f}")
print(f"MSE  : {mse:.3f}")
print(f"RMSE : {rmse:.3f}")
print(f"R²   : {r2:.5f}")


Sample Predictions: [15.61 15.34 15.46 15.41 15.43 15.61 15.46 15.44 15.8  15.72]
MAE  : 7.480
MSE  : 74.464
RMSE : 8.629
R²   : -0.00015


#### **Compare predicted vs. actual stay duration**

In [0]:
# Build a comparison table: the unscaled test features plus the true target
# and the model's prediction rounded to one decimal place. `assign` returns a
# new frame, so X_test itself is left untouched.
pred_df = X_test.assign(**{
    'Actual Stay': y_test.values,
    'Predicted Stay': preds.round(1),
})
print("\n🔹 Predicted vs Actual sample:")
display(pred_df.head(n=10))


🔹 Predicted vs Actual sample:


age,gender,medical_condition,hospital,insurance_provider,billing_amount,Actual Stay,Predicted Stay
57,1,3,36257,3,3616.898449997269,17,15.6
51,0,3,38509,0,36970.07548148127,7,15.3
20,1,1,7943,1,44393.001347688056,29,15.5
74,0,5,39061,0,27554.923707732403,7,15.4
56,1,5,39205,0,27466.31856706151,6,15.4
46,1,2,31484,3,4802.620713726476,28,15.6
44,0,1,13453,1,40207.46321828153,27,15.5
46,1,3,38045,3,30065.260597832428,26,15.4
64,0,0,617,2,392.9135481332535,12,15.8
68,1,2,10177,3,20886.342148398853,30,15.7
