<a href="https://colab.research.google.com/github/Anubis-1001/Colab-notebooks/blob/main/Hospital_Beds_Management_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler

import kagglehub
import pandas as pd
import datetime
import seaborn as sns
import matplotlib.pyplot as plt

In [10]:
path = kagglehub.dataset_download("jaderz/hospital-beds-management")

print("Path to dataset files:", path)

Using Colab cache for faster access to the 'hospital-beds-management' dataset.
Path to dataset files: /kaggle/input/hospital-beds-management


In [11]:
patients = pd.read_csv(f'{path}/patients.csv')
services_weekly = pd.read_csv(f'{path}/services_weekly.csv')
staff = pd.read_csv(f'{path}/staff.csv')
staff_schedule = pd.read_csv(f'{path}/staff_schedule.csv')

In [12]:
def get_week_of_year(dt):
    return dt.isocalendar().week

def get_week_data(patient, feature):
    wa = patient['week_arrival']
    srv = patient['service']
    match = services_weekly[
        (services_weekly['week'] == wa) &
        (services_weekly['service'] == srv)
    ][feature]
    return match.iloc[0] if not match.empty else None

patients['dt_arrival'] = pd.to_datetime(patients['arrival_date'])
patients['dt_departure'] = pd.to_datetime(patients['departure_date'])
patients['week_arrival'] = patients['dt_arrival'].apply(get_week_of_year)
patients['week_departure'] = patients['dt_departure'].apply(get_week_of_year)

weekly_services_attr = ['staff_morale', 'patient_satisfaction', 'event']

for feature in weekly_services_attr:
  patients[f'pat_{feature}'] = patients.apply(lambda pat: get_week_data(pat, feature), axis=1)


patients['stay_duration'] = ( pd.to_datetime(patients['departure_date'], format="%Y-%m-%d") - pd.to_datetime(patients['arrival_date'], format="%Y-%m-%d" ) ).dt.days

patients['has_ICU'] = patients['service'] == 'ICU'

ohe = OneHotEncoder(sparse_output=False)

patients_encoded = ohe.fit_transform(patients[['pat_event']])

pat_encoded_df = pd.DataFrame(patients_encoded, columns=ohe.get_feature_names_out(), index=patients.index)

patients = pd.concat([patients, pat_encoded_df], axis=1)

In [13]:
# X = patients.drop(columns=['satisfaction', 'patient_id', 'name', 'arrival_date', 'departure_date', 'service', 'dt_departure', 'dt_arrival', 'pat_event'])
X = patients.drop(columns=['satisfaction', 'patient_id', 'name', 'arrival_date', 'departure_date', 'service', 'dt_departure', 'dt_arrival', 'pat_event', 'week_arrival', 'week_departure'])
# X = patients[[ 'age', 'stay_duration', 'has_ICU', 'pat_event_donation' ]]
y = patients['satisfaction']
y_strata = pd.qcut(y, q=4, labels=False)

std_scaler = StandardScaler()

X_train, X_test, y_train,  y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y_strata)
X_train = std_scaler.fit_transform(X_train)
X_test = std_scaler.transform(X_test)

In [14]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression
import numpy as np

linear_reg = LinearRegression()

linear_reg.fit(X, y)
y_pred = linear_reg.predict(X_test)

r2 = r2_score(y_test, y_pred)
mean_absolute_error = mean_absolute_error(y_test, y_pred)
mean_squared_error = mean_squared_error(y_test, y_pred)

print(f"r2: {r2:.2f}")
print(f"mean_absolute_error: {mean_absolute_error:.2f}")
print(f"mean_squared_error: {mean_squared_error:.2f}")

r2: -0.03
mean_absolute_error: 9.62
mean_squared_error: 131.32




In [15]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

ridge = Ridge()

param_grid = {
    'alpha': [ 0.01, 0.1, 1, 10, 100  ],
    'solver': ['auto', 'svd', 'cholesky'],
     'fit_intercept': [True, False]
}

grid_search = GridSearchCV(
      estimator=ridge,
      param_grid=param_grid,
      scoring='neg_mean_squared_error',
      cv=5,
      n_jobs=-1,
      verbose=1
)


grid_search.fit(X_train, y_train)

print("Ridge")
print("="*50)
print(grid_search.best_params_)
print(grid_search.best_score_)
print("MAE")
pred = grid_search.best_estimator_.predict(X_test)

print(mean_)


dt = DecisionTreeRegressor(random_state=42)



param_grid = {
    'criterion': ['squared_error', 'friedman_mse', 'absolute_error'],
    'splitter': ['best', 'random'],
    'max_depth': [None, 3, 5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}


grid_search = GridSearchCV(
      estimator=dt,
      param_grid=param_grid,
      scoring='neg_mean_squared_error',
      cv=5,
      n_jobs=-1,
      verbose=1
)

grid_search.fit(X_train, y_train)

print("DT")
print(grid_search.best_params_)
print(grid_search.best_score_)
print("Feature importances")
print(std_scaler.feature_names_in_)
print(grid_search.best_estimator_.feature_importances_)

lasso = Lasso()

param_grid = {
    'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10],
    'fit_intercept': [True, False],
    'max_iter': [1000, 5000, 10000]
}

grid_search = GridSearchCV(
      estimator=lasso,
      param_grid=param_grid,
      scoring='neg_mean_squared_error',
      cv=5,
      n_jobs=-1,
      verbose=1
)

grid_search.fit(X_train, y_train)

print("Lasso")
print(grid_search.best_params_)
print(grid_search.best_score_)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
Ridge
{'alpha': 100, 'fit_intercept': True, 'solver': 'auto'}
-135.7440560638974
MAE


NameError: name 'mean_' is not defined

In [None]:
from sklearn.metrics import mean_absolute_error

best_dt = grid_search.best_estimator_
pred = best_dt.predict(X_test)

mean_error = mean_absolute_error(y_test, pred)

print(f"mean absolute error: {mean_error}")

In [None]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor()

param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [None, 10, 20, 30],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "bootstrap": [True, False]
}

gs_rf = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring="neg_mean_squared_error",
    cv=5,
    n_jobs=-1,
    verbose=1
)

gs_rf.fit(X_train, y_train)

print("Best params:", gs_rf.best_params_)
print("Best score:", gs_rf.best_score_)

rf_pred=gs_rf.best_estimator_.predict(X_test)
score=mean_absolute_error(y_test, rf_pred)

print(f"Mean Score Value {score}")