In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load your dataset
df = pd.read_csv('expanded_hospital_beds_facilities.csv')

# Preview the data
print(df.head())

# Encode categorical features
label_encoder = LabelEncoder()

# Encode the 'Location Type' (Urban/Rural), 'Emergency Department', 'Pediatric Care',
# 'Specialized Department', and 'Ambulance Services' as they are categorical
df['Location Type'] = label_encoder.fit_transform(df['Location Type'])
df['Emergency Department'] = label_encoder.fit_transform(df['Emergency Department'])
df['Pediatric Care'] = label_encoder.fit_transform(df['Pediatric Care'])
df['Specialized Department'] = label_encoder.fit_transform(df['Specialized Department'])
df['Ambulance Services'] = label_encoder.fit_transform(df['Ambulance Services'])

# Feature selection: Dropping the 'Hospital Name' as it's not relevant for the model
X = df.drop(columns=["Beds Available", "Hospital Name"])

# Target: Let's assume we want to predict if more than 50 beds are available
y = df["Beds Available"] > 50  # This will return True if beds > 50, otherwise False

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Preview the processed data
print(X_train.head())
print(y_train.head())


  Hospital Name  Beds Available  Ventilators Available  ICU Available  \
0    Hospital 1             496                     68             52   
1    Hospital 2             254                     10             85   
2    Hospital 3              45                     68             67   
3    Hospital 4             218                     56             49   
4    Hospital 5             488                     53             14   

  Location Type Emergency Department Pediatric Care Specialized Department  \
0         Urban                  Yes             No               Oncology   
1         Urban                  Yes             No            Orthopedics   
2         Rural                   No            Yes              Neurology   
3         Urban                  Yes            Yes                General   
4         Urban                   No             No            Orthopedics   

  Ambulance Services  Year of Establishment  
0                Yes                   2012  


In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import joblib

# Load your preprocessed data (from the previous preprocessing step)
# X_train, X_test, y_train, y_test are already defined

# Train the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions
y_pred = rf_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

# Save the model using joblib
joblib.dump(rf_model, 'hospital_bed_availability_model.pkl')
print("Model saved as 'hospital_bed_availability_model.pkl'")


Model Accuracy: 94.44%
Model saved as 'hospital_bed_availability_model.pkl'


In [3]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create a Random Forest model
rf = RandomForestClassifier(random_state=42)

# Use GridSearchCV to find the best parameters
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Get the best parameters and best model
best_params = grid_search.best_params_
best_rf_model = grid_search.best_estimator_

print(f"Best Parameters: {best_params}")

# Evaluate the model
y_pred = best_rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy with Best Parameters: {accuracy * 100:.2f}%")

# Save the tuned model
joblib.dump(best_rf_model, 'best_hospital_bed_availability_model.pkl')


Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Model Accuracy with Best Parameters: 94.44%


['best_hospital_bed_availability_model.pkl']

In [4]:
from sklearn.ensemble import GradientBoostingClassifier

# Train a Gradient Boosting model
gb_model = GradientBoostingClassifier()
gb_model.fit(X_train, y_train)

# Evaluate the model
y_pred = gb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Gradient Boosting Model Accuracy: {accuracy * 100:.2f}%")

# Save the model
joblib.dump(gb_model, 'gb_hospital_bed_availability_model.pkl')


Gradient Boosting Model Accuracy: 94.37%


['gb_hospital_bed_availability_model.pkl']

In [5]:
from sklearn.model_selection import cross_val_score

# Cross-validation scores
cv_scores = cross_val_score(rf, X, y, cv=5)
print(f"Cross-Validation Scores: {cv_scores}")
print(f"Mean CV Score: {cv_scores.mean() * 100:.2f}%")


Cross-Validation Scores: [0.93222222 0.93222222 0.93333333 0.93222222 0.93333333]
Mean CV Score: 93.27%
