In [2]:
# Load Library
import os
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

In [3]:
# Load datasets
# Directory that holds both CSV files. It defaults to the folder this notebook
# was originally written against; set the READMISSION_DATA_DIR environment
# variable to point somewhere else without editing the notebook.
DATA_DIR = Path(
    os.environ.get("READMISSION_DATA_DIR", r"C:\Users\HP\Downloads\Dataset_Predicting Hospital")
)

mimic_df = pd.read_csv(DATA_DIR / "mimic_iii_data.csv")
diabetes_df = pd.read_csv(DATA_DIR / "diabetic_data.csv")

# Merge datasets on patient ID. The inner join keeps only patients present in
# both files; Patient_ID is dropped afterwards because it equals patient_nbr
# on every joined row. Finally, cells marked '?' are turned into real NaNs so
# the imputation step can see them as missing.
merged_df = (
    pd.merge(diabetes_df, mimic_df, left_on="patient_nbr", right_on="Patient_ID", how="inner")
    .drop(columns=["Patient_ID"])
    .replace("?", np.nan)
)

In [6]:
# Impute missing numerical values with median
num_cols = merged_df.select_dtypes(include=[np.number]).columns
num_imputer = SimpleImputer(strategy='median')
merged_df[num_cols] = num_imputer.fit_transform(merged_df[num_cols])

# Impute missing categorical values with mode
cat_cols = merged_df.select_dtypes(include=['object']).columns
for col in cat_cols:
    col_modes = merged_df[col].mode()
    if col_modes.empty:
        # Every value in this column is missing, so there is no mode to fill with.
        continue
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # merged_df[col]: the in-place form operates on a temporary selection and
    # does not reliably write through to merged_df (it is a no-op under
    # pandas Copy-on-Write).
    merged_df[col] = merged_df[col].fillna(col_modes.iloc[0])

In [8]:
# Encode Categorical Variables
# The target column is deliberately left out of label encoding. It is
# binarised later from its original string labels ('>30', '<30', 'NO');
# turning it into integer codes here would make that string-based mapping
# match nothing and leave a 3-class target behind.
TARGET_COL = "readmitted"

label_encoders = {}
for col in cat_cols:
    if col == TARGET_COL:
        continue
    le = LabelEncoder()
    merged_df[col] = le.fit_transform(merged_df[col])
    # Keep the fitted encoder so the integer codes can be mapped back to labels.
    label_encoders[col] = le

In [10]:
# Define Features & Target Variable
TARGET_COL = "readmitted"
# '>30' and '<30' → 1 (readmitted), 'NO' → 0 (not readmitted)
READMIT_MAP = {">30": 1, "<30": 1, "NO": 0}

target = merged_df[TARGET_COL]

# If the categorical-encoding step also label-encoded the target, the column
# now holds integer codes and the string keys in READMIT_MAP would match
# nothing. Decode it back to the original labels first. The encoder is removed
# from label_encoders so that re-running this cell does not decode the
# already-binarised column a second time.
target_encoder = label_encoders.pop(TARGET_COL, None)
if target_encoder is not None:
    target = pd.Series(
        target_encoder.inverse_transform(target.astype(int)),
        index=target.index,
        name=TARGET_COL,
    )

# Map labels to 0/1 unless the column is already binary (e.g. cell re-run).
if not target.isin(set(READMIT_MAP.values())).all():
    mapped = target.map(READMIT_MAP)
    unexpected = target[mapped.isna()].unique()
    if len(unexpected) > 0:
        raise ValueError(f"Unexpected values in '{TARGET_COL}': {list(unexpected)}")
    target = mapped
merged_df[TARGET_COL] = target.astype(int)

# Define features and target
# NOTE(review): X keeps every remaining column, including the merge key
# patient_nbr — confirm identifier columns are meant to be model features.
X = merged_df.drop(columns=[TARGET_COL])
y = merged_df[TARGET_COL]

# Ensure dataset is large enough before splitting
if X.shape[0] < 50:
    raise ValueError(f"Insufficient data for training. Only {X.shape[0]} samples available.")

# Train-test split (80% train, 20% test, stratified)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [12]:
# Standardize the feature columns.
# The scaler learns its statistics from the training split only and the same
# fitted transform is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Define models
# Every estimator with a stochastic component gets a fixed seed so the
# reported scores are reproducible across runs.
models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    # probability=True adds an internally cross-validated calibration step that
    # shuffles the data, so it needs its own seed. It also makes fitting
    # noticeably slower.
    "SVM": SVC(kernel='linear', probability=True, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=5)
}

# Train and evaluate each model
# results maps model name -> {"Train Accuracy": float, "Test Accuracy": float}
results = {}

for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train, y_train)  # Train the model

    # Predict on the test split once and reuse it for every test-side metric
    y_test_pred = model.predict(X_test)

    # Accuracy on both splits; a large train/test gap points to overfitting
    train_accuracy = accuracy_score(y_train, model.predict(X_train))
    test_accuracy = accuracy_score(y_test, y_test_pred)

    # Store results
    results[name] = {"Train Accuracy": train_accuracy, "Test Accuracy": test_accuracy}

    # Print results
    print(f"{name} Train Accuracy: {train_accuracy:.4f}")
    print(f"{name} Test Accuracy: {test_accuracy:.4f}")
    print(classification_report(y_test, y_test_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred))
    print("-" * 50)

In [None]:
# Convert results dictionary to DataFrame (one row per model), best test
# accuracy first
results_df = pd.DataFrame(results).T.sort_values(by="Test Accuracy", ascending=False)
print(results_df)

# Bar plot of model performance
# Draw on an explicitly created Axes: calling plt.figure() and then
# DataFrame.plot(figsize=...) opens two figures and leaves the first one blank.
fig, ax = plt.subplots(figsize=(10, 5))
results_df.plot(kind="bar", ax=ax, colormap="viridis", rot=45)
ax.set_ylabel("Accuracy")
ax.set_title("Model Comparison (Train & Test Accuracy)")
ax.set_ylim(0, 1)  # accuracy is a fraction, so show the full 0-1 range
plt.show()

In [None]:
# Histograms with a KDE overlay for two columns of merged_df.
# Each entry is (column, figure title, x-axis label).
histogram_specs = [
    ("time_in_hospital", "Distribution of Time in Hospital", "Days"),
    ("num_medications", "Distribution of Number of Medications", "Number of Medications"),
]
for column, title, xlabel in histogram_specs:
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.histplot(merged_df[column], bins=20, kde=True, ax=ax)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel("Count")
    plt.show()

# Annotated heatmap of the pairwise correlations between all merged_df columns
fig, ax = plt.subplots(figsize=(12, 6))
correlations = merged_df.corr()
sns.heatmap(correlations, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5, ax=ax)
ax.set_title("Correlation Matrix - Diabetes Data")
plt.show()

# Number of rows per value of the readmitted column
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(x='readmitted', data=merged_df, palette="pastel", ax=ax)
ax.set_title("Readmission Counts")
ax.set_xlabel("Readmitted (0 = No, 1 = Yes)")
ax.set_ylabel("Count")
plt.show()

# Distribution of time_in_hospital for each value of readmitted
fig, ax = plt.subplots(figsize=(10, 5))
sns.violinplot(x='readmitted', y='time_in_hospital', data=merged_df, palette="muted", ax=ax)
ax.set_title("Time in Hospital by Readmission Status")
ax.set_xlabel("Readmitted (0 = No, 1 = Yes)")
ax.set_ylabel("Time in Hospital (days)")
plt.show()