In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_fscore_support
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler, Normalizer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB
import warnings
from sklearn.metrics import classification_report
from sklearn.ensemble import VotingClassifier, AdaBoostClassifier
import logging

warnings.simplefilter('ignore')

# Importing the Dataset

In [2]:
df_train = pd.read_csv("D:/mission_2025/python_learning/av_healthcare\data/train_data.csv")
df_test = pd.read_csv("D:/mission_2025/python_learning/av_healthcare\data/test_data.csv")

In [3]:
df_train.head()

Unnamed: 0,case_id,Hospital_code,Hospital_type_code,City_Code_Hospital,Hospital_region_code,Available Extra Rooms in Hospital,Department,Ward_Type,Ward_Facility_Code,Bed Grade,patientid,City_Code_Patient,Type of Admission,Severity of Illness,Visitors with Patient,Age,Admission_Deposit,Stay
0,1,8,c,3,Z,3,radiotherapy,R,F,2.0,31397,7.0,Emergency,Extreme,2,51-60,4911.0,0-10
1,2,2,c,5,Z,2,radiotherapy,S,F,2.0,31397,7.0,Trauma,Extreme,2,51-60,5954.0,41-50
2,3,10,e,1,X,2,anesthesia,S,E,2.0,31397,7.0,Trauma,Extreme,2,51-60,4745.0,31-40
3,4,26,b,2,Y,2,radiotherapy,R,D,2.0,31397,7.0,Trauma,Extreme,2,51-60,7272.0,41-50
4,5,26,b,2,Y,2,radiotherapy,S,D,2.0,31397,7.0,Trauma,Extreme,2,51-60,5558.0,41-50


In [5]:
print("Training Dataset Info:")
df_train.info()

Training Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 318438 entries, 0 to 318437
Data columns (total 18 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   case_id                            318438 non-null  int64  
 1   Hospital_code                      318438 non-null  int64  
 2   Hospital_type_code                 318438 non-null  object 
 3   City_Code_Hospital                 318438 non-null  int64  
 4   Hospital_region_code               318438 non-null  object 
 5   Available Extra Rooms in Hospital  318438 non-null  int64  
 6   Department                         318438 non-null  object 
 7   Ward_Type                          318438 non-null  object 
 8   Ward_Facility_Code                 318438 non-null  object 
 9   Bed Grade                          318325 non-null  float64
 10  patientid                          318438 non-null  int64  
 11  City_Code_Patien

In [6]:
print("Testing Dataset Info: ")
print(df_test.info())

Testing Dataset Info: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 137057 entries, 0 to 137056
Data columns (total 17 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   case_id                            137057 non-null  int64  
 1   Hospital_code                      137057 non-null  int64  
 2   Hospital_type_code                 137057 non-null  object 
 3   City_Code_Hospital                 137057 non-null  int64  
 4   Hospital_region_code               137057 non-null  object 
 5   Available Extra Rooms in Hospital  137057 non-null  int64  
 6   Department                         137057 non-null  object 
 7   Ward_Type                          137057 non-null  object 
 8   Ward_Facility_Code                 137057 non-null  object 
 9   Bed Grade                          137022 non-null  float64
 10  patientid                          137057 non-null  int64  
 11  City_Code_Patien

In [7]:
df_train.describe()

Unnamed: 0,case_id,Hospital_code,City_Code_Hospital,Available Extra Rooms in Hospital,Bed Grade,patientid,City_Code_Patient,Visitors with Patient,Admission_Deposit
count,318438.0,318438.0,318438.0,318438.0,318325.0,318438.0,313906.0,318438.0,318438.0
mean,159219.5,18.318841,4.771717,3.197627,2.625807,65747.579472,7.251859,3.284099,4880.749392
std,91925.276847,8.633755,3.102535,1.168171,0.873146,37979.93644,4.745266,1.764061,1086.776254
min,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1800.0
25%,79610.25,11.0,2.0,2.0,2.0,32847.0,4.0,2.0,4186.0
50%,159219.5,19.0,5.0,3.0,3.0,65724.5,8.0,3.0,4741.0
75%,238828.75,26.0,7.0,4.0,3.0,98470.0,8.0,4.0,5409.0
max,318438.0,32.0,13.0,24.0,4.0,131624.0,38.0,32.0,11008.0


In [8]:
print("Training Dataset Columns")
print(df_train.columns)

print("\n")

print("Testing Dataset Columns")
print(df_test.columns)

Training Dataset Columns
Index(['case_id', 'Hospital_code', 'Hospital_type_code', 'City_Code_Hospital',
       'Hospital_region_code', 'Available Extra Rooms in Hospital',
       'Department', 'Ward_Type', 'Ward_Facility_Code', 'Bed Grade',
       'patientid', 'City_Code_Patient', 'Type of Admission',
       'Severity of Illness', 'Visitors with Patient', 'Age',
       'Admission_Deposit', 'Stay'],
      dtype='object')


Testing Dataset Columns
Index(['case_id', 'Hospital_code', 'Hospital_type_code', 'City_Code_Hospital',
       'Hospital_region_code', 'Available Extra Rooms in Hospital',
       'Department', 'Ward_Type', 'Ward_Facility_Code', 'Bed Grade',
       'patientid', 'City_Code_Patient', 'Type of Admission',
       'Severity of Illness', 'Visitors with Patient', 'Age',
       'Admission_Deposit'],
      dtype='object')


# Preprocessing the dataset

In [9]:
y = df_train['Severity of Illness']

In [26]:
input_columns = ['Hospital_code', 'Hospital_type_code', 'City_Code_Hospital',
       'Hospital_region_code', 'Available Extra Rooms in Hospital',
       'Department', 'Ward_Type', 'Ward_Facility_Code', 'City_Code_Patient', 'Type of Admission',
       'Bed Grade', 'Visitors with Patient', 'Age',
       'Admission_Deposit']
x = df_train[input_columns]

In [27]:
x.head()

Unnamed: 0,Hospital_code,Hospital_type_code,City_Code_Hospital,Hospital_region_code,Available Extra Rooms in Hospital,Department,Ward_Type,Ward_Facility_Code,City_Code_Patient,Type of Admission,Bed Grade,Visitors with Patient,Age,Admission_Deposit
0,8,c,3,Z,3,radiotherapy,R,F,7.0,Emergency,2.0,2,51-60,4911.0
1,2,c,5,Z,2,radiotherapy,S,F,7.0,Trauma,2.0,2,51-60,5954.0
2,10,e,1,X,2,anesthesia,S,E,7.0,Trauma,2.0,2,51-60,4745.0
3,26,b,2,Y,2,radiotherapy,R,D,7.0,Trauma,2.0,2,51-60,7272.0
4,26,b,2,Y,2,radiotherapy,S,D,7.0,Trauma,2.0,2,51-60,5558.0


In [29]:
le = LabelEncoder()
categorical_columns = ["Department", "Type of Admission", "Hospital_type_code", "Age", "Hospital_region_code", "Ward_Facility_Code", "Ward_Type"]

for column in categorical_columns:
    try:
        x[column] = le.fit_transform(x[column])
    except Exception as e:
        logging.error(f"Error occurred while label encoding {column}: {e}")

try:
    y = le.fit_transform(pd.DataFrame(y))
except Exception as e:
    logging.error(f"Error occurred while label encoding y: {e}")

In [30]:
# preprocessor pipelines
pre_processor_inputs = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
    ("normalizer", Normalizer(norm="l1"))
])

In [31]:
# preprocess the input and outputs
x_preprocessed = pre_processor_inputs.fit_transform(x)

In [32]:
x_preprocessed

array([[-0.11714961,  0.0474331 , -0.05597412, ..., -0.0713501 ,
         0.04524545,  0.00272837],
       [-0.14956732,  0.03829296,  0.00582243, ..., -0.05760127,
         0.03652687,  0.07814625],
       [-0.0809008 ,  0.14997914, -0.10207328, ..., -0.06111877,
         0.03875743, -0.01048787],
       ...,
       [-0.12601341, -0.07866565, -0.02390869, ..., -0.01548   ,
         0.14610678, -0.05711345],
       [-0.08571924, -0.01691183, -0.09033746, ...,  0.09835903,
        -0.16738216, -0.1041877 ],
       [ 0.00794221, -0.08238786,  0.07230132, ..., -0.07327866,
        -0.16663493, -0.01192608]])

# Training the model

In [36]:
# Bagging Model
rf = RandomForestClassifier(
    n_estimators=50,
    criterion="entropy",
    max_depth=12
)

# Boosting Model
dt = DecisionTreeClassifier(
    criterion = "gini",
    splitter = "best",
    max_depth= None,
    min_samples_split= 2,
    min_samples_leaf = 1,
    min_weight_fraction_leaf = 0,
    max_features = 'sqrt',
    random_state = 42
)
ad = AdaBoostClassifier(dt, n_estimators=50, learning_rate=1.0, random_state=42)

# Probabilistic Model
gnb = GaussianNB()

# Voting Model
main_model = VotingClassifier([
    ("rf", rf),
    ("ad", ad),
    ("gn", gnb)
], voting="hard")

In [37]:
main_model.fit(x_preprocessed, y)

# Testing the Model

In [39]:
test_data = pd.read_csv("D:/mission_2025/python_learning/av_healthcare/data/test_data.csv")

In [42]:
test_input_columns = ['Hospital_code', 'Hospital_type_code', 'City_Code_Hospital',
       'Hospital_region_code', 'Available Extra Rooms in Hospital',
       'Department', 'Ward_Type', 'Ward_Facility_Code', 'City_Code_Patient', 'Type of Admission',
       'Bed Grade', 'Visitors with Patient', 'Age',
       'Admission_Deposit']
x_test = test_data[test_input_columns]
y_test = test_data['Severity of Illness']

In [44]:
test_categorical_columns = ["Department", "Type of Admission", "Hospital_type_code", "Age", "Hospital_region_code", "Ward_Facility_Code","Ward_Type"]

for column in test_categorical_columns:
    x_test[column] = le.fit_transform(x_test[column])

In [45]:
x_test_normalized = pre_processor_inputs.transform(x_test)

In [46]:
y_test = le.fit_transform(y_test)

In [47]:
y_pred = main_model.predict(x_test_normalized)

In [48]:
acc = accuracy_score(y_test, y_pred)
conf = confusion_matrix(y_test, y_pred)
scored = precision_recall_fscore_support(y_test, y_pred)

In [49]:
def print_results(model, acc, conf, scores):
    return f"""
        For the Model {model},

        Accuracy Score: {acc * 100}%
        
        confusion matrix : {conf}

        scores : {scores}
            """

In [50]:
print(print_results("Decision Tree", acc, conf, scored))


        For the Model Decision Tree,

        Accuracy Score: 58.30785731483981%
        
        confusion matrix : [[ 3782  1210 19480]
 [  237  9001 27625]
 [ 1731  6859 67132]]

        scores : (array([0.65773913, 0.52729936, 0.58765549]), array([0.15454397, 0.24417438, 0.88655873]), array([0.25028125, 0.33378451, 0.70680515]), array([24472, 36863, 75722], dtype=int64))
            


Result: the Hard voting sees to give bad results, so implementing soft voting is a better choice for probabilistic model

In [51]:
import joblib

joblib.dump(main_model, "../models/ensemble_model_hard_voted.joblib")

['../models/ensemble_model_hard_voted.joblib']

In [52]:
gnb.fit(x_preprocessed, y)

In [53]:
y_pred_nb = gnb.predict(x_test_normalized)

In [54]:
acc = accuracy_score(y_test, y_pred_nb)
conf = confusion_matrix(y_test, y_pred_nb)
scored = precision_recall_fscore_support(y_test, y_pred_nb)

In [55]:
print(print_results("Gaussian Naive Bayes Classifier", acc, conf, scored))


        For the Model Decision Tree,

        Accuracy Score: 56.51371327258003%
        
        confusion matrix : [[ 1459  1472 21541]
 [  141  8505 28217]
 [ 1012  7218 67492]]

        scores : (array([0.5585758 , 0.49462053, 0.57562473]), array([0.05961916, 0.23071915, 0.89131296]), array([0.10773889, 0.31466203, 0.69950045]), array([24472, 36863, 75722], dtype=int64))
            


Result: Gaussian NB brings down the accuracy of the model, it must not be used in the deployment

In [56]:
ad.fit(x_preprocessed, y)

In [57]:
y_pred_ad = ad.predict(x_test_normalized)

In [58]:
acc = accuracy_score(y_test, y_pred_ad)
conf = confusion_matrix(y_test, y_pred_ad)
scored = precision_recall_fscore_support(y_test, y_pred_ad)

In [59]:
print(print_results("Adaboost", acc, conf, scored))


        For the Model Adaboost,

        Accuracy Score: 54.642958768979334%
        
        confusion matrix : [[ 4994  3023 16455]
 [ 1242 13044 22577]
 [ 5069 13799 56854]]

        scores : (array([0.44175144, 0.43675082, 0.59293327]), array([0.20406996, 0.35385074, 0.75082539]), array([0.27917377, 0.39095446, 0.66260314]), array([24472, 36863, 75722], dtype=int64))
            


Result: ML Models Fails to Bring Proper Accuracy

# Compressing the Model

In [3]:
import joblib

In [4]:
main_model_loaded = joblib.load("../models/ensemble_model_hard_voted.joblib")

In [5]:
for estimators in main_model_loaded.estimators_:
    if hasattr(estimators, "n_estimators") and estimators.n_estimators > 100:
        estimators.n_estimators = 100

In [6]:
for estimators in main_model_loaded.estimators_:
    if hasattr(estimators, "max_depth") and estimators.max_depth is None:
        estimators.max_depth = 10

In [7]:
for estimators in main_model_loaded.estimators_:
    if hasattr(estimators, "oob_decision_function_"):
        del estimators.oob_decision_function_ 

for estimators in main_model_loaded.estimators_:
    if hasattr(estimators, "feature_importance_"):
        del estimators.feature_importance_

In [8]:
joblib.dump(
    main_model_loaded, 
    "../models/ensemble_model_voting_hard_compressed.joblib",
    compress=9
)

['../models/ensemble_model_voting_hard_compressed.joblib']

In [67]:
report = classification_report(y_test, y_pred, output_dict=True)
print(report)

{'0': {'precision': 0.6577391304347826, 'recall': 0.15454396861719516, 'f1-score': 0.2502812520680299, 'support': 24472.0}, '1': {'precision': 0.5272993555946104, 'recall': 0.2441743753899574, 'f1-score': 0.3337845104110656, 'support': 36863.0}, '2': {'precision': 0.5876554881518247, 'recall': 0.8865587279786588, 'f1-score': 0.7068051526908438, 'support': 75722.0}, 'accuracy': 0.5830785731483981, 'macro avg': {'precision': 0.5908979913937392, 'recall': 0.4284256906619371, 'f1-score': 0.43029030505664645, 'support': 137057.0}, 'weighted avg': {'precision': 0.5839357130180625, 'recall': 0.5830785731483981, 'f1-score': 0.5249631976473147, 'support': 137057.0}}


In [68]:
params = main_model.get_params()
print(params)

{'estimators': [('rf', RandomForestClassifier(criterion='entropy', max_depth=12, n_estimators=50)), ('ad', AdaBoostClassifier(estimator=DecisionTreeClassifier(max_features='sqrt',
                                                    min_weight_fraction_leaf=0,
                                                    random_state=42),
                   random_state=42)), ('gn', GaussianNB())], 'flatten_transform': True, 'n_jobs': None, 'verbose': False, 'voting': 'hard', 'weights': None, 'rf': RandomForestClassifier(criterion='entropy', max_depth=12, n_estimators=50), 'ad': AdaBoostClassifier(estimator=DecisionTreeClassifier(max_features='sqrt',
                                                    min_weight_fraction_leaf=0,
                                                    random_state=42),
                   random_state=42), 'gn': GaussianNB(), 'rf__bootstrap': True, 'rf__ccp_alpha': 0.0, 'rf__class_weight': None, 'rf__criterion': 'entropy', 'rf__max_depth': 12, 'rf__max_features': 'sqrt

In [69]:
joblib.dump(le, "../models/label_encoder.joblib")
joblib.dump(pre_processor_inputs, "../models/pre_processor.joblib")

['../models/pre_processor.joblib']