# Importing Libraries

In [5]:
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
import joblib
import os
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Load the data

In [6]:
df = pd.read_csv('data/processed_data.csv')
df.head()

Unnamed: 0,Age,Gender,Blood Type,Medical Condition,Date of Admission,Hospital,Insurance Provider,Billing Amount,Admission Type,Discharge Date,Medication,Test Results,target
0,30,Male,B-,Cancer,2024-01-31,Sons and Miller,Blue Cross,18856.281306,Urgent,2024-02-02,Paracetamol,Normal,0
1,62,Male,A+,Obesity,2019-08-20,Kim Inc,Medicare,33643.327287,Emergency,2019-08-26,Ibuprofen,Inconclusive,1
2,76,Female,A-,Obesity,2022-09-22,Cook PLC,Aetna,27955.096079,Emergency,2022-10-07,Aspirin,Normal,1
3,28,Female,O+,Diabetes,2020-11-18,"Hernandez Rogers and Vang,",Medicare,37909.78241,Elective,2020-12-18,Ibuprofen,Abnormal,0
4,43,Female,AB+,Cancer,2022-09-19,White-White,Aetna,14238.317814,Urgent,2022-10-09,Penicillin,Abnormal,0


In [7]:
df['target'].value_counts()

target
0    37231
1    18269
Name: count, dtype: int64

# Data Cleaning and Prep

In [9]:
df['Date of Admission'] = pd.to_datetime(df['Date of Admission'])
df['Discharge Date'] = pd.to_datetime(df['Discharge Date'])

# Calculate length of stay
df['Length of Stay'] = (df['Discharge Date'] - df['Date of Admission']).dt.days

# Define billing amount bins 
bins = [df['Billing Amount'].min(), 
        df['Billing Amount'].quantile(0.33),  
        df['Billing Amount'].quantile(0.67),  
        df['Billing Amount'].max()]  

# Define category labels
labels = ['Low-cost', 'Medium-cost', 'High-cost']
df['Hospital Category'] = pd.cut(df['Billing Amount'], bins=bins, labels=labels, include_lowest=True)

# Drop original date columns
df.drop(columns=['Date of Admission', 'Discharge Date','Admission Type', 'Medication','Hospital'], inplace=True)

# Encode categorical variables
df['Gender'] = df['Gender'].map({'Male': 1, 'Female': 0})  

scaler = StandardScaler()
df['Billing Amount'] = scaler.fit_transform(df[['Billing Amount']])

# Label Encoding
label_cols = ['Blood Type', 'Test Results', 'Insurance Provider','Medical Condition', 'Hospital Category']
for col in label_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])


In [10]:
df.head()

Unnamed: 0,Age,Gender,Blood Type,Medical Condition,Insurance Provider,Billing Amount,Test Results,target,Length of Stay,Hospital Category
0,30,1,5,2,1,-0.470261,2,0,2,2
1,62,1,0,5,3,0.57025,1,1,6,2
2,76,0,1,5,0,0.16999,2,1,15,2
3,28,0,6,3,3,0.870465,0,0,30,0
4,43,0,2,2,0,-0.795211,0,0,20,1


In [11]:
df.corr()

Unnamed: 0,Age,Gender,Blood Type,Medical Condition,Insurance Provider,Billing Amount,Test Results,target,Length of Stay,Hospital Category
Age,1.0,-0.002002,-0.000402,-0.002809,-0.004784,-0.003832,-0.006474,0.000502,0.00822,-7e-05
Gender,-0.002002,1.0,-0.000839,0.000317,-0.002318,0.004827,0.004892,-0.009003,0.003836,-0.009146
Blood Type,-0.000402,-0.000839,1.0,-0.004616,-0.000478,-0.001152,-0.000825,-0.000238,-0.001964,-0.000734
Medical Condition,-0.002809,0.000317,-0.004616,1.0,-0.004629,0.005512,0.002396,0.001472,-0.005907,0.00276
Insurance Provider,-0.004784,-0.002318,-0.000478,-0.004629,1.0,-0.003242,-0.001624,-0.006766,0.002049,-0.005202
Billing Amount,-0.003832,0.004827,-0.001152,0.005512,-0.003242,1.0,-0.002345,-0.002066,-0.005602,-0.467164
Test Results,-0.006474,0.004892,-0.000825,0.002396,-0.001624,-0.002345,1.0,0.0024,0.001808,-0.001504
target,0.000502,-0.009003,-0.000238,0.001472,-0.006766,-0.002066,0.0024,1.0,0.00696,0.00484
Length of Stay,0.00822,0.003836,-0.001964,-0.005907,0.002049,-0.005602,0.001808,0.00696,1.0,0.003212
Hospital Category,-7e-05,-0.009146,-0.000734,0.00276,-0.005202,-0.467164,-0.001504,0.00484,0.003212,1.0


# Split into Dependent and Independent Variables, Test and Train

In [12]:
X=df.drop('target',axis=1)
y=df['target']

In [13]:
X_train,X_test,y_train,y_test=train_test_split(X,y, test_size=0.2,random_state=49, stratify=y)

In [14]:
X_test.columns

Index(['Age', 'Gender', 'Blood Type', 'Medical Condition',
       'Insurance Provider', 'Billing Amount', 'Test Results',
       'Length of Stay', 'Hospital Category'],
      dtype='object')

# Function to train models

In [15]:
def train_models(X_train, y_train):
    models = {}

    # Support Vector Machine (SVM)
    svm = SVC(probability=True, random_state=42)
    svm.fit(X_train, y_train)
    models['SVM'] = svm

    # K-Nearest Neighbors (KNN)
    knn = KNeighborsClassifier(n_neighbors=5)
    knn.fit(X_train, y_train)
    models['KNN'] = knn

    # Extreme Gradient Boosting (XGBoost)
    xgb = XGBClassifier(n_estimators=100, learning_rate=0.1, random_state=42, use_label_encoder=False, eval_metric='logloss')
    xgb.fit(X_train, y_train)
    models['XGBoost'] = xgb

    # LightGBM
    lgbm = LGBMClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
    lgbm.fit(X_train, y_train)
    models['LightGBM'] = lgbm

    # CatBoost
    catboost = CatBoostClassifier(iterations=100, learning_rate=0.1, depth=6, verbose=0, random_state=42)
    catboost.fit(X_train, y_train)
    models['CatBoost'] = catboost

    return models

# Train the models
models = train_models(X_train, y_train)

# Function to evaluate models
def evaluate_models(models, X_test, y_test):
    if not os.path.exists('model_reports'):
        os.makedirs('model_reports')
    if not os.path.exists('models'):
        os.makedirs('models')

    reports = {}
    for name, model in models.items():
        y_pred = model.predict(X_test)
        report = classification_report(y_test, y_pred, output_dict=True)
        cm = confusion_matrix(y_test, y_pred)
        reports[name] = {'report': report, 'confusion_matrix': cm}

        # Save the classification report
        df_report = pd.DataFrame(report).transpose()
        df_report.to_csv(f'model_reports/{name}_classification_report.csv', index=True)

        # Save the confusion matrix
        df_cm = pd.DataFrame(cm, index=['Actual_No', 'Actual_Yes'], columns=['Predicted_No', 'Predicted_Yes'])
        df_cm.to_csv(f'model_reports/{name}_confusion_matrix.csv', index=True)

        # Save the model
        joblib.dump(model, f'models/{name}_model.pkl')

    return reports

# Evaluate the models
reports = evaluate_models(models, X_test, y_test)


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 14615, number of negative: 29785
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001872 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 391
[LightGBM] [Info] Number of data points in the train set: 44400, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.329167 -> initscore=-0.711957
[LightGBM] [Info] Start training from score -0.711957


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
