---

**Load essential libraries**

---

In [None]:
## Load modules

# Standard modules
import pandas as pd
import numpy as np
import sys

# Preprocessing modules
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler,\
 OneHotEncoder, LabelEncoder

# Train-test split module
from sklearn.model_selection import train_test_split

# Dimension reduction module
from sklearn.decomposition import PCA

# Classifier module
from sklearn.ensemble import GradientBoostingClassifier


# Pipeline modules
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Performance metric modules
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Plotting modules
import matplotlib.pyplot as plt
import matplotlib.cm as cm
# Notebook-wide plotting and display configuration.
# Apply matplotlib's built-in dark theme to every figure created below.
plt.style.use('dark_background')
# IPython magic: render figures inline in the notebook output.
%matplotlib inline
plt.rcParams['figure.figsize'] = (4.0, 4.0) # set default size of plots

# Do not truncate wide DataFrames: show every column when displaying.
pd.options.display.max_columns = None

In [None]:
## Resolve the data folder, mounting Google Drive first when running in Colab
if 'google.colab' not in sys.modules:
    # Local run: the data lives in a folder next to the notebook
    DATA_DIR = 'Data/'
else:
    from google.colab import drive
    drive.mount('/content/drive', force_remount = True)
    # Adjust the path below (starting from
    # /content/drive/MyDrive/Colab Notebooks/) to match how the data is
    # organized inside your Colab Notebooks folder in Google Drive
    DIR = '/content/drive/MyDrive/Colab Notebooks/MAHE/MSIS Coursework/OddSem2024MAHE'
    DATA_DIR = f'{DIR}/Data/'

---

Load ICU Data

---

In [None]:
## Read the ICU dataset from the data folder and preview the first rows
file = f'{DATA_DIR}ICU_Complete.csv'
dfICU = pd.read_csv(file)
dfICU.head()  # head() shows 5 rows by default

---

Drop the 'In-hospital_death' and 'Length_of_stay' columns. We will develop a prospective model that takes as input only information available at the time of patient admission and predicts whether or not the patient will need mechanical ventilation, so these two columns must not be used as features.

---

In [None]:
## Remove the 'In-hospital_death' and 'Length_of_stay' columns from dfICU
# The frame is modified in place, so dfICU no longer has these columns afterwards
dfICU.drop(columns = ['In-hospital_death', 'Length_of_stay'], inplace = True)

---

Create lists of categorical and continuous features

---

In [None]:
## Split the column names into categorical and continuous feature lists
categorical_features = ['Gender', 'MechVent']
# Every column not listed as categorical is treated as continuous
# (column order of dfICU is preserved)
continuous_features = [column for column in dfICU.columns
                       if column not in categorical_features]
dfICU.dtypes

---

Convert categorical features to the categorical type

---

In [None]:
## Cast every categorical column to the pandas 'category' dtype
dfICU[categorical_features] = dfICU[categorical_features].astype(
    dict.fromkeys(categorical_features, 'category'))
dfICU.dtypes

---

How balanced is the dataset w.r.t. the target variable 'MechVent'?

---

In [None]:
## Horizontal bar chart of the number of samples per 'MechVent' class
dfICU['MechVent'].value_counts().plot.barh();

---

Remove the target variable 'MechVent' from the list of categorical features

---

In [None]:
## Remove the target variable 'MechVent' from the list of categorical features
# Guard the removal so that re-running this cell is a no-op instead of raising
# ValueError (list.remove fails when the item is no longer in the list).
if 'MechVent' in categorical_features:
    categorical_features.remove('MechVent')

---

Stratified train and test split of the data

---

In [None]:
## Hold out 20% of the samples as a test set, stratified on the target
y = dfICU['MechVent']
X = dfICU.drop(columns = 'MechVent')

# stratify = y keeps the class proportions of the target the same in both
# splits; random_state fixes the shuffle so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.2, stratify = y, random_state = 1)

print(f'{len(X_train)} training samples and {len(X_test)} test samples')

In [None]:
## Fraction of each class in the training and test targets
# NOTE(review): assumes the target labels are literally 'No' and 'Yes' -
# confirm against the data, otherwise every fraction prints as 0.0
print((y_train == 'No').mean())
print((y_train == 'Yes').mean())

print((y_test == 'No').mean())
print((y_test == 'Yes').mean())

---

Build preprocessing pipeline for categorical and continuous features


---

In [None]:
## Build pipeline for continuous and categorical features

# Pipeline object for continuous features: standardize each feature, then
# project onto the first 7 principal components.
# random_state makes PCA repeatable in case its randomized SVD solver is used.
continuous_transformer = Pipeline(steps = [('scaler', StandardScaler()),
                                           ('pca', PCA(n_components = 7,
                                                       random_state = 1))])

# Pipeline object for categorical features: one-hot encode, and encode any
# category not seen during fit as all zeros instead of raising an error
categorical_transformer = Pipeline(steps = [('onehotenc', OneHotEncoder(handle_unknown = 'ignore'))])


# Create a preprocessor object for all features; columns not named in either
# feature list are passed through unchanged
preprocessor = ColumnTransformer(transformers = [('continuous', continuous_transformer, continuous_features),
                                                 ('categorical', categorical_transformer, categorical_features)
                                                 ],
                                 remainder = 'passthrough'
                                 )

# Define a classifier object.
# Seeded (same value as the train/test split) so that repeated runs of the
# notebook produce the same fitted model and the same confusion matrix.
classifier = GradientBoostingClassifier(random_state = 1)

# Define the entire classification model: preprocessing followed by the classifier
model = Pipeline(steps = [('preprocessor', preprocessor), ('classifier', classifier)])

---

Fit the model on the train data and test on the test data

---

In [None]:
## Train the model on the training split and evaluate it on the test split
# Pipeline.fit returns the fitted pipeline, so the prediction of the test-set
# labels can be chained directly onto it
y_pred = model.fit(X_train, y_train).predict(X_test)

# Confusion matrix: rows are the true classes, columns the predicted classes
print(confusion_matrix(y_test, y_pred))