Pipeline to automate the model fitting, tuning and evaluation process.

#todo:
* identify models to trial:
  KNN, decision tree, bagging, XGBoost?
* test train split: stratified sampling vs. SMOTE
* make hyperparameter tuning parameterised for all candidate models
  - establish grid parameter space
  - set final model parameters...
* AUROC scores for performance evaluation of multiple models
* Calibration plots to visualise candidate model predictions
* SHAP plot - For best-performing model, identify & report significance of parameters

#todo: questions
* should we set up n folds for CV?

In [33]:
# load libraries and modules
import pandas as pd
import numpy as np, warnings
from pathlib import Path
import os
from importlib import reload

# visualisation
import matplotlib.pyplot as plt
import seaborn as sns

# data processing
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_validate, cross_val_score, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight
from sklearn.feature_selection import SelectFromModel

# model comparison
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
#from xgboost import XGBClassifier
# from sklearn.ensemble import AdaBoostClassifier
# from sklearn.linear_model import SGDClassifier

# Evaluation metrics
from sklearn.metrics import cohen_kappa_score,classification_report 
from sklearn.metrics import mean_squared_error, confusion_matrix, roc_auc_score, precision_score, recall_score, f1_score, auc, make_scorer
from sklearn.metrics import precision_recall_curve, average_precision_score, roc_curve, balanced_accuracy_score
import shap


In [None]:

def compare_models(X_train_scaled, y_train, candidate_models, class_weights, scoring=None, cv=None):
    """Cross-validate every candidate model and tabulate its mean score.

    Args:
        X_train_scaled: Training features, passed unchanged to
            ``cross_val_score``.
        y_train: Training labels.
        candidate_models (dict): Maps a display name to an unfitted estimator.
        class_weights: Not used by this function; kept so existing callers
            keep working. Class weighting has to be configured on the
            estimators inside ``candidate_models``.
        scoring (str, optional): Forwarded to ``cross_val_score``. ``None``
            (the default) lets each estimator use its own ``score`` method.
        cv (optional): Forwarded to ``cross_val_score``. ``None`` keeps
            scikit-learn's default splitting strategy.

    Returns:
        pandas.DataFrame: One row per model with the columns ``model``,
        ``performance_measure`` and ``average_score``. The frame is empty
        (but still has these columns) when ``candidate_models`` is empty.
    """
    # Label recorded next to each score so the table says what was measured
    measure_label = 'estimator_default' if scoring is None else str(scoring)

    rows = []
    for model_name, model in candidate_models.items():
        scores = cross_val_score(model, X_train_scaled, y_train, scoring=scoring, cv=cv)
        rows.append((model_name, measure_label, scores.mean()))
        print(f'Scoring completed for {model_name}')

    # Building the frame in one go keeps the column set fixed even with no rows
    df_model = pd.DataFrame(rows, columns=['model', 'performance_measure', 'average_score'])

    print(df_model)
    print(" ---------------------------------------- ")

    return df_model

In [None]:
def get_performance_scores(model, X_train, X_test, y_train, y_test):
    """Score a fitted classifier on both the training and the test set.

    Computes (i) balanced accuracy, (ii) precision, (iii) recall and
    (iv) F1 score. The scikit-learn metric functions are called with their
    default arguments.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_train: Training features (scaled the same way the model was fitted).
        X_test: Test features (scaled the same way as ``X_train``).
        y_train: True labels for ``X_train``.
        y_test: True labels for ``X_test``.

    Returns:
        list[str]: Eight scores formatted as ``'(0.1234)'`` in the order
        balanced accuracy (train, test), precision (train, test),
        recall (train, test), F1 (train, test).
    """
    # Predictions are needed once per split and reused by every metric
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    metric_functions = (balanced_accuracy_score, precision_score, recall_score, f1_score)
    splits = ((y_train, y_pred_train), (y_test, y_pred_test))

    formatted_performance_scores = []
    for metric in metric_functions:
        for y_true, y_pred in splits:
            formatted_performance_scores.append(f'({metric(y_true, y_pred):.4f})')

    return formatted_performance_scores

In [None]:
def plot_confusion_matrix(model, X_train, X_test, y_train, y_test, model_name=None):
    """Plot train and test confusion matrices side by side as heatmaps.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_train: Training features.
        X_test: Test features.
        y_train: True labels for ``X_train``.
        y_test: True labels for ``X_test``.
        model_name (str, optional): Heading printed above the figure.
            Defaults to the estimator's class name.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    if model_name is None:
        model_name = type(model).__name__

    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    # Computing the confusion matrices
    cm_train = confusion_matrix(y_train, y_pred_train)
    cm_test = confusion_matrix(y_test, y_pred_test)

    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))

    # NOTE(review): the tick labels assume a binary target whose lower label
    # means non-sepsis and whose higher label means sepsis - confirm.
    x_labels = ["Predicted\nNon-Sepsis", "Predicted\nSepsis"]
    y_labels = ["Actual Non-Sepsis", "Actual Sepsis"]

    panels = ((cm_train, "CM in training set"), (cm_test, "CM in test set"))
    for ax, (matrix, title) in zip(axes, panels):
        sns.heatmap(matrix, annot=True, fmt='d', xticklabels=x_labels, yticklabels=y_labels, ax=ax)
        ax.set_title(title, fontsize=10)
        ax.tick_params(labelsize=9)
    plt.tight_layout()

    print(model_name)
    print(" ---------------------------------------- ")
    plt.show()

In [None]:
def plot_roc_curve_prec_rec(fpr, tpr, model_name, label=None, model=None, X_test=None, y_test=None):
    """Print the AUC, plot the ROC curve and optionally a precision-recall curve.

    Args:
        fpr (array): False positive rates, e.g. from ``roc_curve``.
        tpr (array): True positive rates matching ``fpr``.
        model_name (str): Model name used in the plot titles and legends.
        label (str, optional): Legend entry for the ROC line. No legend is
            drawn when it is ``None``.
        model (optional): Fitted estimator. The precision-recall curve is
            drawn only when ``model``, ``X_test`` and ``y_test`` are all given.
        X_test (optional): Test features for the precision-recall curve.
        y_test (optional): True test labels for the precision-recall curve.

    Returns:
        float: Area under the ROC curve described by ``fpr`` and ``tpr``.
    """
    # Compute AUC score from the points that were actually passed in
    auc_test = auc(fpr, tpr)
    print(auc_test)

    # The figure has to exist before plotting, otherwise figsize is applied
    # to a new, empty figure
    plt.figure(figsize=(6, 6))
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0, 1], [0, 1], 'k--')  # dashed diagonal
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False Positive Rate (Fall-Out)', fontsize=11)
    plt.ylabel('True Positive Rate (Recall)', fontsize=11)
    plt.grid(False)
    plt.title(f"ROC Curve - {model_name}")
    if label is not None:
        plt.legend(loc='lower right')
    plt.show()

    # Plot precision recall (needs the estimator and the test data)
    if model is not None and X_test is not None and y_test is not None:
        # Imported here because the notebook's import cell does not provide it
        from sklearn.metrics import PrecisionRecallDisplay

        display = PrecisionRecallDisplay.from_estimator(
            model, X_test, y_test, name=model_name, plot_chance_level=True
        )
        display.ax_.legend(loc='upper right')
        display.ax_.set_title("2-class Precision-Recall curve")
        plt.show()

    return auc_test

## Hyperparameter tuning for the best model

In [None]:

def tune_hyperparameters(X_train, y_train, model, class_weights=None, cv=3):
    """Grid-search the hyperparameters of one named candidate model.

    Args:
        X_train: Training features.
        y_train: Training labels.
        model (str): Name of the model to tune. One of
            ``'Logistic_Regression'``, ``'Random_Forest'`` or
            ``'Gradient_Boosting'``.
        class_weights (dict, optional): Maps each class label to its weight.
            When ``None``, scikit-learn's ``'balanced'`` weighting is used.
        cv (int, optional): Number of cross-validation folds. Defaults to 3.

    Returns:
        The best estimator found, refitted on the full training data.

    Raises:
        ValueError: If ``model`` is not one of the supported names.
    """
    param_grid = {
        'Logistic_Regression': {
            'C': [0.1, 1, 10],
            'penalty': ['l1', 'l2'],
        },
        'Random_Forest': {
            'n_estimators': [50, 100, 150],
            'max_depth': [None, 10, 20],
            'min_samples_split': [2, 25, 50, 100, 250],
            'min_samples_leaf': [2, 25, 50, 100, 250],
        },
        'Gradient_Boosting': {
            'n_estimators': [50, 100, 150, 200],
            'learning_rate': [0.01, 0.1, 0.2, 0.5],
            'loss': ['log_loss', 'exponential'],
            'criterion': ['friedman_mse', 'squared_error'],
            'max_features': ['sqrt', 'log2'],
        },
    }

    # Validate before constructing anything so a typo fails fast and clearly
    if not isinstance(model, str) or model not in param_grid:
        raise ValueError(
            f"Unknown model {model!r}; expected one of {sorted(param_grid)}"
        )

    class_weight = 'balanced' if class_weights is None else class_weights
    fit_params = {}

    if model == 'Logistic_Regression':
        # liblinear supports both the l1 and the l2 penalty in the grid
        estimator = LogisticRegression(solver='liblinear', max_iter=10000, class_weight=class_weight)
    elif model == 'Random_Forest':
        estimator = RandomForestClassifier(class_weight=class_weight)
    else:
        # GradientBoostingClassifier has no class_weight option, so the class
        # weights are applied as per-sample weights at fit time instead
        estimator = GradientBoostingClassifier()
        if class_weights is None:
            labels = np.unique(y_train)
            balanced = compute_class_weight('balanced', classes=labels, y=y_train)
            label_weights = dict(zip(labels, balanced))
        else:
            label_weights = class_weights
        fit_params['sample_weight'] = np.array([label_weights[label] for label in y_train])

    # todo: gridSearchCV or RandomizedSearchCV?
    grid_search = GridSearchCV(estimator=estimator, param_grid=param_grid[model], cv=cv)
    grid_search.fit(X_train, y_train, **fit_params)

    print(f"Best score for {model}: {grid_search.best_score_}")
    print(f"Best parameters for {model}: {grid_search.best_params_}")

    # refit=True (the GridSearchCV default) already retrained the winner
    return grid_search.best_estimator_

In [None]:
def validate_test_groundtruth(final_model, X_test_scaled, y_test):
    """Print how the model's predictions compare with the true test labels.

    Writes a classification report followed by Cohen's kappa to stdout.

    Args:
        final_model: Fitted estimator exposing ``predict``.
        X_test_scaled: Test features handed to ``final_model.predict``.
        y_test: True labels for ``X_test_scaled``.

    Returns:
        None.
    """
    predictions = final_model.predict(X_test_scaled)

    # Per-class precision / recall / F1
    print("\n\nFor test data generated using train test split \n")
    report = classification_report(y_test, predictions)
    print(report)

    print(" \n\n  ----------------------------------------  \n\n")

    # Agreement between predictions and ground truth
    kappa = cohen_kappa_score(y_test, predictions)
    print(f"cohen_kappa_score: {kappa} ")

In [40]:
def split_data(df_train, test_size=0.2, random_state=42, target_col='IS_SEPSIS'):
    """Split the data into stratified train/test sets and standardise the features.

    The scaler is fitted on the training features only and then applied to
    the test features, so no test-set statistics leak into training.

    Parameters:
    - df_train: Processed DataFrame holding the predictors and the target column.
    - test_size: Fraction of rows held out for testing. Defaults to 0.2.
    - random_state: Seed for the split, for reproducibility. Defaults to 42.
    - target_col: Name of the target column. Defaults to 'IS_SEPSIS'.

    Returns:
    - Tuple (X_train, X_test, y_train, y_test). The X values are the outputs
      of StandardScaler; the y values are the target column split accordingly.
    """
    # Target and Predictors
    X = df_train.drop(target_col, axis='columns')
    y = df_train[target_col]

    # Split into train and test sets, keeping the class ratio in both parts
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )

    # Standardising (not normalising!): fit on train, reuse on test
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    return X_train, X_test, y_train, y_test

In [42]:
def get_class_weights(y_train):
    """Return inverse-frequency weights for the labels 0 and 1.

    Args:
        y_train: Label collection providing ``value_counts()`` and ``len()``
            (for example a pandas Series). Both label 0 and label 1 must be
            present.

    Returns:
        dict: ``{0: weight, 1: weight}`` where each weight is one divided by
        the label's share of ``y_train``, rounded to four decimals.
    """
    # Share of each label, expressed as a percentage
    label_percentages = y_train.value_counts() / len(y_train) * 100

    weights = {}
    for label in (0, 1):
        inverse_share = 1 / (label_percentages[label] / 100)
        weights[label] = round(inverse_share, 4)

    return weights


In [None]:
def train_models(X_train, X_test, y_train, y_test, candidate_models=None, class_weights=None):
    """Compare candidate models, tune the best one and validate it on the test set.

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.
        candidate_models (dict, optional): Maps a model name to an unfitted
            estimator. Defaults to logistic regression, random forest and
            gradient boosting, using the names ``tune_hyperparameters`` knows.
        class_weights (dict, optional): Class label to weight. Computed from
            ``y_train`` with ``get_class_weights`` when omitted.

    Returns:
        The value returned by ``tune_hyperparameters`` for the best model.

    Raises:
        ValueError: If there are no candidate models to compare.
    """
    if class_weights is None:
        class_weights = get_class_weights(y_train)

    if candidate_models is None:
        candidate_models = {
            'Logistic_Regression': LogisticRegression(max_iter=10000, class_weight=class_weights),
            'Random_Forest': RandomForestClassifier(class_weight=class_weights),
            'Gradient_Boosting': GradientBoostingClassifier(),
        }

    # Get model performances
    model_performance_df = compare_models(X_train, y_train, candidate_models, class_weights)
    if model_performance_df.empty:
        raise ValueError("No candidate models were supplied for comparison")

    # sort by model performance, best first
    sorted_model_performance_df = model_performance_df.sort_values(by='average_score', ascending=False)

    # find best performing model (read the cell directly instead of parsing
    # the printed representation of the frame)
    best_model_name = sorted_model_performance_df.iloc[0]['model']
    print("\n\n The best Performing model :", best_model_name)

    # tune hyperparameters of best performing model
    tuned_model = tune_hyperparameters(X_train, y_train, best_model_name)

    # Print classification report and cohen_kappa_score for the held-out
    # test data; skipped when tuning did not hand back a model
    if tuned_model is not None:
        validate_test_groundtruth(tuned_model, X_test, y_test)

    return tuned_model

In [19]:
def get_model_input_df(file_path):
    """Load the processed model-input CSV without its identifier columns.

    Args:
        file_path: Location of the processed csv file, passed to
            ``pd.read_csv``.

    Returns:
        pandas.DataFrame: The file's contents with the ``SUBJECT_ID`` and
        ``HADM_ID`` columns removed.
    """
    identifier_columns = ["SUBJECT_ID", "HADM_ID"]
    return pd.read_csv(file_path).drop(columns=identifier_columns)

In [20]:
# Locate the processed model-input file relative to the working directory
ROOT_DIR = Path('')
file_path = ROOT_DIR.joinpath('data', 'model_input', 't0_v3.csv')

# Load the table used for modelling (identifier columns are dropped on load)
model_input_df = get_model_input_df(file_path)





In [41]:
# Stratified train/test split of the loaded table, with standardised features
X_train, X_test, y_train, y_test = split_data(df_train=model_input_df)

In [44]:
# Inverse-frequency class weights, computed from the training labels only
class_weights = get_class_weights(y_train)
class_weights  # bare expression so the notebook echoes the dict as cell output

{0: 1.1147, 1: 9.7204}

In [None]:
# NOTE(review): load_expression, test_size and candidate_models are not defined
# anywhere in the visible notebook, so this cell raises NameError as written.
# Presumably it is meant to run the pipeline, e.g.
# train_models(X_train, X_test, y_train, y_test) - confirm before changing.
load_expression(model_input_df, test_size, candidate_models)