# Threshold tuning


For each model, we investigate how its performance changes when the decision threshold is tuned. In general, we consider it more important to improve recall. We therefore expect that lowering the threshold will be advantageous.

## Load libraries and data 

### Load modules

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import *
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score, cross_validate

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, RFE
from sklearn.feature_selection import f_classif
from sklearn.preprocessing import StandardScaler 
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, recall_score, make_scorer


from sklearn.linear_model import LogisticRegression

from matplotlib_venn import venn2
from tqdm import tqdm

from imblearn.over_sampling import SMOTENC



import src.features as features
import src.model_selection as model_selection 
import src.functions as functions
import src.datasets as datasets 

# import custom functions
from src.functions import plot_correlations, plot_mutual_info, hello
from src.datasets import xy_train, xy_train_test, data_original, data_50000, data_balanced



In [4]:
import pickle

In [5]:
from imblearn.pipeline import Pipeline, make_pipeline
# https://kiwidamien.github.io/how-to-do-cross-validation-when-upsampling-data.html

### Load dataset

In [6]:
data = datasets.data_original()

### Feature engineering 

In [7]:
data = features.engineer(data)

### Train test split (unprocessed)

In [8]:
# Separate the 'infected' target column from the remaining feature columns.
X_ = data.drop(columns='infected')
y_ = data['infected']

# Hold out 20% of the rows for testing; stratify on the target so both splits
# keep the same class proportions, and fix the seed for reproducibility.
X_train_, X_test_, y_train_, y_test_ = train_test_split(
    X_,
    y_,
    test_size=0.2,
    random_state=42,
    stratify=y_,
)


### Scaling the data 

In [9]:
# Fit the scaler on the training split only, so no statistics from the test
# split are used, then apply the same transformation to both splits.
scaler = StandardScaler().fit(X_train_)

X_train_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train_, X_test_)
)

### Data ready for Machine Learning

In [10]:
# Wrap the scaled arrays back into DataFrames so columns can be selected by name.
# NOTE(review): the rebuilt frames presumably get a fresh 0..n-1 index, while
# y_train / y_test keep the index they had after the split. This is fine for
# positional use; confirm before doing any index-based alignment of X and y.
X_train, X_test = (
    pd.DataFrame(scaled, columns=X_.columns)
    for scaled in (X_train_scaled, X_test_scaled)
)
y_train, y_test = y_train_, y_test_

## Load models

In [15]:
# Logistic regression with a fixed seed; it is fitted further down on the
# RFE-selected columns.
model = LogisticRegression(random_state=42)

# The pickle stores feature selections, not a model. Entry 8 is the one used
# here (the list printed below contains 8 feature names).
with open("models/rfe_features_250103.pkl", "rb") as file:
    rfe_selected_features = pickle.load(file)[8]

# Restrict both splits to the selected columns.
X_train_rfe, X_test_rfe = (
    X_train[rfe_selected_features],
    X_test[rfe_selected_features],
)

print(rfe_selected_features)

['hemo', 'drugs', 'z30', 'race', 'gender', 'offtrt', 'cd420', 'time730']


## Threshold tuning

In [20]:

def threshold_results(Xtest, ytest, fit_model, title="modelname", thresholds=None):
    """Evaluate a fitted binary classifier over a grid of decision thresholds.

    The positive-class score is column 1 of ``fit_model.predict_proba(Xtest)``;
    a sample is predicted positive (1) when that score is strictly greater
    than the threshold, otherwise negative (0).

    Parameters:
      Xtest      -- features passed to ``fit_model.predict_proba``
      ytest      -- true labels; the metrics treat 1 as the positive class
                    and 0 as the negative class
      fit_model  -- already-fitted estimator exposing ``predict_proba``
      title      -- string description of the model, the df and the
                    technique used; stored in the "model" column
      thresholds -- optional scalar or iterable of thresholds to test;
                    defaults to 0.05, 0.06, ..., 0.59

    Returns a DataFrame with one row per threshold and the columns
      "model, threshold, accuracy, recall, selectivity, f1, f3, precision,
       NPV, ROC AUC"
    where selectivity is the recall of class 0, NPV is the precision of
    class 0, and ROC AUC is computed once from the scores (it does not depend
    on the threshold, so it is repeated on every row).

    You can plot the accuracy/recall curve with
    sns.lineplot(data=results, x='recall', y='accuracy', hue='model')
    """
    columns = [
        "model", "threshold", "accuracy", "recall", "selectivity",
        "f1", "f3", "precision", "NPV", "ROC AUC",
    ]

    if thresholds is None:
        # Build the grid from integers: a float step in np.arange accumulates
        # rounding error (0.060000000000000005, ...) and makes the number of
        # points sensitive to that error.
        thresholds = np.arange(5, 60) / 100
    thresholds = np.atleast_1d(np.asarray(thresholds, dtype=float))

    yproba = fit_model.predict_proba(Xtest)[:, 1]
    auc = roc_auc_score(ytest, yproba)

    results = []
    for threshold in thresholds:
        ypred = (yproba > threshold).astype(int)

        results.append({
            "model": title,
            "threshold": threshold,
            "accuracy": accuracy_score(ytest, ypred),
            "recall": recall_score(ytest, ypred),
            "selectivity": recall_score(ytest, ypred, pos_label=0),
            "f1": f1_score(ytest, ypred),
            "f3": fbeta_score(ytest, ypred, beta=3),
            "precision": precision_score(ytest, ypred),
            "NPV": precision_score(ytest, ypred, pos_label=0),
            "ROC AUC": auc,
        })

    # Passing the column list keeps the schema stable even for an empty grid.
    return pd.DataFrame(results, columns=columns)


def plot_acc_recall(results, figsize=(4, 2)):
    """Plot accuracy against recall, one line per model, and show the figure.

    results -- DataFrame with 'recall', 'accuracy' and 'model' columns, e.g.
               the output of threshold_results. Several results can be
               compared by concatenating them first:
               pd.concat([results_rf, results_xgb])
    figsize -- (width, height) of the new figure
    """
    plt.figure(figsize=figsize)
    axes = plt.gca()
    sns.lineplot(x="recall", y="accuracy", hue="model", data=results, ax=axes)
    plt.show()


### Test thresholds

In [18]:
model.fit(X_train_rfe,y_train)

In [21]:
results_rfe = threshold_results(X_test_rfe, y_test, model, "rfe 8 original")

In [22]:
results_rfe.head()

Unnamed: 0,model,threshold,accuracy,recall,selectivity,f1,f3,precision,NPV,ROC AUC
0,rfe 8 original,0.05,0.707944,0.932692,0.635802,0.60815,0.842745,0.451163,0.967136,0.921415
1,rfe 8 original,0.06,0.735981,0.923077,0.675926,0.629508,0.844327,0.477612,0.964758,0.921415
2,rfe 8 original,0.07,0.745327,0.913462,0.691358,0.635452,0.839965,0.487179,0.961373,0.921415
3,rfe 8 original,0.08,0.754673,0.913462,0.703704,0.644068,0.842946,0.497382,0.962025,0.921415
4,rfe 8 original,0.09,0.773364,0.913462,0.728395,0.662021,0.848972,0.519126,0.963265,0.921415


## Utils

In [None]:

  # For each model, find the row (threshold) where f3 is highest and show all
  # metrics recorded at that threshold.
  # NOTE(review): idxmax returns index labels, so this assumes `results` has a
  # unique index (e.g. built with pd.concat(..., ignore_index=True)). Confirm.
  idx_max_f3 = results.groupby('model')['f3'].idxmax()
  metrics_at_max_f3 = results.loc[idx_max_f3, :]
  print(metrics_at_max_f3)

  x_f3 = 