# Threshold tuning


For each model, we investigate how its performance changes when we tune the decision threshold. In general, we consider improving Recall to be the priority. Because lowering the threshold classifies more samples as positive, we expect a lower threshold to be advantageous to us.

## Load libraries and data 

### Load modules

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import *
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score, cross_validate

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, RFE
from sklearn.feature_selection import f_classif
from sklearn.preprocessing import StandardScaler 
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, recall_score, make_scorer


from sklearn.linear_model import LogisticRegression

from matplotlib_venn import venn2
from tqdm import tqdm

from imblearn.over_sampling import SMOTENC



import src.features as features
import src.model_selection as model_selection 
import src.functions as functions
import src.datasets as datasets 

# import custom functions
from src.functions import plot_correlations, plot_mutual_info, hello
from src.datasets import xy_train, xy_train_test, data_original, data_50000, data_balanced



In [4]:
import pickle

In [5]:
from imblearn.pipeline import Pipeline, make_pipeline
# https://kiwidamien.github.io/how-to-do-cross-validation-when-upsampling-data.html

### Load dataset

In [6]:
# Fetch the original dataset through the loader imported from src.datasets.
# NOTE(review): presumably returns a pandas DataFrame with an 'infected'
# column (it is split on that column further down) — confirm in src.datasets.
data = data_original()

### Feature engineering 

In [7]:
# Run the project's feature-engineering step and rebind `data` to its result.
# NOTE(review): which columns engineer() adds or removes is defined in
# src.features — confirm there; the result must still contain 'infected'.
data = features.engineer(data)

### Train test split (unprocessed)

In [8]:
# Separate the features from the 'infected' target column.
X_ = data.drop(columns='infected')
y_ = data['infected']

# Hold out 20% of the rows as a test set. Stratifying on the target keeps the
# class proportions the same in both splits; the fixed seed makes the split
# reproducible.
X_train_, X_test_, y_train_, y_test_ = train_test_split(
    X_,
    y_,
    test_size=0.2,
    random_state=42,
    stratify=y_,
)


### Scaling the data 

In [9]:
# Standardise the features. The scaler is fitted on the training split only,
# and that same fitted transform is then applied to the test split, so no
# test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_)
X_test_scaled = scaler.transform(X_test_)

### Data ready for Machine Learning

In [10]:
# Wrap the scaled arrays back into DataFrames so the feature names survive
# scaling.
#
# The index of the unscaled split is reused on purpose: without it the new
# frames would get a fresh 0..n-1 index while y_train / y_test keep the
# shuffled row labels produced by train_test_split, and any index-aligned
# pandas operation (pd.concat, boolean masks built from y, .loc lookups)
# would silently pair features with the wrong targets.
X_train = pd.DataFrame(X_train_scaled, columns=X_.columns, index=X_train_.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_.columns, index=X_test_.index)

# The targets are not scaled; expose them under the final names.
y_train = y_train_
y_test = y_test_