# **Two-level cross-validation for logistic regression and baseline**

## Imports

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, precision_score
import pandas as pd

## Load dataset

In [2]:
# Load the clinical records; the DEATH_EVENT column is the binary target and
# every other column is used as an input feature.
filename = 'heart_failure_clinical_records_dataset.csv'
df = pd.read_csv(filename)
y = df['DEATH_EVENT']
X = df.drop(['DEATH_EVENT'], axis=1)
attributeNames = list(X.keys())
X = X.to_numpy()
y = y.to_numpy()

# Standardize every feature column to zero mean and unit standard deviation.
# This must be applied exactly once: mu and sigma are computed on the raw
# data, so re-applying them to the already standardized matrix would shift
# and rescale the features again instead of leaving them standardized.
# NOTE(review): the statistics are computed on the full dataset, before any
# cross-validation split -- confirm that this is acceptable for the analysis.
mu = np.mean(X, 0)
sigma = np.std(X, 0)
X = (X - mu) / sigma
classNames = "DEATH_EVENT"
N, M = X.shape  # N rows (observations), M columns (features)
C = 2  # presumably the number of classes of the binary target

## Define our models

We are going to define three models:

- A baseline
- A logistic regression 
- A neural network

### Baseline

In [3]:
def baseline():
    """Return the baseline prediction: always class 0 (Survived).

    Class 0 is the most frequent class in the dataset, so this constant
    prediction serves as the reference model; according to the original
    notebook note its accuracy is around 70% on this dataset.
    """
    majority_class = 0
    return majority_class


### Logistic regression

In [4]:

def log_reg(regularization):
    """Create an unfitted L2-penalized logistic regression classifier.

    ``regularization`` is the penalty strength (lambda). It is passed to
    scikit-learn as ``C = 1 / regularization``, and the solver is allowed up
    to 5000 iterations.
    """
    return LogisticRegression(
        penalty="l2",
        C=1 / regularization,
        max_iter=5000,
    )

In [5]:
def train_logistic_regression(model, X, y, X_test, y_test, regularization):
    """Fit one model per regularization value and report its test error.

    ``model`` is a factory called with a single regularization value; the
    object it returns must provide ``fit(X, y)`` and ``predict(X_test)``.

    Returns a NumPy array with one entry per value in ``regularization``
    holding the fraction of test samples that were misclassified.
    """

    def misclassification_rate(predicted):
        # Fraction of predictions that differ from the true test labels.
        return sum(predicted != y_test) / len(y_test)

    n_candidates = len(regularization)
    test_error = np.empty(n_candidates)
    for position, strength in enumerate(regularization):
        classifier = model(strength)
        classifier.fit(X, y)
        test_error[position] = misclassification_rate(classifier.predict(X_test))

    return test_error


### ANN

In [6]:
def nn(n_hidden_units):
    """Build a network with one hidden layer and a single sigmoid output.

    The input width is the module-level feature count ``M``; the hidden
    layer has ``n_hidden_units`` units followed by a tanh transfer function.
    """
    layers = [
        torch.nn.Linear(M, n_hidden_units),  # M features to H hidden units
        torch.nn.Tanh(),                     # 1st transfer function
        torch.nn.Linear(n_hidden_units, 1),  # H hidden units to 1 output neuron
        torch.nn.Sigmoid(),                  # final transfer function
    ]
    return torch.nn.Sequential(*layers)


# Binary cross-entropy loss, matching the sigmoid output of the network.
loss_fn = torch.nn.BCELoss()

## Two-level cross-validation

In [7]:

# Make sure the label vector is one-dimensional. squeeze() returns a new
# array instead of modifying y in place, so the result must be assigned back.
y = y.squeeze()

# Number of outer (K1) and inner (K2) cross-validation folds
K1 = 3
K2 = 3

# Initialize variables
lambda_interval = np.linspace(0.001, 0.1, 9)  # Regularization values for the logistic regression model
# Hidden-unit counts 1..6 for the neural network. A layer width has to be a
# whole number; np.linspace(1, 6) would produce 50 fractional values instead.
hidden_units_interval = np.arange(1, 7)
final_models_lambda = []
final_models_reg_error = []
final_models_hidden_unit = []
final_models_nn_error = []
Error_test = np.empty((K1, 1))
# Validation errors, one row per candidate model and one column per inner fold
Validation_error_nn = np.zeros((len(hidden_units_interval), K2))
Validation_error_reg = np.zeros((len(lambda_interval), K2))

# Splits are shuffled without a fixed seed, so results vary between runs.
Outer_fold = StratifiedKFold(n_splits=K1, shuffle=True)
Inner_fold = StratifiedKFold(n_splits=K2, shuffle=True)

for i, (par_index, test_index) in enumerate(Outer_fold.split(X, y)):
    # Outer split: D_par is used for model selection, D_test for the final estimate
    X_par, y_par = X[par_index, :], y[par_index]
    X_test, y_test = X[test_index, :], y[test_index]

    Generalization_error_reg = []
    Generalization_error_nn = []
    Generalization_error_base = []

    for j, (train_index, val_index) in enumerate(Inner_fold.split(X_par, y_par)):
        X_train, y_train = X_par[train_index, :], y_par[train_index]
        X_val, y_val = X_par[val_index], y_par[val_index]

        # Evaluate every candidate regularization value on this inner fold
        for s, lambda_val in enumerate(lambda_interval):
            model = log_reg(lambda_val)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_val)
            # Misclassification rate, weighted by the share of D_par that
            # this validation fold represents
            Validation_error_reg[s, j] = (len(y_val) / len(y_par)) * np.mean(y_pred != y_val)

    # Generalization error of each candidate: sum of its weighted fold errors
    Generalization_error_reg.extend(np.sum(Validation_error_reg, axis=1))

    # Select the best candidate, retrain it on all of D_par and compute its
    # test error on D_test
    min_index_reg = np.argmin(Generalization_error_reg)
    best_lambda = lambda_interval[min_index_reg]
    model = log_reg(best_lambda)
    model.fit(X_par, y_par)
    y_pred = model.predict(X_test)
    error_test_reg = np.mean(y_pred != y_test)

    # Store the selected model's data for this outer fold
    final_models_lambda.append(best_lambda)
    final_models_reg_error.append(error_test_reg)

        
