# **Two fold cross validation for logistic regression and baseline**

## Imports

In [8]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, precision_score
import pandas as pd

## Load dataset

In [9]:
filename = 'heart_failure_clinical_records_dataset.csv'
df = pd.read_csv(filename)
y = df['DEATH_EVENT']
X = df.drop(['DEATH_EVENT'], axis = 1)
attributeNames = list(X.keys())

classNames = "DEATH_EVENT"
N, M = X.shape
C = len(classNames)

## Define our models

We are going to define two models:

- A baseline
- A logistic regression 
- A neural network

### Baseline

In [10]:
# In our dataset there are more entries of the class 0 (Survived)
# The best possible accuracy will be around 70% given our dataset
def baseline():
    return 0


### Logistic regression

In [29]:

log_reg = lambda regularization: LogisticRegression(penalty="l2", C= 1/regularization, max_iter= 5000)

## Two fold cross-validation

In [45]:
K1 = 5 # Number of outer folds
K2 = 5 # Number of inner folds

final_models_lambda = []
final_models_error = []
lambda_interval = np.logspace(-8, 2, 20)
valerror_LG = np.zeros(len(lambda_interval))
generalization_s = np.zeros(len(lambda_interval))


for i in range(K1):
    print('\nCrossvalidation outer fold: {0}/{1}'.format(i+1,K1))    
    X_par, X_test, y_par, y_test = train_test_split(X, y, test_size=0.5, stratify = y)

    # Standardize the training set based on training set mean and std
    mu = np.mean(X_par, 0)
    sigma = np.std(X_par, 0)
    X_par = (X_par - mu) / sigma
    X_par = (X_par - mu) / sigma

    innerKfold = StratifiedKFold(n_splits= K2, shuffle=True)
    for j, (train_index, test_index) in enumerate(innerKfold.split(X_par, y_par)):
        print('\nCrossvalidation inner fold: {0}/{1}'.format(j+1,K2)) 
        X_train = X_par.iloc[train_index]
        X_val = X_test.iloc[test_index]
        y_train = y_par.iloc[train_index]
        y_val = y_test.iloc[test_index]
        
        # Now we test our s models 
        for s, lambda_val in enumerate(lambda_interval):
            model  = log_reg(lambda_val)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_val)
            error = sum(y_pred != y_val)/len(y_pred)
            valerror_LG[s] = valerror_LG[s] + error 
        
    # We compute for each model its generalization error
    for s, model in enumerate(generalization_s):
        print(s)
        generalization_s[s] = generalization_s[s] + (len(X_val)/len(X_par)) * valerror_LG[s]

    # We select the best model and compute its test error with D_test
    min_index = np.argmin(generalization_s)
    print(min_index)
    print(lambda_interval[min_index])
    best_lambda = lambda_interval[min_index]
    
    model = log_reg(best_lambda)
    model.fit(X_par, y_par)
    y_pred = model.predict(X_test)
    error = sum(y_pred != y_test)/len(y_pred)

    # We store the final models data
    final_models_lambda.append(best_lambda)
    final_models_error.append(error)




    



Crossvalidation outer fold: 1/5

Crossvalidation inner fold: 1/5

Crossvalidation inner fold: 2/5

Crossvalidation inner fold: 3/5

Crossvalidation inner fold: 4/5

Crossvalidation inner fold: 5/5
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
17
8.858667904100832

Crossvalidation outer fold: 2/5

Crossvalidation inner fold: 1/5

Crossvalidation inner fold: 2/5

Crossvalidation inner fold: 3/5

Crossvalidation inner fold: 4/5

Crossvalidation inner fold: 5/5
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
18
29.763514416313132

Crossvalidation outer fold: 3/5

Crossvalidation inner fold: 1/5

Crossvalidation inner fold: 2/5

Crossvalidation inner fold: 3/5

Crossvalidation inner fold: 4/5

Crossvalidation inner fold: 5/5
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
16
2.6366508987303554

Crossvalidation outer fold: 4/5

Crossvalidation inner fold: 1/5

Crossvalidation inner fold: 2/5

Crossvalidation inner fold: 3/5

Crossvalidation inner fold: 4/5

Crossvalidation inner fol