## Clone github and import modules

In [None]:
# Clone github and import modules
!git clone https://github.com/jveenland/tm10007_ml.git

! pip install sklearn numpy matplotlib imbalanced-learn statsmodels

import zipfile
import statsmodels
import warnings
import statistics
import seaborn as sns
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt

from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_curve, average_precision_score, f1_score
from sklearn.compose import TransformedTargetRegressor
from scipy.stats import shapiro, lognorm, randint
from scipy.spatial.distance import cdist
from sklearn.model_selection import StratifiedShuffleSplit, learning_curve, GridSearchCV, StratifiedKFold, cross_val_score, KFold, train_test_split, RandomizedSearchCV, validation_curve 
from sklearn import preprocessing
from sklearn.preprocessing import RobustScaler, StandardScaler
from imblearn.over_sampling import RandomOverSampler
from sklearn import datasets as ds, model_selection, metrics, neighbors
from sklearn.linear_model import Lasso
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
from sklearn.linear_model import LinearRegression
from statsmodels.stats.multitest import multipletests
from sklearn.exceptions import ConvergenceWarning
from sklearn.metrics import make_scorer, f1_score
from sklearn.metrics import classification_report

# Classifiers
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_classif, SelectFromModel
from sklearn.decomposition import PCA
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB

fatal: destination path 'tm10007_ml' already exists and is not an empty directory.
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Defining functions

Missing data


In [None]:
def missing_data(X_design):
    """Drop fully-empty features/samples and impute remaining gaps with the median.

    Parameters
    ----------
    X_design : pandas.DataFrame
        Feature matrix with one row per sample and one column per feature.

    Returns
    -------
    pandas.DataFrame
        A new frame in which columns and rows consisting only of NaN are
        removed and every remaining NaN is replaced by the median of its
        column. The input frame is left untouched.
    """
    # Only features/samples that are *entirely* missing are removed. A
    # fraction-based cut-off (e.g. 50 %) would use ``dropna(thresh=...)``.
    X_design = X_design.dropna(axis='columns', how='all')
    X_design = X_design.dropna(axis='index', how='all')

    # Impute per feature in one frame-level call. The previous
    # ``X_design[column].fillna(..., inplace=True)`` was a chained in-place
    # update on a column selection, which pandas does not guarantee to write
    # back to the parent frame (it is a no-op under copy-on-write).
    return X_design.fillna(X_design.median())

def missing_data_zero_check(X_design):
    """Like ``missing_data``, but additionally treats zeros as missing values.

    Parameters
    ----------
    X_design : pandas.DataFrame
        Feature matrix with one row per sample and one column per feature.

    Returns
    -------
    pandas.DataFrame
        A new frame in which every 0 has first been turned into NaN, then
        all-NaN columns and all-NaN rows are removed and every remaining NaN
        is replaced by the median of its column. The input frame is left
        untouched.
    """
    # Assume a zero means "not measured" and mark it as missing.
    X_design = X_design.replace(0, np.nan)

    # Only features/samples that are *entirely* missing are removed. A
    # fraction-based cut-off (e.g. 50 %) would use ``dropna(thresh=...)``.
    X_design = X_design.dropna(axis='columns', how='all')
    X_design = X_design.dropna(axis='index', how='all')

    # Frame-level imputation replaces the chained per-column
    # ``fillna(..., inplace=True)``, which pandas does not guarantee to write
    # back to the parent frame (it is a no-op under copy-on-write).
    return X_design.fillna(X_design.median())

Outliers

In [None]:
def removing_outliers(X_design, k=3):
    """Winsorise outliers per feature using Tukey's fences.

    A value is flagged as an outlier when it lies outside
    ``[q1 - k*iqr, q3 + k*iqr]`` of its column. The fences are then
    recomputed from the non-flagged values only, and every flagged value that
    lies beyond those recomputed fences is replaced by the nearest one.

    Parameters
    ----------
    X_design : pandas.DataFrame
        Numeric feature matrix (rows are samples, columns are features).
    k : float, default 3
        Fence multiplier applied to the interquartile range.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and columns; the input is not
        modified.

    Notes
    -----
    The number of flagged cells is printed. Quartiles ignore NaN, so a column
    containing missing values is still screened for outliers.
    """
    if X_design.empty:
        print('The total number of outliers in dataset x is 0')
        return X_design.copy()

    # First pass: fences from all values, computed for every column at once
    # instead of one .loc lookup per cell.
    q1 = X_design.quantile(0.25)
    q3 = X_design.quantile(0.75)
    iqr = q3 - q1
    outliers = (X_design.lt(q1 - k * iqr, axis=1)
                | X_design.gt(q3 + k * iqr, axis=1))

    # Count each flagged cell exactly once (summing the row totals and the
    # column totals would count every outlier twice).
    total_count = int(outliers.to_numpy().sum())
    print(f'The total number of outliers in dataset x is {total_count}')

    # Second pass: fences from the inliers only, so the replacement value is
    # not inflated by the outliers themselves.
    inliers = X_design.mask(outliers)
    inlier_q1 = inliers.quantile(0.25)
    inlier_q3 = inliers.quantile(0.75)
    inlier_iqr = inlier_q3 - inlier_q1
    clipped = X_design.clip(lower=inlier_q1 - k * inlier_iqr,
                            upper=inlier_q3 + k * inlier_iqr,
                            axis=1)

    # Only cells flagged in the first pass are replaced; everything else
    # keeps its original value even if it falls outside the tighter fences.
    return X_design.mask(outliers, clipped)

For learning curves: PCA+univariate and LASSO

In [None]:
def PCA_univariate(X_train, y_train):
    """Reduce features with PCA, then keep the significant components.

    Steps: drop zero-variance features, project onto as many principal
    components as the data allows, run a univariate ANOVA F-test per
    component, and keep the components whose Benjamini-Hochberg corrected
    p-value is significant at alpha = 0.05.

    Parameters
    ----------
    X_train : array-like of shape (n_samples, n_features)
        Training feature matrix.
    y_train : array-like of shape (n_samples,)
        Class labels.

    Returns
    -------
    numpy.ndarray
        The selected principal-component scores of the training samples.
    """
    # Remove features with zero variance
    sel_vt = VarianceThreshold(threshold=0.0)
    X_vt = sel_vt.fit_transform(X_train, y_train)

    # PCA on the variance-filtered data. The filter output used to be
    # discarded and PCA was fitted on the unfiltered X_train instead.
    n_components = min(X_vt.shape)
    pca = PCA(n_components=n_components)
    X_pca = pca.fit_transform(X_vt)

    # Univariate test per component
    sel_kb = SelectKBest(f_classif, k='all')
    sel_kb.fit(X_pca, y_train)
    p_values = sel_kb.pvalues_

    # False-discovery-rate correction over all component p-values
    reject_fdr, _, _, _ = multipletests(pvals=p_values, alpha=0.05, method='fdr_bh')
    features_selected = np.flatnonzero(reject_fdr)
    print(features_selected.shape)

    return X_pca[:, features_selected]

def LASSO_feature(X_train_lasso, y_train):
    """Select the features that receive a non-zero Lasso coefficient.

    The regularisation strength ``alpha`` is chosen by a 5-fold grid search
    over 100 log-spaced values between 1e-5 and 10.

    Parameters
    ----------
    X_train_lasso : pandas.DataFrame or array-like of shape (n_samples, n_features)
        Training feature matrix.
    y_train : array-like of shape (n_samples,)
        Target values.

    Returns
    -------
    pandas.DataFrame or numpy.ndarray
        The input restricted to the selected feature columns (same container
        kind as the input when it is a DataFrame, otherwise an ndarray).
    """
    # Grid search over alpha
    alphas = np.logspace(-5, 1, 100)
    grid_search = GridSearchCV(Lasso(), param_grid={'alpha': alphas}, cv=5)
    grid_search.fit(X_train_lasso, y_train)

    # GridSearchCV refits the best candidate on the full training data by
    # default, so its coefficients can be used without fitting a second model.
    coef = grid_search.best_estimator_.coef_
    selected_features = np.flatnonzero(coef)

    # Select *columns* by position. Plain ``X[selected_features]`` picked rows
    # on an ndarray and looked up column labels on a DataFrame.
    if hasattr(X_train_lasso, 'iloc'):
        return X_train_lasso.iloc[:, selected_features]
    return np.asarray(X_train_lasso)[:, selected_features]

def plot_learning_curve(estimator, title, X, y, axes, ylim=None, cv=None,
                        n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
    """Draw an F1 learning curve for ``estimator`` on the given axes.

    The training and cross-validation F1 scores returned by
    ``learning_curve`` are averaged over the CV splits and plotted against the
    training-set size, each with a band of one standard deviation.

    Parameters: ``estimator`` is the model to evaluate, ``title`` the axes
    title, ``X``/``y`` the data, ``axes`` the matplotlib axes to draw on,
    ``ylim`` an optional (min, max) for the y-axis, and ``cv``, ``n_jobs`` and
    ``train_sizes`` are forwarded to ``learning_curve``.

    Returns the ``plt`` module.
    """
    # Axis decoration
    axes.set_title(title)
    if ylim is not None:
        axes.set_ylim(*ylim)
    axes.set_xlabel("Training examples")
    axes.set_ylabel("Score")

    sizes, fit_scores, val_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs,
        train_sizes=train_sizes, scoring=make_scorer(f1_score))

    # (mean, std, colour, legend label) per curve; statistics are taken over
    # the CV splits (axis 1).
    curves = [
        (np.mean(scores, axis=1), np.std(scores, axis=1), colour, label)
        for scores, colour, label in (
            (fit_scores, "r", "Training score"),
            (val_scores, "g", "Cross-validation score"),
        )
    ]

    axes.grid()
    # Bands first, then the lines on top of them.
    for mean, std, colour, _ in curves:
        axes.fill_between(sizes, mean - std, mean + std, alpha=0.1,
                          color=colour)
    for mean, _, colour, label in curves:
        axes.plot(sizes, mean, 'o-', color=colour, label=label)
    axes.legend(loc="best")

    return plt

## Load data

In [None]:
# Unpack the zipped ECG dataset into its own folder, then read the extracted
# CSV using its first column as the row index.
with zipfile.ZipFile('/content/tm10007_ml/ecg/ecg_data.zip', mode='r') as archive:
    archive.extractall(path='/content/tm10007_ml/ecg')

data = pd.read_csv('/content/tm10007_ml/ecg/ecg_data.csv', index_col=0)

## Explore data

In [None]:
# Dimensions of the loaded table (rows are samples; columns include 'label').
print(f'The number of samples: {data.shape[0]}')
print(f'The number of columns: {data.shape[1]}')

# Separate the features from the class label.
y = data['label']  # labels
X = data.drop(columns='label')  # samples and features

# Class balance: every non-zero label counts as an abnormal ECG.
total_abnormal_ECG = np.count_nonzero(y)
total_normal_ECG = y.size - total_abnormal_ECG
percentage_abnormal = total_abnormal_ECG / y.size * 100

print(f'{total_abnormal_ECG} people have an abnormal ECG')
print(f'{total_normal_ECG} people have a normal ECG')
print(f'The percentage of abnormal ECGs in this dataset is {percentage_abnormal} %')


The number of samples: 827
The number of columns: 9001
9 people have an abnormal ECG
41 people have a normal ECG
The percentage of abnormal ECGs in this dataset is 18.0 %


## Learning curves
The learning-curve cells are commented out; if you are interested, uncomment them to see what the curves look like.

Preparation of data

In [None]:
# # create X_train
# X_train, X_test_DO_NOT_FIT, y_train, y_test_DO_NOT_FIT = model_selection.train_test_split(X, y, test_size=0.1, stratify=y)

# # Balance data
# ros = RandomOverSampler(sampling_strategy='minority')
# X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
# X_train = X_resampled
# y_train = y_resampled

# # Scale the data to be normal
# scaler = preprocessing.RobustScaler()
# scaler.fit(X_train)
# X_train = scaler.transform(X_train)
# X_train=pd.DataFrame(X_train)

# # Create X_train for PCA+univariate and for LASSO
# X_train = X_train.copy()
# X_train_lasso = X_train.copy()

# clsfs_all = [neighbors.KNeighborsClassifier(n_neighbors=1), 
#          neighbors.KNeighborsClassifier(n_neighbors=5), 
#          neighbors.KNeighborsClassifier(n_neighbors=20), 
#          RandomForestClassifier(n_estimators=1, random_state=42),
#          RandomForestClassifier(n_estimators=5, random_state=42),
#          RandomForestClassifier(n_estimators=200, random_state=42),
#          SVC(kernel='rbf', C=10, gamma=0.1),
#          SVC(kernel='linear', C=10, gamma=0.1),
#          SVC(kernel='poly', C=10, gamma=0.1),
#          SVC(kernel='sigmoid', C=10, gamma=0.1),
#          GaussianNB(),
#          LinearRegression(),
#          QuadraticDiscriminantAnalysis()
#          ]


Learning curves for PCA+univariate

In [None]:
# X_train=PCA_univariate(X_train,y_train)

# num=0
# fig = plt.figure(figsize=(24,8*len(clsfs_all)))

# # Create a cross-validation object
# cv = StratifiedShuffleSplit(n_splits=10, test_size=0.25, random_state=0)

# # Now use the classifiers on all datasets
# for clf in clsfs_all:
#     title = str(type(clf))
#     ax = fig.add_subplot(7, 3, num + 1)
#     plot_learning_curve(clf, title, X_train, y_train, ax, ylim=(0.3, 1.01), cv=cv)
#     num += 1

Regularization for PCA + univariate and the resulting learning curves

In [None]:
# # Define the Lasso model
# lasso2 = Lasso()
# # Define the grid of alpha values to search over
# alphas2 = np.logspace(-5, 1, 100)
# # Define the grid search
# grid_search2 = GridSearchCV(lasso2, param_grid={'alpha': alphas2}, cv=5)
# # Fit the grid search to your training data
# grid_search2.fit(X_train, y_train)
# # Get the best alpha value from the grid search
# best_alpha2 = grid_search2.best_params_['alpha']
# # Create a new Lasso model using the best alpha value
# lasso2 = Lasso(alpha=best_alpha2)
# # Fit the Lasso model to your training data
# lasso2.fit(X_train, y_train)
# # Get the coefficients of the Lasso model
# coef2 = lasso2.coef_
# # Get the indices of the selected features
# selected_features2 = np.where(coef2 != 0)[0]
# #X_train_regularization = X_train[selected_features2]
# #X_train_regularization = X_train[:,selected_features2]
# X_train_regularization = X_train[:,selected_features2[:]]

# clsfs_regularization = [SVC(kernel='rbf', C=10, gamma=0.1),
#          SVC(kernel='poly', C=10, gamma=0.1),
#          RandomForestClassifier(n_estimators=5, random_state=42),
#          RandomForestClassifier(n_estimators=200, random_state=42)]

# num=0
# fig = plt.figure(figsize=(24,8*len(clsfs_regularization)))

# # Create a cross-validation object
# cv = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=0)

# # Now use the classifiers on all datasets
# for clf in clsfs_regularization:
#     title = str(type(clf))
#     ax = fig.add_subplot(7, 3, num + 1)
#     plot_learning_curve(clf, title, X_train_regularization, y_train, ax, ylim=(0.3, 1.01), cv=cv)
#     num += 1

Learning curves for LASSO

In [None]:
# num=0
# fig = plt.figure(figsize=(24,8*len(clsfs_all)))

# X_train_lasso=LASSO_feature(X_train_lasso,y_train)

# # Create a cross-validation object
# cv = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=0)

# # Now use the classifiers on all datasets
# for clf in clsfs_all:
#     title = str(type(clf))
#     ax = fig.add_subplot(7, 3, num + 1)
#     plot_learning_curve(clf, title, X_train_lasso, y_train, ax, ylim=(0.3, 1.01), cv=cv)
#     num += 1

## Pipeline *1*: PCA + univariate -> Gaussian Naive Bayes

In [None]:
# PIPELINE 1: nested cross-validation of
#   RobustScaler -> VarianceThreshold -> PCA -> FDR-corrected univariate
#   selection -> Gaussian Naive Bayes.
# The outer loop estimates generalisation (F1); the inner loop tunes
# hyperparameters on the design set only.

# Define outer and inner cross validation
outer_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
inner_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

# Create empty containers (for results)
f1_design_1_before = []
f1_test_1_before = []
f1_design_1_after = []
f1_test_1_after = []
results_1 = pd.DataFrame()
best_pipe_1 = []
best_hp_1_before = pd.DataFrame()
best_hp_1 = pd.DataFrame()
y_test_1_all = []
y_pred_1_all = []

# Per-fold rows are gathered in plain lists and converted to DataFrames,
# because DataFrame.append was removed in pandas 2.0.
results_1_frames = []
best_hp_1_before_rows = []
best_hp_1_rows = []

for design_index, test_index in outer_cv.split(X, y):
    # split() yields positional indices, so rows are selected with .iloc
    # (label-based lookup only works when the index happens to be 0..n-1).
    X_design = X.iloc[design_index]
    y_design = y.iloc[design_index]

    X_test = X.iloc[test_index]
    y_test_1 = y.iloc[test_index].tolist()
    y_test_1_all.append(y_test_1)

    # remove outliers
    X_design = removing_outliers(X_design)
    X_test = removing_outliers(X_test)

    # Correct missing data
    X_design = missing_data(X_design)
    X_test = missing_data(X_test)

    # balance the classes, so design set consists of 50% normal and 50% abnormal ECG's
    ros = RandomOverSampler(sampling_strategy='minority')
    X_resampled, y_resampled = ros.fit_resample(X_design, y_design)
    X_design = X_resampled
    y_design = y_resampled

    print(f'shape after balancing {X_design.shape}')

    # Define pipeline 1 (preprocessing part)
    pipeline_1a = Pipeline([
        ('scaler', RobustScaler()),
        ('var_threshold', VarianceThreshold(threshold=0.0)),
        ('pca', PCA(n_components=0.5)),
    ])

    param_grid_1a = {
        'pca__n_components': [0.5],  # other candidates: 0.75, 0.9, 0.95, 0.99
        }

    # Perform search with inner cross-validation, part 1
    # NOTE(review): the grid holds a single candidate, so n_iter=10 cannot be
    # reached. This pipeline ends in PCA and therefore presumably has no
    # predict() for the 'f1' scorer -- confirm whether a plain fit is meant.
    rand_search_1a = RandomizedSearchCV(pipeline_1a, param_distributions=param_grid_1a, n_iter=10, cv=inner_cv, scoring='f1', n_jobs=-1)
    # NOTE(review): the original author asked whether this should be fit_transform.
    rand_search_1a.fit(X_design, y_design)
    X_design = rand_search_1a.best_estimator_.transform(X_design)
    X_test = rand_search_1a.best_estimator_.transform(X_test)

    # Access and store the best set of hyperparameters of each outer-CV loop in a DataFrame
    best_hp_1_before_rows.append(rand_search_1a.best_params_)
    best_hp_1_before = pd.DataFrame(best_hp_1_before_rows)
    print(best_hp_1_before)
    print(f'shape of X_design after pca {X_design.shape}')

    # Univariate feature selection on the PCA components
    sel_kb = SelectKBest(f_classif, k='all')
    sel_kb.fit(X_design, y_design)
    p_values = sel_kb.pvalues_

    # Keep components that stay significant after Benjamini-Hochberg correction
    reject_fdr, pvals_fdr, _, _ = multipletests(pvals=p_values, alpha=0.05, method='fdr_bh')
    features_selected = np.array(np.where(reject_fdr)[0])
    print(f'size of features selected{features_selected.shape}')
    X_design = X_design[:, features_selected]
    X_test = X_test[:, features_selected]

    # pipeline 1b (classifier part)
    pipeline_1b = Pipeline([
        ('clf', GaussianNB())
    ])

    # Define scores BEFORE hyperparameter tuning
    pipeline_1b.fit(X_design, y_design)

    y_pred_design_1 = pipeline_1b.predict(X_design)
    f1_design_1_bef = f1_score(y_design, y_pred_design_1)
    f1_design_1_before.append(f1_design_1_bef)

    y_pred_test_1_before = pipeline_1b.predict(X_test)
    f1_test_1_bef = f1_score(y_test_1, y_pred_test_1_before)
    f1_test_1_before.append(f1_test_1_bef)

    # Define hyperparameters of pipeline 1
    param_grid_1b = {
    'clf__var_smoothing': np.logspace(0,-9, num=100),
    }

    print(f'after feature selection: {X_design.shape}')

    # Perform search with inner cross-validation, part 2
    model_1 = RandomizedSearchCV(pipeline_1b, param_distributions=param_grid_1b, n_iter=50, cv=inner_cv, scoring='f1', n_jobs=-1)
    model_1.fit(X_design, y_design)
    results = pd.DataFrame(model_1.cv_results_)
    results_1_frames.append(results)

    # Define scores AFTER hyperparameter tuning
    y_pred_design_1_after = model_1.predict(X_design)
    f1_design_1_aft = f1_score(y_design, y_pred_design_1_after)
    f1_design_1_after.append(f1_design_1_aft)

    y_pred_test_1_after = model_1.predict(X_test)
    y_pred_1_all.append(y_pred_test_1_after)

    f1_test_1_aft = f1_score(y_test_1, y_pred_test_1_after)
    f1_test_1_after.append(f1_test_1_aft)

    # Access and store the best set of hyperparameters of each outer-CV loop in a DataFrame
    best_hp_1_rows.append(model_1.best_params_)
    best_hp_1 = pd.DataFrame(best_hp_1_rows)
    # Stores the optimum model in best_pipe
    best_pipe_1.append(model_1.best_estimator_)

# Save results of inner CV into .csv file
results_1 = pd.concat(results_1_frames, ignore_index=True)
results_1.to_csv('results_1.csv', index=False)

print(f'Mean and std of F1 scores of pipeline 1: {statistics.mean(f1_test_1_after)} +/- {statistics.stdev(f1_test_1_after)}')
print(f'The optimal hyperparameters per split: {best_hp_1}')
print(f'The best pipes per split {best_pipe_1}')
print(f'The design F1 scores before tuning {f1_design_1_before}')
print(f'The test F1 scores before tuning {f1_test_1_before}')
print(f'The design F1 scores after tuning {f1_design_1_after}')
print(f'The test F1 scores afer tuning {f1_test_1_after}')

data_1 = [f1_design_1_after, f1_test_1_after]
sns.boxplot(data=data_1)
plt.title('Boxplot F1 scores design and test pipeline 1')
plt.xlabel('design and test')
plt.ylabel('F1 score score')
plt.show()

# Compute one precision-recall curve per outer fold.
# NOTE(review): y_pred_1_all holds predict() outputs (class labels), not
# scores, so each curve has very few operating points -- consider using
# predict_proba here.
for i, (y_true_fold, y_pred_fold) in enumerate(zip(y_test_1_all, y_pred_1_all)):
    precision, recall, thresholds = precision_recall_curve(y_true_fold, y_pred_fold)
    auc = average_precision_score(y_true_fold, y_pred_fold)  # average precision, despite the name

    # Plot the precision-recall curve of this fold
    plt.plot(recall, precision, lw=2, label='PR curve it. %d (AP = %0.2f)' % (i+1, auc))

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-recall curve for all rows')
plt.legend(loc="lower right", fontsize=8)
plt.show()

## Pipeline 2: PCA + Univariate -> Quadratic Discriminant analysis

In [None]:
# PIPELINE 2: nested cross-validation of
#   RobustScaler -> VarianceThreshold -> PCA -> FDR-corrected univariate
#   selection -> Quadratic Discriminant Analysis.
# Define outer and inner cross validation
outer_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
inner_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

# Creating empty containers.
# The classifier is not tuned in this pipeline, so the "*_before" score
# lists, best_hp_2, best_pipe_2 and results_2 are never filled below.
f1_design_2_before = []
f1_test_2_before = []
f1_design_2_after = []
f1_test_2_after = []
results_2 = pd.DataFrame()
best_pipe_2 = []
best_hp_2_before = pd.DataFrame()
best_hp_2 = pd.DataFrame()
y_test_2_all = []
y_pred_2_all = []

# Per-fold rows are gathered in a plain list and converted to a DataFrame,
# because DataFrame.append was removed in pandas 2.0.
best_hp_2_before_rows = []

for design_index, test_index in outer_cv.split(X, y):
    # split() yields positional indices, so rows are selected with .iloc
    # (label-based lookup only works when the index happens to be 0..n-1).
    X_design = X.iloc[design_index]
    print(f'Size_X_design {X_design.shape}') # print size of X_design
    y_design = y.iloc[design_index]

    X_test = X.iloc[test_index]
    y_test_2 = y.iloc[test_index].tolist()
    y_test_2_all.append(y_test_2)
    print(f'Size X_test {X_test.shape}')     # print size of X_test

    # remove outliers
    X_design = removing_outliers(X_design)
    X_test = removing_outliers(X_test)

    # Correct missing data
    X_design = missing_data(X_design)
    X_test = missing_data(X_test)

    # balance the classes, so design set consists of 50% normal and 50% abnormal ECG's
    ros = RandomOverSampler(sampling_strategy='minority')
    X_resampled, y_resampled = ros.fit_resample(X_design, y_design)
    X_design = X_resampled
    y_design = y_resampled

    print(f'shape after balancing {X_design.shape}')

    # Define pipeline 2 (preprocessing part)
    pipeline_2a = Pipeline([
        ('scaler', RobustScaler()),
        ('var_threshold', VarianceThreshold(threshold=0.0)),
        ('pca', PCA(n_components=0.5)),
    ])

    param_grid_2a = {
        'pca__n_components': [0.5],  # other candidates: 0.75, 0.9, 0.95, 0.99
        }

    # Perform search with inner cross-validation, part 1
    # NOTE(review): the grid holds a single candidate, so n_iter=10 cannot be
    # reached. This pipeline ends in PCA and therefore presumably has no
    # predict() for the 'f1' scorer -- confirm whether a plain fit is meant.
    rand_search_2a = RandomizedSearchCV(pipeline_2a, param_distributions=param_grid_2a, n_iter=10, cv=inner_cv, scoring='f1', n_jobs=-1)
    # NOTE(review): the original author asked whether this should be fit_transform.
    rand_search_2a.fit(X_design, y_design)
    X_design = rand_search_2a.best_estimator_.transform(X_design)
    X_test = rand_search_2a.best_estimator_.transform(X_test)

    # Access and store the best set of hyperparameters of each outer-CV loop in a DataFrame
    best_hp_2_before_rows.append(rand_search_2a.best_params_)
    best_hp_2_before = pd.DataFrame(best_hp_2_before_rows)
    print(best_hp_2_before)
    print(f'shape of X_design after pca {X_design.shape}')

    # Univariate feature selection on the PCA components
    sel_kb = SelectKBest(f_classif, k='all')
    sel_kb.fit(X_design, y_design)
    p_values = sel_kb.pvalues_

    # Keep components that stay significant after Benjamini-Hochberg correction
    reject_fdr, pvals_fdr, _, _ = multipletests(pvals=p_values, alpha=0.05, method='fdr_bh')
    features_selected = np.array(np.where(reject_fdr)[0])
    print(f'size of features selected{features_selected.shape}')
    X_design = X_design[:, features_selected]
    X_test = X_test[:, features_selected]

    # Fit the classifier on the selected features (default settings, no tuning)
    model_2 = QuadraticDiscriminantAnalysis()
    model_2.fit(X_design, y_design)

    # Scores of the fitted classifier on design and test data
    y_pred_design_2_after = model_2.predict(X_design)
    f1_design_2_aft = f1_score(y_design, y_pred_design_2_after)
    f1_design_2_after.append(f1_design_2_aft)

    y_pred_test_2_after = model_2.predict(X_test)
    y_pred_2_all.append(y_pred_test_2_after)

    f1_test_2_aft = f1_score(y_test_2, y_pred_test_2_after)
    f1_test_2_after.append(f1_test_2_aft)

# The summary line used to be labelled "pipeline 1" (copy-paste slip).
print(f'Mean and std of F1 scores of pipeline 2: {statistics.mean(f1_test_2_after)} +/- {statistics.stdev(f1_test_2_after)}')
print(f'The optimal hyperparameters per split: {best_hp_2}')
print(f'The best pipes per split {best_pipe_2}')
print(f'The design F1 scores before tuning {f1_design_2_before}')
print(f'The test F1 scores before tuning {f1_test_2_before}')
print(f'The design F1 scores after tuning {f1_design_2_after}')
print(f'The test F1 scores afer tuning {f1_test_2_after}')

data_2 = [f1_design_2_after, f1_test_2_after]
sns.boxplot(data=data_2)
plt.title('Boxplot F1 scores design and test pipeline 2')
plt.xlabel('design and test')
plt.ylabel('F1 score score')
plt.show()

# Compute one precision-recall curve per outer fold.
# NOTE(review): y_pred_2_all holds predict() outputs (class labels), not
# scores, so each curve has very few operating points -- consider using
# predict_proba here.
for i, (y_true_fold, y_pred_fold) in enumerate(zip(y_test_2_all, y_pred_2_all)):
    precision, recall, thresholds = precision_recall_curve(y_true_fold, y_pred_fold)
    auc = average_precision_score(y_true_fold, y_pred_fold)  # average precision, despite the name

    # Plot the precision-recall curve of this fold
    plt.plot(recall, precision, lw=2, label='PR curve it. %d (AP = %0.2f)' % (i+1, auc))

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-recall curve for all rows pipeline 2')
plt.legend(loc="lower right", fontsize=8)
plt.show()

## Pipeline 3: PCA + Univariate -> linear SVM

In [None]:
# PIPELINE 3: nested cross-validation of
#   RobustScaler -> VarianceThreshold -> PCA -> FDR-corrected univariate
#   selection -> SVC with a linear kernel.
# Define outer and inner cross validation
outer_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
inner_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

# Creating empty containers
f1_design_3_before = []
f1_test_3_before = []
f1_design_3_after = []
f1_test_3_after = []
results_3 = pd.DataFrame()
best_pipe_3 = []
best_hp_3_before = pd.DataFrame()
best_hp_3 = pd.DataFrame()
y_test_3_all = []
y_pred_3_all = []

# Per-fold rows are gathered in plain lists and converted to DataFrames,
# because DataFrame.append was removed in pandas 2.0.
results_3_frames = []
best_hp_3_before_rows = []
best_hp_3_rows = []

for design_index, test_index in outer_cv.split(X, y):
    # split() yields positional indices, so rows are selected with .iloc
    # (label-based lookup only works when the index happens to be 0..n-1).
    X_design = X.iloc[design_index]
    print(f'Size_X_design {X_design.shape}') # print size of X_design
    y_design = y.iloc[design_index]

    X_test = X.iloc[test_index]
    y_test_3 = y.iloc[test_index].tolist()
    y_test_3_all.append(y_test_3)
    print(f'Size X_test {X_test.shape}')     # print size of X_test

    # Remove outliers, then correct missing data
    X_design = removing_outliers(X_design)
    X_test = removing_outliers(X_test)
    X_design = missing_data(X_design)
    X_test = missing_data(X_test)

    # balance the classes, so design set consists of 50% normal and 50% abnormal ECG's
    ros = RandomOverSampler(sampling_strategy='minority')
    X_resampled, y_resampled = ros.fit_resample(X_design, y_design)
    X_design = X_resampled
    y_design = y_resampled
    print(f'shape after balancing {X_design.shape}')

    # Define pipeline 3 (preprocessing part)
    pipeline_3a = Pipeline([
        ('scaler', RobustScaler()),
        ('var_threshold', VarianceThreshold(threshold=0.0)),
        ('pca', PCA(n_components=0.5)),
    ])

    param_grid_3a = {
        'pca__n_components': [0.5],  # other candidates: 0.75, 0.9, 0.95, 0.99
        }

    # Perform search with inner cross-validation, part 1
    # NOTE(review): the grid holds a single candidate, so n_iter=10 cannot be
    # reached. This pipeline ends in PCA and therefore presumably has no
    # predict() for the 'f1' scorer -- confirm whether a plain fit is meant.
    rand_search_3a = RandomizedSearchCV(pipeline_3a, param_distributions=param_grid_3a, n_iter=10, cv=inner_cv, scoring='f1', n_jobs=-1)
    # NOTE(review): the original author asked whether this should be fit_transform.
    rand_search_3a.fit(X_design, y_design)
    X_design = rand_search_3a.best_estimator_.transform(X_design)
    X_test = rand_search_3a.best_estimator_.transform(X_test)

    # Access and store the best set of hyperparameters of each outer-CV loop in a DataFrame
    best_hp_3_before_rows.append(rand_search_3a.best_params_)
    best_hp_3_before = pd.DataFrame(best_hp_3_before_rows)
    print(f'shape of X_design after pca {X_design.shape}')

    # univariate feature selection
    sel_kb = SelectKBest(f_classif, k='all')
    sel_kb.fit(X_design, y_design)
    p_values = sel_kb.pvalues_

    # Keep components that stay significant after Benjamini-Hochberg correction
    reject_fdr, pvals_fdr, _, _ = multipletests(pvals=p_values, alpha=0.05, method='fdr_bh')
    features_selected = np.array(np.where(reject_fdr)[0])
    print(f'size of features selected{features_selected.shape}')
    X_design = X_design[:, features_selected]
    X_test = X_test[:, features_selected]

    print(f'shape of X_design after univariate: {X_design.shape}')

    # pipeline 3b (classifier part)
    pipeline_3b = Pipeline([
        ('clf', SVC(kernel='linear'))
    ])

    # Define scores BEFORE hyperparameter tuning
    pipeline_3b.fit(X_design, y_design)

    y_pred_design_3 = pipeline_3b.predict(X_design)
    f1_design_3_bef = f1_score(y_design, y_pred_design_3)
    f1_design_3_before.append(f1_design_3_bef)

    y_pred_test_3_before = pipeline_3b.predict(X_test)
    f1_test_3_bef = f1_score(y_test_3, y_pred_test_3_before)
    f1_test_3_before.append(f1_test_3_bef)

    # Define hyperparameters of pipeline 3
    param_grid_3b = {
    'clf__C': np.logspace(-3, 1, 20),
    }

    # Perform search with inner cross-validation, part 2
    model_3 = RandomizedSearchCV(pipeline_3b, param_distributions=param_grid_3b, cv=inner_cv, scoring='f1', n_iter=50, n_jobs=-1) # optimize parameters
    model_3.fit(X_design, y_design)

    results = pd.DataFrame(model_3.cv_results_)
    results_3_frames.append(results)

    # Define scores AFTER hyperparameter tuning
    y_pred_design_3_after = model_3.predict(X_design)
    f1_design_3_aft = f1_score(y_design, y_pred_design_3_after)
    f1_design_3_after.append(f1_design_3_aft)

    y_pred_test_3_after = model_3.predict(X_test)
    y_pred_3_all.append(y_pred_test_3_after)

    f1_test_3_aft = f1_score(y_test_3, y_pred_test_3_after)
    f1_test_3_after.append(f1_test_3_aft)

    # Access and store the best set of hyperparameters of each outer-CV loop in a DataFrame
    best_hp_3_rows.append(model_3.best_params_)
    best_hp_3 = pd.DataFrame(best_hp_3_rows)
    # Stores the optimum model in best_pipe
    best_pipe_3.append(model_3.best_estimator_)

# Save results of inner CV into .csv file
results_3 = pd.concat(results_3_frames, ignore_index=True)
results_3.to_csv('results_3.csv', index=False)

print(f'Mean and std of F1 scores of pipeline 3: {statistics.mean(f1_test_3_after)} +/- {statistics.stdev(f1_test_3_after)}')
print(f'The optimal hyperparameters per split: {best_hp_3}')
print(f'The best pipes per split {best_pipe_3}')
print(f'The design F1 scores before tuning {f1_design_3_before}')
print(f'The test F1 scores before tuning {f1_test_3_before}')
print(f'The design F1 scores after tuning {f1_design_3_after}')
print(f'The test F1 scores afer tuning {f1_test_3_after}')

data_3 = [f1_design_3_after, f1_test_3_after]
sns.boxplot(data=data_3)
plt.title('Boxplot F1 scores design and test pipeline 3')
plt.xlabel('design and test')
plt.ylabel('F1 score score')
plt.show()

# Compute one precision-recall curve per outer fold.
# NOTE(review): y_pred_3_all holds predict() outputs (class labels), not
# scores, so each curve has very few operating points -- consider using
# decision_function here.
for i, (y_true_fold, y_pred_fold) in enumerate(zip(y_test_3_all, y_pred_3_all)):
    precision, recall, thresholds = precision_recall_curve(y_true_fold, y_pred_fold)
    auc = average_precision_score(y_true_fold, y_pred_fold)  # average precision, despite the name

    # Plot the precision-recall curve of this fold
    plt.plot(recall, precision, lw=2, label='PR curve it. %d (AP = %0.2f)' % (i+1, auc))

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-recall curve for all rows')
plt.legend(loc="lower right", fontsize=8)
plt.show()

## Pipeline 4: PCA + Univariate -> KNN

In [None]:
# PIPELINE 4: RobustScaler -> VarianceThreshold -> PCA -> univariate (FDR) selection -> KNN
# Nested cross-validation: every outer fold is split into a design and a test part,
# the KNN hyperparameters are tuned on the design part with the inner CV, and the
# tuned model is scored on the untouched test part.
RANDOM_STATE = 42  # same seed as the CV splitters, so a rerun reproduces the same numbers

# Define outer and inner cross validation
outer_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=RANDOM_STATE)
inner_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=RANDOM_STATE)

# Per-fold accumulators
f1_design_4_before = []
f1_test_4_before = []
f1_design_4_after = []
f1_test_4_after = []
best_pipe_4 = []
y_test_4_all = []    # true test labels, one list per outer fold
y_pred_4_all = []    # predicted test labels, one array per outer fold
y_score_4_all = []   # predicted positive-class probabilities, one array per outer fold
# Rows are collected in plain lists and turned into DataFrames after the loop:
# DataFrame.append was removed in pandas 2.0.
cv_results_4 = []
hp_rows_4_before = []
hp_rows_4 = []

for design_index, test_index in outer_cv.split(X, y):
    X_design = X.transpose()[design_index]
    X_design = X_design.transpose()
    print(f'Size_X_design {X_design.shape}') # print size of X_design
    y_design = y[design_index]

    X_test = X.transpose()[test_index]
    X_test = X_test.transpose()
    y_test_4 = y[test_index].tolist()
    y_test_4_all.append(y_test_4)
    print(f'Size X_test {X_test.shape}')     # print size of X_test

    # remove outliers
    # NOTE(review): removing_outliers / missing_data are applied to the test fold
    # independently of the design fold - confirm they do not need design-set statistics.
    X_design = removing_outliers(X_design)
    X_test = removing_outliers(X_test)

    # Correct missing data
    X_design = missing_data(X_design)
    X_test = missing_data(X_test)

    # balance the classes, so design set consists of 50% normal and 50% abnormal ECG's
    ros = RandomOverSampler(sampling_strategy='minority', random_state=RANDOM_STATE)
    X_design, y_design = ros.fit_resample(X_design, y_design)
    print(f'shape after balancing {X_design.shape}')

    # Define pipeline 4a (unsupervised preprocessing)
    # It is fitted directly instead of through a hyperparameter search: the pipeline
    # has no predictor, so an F1-scored search cannot rank candidates, and there is
    # only one candidate setting anyway.
    params_4a = {'pca__n_components': 0.5}
    pipeline_4a = Pipeline([
        ('scaler', RobustScaler()),
        ('var_threshold', VarianceThreshold(threshold=0.0)),
        ('pca', PCA()),
    ])
    pipeline_4a.set_params(**params_4a)
    pipeline_4a.fit(X_design, y_design)
    X_design = pipeline_4a.transform(X_design)
    X_test = pipeline_4a.transform(X_test)

    # Store the preprocessing setting used in this outer-CV loop
    hp_rows_4_before.append(params_4a)
    print(f'shape of X_design after pca {X_design.shape}')

    # univariate feature selection: ANOVA F-test p-values, Benjamini-Hochberg FDR at 5%
    sel_kb = SelectKBest(f_classif, k='all')
    sel_kb.fit(X_design, y_design)
    p_values = sel_kb.pvalues_

    reject_fdr, pvals_fdr, _, _ = multipletests(pvals=p_values, alpha=0.05, method='fdr_bh')
    features_selected = np.flatnonzero(reject_fdr)
    print(f'size of features selected{features_selected.shape}')
    if features_selected.size == 0:
        # KNN cannot be fitted on zero features; keep every component for this fold
        print('no feature passed the FDR threshold, keeping all PCA components')
        features_selected = np.arange(X_design.shape[1])
    X_design = X_design[:, features_selected]
    X_test = X_test[:, features_selected]

    print(f'shape of X_design after univariate: {X_design.shape}')

    # pipeline 4b
    pipeline_4b = Pipeline([
        ('clf', KNeighborsClassifier())
    ])

    # Define scores BEFORE hyperparameter tuning (default KNN settings)
    pipeline_4b.fit(X_design, y_design)

    y_pred_design_4 = pipeline_4b.predict(X_design)
    f1_design_4_before.append(f1_score(y_design, y_pred_design_4))

    y_pred_test_4_before = pipeline_4b.predict(X_test)
    f1_test_4_before.append(f1_score(y_test_4, y_pred_test_4_before))

    # Define hyperparameters of pipeline 4
    param_grid_4b = {
        'clf__n_neighbors': list(range(4, 26, 2)),
        'clf__p': [1, 2],
        'clf__leaf_size': np.arange(1, 26, 1),
    }

    # Randomized search with inner cross-validation to tune the KNN hyperparameters
    model_4 = RandomizedSearchCV(pipeline_4b, param_distributions=param_grid_4b, cv=inner_cv,
                                 scoring='f1', n_iter=50, n_jobs=-1, random_state=RANDOM_STATE)
    model_4.fit(X_design, y_design)

    cv_results_4.append(pd.DataFrame(model_4.cv_results_))

    # Define scores AFTER hyperparameter tuning
    y_pred_design_4_after = model_4.predict(X_design)
    f1_design_4_after.append(f1_score(y_design, y_pred_design_4_after))

    y_pred_test_4_after = model_4.predict(X_test)
    y_pred_4_all.append(y_pred_test_4_after)
    # Probability of the second (larger) class label, used for the PR curves below;
    # assumes the positive class is the larger label, as f1_score's default pos_label=1 implies
    y_score_4_all.append(model_4.predict_proba(X_test)[:, 1])

    f1_test_4_after.append(f1_score(y_test_4, y_pred_test_4_after))

    # Store the best set of hyperparameters of this outer-CV loop
    hp_rows_4.append(model_4.best_params_)
    # Stores the optimum model in best_pipe
    best_pipe_4.append(model_4.best_estimator_)

# Combine the per-fold records into DataFrames
results_4 = pd.concat(cv_results_4, ignore_index=True)
best_hp_4_before = pd.DataFrame(hp_rows_4_before)
best_hp_4 = pd.DataFrame(hp_rows_4)

# Save results of inner CV into .csv file
results_4.to_csv('results_4.csv', index=False)

print(f'Mean and std of F1 scores of pipeline 4: {statistics.mean(f1_test_4_after)} +/- {statistics.stdev(f1_test_4_after)}')
print(f'The optimal hyperparameters per split: {best_hp_4}')
print(f'The best pipes per split {best_pipe_4}')
print(f'The design F1 scores before tuning {f1_design_4_before}')
print(f'The test F1 scores before tuning {f1_test_4_before}')
print(f'The design F1 scores after tuning {f1_design_4_after}')
print(f'The test F1 scores afer tuning {f1_test_4_after}')

data_4 = [f1_design_4_after, f1_test_4_after]
sns.boxplot(data=data_4)
plt.title('Boxplot F1 scores design and test pipeline 4')
plt.xlabel('design and test')
plt.ylabel('F1 score score')
plt.show()

# Flattened labels over all outer folds (input for a confusion matrix).
# The per-fold containers are flattened directly and left untouched: stacking them
# with np.array fails when the outer folds do not all have the same size.
y_test_4_all_confusion = [label for fold in y_test_4_all for label in fold]
y_pred_4_all_confusion = [label for fold in y_pred_4_all for label in fold.tolist()]

# Compute and plot the precision-recall curve of every outer fold,
# based on predicted probabilities rather than hard labels
for fold_number, (y_true_fold, y_score_fold) in enumerate(zip(y_test_4_all, y_score_4_all), start=1):
    precision, recall, thresholds = precision_recall_curve(y_true_fold, y_score_fold)
    avg_precision = average_precision_score(y_true_fold, y_score_fold)

    # Plot the PR curve of this fold
    plt.plot(recall, precision, lw=2, label='PR curve it. %d (AP = %0.2f)' % (fold_number, avg_precision))

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-recall curve for all rows')
plt.legend(loc="lower right", fontsize=8)
plt.show()

## Pipeline 5: LASSO -> KNN

In [None]:
# PIPELINE 5: RobustScaler -> LASSO feature selection -> KNN
# Nested cross-validation: every outer fold is split into a design and a test part,
# hyperparameters are tuned on the design part with the inner CV, and the tuned
# model is scored on the untouched test part.
RANDOM_STATE = 42  # same seed as the CV splitters, so a rerun reproduces the same numbers

# Define outer and inner cross validation
outer_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=RANDOM_STATE)
inner_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=RANDOM_STATE)

# Per-fold accumulators
f1_design_5_before = []
f1_test_5_before = []
f1_design_5_after = []
f1_test_5_after = []
best_pipe_5 = []
y_test_5_all = []    # true test labels, one list per outer fold
y_pred_5_all = []    # predicted test labels, one array per outer fold
y_score_5_all = []   # predicted positive-class probabilities, one array per outer fold
# Rows are collected in plain lists and turned into DataFrames after the loop:
# DataFrame.append was removed in pandas 2.0.
cv_results_5 = []
hp_rows_5_before = []
hp_rows_5 = []

for design_index, test_index in outer_cv.split(X, y):
    X_design = X.transpose()[design_index]
    X_design = X_design.transpose()
    print(f'Size_X_design {X_design.shape}') # print size of X_design
    y_design = y[design_index]

    X_test = X.transpose()[test_index]
    X_test = X_test.transpose()
    y_test_5 = y[test_index].tolist()
    y_test_5_all.append(y_test_5)
    print(f'Size X_test {X_test.shape}')     # print size of X_test

    # remove outliers
    # NOTE(review): removing_outliers / missing_data are applied to the test fold
    # independently of the design fold - confirm they do not need design-set statistics.
    X_design = removing_outliers(X_design)
    X_test = removing_outliers(X_test)

    # Correct missing data
    X_design = missing_data(X_design)
    X_test = missing_data(X_test)

    # balance the classes, so design set consists of 50% normal and 50% abnormal ECG's
    ros = RandomOverSampler(sampling_strategy='minority', random_state=RANDOM_STATE)
    X_design, y_design = ros.fit_resample(X_design, y_design)

    print(f'shape after balancing {X_design.shape}')

    # Scaling the data (fitted on the design set only)
    scaler = RobustScaler()
    X_design = scaler.fit_transform(X_design)
    X_test = scaler.transform(X_test)

    # Define pipeline 5a
    pipeline_5a = Pipeline([
        ('lasso', Lasso()),
    ])
    # Define hyperparameters of pipeline 5a
    param_grid_5a = {
        'lasso__alpha': np.logspace(-10, 1, 100),
    }

    # Perform randomized search with inner cross-validation to find best alpha.
    # Lasso is a regressor with continuous output, which the classification metric
    # F1 cannot score, so alpha is selected on mean squared error instead.
    rand_search_5a = RandomizedSearchCV(pipeline_5a, param_distributions=param_grid_5a, n_iter=50, cv=inner_cv,
                                        scoring='neg_mean_squared_error', n_jobs=-1, random_state=RANDOM_STATE)
    rand_search_5a.fit(X_design, y_design)

    # The search already refitted the best Lasso on the whole design set
    lasso = rand_search_5a.best_estimator_.named_steps['lasso']

    # Keep only the features with a non-zero Lasso coefficient
    coef = lasso.coef_
    selected_features = np.flatnonzero(coef)
    if selected_features.size == 0:
        # KNN cannot be fitted on zero features; keep every feature for this fold
        print('LASSO removed every feature, keeping all features')
        selected_features = np.arange(X_design.shape[1])
    X_design = X_design[:, selected_features]
    X_test = X_test[:, selected_features]
    print(f'This is the size of X_design after LASSO: {X_design.shape}')

    # Store the best alpha of this outer-CV loop
    hp_rows_5_before.append(rand_search_5a.best_params_)

    # Define pipeline 5b
    pipeline_5b = Pipeline([
        ('clf', KNeighborsClassifier())
    ])

    # Define scores BEFORE hyperparameter tuning (default KNN settings)
    pipeline_5b.fit(X_design, y_design)

    y_pred_design_5 = pipeline_5b.predict(X_design)
    f1_design_5_before.append(f1_score(y_design, y_pred_design_5))

    y_pred_test_5_before = pipeline_5b.predict(X_test)
    f1_test_5_before.append(f1_score(y_test_5, y_pred_test_5_before))

    # Define hyperparameters of pipeline 5b
    param_grid_5b = {
        'clf__n_neighbors': list(range(4, 26, 2)),
        'clf__p': [1, 2],
        'clf__leaf_size': np.arange(1, 26, 1),
    }

    # Randomized search with inner cross-validation to tune the KNN hyperparameters
    model_5 = RandomizedSearchCV(pipeline_5b, param_distributions=param_grid_5b, n_iter=50, cv=inner_cv,
                                 scoring='f1', n_jobs=-1, random_state=RANDOM_STATE)
    model_5.fit(X_design, y_design)

    # Storing results cross-validation
    cv_results_5.append(pd.DataFrame(model_5.cv_results_))

    # Define scores AFTER hyperparameter tuning
    y_pred_design_5_after = model_5.predict(X_design)
    f1_design_5_after.append(f1_score(y_design, y_pred_design_5_after))

    y_pred_test_5_after = model_5.predict(X_test)
    y_pred_5_all.append(y_pred_test_5_after)
    # Probability of the second (larger) class label, used for the PR curves below;
    # assumes the positive class is the larger label, as f1_score's default pos_label=1 implies
    y_score_5_all.append(model_5.predict_proba(X_test)[:, 1])

    f1_test_5_after.append(f1_score(y_test_5, y_pred_test_5_after))

    # Store the best set of hyperparameters of this outer-CV loop
    hp_rows_5.append(model_5.best_params_)
    # Stores the optimum model in best_pipe
    best_pipe_5.append(model_5.best_estimator_)

# Combine the per-fold records into DataFrames
results_5 = pd.concat(cv_results_5, ignore_index=True)
best_hp_5_before = pd.DataFrame(hp_rows_5_before)
best_hp_5 = pd.DataFrame(hp_rows_5)

# Save results of inner CV into .csv file
results_5.to_csv('results_5.csv', index=False)

print(f'Mean and std of F1 scores of pipeline 5: {statistics.mean(f1_test_5_after)} +/- {statistics.stdev(f1_test_5_after)}')
print(f'The optimal hyperparameters per split: {best_hp_5}')
print(f'The best pipes per split {best_pipe_5}')
print(f'The design F1 scores before tuning {f1_design_5_before}')
print(f'The test F1 scores before tuning {f1_test_5_before}')
print(f'The design F1 scores after tuning {f1_design_5_after}')
print(f'The test F1 scores afer tuning {f1_test_5_after}')

data_5 = [f1_design_5_after, f1_test_5_after]
sns.boxplot(data=data_5)
plt.title('Boxplot F1 scores design and test pipeline 5')
plt.xlabel('design and test')
plt.ylabel('F1 score score')
plt.show()

# Compute and plot the precision-recall curve of every outer fold,
# based on predicted probabilities rather than hard labels
for fold_number, (y_true_fold, y_score_fold) in enumerate(zip(y_test_5_all, y_score_5_all), start=1):
    precision, recall, thresholds = precision_recall_curve(y_true_fold, y_score_fold)
    avg_precision = average_precision_score(y_true_fold, y_score_fold)

    # Plot the PR curve of this fold
    plt.plot(recall, precision, lw=2, label='PR curve it. %d (AP = %0.2f)' % (fold_number, avg_precision))

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-recall curve for all rows')
plt.legend(loc="lower right", fontsize=8)
plt.show()

## Pipeline 6: PCA + Univariate -> Random forest

In [None]:
# PIPELINE 6: RobustScaler -> VarianceThreshold -> PCA -> univariate (FDR) selection -> Random forest
# Nested cross-validation: every outer fold is split into a design and a test part,
# the random-forest hyperparameters are tuned on the design part with the inner CV,
# and the tuned model is scored on the untouched test part.
RANDOM_STATE = 42  # same seed as the CV splitters, so a rerun reproduces the same numbers

# Define outer and inner cross validation
outer_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=RANDOM_STATE)
inner_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=RANDOM_STATE)

# Per-fold accumulators
f1_design_6_before = []
f1_test_6_before = []
f1_design_6_after = []
f1_test_6_after = []
best_pipe_6 = []
y_test_6_all = []    # true test labels, one list per outer fold
y_pred_6_all = []    # predicted test labels, one array per outer fold
y_score_6_all = []   # predicted positive-class probabilities, one array per outer fold
# Rows are collected in plain lists and turned into DataFrames after the loop:
# DataFrame.append was removed in pandas 2.0.
cv_results_6 = []
hp_rows_6_before = []
hp_rows_6 = []

for design_index, test_index in outer_cv.split(X, y):
    X_design = X.transpose()[design_index]
    X_design = X_design.transpose()
    print(f'Size_X_design {X_design.shape}') # print size of X_design
    y_design = y[design_index]

    X_test = X.transpose()[test_index]
    X_test = X_test.transpose()
    y_test_6 = y[test_index].tolist()
    y_test_6_all.append(y_test_6)
    print(f'Size X_test {X_test.shape}')     # print size of X_test

    # remove outliers
    # NOTE(review): removing_outliers / missing_data are applied to the test fold
    # independently of the design fold - confirm they do not need design-set statistics.
    X_design = removing_outliers(X_design)
    X_test = removing_outliers(X_test)

    # Correct missing data
    X_design = missing_data(X_design)
    X_test = missing_data(X_test)

    # balance the classes, so design set consists of 50% normal and 50% abnormal ECG's
    ros = RandomOverSampler(sampling_strategy='minority', random_state=RANDOM_STATE)
    X_design, y_design = ros.fit_resample(X_design, y_design)

    print(f'shape after balancing {X_design.shape}')

    # Define pipeline 6a (unsupervised preprocessing)
    # It is fitted directly instead of through a hyperparameter search: the pipeline
    # has no predictor, so an F1-scored search cannot rank candidates, and there is
    # only one candidate setting anyway.
    params_6a = {'pca__n_components': 0.5}
    pipeline_6a = Pipeline([
        ('scaler', RobustScaler()),
        ('var_threshold', VarianceThreshold(threshold=0.0)),
        ('pca', PCA()),
    ])
    pipeline_6a.set_params(**params_6a)
    pipeline_6a.fit(X_design, y_design)
    X_design = pipeline_6a.transform(X_design)
    X_test = pipeline_6a.transform(X_test)

    # Store the preprocessing setting used in this outer-CV loop
    hp_rows_6_before.append(params_6a)
    print(f'shape of X_design after pca {X_design.shape}')

    # univariate feature selection: ANOVA F-test p-values, Benjamini-Hochberg FDR at 5%
    sel_kb = SelectKBest(f_classif, k='all')
    sel_kb.fit(X_design, y_design)
    p_values = sel_kb.pvalues_

    reject_fdr, pvals_fdr, _, _ = multipletests(pvals=p_values, alpha=0.05, method='fdr_bh')
    features_selected = np.flatnonzero(reject_fdr)
    print(f'size of features selected{features_selected.shape}')
    if features_selected.size == 0:
        # A classifier cannot be fitted on zero features; keep every component for this fold
        print('no feature passed the FDR threshold, keeping all PCA components')
        features_selected = np.arange(X_design.shape[1])
    X_design = X_design[:, features_selected]
    X_test = X_test[:, features_selected]

    print(f'shape of X_design after univariate: {X_design.shape}')

    # pipeline 6b
    pipeline_6b = Pipeline([
        ('clf', RandomForestClassifier(random_state=RANDOM_STATE))
    ])

    # Define scores BEFORE hyperparameter tuning (default random-forest settings)
    pipeline_6b.fit(X_design, y_design)

    y_pred_design_6 = pipeline_6b.predict(X_design)
    f1_design_6_before.append(f1_score(y_design, y_pred_design_6))

    y_pred_test_6_before = pipeline_6b.predict(X_test)
    f1_test_6_before.append(f1_score(y_test_6, y_pred_test_6_before))

    # Define hyperparameters of pipeline 6
    param_grid_6b = {
        'clf__n_estimators': range(2, 6),
        'clf__criterion': ['gini', 'entropy', 'log_loss'],
        'clf__min_samples_split': range(2, 10),
        'clf__min_samples_leaf': range(1, 10),
        'clf__min_weight_fraction_leaf': np.linspace(0, 0.5, 25),
        'clf__max_features': ['sqrt', 'log2', None],
        'clf__bootstrap': [True, False],
        'clf__warm_start': [True, False],
    }

    # Randomized search with inner cross-validation to tune the random forest
    model_6 = RandomizedSearchCV(pipeline_6b, param_distributions=param_grid_6b, cv=inner_cv,
                                 scoring='f1', n_iter=50, n_jobs=-1, random_state=RANDOM_STATE)
    model_6.fit(X_design, y_design)

    cv_results_6.append(pd.DataFrame(model_6.cv_results_))

    # Define scores AFTER hyperparameter tuning
    y_pred_design_6_after = model_6.predict(X_design)
    f1_design_6_after.append(f1_score(y_design, y_pred_design_6_after))

    y_pred_test_6_after = model_6.predict(X_test)
    y_pred_6_all.append(y_pred_test_6_after)
    # Probability of the second (larger) class label, used for the PR curves below;
    # assumes the positive class is the larger label, as f1_score's default pos_label=1 implies
    y_score_6_all.append(model_6.predict_proba(X_test)[:, 1])

    f1_test_6_after.append(f1_score(y_test_6, y_pred_test_6_after))

    # Store the best set of hyperparameters of this outer-CV loop
    hp_rows_6.append(model_6.best_params_)
    # Stores the optimum model in best_pipe
    best_pipe_6.append(model_6.best_estimator_)

# Combine the per-fold records into DataFrames
results_6 = pd.concat(cv_results_6, ignore_index=True)
best_hp_6_before = pd.DataFrame(hp_rows_6_before)
best_hp_6 = pd.DataFrame(hp_rows_6)

# Save results of inner CV into .csv file
results_6.to_csv('results_6.csv', index=False)

print(f'Mean and std of F1 scores of pipeline 6: {statistics.mean(f1_test_6_after)} +/- {statistics.stdev(f1_test_6_after)}')
print(f'The optimal hyperparameters per split: {best_hp_6}')
print(f'The best pipes per split {best_pipe_6}')
print(f'The design F1 scores before tuning {f1_design_6_before}')
print(f'The test F1 scores before tuning {f1_test_6_before}')
print(f'The design F1 scores after tuning {f1_design_6_after}')
print(f'The test F1 scores afer tuning {f1_test_6_after}')

data_6 = [f1_design_6_after, f1_test_6_after]
sns.boxplot(data=data_6)
plt.title('Boxplot F1 scores design and test pipeline 6')
plt.xlabel('design and test')
plt.ylabel('F1 score score')
plt.show()

# Compute and plot the precision-recall curve of every outer fold,
# based on predicted probabilities rather than hard labels
for fold_number, (y_true_fold, y_score_fold) in enumerate(zip(y_test_6_all, y_score_6_all), start=1):
    precision, recall, thresholds = precision_recall_curve(y_true_fold, y_score_fold)
    avg_precision = average_precision_score(y_true_fold, y_score_fold)

    # Plot the PR curve of this fold
    plt.plot(recall, precision, lw=2, label='PR curve it. %d (AP = %0.2f)' % (fold_number, avg_precision))

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-recall curve for all rows')
plt.legend(loc="lower right", fontsize=8)
plt.show()

## Pipeline 7: LASSO -> Random forest

In [None]:
# # Define outer and inner cross validation
# outer_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42) 
# inner_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

# # Creating empty arrays
# f1_design_7_before = []
# f1_test_7_before = []
# f1_design_7_after = []
# f1_test_7_after = []
# results_7 = pd.DataFrame()
# best_pipe_7 = []
# best_hp_7_before = pd.DataFrame()
# best_hp_7 = pd.DataFrame()
# y_test_7_all = []
# y_pred_7_all = []

# for design_index, test_index in outer_cv.split(X, y): 
#     X_design = X.transpose()[design_index]
#     X_design = X_design.transpose()
#     print(f'Size_X_design {X_design.shape}') # print size of X_design
#     y_design = y[design_index]
    
#     X_test = X.transpose()[test_index]
#     X_test = X_test.transpose()
#     y_test_7 = y[test_index].tolist()
#     y_test_7_all.append(y_test_7)
#     print(f'Size X_test {X_test.shape}')     # print size of X_test

#     # remove outliers
#     X_design = removing_outliers(X_design)
#     X_test = removing_outliers(X_test)

#     # Correct missing data
#     X_design = missing_data(X_design)
#     X_test = missing_data(X_test)

#     # balance the classes, so design set consists of 50% normal and 50% abnormal ECG's
#     ros = RandomOverSampler(sampling_strategy='minority')
#     X_resampled, y_resampled = ros.fit_resample(X_design, y_design)
#     X_design = X_resampled
#     y_design = y_resampled  
#     print(f'shape after balancing {X_design.shape}')  

#     # Scaling the data
#     scaler = RobustScaler()
#     X_design = scaler.fit_transform(X_design)
#     X_test = scaler.transform(X_test)

#     # Define pipeline 7
#     pipeline_7a = Pipeline([
#         ('lasso', Lasso()),
#     ])
#     # Define hyperparameters of pipeline 7a
#     param_grid_7a = {
#     'lasso__alpha': np.logspace(-10, 1, 100),
#     }

#     # Perform randomized search with inner cross-validation to find best alpha
#     rand_search_7a = RandomizedSearchCV(pipeline_7a, param_distributions=param_grid_7a, n_iter =50, cv=inner_cv, scoring='f1',n_jobs=-1) # optimize parameters
#     rand_search_7a.fit(X_design, y_design)
    
#     # Create a new Lasso model using the best alpha value
#     lasso = Lasso(alpha=rand_search_7a.best_params_['lasso__alpha'])
#     lasso.fit(X_design, y_design)

#     # Get the coefficients of the Lasso model, find them and define the new X_design with less features
#     coef = lasso.coef_
#     selected_features = np.where(coef != 0)[0]
#     X_design = X_design[:, selected_features]
#     X_test = X_test[:, selected_features]
#     print(f'This is the size of X_design after LASSO: {X_design.shape}')

#     # Access and store the best set of hyperparameters of each outer-CV loop in a DataFrame
#     best_hp_7_before = best_hp_7_before.append(rand_search_7a.best_params_,ignore_index=True)
#     print(f'This is the size of X_design after LASSO: {X_design.shape}')

#     # Define pipeline 7b
#     pipeline_7b = Pipeline([
#         ('clf', RandomForestClassifier())])
    
#     # Define scores BEFORE hyperparameter tuning
#     pipeline_7b.fit(X_design, y_design)
 
#     y_pred_design_7 = pipeline_7b.predict(X_design)
#     f1_design_7_bef = f1_score(y_design, y_pred_design_7)
#     f1_design_7_before.append(f1_design_7_bef)

#     y_pred_test_7_before = pipeline_7b.predict(X_test)
#     f1_test_7_bef = f1_score(y_test_7, y_pred_test_7_before)
#     f1_test_7_before.append(f1_test_7_bef)

#     # Define hyperparameters of pipeline 7b
#     param_grid_7b = {'clf__n_estimators' : range(2,6),
#                                'clf__criterion' :['gini','entropy','log_loss'],
#                                'clf__min_samples_split':range(2,10),
#                                'clf__min_samples_leaf':range(1,10),
#                                'clf__min_weight_fraction_leaf' : np.linspace(0, 0.5, 25),
#                                'clf__max_features':['sqrt','log2',None],
#                                'clf__bootstrap':[True,False],
#                                'clf__warm_start':[True,False],
#                                }

#     # Perform grid search with inner cross-validation, part 1
#     model_7 = RandomizedSearchCV(pipeline_7b, param_distributions=param_grid_7b, n_iter=50, cv=inner_cv, scoring='f1', n_jobs=-1) # optimize parameters
#     model_7.fit(X_design, y_design)

#     # Storing results cross-validation
#     results = pd.DataFrame(model_7.cv_results_)
#     results_7 = results_7.append(results,ignore_index=True)

#     # Define scores AFTER hyperparameter tuning 
#     y_pred_design_7_after = model_7.predict(X_design)
#     f1_design_7_aft = f1_score(y_design, y_pred_design_7_after)
#     f1_design_7_after.append(f1_design_7_aft)
   
#     y_pred_test_7_after = model_7.predict(X_test)
#     y_pred_7_all.append(y_pred_test_7_after)
   
#     f1_test_7_aft = f1_score(y_test_7, y_pred_test_7_after)
#     f1_test_7_after.append(f1_test_7_aft)

#     # Access and store the best set of hyperparameters of each outer-CV loop in a DataFrame
#     best_hp_7 = best_hp_7.append(model_7.best_params_,ignore_index=True)
#     # Stores the optimum model in best_pipe
#     best_pipe_7.append(model_7.best_estimator_)

# # Save results of inner CV into .csv file
# results_7.to_csv('results_7.csv', index=False)

# print(f'Mean and std of F1 scores of pipeline 7: {statistics.mean(f1_test_7_after)} +/- {statistics.stdev(f1_test_7_after)}')
# print(f'The optimal hyperparameters per split: {best_hp_7}')
# print(f'The best pipes per split {best_pipe_7}')
# print(f'The design F1 scores before tuning {f1_design_7_before}')
# print(f'The test F1 scores before tuning {f1_test_7_before}')
# print(f'The design F1 scores after tuning {f1_design_7_after}')
# print(f'The test F1 scores afer tuning {f1_test_7_after}')

# data_7 = [f1_design_7_after, f1_test_7_after]
# sns.boxplot(data=data_7)
# plt.title('Boxplot F1 scores design and test pipeline 7')
# plt.xlabel('design and test')
# plt.ylabel('F1 score score')
# plt.show()

# # Loop over rows and compute precision recall curve for each row
# for i in range(len(y_pred_7_all)):
#     precision, recall, thresholds = precision_recall_curve(y_test_7_all[i], y_pred_7_all[i])
#     auc = average_precision_score(y_test_7_all[i], y_pred_7_all[i])

#     # Plot the precision-recall curve for each row
#     plt.plot(recall, precision, lw=2, label='PR curve it. %d (AP = %0.2f)' % (i+1, auc))

# plt.xlim([0.0, 1.0])
# plt.ylim([0.0, 1.05])
# plt.xlabel('Recall')
# plt.ylabel('Precision')
# plt.title('Precision-recall curve for all rows')
# plt.legend(loc="lower right", fontsize=8)
# plt.show()

# Confusion matrices and classification report

In [None]:
# Confusion matrices and classification reports, pooled over all outer-CV folds


def _flatten_folds(folds):
    """Concatenate per-fold label sequences into one flat list of Python scalars.

    Accepts a list of lists, a list of numpy arrays or a 2-D array, and folds of
    unequal length (stacking unequal folds with np.array would fail).
    """
    return [label for fold in folds for label in np.asarray(fold).tolist()]


def _plot_confusion_matrix(y_true, y_pred, title):
    """Plot the confusion matrix of y_true against y_pred as a heatmap and return it."""
    cm = confusion_matrix(y_true, y_pred)
    # fmt='d' prints the integer counts in full instead of the default '.2g' format
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.xlabel('Predicted labels')
    plt.ylabel('True labels')
    plt.title(title)
    plt.show()
    return cm


# Confusion matrices
# Pipeline 1
y_test_1_all_confusion = _flatten_folds(y_test_1_all)
y_pred_1_all_confusion = _flatten_folds(y_pred_1_all)
cm_1 = _plot_confusion_matrix(y_test_1_all_confusion, y_pred_1_all_confusion, 'Confusion matrix Pipeline 1')

# Pipeline 2
y_test_2_all_confusion = _flatten_folds(y_test_2_all)
y_pred_2_all_confusion = _flatten_folds(y_pred_2_all)
cm_2 = _plot_confusion_matrix(y_test_2_all_confusion, y_pred_2_all_confusion, 'Confusion matrix Pipeline 2')

# Pipeline 3
y_test_3_all_confusion = _flatten_folds(y_test_3_all)
y_pred_3_all_confusion = _flatten_folds(y_pred_3_all)
cm_3 = _plot_confusion_matrix(y_test_3_all_confusion, y_pred_3_all_confusion, 'Confusion matrix Pipeline 3')

# Pipeline 4
y_test_4_all_confusion = _flatten_folds(y_test_4_all)
y_pred_4_all_confusion = _flatten_folds(y_pred_4_all)
cm_4 = _plot_confusion_matrix(y_test_4_all_confusion, y_pred_4_all_confusion, 'Confusion matrix Pipeline 4')

# Pipeline 5
y_test_5_all_confusion = _flatten_folds(y_test_5_all)
y_pred_5_all_confusion = _flatten_folds(y_pred_5_all)
cm_5 = _plot_confusion_matrix(y_test_5_all_confusion, y_pred_5_all_confusion, 'Confusion matrix Pipeline 5')

# Pipeline 6
y_test_6_all_confusion = _flatten_folds(y_test_6_all)
y_pred_6_all_confusion = _flatten_folds(y_pred_6_all)
cm_6 = _plot_confusion_matrix(y_test_6_all_confusion, y_pred_6_all_confusion, 'Confusion matrix Pipeline 6')

# # Pipeline 7 (disabled, like the pipeline 7 cell itself)
# y_test_7_all_confusion = _flatten_folds(y_test_7_all)
# y_pred_7_all_confusion = _flatten_folds(y_pred_7_all)
# cm_7 = _plot_confusion_matrix(y_test_7_all_confusion, y_pred_7_all_confusion, 'Confusion matrix Pipeline 7')

# Classification reports
print(classification_report(y_test_1_all_confusion, y_pred_1_all_confusion))
print(classification_report(y_test_2_all_confusion, y_pred_2_all_confusion))
print(classification_report(y_test_3_all_confusion, y_pred_3_all_confusion))
print(classification_report(y_test_4_all_confusion, y_pred_4_all_confusion))
print(classification_report(y_test_5_all_confusion, y_pred_5_all_confusion))
print(classification_report(y_test_6_all_confusion, y_pred_6_all_confusion))
# print(classification_report(y_test_7_all_confusion, y_pred_7_all_confusion))