Imports

In [151]:
# Author: Agustin CARTAYA

# matrix and table handling
import numpy as np
import pandas as pd

# train test separation
from sklearn.model_selection import train_test_split

# scalers
from sklearn import preprocessing

# classifiers
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier    

# features selection
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# cross validation
from sklearn.model_selection import cross_val_score

# metrics
from sklearn.metrics import roc_auc_score, matthews_corrcoef, confusion_matrix

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.width', 200)


# -------- datasets
# Each dataset is identified by a name and a pair of class labels.
# read_data() encodes the first label of a pair as 0 and the second as 1.
# dataset ADCTL
DATASET_ADCTL = "ADCTL"
LABELS_ADCTL = ("CTL", "AD")

# dataset ADMCI
DATASET_ADMCI = "ADMCI"
LABELS_ADMCI = ("MCI", "AD")

# dataset MCICTL
DATASET_MCICTL = "MCICTL"
LABELS_MCICTL = ("CTL", "MCI")

# -------- functions
def get_dataset_class_labels(dataset):
    """Return the pair of class labels that belongs to `dataset`.

    The dataset name is compared against the known dataset constants in
    order; an empty string is returned when none of them matches.
    """
    known_datasets = (
        (DATASET_ADCTL, LABELS_ADCTL),
        (DATASET_ADMCI, LABELS_ADMCI),
        (DATASET_MCICTL, LABELS_MCICTL),
    )
    for name, labels in known_datasets:
        if dataset == name:
            return labels
    # no known dataset matched
    return ""

def read_data(dataset="ADCTL", data_type="train"):
    """Load `data/<dataset><data_type>.csv` into a DataFrame.

    For the "train" file the 'Label' column is encoded numerically: the first
    class label of the dataset becomes 0 and the second becomes 1.  Any other
    `data_type` is returned exactly as read.
    """
    csv_path = "".join(("data/", dataset, data_type, ".csv"))
    frame = pd.read_csv(csv_path)

    labels = get_dataset_class_labels(dataset)

    # only the training file carries a 'Label' column to encode
    if data_type != "train":
        return frame

    encoding = {labels[0]: 0, labels[1]: 1}
    frame['Label'] = frame['Label'].replace(encoding)
    return frame

def get_best_params(dataset):
    """Return the stored best KNN and SVM settings for `dataset`.

    The result is a pair of dicts `(knn, svm)` mapping each key name to the
    recorded value.  Both dicts are empty when `dataset` is not one of the
    known datasets.
    """
    knn_keys = ("train_acc", "test_acc", "train_auc", "test_auc",
                "f_classif", "mutual_info_classif", "pca", "n_neighbors", "p")
    svm_keys = ("train_acc", "test_acc", "train_auc", "test_auc",
                "f_classif", "mutual_info_classif", "pca", "C", "degree", "kernel")

    # one entry per dataset: (name, knn values, svm values); the values follow
    # the order of knn_keys / svm_keys above
    presets = (
        (DATASET_ADCTL,
         (0.96748, 0.926829, 0.99496, 0.964976, 1, 358, 9, 3, 1),
         (1.0, 0.902439, 1.0, 0.983092, 1, 247, 119, 0.1, 1, "poly")),
        (DATASET_ADMCI,
         (0.891473, 0.72093, 0.974687, 0.780303, 6, 43, 19, 2, 1),
         (0.868217, 0.651163, 0.943587, 0.733766, 53, 50, 50, 1.0, 1, "poly")),
        (DATASET_MCICTL,
         (0.968992, 0.906977, 0.998074, 0.96087, 1, 172, 61, 3, 1),
         (0.953488, 0.883721, 0.98676, 0.956522, 1, 172, 51, 0.1, 1, "poly")),
    )

    knn_values, svm_values = (), ()
    for name, knn_row, svm_row in presets:
        if dataset == name:
            knn_values, svm_values = knn_row, svm_row
            break

    return dict(zip(knn_keys, knn_values)), dict(zip(svm_keys, svm_values))

def get_freatures_labels_training(data):
    """Split a training table into features and labels.

    Returns `(features, labels)`: `features` is `data` without the 'Label'
    and 'ID' columns, `labels` is a one-column DataFrame named 'Label'.
    """
    features = data.drop(columns=['Label', 'ID'])
    labels = pd.DataFrame(data['Label'], columns=['Label'])
    return features, labels

def split_trainin(x_tr, y_tr, test_size=0.25, random_state=42, shuffle=False):
    """Split features and labels with sklearn's `train_test_split`.

    The arguments are forwarded unchanged and the result of
    `train_test_split` is returned as is.
    """
    split_options = {
        "test_size": test_size,
        "shuffle": shuffle,
        "random_state": random_state,
    }
    return train_test_split(x_tr, y_tr, **split_options)

def normalize_training_test(x_train, x_test):
    """Min-max scale both sets using statistics of the training set only.

    A `MinMaxScaler` is fitted on `x_train` and then applied to `x_train` and
    `x_test`; the two scaled results are returned in that order.
    """
    fitted_scaler = preprocessing.MinMaxScaler().fit(x_train)
    train_scaled = fitted_scaler.transform(x_train)
    test_scaled = fitted_scaler.transform(x_test)
    return train_scaled, test_scaled

def feature_selection(x_train, x_test, y_train, i_f_classif, i_mutual_info_classif, features_index=False):
    """Keep the union of the features chosen by two `SelectKBest` rankings.

    The first ranking keeps the `i_f_classif` best features by `f_classif`,
    the second keeps the `i_mutual_info_classif` best features by
    `mutual_info_classif`; both are fitted on the training data only.

    Returns `(x_train_selected, x_test_selected)`, followed by the list of
    kept column indices when `features_index` is true.
    """
    def best_columns(score_func, k):
        # column indices of the k features ranked best by score_func
        ranker = SelectKBest(score_func=score_func, k=k)
        ranker.fit(x_train, y_train)
        return ranker.get_support(indices=True)

    by_f_classif = best_columns(f_classif, i_f_classif)
    by_mutual_info = best_columns(mutual_info_classif, i_mutual_info_classif)

    # union of the two selections
    kept = list(set(by_f_classif) | set(by_mutual_info))

    reduced = (x_train[:, kept], x_test[:, kept])
    if features_index:
        return reduced + (kept,)
    return reduced

def apply_PCA(x_train, x_test, pca_components):
    """Project both sets with a PCA fitted on the training set.

    `pca_components` is passed to `PCA` as its first argument.  Returns the
    transformed training set and the transformed test set, in that order.
    """
    reducer = PCA(pca_components)
    train_projected = reducer.fit_transform(x_train)
    test_projected = reducer.transform(x_test)
    return train_projected, test_projected
    
def apply_lda(x_train, x_test, y_train):
    """Project both sets onto one LDA component fitted on the training set.

    Returns the transformed training set and the transformed test set, in
    that order.
    """
    projector = LinearDiscriminantAnalysis(n_components=1)
    projector.fit(x_train, y_train)
    train_projected = projector.transform(x_train)
    test_projected = projector.transform(x_test)
    return train_projected, test_projected

def compute_scores(y, y_pred, y_pred_probability):
    """Compute binary classification scores.

    Parameters:
        y: true labels.
        y_pred: predicted class labels.
        y_pred_probability: scores passed to `roc_auc_score` for the AUC.

    Returns:
        dict with the confusion-matrix counts ("tn", "fp", "fn", "tp") and
        the scores "acc", "spec", "sens", "prec", "f1", "ba", "auc", "mcc".
    """
    # confusion matrix, unpacked as tn, fp, fn, tp
    tn, fp, fn, tp = confusion_matrix(y, y_pred).ravel()
    # accuracy
    acc = (tp + tn) / (tn + fp + fn + tp)
    # specificity
    spec = tn / (tn + fp)
    # sensitivity (recall)
    sens = tp / (tp + fn)
    # precision
    prec = tp / (tp + fp)
    # f1
    f1 = (2 * prec * sens) / (prec + sens)
    # balanced accuracy
    ba = (spec + sens) / 2
    # Matthews correlation coefficient
    mcc = matthews_corrcoef(y, y_pred)
    # area under the ROC curve
    auc = roc_auc_score(y, y_pred_probability)

    # the "tp" entry previously held fn by mistake
    return {"tn": tn, "fp": fp, "fn": fn, "tp": tp, "acc": acc, "spec": spec, "sens": sens, "prec": prec, "f1": f1, "ba": ba, "auc": auc, "mcc": mcc}

def combine_prediction(probabilities, weights):
    """Blend several probability vectors with a weighted average.

    `probabilities` holds one sequence of probabilities per classifier and
    `weights` one weight per classifier; the weights are normalized to sum
    to 1.  Returns `(labels, blended)`, both column arrays with one row per
    sample: `blended` is the weighted average and `labels` is 1 where the
    average is strictly greater than 0.5, otherwise 0.
    """
    total = sum(weights)
    weight_column = np.array([w / total for w in weights]).reshape(-1, 1)

    # one row per sample, one column per classifier
    per_sample = np.array(probabilities).T
    blended = np.dot(per_sample, weight_column)
    labels = (blended > 0.5).astype(int)

    return labels, blended

def create_csv_output(dataset, ids, predicted_classes, predicted_probabilities):
    """Write the predictions for `dataset` to its result CSV file.

    The file contains the sample id, the predicted class and one probability
    column per class label: `predicted_probabilities` is written under the
    second label of the dataset and its complement under the first one.
    """
    complement = 1 - predicted_probabilities
    labels = get_dataset_class_labels(dataset)

    table = {}
    table['id'] = ids.flatten()
    table['class'] = predicted_classes.flatten()
    table['predicted_probability_' + labels[0]] = complement.flatten()
    table['predicted_probability_' + labels[1]] = predicted_probabilities.flatten()

    output_name = '0075721_CartayaLathulerie_' + dataset + 'res.csv'
    pd.DataFrame(table).to_csv(output_name, index=False)

def create_csv_selected_features(dataset, features_selected):
    """Write the selected feature indices of `dataset` to its feature CSV file."""
    output_name = '0075721_CartayaLathulerie_' + dataset + 'feat.csv'
    table = pd.DataFrame(features_selected, columns=['features_selected'])
    table.to_csv(output_name, index=False)


In [152]:
def test_approach():
    """Evaluate the KNN + SVM ensemble on a 75/25 split of each training set.

    For every dataset the training file is split (shuffled, random_state=42),
    min-max normalized, and sent through feature selection, PCA and LDA once
    per classifier using the stored best parameters.  The probabilities of
    the two classifiers are combined with weights equal to their stored test
    AUC.  The scores on both parts of the split are printed as two tables.
    """
    # DATASET_ADCTL DATASET_ADMCI DATASET_MCICTL
    columns = ["DATASET", "Acc", "Sens", "Spec", "Prec", "F1", "AUC", "MCC", "BA"]
    # keys of compute_scores() in the order of the score columns above
    score_keys = ["acc", "sens", "spec", "prec", "f1", "auc", "mcc", "ba"]
    # Rows are collected in lists and the tables are built once at the end;
    # DataFrame.append was removed in pandas 2.0.
    train_rows = []
    test_rows = []

    for dataset in [DATASET_ADCTL, DATASET_ADMCI, DATASET_MCICTL]:

        # select best parameters
        params_knn, params_svm = get_best_params(dataset)
        # read training data
        data = read_data(dataset=dataset, data_type="train")
        # obtaining labels and features from training data
        x_tr, y_tr = get_freatures_labels_training(data)
        # split into train and test
        x_train, x_test, y_train, y_test = train_test_split(x_tr, y_tr, test_size=0.25, shuffle=True, random_state=42)
        # normalize data
        x_train_normalized, x_test_normalized = normalize_training_test(x_train, x_test)

        # ---- KNN
        # feature selection
        knn_x_train_selected, knn_x_test_selected = feature_selection(x_train_normalized, x_test_normalized, y_train, params_knn["f_classif"], params_knn["mutual_info_classif"])
        # obtaining the first n principal components
        knn_x_train_pca, knn_x_test_pca = apply_PCA(knn_x_train_selected, knn_x_test_selected, params_knn["pca"])
        # apply LDA
        knn_x_train_lda, knn_x_test_lda = apply_lda(knn_x_train_pca, knn_x_test_pca, y_train)
        # create and train the classifier
        knn_classifier = KNeighborsClassifier(n_neighbors=int(params_knn["n_neighbors"]), p=params_knn["p"])
        knn_classifier.fit(knn_x_train_lda, y_train)
        knn_y_predcted_probability_test = knn_classifier.predict_proba(knn_x_test_lda)[:, 1]
        knn_y_predcted_probability_train = knn_classifier.predict_proba(knn_x_train_lda)[:, 1]

        # ---- SVM
        # feature selection
        svm_x_train_selected, svm_x_test_selected = feature_selection(x_train_normalized, x_test_normalized, y_train, params_svm["f_classif"], params_svm["mutual_info_classif"])
        # obtaining the first n principal components
        svm_x_train_pca, svm_x_test_pca = apply_PCA(svm_x_train_selected, svm_x_test_selected, params_svm["pca"])
        # apply LDA
        svm_x_train_lda, svm_x_test_lda = apply_lda(svm_x_train_pca, svm_x_test_pca, y_train)
        # create and train the classifier
        svm_classifier = SVC(C=params_svm["C"], degree=params_svm["degree"], kernel=params_svm["kernel"], probability=True)
        svm_classifier.fit(svm_x_train_lda, y_train)
        svm_y_predcted_probability_test = svm_classifier.predict_proba(svm_x_test_lda)[:, 1]
        svm_y_predcted_probability_train = svm_classifier.predict_proba(svm_x_train_lda)[:, 1]

        # each classifier is weighted by its stored test AUC
        weights = [params_knn["test_auc"], params_svm["test_auc"]]
        y_pred_test, y_pred_probability_test = combine_prediction([knn_y_predcted_probability_test, svm_y_predcted_probability_test], weights)
        y_pred_train, y_pred_probability_train = combine_prediction([knn_y_predcted_probability_train, svm_y_predcted_probability_train], weights)

        # computing scores
        s_ts = compute_scores(y_test, y_pred_test, y_pred_probability_test)
        s_tr = compute_scores(y_train, y_pred_train, y_pred_probability_train)

        test_rows.append([dataset] + [s_ts[key] for key in score_keys])
        train_rows.append([dataset] + [s_tr[key] for key in score_keys])

    train_table = pd.DataFrame(train_rows, columns=columns)
    test_table = pd.DataFrame(test_rows, columns=columns)

    print("\nPerformance on the 75% of the training datasets (data used for training the model):")
    print(train_table.head())

    print("\nPerformance on the 25% of the training dataset (data not used for training the models):")
    print(test_table.head())
    


In [153]:
def test_approach_with_cross_validation(k=5, choose="MEAN"):
    """Evaluate the KNN + SVM ensemble with k-fold cross validation.

    Parameters:
        k: number of folds the shuffled training data is divided into.
        choose: "MAX" reports the scores of the fold with the highest AUC;
            any other value reports the mean of each score over the folds.

    For every dataset one row of scores is collected and the resulting table
    is printed.  NaN scores of a fold are replaced by 0 before aggregation.
    """
    columns = ["DATASET", "Acc", "Sens", "Spec", "Prec", "F1", "AUC", "MCC", "BA"]
    # Rows are collected in a list and the table is built once at the end;
    # DataFrame.append was removed in pandas 2.0.
    table_rows = []

    for dataset in [DATASET_ADCTL, DATASET_ADMCI, DATASET_MCICTL]:

        # read training data
        data = read_data(dataset=dataset, data_type="train")
        # shuffle data
        data = data.sample(frac=1, random_state=42)
        # obtaining labels and features from training data
        x_tr, y_tr = get_freatures_labels_training(data)
        # select best parameters
        params_knn, params_svm = get_best_params(dataset)
        # divide folds
        x_folds = np.array_split(x_tr, k)
        y_folds = np.array_split(y_tr, k)
        scores = []

        # cross validation
        for i in range(k):
            # fold i is the test part, all remaining folds form the train part
            x_train = np.concatenate(x_folds[:i] + x_folds[i+1:])
            y_train = np.concatenate(y_folds[:i] + y_folds[i+1:])
            x_test = x_folds[i]
            y_test = y_folds[i]

            # normalize data
            x_train_normalized, x_test_normalized = normalize_training_test(x_train, x_test)

            # ---- KNN
            # feature selection
            knn_x_train_selected, knn_x_test_selected = feature_selection(x_train_normalized, x_test_normalized, y_train, params_knn["f_classif"], params_knn["mutual_info_classif"])
            # obtaining the first n principal components
            knn_x_train_pca, knn_x_test_pca = apply_PCA(knn_x_train_selected, knn_x_test_selected, params_knn["pca"])
            # apply LDA
            knn_x_train_lda, knn_x_test_lda = apply_lda(knn_x_train_pca, knn_x_test_pca, y_train)
            # create and train the classifier
            knn_classifier = KNeighborsClassifier(n_neighbors=int(params_knn["n_neighbors"]), p=params_knn["p"])
            knn_classifier.fit(knn_x_train_lda, y_train)
            knn_y_predcted_probability = knn_classifier.predict_proba(knn_x_test_lda)[:, 1]

            # ---- SVM
            # feature selection
            svm_x_train_selected, svm_x_test_selected = feature_selection(x_train_normalized, x_test_normalized, y_train, params_svm["f_classif"], params_svm["mutual_info_classif"])
            # obtaining the first n principal components
            svm_x_train_pca, svm_x_test_pca = apply_PCA(svm_x_train_selected, svm_x_test_selected, params_svm["pca"])
            # apply LDA
            svm_x_train_lda, svm_x_test_lda = apply_lda(svm_x_train_pca, svm_x_test_pca, y_train)
            # create and train the classifier
            svm_classifier = SVC(C=params_svm["C"], degree=params_svm["degree"], kernel=params_svm["kernel"], probability=True)
            svm_classifier.fit(svm_x_train_lda, y_train)
            svm_y_predcted_probability = svm_classifier.predict_proba(svm_x_test_lda)[:, 1]

            # each classifier is weighted by its stored test AUC
            weights = [params_knn["test_auc"], params_svm["test_auc"]]
            y_pred_test, y_pred_probability_test = combine_prediction([knn_y_predcted_probability, svm_y_predcted_probability], weights)

            # computing scores
            scd = compute_scores(y_test, y_pred_test, y_pred_probability_test)
            scores.append((scd["acc"], scd["sens"], scd["spec"], scd["prec"], scd["f1"], scd["auc"], scd["mcc"], scd["ba"]))

        # aggregate the fold scores (NaN entries count as 0)
        scores = np.nan_to_num(np.array(scores))
        if choose == "MAX":
            # column 5 holds the AUC
            scores_choosen = scores[scores[:, 5].argmax(), :]
        else:
            scores_choosen = np.mean(scores, axis=0).tolist()

        table_rows.append([dataset, *scores_choosen])

    cross_validation_table = pd.DataFrame(table_rows, columns=columns)

    # report the number of folds actually used instead of a fixed "5"
    print(choose, "\nPerformance of the classifiers using {}-fold cross validation:".format(k))
    print(cross_validation_table.head())

In [154]:
def predict_test_dataset():
    """Train on the full training set of each dataset and predict its test set.

    For every dataset both classifiers are trained on the whole training
    file using the stored best parameters, their probabilities are combined
    with weights equal to their stored test AUC, and two CSV files are
    written: the predictions for the test file and the union of the feature
    indices selected for the two classifiers.  The scores obtained on the
    training data are printed as a table.
    """
    columns = ["DATASET", "Acc", "Sens", "Spec", "Prec", "F1", "AUC", "MCC", "BA"]
    # keys of compute_scores() in the order of the score columns above
    score_keys = ["acc", "sens", "spec", "prec", "f1", "auc", "mcc", "ba"]
    # Rows are collected in a list and the table is built once at the end;
    # DataFrame.append was removed in pandas 2.0.
    train_rows = []

    for dataset in [DATASET_ADCTL, DATASET_ADMCI, DATASET_MCICTL]:
        # select best parameters
        params_knn, params_svm = get_best_params(dataset)

        # read train dataset
        data_tr = read_data(dataset=dataset, data_type="train")

        # read test dataset
        data_ts = read_data(dataset=dataset, data_type="test")

        # obtaining labels and features from training data
        x_train, y_train = get_freatures_labels_training(data_tr)

        # remove and save id from test data
        x_test = data_ts.drop(['ID'], axis=1)
        id_test = np.array([data_ts.ID]).T

        # normalize data
        x_train_normalized, x_test_normalized = normalize_training_test(x_train, x_test)

        # ---- KNN
        # feature selection
        knn_x_train_selected, knn_x_test_selected, knn_features_selected = feature_selection(x_train_normalized, x_test_normalized, y_train, params_knn["f_classif"], params_knn["mutual_info_classif"], features_index=True)
        # obtaining the first n principal components
        knn_x_train_pca, knn_x_test_pca = apply_PCA(knn_x_train_selected, knn_x_test_selected, params_knn["pca"])
        # apply LDA
        knn_x_train_lda, knn_x_test_lda = apply_lda(knn_x_train_pca, knn_x_test_pca, y_train)
        # create and train the classifier
        knn_classifier = KNeighborsClassifier(n_neighbors=int(params_knn["n_neighbors"]), p=params_knn["p"])
        knn_classifier.fit(knn_x_train_lda, y_train)
        knn_y_predcted_probability = knn_classifier.predict_proba(knn_x_test_lda)[:, 1]
        knn_y_predcted_probability_train = knn_classifier.predict_proba(knn_x_train_lda)[:, 1]

        # ---- SVM
        # feature selection
        svm_x_train_selected, svm_x_test_selected, svm_features_selected = feature_selection(x_train_normalized, x_test_normalized, y_train, params_svm["f_classif"], params_svm["mutual_info_classif"], features_index=True)
        # obtaining the first n principal components
        svm_x_train_pca, svm_x_test_pca = apply_PCA(svm_x_train_selected, svm_x_test_selected, params_svm["pca"])
        # apply LDA
        svm_x_train_lda, svm_x_test_lda = apply_lda(svm_x_train_pca, svm_x_test_pca, y_train)
        # create and train the classifier
        svm_classifier = SVC(C=params_svm["C"], degree=params_svm["degree"], kernel=params_svm["kernel"], probability=True)
        svm_classifier.fit(svm_x_train_lda, y_train)
        svm_y_predcted_probability = svm_classifier.predict_proba(svm_x_test_lda)[:, 1]
        svm_y_predcted_probability_train = svm_classifier.predict_proba(svm_x_train_lda)[:, 1]

        # each classifier is weighted by its stored test AUC
        weights = [params_knn["test_auc"], params_svm["test_auc"]]
        y_pred_test, y_pred_probability_test = combine_prediction([knn_y_predcted_probability, svm_y_predcted_probability], weights)
        y_pred_train, y_pred_probability_train = combine_prediction([knn_y_predcted_probability_train, svm_y_predcted_probability_train], weights)

        # computing training scores
        scd = compute_scores(y_train, y_pred_train, y_pred_probability_train)
        train_rows.append([dataset] + [scd[key] for key in score_keys])

        # creating csv files
        create_csv_output(dataset, id_test, y_pred_test, y_pred_probability_test)
        # combine the features selected for the two classifiers
        features_selected = list(set(svm_features_selected) | set(knn_features_selected))
        create_csv_selected_features(dataset, features_selected)

    train_table = pd.DataFrame(train_rows, columns=columns)

    print("\nPerformance on the 100% of the training datasets (data used for training the model):")
    print(train_table.head())

In [155]:
# Run the three stages in order: 5-fold cross validation, the 75/25 hold-out
# evaluation, and the final training + prediction that writes the CSV files.
test_approach_with_cross_validation(k=5, choose="MEAN")
test_approach()
predict_test_dataset()

MEAN 
Performance of the classifiers using 5-fold cross validation:
  DATASET       Acc      Sens      Spec      Prec        F1       AUC       MCC        BA
0   ADCTL  0.847538  0.865441  0.822745  0.854880  0.855395  0.924329  0.700672  0.844093
1   ADMCI  0.686555  0.659649  0.721287  0.689615  0.667559  0.719654  0.383355  0.690468
2  MCICTL  0.790252  0.814524  0.763636  0.780658  0.793094  0.874777  0.578215  0.789080

Performance on the 75% of the training datasets (data used for training the model):
  DATASET       Acc      Sens      Spec      Prec        F1      AUC       MCC        BA
0   ADCTL  1.000000  1.000000  1.000000  1.000000  1.000000  1.00000  1.000000  1.000000
1   ADMCI  0.914729  0.918033  0.911765  0.903226  0.910569  0.98216  0.829198  0.914899
2  MCICTL  0.961240  0.970149  0.951613  0.955882  0.962963  0.99687  0.922429  0.960881

Performance on the 25% of the training dataset (data not used for training the models):
  DATASET       Acc      Sens      Spec   

In [58]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline

def find_best_features_and_classifier(dataset):
    """Search feature-selection, PCA and classifier settings for `dataset`.

    The training file is split 75/25 and min-max normalized.  For every
    combination of `f_classif` feature count, `mutual_info_classif` feature
    count and number of PCA components, the data is reduced to one LDA
    component and an SVM and a KNN classifier are tuned with a 5-fold grid
    search on the training part.  Accuracy and AUC on both parts of the split
    are recorded for each combination.

    The recorded results are filtered with dataset-specific AUC thresholds
    and the surviving result with the highest test AUC is printed.

    Returns:
        The best result row as a list (scores followed by the settings that
        produced them), or None when no result passes the thresholds.
    """

    def get_svm_gridsearch():
        # grid search over the SVM hyper-parameters
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        pipe_SVM = Pipeline([('classifier', SVC())])

        parameters_SVM = {
            'classifier__C': [0.1, 1, 10],
            'classifier__kernel': ['poly', 'rbf'],
            'classifier__degree': [1, 2, 3],
        }
        return GridSearchCV(pipe_SVM, parameters_SVM, cv=cv)

    def get_knn_gridsearch():
        # grid search over the KNN hyper-parameters
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        pipe_KNN = Pipeline([('classifier', KNeighborsClassifier(algorithm='ball_tree'))])

        parameters_KNN = {
            'classifier__n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
            'classifier__p': [1, 2],
        }
        return GridSearchCV(pipe_KNN, parameters_KNN, cv=cv)

    def get_classifier_results(classifier, x_train, x_test, y_train, y_test):
        # returns (train accuracy, train AUC, test accuracy, test AUC)
        # train
        train_score = classifier.score(x_train, y_train)
        y_pred_train = classifier.predict_proba(x_train)[:, 1]
        auc_train = roc_auc_score(y_train, y_pred_train)

        # test
        test_score = classifier.score(x_test, y_test)
        y_pred_test = classifier.predict_proba(x_test)[:, 1]
        auc_test = roc_auc_score(y_test, y_pred_test)

        return (train_score, auc_train, test_score, auc_test)

    grid_search_SVM = get_svm_gridsearch()
    grid_search_KNN = get_knn_gridsearch()

    def calc_svm_knn(x_train, x_test, y_train, y_test):
        # tune both classifiers on the training part
        grid_search_SVM.fit(x_train, y_train)
        grid_search_KNN.fit(x_train, y_train)

        # refit with the best parameters; the SVM needs probability=True so
        # that predict_proba is available for the AUC
        classifier_svm = SVC(C=grid_search_SVM.best_params_['classifier__C'], degree=grid_search_SVM.best_params_['classifier__degree'], kernel=grid_search_SVM.best_params_['classifier__kernel'], probability=True)
        classifier_svm.fit(x_train, y_train)

        classifier_knn = KNeighborsClassifier(n_neighbors=grid_search_KNN.best_params_['classifier__n_neighbors'], p=grid_search_KNN.best_params_['classifier__p'])
        classifier_knn.fit(x_train, y_train)

        svm_results = get_classifier_results(classifier_svm, x_train, x_test, y_train, y_test)
        knn_results = get_classifier_results(classifier_knn, x_train, x_test, y_train, y_test)

        return ((svm_results, grid_search_SVM.best_params_), (knn_results, grid_search_KNN.best_params_))

    def prepare_data():
        # read training data
        data = read_data(dataset=dataset, data_type="train")

        # obtaining labels and features from training data
        x_tr, y_tr = get_freatures_labels_training(data)

        # split into train and test
        x_train, x_test, y_train, y_test = train_test_split(x_tr, y_tr, test_size=0.25, random_state=42)

        # normalize data
        x_train_normalized, x_test_normalized = normalize_training_test(x_train, x_test)
        return (x_train_normalized, x_test_normalized, y_train, y_test)

    def init_search():
        x_train_normalized, x_test_normalized, y_train, y_test = prepare_data()
        # raw result rows; kept under names distinct from the final tables so
        # that each table is built from its own rows
        rows_svm = []
        rows_knn = []
        for i_f_classif in range(1, x_train_normalized.shape[1], 5):

            # features selected by f_classif
            selector = SelectKBest(score_func=f_classif, k=i_f_classif)
            selector.fit(x_train_normalized, y_train)
            features_selected_f_classif = selector.get_support(indices=True)

            for i_mutual_info_classif in range(1, x_train_normalized.shape[1], 5):

                # features selected by mutual_info_classif
                selector = SelectKBest(score_func=mutual_info_classif, k=i_mutual_info_classif)
                selector.fit(x_train_normalized, y_train)
                features_selected_mutual_info_classif = selector.get_support(indices=True)

                # union of the two selections
                conjunto = list(set(features_selected_f_classif) | set(features_selected_mutual_info_classif))
                x_train_extracted_features = x_train_normalized[:, conjunto]
                x_test_extracted_features = x_test_normalized[:, conjunto]

                for i_pca in range(1, min(x_train_extracted_features.shape), 3):

                    # PCA
                    pca = PCA(i_pca)
                    x_train_pca = pca.fit_transform(x_train_extracted_features)
                    x_test_pca = pca.transform(x_test_extracted_features)

                    # LDA
                    lda = LinearDiscriminantAnalysis(n_components=1)
                    x_train_lda = lda.fit_transform(x_train_pca, y_train)
                    x_test_lda = lda.transform(x_test_pca)

                    single_res_svm, single_res_knn = calc_svm_knn(x_train_lda, x_test_lda, y_train, y_test)
                    rows_svm.append((*single_res_svm[0], i_f_classif, i_mutual_info_classif, i_pca, single_res_svm[1]['classifier__C'], single_res_svm[1]['classifier__degree'], single_res_svm[1]['classifier__kernel']))
                    rows_knn.append((*single_res_knn[0], i_f_classif, i_mutual_info_classif, i_pca, single_res_knn[1]['classifier__n_neighbors'], single_res_knn[1]['classifier__p']))

                    print(i_f_classif, " ", i_mutual_info_classif, " ", i_pca,
                          " tr svm acc: ", single_res_svm[0][0], " tr svm AUC: ", single_res_svm[0][1],
                          " ts svm acc: ", single_res_svm[0][2], " ts svm AUC: ", single_res_svm[0][3],
                          " tr knn acc: ", single_res_knn[0][0], " tr knn AUC: ", single_res_knn[0][1],
                          " ts knn acc: ", single_res_knn[0][2], " ts knn AUC: ", single_res_knn[0][3])

        res_svm = pd.DataFrame(rows_svm,
                               columns=["train_acc", "train_auc", "test_acc", "test_auc", "f_classif", "mutual_info_classif", "pca", "c", "degree", "kernel"])

        # built from the KNN rows (it was previously built from the SVM table,
        # which discarded every KNN result)
        res_knn = pd.DataFrame(rows_knn,
                               columns=["train_acc", "train_auc", "test_acc", "test_auc", "f_classif", "mutual_info_classif", "pca", "n_neighbors", "p"])

        return (res_svm, res_knn)

    def select_best():
        res_svm, res_knn = init_search()

        res_svm = res_svm.sort_values('train_auc', ascending=False)
        res_knn = res_knn.sort_values('train_auc', ascending=False)

        # keep only the results above the dataset-specific AUC thresholds
        ## DATASET_ADCTL
        if dataset == DATASET_ADCTL:
            res_svm = res_svm[(res_svm['train_auc'] > 0.969) & (res_svm['test_auc'] > 0.888)]
            res_knn = res_knn[(res_knn['train_auc'] > 0.969) & (res_knn['test_auc'] > 0.888)]

        ## DATASET_ADMCI
        elif dataset == DATASET_ADMCI:
            res_svm = res_svm[(res_svm['train_auc'] > 0.94) & (res_svm['test_auc'] > 0.73)]
            res_knn = res_knn[(res_knn['train_auc'] > 0.97) & (res_knn['test_auc'] > 0.761)]

        ## DATASET_MCICTL
        elif dataset == DATASET_MCICTL:
            res_svm = res_svm[(res_svm['train_auc'] > 0.884) & (res_svm['test_auc'] > 0.801)]
            res_knn = res_knn[(res_knn['train_auc'] > 0.884) & (res_knn['test_auc'] > 0.801)]

        res_svm = res_svm.sort_values('test_auc', ascending=False)
        res_knn = res_knn.sort_values('test_auc', ascending=False)

        if res_svm.empty:
            print("NO SVM RESULTS")

        if res_knn.empty:
            print("NO KNN RESULTS")

        if res_svm.empty and res_knn.empty:
            print("NO RESULT WAS FOUND")
            return None

        # a classifier without surviving results cannot be selected; indexing
        # its empty table would raise an IndexError
        best_knn = None if res_knn.empty else res_knn.iloc[0]
        best_svm = None if res_svm.empty else res_svm.iloc[0]

        # KNN is chosen only when its test AUC is strictly higher
        knn_is_best = best_svm is None or (
            best_knn is not None and best_knn["test_auc"] > best_svm["test_auc"])

        if knn_is_best:
            best_result_knn = best_knn.tolist()
            print("KNN:")
            # header taken from the table so it matches the order of the values
            print("   ".join(res_knn.columns))
            print(best_result_knn)
            return best_result_knn

        best_result_svm = best_svm.tolist()
        print("SVM:")
        print("   ".join(res_svm.columns))
        print(best_result_svm)
        return best_result_svm

    # hand the selected result back to the caller instead of discarding it
    return select_best()


# DATASET_ADCTL | DATASET_ADMCI | DATASET_MCICTL
# find_best_features_and_classifier(DATASET_ADCTL)