# Import dependencies

In [1]:
import numpy as np
import pickle
import pandas as pd
import joblib

# Evaluate nested cross validation

In [22]:
# Maps each supported model key to (artifact sub-directory, classifier file prefix).
_MODEL_ARTIFACTS = {
    "RF": ("random-forest", "RF_V_"),
    "GB": ("gradient-boosting", "HistGB_V_"),
    "LR": ("logistic-regression", "LR_V_"),
    "SVM": ("SVM", "SVM_V_"),
}


def evaluate_nested_cross_validaton(model, input_vector):
    """
    Load the saved classifiers and their train/test splits for every outer
    cross-validation iteration and print the resulting accuracies.

    For each of the 5 iterations ``i`` (0-4) the following files are read from
    ``../../ML-models/nested-cross-validation/<model dir>/<input_vector>/``:
    ``<prefix><i>.joblib``, ``X_train_V_<i>.npy``, ``X_test_V_<i>.npy``,
    ``y_train_V_<i>.npy`` and ``y_test_V_<i>.npy``.

    Arguments:
    model -- which classifier to use ("RF", "GB", "LR", "SVM")
    input_vector -- use high-dimensional ("high_dim") or low-dimensional
                    ("low_dim") input vectors

    Returns:
    0 if ``input_vector`` or ``model`` is not one of the supported values (a
    hint is printed and nothing is loaded); otherwise None, after printing the
    train/test accuracy of every iteration followed by the averages over the
    5 iterations.
    """
    # Argument checks keep the original order: input vector first, then model.
    if input_vector not in ["low_dim", "high_dim"]:
        print("Please select 'low_dim' or 'high_dim' as input vector argument.")
        return 0

    # The isinstance guard keeps unhashable arguments on the "invalid model"
    # path instead of raising TypeError from the dict lookup.
    artifacts = _MODEL_ARTIFACTS.get(model) if isinstance(model, str) else None
    if artifacts is None:
        print("Please select valid model from ['RF', 'GB', 'LR', 'SVM'].")
        return 0

    subdirectory, clf_prefix = artifacts
    path = (
        "../../ML-models/nested-cross-validation/"
        + subdirectory + "/" + input_vector + "/"
    )

    # used 5-fold cross validation
    n_folds = 5
    train_accuracies = []
    test_accuracies = []
    for i in range(n_folds):
        fold = str(i)
        clf = joblib.load(path + clf_prefix + fold + ".joblib")
        X_train = np.load(path + "X_train_V_" + fold + ".npy")
        X_test = np.load(path + "X_test_V_" + fold + ".npy")
        y_train = np.load(path + "y_train_V_" + fold + ".npy")
        y_test = np.load(path + "y_test_V_" + fold + ".npy")

        print("Iteration " + str(i + 1) + ":")
        # Score each split once and reuse the value for printing and averaging.
        train_accuracy = clf.score(X_train, y_train)
        print("Accuracy on train data: ", train_accuracy)
        train_accuracies.append(train_accuracy)
        test_accuracy = clf.score(X_test, y_test)
        print("Accuracy on test data: ", test_accuracy)
        test_accuracies.append(test_accuracy)

    print("-----------Average results over 5 folds-----------")
    print("Avg. accuracy on train data: ", np.mean(train_accuracies))
    print("Avg. accuracy on test data: ", np.mean(test_accuracies))

## Random forest

In [7]:
# Random forest classifiers on the low_dim input vectors.
evaluate_nested_cross_validaton(
    model="RF",
    input_vector="low_dim",
)

Iteration 1:
Accuracy on train data:  0.9948958333333333
Accuracy on test data:  0.9702083333333333
Iteration 2:
Accuracy on train data:  0.99109375
Accuracy on test data:  0.9664583333333333
Iteration 3:
Accuracy on train data:  0.9961979166666667
Accuracy on test data:  0.968125
Iteration 4:
Accuracy on train data:  0.99140625
Accuracy on test data:  0.969375
Iteration 5:
Accuracy on train data:  0.9964583333333333
Accuracy on test data:  0.965625
-----------Average results over 5 folds-----------
Avg. accuracy on train data:  0.9940104166666666
Avg. accuracy on test data:  0.9679583333333334


In [9]:
# Random forest classifiers on the high_dim input vectors.
evaluate_nested_cross_validaton(
    model="RF",
    input_vector="high_dim",
)

Iteration 1:
Accuracy on train data:  0.9993229166666666
Accuracy on test data:  0.931875
Iteration 2:
Accuracy on train data:  0.9980208333333334
Accuracy on test data:  0.928125
Iteration 3:
Accuracy on train data:  0.9997395833333333
Accuracy on test data:  0.9375
Iteration 4:
Accuracy on train data:  0.9977604166666667
Accuracy on test data:  0.9260416666666667
Iteration 5:
Accuracy on train data:  0.9994270833333333
Accuracy on test data:  0.931875
-----------Average results over 5 folds-----------
Avg. accuracy on train data:  0.9988541666666666
Avg. accuracy on test data:  0.9310833333333333


## Gradient boosting

In [15]:
# Gradient boosting classifiers on the low_dim input vectors.
evaluate_nested_cross_validaton(
    model="GB",
    input_vector="low_dim",
)

Iteration 1:
Accuracy on train data:  0.9977604166666667
Accuracy on test data:  0.9754166666666667
Iteration 2:
Accuracy on train data:  0.99765625
Accuracy on test data:  0.9752083333333333
Iteration 3:
Accuracy on train data:  0.9976041666666666
Accuracy on test data:  0.974375
Iteration 4:
Accuracy on train data:  0.9984375
Accuracy on test data:  0.9727083333333333
Iteration 5:
Accuracy on train data:  0.99734375
Accuracy on test data:  0.973125
-----------Average results over 5 folds-----------
Avg. accuracy on train data:  0.9977604166666667
Avg. accuracy on test data:  0.9741666666666667


In [16]:
# Gradient boosting classifiers on the high_dim input vectors.
evaluate_nested_cross_validaton(
    model="GB",
    input_vector="high_dim",
)

Iteration 1:
Accuracy on train data:  0.9974479166666667
Accuracy on test data:  0.9777083333333333
Iteration 2:
Accuracy on train data:  0.9972395833333333
Accuracy on test data:  0.9691666666666666
Iteration 3:
Accuracy on train data:  0.9969791666666666
Accuracy on test data:  0.970625
Iteration 4:
Accuracy on train data:  0.9971354166666667
Accuracy on test data:  0.96875
Iteration 5:
Accuracy on train data:  0.9973958333333334
Accuracy on test data:  0.965625
-----------Average results over 5 folds-----------
Avg. accuracy on train data:  0.9972395833333334
Avg. accuracy on test data:  0.970375


## Logistic regression

In [18]:
# Logistic regression classifiers on the low_dim input vectors.
evaluate_nested_cross_validaton(
    model="LR",
    input_vector="low_dim",
)

Iteration 1:
Accuracy on train data:  0.8797395833333334
Accuracy on test data:  0.8858333333333334
Iteration 2:
Accuracy on train data:  0.8859895833333333
Accuracy on test data:  0.8754166666666666
Iteration 3:
Accuracy on train data:  0.8809895833333333
Accuracy on test data:  0.8841666666666667
Iteration 4:
Accuracy on train data:  0.8841145833333334
Accuracy on test data:  0.8870833333333333
Iteration 5:
Accuracy on train data:  0.8851041666666667
Accuracy on test data:  0.8802083333333334
-----------Average results over 5 folds-----------
Avg. accuracy on train data:  0.8831875
Avg. accuracy on test data:  0.8825416666666668


In [19]:
# Logistic regression classifiers on the high_dim input vectors.
evaluate_nested_cross_validaton(
    model="LR",
    input_vector="high_dim",
)

Iteration 1:
Accuracy on train data:  0.9823958333333334
Accuracy on test data:  0.970625
Iteration 2:
Accuracy on train data:  0.9861979166666667
Accuracy on test data:  0.9754166666666667
Iteration 3:
Accuracy on train data:  0.9822395833333334
Accuracy on test data:  0.9760416666666667
Iteration 4:
Accuracy on train data:  0.9873958333333334
Accuracy on test data:  0.971875
Iteration 5:
Accuracy on train data:  0.9865625
Accuracy on test data:  0.973125
-----------Average results over 5 folds-----------
Avg. accuracy on train data:  0.9849583333333334
Avg. accuracy on test data:  0.9734166666666667


## SVM

In [23]:
# SVM classifiers on the low_dim input vectors.
evaluate_nested_cross_validaton(
    model="SVM",
    input_vector="low_dim",
)

Iteration 1:
Accuracy on train data:  0.9216145833333333
Accuracy on test data:  0.926875
Iteration 2:
Accuracy on train data:  0.9238541666666666
Accuracy on test data:  0.9145833333333333
Iteration 3:
Accuracy on train data:  0.9230208333333333
Accuracy on test data:  0.9183333333333333
Iteration 4:
Accuracy on train data:  0.9216145833333333
Accuracy on test data:  0.92375
Iteration 5:
Accuracy on train data:  0.92125
Accuracy on test data:  0.9245833333333333
-----------Average results over 5 folds-----------
Avg. accuracy on train data:  0.9222708333333334
Avg. accuracy on test data:  0.921625


In [24]:
# SVM classifiers on the high_dim input vectors.
evaluate_nested_cross_validaton(
    model="SVM",
    input_vector="high_dim",
)

Iteration 1:
Accuracy on train data:  1.0
Accuracy on test data:  0.99375
Iteration 2:
Accuracy on train data:  0.99984375
Accuracy on test data:  0.9933333333333333
Iteration 3:
Accuracy on train data:  1.0
Accuracy on test data:  0.99125
Iteration 4:
Accuracy on train data:  1.0
Accuracy on test data:  0.9914583333333333
Iteration 5:
Accuracy on train data:  1.0
Accuracy on test data:  0.9916666666666667
-----------Average results over 5 folds-----------
Avg. accuracy on train data:  0.9999687500000001
Avg. accuracy on test data:  0.9922916666666666
