In [1]:
import classification_trees as trees
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
import itertools
from typing import Tuple

In [2]:
def accuracy(y_test, y_pred):
    """Print and return the fraction of predictions that match the true labels.

    The score is computed directly as ``matches / samples``. For binary
    labels this equals ``(tn + tp) / (tn + tp + fp + fn)``, but unlike
    unpacking a confusion matrix into four cells it also works when only a
    single class occurs in the inputs (the matrix would then be 1x1).

    Args:
        y_test: Array-like of true labels.
        y_pred: Array-like of predicted labels, same shape as ``y_test``.

    Returns:
        The accuracy as a float in ``[0, 1]``.

    Raises:
        ValueError: If the inputs are empty or their shapes differ.
    """
    y_true = np.asarray(y_test)
    y_hat = np.asarray(y_pred)

    # Guard explicitly: `==` on mismatched shapes would broadcast or collapse
    # to a scalar instead of failing loudly.
    if y_true.shape != y_hat.shape:
        raise ValueError(
            f'y_test and y_pred must have the same shape, got {y_true.shape} and {y_hat.shape}'
        )
    if y_true.size == 0:
        raise ValueError('accuracy is undefined for empty inputs')

    score = np.count_nonzero(y_true == y_hat) / y_true.size
    print(f'Accuracy {score:.2f}')
    return score

In [3]:
# Load the CSV into one float matrix, then separate it into the feature
# columns (first eight) and the label column (ninth), with labels as int64.
data = np.genfromtxt("pima-indians-diabetes.csv", delimiter=',')
X = data[:, :8]
y = data[:, 8].astype('int64')

In [4]:
# Hold out 5% of the full dataset as a validation set.
# The split is taken from `data` rather than from the current X, y: rebinding
# X, y from themselves would shrink the training set (and change the
# validation set) every time this cell is re-run.
X, X_val, y, y_val = train_test_split(
    data[:, 0:8],
    data[:, 8].astype('int64'),
    test_size=0.05,
    random_state=1234,
)
y_val

array([0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0])

In [5]:
# Test, don't use
# Fits a forest with the constructor defaults on the training split, reports
# how long the fit took, and scores it on the validation split.
# perf_counter() is used instead of time.time() because it is a monotonic,
# high-resolution clock intended for measuring elapsed intervals.
t1 = time.perf_counter()

rf_clf = trees.RandomForestClassifier()
rf_clf.fit(X, y)

t2 = time.perf_counter()
print(f'Time: {t2-t1:.2f}s')

y_pred = rf_clf.predict(X_val)
accuracy(y_val, y_pred)

Time: 4.09s
Accuracy 0.79


0.7948717948717948

In [6]:
# Cross-validated scoring of a single RF hyperparameter combination
def find_best_hyperparams(params: Tuple[int, int, int], n_splits: int = 5) -> Tuple[int, int, int, float]:
    """Score one (nmin, minleaf, ntrees) combination with k-fold cross-validation.

    Despite its name, this function evaluates exactly one combination; the
    caller is expected to map it over a grid and pick the best result.
    It reads the module-level training arrays ``X`` and ``y``.

    Args:
        params: Tuple ``(nmin, minleaf, ntrees)`` forwarded as keyword
            arguments to ``trees.RandomForestClassifier``.
        n_splits: Number of cross-validation folds. Defaults to 5, the value
            that was previously hard-coded.

    Returns:
        ``(nmin, minleaf, ntrees, mean_accuracy)`` where ``mean_accuracy`` is
        the mean of the per-fold accuracies.
    """
    nmin, minleaf, ntrees = params
    print(nmin, minleaf, ntrees)

    # KFold is built with only n_splits, so sklearn's defaults apply otherwise.
    kf = KFold(n_splits=n_splits)

    # One accuracy per held-out fold, i.e. n_splits fitted models per combination.
    accuracies = []
    for train_idxs, test_idxs in kf.split(X):
        rf_clf = trees.RandomForestClassifier(ntrees=ntrees, nmin=nmin, minleaf=minleaf)
        rf_clf.fit(X[train_idxs], y[train_idxs])
        y_pred = rf_clf.predict(X[test_idxs])
        accuracies.append(accuracy(y[test_idxs], y_pred))

    return (nmin, minleaf, ntrees, np.mean(accuracies))
    
    
# Candidate values for each hyperparameter; every combination is evaluated.
nmin = range(2, 4)
minleaf = range(1, 3)
ntrees = [10, 20, 30, 40, 50, 100, 150, 200]
cartesian_product = [combo for combo in itertools.product(nmin, minleaf, ntrees)]

# One (nmin, minleaf, ntrees, mean accuracy) result per combination, in grid order.
hyperparams = [find_best_hyperparams(combo) for combo in cartesian_product]

2 1 10
Accuracy 0.73
Accuracy 0.77
Accuracy 0.72
Accuracy 0.75
Accuracy 0.74
2 1 20
Accuracy 0.74
Accuracy 0.78
Accuracy 0.70
Accuracy 0.77
Accuracy 0.74
2 1 30
Accuracy 0.72
Accuracy 0.79
Accuracy 0.74
Accuracy 0.78
Accuracy 0.74
2 1 40
Accuracy 0.73
Accuracy 0.80
Accuracy 0.75
Accuracy 0.80
Accuracy 0.77
2 1 50
Accuracy 0.73
Accuracy 0.80
Accuracy 0.73
Accuracy 0.80
Accuracy 0.76
2 1 100
Accuracy 0.73
Accuracy 0.79
Accuracy 0.73
Accuracy 0.81
Accuracy 0.76
2 1 150
Accuracy 0.74
Accuracy 0.79
Accuracy 0.77
Accuracy 0.82
Accuracy 0.75
2 1 200
Accuracy 0.75
Accuracy 0.79
Accuracy 0.73
Accuracy 0.81
Accuracy 0.77
2 2 10
Accuracy 0.74
Accuracy 0.75
Accuracy 0.69
Accuracy 0.77
Accuracy 0.75
2 2 20
Accuracy 0.75
Accuracy 0.79
Accuracy 0.71
Accuracy 0.79
Accuracy 0.72
2 2 30
Accuracy 0.77
Accuracy 0.76
Accuracy 0.70
Accuracy 0.82
Accuracy 0.74
2 2 40
Accuracy 0.77
Accuracy 0.82
Accuracy 0.73
Accuracy 0.80
Accuracy 0.77
2 2 50
Accuracy 0.75
Accuracy 0.77
Accuracy 0.74
Accuracy 0.80
Accuracy 0

In [8]:
# Select the result tuple with the highest mean accuracy (stored at index 3).
# It is bound to a name so later cells can use it without relying on Out[...].
best_hyperparams = max(hyperparams, key=lambda result: result[3])
best_hyperparams

(2, 2, 40, 0.7777704298535664)

In [7]:
# Display every (nmin, minleaf, ntrees, mean accuracy) tuple collected above.
hyperparams

[(2, 1, 10, 0.7434860651865848),
 (2, 1, 20, 0.7462163438828531),
 (2, 1, 30, 0.7530656589513463),
 (2, 1, 40, 0.7695418044402456),
 (2, 1, 50, 0.764052905054322),
 (2, 1, 100, 0.7640529050543222),
 (2, 1, 150, 0.7736324988190836),
 (2, 1, 200, 0.7668020784128483),
 (2, 2, 10, 0.7393859234766178),
 (2, 2, 20, 0.7544166273027869),
 (2, 2, 30, 0.7558053849787435),
 (2, 2, 40, 0.7777704298535664),
 (2, 2, 50, 0.7599149740198393),
 (2, 2, 100, 0.7640717997165801),
 (2, 2, 150, 0.7668115257439773),
 (2, 2, 200, 0.7695418044402456),
 (3, 1, 10, 0.7434860651865848),
 (3, 1, 20, 0.7448748228625414),
 (3, 1, 30, 0.754426074633916),
 (3, 1, 40, 0.7612659423712802),
 (3, 1, 50, 0.7667926310817194),
 (3, 1, 100, 0.7585640056683987),
 (3, 1, 150, 0.7599433160132263),
 (3, 1, 200, 0.7764100141709968),
 (3, 2, 10, 0.7448842701936703),
 (3, 2, 20, 0.7613604156825697),
 (3, 2, 30, 0.7517052432687766),
 (3, 2, 40, 0.7585734529995275),
 (3, 2, 50, 0.7736608408124704),
 (3, 2, 100, 0.7722909777987719),
 (

In [None]:
# Train the RF classifier with the best hyperparameters and measure its accuracy on the validation set