## Setup

In [0]:
import os
import pickle
from itertools import product

from scipy.io import loadmat
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier

if not os.path.exists("model"):
    os.makedirs("model")

## Load data

In [0]:
train_data = dict()
test_data = dict()

for dataset in os.listdir("data"):
    data_dir = f"data/{dataset}"
    train_data[dataset] = loadmat(f"{data_dir}/{dataset}_Train.mat")
    test_data[dataset] = loadmat(f"{data_dir}/{dataset}_Test.mat")

assert len(train_data.keys()) == len(test_data.keys())

## Train

In [0]:
folds = 4
num_trees = [50, 100, 150, 200, 250]
min_node_sizes = [1, 3, 5, 7, 9]
num_features = ['sqrt', 'log2', None]

cross_validator = StratifiedKFold(n_splits=folds)
for dataset, mat in train_data.items():
    print(f"dataset: {dataset:<20}", end='')

    data = mat['Data']
    labels = mat['Label'].flatten()

    # stratified k-fold cross-validation
    validation_accuracies = dict()
    for n_tree, node_size, n_feat in product(num_trees, min_node_sizes, num_features):
        cumulative_accuracy = 0
        for train_indices, test_indices in cross_validator.split(data, labels):
            model = RandomForestClassifier(n_estimators=n_tree, min_samples_leaf=node_size, max_features=n_feat)
            model.fit(data[train_indices], labels[train_indices])
            cumulative_accuracy += model.score(data[test_indices], labels[test_indices])
        validation_accuracies[(n_tree, node_size, n_feat)] = cumulative_accuracy / folds

    (n_tree, node_size, n_feat), validation_acc = list(sorted(validation_accuracies.items(), key=lambda item: item[1], reverse=True))[0]
    model = RandomForestClassifier(n_estimators=n_tree, min_samples_leaf=node_size, max_features=n_feat)
    model.fit(data, labels)

    print(f"n_estimators: {n_tree:<5}", end='')
    print(f"min_samples_leaf: {node_size:<3}", end='')
    print(f"max_features: {str(n_feat):<6}", end='')
    print(f"Validation accuracy: {validation_acc:<6.3f}", end='')
    print(f"Train accuracy: {model.score(data, labels):.3f}")

    # save model
    with open(f"model/rf_{dataset}", 'wb') as f:
        pickle.dump(model, f)

dataset: bank                n_estimators: 50   min_samples_leaf: 3  max_features: sqrt  Validation accuracy: 0.898 Train accuracy: 0.957
dataset: car                 n_estimators: 100  min_samples_leaf: 1  max_features: sqrt  Validation accuracy: 0.976 Train accuracy: 1.000
dataset: abalone             n_estimators: 200  min_samples_leaf: 7  max_features: sqrt  Validation accuracy: 0.657 Train accuracy: 0.819
dataset: image-segmentation  n_estimators: 100  min_samples_leaf: 1  max_features: sqrt  Validation accuracy: 0.973 Train accuracy: 1.000
dataset: plant-shape         n_estimators: 200  min_samples_leaf: 1  max_features: log2  Validation accuracy: 0.616 Train accuracy: 1.000
dataset: wine-quality-red    n_estimators: 250  min_samples_leaf: 1  max_features: log2  Validation accuracy: 0.694 Train accuracy: 1.000
dataset: titanic             n_estimators: 50   min_samples_leaf: 5  max_features: log2  Validation accuracy: 0.793 Train accuracy: 0.793
dataset: ozone               n_est

## Test

In [0]:
for dataset, mat in test_data.items():
    print(f"dataset: {dataset:<20}", end='')

    # load model
    with open(f"model/rf_{dataset}", 'rb') as f:
        model = pickle.load(f)

    print(f"n_estimators: {model.n_estimators:<5}", end='')
    print(f"min_samples_leaf: {model.min_samples_leaf:<3}", end='')
    print(f"max_features: {str(model.max_features):<6}", end='')
    print(f"Test accuracy: {model.score(mat['Data'], mat['Label'].flatten())}")

dataset: bank                n_estimators: 50   min_samples_leaf: 3  max_features: sqrt  Test accuracy: 0.9005524861878453
dataset: car                 n_estimators: 100  min_samples_leaf: 1  max_features: sqrt  Test accuracy: 0.9682080924855492
dataset: abalone             n_estimators: 200  min_samples_leaf: 7  max_features: sqrt  Test accuracy: 0.6519138755980861
dataset: image-segmentation  n_estimators: 100  min_samples_leaf: 1  max_features: sqrt  Test accuracy: 0.9805194805194806
dataset: plant-shape         n_estimators: 200  min_samples_leaf: 1  max_features: log2  Test accuracy: 0.596875
dataset: wine-quality-red    n_estimators: 250  min_samples_leaf: 1  max_features: log2  Test accuracy: 0.671875
dataset: titanic             n_estimators: 50   min_samples_leaf: 5  max_features: log2  Test accuracy: 0.782312925170068
dataset: ozone               n_estimators: 100  min_samples_leaf: 3  max_features: log2  Test accuracy: 0.9822834645669292
dataset: led-display         n_estima