In [1]:
import numpy as np
import pandas as pd
import sklearn.datasets as skdata
import sklearn.metrics as skmetrics
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Location of the training CSV. Edit this so it points at your local copy of the data.
# A different dataset that can be swapped in instead:
#   "../augmented_train/reformatted_train_10000_samples.csv"
TRAIN_PATH = "train_1998_samples_fft_0_to_10_hz_consensus_1.0_balanced.csv"

# Read the CSV into a DataFrame, then convert it to a NumPy array so the
# following cell can slice features and labels by column position.
data = pd.read_csv(TRAIN_PATH)
data_np = data.to_numpy()

In [3]:
# Split the loaded table into features/labels and then into train, validation
# and test partitions (80% / 10% / 10%).

# Every column except the last is used as an input feature; the last column is
# used as the target.
num_input = data_np.shape[1] - 1

x = data_np[:, 0:num_input]
y = data_np[:, num_input]

# Shuffle the dataset based on sample indices.
# A dedicated, seeded generator makes the split identical after every
# "Restart Kernel -> Run All", so accuracies from different runs are comparable.
SPLIT_SEED = 42
split_rng = np.random.default_rng(SPLIT_SEED)
shuffled_indices = split_rng.permutation(x.shape[0])

# Choose the first 80% as training set, next 10% as validation and the rest as testing
train_split_idx = int(0.80 * x.shape[0])
val_split_idx = int(0.90 * x.shape[0])

train_indices = shuffled_indices[0:train_split_idx]
val_indices = shuffled_indices[train_split_idx:val_split_idx]
test_indices = shuffled_indices[val_split_idx:]

# The three index sets must cover every sample exactly once.
assert len(train_indices) + len(val_indices) + len(test_indices) == x.shape[0]

# Select the examples from x and y to construct our training, validation, testing sets
x_train, y_train = x[train_indices, :], y[train_indices]
x_val, y_val = x[val_indices, :], y[val_indices]
x_test, y_test = x[test_indices, :], y[test_indices]

In [7]:
# Grid search over random-forest split criterion and number of trees.
#
# Every configuration is fit on the training split and scored on all three
# splits. The winning configuration is chosen by VALIDATION accuracy only; the
# test split is kept out of model selection so that the reported test accuracy
# is an unbiased estimate for the chosen model.

# Fixed seed so each forest (bootstrap samples / feature sub-sampling) is
# rebuilt identically on re-run.
FOREST_SEED = 42

models = []
#metrics give node-splitting conditions for construction of trees
# NOTE: scikit-learn documents 'entropy' and 'log_loss' as the same Shannon
# information-gain criterion, so those two rows are expected to agree when the
# seed is fixed.
metrics = ['gini', 'entropy', 'log_loss']
tree_sizes = [10, 50, 100, 250, 500, 2000]

model_configs = []  # (metric, tree_size) for each entry of `models`
val_scores = []     # validation accuracy per model -- used for selection
model_scores = []   # test accuracy per model -- reporting only

for metric in metrics:
    for tree_size in tree_sizes:
        model = RandomForestClassifier(
            n_estimators=tree_size,
            criterion=metric,
            random_state=FOREST_SEED,
        )
        model.fit(x_train, y_train)
        models.append(model)
        model_configs.append((metric, tree_size))

        # accuracy_score takes the ground truth first, then the predictions.
        predictions_train = model.predict(x_train)
        score_train = skmetrics.accuracy_score(y_train, predictions_train)

        predictions_val = model.predict(x_val)
        score_val = skmetrics.accuracy_score(y_val, predictions_val)

        predictions_test = model.predict(x_test)
        score_test = skmetrics.accuracy_score(y_test, predictions_test)

        val_scores.append(score_val)
        model_scores.append(score_test)
        print('metric: {}, tree size: {}'.format(metric, tree_size))
        print('Training accuracy: {:0.5f}%  Validation accuracy: {:0.2f}% Testing accuracy: {:0.2f}%'.format(score_train*100, score_val*100, score_test*100))

# Pick the model with the highest validation accuracy (first one wins on ties).
best_model_idx = int(np.argmax(val_scores))
best_model = models[best_model_idx]
best_metric, best_tree_size = model_configs[best_model_idx]
best_model_prediction = best_model.predict(x_test)

print('Best model with metric {} and tree size {}'.format(best_metric, best_tree_size))
print('\n with validation accuracy: {:0.2f}%  testing accuracy: {:0.2f}%'.format(
    val_scores[best_model_idx]*100, model_scores[best_model_idx]*100))


metric: gini, tree size: 10
Training accuracy: 99.24576%  Validation accuracy: 43.22% Testing accuracy: 43.22%
metric: gini, tree size: 50
Training accuracy: 100.00000%  Validation accuracy: 45.73% Testing accuracy: 51.76%
metric: gini, tree size: 100
Training accuracy: 100.00000%  Validation accuracy: 47.24% Testing accuracy: 53.77%
metric: gini, tree size: 250
Training accuracy: 100.00000%  Validation accuracy: 49.25% Testing accuracy: 56.78%
metric: gini, tree size: 500
Training accuracy: 100.00000%  Validation accuracy: 49.25% Testing accuracy: 56.28%
metric: gini, tree size: 2000
Training accuracy: 100.00000%  Validation accuracy: 48.24% Testing accuracy: 57.29%
metric: entropy, tree size: 10
Training accuracy: 99.49717%  Validation accuracy: 38.69% Testing accuracy: 46.73%
metric: entropy, tree size: 50
Training accuracy: 100.00000%  Validation accuracy: 45.23% Testing accuracy: 51.76%
metric: entropy, tree size: 100
Training accuracy: 100.00000%  Validation accuracy: 50.75% Test