In [2]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import optuna

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# Arrays the models are fitted on (files suffixed 02_jan): features plus labels.
# NOTE(review): "k_50" in the label file name presumably denotes the labelling
# horizon -- confirm against the feature-generation code.
features, labels = (
    np.load('../../data/features/normalized_features/normalized_02_jan.npy'),
    np.load('../../data/features/directional_labels/k_50_categorical_labels_02_jan.npy'),
)

# Arrays the models are evaluated on (files suffixed 03_jan), loaded the same way.
test_features, test_labels = (
    np.load('../../data/features/normalized_features/normalized_03_jan.npy'),
    np.load('../../data/features/directional_labels/k_50_categorical_labels_03_jan.npy'),
)

In [None]:
### Unoptimized
# Baseline: a random forest left on scikit-learn's default hyper-parameters
# (only the seed and worker count are set), fitted on the training arrays and
# then scored on both the training and the test arrays.

# Display names passed to classification_report.
# NOTE(review): assumes the sorted label values map to Down / No Change / Up
# in this order -- confirm against the label encoding.
classes = ['Down', 'No Change', 'Up']

rf = RandomForestClassifier(random_state=42, n_jobs=6).fit(features, labels)

# output_dict=True returns the report as a nested dict rather than formatted
# text, which is what later cells turn into a DataFrame.
train_predictions = rf.predict(features)
train_report = classification_report(
    labels,
    train_predictions,
    target_names=classes,
    output_dict=True,
)
print("Train Classification Report:")
print(train_report)


test_predictions = rf.predict(test_features)
test_report = classification_report(
    test_labels,
    test_predictions,
    target_names=classes,
    output_dict=True,
)
print("Test Classification Report:")
print(test_report)

In [4]:
# Tabulate each report dict with one row per report entry (hence the transpose)
# and persist the baseline results as CSV.
df_train_report = pd.DataFrame(train_report).T
df_train_report.to_csv(
    '../../results/optimisation_results/rfc/rf_train_unoptimised.csv'
)

df_test_report = pd.DataFrame(test_report).T
df_test_report.to_csv(
    '../../results/optimisation_results/rfc/rf_test_unoptimised.csv'
)

In [None]:
def objective(trial, val_fraction=0.2):
    """Optuna objective: validation accuracy of a random forest for one trial.

    The hyper-parameters are sampled from ``trial``; the forest is fitted on
    the leading part of the training arrays and scored on the trailing
    ``val_fraction`` of them. The test arrays are deliberately NOT used here:
    selecting hyper-parameters on the test set would make every later "test"
    metric optimistically biased.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyper-parameters.
    val_fraction : float, default 0.2
        Share of the training rows (taken from the end) held out for
        validation. Must lie strictly between 0 and 1.

    Returns
    -------
    float
        Mean accuracy on the held-out validation rows (value to maximise).

    Raises
    ------
    ValueError
        If ``val_fraction`` is out of range or leaves either split empty.
    """
    if not 0.0 < val_fraction < 1.0:
        raise ValueError("val_fraction must be strictly between 0 and 1")

    # Hold out the tail rather than a random sample.
    # NOTE(review): assumes rows of `features`/`labels` are in chronological
    # order, so the tail is the most recent data -- confirm. If the labels look
    # ahead over a horizon, consider leaving a gap of that many rows at the
    # boundary to avoid overlap between the two splits.
    split = int(len(features) * (1.0 - val_fraction))
    if split == 0 or split == len(features):
        raise ValueError("not enough training rows to carve out a validation split")

    search_space = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'max_depth': trial.suggest_int('max_depth', 5, 20),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 4),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2']),
    }

    # Fixed seed so that two trials with equal parameters give equal scores.
    rf = RandomForestClassifier(**search_space, random_state=42, n_jobs=6)
    rf.fit(features[:split], labels[:split])

    return rf.score(features[split:], labels[split:])
    

# Seed the sampler so that a fresh "Restart & Run All" explores the same
# sequence of trials and therefore selects the same best parameters. TPE is
# already Optuna's default single-objective sampler; only the seed is new.
study = optuna.create_study(
    direction='maximize',  # the objective returns an accuracy
    sampler=optuna.samplers.TPESampler(seed=42),
)

study.optimize(objective, n_trials=100)

In [None]:
# Report the hyper-parameters of the highest-scoring trial; they are kept in
# `params` and shown as the cell's output.
print("Best trial:")
params = study.best_params
params

In [None]:
### Optimized
# Refit with the hyper-parameters the study actually selected instead of
# literals copied in by hand, so this cell can never drift out of sync with
# the optimisation above.
# Values previously hard-coded here (presumably from an earlier run):
#   n_estimators=63, max_depth=5, min_samples_split=6, min_samples_leaf=4,
#   max_features='sqrt'
best_params = study.best_trial.params

rf = RandomForestClassifier(**best_params, random_state=42, n_jobs=6)

rf.fit(features, labels)

# Display names passed to classification_report.
# NOTE(review): assumes the sorted label values map to Down / No Change / Up
# in this order -- confirm against the label encoding.
classes = ['Down', 'No Change', 'Up']

# output_dict=True returns the report as a nested dict rather than formatted
# text, which is what later cells turn into a DataFrame.
train_report = classification_report(labels, rf.predict(features), target_names=classes, output_dict=True)
print("Train Classification Report:")
print(train_report)


test_report = classification_report(test_labels, rf.predict(test_features), target_names=classes, output_dict=True)
print("Test Classification Report:")
print(test_report)

In [9]:
# Tabulate each report dict with one row per report entry (hence the transpose)
# and persist the tuned-model results as CSV.
df_train_report = pd.DataFrame(train_report).T
df_train_report.to_csv(
    '../../results/optimisation_results/rfc/rf_train_optimised.csv'
)

df_test_report = pd.DataFrame(test_report).T
df_test_report.to_csv(
    '../../results/optimisation_results/rfc/rf_test_optimised.csv'
)