In [1]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from joblib import load, dump
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
import optuna

In [2]:
combined_data = load('../../DataDumps/3classfft.joblib')
combined_labels = load('../../DataDumps/3class_labels.joblib')

In [3]:
def reshape_data(data):
    num_samples = data.shape[0]
    num_timesteps = data.shape[1]
    num_channels = data.shape[2]  # Assuming the 3rd dimension is the number of channels
    return data.reshape(num_samples, num_timesteps * num_channels) 


combined_data = reshape_data(combined_data)  
# Split data into training/validation and test sets
X_train_val, X_test, y_train_val, y_test = train_test_split(combined_data, combined_labels, test_size=0.2, random_state=42)


In [4]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

In [5]:
def objective(trial):
    # Define hyperparameters to optimize
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 10, 500),
        'max_depth': trial.suggest_int('max_depth', 1, 50),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False])
    }

    # Initialize variables to store fold accuracies
    fold_accuracies = []

    # Perform K-Fold cross-validation
    for train_index, val_index in skf.split(X_train_val, y_train_val):
        X_train, X_val = X_train_val[train_index], X_train_val[val_index]
        y_train, y_val = y_train_val[train_index], y_train_val[val_index]
        
        # Initialize Random Forest classifier with current hyperparameters
        rf = RandomForestClassifier(**params)
        
        # Train the classifier
        rf.fit(X_train, y_train)
        
        # Make predictions on validation set
        predictions_val = rf.predict(X_val)
        
        # Calculate accuracy on validation set
        accuracy_val = accuracy_score(y_val, predictions_val)
        
        # Store accuracy for current fold
        fold_accuracies.append(accuracy_val)

    # Calculate average accuracy across all folds
    avg_accuracy = np.mean(fold_accuracies)

    return avg_accuracy

In [6]:
# Perform hyperparameter tuning using Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=500, n_jobs=-1)

# Get the best hyperparameters found
best_params = study.best_params
print("Best Hyperparameters:", best_params)

# Initialize Random Forest classifier with best hyperparameters
best_rf = RandomForestClassifier(**best_params)

# Train the best model on the entire dataset
best_rf.fit(X_train_val, y_train_val)

# Save the best model if needed
dump(best_rf, './ModelDumps/RF-Base-3.joblib')

# Evaluate the best model on the test set
predictions_test = best_rf.predict(X_test)
accuracy_test = accuracy_score(y_test, predictions_test)
print("Accuracy on Test Set:", accuracy_test)
print(classification_report(y_test, predictions_test))

[I 2024-04-21 22:55:30,508] A new study created in memory with name: no-name-2b5b4678-f982-4202-a0c8-afbd54a90472
[I 2024-04-21 22:55:42,911] Trial 18 finished with value: 0.6973214285714285 and parameters: {'n_estimators': 138, 'max_depth': 18, 'min_samples_split': 16, 'min_samples_leaf': 12, 'max_features': 'log2', 'bootstrap': True}. Best is trial 18 with value: 0.6973214285714285.
[I 2024-04-21 22:55:46,009] Trial 3 finished with value: 0.6937499999999999 and parameters: {'n_estimators': 156, 'max_depth': 25, 'min_samples_split': 5, 'min_samples_leaf': 8, 'max_features': 'log2', 'bootstrap': True}. Best is trial 18 with value: 0.6973214285714285.
[I 2024-04-21 22:55:46,427] Trial 8 finished with value: 0.70625 and parameters: {'n_estimators': 136, 'max_depth': 23, 'min_samples_split': 15, 'min_samples_leaf': 3, 'max_features': 'log2', 'bootstrap': True}. Best is trial 8 with value: 0.70625.
[I 2024-04-21 22:55:47,218] Trial 5 finished with value: 0.7 and parameters: {'n_estimators'

Best Hyperparameters: {'n_estimators': 198, 'max_depth': 26, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'bootstrap': False}
Accuracy on Test Set: 0.7046263345195729
              precision    recall  f1-score   support

           0       0.71      0.89      0.79        90
           1       0.74      0.80      0.77        95
           2       0.65      0.44      0.52        96

    accuracy                           0.70       281
   macro avg       0.70      0.71      0.69       281
weighted avg       0.70      0.70      0.69       281



In [9]:
print(confusion_matrix(y_test, predictions_test))

[[80  3  7]
 [ 3 76 16]
 [30 24 42]]
