# In [None]:
import numpy as np
from cuml.ensemble import RandomForestClassifier
from joblib import load, dump
from cuml.model_selection import StratifiedKFold
from cuml.metrics import accuracy_score, classification_report
from cuml.preprocessing.model_selection import train_test_split
import optuna

# Load the feature array and its label array from joblib dumps.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load dumps that come from a trusted source.
# NOTE(review): the file names suggest FFT features for a 3-class problem;
# confirm against whatever produced these dumps.
combined_data = load('../../DataDumps/3classfft.joblib')
combined_labels = load('../../DataDumps/3class_labels.joblib')

# Define a function to reshape the data
def reshape_data(data):
    """Collapse the second and third axes of ``data`` into a single axis.

    Args:
        data: Array-like object exposing ``shape`` and ``reshape``. Its first
            three shape entries are read as (samples, timesteps, channels).

    Returns:
        ``data`` reshaped to ``(samples, timesteps * channels)``.
    """
    dims = data.shape
    # Axis 0 is kept as-is; axes 1 and 2 are merged into one feature axis.
    sample_count, flat_width = dims[0], dims[1] * dims[2]
    return data.reshape(sample_count, flat_width)

# Flatten each sample so the data becomes a 2-D (samples, features) array.
combined_data = reshape_data(combined_data)  

# Split data into a training/validation portion (80%) and a held-out test set (20%).
# A fixed random_state is passed so the split can be repeated.
# NOTE(review): unlike the stratified folds defined just below, this split
# passes no `stratify` argument, so class proportions in the test set are left
# to chance. Consider stratifying on the labels if the installed cuML
# train_test_split supports it and the classes are imbalanced.
X_train_val, X_test, y_train_val, y_test = train_test_split(combined_data, combined_labels, test_size=0.2, random_state=42)

# Define K-Fold cross-validation using cuML: 5 shuffled, stratified folds
# with a fixed seed.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# Define objective function for Optuna
def objective(trial):
    """Score one Optuna trial by its mean cross-validated accuracy.

    Draws a random-forest configuration from ``trial``, then, for every fold
    produced by the module-level ``skf`` splitter over ``X_train_val`` /
    ``y_train_val``, fits a fresh ``RandomForestClassifier`` on the training
    rows and measures accuracy on the validation rows.

    Args:
        trial: Optuna trial used to sample the hyperparameter values.

    Returns:
        ``np.mean`` of the per-fold validation accuracies.
    """
    # The suggestions are drawn in this fixed order on every call.
    # NOTE(review): 'auto' may be an alias of 'sqrt' in cuML's random forest
    # (and deprecated in newer releases). Confirm; if so, that setting is
    # sampled twice as often as the others.
    search_space = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'max_depth': trial.suggest_int('max_depth', 1, 50),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
        'max_features': trial.suggest_categorical('max_features', ['auto', 'sqrt', 'log2', None]),
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False])
    }

    def fold_accuracy(fit_idx, held_idx):
        """Fit on one fold's training rows and return validation accuracy."""
        fit_X, held_X = X_train_val[fit_idx], X_train_val[held_idx]
        fit_y, held_y = y_train_val[fit_idx], y_train_val[held_idx]

        # A new forest per fold, so no state carries over between folds.
        forest = RandomForestClassifier(**search_space)
        forest.fit(fit_X, fit_y)
        return accuracy_score(held_y, forest.predict(held_X))

    scores = [
        fold_accuracy(fit_idx, held_idx)
        for fit_idx, held_idx in skf.split(X_train_val, y_train_val)
    ]

    # The trial's objective value is the average over all folds.
    return np.mean(scores)

# Perform hyperparameter tuning using Optuna: maximize the value returned by
# `objective` over 10 trials.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)

# Get the best hyperparameters found
best_params = study.best_params
print("Best Hyperparameters:", best_params)

# Initialize Random Forest classifier with best hyperparameters
best_rf = RandomForestClassifier(**best_params)

# Train the best model on the full training/validation split. The test split
# is not used here, so it remains unseen for the evaluation at the end.
best_rf.fit(X_train_val, y_train_val)

# Save the fitted model to 'best_model.joblib' in the current working directory
dump(best_rf, 'best_model.joblib')

# Evaluate the best model on the held-out test set
predictions_test = best_rf.predict(X_test)
accuracy_test = accuracy_score(y_test, predictions_test)
print("Accuracy on Test Set:", accuracy_test)
# NOTE(review): classification_report is imported from cuml.metrics at the top
# of the file; confirm the installed cuML version actually provides it.
print(classification_report(y_test, predictions_test))
