In [1]:
import sys
sys.path.append("../")

import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from scripts.sampling import sampling, balanceSampling
import scripts.logger as log
import time

start_time = time.time()

# Load data from .npz file
data = np.load("features/symptoms.npz")
features = data["X"]
labels = data["y"]

features, labels = balanceSampling(features, labels)
log.simple_logger(f'Features size: {features.shape}')
log.simple_logger(f'Labels size: {labels.shape}')

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42
)

# Define the parameter grid to search for Random Forest
param_grid = {
    "n_estimators": [300],
    "max_depth": [None],
    "min_samples_split": [5],
    "min_samples_leaf": [1],
}

# Create a Random Forest classifier
random_forest = RandomForestClassifier(random_state=42)

# Create the GridSearchCV object
grid_search = GridSearchCV(
    random_forest, param_grid, verbose=3, cv=3, scoring="accuracy", n_jobs=-1
)

# Fit the GridSearchCV object to the data
grid_search.fit(X_train, y_train)

# Get the best parameters and the corresponding model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print(best_model.get_params)
all_params = best_model.get_params()
# Print the parameters
for param, value in all_params.items():
    log.simple_logger(f'{param}: {value}')


# Make predictions on the test set using the best model
predictions = best_model.predict(X_test)

# Evaluate the best model
accuracy = accuracy_score(y_test, predictions)
report = classification_report(y_test, predictions)

print("Best Parameters:", best_params)
print(f"Best Model Accuracy: {accuracy}")
#print("Classification Report:\n", report)

log.simple_logger(f'Accuracy: {accuracy}')

end_time = time.time()
execution_time = end_time - start_time

# Convert seconds to hours, minutes, and seconds
hours, remainder = divmod(execution_time, 3600)
minutes, seconds = divmod(remainder, 60)

# Format the execution time
formatted_execution_time = f'{int(hours):02}:{int(minutes):02}:{seconds:.4f}'

log.simple_logger(f'The code took {formatted_execution_time} seconds to execute.')
log.simple_logger("\n\n")

Fitting 3 folds for each of 1 candidates, totalling 3 fits
<bound method BaseEstimator.get_params of RandomForestClassifier(min_samples_split=5, n_estimators=300, random_state=42)>
Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 300}
Best Model Accuracy: 0.869


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
