In [1]:
import os
import sys
import joblib
sys.path.append("../")

import numpy as np
from sklearn.preprocessing import MaxAbsScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from scripts.sampling import sampling, balanceSampling
from scripts.features_extraction import load_features
import scripts.logger as log
import time

# Wall-clock start; the elapsed time is logged after training and evaluation.
start_time = time.time()

# Feature sets passed to the project loader (per the original note, the data
# comes from .npz files). An earlier experiment used names = ["symptoms"].
names = ["normalized/community_size_norm", "normalized/community_count_norm"]
features, labels = load_features(names)

# NOTE(review): balancing runs BEFORE the train/test split. If balanceSampling
# duplicates or synthesizes samples, copies of one sample could land in both
# splits and inflate the test accuracy — confirm against scripts.sampling.
features, labels = balanceSampling(features, labels, threshold=35)
log.simple_logger(f'Features size: {features.shape}')
log.simple_logger(f'Labels size: {labels.shape}')

# Hold out 20% of the samples for testing; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42
)

# MaxAbsScaler normalization is disabled (presumably because the
# "normalized/..." feature sets are already scaled — TODO confirm):
# scaler = MaxAbsScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

# Persist the exact split so it can be reloaded later. The target folder is
# created first: np.savez_compressed does not create missing directories and
# would otherwise raise FileNotFoundError. numpy appends the ".npz" extension.
features_archive_path = "../prediction_model/features/RF_mix_data_norm"
os.makedirs(os.path.dirname(features_archive_path), exist_ok=True)
np.savez_compressed(features_archive_path, Xtest=X_test, Xtrain=X_train, Ytest=y_test, Ytrain=y_train)

# Hyper-parameter grid for the Random Forest search. Each parameter has a
# single active value, so the search evaluates one candidate with 3-fold
# cross-validation. The original inline notes listed 200/600 for n_estimators
# and 75/50 for max_depth as values tried.
param_grid = {
    "n_estimators": [600],
    "max_depth": [50],
    # "min_samples_split": [2],
    # "min_samples_leaf": [1],
}

# Base estimator with a fixed seed.
random_forest = RandomForestClassifier(random_state=42)

# Accuracy-scored, 3-fold grid search; n_jobs=-1 lets the folds run in parallel.
grid_search = GridSearchCV(
    estimator=random_forest,
    param_grid=param_grid,
    scoring="accuracy",
    cv=3,
    n_jobs=-1,
    verbose=3,
)
grid_search.fit(X_train, y_train)

# Winning estimator and the grid entry that produced it.
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

# Print the estimator's full parameter dict once, then log it entry by entry.
all_params = best_model.get_params()
print(all_params)
for param, value in all_params.items():
    log.simple_logger(f'{param}: {value}')


# Score the selected estimator on the held-out test split.
predictions = best_model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)

# The classification report is computed but not displayed; its print stays disabled.
report = classification_report(y_test, predictions)
# print("Classification Report:\n", report)

print(f"Best Parameters: {best_params}")
print("Best Model Accuracy:", accuracy)

log.simple_logger(f'Accuracy: {accuracy}')

# Wall-clock duration since start_time was recorded earlier in this cell.
end_time = time.time()
execution_time = end_time - start_time

# Break the duration into whole hours, whole minutes and fractional seconds.
hours, remainder = divmod(execution_time, 3600)
minutes, seconds = divmod(remainder, 60)

# Render as HH:MM:SS.ssss. The seconds field is zero-padded to width 7
# ("SS.ssss") so it matches the two-digit hour and minute fields; without the
# padding a 3-second remainder rendered as "00:05:3.1234".
formatted_execution_time = f'{int(hours):02}:{int(minutes):02}:{seconds:07.4f}'

# The value is an hh:mm:ss reading, not a count of seconds, so label it as such.
log.simple_logger(f'The code took {formatted_execution_time} (hh:mm:ss) to execute.')
log.simple_logger("\n\n")

# File name of the serialized model. The variable names below still say
# "logistic_regression", but the object written is `best_model`, the
# RandomForestClassifier chosen by the grid search. The names are kept as-is
# in case other cells refer to them.
# Earlier file names: 'random_forest_model_symtoms.joblib', 'RF_mix_data_norm.joblib'
logistic_regression_model_filename = 'RF_mix_norm.joblib'

# Models are stored in a "models" folder under the kernel's working directory.
relative_save_directory = 'models'
current_directory = os.getcwd()
absolute_save_directory = os.path.join(current_directory, relative_save_directory)

# Create the folder if it is missing; an existing folder is left untouched.
os.makedirs(absolute_save_directory, exist_ok=True)

# Full destination path of the model file.
logistic_regression_model_path = os.path.join(
    absolute_save_directory, logistic_regression_model_filename
)

# Serialize the fitted estimator. Kept as the cell's final expression so the
# notebook echoes the list of written file paths returned by joblib.dump.
joblib.dump(best_model, logistic_regression_model_path)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 50, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 600, 'n_jobs': None, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}
Best Parameters: {'max_depth': 50, 'n_estimators': 600}
Best Model Accuracy: 0.8705118411000764


['c:\\Users\\Cristian\\Documents\\GitHub\\financial-project\\code\\prediction_model\\models\\RF_mix_norm.joblib']