# Importing Libraries and Custom Classes

In [None]:
# Import libs
import os
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_score
import joblib
import json

# Importing custom modules
from src.classes.data_loader import DataLoader
from src.classes.data_saver import DataSaver
from src.classes.class_distribution import ClassDistribution
from src.classes.preprocessing import Preprocessing
from src.classes.hyperparameter_tuning import HyperparameterTuning
from src.classes.evaluation import Evaluation
from src.classes.random_forest_model import RandomForestModel
from src.classes.visualize_datas import VisualizeDatas

## Setting Up Paths and Parameters

In [None]:
# Setting up paths and parameters
# All locations are relative paths, resolved against the working directory.
base_path = "assets/sorted_data"          # root handed to DataLoader
output_datas = "outputs/reports"          # holds the run_<n> folders and reports.json
output_models = "outputs/trained_models"  # holds the persisted model artifacts
model_path = f"{output_models}/random_forest_model.pkl"
pca_path = f"{output_models}/pca_model.pkl"
class_names = ['NORMAL', 'BACTERIA', 'VIRUS']

# Create the output directories when they are missing. exist_ok=True keeps the
# cell idempotent and removes the check-then-create race of os.path.exists().
os.makedirs(output_datas, exist_ok=True)
os.makedirs(output_models, exist_ok=True)

## Loading Data

In [None]:
# Load data
# DataLoader is constructed with base_path; each load_* call yields an
# (images, labels) pair for one split.
data_loader = DataLoader(base_path)
try:
    test_images, test_labels = data_loader.load_test_images()
    train_images, train_labels = data_loader.load_train_images()
    val_images, val_labels = data_loader.load_val_images()
except Exception as e:
    print(f"Error loading images: {e}")
    # Re-raise instead of continuing: every later cell needs these variables,
    # so swallowing the error would only resurface as a NameError further down.
    raise
# Derive the next run number from the highest existing "run_<n>" directory
# rather than from the directory count. Counting breaks as soon as a run folder
# is deleted or an unrelated directory is present: the new run could then reuse
# an existing run_<n> name and overwrite its reports.
existing_runs = [
    int(name[len("run_"):])
    for name in os.listdir(output_datas)
    if name.startswith("run_")
    and name[len("run_"):].isdecimal()
    and os.path.isdir(os.path.join(output_datas, name))
]
run_number = max(existing_runs, default=0) + 1
run_folder = os.path.join(output_datas, f"run_{run_number}")
os.makedirs(run_folder, exist_ok=True)

## Checking Class Distribution

In [None]:
# Check class distribution
# Report the label distribution of each split, then compare the number of
# distinct training labels with the number of expected class names.
# Any failure in this cell is printed rather than raised.
# NOTE(review): these calls receive run_number, not run_folder — confirm that
# matches what ClassDistribution.detailed_class_distribution expects.
try:
    ClassDistribution.detailed_class_distribution(train_labels, "Training", run_number)
    ClassDistribution.detailed_class_distribution(val_labels, "Validation", run_number)
    ClassDistribution.detailed_class_distribution(test_labels, "Test", run_number)

    unique_classes = np.unique(train_labels)
    print(
        "All classes are present in the data set."
        if len(unique_classes) >= len(class_names)
        else "Training set does not contain all classes. Consider augmenting the dataset."
    )
except Exception as exc:
    print(f"Error: {exc}")

## Preprocessing Data

In [None]:
# Preprocess data
# NOTE(review): train_labels / val_labels / test_labels are rebound to the
# preprocessed labels here, so re-running this cell feeds already processed
# labels back in — restart and run all cells instead of re-running it alone.
preprocessing = Preprocessing()
try:
    # Only the training split is passed with fit=True (and SMOTE balancing);
    # the validation and test splits are passed with fit=False.
    train_features, train_labels = preprocessing.preprocess_data(train_images, train_labels, fit=True, balance_method='smote')
    val_features, val_labels = preprocessing.preprocess_data(val_images, val_labels, fit=False)
    test_features, test_labels = preprocessing.preprocess_data(test_images, test_labels, fit=False)
except Exception as e:
    print(f"Error: {e}")
    # Re-raise: the feature arrays are required by every following cell, so
    # continuing would only fail later with a less helpful NameError.
    raise


def _to_single_label(labels):
    """Return labels as a one-dimensional array of class indices.

    Arrays with more than one dimension are reduced with ``np.argmax`` along
    axis 1; arrays that are already one-dimensional are returned unchanged.
    """
    return np.argmax(labels, axis=1) if labels.ndim > 1 else labels


# Convert labels to single dimension
try:
    train_labels_single = _to_single_label(train_labels)
    val_labels_single = _to_single_label(val_labels)
    test_labels_single = _to_single_label(test_labels)
except Exception as e:
    # Report the real cause and the real step, then stop.
    print(f"Error converting labels to single dimension: {e}")
    raise

## Hyperparameter Tuning

In [None]:
# Perform hyperparameter tuning
# 10-fold stratified splitter, used by the search below and again by the
# cross-validation in the evaluation cell.
skf = StratifiedKFold(n_splits=10)

# Candidate values for the randomized search.
# NOTE(review): 'bootstrap', 'max_samples' and 'criterion' are searched but are
# not forwarded to RandomForestModel below — confirm whether that is intended.
param_dist = dict(
    n_estimators=[100, 200, 300, 400, 500],
    max_features=['sqrt', 'log2', None],
    max_depth=[10, 15, 20, 25, 32, None],
    min_samples_split=[50, 100, 150],
    min_samples_leaf=[20, 30, 50],
    bootstrap=[True],
    max_samples=[0.8, 0.9, 1.0],
    criterion=['gini', 'entropy'],
)

# Stay None when an existing model is loaded instead of tuned.
best_params = best_score = None

# Metric used for the search and for the evaluation cells. The later cells
# also contain a dedicated "roc_auc" path; any other value takes the accuracy path.
scoring = "accuracy"

# Hyperparameter Tuning or Loading Model
if not (os.path.exists(model_path) and os.path.exists(pca_path)):
    # No saved artifacts: search, fit on the training features, then persist.
    best_params, best_score = HyperparameterTuning().randomized_search_hyperparameters(
        train_features,
        train_labels_single,
        cv=skf,
        param_dist=param_dist,
        scoring=scoring,
    )

    rf_model = RandomForestModel(
        random_state=42,
        **{
            name: best_params[name]
            for name in (
                'n_estimators',
                'max_features',
                'max_depth',
                'min_samples_split',
                'min_samples_leaf',
            )
        },
    )
    rf_model.fit(train_features, train_labels_single)

    rf_model.save(model_path)
    joblib.dump(preprocessing.pca, pca_path)
    print("Trained and saved new model and PCA.")
else:
    # Both artifacts exist: reuse them instead of tuning again.
    # NOTE(review): joblib.load unpickles the file; only load trusted artifacts.
    # NOTE(review): the features were already computed in the preprocessing
    # cell before this PCA is swapped in — confirm they match the PCA the saved
    # model was trained with.
    rf_model = RandomForestModel()  # Use your class
    rf_model.load(model_path)  # Load the model using the class method
    preprocessing.pca = joblib.load(pca_path)
    print("Loaded existing model and PCA.")

## Evaluating Model

In [None]:
# Evaluate the model on every split and hand the scores to DataSaver.
# Splits are evaluated in the order Validation, Test, Training.
data_saver = DataSaver()

if scoring != "roc_auc":
    # Mean cross-validated accuracy of rf_model.model on the validation split.
    val_accuracy_cv = cross_val_score(
        rf_model.model,
        val_features,
        val_labels_single,
        cv=skf,
        scoring='accuracy',
    ).mean()

    # Per-split evaluation with the accuracy metric.
    val_accuracy = Evaluation.evaluate_model(
        rf_model, val_features, val_labels_single,
        "Validation", run_folder, class_names, metric='accuracy',
    )
    test_accuracy = Evaluation.evaluate_model(
        rf_model, test_features, test_labels_single,
        "Test", run_folder, class_names, metric='accuracy',
    )
    train_accuracy = Evaluation.evaluate_model(
        rf_model, train_features, train_labels_single,
        "Training", run_folder, class_names, metric='accuracy',
    )

    # Save results
    data_saver.save_results({
        'val_accuracy_cv': val_accuracy_cv,
        'train_accuracy': train_accuracy,
        'val_accuracy': val_accuracy,
        'test_accuracy': test_accuracy
    })
else:
    # Mean cross-validated one-vs-rest ROC AUC on the validation split.
    val_roc_auc_cv = cross_val_score(
        rf_model.model,
        val_features,
        val_labels_single,
        cv=skf,
        scoring='roc_auc_ovr',
    ).mean()

    # Per-split evaluation using evaluate_model's default metric.
    val_roc_auc = Evaluation.evaluate_model(
        rf_model, val_features, val_labels_single,
        "Validation", run_folder, class_names,
    )
    test_roc_auc = Evaluation.evaluate_model(
        rf_model, test_features, test_labels_single,
        "Test", run_folder, class_names,
    )
    train_roc_auc = Evaluation.evaluate_model(
        rf_model, train_features, train_labels_single,
        "Training", run_folder, class_names,
    )

    # Save results
    data_saver.save_results({
        'train_roc_auc': train_roc_auc,
        'val_roc_auc_cv': val_roc_auc_cv,
        'val_roc_auc': val_roc_auc,
        'test_roc_auc': test_roc_auc,
    })

## Saving Technical Details

In [None]:
# Assemble the technical details for this run; the keys depend on the metric.
# When no tuning was performed (best_params is empty/None) a placeholder
# string is stored instead of the hyperparameters.
if scoring != "roc_auc":
    technical_details = {
        'best_hyperparameters': best_params or "Loaded model, no hyperparameter tuning performed",
        'test_accuracy': test_accuracy,
        'val_accuracy': val_accuracy,
        'accuracy_cross_validation_scores': val_accuracy_cv,
    }
else:
    technical_details = {
        'best_hyperparameters': best_params or "Loaded model, no hyperparameter tuning performed",
        'auc_cross_validation_scores': val_roc_auc_cv,
        'test_scores': test_roc_auc,
        'validation_scores': val_roc_auc,
    }

# Either variant is written to the same file inside the run folder.
with open(f"{run_folder}/technical_details.json", 'w') as f:
    json.dump(technical_details, f, indent=4)

# Update reports.json
# Start from the existing report index when the file is present, otherwise
# from an empty mapping, then add (or replace) the entry for this run and
# write the whole index back.
reports_path = os.path.join(output_datas, "reports.json")
reports = {}
if os.path.exists(reports_path):
    with open(reports_path, 'r') as f:
        reports = json.load(f)

reports[f"run_{run_number}"] = (
    {
        'train_roc_auc': train_roc_auc,
        'val_roc_auc': val_roc_auc,
        'test_roc_auc': test_roc_auc,
    }
    if scoring == "roc_auc"
    else {
        'train_accuracy': train_accuracy,
        'val_accuracy': val_accuracy,
        'test_accuracy': test_accuracy
    }
)

with open(reports_path, 'w') as f:
    json.dump(reports, f, indent=4)

## Visualizing Results

In [None]:
# Display the results of this run through the project's VisualizeDatas helper.
# Both calls are pointed at run_folder; display_images also receives the
# scoring metric name.
# NOTE(review): presumably these show the figures and technical_details.json
# written by the earlier cells — confirm against VisualizeDatas.
VisualizeDatas.display_images(run_folder,scoring)
VisualizeDatas.display_json_report(run_folder)

## End of Notebook