# 1. Libraries

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import warnings
import os
import joblib
warnings.filterwarnings("ignore")
from prettytable import PrettyTable
from datetime import datetime
from sklearn.pipeline import Pipeline
from scipy.cluster.hierarchy import fcluster, linkage
from sklearn.cluster import KMeans
from sklearn_extra.cluster import KMedoids
from umap import UMAP
from sklearn.ensemble import (
    AdaBoostClassifier,
    GradientBoostingRegressor,
    RandomForestClassifier,
    RandomForestRegressor,
)
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    mean_absolute_error,
    mean_squared_error,
    precision_score,
    recall_score,
    r2_score,
    silhouette_score,
    calinski_harabasz_score,
    davies_bouldin_score
) # all metrics used in the notebook
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

# 2. Exploratory Data Analysis (EDA)

In [None]:
class EDA:
    """
    Class dedicated to load, transform and visualize data.

    The dataset is held in ``self.df`` (a pandas DataFrame). The cleaning
    methods (``drop_irrelevant_columns``, ``drop_missing_values``) modify it
    in place; ``get_df`` hands out an independent copy.
    """

    def __init__(self, file=None):
        """
        Starts the class and loads the data from a CSV file

        Parameters:
        -file (str): Path to a CSV file

        Raises:
        -ValueError: If no (or an empty) file path is provided.
        """
        if not file:
            raise ValueError("⚠️ No file path provided.")
        self.df = pd.read_csv(file)

    def head_df(self, n=5):
        """
        Returns the first n rows of the DataFrame, or a warning string when
        the DataFrame is empty.
        """
        if self.df.empty:
            return "⚠️ No data available"
        return self.df.head(n)

    def tail_df(self, n=5):
        """
        Returns the last n rows of the DataFrame, or a warning string when
        the DataFrame is empty.
        """
        if self.df.empty:
            return "⚠️ No data available"
        return self.df.tail(n)

    def check_data_types(self):
        """
        Shows the data types of each column in the DataFrame
        """
        return self.df.dtypes

    def drop_irrelevant_columns(self, columns):
        """
        Drops irrelevant columns from the DataFrame selected by the user

        Parameters:
        -columns (list): List of column names to drop.
        """
        self.df.drop(columns=columns, inplace=True)

    def drop_missing_values(self):
        """
        Drops rows with missing values from the DataFrame
        """
        self.df.dropna(inplace=True)

    def detect_outliers(self):
        """
        Counts the outliers of every float64/int64 column using the IQR
        (Interquartile Range) method: a value is an outlier when it lies
        below Q1 - 1.5*IQR or above Q3 + 1.5*IQR.

        Returns:
        -dict: column name -> number of outliers, only for columns that
         have at least one outlier.
        -str: an explanatory message when there are no numeric columns or
         no outliers at all.
        """
        numeric = self.df.select_dtypes(include=['float64', 'int64'])
        if numeric.empty:
            return "No numeric columns to detect outliers."

        q1 = numeric.quantile(0.25)
        q3 = numeric.quantile(0.75)
        iqr = q3 - q1
        below = numeric < (q1 - 1.5 * iqr)
        above = numeric > (q3 + 1.5 * iqr)
        counts = (below | above).sum()
        found = {col: counts[col] for col in numeric.columns if counts[col] > 0}

        return found if found else "No outliers detected"

    def plot_scatter(self, col1, col2):
        """
        Plots a scatter plot of two columns in the DataFrame

        Parameters:
        -col1 (str): Column drawn on the X axis.
        -col2 (str): Column drawn on the Y axis.
        """
        plt.figure(figsize=(10, 6))
        sns.scatterplot(x=self.df[col1], y=self.df[col2])
        plt.title(f'Scatter Plot: {col1} vs {col2}')
        plt.xlabel(col1)
        plt.ylabel(col2)
        plt.grid()
        plt.show()

    def plot_histogram(self, col):
        """
        Plots a histogram (with a KDE curve) of a column in the DataFrame
        """
        plt.figure(figsize=(10, 6))
        sns.histplot(self.df[col], kde=True)
        plt.title(f'Histogram {col}')
        plt.xlabel(col)
        plt.ylabel('Frequency')
        plt.show()

    def plot_heatmap(self):
        """
        Plots a heatmap of the correlation matrix of the float64/int64
        columns of the DataFrame.

        Columns whose standard deviation is 0.01 or less are left out.

        Returns:
        -str: a message when there is nothing to plot; None after the
         figure has been drawn.
        """
        numeric = self.df.select_dtypes(include=['float64', 'int64'])
        if numeric.empty:
            return "There's no numeric columns to plot a heatmap."
        varying = numeric.loc[:, numeric.apply(lambda x: np.std(x) > 0.01)]
        # The variance filter can remove every column (e.g. all-constant
        # data); an empty correlation matrix cannot be drawn.
        if varying.empty:
            return "There's no numeric columns to plot a heatmap."

        fig, ax = plt.subplots(figsize=(12, 10))
        sns.heatmap(varying.corr(), cmap="coolwarm", annot=True, ax=ax)
        ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
        ax.set_yticklabels(ax.get_yticklabels(), rotation=0)
        ax.set_title("Heatmap")
        fig.tight_layout()
        # NOTE(review): show + short pause + close presumably forces the
        # figure to render without blocking the menu loop — confirm.
        fig.show()
        plt.pause(0.1)
        plt.close(fig)

    def __str__(self):
        """
        Returns a string representation of the class. In case user needs to print the class
        """
        return f"🟡 EDA - Dataframe shape: {self.df.shape}"

    def get_df(self):
        """
        Provides a safe copy of the dataset for use in machine learning algorithms
        """
        return self.df.copy()

# 3. Unsupervised Model

In [None]:
class Unsupervised:
    """
    Class dedicated to unsupervised learning algorithms.
    Contains: Methods and metrics.

    Every algorithm is fitted on the non-object columns of ``self.df``
    (rows with missing values removed) and scored on the standardized
    version of that same data, so labels and rows always line up.
    """

    def __init__(self, dataframe_eda):
        """
        Parameters:
        - dataframe_eda: Object exposing ``get_df()``; the DataFrame it
          returns becomes this instance's working dataset.
        """
        self.df = dataframe_eda.get_df()

    # Tools and metrics

    def byebye_object_values(self):
        """
        Removes every column of dtype ``object`` from the working DataFrame.
        """
        self.df = self.df.select_dtypes(exclude=['object'])

    def _training_data(self):
        """
        Drops object columns, then returns the rows without missing values.
        """
        self.byebye_object_values()
        return self.df.dropna()

    def _standardized_data(self):
        """
        Returns the complete rows of ``self.df`` as z-scores.

        A constant column has a standard deviation of 0; dividing by it
        would fill the column with NaN and make the metric functions fail,
        so such columns are divided by 1 instead (they become all zeros).
        """
        data = self.df.dropna()
        spread = data.std().replace(0, 1)
        return (data - data.mean()) / spread

    def calculate_metrics(self, labels):
        """
        Calculates clustering metrics:
        - Silhouette: how well samples fit within their cluster
        - Calinski-Harabasz: cluster separation and compactness
        - Davies-Bouldin: similarity between clusters

        Parameters:
        - labels: Cluster label for each complete row of ``self.df``.

        Returns:
        - dict: metric name -> value.
        """
        data = self._standardized_data()
        return {
            "Silhouette Score": silhouette_score(data, labels),
            "Calinski-Harabasz": calinski_harabasz_score(data, labels),
            "Davies-Bouldin": davies_bouldin_score(data, labels)
        }

    def calculate_unsupervised_scores(self, model_name, labels):
        """
        Same metrics as ``calculate_metrics`` with the model name added
        under the ``'model'`` key.
        """
        results = {'model': model_name}
        results.update(self.calculate_metrics(labels))
        return results

    def _build_result(self, model_name, labels, estimator, return_estimator):
        """
        Scores the labels and shapes the return value shared by all
        algorithms: the plain score dictionary, or a
        ``{"model", "metrics", "estimator"}`` dictionary on request.
        """
        scores = self.calculate_unsupervised_scores(model_name, labels)
        if return_estimator:
            return {"model": model_name, "metrics": scores, "estimator": estimator}
        return scores

    # -----------------Algorithms------------------------

    def kmeans(self, n_clusters, return_estimator=False):
        """
        Clusters the data with K-Means (random_state=42).
        """
        data = self._training_data()
        estimator = KMeans(n_clusters=n_clusters, random_state=42)
        labels = estimator.fit_predict(data)
        return self._build_result("K-Means", labels, estimator, return_estimator)

    def k_medoids(self, n_clusters, metric='euclidean', return_estimator=False):
        """
        Clusters the data with K-Medoids using the given distance metric.
        """
        data = self._training_data()
        estimator = KMedoids(n_clusters=n_clusters, metric=metric, random_state=42)
        labels = estimator.fit_predict(data)
        return self._build_result("K-Medoids", labels, estimator, return_estimator)

    def hac(self, n_clusters, method='ward', return_estimator=False):
        """
        Hierarchical agglomerative clustering: builds a linkage matrix with
        the given method and cuts it into at most ``n_clusters`` clusters.
        There is no estimator object, so ``"estimator"`` is None.
        """
        data = self._training_data()
        linkage_matrix = linkage(data, method=method)
        labels = fcluster(linkage_matrix, t=n_clusters, criterion='maxclust')
        return self._build_result("HAC", labels, None, return_estimator)

    def umap_reduc(self, n_clusters, n_components=2, n_neighbors=15, return_estimator=False):
        """
        Reduces the data with UMAP and clusters the embedding with K-Means.
        The returned estimator is a Pipeline wrapping both fitted steps.
        Scores are computed against the original (standardized) features,
        not against the UMAP embedding.
        """
        data = self._training_data()
        reducer = UMAP(n_components=n_components, n_neighbors=n_neighbors, random_state=42)
        components = reducer.fit_transform(data)
        km = KMeans(n_clusters=n_clusters, random_state=42)
        labels = km.fit_predict(components)
        pipe = None
        if return_estimator:
            pipe = Pipeline([("umap", reducer), ("kmeans", km)])
        return self._build_result("UMAP+KMeans", labels, pipe, return_estimator)

# 4. Supervised Model

In [None]:
class Supervised:
    """
    Class dedicated to supervised learning algorithms.
    Contains: Split data, methods and metrics for regression and classification.
    """

    def __init__(self, dataframe_eda):
        """
        Parameters:
        - dataframe_eda: Object exposing ``get_df()``; the DataFrame it
          returns becomes this instance's working dataset.
        """
        self.df = dataframe_eda.get_df()

    # -----------------Split Process----------------------------

    def get_percent_data_split(self):
        """
        Requests the user to input a percentage for splitting the dataset into training and testing sets.
        Keeps asking until a number strictly between 0 and 100 is entered.

        Returns:
        - float: Fraction of data to be used for TRAINING (between 0 and 1).
        """
        while True:
            try:
                df_percentage = float(input("Enter the percentage to be used for training (Example: 80) \n"))
            except ValueError:
                print("⚠️ Invalid number, please try again...")
                continue
            if 0 < df_percentage < 100:
                return df_percentage / 100
            print("Percentage must be between 1 and 99.")

    def split_df(self, target_column, test_size=None, random_state=42, task_type="classification"):
        """
        Divides the DataFrame into training and testing sets.
        Features are one-hot encoded; the target is prepared according to
        the task type (label-encoded for text/categorical classification
        targets, coerced to numeric for regression).
        Shows how many rows are in the training and testing sets.

        Parameters:
        - target_column: Name of the target column (y). If it does not
          exist the user is asked for another name.
        - test_size: Fraction of rows for the TEST set. If not provided,
          the user is asked for the training percentage and the
          complement is used.
        - random_state: Random seed for reproducibility.
        - task_type: 'classification' or 'regression'.

        Returns:
        - Datasets for modeling X_train, X_test, y_train, y_test

        Raises:
        - ValueError: Unknown task_type, or a regression target that
          cannot be converted to numbers.
        """
        # Fail fast, before prompting the user for anything
        if task_type not in ("classification", "regression"):
            raise ValueError("task_type must be 'classification' or 'regression'")

        if test_size is None:
            # If test_size is not provided, request the user for the percentage
            test_size = 1 - self.get_percent_data_split()

        while True:
            try:
                # Try to split features (X) and target (y)
                X = self.df.drop(columns=[target_column])
                y = self.df[target_column]
                break
            except KeyError:
                print(f"Column '{target_column}' does not exist. Please try again.")
                print("Available columns:")
                print(self.df.dtypes)
                target_column = input("Enter the target value to be predicted: ")

        # One-hot encode categorical FEATURES
        X = pd.get_dummies(X, drop_first=True)

        # Process target variable based on task type
        if task_type == "classification":
            is_text_like = (
                pd.api.types.is_object_dtype(y)
                or pd.api.types.is_string_dtype(y)
                or y.dtype.name == 'category'
            )
            if is_text_like:
                from sklearn.preprocessing import LabelEncoder
                y = LabelEncoder().fit_transform(y)
        elif not pd.api.types.is_numeric_dtype(y):
            # Ensure y is numeric for regression
            try:
                y = pd.to_numeric(y, errors="raise")
            except Exception as e:
                raise ValueError(
                    "Target column is not numeric and cannot be converted: "
                    f"{e}"
                ) from e

        # Split the data using train_test_split
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )

        # Show how many rows are in the training and testing sets
        print(f"Dataset divided:\n- Training: {X_train.shape[0]} rows \n- Testing: {X_test.shape[0]} rows")
        return X_train, X_test, y_train, y_test

    # -----------------Metric Evaluation Process----------------------------

    @staticmethod
    def _tolerance_accuracy(y_true, y_pred, tolerance=0.1):
        """
        Percentage of predictions whose absolute error is within
        ``tolerance`` times the magnitude of the true value.
        """
        actual = np.asarray(y_true, dtype=float)
        predicted = np.asarray(y_pred, dtype=float)
        # abs() on the right-hand side: without it a negative target gives
        # a negative bound that no absolute error can ever satisfy.
        within = np.abs(actual - predicted) <= tolerance * np.abs(actual)
        return np.mean(within) * 100

    def calculate_regression_scores(self, model_name, y_test, model_predictions):
        """
        Calculates the evaluation metrics for regression models and stores the results in a dictionary.

        Parameters:
        - model_name: Name of the model
        - y_test: Target Value from the test set.
        - model_predictions: Predicted values from the model.

        Returns:
        - results: Dictionary with keys 'model', 'MSE', 'R2', 'MAE', 'RMSE'
          and 'tolerance_accuracy' (% of predictions within 10% of the
          true value).
        """
        mse = mean_squared_error(y_test, model_predictions)
        return {
            'model': model_name,
            'MSE': mse,
            'R2': r2_score(y_test, model_predictions),
            'MAE': mean_absolute_error(y_test, model_predictions),
            'RMSE': np.sqrt(mse),
            'tolerance_accuracy': self._tolerance_accuracy(y_test, model_predictions, 0.1)
        }

    def calculate_classification_scores(self, model_name, y_test, model_predictions):
        """
        Calculates classification evaluation metrics and stores the results in a dictionary.
        Metrics were set with average='weighted' to handle imbalanced datasets.
        Watermark - Esteban Ramirez M

        Parameters:
        - model_name: Name of the classification model.
        - y_test: Actual target values from the test set.
        - model_predictions: Predicted target values generated by the model.

        Returns:
        - results: Dictionary containing classification evaluation metrics (Accuracy, Precision, Recall, F1-score).
        """
        return {
            'model': model_name,
            'accuracy': accuracy_score(y_test, model_predictions),
            'precision': precision_score(y_test, model_predictions, average='weighted'),
            'recall': recall_score(y_test, model_predictions, average='weighted'),
            'f1_score': f1_score(y_test, model_predictions, average='weighted')
        }

    def _fit_and_score(self, model, model_name, scorer, X_train, X_test, y_train, y_test):
        """
        Fits the model, predicts on the test features, scores the
        predictions with ``scorer`` and attaches the fitted model to the
        result under the 'estimator' key.
        """
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        results = scorer(model_name, y_test, predictions)
        results['estimator'] = model
        return results

    # ------------------------Regression Models--------------------------------------------------------------

    def regre_lineal_simple(self, X_train, X_test, y_train, y_test):
        """
        Runs Simple Linear Regression and calculates multiple performance metrics.
        Returns:
        - results: Dictionary of regression metrics plus the fitted model ('estimator').
        """
        print("Starting Simple Linear Regression...")
        return self._fit_and_score(
            LinearRegression(), "LinearRegression",
            self.calculate_regression_scores, X_train, X_test, y_train, y_test)

    def regre_regridge(self, X_train, X_test, y_train, y_test):
        """
        Runs Ridge Regression (alpha=1.0) and calculates multiple performance metrics.
        Returns:
        - results: Dictionary of regression metrics plus the fitted model ('estimator').
        """
        print("Starting Ridge Regression...")
        return self._fit_and_score(
            Ridge(alpha=1.0), "Ridge",
            self.calculate_regression_scores, X_train, X_test, y_train, y_test)

    def regre_decisionTree(self, X_train, X_test, y_train, y_test):
        """
        Runs Decision Tree Regressor and calculates multiple performance metrics.
        Returns:
        - results: Dictionary of regression metrics plus the fitted model ('estimator').
        """
        print("Starting Decision Tree Regressor...")
        return self._fit_and_score(
            DecisionTreeRegressor(random_state=42), "Decision Tree Regressor",
            self.calculate_regression_scores, X_train, X_test, y_train, y_test)

    def regre_randomforest(self, X_train, X_test, y_train, y_test):
        """
        Runs Random Forest Regressor (max_depth=2) and calculates multiple performance metrics.
        Returns:
        - results: Dictionary of regression metrics plus the fitted model ('estimator').
        """
        print("Starting Random Forest Regressor...")
        return self._fit_and_score(
            RandomForestRegressor(max_depth=2, random_state=42), "Random Forest Regressor",
            self.calculate_regression_scores, X_train, X_test, y_train, y_test)

    def regre_gradient_boosting(self, X_train, X_test, y_train, y_test):
        """
        Runs Gradient Boosting Regressor and calculates multiple performance metrics.
        Returns:
        - results: Dictionary of regression metrics plus the fitted model ('estimator').
        """
        print("Starting Grandient Boostsing Regressor...")
        return self._fit_and_score(
            GradientBoostingRegressor(random_state=42), "Gradient Boosting Regressor",
            self.calculate_regression_scores, X_train, X_test, y_train, y_test)

    # ------------------------Classification Models--------------------------------------------------------------

    def classi_decision_tree(self, X_train, X_test, y_train, y_test):
        """
        Runs Decision Tree Classifier and calculates multiple performance metrics.
        Returns:
        - results: Dictionary with classification metrics plus the fitted model ('estimator').
        """
        print("Starting Decision Tree Classifier...")
        return self._fit_and_score(
            DecisionTreeClassifier(random_state=0), "Decision Tree Classifier",
            self.calculate_classification_scores, X_train, X_test, y_train, y_test)

    def classi_knn(self, X_train, X_test, y_train, y_test):
        """
        Runs K-Nearest Neighbors Classifier and calculates multiple performance metrics.
        Returns:
        - results: Dictionary with classification metrics plus the fitted model ('estimator').
        """
        print("Starting K-Nearest Neighbors Classifier...")
        return self._fit_and_score(
            KNeighborsClassifier(), "KNN",
            self.calculate_classification_scores, X_train, X_test, y_train, y_test)

    def classi_random_forest(self, X_train, X_test, y_train, y_test):
        """
        Runs Random Forest Classifier and calculates multiple performance metrics.
        Returns:
        - results: Dictionary with classification metrics plus the fitted model ('estimator').
        """
        print("Starting Random Forest Classifier...")
        return self._fit_and_score(
            RandomForestClassifier(random_state=42), "Random Forest Classifier",
            self.calculate_classification_scores, X_train, X_test, y_train, y_test)

    def classi_adaboost(self, X_train, X_test, y_train, y_test):
        """
        Runs AdaBoost Classifier and calculates multiple performance metrics.
        Returns:
        - results: Dictionary with classification metrics plus the fitted model ('estimator').
        """
        print("Starting AdaBoost Classifier...")
        return self._fit_and_score(
            AdaBoostClassifier(random_state=42), "AdaBoost Classifier",
            self.calculate_classification_scores, X_train, X_test, y_train, y_test)

# 5. Menu
Calling this class gives access to every feature in this notebook.

In [None]:
class Start:
    """
    Class dedicated to the main controller of the program. Shows all menus that
    the user can interact with, calling all previous classes
    """

    def __init__(self):
        """
        Main controller, starts with no data loaded
        """
        self.file_path = None
        self.eda = None
        self.unsupervised = None
        self.supervised = None
        self.dataset_loaded = False

    def load_dataset(self, file_name: str) -> None:
        """
        Loads a dataset from a CSV file located in the datasets/ directory

        Parameters:
        - file_name: File name inside datasets/; ".csv" is appended when
          missing.

        Raises:
        - FileNotFoundError: If the file does not exist.
        """
        # If user does not provide extension csv
        if not file_name.endswith('.csv'):
            file_name = f"{file_name}.csv"

        # By default is datasets/, included in the project
        full_path = f"datasets/{file_name}"

        if not os.path.isfile(full_path):
            raise FileNotFoundError(f"⚠️  File not found: {full_path}")
        self.eda = EDA(full_path)
        self.file_path = full_path
        self.dataset_loaded = True
        print(f"✅ Loaded: {full_path}")

    def run(self) -> None:
        """
        This is the first menu that the user sees. There are two main levels:
        1) Load a dataset from a CSV file.
        2) If a dataset is loaded, the user can choose between EDA tools or Modeling tools.
        """
        plt.ioff()
        while True:
            # 1) Load a dataset from a CSV file
            if not self.dataset_loaded:
                file_name = input("Enter CSV file name (or 'q' to quit): ")
                if file_name.lower() == "q":
                    break
                try:
                    self.load_dataset(file_name)
                except Exception as e:
                    print(e)
                continue  # next pass shows either this prompt again or the main menu

            # 2) If a dataset is loaded, the user can choose between EDA tools or Modeling tools.
            print("\nMAIN MENU (dataset loaded)")
            print("1) EDA tools")
            print("2) Modeling tools")
            print("3) Reload dataset")
            print("0) Quit")
            choice = input("Select option: ")

            if choice == "1":
                self.eda_menu()
            elif choice == "2":
                self.model_menu()
            elif choice == "3":
                self.dataset_loaded = False
            elif choice == "0":
                break
            else:
                print("Invalid input, please try again. \n")

    def eda_menu(self):
        """
        Menu where the user can choose between the EDA tools from the EDA
        class. The menu repeats until the user exits.

        Options:
        - Viewing DataFrame head and data types
        - Removing columns or null values
        - Detecting outliers
        - Plotting scatter, histogram, and heatmap
        """
        while True:
            print("\n------- EDA Menu -------")
            print("1) Show DataFrame head")
            print("2) Show DataFrame dtypes")
            print("3) Remove columns")
            print("4) Drop null values")
            print("5) Detect outliers")
            print("6) Plot scatter chart")
            print("7) Plot histogram")
            print("8) Plot heatmap")
            print("0) Back to main")
            choice = input("Select option: ")

            if choice == "1":
                print(self.eda.head_df())
            elif choice == "2":
                print(self.eda.check_data_types())
            elif choice == "3":
                column_list = input("Enter column names to delete (comma-separated): ").split(",")
                column_list = [col.strip() for col in column_list]
                try:
                    self.eda.drop_irrelevant_columns(column_list)
                    print(f"Columns removed: {', '.join(column_list)}")
                except Exception as e:
                    print(f"Error removing columns: {e}")
            elif choice == "4":
                self.eda.drop_missing_values()
                print("Null values dropped.")
            elif choice == "5":
                print(self.eda.detect_outliers())
            elif choice == "6":
                print("\n*** Available variables ***")
                print(self.eda.check_data_types())
                first_var = input("Enter X variable name: ").strip()
                second_var = input("Enter Y variable name: ").strip()
                try:
                    self.eda.plot_scatter(first_var, second_var)
                except Exception as e:
                    # Report and stay in the menu, like the histogram option
                    print(f"Plotting error: {e}")
            elif choice == "7":
                print("\n*** Available variables ***")
                print(self.eda.check_data_types())
                histogram_var = input("Enter varible name: ").strip()
                try:
                    self.eda.plot_histogram(histogram_var)
                except Exception as e:
                    print(f"Plotting error: {e}")
            elif choice == "8":
                try:
                    message = self.eda.plot_heatmap()
                except Exception as e:
                    print(f"Plotting error: {e}")
                else:
                    # plot_heatmap returns a message when nothing can be drawn
                    if message:
                        print(message)
            elif choice == "0":
                plt.close('all')
                break
            else:
                print("Invalid input, please try again. \n")

    def model_menu(self):
        """
        Menu where the user can choose between several machine learning
        models from the Unsupervised and Supervised classes.

        Options:
        - Regression models
        - Classification models
        - Clustering models
        - Model serialization (after selecting any model)
        """
        # Both helpers copy the dataset when constructed, so rebuild them on
        # every entry: otherwise columns/rows removed in the EDA menu, or a
        # reloaded dataset, would be ignored by the models.
        self.unsupervised = Unsupervised(self.eda)
        self.supervised = Supervised(self.eda)

        runners = {
            "1": self._run_regression,
            "2": self._run_classification,
            "3": self._run_clustering,
        }

        while True:
            print("\n------- Machine Learning Models Menu -------")
            print("1) Regression")
            print("2) Classification")
            print("3) Clustering")
            print("0) Back to main")
            choice = input("Select option: ")

            if choice == "0":
                break
            runner = runners.get(choice)
            if runner is None:
                print("Invalid input, please try again. \n")
                continue

            try:
                results = runner()
            except Exception as e:
                # Bad targets, NaNs, too few rows, ...: report and keep the
                # menu alive instead of terminating the whole program.
                print(f"⚠️ Modeling error: {e}")
                continue
            if results is None:
                continue

            # model_list contains the model name and the model object
            model_list = [
                (res["model"], res["estimator"])
                for res in results
                if res.get("estimator") is not None
            ]
            # Section where user can serialize any evaluated model
            self.serialize_model_menu(model_list)

    @staticmethod
    def _print_table(title, headers, rows):
        """Prints a titled PrettyTable built from the given headers and rows."""
        table = PrettyTable()
        table.field_names = headers
        for row in rows:
            table.add_row(row)
        print(title)
        print(table)

    def _prepare_split(self, task_type):
        """
        Asks for the target column and the training percentage, then
        returns (X_train, X_test, y_train, y_test). The tuple is also kept
        in ``self.split_data``.
        """
        # Show list of columns with their data types
        print("\n*** Available variables ***")
        print(self.eda.check_data_types())

        # Prompt user to select the target column
        target_column = input("\n Enter the name of the target column: ")

        # get_percent_data_split returns the TRAINING fraction, while
        # split_df expects the TEST fraction.
        train_fraction = self.supervised.get_percent_data_split()
        self.split_data = self.supervised.split_df(
            target_column, test_size=1 - train_fraction, task_type=task_type)
        return self.split_data

    def _run_regression(self):
        """Trains every regression model, prints a comparison table and returns the results."""
        X_train, X_test, y_train, y_test = self._prepare_split("regression")
        trainers = [
            self.supervised.regre_lineal_simple,
            self.supervised.regre_regridge,
            self.supervised.regre_decisionTree,
            self.supervised.regre_randomforest,
            self.supervised.regre_gradient_boosting
        ]
        results = [train(X_train, X_test, y_train, y_test) for train in trainers]
        self._print_table(
            "Regression Results:",
            ["Model", "R²", "RMSE", "MAE"],
            [
                [res['model'], f"{res['R2']:.4f}", f"{res['RMSE']:.4f}", f"{res['MAE']:.4f}"]
                for res in results
            ],
        )
        return results

    def _run_classification(self):
        """Trains every classification model, prints a comparison table and returns the results."""
        X_train, X_test, y_train, y_test = self._prepare_split("classification")
        trainers = [
            self.supervised.classi_decision_tree,
            self.supervised.classi_knn,
            self.supervised.classi_random_forest,
            self.supervised.classi_adaboost
        ]
        results = [train(X_train, X_test, y_train, y_test) for train in trainers]
        self._print_table(
            "Classification Results:",
            ["Model", "Accuracy", "Precision", "Recall", "F1-score"],
            [
                [
                    res['model'],
                    f"{res['accuracy']:.4f}",
                    f"{res['precision']:.4f}",
                    f"{res['recall']:.4f}",
                    f"{res['f1_score']:.4f}"
                ]
                for res in results
            ],
        )
        return results

    def _run_clustering(self):
        """
        Asks for the number of clusters, runs every clustering model,
        prints a comparison table and returns the results (None when the
        number entered is not usable).
        """
        raw = input("Enter the number of clusters (>2): ")
        try:
            n_clusters = int(raw)
        except ValueError:
            print("Invalid input, please try again. \n")
            return None
        if n_clusters < 2:
            print("The number of clusters must be at least 2.")
            return None

        algorithms = [
            self.unsupervised.kmeans,
            self.unsupervised.k_medoids,
            self.unsupervised.hac,
            self.unsupervised.umap_reduc
        ]
        results = [cluster(n_clusters, return_estimator=True) for cluster in algorithms]
        self._print_table(
            "Clustering Results:",
            ["Model", "Silhouette Score", "Calinski-Harabasz", "Davies-Bouldin"],
            [
                [
                    res['model'],
                    f"{res['metrics']['Silhouette Score']:.4f}",
                    f"{res['metrics']['Calinski-Harabasz']:.4f}",
                    f"{res['metrics']['Davies-Bouldin']:.4f}"
                ]
                for res in results
            ],
        )
        return results

    # Serialization process
    def serialize_model_menu(self, model_list):
        """
        Lets the user pick one of the trained models and saves it with
        joblib, or skip serialization.

        Parameters:
        - model_list: list of tuples (model_name, model_object)

        Returns:
        - model_path (str): Path where the model was saved, or None if no
          model was serialized. Files go to the serialized_models/
          directory, which is created when missing.
        """

        # Print a menu with a list of models trained to serialize one
        print("\n------- Serialization Menu -------")
        for idx, (model_name, _) in enumerate(model_list, start=1):
            print(f"{idx}. {model_name}")
        print("0. Do not serialize any model!")

        choice = input("Select an option: ").strip()
        try:
            choice_int = int(choice)
        except ValueError:
            print("Invalid input, try again.")
            return None
        if choice_int == 0:
            return None
        if not (1 <= choice_int <= len(model_list)):
            print("Option out of range. No model serialized.")
            return None

        selected_name, selected_model = model_list[choice_int - 1]
        # The file name will be generated with a timestamp
        # Output: serialized_model_DATE_HOUR.joblib
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        file_name = f"serialized_model_{timestamp}.joblib"
        model_path = f"serialized_models/{file_name}"

        # joblib.dump does not create missing directories
        os.makedirs("serialized_models", exist_ok=True)

        # Save file
        joblib.dump(selected_model, model_path)
        print(f"Model '{selected_name}' serialized to '{model_path}'")
        return model_path

In [None]:
# Starting program: create the menu controller and enter its interactive loop.
# run() keeps prompting through input() until the user chooses to quit.
start = Start()
start.run()