In [1]:
%load_ext autoreload
%autoreload 2

import csv
import ipywidgets as widgets
import numpy as np
import os
import pandas as pd
import time

from benchmarks import gsa_svm_fitness
from src.entities import GSA

from IPython.display import display
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from typing import Mapping, Tuple, Union
from ucimlrepo import fetch_ucirepo

In [2]:
uci_datasets = ["Breast Cancer Wisconsin (Diagnostic)",
                "Spambase",
                "Mushroom"]

widget_opt = widgets.Dropdown(
    options=uci_datasets,
    description='Dataset: '
)

display(widget_opt)



Dropdown(description='Dataset: ', options=('Breast Cancer Wisconsin (Diagnostic)', 'Spambase', 'Mushroom'), va…

In [36]:
def fetch_categorical_dataset(_id: int) -> Tuple[pd.DataFrame, pd.Series]:
    data = fetch_ucirepo(id=_id)
    X = data.data.features
    y = data.data.targets
    encoder = LabelEncoder()
    for col in X.columns:
        X.loc[:, col] = encoder.fit_transform(X[col])
    return X, y

widget_opt.value = "Spambase"

if widget_opt.value == "Breast Cancer Wisconsin (Diagnostic)":
    data = fetch_ucirepo(id=15)
    X = data.data.features
    y = data.data.targets
    X = X.fillna(value=0)
elif widget_opt.value == "Spambase":
    X, y = fetch_categorical_dataset(_id=94)
elif widget_opt.value == "Mushroom":
    X, y = fetch_categorical_dataset(_id=73)
    
# Name of selected dataset and summary (number of instances, features, etc.)
print(f"Dataset: {widget_opt.value}")
print(f"Instances: {X.shape[0]} (Train / Test: 80% / 20%)")
print(f"Features: {X.shape[1]}")

Dataset: Spambase
Instances: 4601 (Train / Test: 80% / 20%)
Features: 57


In [37]:
wa = 0.8
wf = 0.2

conf_matrix_dict = {"TP": 0, "FP": 0, "TN": 0, "FN": 0}

class UCI:
    """
    Class to handle UCI datasets
    
    Args:
        X (pd.DataFrame): Features
        y (pd.Series): Target
    """
    def __init__(self,
                 X: pd.DataFrame,
                 y: pd.Series,
                 boundaries: Mapping[str, Tuple[Tuple[float, float], ...]]
                 ) -> None:
        """
        Constructor
        
        Args:
            X (pd.DataFrame): Features
            y (pd.Series): Target
        """
        self.X = X
        self.y = y
        self.boundaries = boundaries

        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(self.X)
        
        X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size = 0.20, random_state=5)
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test

    
    def get_fitness(self,
                    solution: Mapping[str, np.ndarray],
                    data: Union[None, Tuple[np.ndarray, np.ndarray]] = None,
                    show_confusion_matrix: bool=False
                    ) -> Tuple[float, float]:
        """
        Get fitness of a solution
        
        Args:
            solution (Mapping[str, np.ndarray]): Solution to evaluate
            data (Union[None, Tuple[np.ndarray, np.ndarray]], optional): Data to evaluate the solution. Defaults to None.
        
        Returns:
            Tuple[float, float]: Fitness and accuracy of the solution    
        """
        if data is None:
            X_scaled = self.X_train
            y_data = self.y_train
        else:
            X_scaled, y_data = data
            
        gamma, C = solution['real']
        gamma /= 1000
        C /= 1000
        X_scaled_filtered = X_scaled[:, solution['discrete'].astype(int) == 1]
        svc_model = SVC(gamma=gamma, C=C, kernel="rbf", verbose=False)
        svc_model.fit(X_scaled_filtered, np.ravel(y_data))
        y_predict = svc_model.predict(X_scaled_filtered)
        conf_matrix = confusion_matrix(y_data, y_predict)
        if show_confusion_matrix:
            # Update global conf_matrix_dict
            conf_matrix_dict["TP"] = conf_matrix[0, 0]
            conf_matrix_dict["FP"] = conf_matrix[0, 1]
            conf_matrix_dict["TN"] = conf_matrix[1, 1]
            conf_matrix_dict["FN"] = conf_matrix[1, 0]
            print(conf_matrix)
        accuracy = accuracy_score(y_data, y_predict) * 100
        
        return gsa_svm_fitness(accuracy=accuracy, solution=solution, wa=wa, wf=wf)

    def is_feasible(self, solution: Mapping[str, np.ndarray]) -> bool:
        """
        Check if a solution is feasable

        Args:
            solution (Mapping[str, np.ndarray]): Solution to evaluate

        Returns:
            bool: True if the solution is feasable, False otherwise
        """
        real_values = solution['real']
        discrete_values = solution['discrete']

        for i, (min_val, max_val) in enumerate(self.boundaries['real']):
            if real_values[i] < min_val or real_values[i] > max_val:
                return False

        for i, (min_val, max_val) in enumerate(self.boundaries['discrete']):
            if discrete_values[i] < min_val or discrete_values[i] > max_val:
                return False

        return True

boundaries = {'real': [(1, 100_000), (1, 100_000)], 'discrete': [(0, 1) for _ in range(len(X.columns))]}
uci = UCI(X, y, boundaries)

In [45]:
# Select number of repetitions for each experiment. 
runs = 10

population_size = 5 
iterations = 20

chaotic_constant=True
repair_solution=False

# Export results ?
export = True

save_path = "data/output/"
if not os.path.exists(save_path):
    os.makedirs(save_path)

atLeastOneIteration = False

# CSV Header for the convergence
CnvgHeader = ["Sol_Iter"+str(l+1) for l in range(iterations)]

gsa_history_dict = {"best_solution_history": [],
                    "accuracy_history": [],
                    "gamma_history": [],
                    "c_history": [],
                    "n_features_history": [],
                    "execution_time_history": [],
                    "TP_history": [],
                    "FP_history": [],
                    "TN_history": [],
                    "FN_history": []}

uci = UCI(X, y, boundaries)

for k in range(runs):
    gsa_algo = GSA(objective_function = uci.get_fitness,
                   is_feasible=uci.is_feasible,
                   r_dim=2,
                   d_dim=len(X.columns),
                   boundaries=uci.boundaries)
    
    gsa_algo.optimize(population_size=population_size,
                      iters=iterations,
                      chaotic_constant=chaotic_constant,
                      repair_solution=repair_solution)
    
    print(gsa_algo.solution_history[-1])
    fitness, accuracy = uci.get_fitness(solution=gsa_algo.solution_history[-1],
                                        data=(uci.X_test, uci.y_test),
                                        show_confusion_matrix=True)
    
    print("Test accuracy: ", accuracy, " - Fitness: ", fitness)
    
    gsa_history_dict["best_solution_history"].append(gsa_algo.solution_history[-1])
    gsa_history_dict["accuracy_history"].append(accuracy)
    gsa_history_dict["gamma_history"].append(gsa_algo.solution_history[-1]['real'][0] / 1_000)
    gsa_history_dict["c_history"].append(gsa_algo.solution_history[-1]['real'][1] / 1_000)
    gsa_history_dict["n_features_history"].append(np.sum(gsa_algo.solution_history[-1]['discrete']))
    gsa_history_dict["execution_time_history"].append(gsa_algo.execution_time)
    gsa_history_dict["TP_history"].append(conf_matrix_dict["TP"])
    gsa_history_dict["FP_history"].append(conf_matrix_dict["FP"])
    gsa_history_dict["TN_history"].append(conf_matrix_dict["TN"])
    gsa_history_dict["FN_history"].append(conf_matrix_dict["FN"])

GSA is optimizing  "get_fitness"
['At iteration 1 the best fitness is 89.36803966437833']
['At iteration 2 the best fitness is 91.36460717009916']
['At iteration 3 the best fitness is 91.36460717009916']
['At iteration 4 the best fitness is 91.36460717009916']
['At iteration 5 the best fitness is 91.36460717009916']
['At iteration 6 the best fitness is 91.36460717009916']
['At iteration 7 the best fitness is 91.36460717009916']
['At iteration 8 the best fitness is 91.36460717009916']
['At iteration 9 the best fitness is 91.36460717009916']
['At iteration 10 the best fitness is 91.36460717009916']
['At iteration 11 the best fitness is 91.36460717009916']
['At iteration 12 the best fitness is 91.36460717009916']
['At iteration 13 the best fitness is 91.36460717009916']
['At iteration 14 the best fitness is 91.36460717009916']
['At iteration 15 the best fitness is 91.36460717009916']
['At iteration 16 the best fitness is 91.75591151792526']
['At iteration 17 the best fitness is 92.1285278

In [46]:
best_solution_history

[{'real': array([96771.42219108, 31989.12063355]),
  'discrete': array([1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0,
         1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
         0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0])},
 {'real': array([21341.91659453, 36214.24120784]),
  'discrete': array([0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1,
         0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1,
         0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1])},
 {'real': array([34999.76689013, 33029.60949548]),
  'discrete': array([1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1,
         1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0,
         1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1])},
 {'real': array([55007.94571683, 14663.2623042 ]),
  'discrete': array([1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1,
         1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1

In [47]:
accuracy_history

[99.0228013029316,
 99.1313789359392,
 99.8914223669924,
 100.0,
 99.7828447339848,
 99.7828447339848,
 100.0,
 99.8914223669924,
 99.2399565689468,
 100.0]

In [48]:
gamma_history

[96.77142219107846,
 21.34191659453318,
 34.99976689013261,
 55.00794571683064,
 31.46176903769579,
 67.24380916711104,
 58.52402241958865,
 68.4224658920447,
 61.0207426100039,
 92.03485898316129]

In [49]:
c_history

[31.989120633546566,
 36.21424120784452,
 33.02960949548024,
 14.663262304196643,
 83.62042339962923,
 67.908511839843,
 48.26713378894565,
 41.11505018315448,
 59.13410951567003,
 94.36293749080663]

In [50]:
mean_accuracy = np.mean(accuracy_history)
std_accuracy = np.std(accuracy_history)

print(f"Mean accuracy: {mean_accuracy} +/- {std_accuracy}")

mean_gamma = np.mean(gamma_history)
std_gamma = np.std(gamma_history)

print(f"Mean gamma: {mean_gamma} +/- {std_gamma}")

mean_c = np.mean(c_history)
std_c = np.std(c_history)

print(f"Mean C: {mean_c} +/- {std_c}")

mean_n_features = np.mean(n_features_history)
std_n_features = np.std(n_features_history)

print(f"Mean n_features: {mean_n_features} +/- {std_n_features}")

mean_execution_time = np.mean(execution_time_history)
std_execution_time = np.std(execution_time_history)

print(f"Mean execution time: {mean_execution_time} +/- {std_execution_time}")

Mean accuracy: 99.6742671009772 +/- 0.3666002826522075
Mean gamma: 58.68287195021802 +/- 23.33765807837612
Mean C: 51.0304399859117 +/- 23.71707200338937
Mean n_features: 26.9 +/- 5.223983154643591
Mean execution time: 70.19765939712525 +/- 3.598197248455956


In [51]:
import datetime

date = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

results = pd.DataFrame({"date": [date],
                        "dataset_name": [widget_opt.value],
                        "r_runs": [runs],
                        "n_pop": [population_size],
                        "n_iters": [iterations],
                        "chaotic_constant": [chaotic_constant],
                        "repair_solution": [repair_solution],
                        "execution_time": [mean_execution_time],
                        "execution_time_std": [std_execution_time],
                        "wa": [wa],
                        "wf": [wf],
                        "mean_accuracy": [mean_accuracy],
                        "std_accuracy": [std_accuracy],
                        "mean_gamma": [mean_gamma],
                        "std_gamma": [std_gamma],
                        "mean_c": [mean_c],
                        "std_c": [std_c],
                        "mean_n_features": [mean_n_features],
                        "std_n_features": [std_n_features],
                        "TP": [conf_matrix_dict["TP"]],
                        "FP": [conf_matrix_dict["FP"]],
                        "TN": [conf_matrix_dict["TN"]],
                        "FN": [conf_matrix_dict["FN"]]})

if os.path.exists("data/gsa_records_new.csv"):
    # Load dataframe and append new row
    df = pd.read_csv("data/gsa_records_new.csv")
    df = pd.concat([df, results], axis=0, ignore_index=True)
    df.to_csv("data/gsa_records_new.csv", index=False)
else: # Append row to existing file
    results.to_csv("data/gsa_records_new.csv", index=False)

                /\ 
               /  \
                || 
                
        Nuestros resultados
        
Dataset: Mushroom
Instances: 8124 (Train / Test: 80% / 20%)
Features: 22

Resultados autores:

| Metric | Value             |
| --- |-------------------|
| Accuracy | 98.06 +/- 0.78    |
| Gamma | 0.0067 +/- 0.0144 |
| C | 47.35 +/- 27.57   |
| n_features | 3 +/- 2.16        |

                /\ 
               /  \
                || 
                
        Nuestros resultados
        
Dataset: Breast Cancer Wisconsin (Diagnostic) Data Set

Resultados autores:

| Metric | Value             |
| --- |-------------------|
| Accuracy | 99.54 +/- 0.25    |
| Gamma | 0.0685 +/- 0.1293 |
| C | 40.30 +/- 22.37   |
| n_features | 2 +/- 1           |