In [1]:
%load_ext autoreload
%autoreload 2

import csv
import numpy as np
import os
import pandas as pd
import time

from benchmarks import gsa_svm_fitness
from src.entities import GSA

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from typing import Mapping, Tuple, Union
from ucimlrepo import fetch_ucirepo

In [2]:
# Fetch dataset 
breast_cancer_wisconsin_original = fetch_ucirepo(id=15) 
  
# Data (as pandas dataframes) 
X = breast_cancer_wisconsin_original.data.features 
X = X.fillna(value=0)  # Fill missing values with 0

y = breast_cancer_wisconsin_original.data.targets 

In [2]:
# Fetch dataset 
mushroom = fetch_ucirepo(id=73) 
  
# data (as pandas dataframes) 
X = mushroom.data.features
encoder = LabelEncoder()
for col in X.columns:
    X.loc[:, col] = encoder.fit_transform(X[col])
    
y = mushroom.data.targets 

In [3]:
X = X.iloc[:,1:23]  
y = y.iloc[:, 0]

In [3]:
class UCI:
    """
    Class to handle UCI datasets
    
    Args:
        X (pd.DataFrame): Features
        y (pd.Series): Target
    """
    def __init__(self,
                 X: pd.DataFrame,
                 y: pd.Series,
                 boundaries: Mapping[str, Tuple[Tuple[float, float], ...]] = None
                 ) -> None:
        """
        Constructor
        
        Args:
            X (pd.DataFrame): Features
            y (pd.Series): Target
        """
        self.X = X
        self.y = y
        self.boundaries = boundaries
        
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(self.X)
        
        X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size = 0.20, random_state=5)
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test

    
    def get_fitness(self,
                    solution: Mapping[str, np.ndarray],
                    data: Union[None, Tuple[np.ndarray, np.ndarray]] = None,
                    ) -> Tuple[float, float]:
        """
        Get fitness of a solution
        
        Args:
            solution (Mapping[str, np.ndarray]): Solution to evaluate
            data (Union[None, Tuple[np.ndarray, np.ndarray]], optional): Data to evaluate the solution. Defaults to None.
        
        Returns:
            Tuple[float, float]: Fitness and accuracy of the solution    
        """
        if data is None:
            X_scaled = self.X_train
            y_data = self.y_train
        else:
            X_scaled, y_data = data
            
        gamma, C = solution['real']
        gamma /= 1000
        C /= 1000
        X_scaled_filtered = X_scaled[:, solution['discrete'].astype(int) == 1]
        svc_model = SVC(gamma=gamma, C=C, kernel="rbf", verbose=False)
        svc_model.fit(X_scaled_filtered, np.ravel(y_data))
        y_predict = svc_model.predict(X_scaled_filtered)
        conf_matrix = confusion_matrix(y_data, y_predict)
        accuracy = accuracy_score(y_data, y_predict) * 100
        
        return gsa_svm_fitness(accuracy=accuracy, solution=solution)
    
    def is_feasible(self, solution: Mapping[str, np.ndarray]) -> bool:
        """
        Check if a solution is feasable
        
        Args:
            solution (Mapping[str, np.ndarray]): Solution to evaluate
        
        Returns:
            bool: True if the solution is feasable, False otherwise
        """
        real_values = solution['real']
        discrete_values = solution['discrete']
        
        for i, (min_val, max_val) in enumerate(self.boundaries['real']):
            if real_values[i] < min_val or real_values[i] > max_val:
                return False
            
        for i, (min_val, max_val) in enumerate(self.boundaries['discrete']):
            if discrete_values[i] < min_val or discrete_values[i] > max_val:
                return False
        
        return True

boundaries = {'real': [(1, 100_000), (1, 100_000)], 'discrete': [(0, 1) for _ in range(len(X.columns))]}
uci = UCI(X, y, boundaries)

In [5]:
uci.is_feasible({'real': np.array([1.91833448, 34.69824839]),
                 'discrete': np.array([1, 1, 0, 1, 1, 1, 0, 1, 0])})

True

In [5]:
# Select number of repetitions for each experiment. 
runs = 10

# Select chaotic constant
chaotic_constant = True

# Export results ?
export = True

save_path = "data/output/"
if not os.path.exists(save_path):
    os.makedirs(save_path)

ExportToFile = save_path + "experiment" + time.strftime("%Y-%m-%d-%H-%M-%S_") + str(chaotic_constant) + ".csv"

# Check if it works at least once
atLeastOneIteration = False

population_size = 5 
iterations = 20

boundaries = {'real': [(1, 100_000), (1, 100_000)], 'discrete': [(0, 1) for _ in range(len(X.columns))]}

# CSV Header for the convergence
CnvgHeader = ["Sol_Iter"+str(l+1) for l in range(iterations)]

best_solution_history = []
accuracy_history = []
gamma_history = []
c_history = []
n_features_history = []

uci = UCI(X, y)

for k in range(runs):
    gsa_algo = GSA(objective_function = uci.get_fitness,
                   r_dim=2,
                   d_dim=len(X.columns),
                   boundaries=boundaries)
    
    gsa_algo.optimize(population_size=population_size,
                      iters=iterations,
                      chaotic_constant=True)
    
    print(gsa_algo.solution_history[-1])
    fitness, accuracy = uci.get_fitness(solution=gsa_algo.solution_history[-1],
                                        data=(uci.X_test, uci.y_test))
    
    print("Test accuracy: ", accuracy, " - Fitness: ", fitness)
    
    best_solution_history.append(gsa_algo.solution_history[-1])
    accuracy_history.append(accuracy)
    gamma_history.append(gsa_algo.solution_history[-1]['real'][0] / 1000)
    c_history.append(gsa_algo.solution_history[-1]['real'][1] / 1000)
    n_features_history.append(np.sum(gsa_algo.solution_history[-1]['discrete']))
    
    if export:
        with open(ExportToFile, 'a') as out:
            writer = csv.writer(out, delimiter=',')
            if not atLeastOneIteration:  # just one time to write the header of the CSV file
                header = np.concatenate(
                    [["Optimizer", "objfname", "startTime", "EndTime", "ExecutionTime"], CnvgHeader])
                writer.writerow(header)
            a = np.concatenate(
                [[gsa_algo.objective_function_name,
                  gsa_algo.start_time,
                  gsa_algo.end_time,
                  gsa_algo.execution_time],
                  gsa_algo.convergence])
            writer.writerow(a)
        out.close()
    atLeastOneIteration = True  # at least one experiment

if not atLeastOneIteration:  # Failed to run at least one experiment
    print("No Optimizer or Cost function is selected. Check lists of available optimizers and cost functions")

GSA is optimizing  "get_fitness"
['At iteration 1 the best fitness is 90.81309212406306']
['At iteration 2 the best fitness is 91.08844584148478']
['At iteration 3 the best fitness is 92.55782940965278']
['At iteration 4 the best fitness is 92.55782940965278']
['At iteration 5 the best fitness is 92.55782940965278']
['At iteration 6 the best fitness is 92.55782940965278']
['At iteration 7 the best fitness is 92.55782940965278']
['At iteration 8 the best fitness is 92.55782940965278']
['At iteration 9 the best fitness is 92.55782940965278']
['At iteration 10 the best fitness is 92.55782940965278']
['At iteration 11 the best fitness is 93.05475567669752']
['At iteration 12 the best fitness is 93.05475567669752']
['At iteration 13 the best fitness is 93.05475567669752']
['At iteration 14 the best fitness is 93.05475567669752']
['At iteration 15 the best fitness is 93.05475567669752']
['At iteration 16 the best fitness is 93.14546560276673']
['At iteration 17 the best fitness is 93.1454656

In [6]:
best_solution_history

[{'real': array([43616.09323357, 63817.03205613]),
  'discrete': array([1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0])},
 {'real': array([ 6381.35140442, 80863.02641048]),
  'discrete': array([0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0])},
 {'real': array([62967.11912781, 46645.83110483]),
  'discrete': array([0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0])},
 {'real': array([33588.34266432, 66832.28220316]),
  'discrete': array([0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0])},
 {'real': array([55292.23317125, 51986.15210918]),
  'discrete': array([1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0])},
 {'real': array([59757.16656072, 56852.13310564]),
  'discrete': array([1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1])},
 {'real': array([84567.72521235, 82262.94628858]),
  'discrete': array([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1])},
 {'real': array([23241.90738793,  

In [7]:
accuracy_history

[98.4,
 99.63076923076923,
 99.38461538461539,
 99.75384615384615,
 99.75384615384615,
 100.0,
 99.50769230769231,
 97.84615384615385,
 99.75384615384615,
 99.87692307692308]

In [8]:
gamma_history

[43.61609323357452,
 6.381351404415115,
 62.967119127811976,
 33.588342664318816,
 55.29223317124827,
 59.75716656072248,
 84.5677252123505,
 23.241907387931622,
 98.3104276825638,
 13.368036571574333]

In [9]:
c_history

[63.81703205612602,
 80.86302641047628,
 46.645831104831224,
 66.83228220315813,
 51.98615210918452,
 56.8521331056368,
 82.26294628857819,
 3.008823663310346,
 89.43312685802297,
 54.554378717879786]

In [10]:
mean_accuracy = np.mean(accuracy_history)
std_accuracy = np.std(accuracy_history)

print(f"Mean accuracy: {mean_accuracy} +/- {std_accuracy}")

mean_gamma = np.mean(gamma_history)
std_gamma = np.std(gamma_history)

print(f"Mean gamma: {mean_gamma} +/- {std_gamma}")

mean_c = np.mean(c_history)
std_c = np.std(c_history)

print(f"Mean C: {mean_c} +/- {std_c}")

mean_n_features = np.mean(n_features_history)
std_n_features = np.std(n_features_history)

print(f"Mean n_features: {mean_n_features} +/- {std_n_features}")

Mean accuracy: 99.39076923076922 +/- 0.6664646636761704
Mean gamma: 48.10904030165114 +/- 28.38133806816956
Mean C: 59.625573251720425 +/- 23.210288854915223
Mean n_features: 5.4 +/- 1.2


                /\ 
               /  \
                || 
                
        Nuestros resultados
        
Dataset: Mushroom
Instances: 8124 (Train / Test: 80% / 20%)
Features: 22

Resultados autores:

| Metric | Value             |
| --- |-------------------|
| Accuracy | 98.06 +/- 0.78    |
| Gamma | 0.0067 +/- 0.0144 |
| C | 47.35 +/- 27.57   |
| n_features | 3 +/- 2.16        |

                /\ 
               /  \
                || 
                
        Nuestros resultados
        
Dataset: Breast Cancer Wisconsin (Diagnostic) Data Set

Resultados autores:

| Metric | Value             |
| --- |-------------------|
| Accuracy | 99.54 +/- 0.25    |
| Gamma | 0.0685 +/- 0.1293 |
| C | 40.30 +/- 22.37   |
| n_features | 2 +/- 1           |

In [8]:
solution= {'real': np.array([0.91833448, 34.69824839]),
           'discrete': np.array([1, 1, 0, 1, 1, 1, 0, 1, 0])}

fitness, accuracy = uci.get_fitness(solution=solution,
                                    data=(uci.X_test_scaled, uci.y_test))

print("Test accuracy: ", accuracy, " - Fitness: ", fitness)

Test accuracy:  100.0  - Fitness:  0.2666666666666666
