In [1]:
from typing import Any
from typing import List

import numpy as np
import pandas as pd
import pygad
import sklearn.metrics as metrics
from sklearn.svm import SVC
from skmultilearn.base import MLClassifierBase
from skmultilearn.problem_transform import ClassifierChain

In [8]:
def has_duplicates(int_list: List[int]) -> bool:
    """Return True if any value occurs more than once in *int_list*.

    Iteration stops at the first repeated value. An empty input has no
    duplicates and yields False.
    """
    observed = set()
    for value in int_list:
        size_before = len(observed)
        observed.add(value)
        # The set only grows when the value is new; no growth means a repeat.
        if len(observed) == size_before:
            return True
    return False

def has_negatives(int_list: List[int]) -> bool:
    """Return True if at least one value in *int_list* is below zero.

    Zero is not considered negative; an empty input yields False.
    """
    return any(value < 0 for value in int_list)

In [None]:
class GeneticAlgorithmForMultiLabel():
    """Search for a classifier-chain label ordering with a genetic algorithm.

    Every candidate solution is an ordering of the label indices. A candidate
    is scored by fitting a ``ClassifierChain`` with that ordering on the
    training split and inverting the Hamming loss measured on the evaluation
    split, so lower loss means higher fitness.
    """

    random_state: int

    # Floor applied to the Hamming loss before it is inverted, so that a
    # perfect ordering (loss == 0) gets a large finite fitness instead of
    # raising ZeroDivisionError.
    _MIN_LOSS: float = 1e-9

    def __init__(self, random_state: int) -> None:
        """Store the seed handed to pygad as ``random_seed``."""
        self.random_state = random_state
        # TODO: later check if we should have a way to avoid setting random state
        # or just do it randomly

    def run(self, X_train: Any, y_train: Any, X_test: Any, y_test: Any):
        """Run the genetic search and return the best ordering found.

        Args:
            X_train: Features used to fit each candidate chain.
            y_train: Labels used to fit each candidate chain. The number of
                columns (``y_train.shape[1]``) defines how many labels are
                ordered.
            X_test: Features used to evaluate each candidate chain.
            y_test: Labels used to evaluate each candidate chain.

        Returns:
            The result of ``pygad.GA.best_solution`` for the last generation:
            a tuple of (best ordering, its fitness, its index in the
            population).
        """
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test

        labels_count = y_train.shape[1]
        label_space = np.arange(labels_count)

        ga_model = pygad.GA(  # type:ignore
            gene_type=int,
            gene_space=label_space,
            random_seed=self.random_state,
            save_best_solutions=False,
            # Bound method: pygad accepts a method whose signature is
            # (self, ga_instance, solution, solution_idx).
            fitness_func=self.model_fitness_func,
            allow_duplicate_genes=False,  # very important, otherwise we will have duplicate labels in the ordering
            num_genes=labels_count,

            # set up
            num_generations=10,
            num_parents_mating=2,
            sol_per_pop=4,

            # following what the article describes
            keep_elitism=1,  # also following what the article describes, but we have to double check [TODO]
            parent_selection_type="rws",  # following what the article describes
            # mutation_probability=0.005, # following what the article describes

            # TODO let's check how the article defines this
            crossover_type="scattered",
            mutation_type="random",
            mutation_by_replacement=True,
            mutation_num_genes=1,
        )

        ga_model.run()

        # Keep the finished GA instance around for later inspection.
        self.ga_model = ga_model

        # Reuse the fitness values of the last generation so that every
        # candidate chain is not retrained just to pick the winner.
        return ga_model.best_solution(
            pop_fitness=ga_model.last_generation_fitness)

    def model_fitness_func(self, ga_instance, solution, solution_idx):
        """Fitness callback for pygad: higher is better.

        Args:
            ga_instance: The ``pygad.GA`` instance invoking the callback
                (unused).
            solution: Candidate label ordering.
            solution_idx: Index of the candidate in the population (unused).

        Returns:
            0 for an invalid ordering (duplicated or negative label indices),
            otherwise the inverse of the Hamming loss obtained with that
            ordering.
        """
        if has_duplicates(solution):
            print("solutions contains duplicated values, skipping")
            return 0

        if has_negatives(solution):
            print("solutions contains negative values, skipping")
            return 0

        hamming_loss = self.test_ordering(solution)
        # The GA maximizes fitness while we want to minimize the Hamming
        # loss, so the loss is inverted. The floor keeps a perfect score
        # (loss == 0) from dividing by zero.
        return 1.0 / max(hamming_loss, self._MIN_LOSS)

    def test_ordering(self, solution: List[int]):
        """Fit a classifier chain with the given label order and score it.

        Args:
            solution: Label indices in the order the chain should use.

        Returns:
            The Hamming loss of the chain's predictions for ``self.X_test``
            against ``self.y_test``.
        """
        print(f"testing order: {solution}")

        # Hand the chain a plain list of Python ints instead of whatever
        # sequence type the GA produced.
        order = [int(label) for label in solution]

        classifier = ClassifierChain(
            classifier=SVC(),  # self.base_classifier
            require_dense=[False, True],
            order=order,
        )

        classifier.fit(self.X_train, self.y_train)
        preds = classifier.predict(self.X_test)
        # TODO: big question
        # the user of this method will pass X_train and y_train, as this is supposed to be a fit() function
        # but we need to use X_test and y_test to evaluate the ordering
        # ideas:
        # * get x_train and further split it into x_train and x_test
        #   * but in this case, should we do cross validation?
        # * get x_test and y_test from the user

        hamming_loss = metrics.hamming_loss(self.y_test, preds)
        return hamming_loss
