In [21]:
from skmultilearn.dataset import load_dataset
import numpy as np
from skmultilearn.problem_transform import ClassifierChain
import pygad
from typing import List
import sklearn.metrics as metrics
from typing import Any
import copy
from sklearn.ensemble import RandomForestClassifier

In [8]:
# Names of the multi-label datasets to fetch through skmultilearn.
desired_datasets = ["scene", "emotions", "birds"]

# Maps dataset name -> dict holding the full data, the train/test splits
# and two size summaries (row count and number of labels).
datasets = {}
for dataset_name in desired_datasets:
    print(f"getting dataset `{dataset_name}`")

    # Fetch the three variants first (same order as before: undivided, train, test).
    full_dataset = load_dataset(dataset_name, "undivided")
    train_dataset = load_dataset(dataset_name, "train")
    test_dataset = load_dataset(dataset_name, "test")

    # Each variant is a 4-tuple; only the first two entries are kept here.
    X, y, _, _ = full_dataset
    X_train, y_train, _, _ = train_dataset
    X_test, y_test, _, _ = test_dataset

    datasets[dataset_name] = dict(
        X=X,
        y=y,
        X_train=X_train,
        y_train=y_train,
        X_test=X_test,
        y_test=y_test,
        rows=X.shape[0],
        labels_count=y.shape[1],
    )


# Short report: one three-line entry per loaded dataset.
for name, info in datasets.items():
    print(
        "===",
        f"information for dataset `{name}`",
        f"rows: {info['rows']}, labels: {info['labels_count']}",
        sep="\n",
    )


getting dataset `scene`
scene:undivided - exists, not redownloading
scene:train - exists, not redownloading
scene:test - exists, not redownloading
getting dataset `emotions`
emotions:undivided - exists, not redownloading
emotions:train - exists, not redownloading
emotions:test - exists, not redownloading
getting dataset `birds`
birds:undivided - exists, not redownloading
birds:train - exists, not redownloading
birds:test - exists, not redownloading
===
information for dataset `scene`
rows: 2407, labels: 6
===
information for dataset `emotions`
rows: 593, labels: 6
===
information for dataset `birds`
rows: 645, labels: 19


In [4]:
def has_duplicates(int_list: List[int]) -> bool:
    """Return True if any value occurs more than once in ``int_list``.

    The scan stops at the first repeated value.
    """
    visited = set()
    # `visited.add` returns None (falsy), so an unseen value is recorded and
    # the scan continues; a value already in `visited` short-circuits to True.
    return any(value in visited or visited.add(value) for value in int_list)

def has_negatives(int_list: List[int]) -> bool:
    """Return True if at least one value in ``int_list`` is below zero."""
    return any(value < 0 for value in int_list)

In [29]:
class GeneticAlgorithmForMultiLabel:
    """Search for a ClassifierChain label ordering with a genetic algorithm.

    Each candidate solution is a permutation of the label indices. Its fitness
    is the inverse of the Hamming loss obtained by a ClassifierChain that is
    trained with that ordering on the training data and evaluated on the
    evaluation data handed to :meth:`run`.

    NOTE: the ``X_test``/``y_test`` passed to :meth:`run` drive the search, so
    scores computed afterwards on that same data are optimistically biased.
    Pass a validation split here if an unbiased test estimate is needed.
    """

    random_state: int

    # Floor applied to the Hamming loss before inverting it, so that an
    # ordering with a loss of exactly 0 does not cause a division by zero.
    _MIN_HAMMING_LOSS = 1e-9

    def __init__(
        self,
        base_classifier: Any,
        random_state: int,
        num_generations: int = 10,
        num_parents_mating: int = 2,
        sol_per_pop: int = 4,
    ) -> None:
        """Store the search configuration.

        Args:
            base_classifier: estimator that is deep-copied into every
                ClassifierChain built during the search.
            random_state: seed forwarded to ``pygad.GA`` as ``random_seed``.
            num_generations: number of generations the GA runs.
            num_parents_mating: number of parents selected for mating.
            sol_per_pop: number of solutions (orderings) per population.
        """
        self.random_state = random_state
        # TODO: later check if we should have a way to avoid setting random state
        # or just do it randomly

        self.base_classifier = base_classifier

        self.num_generations = num_generations
        self.num_parents_mating = num_parents_mating
        self.sol_per_pop = sol_per_pop

        # Populated by run(); read by the fitness function.
        self.X_train = None
        self.y_train = None
        self.X_test = None
        self.y_test = None

    def run(self, X_train: Any, y_train: Any, X_test: Any, y_test: Any):
        """Run the GA and return a ClassifierChain fitted with the best ordering.

        Args:
            X_train: training features.
            y_train: training label matrix; ``y_train.shape[1]`` is taken as
                the number of labels to order.
            X_test: features used to score each candidate ordering.
            y_test: label matrix used to score each candidate ordering.

        Returns:
            A ClassifierChain using the best ordering found, fitted on
            ``X_train``/``y_train``.
        """
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test

        # One gene per label; every gene takes a value in [0, number of labels).
        label_space = np.arange(y_train.shape[1])

        ga_model = pygad.GA(  # type:ignore
            gene_type=int,
            gene_space=label_space,
            random_seed=self.random_state,
            save_best_solutions=False,
            fitness_func=self.model_fitness_func,
            # very important, otherwise we will have duplicate labels in the ordering
            allow_duplicate_genes=False,
            num_genes=len(label_space),

            # set up
            num_generations=self.num_generations,
            num_parents_mating=self.num_parents_mating,
            sol_per_pop=self.sol_per_pop,

            # following what the article describes
            keep_elitism=1,  # also following what the article describes, but we have to double check [TODO]
            parent_selection_type="rws",  # following what the article describes
            # mutation_probability=0.005, # following what the article describes

            # TODO let's check how the article defines this
            crossover_type="scattered",
            mutation_type="random",
            mutation_by_replacement=True,
            mutation_num_genes=1,
        )

        ga_model.run()

        solution, _, _ = ga_model.best_solution()

        # Train a fresh chain with the winning ordering and hand it back.
        best_classifier = ClassifierChain(
            classifier=copy.deepcopy(self.base_classifier),
            require_dense=[False, True],
            order=solution,
        )

        best_classifier.fit(self.X_train, self.y_train)
        return best_classifier

    def model_fitness_func(self, ga_instance: Any, solution: Any, solution_idx: Any) -> float:
        """Fitness callback for pygad: inverse Hamming loss of an ordering.

        Args:
            ga_instance: the GA instance invoking the callback (unused).
            solution: candidate label ordering.
            solution_idx: index of the candidate in the population (unused).

        Returns:
            ``0.0`` for an invalid ordering (duplicated or negative entries),
            otherwise ``1 / hamming_loss`` with the loss floored at
            ``_MIN_HAMMING_LOSS``.
        """
        if has_duplicates(solution):
            print("solutions contains duplicated values, skipping")
            return 0.0

        if has_negatives(solution):
            print("solutions contains negative values, skipping")
            return 0.0

        hamming_loss = self.test_ordering(solution)
        # The GA maximizes fitness, so return the inverse of the Hamming loss.
        # A perfect ordering (loss 0) would otherwise divide by zero, hence
        # the floor.
        return float(1.0 / max(hamming_loss, self._MIN_HAMMING_LOSS))

    def test_ordering(self, solution: List[int]):
        """Train a chain with ``solution`` as label order and return its Hamming loss.

        The chain is fitted on ``self.X_train``/``self.y_train`` and scored on
        ``self.X_test``/``self.y_test``.
        """
        print(f"testing order: {solution}")

        classifier = ClassifierChain(
            classifier=copy.deepcopy(self.base_classifier),
            require_dense=[False, True],
            order=solution,
        )

        classifier.fit(self.X_train, self.y_train)
        preds = classifier.predict(self.X_test)

        return metrics.hamming_loss(self.y_test, preds)



In [30]:
# Run the ordering search on the `scene` dataset with a seeded random forest
# as the base classifier of every chain.
scene = datasets["scene"]

m = GeneticAlgorithmForMultiLabel(
    base_classifier=RandomForestClassifier(random_state=42),
    random_state=42,
)
r = m.run(
    scene["X_train"],
    scene["y_train"],
    scene["X_test"],
    scene["y_test"],
)
# Display the fitted chain returned by the search.
r

testing order: [3 4 2 0 5 1]
testing order: [2 5 1 4 3 0]
testing order: [5 4 1 3 0 2]
testing order: [1 3 4 0 2 5]
testing order: [1 4 2 0 3 5]
testing order: [3 1 2 0 5 4]
testing order: [1 3 2 0 4 5]
testing order: [1 4 2 0 5 3]
testing order: [4 1 2 0 5 3]
testing order: [1 4 3 0 2 5]
testing order: [4 1 2 0 5 3]
testing order: [1 3 2 0 5 4]
testing order: [1 3 2 0 5 4]
testing order: [1 4 2 0 5 3]
testing order: [1 3 5 0 2 4]
testing order: [2 1 3 0 5 4]
testing order: [1 3 0 2 5 4]
testing order: [3 2 1 0 5 4]
testing order: [3 2 1 0 5 4]


ClassifierChain(classifier=RandomForestClassifier(random_state=42),
                order=array([3, 1, 2, 0, 5, 4]), require_dense=[False, True])

In [33]:
# Score the chain returned by the search on the `scene` test split.
# NOTE: this is the same split that was passed to `m.run(...)` and used there
# to rank orderings, so these numbers are optimistic.
scene_X_test = datasets["scene"]["X_test"]
scene_y_test = datasets["scene"]["y_test"]

preds = r.predict(scene_X_test)

print(metrics.hamming_loss(scene_y_test, preds))
print(metrics.f1_score(scene_y_test, preds, average="macro"))

0.205685618729097
0.31301095162100195
