In [48]:
from skmultilearn.dataset import load_dataset
import numpy as np
from skmultilearn.problem_transform import ClassifierChain
import pygad
from typing import List
import sklearn.metrics as metrics
from typing import Any, Optional
import copy
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import math

In [2]:
desired_datasets = ["scene", "emotions", "birds"]

# Maps dataset name -> feature/label matrices for the undivided, train and
# test variants, plus the row and label counts of the undivided variant.
datasets = {}
for dataset_name in desired_datasets:
    print(f"getting dataset `{dataset_name}`")

    # load_dataset returns a 4-tuple; only the first two items (features and
    # labels) are kept, the last two are discarded.
    X, y, _, _ = load_dataset(dataset_name, "undivided")
    X_train, y_train, _, _ = load_dataset(dataset_name, "train")
    X_test, y_test, _, _ = load_dataset(dataset_name, "test")

    datasets[dataset_name] = dict(
        X=X,
        y=y,
        X_train=X_train,
        y_train=y_train,
        X_test=X_test,
        y_test=y_test,
        rows=X.shape[0],
        labels_count=y.shape[1],
    )

# Short summary of every loaded dataset.
for name in datasets:
    info = datasets[name]
    print("===")
    print(f"information for dataset `{name}`")
    print(f"rows: {info['rows']}, labels: {info['labels_count']}")


getting dataset `scene`
scene:undivided - exists, not redownloading
scene:train - exists, not redownloading
scene:test - exists, not redownloading
getting dataset `emotions`
emotions:undivided - exists, not redownloading
emotions:train - exists, not redownloading
emotions:test - exists, not redownloading
getting dataset `birds`
birds:undivided - exists, not redownloading
birds:train - exists, not redownloading
birds:test - exists, not redownloading
===
information for dataset `scene`
rows: 2407, labels: 6
===
information for dataset `emotions`
rows: 593, labels: 6
===
information for dataset `birds`
rows: 645, labels: 19


In [7]:
def has_duplicates(int_list: List[int]) -> bool:
    """Return True as soon as a value of ``int_list`` is seen a second time.

    The input is consumed in a single pass, so any iterable of hashable
    values works. An empty input has no duplicates.
    """
    unique_values = set()
    for value in int_list:
        size_before = len(unique_values)
        unique_values.add(value)
        # Adding an already-present value leaves the set size unchanged.
        if len(unique_values) == size_before:
            return True
    return False

def has_negatives(int_list: List[int]) -> bool:
    """Return True when at least one value of ``int_list`` is below zero.

    Zero is not considered negative; an empty input yields False.
    """
    return any(value < 0 for value in int_list)

In [50]:
class GeneticAlgorithmForMultiLabel:
    """Search for a ClassifierChain label ordering with a genetic algorithm.

    Every GA solution is an ordering of the label indices. Its fitness is the
    inverse of the Hamming loss that a ``ClassifierChain`` built with that
    ordering obtains on a hold-out split of the data given to ``fit``.
    """

    # Floor applied to the Hamming loss before it is inverted, so that an
    # ordering with a loss of exactly 0 gets a large finite fitness instead of
    # raising ZeroDivisionError.
    MIN_HAMMING_LOSS = 1e-9

    def __init__(self, base_classifier: Any, num_generations: int = 5, random_state: Optional[int] = None) -> None:
        """Store the search configuration.

        Args:
            base_classifier: estimator that is deep-copied into every
                ``ClassifierChain`` built by this object.
            num_generations: number of generations the GA runs for.
            random_state: seed used for both the GA and the hold-out split.
                When None, a seed is drawn with ``np.random.randint(0, 1000)``.
        """
        self.base_classifier = base_classifier
        self.num_generations = num_generations

        if random_state is None:
            self.random_state = np.random.randint(0, 1000)
        else:
            self.random_state = random_state

    def fit(self, X: Any, y: Any):
        """Run the GA and return a ClassifierChain fitted with the best ordering.

        Args:
            X: feature matrix.
            y: label matrix; ``y.shape[1]`` is taken as the number of labels.

        Returns:
            A ``ClassifierChain`` using the best ordering found, fitted on the
            full ``X`` and ``y``. Note that this is the chain, not ``self``.

        Raises:
            ValueError: if ``y`` has fewer than 3 labels.
        """
        # The fitness callback only receives the candidate solution, so the
        # data is kept on the instance where the callback can read it.
        self.x = X
        self.y = y

        label_count = self.y.shape[1]
        if label_count < 3:
            # ValueError is a subclass of Exception, so callers that catch the
            # generic Exception raised previously keep working.
            raise ValueError("label count is too low, we need at least 3 labels")

        label_space = np.arange(label_count)
        # Heuristic used to keep the model simple: population size is half the
        # number of labels, rounded up.
        solutions_per_population = math.ceil(label_count / 2)

        ga_model = pygad.GA( #type:ignore
            gene_type=int,
            gene_space=label_space,
            random_seed=self.random_state,
            save_best_solutions=False,
            fitness_func=self.model_fitness_func,
            allow_duplicate_genes=False, # very important, otherwise we will have duplicate labels in the ordering
            num_genes=label_count,

            # set up
            num_generations=self.num_generations,
            sol_per_pop=solutions_per_population,

            # following what the article describes
            keep_elitism=1, # also following what the article describes, but we have to double check [TODO]
            parent_selection_type="rws", # following what the article describes
            # mutation_probability=0.005, # following what the article describes

            # the following settings are fixed
            # they were chosen for no particular reason
            # they are being kept as fixed to simplify the model
            num_parents_mating=2,
            crossover_type="scattered",
            mutation_type="random",
            mutation_by_replacement=True,
            mutation_num_genes=1,
        )

        ga_model.run()

        # Only the best ordering is needed; its fitness and index are ignored.
        solution, _, _ = ga_model.best_solution()

        best_classifier = ClassifierChain(
            classifier=copy.deepcopy(self.base_classifier),
            require_dense=[False, True],
            order=solution,
        )

        # The final chain is trained on all the data given to fit(), not only
        # on the training part of the hold-out split used during the search.
        best_classifier.fit(self.x, self.y)
        return best_classifier

    def model_fitness_func(self, ga_instance: Any, solution: Any, solution_idx: Any) -> float:
        """Fitness callback handed to pygad: higher is better.

        Args:
            ga_instance: the running GA instance (unused).
            solution: candidate ordering of label indices.
            solution_idx: index of the candidate in the population (unused).

        Returns:
            0.0 for an invalid ordering (duplicated or negative indices),
            otherwise the inverse of the hold-out Hamming loss. The loss is
            clamped to ``MIN_HAMMING_LOSS`` first, so a perfect ordering gets
            a very large finite fitness rather than a division by zero.
        """
        if has_duplicates(solution):
            print("solutions contains duplicated values, skipping")
            return 0.0

        if has_negatives(solution):
            print("solutions contains negative values, skipping")
            return 0.0

        hamming_loss = float(self.test_ordering(solution))
        # The GA maximizes fitness while the Hamming loss must be minimized,
        # hence the inverse. The floor guards against a loss of exactly 0.
        return 1.0 / max(hamming_loss, self.MIN_HAMMING_LOSS)

    def test_ordering(self, solution: List[int]):
        """Return the hold-out Hamming loss of a chain using ``solution`` as order.

        The data stored by ``fit`` is split 80/20 with ``self.random_state``,
        so every candidate ordering is scored on the same split.
        """
        print(f"testing order: {solution}")

        classifier = ClassifierChain(
            classifier=copy.deepcopy(self.base_classifier),
            require_dense=[False, True],
            order=solution,
        )

        X_train, X_test, y_train, y_test = train_test_split(
            self.x, self.y, test_size=0.2, random_state=self.random_state
        )

        classifier.fit(X_train, y_train)
        # NOTE(review): confirm that ClassifierChain.predict returns its columns
        # in the original label order when a custom `order` is given. If it
        # returns them in chain order, the loss below compares misaligned
        # columns and the search optimizes the wrong quantity.
        preds = classifier.predict(X_test)

        return metrics.hamming_loss(y_test, preds)



In [51]:
# Single-generation ordering search on the training split of `scene`.
scene = datasets["scene"]
base_forest = RandomForestClassifier(random_state=456)

m = GeneticAlgorithmForMultiLabel(
    base_classifier=base_forest,
    num_generations=1,
    random_state=123,
)
# fit() returns the chain trained with the best ordering found.
r = m.fit(scene["X_train"], scene["y_train"])
r

testing order: [5 2 4 0 1 3]
testing order: [2 3 1 5 0 4]
testing order: [1 0 4 2 3 5]
testing order: [2 3 0 1 5 4]
testing order: [2 3 4 5 1 0]
testing order: [2 3 0 1 5 4]
testing order: [2 3 4 5 1 0]


ClassifierChain(classifier=RandomForestClassifier(random_state=456),
                order=array([1, 0, 4, 2, 3, 5]), require_dense=[False, True])

In [52]:
# Score the selected chain on the held-out test split of `scene`.
preds = r.predict(datasets["scene"]["X_test"])
scene_y_test = datasets["scene"]["y_test"]

print(metrics.hamming_loss(scene_y_test, preds))
print(metrics.f1_score(scene_y_test, preds, average="macro"))

0.2568283166109253
0.13486096370513379
