# 1. First Model Experiments

Experiments to handle the very first model for my thesis.

The plan is to implement a **basic stacking algorithm** using the existing Binary Relevance implementation, found in the `scikit-multilearn` library, and then implement the actual new model, which runs a specific feature selection process between each layer of the stacking algorithm.

The initial experimentation follows [this page](http://scikit.ml/tutorial.html).

## 1.1. Setup

In [1]:
from skmultilearn.problem_transform import BinaryRelevance
from skmultilearn.dataset import load_dataset, available_data_sets
from sklearn.svm import SVC
from skmultilearn.base.problem_transformation import ProblemTransformationBase
from typing import List, Optional, Any, Tuple, Dict
import numpy as np
import sklearn.metrics as metrics
import json


## 1.2. Setting up a stacking classifier

At first, let's build a simple stacking implementation, so we can have a baseline to compare with the specialized stacking implementation.

In [12]:
# List the (dataset name, variant) entries known to scikit-multilearn.
available_data_sets()

defaultdict(list,
            {('bibtex', 'undivided'): ['5c1e474c2fd026519aec931a26ad18a6',
              'bibtex-undivided.scikitml.bz2'],
             ('bibtex', 'test'): ['d250caa297d060374f59318ad6b93771',
              'bibtex-test.scikitml.bz2'],
             ('bibtex', 'train'): ['1dd15daca7b8b2c17d692bdadce5dc31',
              'bibtex-train.scikitml.bz2'],
             ('birds', 'undivided'): ['1da06f4ae896800547dabf89044584e1',
              'birds-undivided.scikitml.bz2'],
             ('birds', 'test'): ['77fbfcc66d77040d3806c2a5ea4ff829',
              'birds-test.scikitml.bz2'],
             ('birds', 'train'): ['5c2bacaa5506e904b6501cc50ddfefe2',
              'birds-train.scikitml.bz2'],
             ('Corel5k', 'undivided'): ['062ea897821608035748a2a3b200d382',
              'Corel5k-undivided.scikitml.bz2'],
             ('Corel5k', 'test'): ['cb91444418a2f8b9814d10d4696af9f0',
              'Corel5k-test.scikitml.bz2'],
             ('Corel5k', 'train'): ['1863ec41b

In [13]:
# Load the predefined train and test partitions of `scene`. Feature and label
# names are kept from the train call only; the test call's copies are discarded.
X_train, y_train, feature_names, label_names = load_dataset("scene", "train")
X_test, y_test, _, _ = load_dataset("scene", "test")

# using `scene` as this dataset seems to be a little bit more comprehensible than the others I tested

scene:train - does not exists downloading
Downloaded scene-train
scene:test - does not exists downloading
Downloaded scene-test


In [14]:
# Inspect the (name, type) pairs describing the feature columns.
feature_names


[('Att1', 'NUMERIC'),
 ('Att2', 'NUMERIC'),
 ('Att3', 'NUMERIC'),
 ('Att4', 'NUMERIC'),
 ('Att5', 'NUMERIC'),
 ('Att6', 'NUMERIC'),
 ('Att7', 'NUMERIC'),
 ('Att8', 'NUMERIC'),
 ('Att9', 'NUMERIC'),
 ('Att10', 'NUMERIC'),
 ('Att11', 'NUMERIC'),
 ('Att12', 'NUMERIC'),
 ('Att13', 'NUMERIC'),
 ('Att14', 'NUMERIC'),
 ('Att15', 'NUMERIC'),
 ('Att16', 'NUMERIC'),
 ('Att17', 'NUMERIC'),
 ('Att18', 'NUMERIC'),
 ('Att19', 'NUMERIC'),
 ('Att20', 'NUMERIC'),
 ('Att21', 'NUMERIC'),
 ('Att22', 'NUMERIC'),
 ('Att23', 'NUMERIC'),
 ('Att24', 'NUMERIC'),
 ('Att25', 'NUMERIC'),
 ('Att26', 'NUMERIC'),
 ('Att27', 'NUMERIC'),
 ('Att28', 'NUMERIC'),
 ('Att29', 'NUMERIC'),
 ('Att30', 'NUMERIC'),
 ('Att31', 'NUMERIC'),
 ('Att32', 'NUMERIC'),
 ('Att33', 'NUMERIC'),
 ('Att34', 'NUMERIC'),
 ('Att35', 'NUMERIC'),
 ('Att36', 'NUMERIC'),
 ('Att37', 'NUMERIC'),
 ('Att38', 'NUMERIC'),
 ('Att39', 'NUMERIC'),
 ('Att40', 'NUMERIC'),
 ('Att41', 'NUMERIC'),
 ('Att42', 'NUMERIC'),
 ('Att43', 'NUMERIC'),
 ('Att44', 'NUMERIC'

In [15]:
# Inspect the label names together with their allowed values.
label_names


[('Beach', ['0', '1']),
 ('Sunset', ['0', '1']),
 ('FallFoliage', ['0', '1']),
 ('Field', ['0', '1']),
 ('Mountain', ['0', '1']),
 ('Urban', ['0', '1'])]

Using `scene` as this dataset seems to be a little bit more comprehensible than the others I tested.

For more information regarding the datasets and how `scikit-multilearn` handles them, [read this page](http://scikit.ml/datasets.html).

In [7]:
# Baseline model: Binary Relevance wrapping an SVC base classifier.
# NOTE(review): require_dense=[False, True] presumably means "X may stay sparse,
# y must be dense" for the base classifier — confirm against the scikit-multilearn docs.
clf = BinaryRelevance(
    classifier=SVC(),
    require_dense=[False, True]
)


In [8]:
# Fit the Binary Relevance baseline on the training partition.
clf.fit(X_train, y_train)


BinaryRelevance(classifier=SVC(), require_dense=[False, True])

In [9]:
# Predict the labels of the test partition with the fitted baseline.
prediction = clf.predict(X_test)


In [10]:
# Densify the predictions so the notebook can display them as a matrix.
prediction.todense()


matrix([[0, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        ...,
        [0, 0, 0, 0, 0, 1],
        [0, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0]], dtype=int64)

In [11]:
# Display label_names again for reference next to the prediction matrix.
label_names

[('Beach', ['0', '1']),
 ('Sunset', ['0', '1']),
 ('FallFoliage', ['0', '1']),
 ('Field', ['0', '1']),
 ('Mountain', ['0', '1']),
 ('Urban', ['0', '1'])]

In [2]:
# Hamming loss of the Binary Relevance baseline on the test partition.
# NOTE: this cell needs `y_test` and `prediction` from the cells above; running it
# before they are defined raises NameError.
regular_br_hamming_loss = metrics.hamming_loss(y_test, prediction)
regular_br_hamming_loss


NameError: name 'y_test' is not defined

In [16]:
# Accuracy of the Binary Relevance baseline on the test partition.
# NOTE(review): for multilabel targets scikit-learn documents accuracy_score as
# subset accuracy (a sample counts only if every label matches) — confirm.
regular_br_accuracy_score = metrics.accuracy_score(y_test, prediction)
regular_br_accuracy_score


0.5869565217391305

In [3]:
class BasicStacking(ProblemTransformationBase):
    """Two-layer stacking built from two Binary Relevance models.

    The first layer is fitted on the original features. Its label predictions
    are appended as extra columns to the feature matrix, and the second layer
    is fitted on (and predicts from) that expanded matrix.

    NOTE(review): ``fit`` builds the second layer's extra features from the
    first layer's predictions on the same rows it was trained on. Stacking is
    usually described with out-of-fold predictions at this step; using
    in-sample predictions may be why this model scored the same as plain
    Binary Relevance in this notebook — to be confirmed.

    Parameters
    ----------
    classifier:
        Base estimator copied into both layers. ``SVC()`` is used when
        ``None`` (the previous hard-coded behaviour).
    require_dense:
        Forwarded to both ``BinaryRelevance`` layers. ``[False, True]`` is
        used when ``None`` (the previous hard-coded behaviour).
    """

    first_layer_classifiers: BinaryRelevance
    second_layer_classifiers: BinaryRelevance

    def __init__(self, classifier: Any = None, require_dense: Optional[List[bool]] = None):
        # Local import: the file's import cell does not bring `copy` into scope.
        import copy

        super().__init__(classifier, require_dense)

        def build_layer() -> BinaryRelevance:
            # Each layer gets its own estimator object and its own flag list so
            # the two layers never share mutable state.
            base = SVC() if classifier is None else copy.deepcopy(classifier)
            dense_flags = [False, True] if require_dense is None else require_dense
            return BinaryRelevance(classifier=base, require_dense=dense_flags)

        self.first_layer_classifiers = build_layer()
        self.second_layer_classifiers = build_layer()

    @staticmethod
    def _to_dense_array(matrix: Any) -> np.ndarray:
        """Return ``matrix`` as a plain ndarray, densifying sparse input."""
        if hasattr(matrix, "toarray"):
            return matrix.toarray()
        return np.asarray(matrix)

    def _expand_features(self, X: Any, first_layer_predictions: Any) -> np.ndarray:
        """Append the first-layer predictions as extra columns to ``X``."""
        return np.hstack([
            self._to_dense_array(X),
            self._to_dense_array(first_layer_predictions),
        ])

    def fit(self, X: Any, y: Any):
        """Fit both layers and return ``self``.

        The first layer is fitted on ``X``; the second layer is fitted on
        ``X`` expanded with the first layer's predictions for ``X``.
        """
        self.first_layer_classifiers.fit(X, y)

        first_layer_predictions = self.first_layer_classifiers.predict(X)
        X_expanded = self._expand_features(X, first_layer_predictions)

        self.second_layer_classifiers.fit(X_expanded, y)
        return self

    def predict(self, X: Any):
        """Predict labels for ``X`` using both layers.

        Returns whatever the second-layer ``BinaryRelevance.predict`` returns
        for the expanded feature matrix.
        """
        first_layer_predictions = self.first_layer_classifiers.predict(X)
        X_expanded = self._expand_features(X, first_layer_predictions)
        return self.second_layer_classifiers.predict(X_expanded)


In [18]:
# Build the basic stacking model with its defaults and fit it on the training partition.
model = BasicStacking()
model.fit(X_train, y_train)

[[0.646467 0.666435 0.685047 ... 0.       0.       0.      ]
 [0.770156 0.767255 0.761053 ... 0.       0.       0.      ]
 [0.793984 0.772096 0.76182  ... 0.       0.       0.      ]
 ...
 [0.85639  1.       1.       ... 0.       0.       0.      ]
 [0.805592 0.80417  0.811438 ... 0.       0.       1.      ]
 [0.855064 0.858896 0.911177 ... 0.       0.       1.      ]]


In [19]:
# Predict the labels of the test partition with the fitted stacking model.
stacking_prediction = model.predict(X_test)

In [21]:
# Score the stacking model with the same two metrics used for the baseline.
stacking_hamming_loss = metrics.hamming_loss(y_test, stacking_prediction)
stacking_accuracy_score = metrics.accuracy_score(y_test, stacking_prediction)

# Print baseline and stacking metrics one after the other for comparison.
# The `regular_br_*` values come from the baseline cells above.
print("br hamming_loss", regular_br_hamming_loss)
print("br accuracy", regular_br_accuracy_score)
print("===")
print("stacking hamming_loss", stacking_hamming_loss)
print("stacking accuracy", stacking_accuracy_score)


br hamming_loss 0.08416945373467112
br accuracy 0.5869565217391305
===
stacking hamming_loss 0.08416945373467112
stacking accuracy 0.5869565217391305


### Conclusion

Both the Binary Relevance and the Stacking approaches resulted in the exact same performance.

Possibilities:
* The stacking implementation is wrong.
  * To test this, we have to review the code and compare it to other stacking implementations; for instance: the stacking implementation in the `utiml` library for R.
* The stacking implementation is correct, but the labels are not correlated at all, meaning that the stacking approach is not useful.
  * To test this, we have to check other datasets and see if the stacking approach is useful in those cases.

## 1.3. Comparing to other stacking implementations

Let's compare my implementation of the stacking classifier to the one from [utiml](https://github.com/rivolli/utiml).

[Here is a guide](https://cran.r-project.org/web/packages/utiml/vignettes/utiml-overview.html) to get started with the `utiml` library.

2023-08-17: I tried to read the source code, but I could not spot exactly how the stacking is being implemented. As a matter of fact, I don't even know if my understanding of the stacking is correct.

For now, I will proceed with implementing the actual model, and then I will try to go back and debug all of this.

**TODO**

## 1.4. Using other datasets

I will try to list all available datasets, then try all of them using Binary Relevance as a baseline and my Basic Stacking approach. The objective is to see if the performance metrics change for any of the datasets.

In [6]:
# Train/test partitions of one dataset, ordered (X_train, y_train, X_test, y_test).
# NOTE(review): annotated as np.ndarray, but elsewhere in this notebook the loaded
# matrices are densified before use, so they are presumably scipy sparse matrices — confirm.
SplitDataset = Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]


class BasicStackingAgainstBinaryRelevanceBaselineTest:
    """Compare ``BasicStacking`` with plain Binary Relevance on several datasets.

    For every available dataset that is not skipped, both models are fitted on
    the train partition and scored on the test partition with accuracy and
    Hamming loss. The collected rows are written to a JSON file.
    """

    # Datasets excluded from the comparison, with the reason given by the author.
    SKIPPED_DATASETS = frozenset({
        "bibtex",     # takes too long to train and test
        "Corel5k",    # apparently it has only a single class, which is not accepted by the classifier
        "delicious",  # takes too long to train and test
        "enron",      # apparently it has only a single class, which is not accepted by the classifier
        "genbase",    # apparently it has only a single class, which is not accepted by the classifier
        "mediamill",  # takes too long to train and test
    })

    def run(self) -> List[Dict[str, Any]]:
        """Evaluate both models on every dataset and return the result rows.

        Returns:
            One dict per dataset with the dataset name and the accuracy and
            Hamming loss of the baseline and of the stacking model.
        """
        results: List[Dict[str, Any]] = []

        for name in self.get_unique_available_data_set_names():
            data_sets = self.get_data_set(name)

            baseline_accuracy, baseline_hamming_loss = self.get_metrics_for_baseline_model(data_sets)
            stacking_accuracy, stacking_hamming_loss = self.get_metrics_for_basic_stacking(data_sets)

            results.append({
                "dataset_name": name,
                "baseline_accuracy": baseline_accuracy,
                "baseline_hamming_loss": baseline_hamming_loss,
                "stacking_accuracy": stacking_accuracy,
                "stacking_hamming_loss": stacking_hamming_loss,
            })

            # The file is rewritten after every dataset, so it always holds
            # the rows collected so far.
            self.save_results(results)
            print(f"finished baseline for {name}")

        return results

    def get_metrics_for_baseline_model(self, split_dataset: SplitDataset) -> Tuple[float, float]:
        """Return (accuracy, hamming loss) of Binary Relevance over SVC."""
        clf = BinaryRelevance(
            classifier=SVC(),
            require_dense=[False, True]
        )
        return self.get_metrics_for_model(clf, split_dataset)

    def get_metrics_for_basic_stacking(self, split_dataset: SplitDataset) -> Tuple[float, float]:
        """Return (accuracy, hamming loss) of a default ``BasicStacking``."""
        clf = BasicStacking()
        return self.get_metrics_for_model(clf, split_dataset)

    def get_metrics_for_model(self, clf: Any, split_dataset: SplitDataset) -> Tuple[float, float]:
        """Fit ``clf`` on the train partition and score it on the test partition.

        Returns:
            A ``(accuracy_score, hamming_loss)`` tuple, in that order.
        """
        X_train, y_train, X_test, y_test = split_dataset

        clf.fit(X_train, y_train)

        prediction = clf.predict(X_test)
        accuracy_score = metrics.accuracy_score(y_test, prediction)
        hamming_loss = metrics.hamming_loss(y_test, prediction)

        return accuracy_score, hamming_loss

    def save_results(self, results: List[Dict[str, Any]], path: str = "results.json"):
        """Write ``results`` as indented JSON to ``path``, replacing the file."""
        with open(path, "w", encoding="utf-8") as f:
            json.dump(results, f, indent=4)

    @staticmethod
    def _unique_names_in_order(keys: Any, skipped: Any) -> List[str]:
        """Return the distinct dataset names in ``keys``, first-seen order.

        ``keys`` yields ``(dataset_name, variant)`` pairs; names contained in
        ``skipped`` are left out.
        """
        # dict.fromkeys de-duplicates while preserving insertion order.
        return list(dict.fromkeys(
            dataset_name
            for dataset_name, _variant in keys
            if dataset_name not in skipped
        ))

    def get_unique_available_data_set_names(self) -> List[str]:
        """Return the names of all available datasets that are not skipped.

        Raises:
            RuntimeError: if the catalogue of available datasets is ``None``.
        """
        catalogue = available_data_sets()
        if catalogue is None:
            raise RuntimeError("could not load available data sets")

        # Iterating the catalogue yields its (dataset_name, variant) keys.
        return self._unique_names_in_order(catalogue, self.SKIPPED_DATASETS)

    @staticmethod
    def _load_variant(name: str, variant: str) -> Any:
        """Load one variant of a dataset, raising if nothing is returned."""
        data = load_dataset(name, variant)
        if data is None:
            raise RuntimeError(f"could not load data set {name}")
        return data

    def get_data_set(self, name: str) -> SplitDataset:
        """Load the train and test partitions of dataset ``name``.

        Returns:
            ``(X_train, y_train, X_test, y_test)``; the feature and label
            names returned alongside the data are discarded.

        Raises:
            RuntimeError: if either partition cannot be loaded.
        """
        X_train, y_train, _, _ = self._load_variant(name, "train")
        X_test, y_test, _, _ = self._load_variant(name, "test")

        return X_train, y_train, X_test, y_test

# NOTE: unlike the single-dataset cells above (which use `scene`), this pipeline runs over every available dataset that is not in the skip list


In [7]:
# Run the baseline-vs-stacking comparison over every non-skipped dataset.
# The collected metrics are written to results.json.
pipeline = BasicStackingAgainstBinaryRelevanceBaselineTest()
pipeline.run()

birds:train - exists, not redownloading
birds:test - exists, not redownloading
finished baseline for birds
emotions:train - exists, not redownloading
emotions:test - exists, not redownloading
finished baseline for emotions
mediamill:train - exists, not redownloading
mediamill:test - exists, not redownloading
