# 1. First Model Experiments

Experiments to handle the very first model for my thesis.

The plan is to implement a **basic stacking algorithm** using the existing Binary Relevance implementation, found in the `scikit-multilearn` library, and then implement the actual new model, which runs a specific feature selection process between each layer of the stacking algorithm.

The initial experimentation follows [this page](http://scikit.ml/tutorial.html).

## 1.1. Setup

In [1]:
import copy
import json
from typing import List, Optional, Any, Tuple, Dict

import numpy as np
import pandas as pd
import sklearn.metrics as metrics
from sklearn.feature_selection import f_classif
from sklearn.svm import SVC
from skmultilearn.base.problem_transformation import ProblemTransformationBase
from skmultilearn.dataset import load_dataset, available_data_sets
from skmultilearn.problem_transform import BinaryRelevance


## 1.2. Setting up a basic stacking classifier

At first, let's build a simple stacking implementation, so we can have a baseline to compare with the specialized stacking implementation.

In [2]:
available_data_sets()

defaultdict(list,
            {('bibtex', 'undivided'): ['5c1e474c2fd026519aec931a26ad18a6',
              'bibtex-undivided.scikitml.bz2'],
             ('bibtex', 'test'): ['d250caa297d060374f59318ad6b93771',
              'bibtex-test.scikitml.bz2'],
             ('bibtex', 'train'): ['1dd15daca7b8b2c17d692bdadce5dc31',
              'bibtex-train.scikitml.bz2'],
             ('birds', 'undivided'): ['1da06f4ae896800547dabf89044584e1',
              'birds-undivided.scikitml.bz2'],
             ('birds', 'test'): ['77fbfcc66d77040d3806c2a5ea4ff829',
              'birds-test.scikitml.bz2'],
             ('birds', 'train'): ['5c2bacaa5506e904b6501cc50ddfefe2',
              'birds-train.scikitml.bz2'],
             ('Corel5k', 'undivided'): ['062ea897821608035748a2a3b200d382',
              'Corel5k-undivided.scikitml.bz2'],
             ('Corel5k', 'test'): ['cb91444418a2f8b9814d10d4696af9f0',
              'Corel5k-test.scikitml.bz2'],
             ('Corel5k', 'train'): ['1863ec41b

In [3]:
# Load the predefined train and test variants of the `scene` dataset.
# Each call is unpacked into (features, labels, feature names, label names);
# the name lists are only kept from the train variant.
X_train, y_train, feature_names, label_names = load_dataset("scene", "train")
X_test, y_test, _, _ = load_dataset("scene", "test")

# using `scene` as this dataset seems to be a little bit more comprehensible than the others I tested

scene:train - exists, not redownloading
scene:test - exists, not redownloading


In [4]:
feature_names

[('Att1', 'NUMERIC'),
 ('Att2', 'NUMERIC'),
 ('Att3', 'NUMERIC'),
 ('Att4', 'NUMERIC'),
 ('Att5', 'NUMERIC'),
 ('Att6', 'NUMERIC'),
 ('Att7', 'NUMERIC'),
 ('Att8', 'NUMERIC'),
 ('Att9', 'NUMERIC'),
 ('Att10', 'NUMERIC'),
 ('Att11', 'NUMERIC'),
 ('Att12', 'NUMERIC'),
 ('Att13', 'NUMERIC'),
 ('Att14', 'NUMERIC'),
 ('Att15', 'NUMERIC'),
 ('Att16', 'NUMERIC'),
 ('Att17', 'NUMERIC'),
 ('Att18', 'NUMERIC'),
 ('Att19', 'NUMERIC'),
 ('Att20', 'NUMERIC'),
 ('Att21', 'NUMERIC'),
 ('Att22', 'NUMERIC'),
 ('Att23', 'NUMERIC'),
 ('Att24', 'NUMERIC'),
 ('Att25', 'NUMERIC'),
 ('Att26', 'NUMERIC'),
 ('Att27', 'NUMERIC'),
 ('Att28', 'NUMERIC'),
 ('Att29', 'NUMERIC'),
 ('Att30', 'NUMERIC'),
 ('Att31', 'NUMERIC'),
 ('Att32', 'NUMERIC'),
 ('Att33', 'NUMERIC'),
 ('Att34', 'NUMERIC'),
 ('Att35', 'NUMERIC'),
 ('Att36', 'NUMERIC'),
 ('Att37', 'NUMERIC'),
 ('Att38', 'NUMERIC'),
 ('Att39', 'NUMERIC'),
 ('Att40', 'NUMERIC'),
 ('Att41', 'NUMERIC'),
 ('Att42', 'NUMERIC'),
 ('Att43', 'NUMERIC'),
 ('Att44', 'NUMERIC'

In [5]:
label_names

[('Beach', ['0', '1']),
 ('Sunset', ['0', '1']),
 ('FallFoliage', ['0', '1']),
 ('Field', ['0', '1']),
 ('Mountain', ['0', '1']),
 ('Urban', ['0', '1'])]

Using `scene` as this dataset seems to be a little bit more comprehensible than the others I tested.

For more information regarding the datasets and how `scikit-multilearn` handles them, [read this page](http://scikit.ml/datasets.html).

In [6]:
# Baseline model: Binary Relevance with an SVC (default hyper-parameters) as base classifier.
# NOTE(review): `require_dense=[False, True]` presumably means "features may stay sparse,
# labels are passed dense" to the base classifier — confirm against the scikit-multilearn docs.
clf = BinaryRelevance(
    classifier=SVC(),
    require_dense=[False, True]
)


In [7]:
clf.fit(X_train, y_train)

BinaryRelevance(classifier=SVC(), require_dense=[False, True])

In [8]:
prediction = clf.predict(X_test)

In [9]:
prediction.todense()

matrix([[0, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        ...,
        [0, 0, 0, 0, 0, 1],
        [0, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0]], dtype=int64)

In [10]:
label_names

[('Beach', ['0', '1']),
 ('Sunset', ['0', '1']),
 ('FallFoliage', ['0', '1']),
 ('Field', ['0', '1']),
 ('Mountain', ['0', '1']),
 ('Urban', ['0', '1'])]

In [11]:
# Hamming loss of the plain Binary Relevance baseline on the test split;
# kept in a variable because it is compared against the stacking model further down.
regular_br_hamming_loss = metrics.hamming_loss(y_test, prediction)
regular_br_hamming_loss


0.08416945373467112

In [12]:
# Accuracy of the plain Binary Relevance baseline on the test split;
# kept in a variable because it is compared against the stacking model further down.
# NOTE(review): for multi-label targets scikit-learn documents `accuracy_score` as the
# subset (exact-match) accuracy — confirm that this is the metric intended here.
regular_br_accuracy_score = metrics.accuracy_score(y_test, prediction)
regular_br_accuracy_score


0.5869565217391305

In [13]:
class BasicStacking(ProblemTransformationBase):
    """Two-layer stacking built from two Binary Relevance models.

    The first layer is a plain Binary Relevance model. The second layer is
    another Binary Relevance model trained on the original features
    concatenated with the first layer's predictions for every label.

    Parameters
    ----------
    classifier : Any, optional
        Base classifier used by both layers (each layer gets its own deep
        copy). Defaults to ``SVC()`` when omitted.
    require_dense : list of bool, optional
        Forwarded to both Binary Relevance layers. Defaults to
        ``[False, True]`` when omitted.
    """

    first_layer_classifiers: BinaryRelevance
    second_layer_classifiers: BinaryRelevance

    def __init__(self, classifier: Any = None, require_dense: Optional[List[bool]] = None):
        # the raw arguments go to the base class untouched so that the stored
        # constructor parameters mirror exactly what the caller passed in
        super(BasicStacking, self).__init__(classifier, require_dense)

        base_classifier = SVC() if classifier is None else classifier
        layer_require_dense = [False, True] if require_dense is None else require_dense

        self.first_layer_classifiers = BinaryRelevance(
            classifier=copy.deepcopy(base_classifier),
            require_dense=layer_require_dense,
        )

        self.second_layer_classifiers = BinaryRelevance(
            classifier=copy.deepcopy(base_classifier),
            require_dense=layer_require_dense,
        )

    def fit(self, X: Any, y: Any):
        """Fit both layers and return ``self``.

        NOTE(review): the second layer is trained on first-layer predictions
        for the very samples the first layer was fitted on. Those in-sample
        predictions may be too optimistic to add information; out-of-fold
        predictions are the usual alternative — worth checking.
        """
        self.first_layer_classifiers.fit(X, y)
        self.second_layer_classifiers.fit(self._expand_features(X), y)
        return self

    def predict(self, X: Any):
        """Return the second layer's predictions for ``X``."""
        return self.second_layer_classifiers.predict(self._expand_features(X))

    def _expand_features(self, X: Any) -> np.ndarray:
        """Append the first-layer predictions for ``X`` as extra feature columns."""
        first_layer_predictions = self.first_layer_classifiers.predict(X)
        # accept sparse matrices as well as inputs that are already dense
        dense_X = X.todense() if hasattr(X, "todense") else X
        expanded = np.hstack([dense_X, first_layer_predictions.todense()])
        return np.asarray(expanded)


In [14]:
model = BasicStacking()
model.fit(X_train, y_train)

In [15]:
stacking_prediction = model.predict(X_test)

In [16]:
# Score the stacking predictions with the same two metrics used for the baseline.
stacking_hamming_loss = metrics.hamming_loss(y_test, stacking_prediction)
stacking_accuracy_score = metrics.accuracy_score(y_test, stacking_prediction)

# Side-by-side report: baseline first, separator, then stacking.
report_rows = (
    ("br hamming_loss", regular_br_hamming_loss),
    ("br accuracy", regular_br_accuracy_score),
    ("===",),
    ("stacking hamming_loss", stacking_hamming_loss),
    ("stacking accuracy", stacking_accuracy_score),
)
for report_row in report_rows:
    print(*report_row)


br hamming_loss 0.08416945373467112
br accuracy 0.5869565217391305
===
stacking hamming_loss 0.08416945373467112
stacking accuracy 0.5869565217391305


### Conclusion

Both the Binary Relevance and the Stacking approaches resulted in the exact same performance.

Possibilities:
* The stacking implementation is wrong.
  * To test this, we have to review the code and compare it to other stacking implementations; for instance: the stacking implementation in the `utiml` library for R.
* The stacking implementation is correct, but the labels are not correlated at all, meaning that the stacking approach is not useful.
  * To test this, we have to check other datasets and see if the stacking approach is useful in those cases.

## 1.3. Using other datasets

I will try to list all available datasets, then try all of them using Binary Relevance as a baseline and my Basic Stacking approach. The objective is to see if the performance metrics change for any of the datasets.

In [3]:
SplitDataset = Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]


class BasicStackingAgainstBinaryRelevanceBaselineTest:
    """Compare `BasicStacking` against a plain Binary Relevance baseline.

    For every dataset that is not on the skip list, both models are trained on
    the train variant and scored on the test variant; the collected metrics are
    written to a JSON file.
    """

    def run(self):
        """Evaluate both models on each dataset, saving the results as they accumulate."""
        collected = []

        for dataset_name in self.get_unique_available_data_set_names():
            split = self.get_data_set(dataset_name)

            baseline_accuracy, baseline_hamming = self.get_metrics_for_baseline_model(split)
            stacking_accuracy, stacking_hamming = self.get_metrics_for_basic_stacking(split)

            collected.append({
                "dataset_name": dataset_name,
                "baseline_accuracy": baseline_accuracy,
                "baseline_hamming_loss": baseline_hamming,
                "stacking_accuracy": stacking_accuracy,
                "stacking_hamming_loss": stacking_hamming
            })

            # the file is rewritten after every dataset, so an interrupted run
            # still leaves the metrics gathered so far on disk
            self.save_results(collected)
            print(f"finished baseline for {dataset_name}")

    def get_metrics_for_baseline_model(self, split_dataset: SplitDataset) -> Tuple[float, float]:
        """Return (accuracy, hamming loss) for Binary Relevance with an SVC base classifier."""
        baseline = BinaryRelevance(classifier=SVC(), require_dense=[False, True])
        return self.get_metrics_for_model(baseline, split_dataset)

    def get_metrics_for_basic_stacking(self, split_dataset: SplitDataset) -> Tuple[float, float]:
        """Return (accuracy, hamming loss) for a default-constructed `BasicStacking`."""
        return self.get_metrics_for_model(BasicStacking(), split_dataset)

    def get_metrics_for_model(self, clf: Any, split_dataset: SplitDataset) -> Tuple[float, float]:
        """Fit `clf` on the train part of the split and score it on the test part.

        Returns a tuple ``(accuracy_score, hamming_loss)``.
        """
        train_features, train_labels, test_features, test_labels = split_dataset

        clf.fit(train_features, train_labels)
        predicted = clf.predict(test_features)

        accuracy = metrics.accuracy_score(test_labels, predicted)
        hamming = metrics.hamming_loss(test_labels, predicted)
        return accuracy, hamming

    def save_results(self, results: List[Dict[str, Any]]):
        """Overwrite the metrics JSON file with `results`."""
        with open("1_first_model_experiments_other_datasets_metrics.json", "w") as output_file:
            json.dump(results, output_file, indent=4)

    def get_unique_available_data_set_names(self) -> List[str]:
        """List each available dataset name once, in catalogue order, minus the skipped ones.

        Raises an `Exception` when the catalogue of available datasets is `None`.
        """
        catalogue = available_data_sets()
        if catalogue is None:
            raise Exception("could not load available data sets")

        skip = (
            "bibtex",       # takes too long to train and test
            "Corel5k",      # apparently it has only a single class, which is not accepted by the classifier
            "delicious",    # takes too long to train and test
            "enron",        # apparently it has only a single class, which is not accepted by the classifier
            "genbase",      # apparently it has only a single class, which is not accepted by the classifier
            "mediamill",    # takes too long to train and test
            "medical",      # apparently it has only a single class, which is not accepted by the classifier
            "rcv1subset1",  # takes too long to train and test
        )

        # the catalogue is keyed by (name, variant); a dict keeps the first
        # occurrence of every name while preserving the iteration order
        kept_names = (name for name, _variant in catalogue if name not in skip)
        return list(dict.fromkeys(kept_names))

    def get_data_set(self, name: str) -> SplitDataset:
        """Load the train and test variants of `name` as (X_train, y_train, X_test, y_test).

        Raises an `Exception` when either variant cannot be loaded.
        """
        loaded = {}
        for variant in ("train", "test"):
            data = load_dataset(name, variant)
            if data is None:
                raise Exception(f"could not load data set {name}")
            loaded[variant] = data

        X_train, y_train, _, _ = loaded["train"]
        X_test, y_test, _, _ = loaded["test"]

        return X_train, y_train, X_test, y_test

# using `scene` as this dataset seems to be a little bit more comprehensible than the others I tested


In [16]:
# notice: this cell takes a long time to run
# the results are already saved in the file `1_first_model_experiments_other_datasets_metrics.json`
# so you can skip this cell

pipeline = BasicStackingAgainstBinaryRelevanceBaselineTest()
pipeline.run()

birds:train - exists, not redownloading
birds:test - exists, not redownloading
finished baseline for birds
emotions:train - exists, not redownloading
emotions:test - exists, not redownloading
finished baseline for emotions
rcv1subset2:train - does not exists downloading
Downloaded rcv1subset2-train
rcv1subset2:test - does not exists downloading
Downloaded rcv1subset2-test
finished baseline for rcv1subset2
rcv1subset3:train - does not exists downloading
Downloaded rcv1subset3-train
rcv1subset3:test - does not exists downloading
Downloaded rcv1subset3-test
finished baseline for rcv1subset3
rcv1subset4:train - does not exists downloading
Downloaded rcv1subset4-train
rcv1subset4:test - does not exists downloading
Downloaded rcv1subset4-test
finished baseline for rcv1subset4
rcv1subset5:train - does not exists downloading
Downloaded rcv1subset5-train
rcv1subset5:test - does not exists downloading
Downloaded rcv1subset5-test
finished baseline for rcv1subset5
scene:train - exists, not redownl

### Conclusion

The pipeline compares metrics obtained for a regular `BinaryRelevance` model and my basic implementation for a stacking model.

After running this comparison for almost all datasets, we can conclude that there's nearly no difference between using the regular `BinaryRelevance` and the stacking implementation.

These are the datasets that were tested:

- birds
- emotions
- rcv1subset2
- rcv1subset3
- rcv1subset4
- rcv1subset5
- scene

Other datasets were skipped for taking an abnormally long time to run. Also, some final datasets were not used as I considered I had enough results that were representative of the whole.

Some datasets take considerably longer to train and end up with an _accuracy_ of 0.00. This is less alarming than it looks: for multi-label targets, scikit-learn's `accuracy_score` is the subset (exact-match) accuracy, so 0.00 only means that no sample had its complete label set predicted correctly. For these datasets the _hamming loss_ still has a value `>0`, and there's a very tiny difference between the stacking and the regular binary relevance models. **So we can say that the implementation seems to be doing at least _something_, even if the effect is minimal**.

## 1.4. Comparing to other stacking implementations

**TODO: move this section to after the "using other datasets".**

Let's compare my implementation of the stacking classifier to the one from [utiml](https://github.com/rivolli/utiml).

[Here is a guide](https://cran.r-project.org/web/packages/utiml/vignettes/utiml-overview.html) to get started with the `utiml` library.

2023-08-17: I tried to read the source code, but I could not spot exactly how the stacking is being implemented. As a matter of fact, I don't even know if my understanding of the stacking is correct.

For now, I will proceed with implementing the actual model, and then I will try to go back and debug all of this.

## 1.5. Implementing the specialized model

Model based on this article: ["An efficient stacking model with label selection for multi-label classification"](https://link.springer.com/article/10.1007/s10489-020-01807-z).

Let's start with some basic experiments to understand how to code certain sections of the model.

In [17]:
# doing some short experiments below
# lets start with a baseline model

name = "scene"

# Short experiment: rebuild the Binary Relevance baseline on `scene`, so that
# its predictions (`preds`) are available to the cells below.
train_data, test_data = (load_dataset(name, variant) for variant in ("train", "test"))

X_train, y_train, _, _ = train_data
X_test, y_test, _, _ = test_data

clf = BinaryRelevance(classifier=SVC(), require_dense=[False, True])
clf.fit(X_train, y_train)

preds = clf.predict(X_test)

baseline_accuracy = metrics.accuracy_score(y_test, preds)
print("Accuracy score: ", baseline_accuracy)
baseline_hamming_loss = metrics.hamming_loss(y_test, preds)
print("Hamming loss: ", baseline_hamming_loss)


scene:train - exists, not redownloading
scene:test - exists, not redownloading
Accuracy score:  0.5869565217391305
Hamming loss:  0.08416945373467112


In [18]:
preds.todense().shape


(1196, 6)

In [19]:
preds.shape[1]

6

In [90]:
labels_count = preds.shape[1]

# densify once up front: `preds` is sparse and `.todense()` builds a full copy,
# which the loop would otherwise repeat twice for every label pair
dense_preds = np.asarray(preds.todense())

results = []

for i in range(labels_count):
    # `f_classif` expects a 2-D feature matrix, hence the single-column reshape
    X = dense_preds[:, i].reshape(-1, 1)

    for j in range(labels_count):
        if i == j:
            continue

        y = dense_preds[:, j].reshape(-1)

        # as per the documentation: https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.f_classif.html
        # `f_classif` returns a tuple of two arrays, the first one is the F-value and the second one is the p-value
        # we only need the F-value; with a single feature column that array has one
        # entry, which is indexed explicitly because calling `float()` on a
        # 1-element array is deprecated in recent NumPy releases
        f_values, _p_values = f_classif(X, y)
        f_test_result = float(f_values[0])

        results.append({
            "label_being_tested": i,
            "against_label": j,
            "f_test_result": f_test_result
        })


In [91]:
results

[{'label_being_tested': 0,
  'against_label': 1,
  'f_test_result': 28.525597269624583},
 {'label_being_tested': 0,
  'against_label': 2,
  'f_test_result': 30.483421750663112},
 {'label_being_tested': 0,
  'against_label': 3,
  'f_test_result': 33.838056680161955},
 {'label_being_tested': 0,
  'against_label': 4,
  'f_test_result': 17.263665283934625},
 {'label_being_tested': 0,
  'against_label': 5,
  'f_test_result': 19.69910729184047},
 {'label_being_tested': 1,
  'against_label': 0,
  'f_test_result': 28.52559726962455},
 {'label_being_tested': 1,
  'against_label': 2,
  'f_test_result': 26.642728910601583},
 {'label_being_tested': 1,
  'against_label': 3,
  'f_test_result': 32.59656218402426},
 {'label_being_tested': 1,
  'against_label': 4,
  'f_test_result': 10.297689123645533},
 {'label_being_tested': 1,
  'against_label': 5,
  'f_test_result': 18.98438159560994},
 {'label_being_tested': 2,
  'against_label': 0,
  'f_test_result': 30.48342175066313},
 {'label_being_tested': 2,

In [22]:
class StackingWithFTests(ProblemTransformationBase):
    """Stacking model whose meta classifiers only see F-test-selected labels.

    First layer: a Binary Relevance model. Second layer: one classifier per
    label, trained on the original features plus the first-layer predictions
    of the labels chosen for it by `get_map_of_correlated_labels`.

    Parameters
    ----------
    classifier : Any, optional
        Base classifier; a deep copy is used for the first layer and for
        every meta classifier. Defaults to ``SVC()`` when omitted.
    require_dense : list of bool, optional
        Forwarded to the first-layer Binary Relevance model. Defaults to
        ``[False, True]`` when omitted.
    alpha : float, optional
        Share of a label's summed F statistics that its selected labels may
        accumulate; must lie in ``(0.0, 1.0]``. Defaults to ``0.6``.
    """

    first_layer_classifiers: BinaryRelevance
    second_layer_classifiers: List[Any]  # one fitted meta classifier per label

    # columns of `correlated_labels_map`; fixing them keeps the lookups in
    # `fit`/`predict` valid even when no label gets selected at all
    _MAP_COLUMNS = ["for_label", "expand_this_label", "f_test_result"]

    def __init__(
        self,
        classifier: Any = None,
        require_dense: Optional[List[bool]] = None,
        alpha: float = 0.6,
    ):
        # the raw arguments go to the base class untouched so that the stored
        # constructor parameters mirror exactly what the caller passed in
        super(StackingWithFTests, self).__init__(classifier, require_dense)

        self.alpha = alpha
        self._base_classifier = SVC() if classifier is None else classifier

        self.first_layer_classifiers = BinaryRelevance(
            classifier=copy.deepcopy(self._base_classifier),
            require_dense=[False, True] if require_dense is None else require_dense,
        )

        self.second_layer_classifiers = []
        self.correlated_labels_map = pd.DataFrame(columns=self._MAP_COLUMNS)
        self._is_fitted = False

    def fit(self, X: Any, y: Any):
        """Fit the first layer, select labels per target, fit the meta classifiers.

        Returns ``self``. Raises `ValueError` when ``alpha`` is outside
        ``(0.0, 1.0]``.
        """
        self.first_layer_classifiers.fit(X, y)

        predictions = self.first_layer_classifiers.predict(X)
        # TODO: does it really make sense to take predictions over the same set of data used for training?

        f_tested_label_pairs = self.calculate_f_test_for_all_label_pairs(predictions)
        self.correlated_labels_map = self.get_map_of_correlated_labels(f_tested_label_pairs, self.alpha)

        # densify once instead of once per label
        dense_X = self._to_dense_array(X)
        dense_y = self._to_dense_array(y)
        dense_predictions = self._to_dense_array(predictions)

        # start from an empty list so that refitting replaces the previous meta
        # classifiers instead of appending behind them
        self.second_layer_classifiers = []

        labels_count = dense_predictions.shape[1]
        for i in range(labels_count):
            X_expanded = self._expand_features(dense_X, dense_predictions, i)
            y_label_specific = self.convert_matrix_to_vector(dense_y[:, i])

            meta_classifier = copy.deepcopy(self._base_classifier)
            meta_classifier.fit(X_expanded, y_label_specific)

            self.second_layer_classifiers.append(meta_classifier)
            print(f"finished training meta classifier for label {i}")

        self._is_fitted = True
        return self

    def calculate_f_test_for_all_label_pairs(self, predictions: Any) -> List[Dict[str, Any]]:
        """Run `f_classif` for every ordered pair of distinct predicted labels.

        Returns one dict per pair with the keys ``label_being_tested``,
        ``against_label`` and ``f_test_result``.
        """
        dense_predictions = self._to_dense_array(predictions)
        labels_count = dense_predictions.shape[1]
        results = []

        for i in range(labels_count):
            # `f_classif` expects a 2-D feature matrix: a single column here
            base_label = self.convert_matrix_to_array(dense_predictions[:, i])

            for j in range(labels_count):
                if i == j:
                    continue

                against_label = self.convert_matrix_to_vector(dense_predictions[:, j])

                # one feature column in, so the F array holds one entry; index it
                # explicitly because `float()` on a 1-element array is deprecated
                # in recent NumPy releases
                f_values, _p_values = f_classif(base_label, against_label)

                results.append({
                    "label_being_tested": i,
                    "against_label": j,
                    "f_test_result": float(f_values[0])
                })

        return results

    def convert_matrix_to_array(self, matrix: Any):
        """Return `matrix` as a 2-D ndarray with a single column."""
        return np.asarray(matrix).reshape(-1, 1)

    def convert_matrix_to_vector(self, matrix: Any):
        """Return `matrix` flattened into a 1-D ndarray."""
        return np.asarray(matrix).reshape(-1)

    def get_map_of_correlated_labels(self, f_test_results: List[Dict[str, Any]], alpha: float) -> pd.DataFrame:
        """Pick, for each label, the other labels to feed into its meta classifier.

        For every ``label_being_tested`` the candidates are walked in
        descending F order and kept while their cumulative F statistic does
        not exceed ``alpha`` times the label's total F statistic.

        Returns a DataFrame with the columns ``for_label``,
        ``expand_this_label`` and ``f_test_result``; it has no rows when
        nothing is selected. Raises `ValueError` when ``alpha`` is outside
        ``(0.0, 1.0]``.
        """
        if not 0.0 < alpha <= 1.0:
            raise ValueError(f"alpha must be > 0.0 and <= 1.0, got {alpha}")

        if not f_test_results:
            # nothing to rank (e.g. a single label yields no pairs)
            return pd.DataFrame(columns=self._MAP_COLUMNS)

        temp_df = pd.DataFrame(f_test_results)

        # a NaN statistic (which `f_classif` can produce, e.g. for a constant
        # column) would turn the running sum into NaN and disable the
        # threshold check below, so such pairs are never selected
        temp_df = temp_df.dropna(subset=["f_test_result"])

        sorted_temp_df = temp_df.sort_values(
            by=["label_being_tested", "f_test_result"],
            ascending=[True, False])
        # this will make it easier to select the features afterwards

        selected_features = []

        # iterate over the labels actually present rather than a fixed count;
        # `unique()` keeps first-appearance order, i.e. ascending after the sort
        for label in sorted_temp_df["label_being_tested"].unique():
            split_df = sorted_temp_df[sorted_temp_df["label_being_tested"] == label]

            big_f = split_df["f_test_result"].sum()
            max_cum_f = alpha * big_f

            cum_f = 0
            for against_label, f_value in zip(split_df["against_label"], split_df["f_test_result"]):
                cum_f += f_value
                if cum_f > max_cum_f:
                    break

                selected_features.append({
                    "for_label": int(label),
                    "expand_this_label": int(against_label),
                    "f_test_result": float(f_value),
                })

        return pd.DataFrame(selected_features, columns=self._MAP_COLUMNS)

    def predict(self, X: Any) -> List[Any]:
        """Predict every label with its meta classifier.

        Returns a list with one prediction vector per label (so the result is
        laid out labels x samples; transpose it to compare against a
        samples x labels target matrix). Raises `Exception` when called
        before `fit`.
        """
        # an explicit flag is used because the selection map may legitimately
        # be empty after training
        if not getattr(self, "_is_fitted", False):
            raise Exception("model was not trained yet")

        predictions = self.first_layer_classifiers.predict(X)

        dense_X = self._to_dense_array(X)
        dense_predictions = self._to_dense_array(predictions)

        return [
            meta_classifier.predict(self._expand_features(dense_X, dense_predictions, i))
            for i, meta_classifier in enumerate(self.second_layer_classifiers)
        ]

    def _to_dense_array(self, matrix: Any) -> np.ndarray:
        """Return `matrix` as an ndarray, densifying it first when it is sparse."""
        dense = matrix.todense() if hasattr(matrix, "todense") else matrix
        return np.asarray(dense)

    def _expand_features(self, dense_X: np.ndarray, dense_predictions: np.ndarray, label: int) -> np.ndarray:
        """Append to `dense_X` the prediction columns selected for `label`."""
        mask = self.correlated_labels_map["for_label"] == label
        labels_to_expand = self.correlated_labels_map.loc[mask, "expand_this_label"].astype(int).to_list()
        return np.hstack([dense_X, dense_predictions[:, labels_to_expand]])


In [23]:
clf = StackingWithFTests()
clf.fit(X_train, y_train)


In [84]:
a = clf.predict(X_test)

In [97]:
# `StackingWithFTests.predict` returns one prediction vector per label, so `a` is
# laid out labels x samples; transpose it to samples x labels before scoring
# it against `y_test`.
b = np.asarray(a).T

In [98]:
# Score the F-test stacking predictions (`b`) with the same two metrics used for
# the Binary Relevance baseline, so the printed numbers are directly comparable.
accuracy_score = metrics.accuracy_score(y_test, b)
hamming_loss = metrics.hamming_loss(y_test, b)

print("Accuracy score: ", accuracy_score)
print("Hamming loss: ", hamming_loss)


Accuracy score:  0.5994983277591973
Hamming loss:  0.08319397993311037
