# Getting metrics for the first model

Let's try to update my report with the metrics for the first model.

Based on the tests run in the first notebook (which checked which datasets can be processed in a reasonable time), we are going to use these default datasets:

* `birds`
* `emotions`
* `scene`

The **baseline** will be the regular Binary Relevance. We will then compare it to the Basic Stacking approach. Finally, we will run Stacking With F-Test, with `alpha=0.5`. All other parameters of every model will keep their default values, and `SVC` will be used as the base classifier.

The metrics we will use, as defined in my report, are the Hamming loss and the F1 score. We will use the `EvaluationPipeline` class to run the experiments.

## Setup

In [1]:
import os
import sys

# Put the parent directory on the import path (added only once);
# presumably this is what lets the project-local `lib` package be imported.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [2]:
import copy
import json
from typing import List, Optional, Any, Tuple, Dict

import numpy as np
import pandas as pd
import sklearn.metrics as metrics
from sklearn.feature_selection import f_classif
from sklearn.svm import SVC
from skmultilearn.base.problem_transformation import ProblemTransformationBase
from skmultilearn.dataset import load_dataset, available_data_sets
from skmultilearn.problem_transform import BinaryRelevance

from lib.evaluation import EvaluationPipeline


## Models

In [2]:
# TODO: move this to an actual python file

class BasicStacking(ProblemTransformationBase):
    """Two-layer stacking built from two Binary Relevance models.

    The first layer is fitted on the original features. The second layer is
    fitted on the original features concatenated with the first layer's
    predictions for every label.
    """

    first_layer_classifiers: BinaryRelevance
    second_layer_classifiers: BinaryRelevance

    def __init__(self, classifier: Any = None, require_dense: Optional[List[bool]] = None):
        """Create both layers.

        Args:
            classifier: base classifier used by both layers. Each layer gets
                its own deep copy. When ``None``, ``SVC()`` is used.
            require_dense: forwarded unchanged to the parent constructor.
        """
        super(BasicStacking, self).__init__(classifier, require_dense)

        self.first_layer_classifiers = BinaryRelevance(
            classifier=self._new_base_classifier(classifier),
            require_dense=[False, True]
        )

        self.second_layer_classifiers = BinaryRelevance(
            classifier=self._new_base_classifier(classifier),
            require_dense=[False, True]
        )

    @staticmethod
    def _new_base_classifier(classifier: Any) -> Any:
        """Return a fresh base classifier: ``SVC()`` by default, else a deep copy."""
        if classifier is None:
            return SVC()
        return copy.deepcopy(classifier)

    @staticmethod
    def _expand_features(X: Any, first_layer_predictions: Any) -> Any:
        """Concatenate the dense features with the dense first-layer predictions."""
        return np.hstack([X.todense(), first_layer_predictions.todense()])

    def fit(self, X: Any, y: Any):
        """Fit the first layer on ``X``, then the second layer on the expanded features.

        Args:
            X: feature matrix exposing ``.todense()``.
            y: label matrix accepted by ``BinaryRelevance.fit``.

        Returns:
            ``self``, so that calls can be chained.
        """
        self.first_layer_classifiers.fit(X, y)

        # The second layer is trained on the first layer's predictions for
        # the training data itself.
        first_layer_predictions = self.first_layer_classifiers.predict(X)
        X_expanded = self._expand_features(X, first_layer_predictions)

        self.second_layer_classifiers.fit(X_expanded, y)
        return self

    def predict(self, X: Any):
        """Predict labels for ``X`` by chaining both layers.

        Args:
            X: feature matrix exposing ``.todense()``.

        Returns:
            Whatever the second-layer ``BinaryRelevance.predict`` returns.
        """
        first_layer_predictions = self.first_layer_classifiers.predict(X)
        X_expanded = self._expand_features(X, first_layer_predictions)
        return self.second_layer_classifiers.predict(X_expanded)


In [3]:
# TODO move this to an actual python file

class StackingWithFTests(ProblemTransformationBase):
    """Stacking where each label's meta classifier only sees selected labels.

    A first-layer Binary Relevance model is fitted on the original features.
    For every label, the other labels are ranked by the F-statistic computed
    between the two label columns, and the top-ranked ones are kept while
    their cumulative F stays within ``alpha`` times the total F. One meta
    classifier per label is then fitted on the original features plus the
    columns of the selected labels.
    """

    alpha: float
    use_first_layer_to_calculate_correlations: bool
    
    first_layer_classifiers: BinaryRelevance
    second_layer_classifiers: List[Any] # TODO should be any generic type of classifier
    labels_count: int

    def __init__(
        self,
        alpha: float = 0.5,
        use_first_layer_to_calculate_correlations: bool = False,
        classifier: Any = None,
        require_dense: Optional[List[bool]] = None
    ):
        """Configure the model.

        Args:
            alpha: fraction (0.0 to 1.0) of the total F-statistic that the
                selected labels may accumulate for each target label.
            use_first_layer_to_calculate_correlations: when True, the
                first-layer predictions (instead of ``y``) are used both for
                the F-tests and as extra training features.
            classifier: base classifier, deep-copied for the first layer and
                for every meta classifier. When ``None``, ``SVC()`` is used.
            require_dense: forwarded unchanged to the parent constructor.

        Raises:
            ValueError: if ``alpha`` is outside ``[0.0, 1.0]`` (NaN included).
        """
        super(StackingWithFTests, self).__init__(classifier, require_dense)

        # Written as a chained comparison so that NaN is rejected as well.
        if not 0.0 <= alpha <= 1.0:
            raise ValueError("alpha must be >= 0.0 and <= 1.0")

        self.alpha = alpha
        self.use_first_layer_to_calculate_correlations = use_first_layer_to_calculate_correlations

        # NOTE(review): scikit-multilearn appears to build get_params()/clone()
        # from `copyable_attrs`; registering the extra hyper-parameters here
        # should keep them from falling back to defaults when the model is
        # cloned - confirm against the installed version.
        inherited_attrs = list(getattr(self, "copyable_attrs", []))
        for attr in ("alpha", "use_first_layer_to_calculate_correlations"):
            if attr not in inherited_attrs:
                inherited_attrs.append(attr)
        self.copyable_attrs = inherited_attrs

        self.first_layer_classifiers = BinaryRelevance(
            classifier=self._new_base_classifier(),
            require_dense=[False, True]
        )

        self.second_layer_classifiers = []
        self.correlated_labels_map = pd.DataFrame()
        self.labels_count = 0

    def _new_base_classifier(self) -> Any:
        """Return a fresh base classifier: ``SVC()`` by default, else a deep copy."""
        if self.classifier is None:
            return SVC()
        return copy.deepcopy(self.classifier)

    def _labels_to_expand_for(self, label_index: int) -> List[int]:
        """Return the labels selected as extra features for ``label_index``."""
        mask = self.correlated_labels_map["for_label"] == label_index
        return self.correlated_labels_map.loc[mask, "expand_this_label"].to_list()

    def fit(self, X: Any, y: Any):
        """Fit the first layer, select labels via F-tests, fit the meta classifiers.

        Args:
            X: feature matrix exposing ``.todense()``.
            y: label matrix exposing ``.shape`` and ``.todense()``.

        Returns:
            ``self``, so that calls can be chained.
        """
        self.labels_count = y.shape[1]
        # Drop classifiers from any previous fit; predict() indexes this list
        # by label, so leftovers would silently be used instead of new ones.
        self.second_layer_classifiers = []

        self.first_layer_classifiers.fit(X, y)
        
        label_classifications = y
        if self.use_first_layer_to_calculate_correlations:
            label_classifications = self.first_layer_classifiers.predict(X)

        f_tested_label_pairs = self.calculate_f_test_for_all_label_pairs(label_classifications)
        self.correlated_labels_map = self.get_map_of_correlated_labels(f_tested_label_pairs)

        # Densify once instead of once per label.
        X_dense = X.todense()
        y_dense = y.todense()
        labels_dense = label_classifications.todense()

        for i in range(self.labels_count):
            additional_input = labels_dense[:, self._labels_to_expand_for(i)]

            X_expanded = np.asarray(np.hstack([X_dense, additional_input]))
            y_label_specific = self.convert_matrix_to_vector(y_dense[:, i])

            meta_classifier = self._new_base_classifier()
            meta_classifier.fit(X_expanded, y_label_specific)

            self.second_layer_classifiers.append(meta_classifier)
            print(f"finished training meta classifier for label {i}")

        return self
    
    def calculate_f_test_for_all_label_pairs(self, label_classifications: Any) -> List[Dict[str, Any]]:
        """Compute the F-statistic for every ordered pair of distinct labels.

        Args:
            label_classifications: label matrix exposing ``.todense()``, with
                ``self.labels_count`` columns.

        Returns:
            One dict per ordered pair ``(i, j)`` with ``i != j``, holding
            ``label_being_tested`` (i), ``against_label`` (j) and
            ``f_test_result`` (F of column i tested against column j).
        """
        dense_labels = label_classifications.todense()
        results = []

        for i in range(self.labels_count):
            # Column i is the single "feature" handed to f_classif.
            base_label = self.convert_matrix_to_array(dense_labels[:, i])

            for j in range(self.labels_count):
                if i == j:
                    continue

                against_label = self.convert_matrix_to_vector(dense_labels[:, j])

                # f_classif returns (F, p-values); F has one entry per feature
                # column, so take that single entry explicitly.
                f_values = f_classif(base_label, against_label)[0]

                results.append({
                    "label_being_tested": i,
                    "against_label": j,
                    "f_test_result": float(f_values[0])
                })
        
        return results
    
    def convert_matrix_to_array(self, matrix: Any):
        """Return ``matrix`` as an ndarray reshaped into a single column."""
        return np.asarray(matrix).reshape(-1, 1)

    def convert_matrix_to_vector(self, matrix: Any):
        """Return ``matrix`` as a flat 1-D ndarray."""
        return np.asarray(matrix).reshape(-1)
    
    def get_map_of_correlated_labels(self, f_test_results: List[Dict[str, Any]]) -> pd.DataFrame:
        """Select, for every label, the labels used as extra features.

        Args:
            f_test_results: output of ``calculate_f_test_for_all_label_pairs``.

        Returns:
            DataFrame with columns ``for_label``, ``expand_this_label`` and
            ``f_test_result``; one row per selected (target, extra) pair.
        """
        cols = ["for_label", "expand_this_label", "f_test_result"]

        # With fewer than two labels there are no pairs, and sorting an empty
        # frame by missing columns would raise.
        if not f_test_results:
            return pd.DataFrame([], columns=cols)

        temp_df = pd.DataFrame(f_test_results)
        
        sorted_temp_df = temp_df.sort_values(
            by=["label_being_tested", "f_test_result"],
            ascending=[True, False])
        # ordering in descending order by the F-test result,
        # following what the main article describes

        selected_features = []

        for i in range(0, self.labels_count):
            mask = sorted_temp_df["label_being_tested"] == i
            split_df = sorted_temp_df[mask].reset_index(drop=True)

            big_f = split_df["f_test_result"].sum()
            max_cum_f = self.alpha * big_f

            cum_f = 0
            for _, row in split_df.iterrows():
                cum_f += row["f_test_result"]
                # The label that pushes the running sum over the budget is
                # itself excluded.
                if cum_f > max_cum_f:
                    break

                selected_features.append({
                    "for_label": i,
                    "expand_this_label": int(row["against_label"]),
                    "f_test_result": float(row["f_test_result"]),
                })
        
        return pd.DataFrame(selected_features, columns=cols)
    
    def predict(self, X: Any) -> np.ndarray[Any,Any]:
        """Predict every label with its own meta classifier.

        Args:
            X: feature matrix exposing ``.todense()``.

        Returns:
            Dense array with one row per sample and one column per label.

        Raises:
            Exception: if the model has not been fitted yet.
        """
        if self.correlated_labels_map.columns.size == 0:
            raise Exception("model was not trained yet")

        predictions = self.first_layer_classifiers.predict(X)
        local_labels_count = predictions.shape[1]

        # Densify once instead of once per label.
        X_dense = X.todense()
        predictions_dense = predictions.todense()

        second_layer_predictions = []

        for i in range(local_labels_count):
            additional_input = predictions_dense[:, self._labels_to_expand_for(i)]

            X_expanded = np.asarray(np.hstack([X_dense, additional_input]))

            temp_preds = self.second_layer_classifiers[i].predict(X_expanded)
            second_layer_predictions.append(temp_preds)

        # Stacked per label (labels x samples), so transpose to samples x labels.
        reshaped_array = np.asarray(second_layer_predictions).T
        return reshaped_array


## Datasets

In [62]:
desired_datasets = ["scene", "emotions", "birds"]

# For each dataset keep the undivided data plus the "train" and "test"
# variants, along with the row and label counts of the undivided data.
datasets = {}
for dataset_name in desired_datasets:
    print(f"getting dataset `{dataset_name}`")

    X, y, _, _ = load_dataset(dataset_name, "undivided")
    X_train, y_train, _, _ = load_dataset(dataset_name, "train")
    X_test, y_test, _, _ = load_dataset(dataset_name, "test")

    datasets[dataset_name] = dict(
        X=X,
        y=y,
        X_train=X_train,
        y_train=y_train,
        X_test=X_test,
        y_test=y_test,
        rows=X.shape[0],
        labels_count=y.shape[1],
    )


# Short per-dataset summary.
for name, info in datasets.items():
    print(
        "===",
        f"information for dataset `{name}`",
        f"rows: {info['rows']}, labels: {info['labels_count']}",
        sep="\n",
    )


getting dataset `scene`
scene:undivided - exists, not redownloading
scene:train - exists, not redownloading
scene:test - exists, not redownloading
getting dataset `emotions`
emotions:undivided - exists, not redownloading
emotions:train - exists, not redownloading
emotions:test - exists, not redownloading
getting dataset `birds`
birds:undivided - exists, not redownloading
birds:train - exists, not redownloading
birds:test - exists, not redownloading
===
information for dataset `scene`
rows: 2407, labels: 6
===
information for dataset `emotions`
rows: 593, labels: 6
===
information for dataset `birds`
rows: 645, labels: 19


In [81]:
from sklearn.model_selection import train_test_split

# Quick hold-out split of the undivided `emotions` data (converted to dense
# arrays): 33% goes to the test side, with a fixed seed.
qf_x_train, qf_x_test, qf_y_train, qf_y_test = train_test_split(
    datasets["emotions"]["X"].toarray(),
    datasets["emotions"]["y"].toarray(),
    test_size=0.33,
    random_state=42,
)

## Evaluations

In [5]:
# Registry of the three models under comparison, keyed by the name used in
# the result dictionaries.
models = {
    "baseline_binary_relevance_model": BinaryRelevance(
        classifier=SVC(),
        require_dense=[False, True],
    ),
    "basic_stacking_model": BasicStacking(),
    "stacking_with_f_tests_model": StackingWithFTests(alpha=0.5),
}

# Keep direct handles to each model as well.
baseline_binary_relevance_model = models["baseline_binary_relevance_model"]
basic_stacking_model = models["basic_stacking_model"]
stacking_with_f_tests_model = models["stacking_with_f_tests_model"]



In [6]:
# Results are collected here as model name -> dataset name -> result.
evaluation_results = dict()


In [42]:
# Run the evaluation pipeline for every (model, dataset) pair and store each
# result object under evaluation_results[model][dataset].
n_folds = 10

for model_name, model in models.items():
    print(f"# running model `{model_name}`")

    results_for_model = evaluation_results[model_name] = {}

    # One pipeline per model, reused for all datasets.
    evaluation_pipeline = EvaluationPipeline(model, n_folds)

    for dataset_name, info in datasets.items():
        print(f"## running dataset `{dataset_name}`")

        result = evaluation_pipeline.run(info["X"], info["y"])
        results_for_model[dataset_name] = result

        print("results obtained:")
        result.describe()


# running model `baseline_binary_relevance_model`
## running dataset `scene`


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


results obtained:
Accuracy: 0.4910 ± 0.21
Hamming Loss: -0.1107 ± 0.05
F1 score: 0.3016 ± 0.09
## running dataset `emotions`
results obtained:
Accuracy: 0.0186 ± 0.02
Hamming Loss: -0.3016 ± 0.03
F1 score: 0.0640 ± 0.02
## running dataset `birds`


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


results obtained:
Accuracy: 0.4637 ± 0.07
Hamming Loss: -0.0535 ± 0.01
F1 score: 0.0126 ± 0.01
# running model `basic_stacking_model`
## running dataset `scene`


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


results obtained:
Accuracy: 0.4910 ± 0.21
Hamming Loss: -0.1107 ± 0.05
F1 score: 0.3016 ± 0.09
## running dataset `emotions`
results obtained:
Accuracy: 0.0186 ± 0.02
Hamming Loss: -0.3016 ± 0.03
F1 score: 0.0640 ± 0.02
## running dataset `birds`


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


results obtained:
Accuracy: 0.4637 ± 0.07
Hamming Loss: -0.0535 ± 0.01
F1 score: 0.0126 ± 0.01
# running model `stacking_with_f_tests_model`
## running dataset `scene`
finished training meta classifier for label 0
finished training meta classifier for label 1
finished training meta classifier for label 2
finished training meta classifier for label 3
finished training meta classifier for label 4
finished training meta classifier for label 5
finished training meta classifier for label 0
finished training meta classifier for label 1
finished training meta classifier for label 2
finished training meta classifier for label 3
finished training meta classifier for label 4
finished training meta classifier for label 5
finished training meta classifier for label 0
finished training meta classifier for label 1
finished training meta classifier for label 2
finished training meta classifier for label 3
finished training meta classifier for label 4
finished training meta classifier for label 5
fini

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


finished training meta classifier for label 0
finished training meta classifier for label 1
finished training meta classifier for label 2
finished training meta classifier for label 3
finished training meta classifier for label 4
finished training meta classifier for label 5
finished training meta classifier for label 0
finished training meta classifier for label 1
finished training meta classifier for label 2
finished training meta classifier for label 3
finished training meta classifier for label 4
finished training meta classifier for label 5
finished training meta classifier for label 0
finished training meta classifier for label 1
finished training meta classifier for label 2
finished training meta classifier for label 3
finished training meta classifier for label 4
finished training meta classifier for label 5
finished training meta classifier for label 0
finished training meta classifier for label 1
finished training meta classifier for label 2
finished training meta classifier 

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


finished training meta classifier for label 0
finished training meta classifier for label 1
finished training meta classifier for label 2
finished training meta classifier for label 3
finished training meta classifier for label 4
finished training meta classifier for label 5
results obtained:
Accuracy: 0.5982 ± 0.13
Hamming Loss: -0.0985 ± 0.04
F1 score: 0.3224 ± 0.07
## running dataset `emotions`
finished training meta classifier for label 0
finished training meta classifier for label 1
finished training meta classifier for label 2
finished training meta classifier for label 3
finished training meta classifier for label 4
finished training meta classifier for label 5
finished training meta classifier for label 0
finished training meta classifier for label 1
finished training meta classifier for label 2
finished training meta classifier for label 3
finished training meta classifier for label 4
finished training meta classifier for label 5
finished training meta classifier for label 0
f

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


finished training meta classifier for label 0
finished training meta classifier for label 1
finished training meta classifier for label 2
finished training meta classifier for label 3
finished training meta classifier for label 4
finished training meta classifier for label 5
finished training meta classifier for label 6
finished training meta classifier for label 7
finished training meta classifier for label 8
finished training meta classifier for label 9
finished training meta classifier for label 10
finished training meta classifier for label 11
finished training meta classifier for label 12
finished training meta classifier for label 13
finished training meta classifier for label 14
finished training meta classifier for label 15
finished training meta classifier for label 16
finished training meta classifier for label 17
finished training meta classifier for label 18


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


finished training meta classifier for label 0
finished training meta classifier for label 1
finished training meta classifier for label 2
finished training meta classifier for label 3
finished training meta classifier for label 4
finished training meta classifier for label 5
finished training meta classifier for label 6
finished training meta classifier for label 7
finished training meta classifier for label 8
finished training meta classifier for label 9
finished training meta classifier for label 10
finished training meta classifier for label 11
finished training meta classifier for label 12
finished training meta classifier for label 13
finished training meta classifier for label 14
finished training meta classifier for label 15
finished training meta classifier for label 16
finished training meta classifier for label 17
finished training meta classifier for label 18


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


finished training meta classifier for label 0
finished training meta classifier for label 1
finished training meta classifier for label 2
finished training meta classifier for label 3
finished training meta classifier for label 4
finished training meta classifier for label 5
finished training meta classifier for label 6
finished training meta classifier for label 7
finished training meta classifier for label 8
finished training meta classifier for label 9
finished training meta classifier for label 10
finished training meta classifier for label 11
finished training meta classifier for label 12
finished training meta classifier for label 13
finished training meta classifier for label 14
finished training meta classifier for label 15
finished training meta classifier for label 16
finished training meta classifier for label 17
finished training meta classifier for label 18


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


finished training meta classifier for label 0
finished training meta classifier for label 1
finished training meta classifier for label 2
finished training meta classifier for label 3
finished training meta classifier for label 4
finished training meta classifier for label 5
finished training meta classifier for label 6
finished training meta classifier for label 7
finished training meta classifier for label 8
finished training meta classifier for label 9
finished training meta classifier for label 10
finished training meta classifier for label 11
finished training meta classifier for label 12
finished training meta classifier for label 13
finished training meta classifier for label 14
finished training meta classifier for label 15
finished training meta classifier for label 16
finished training meta classifier for label 17
finished training meta classifier for label 18


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


finished training meta classifier for label 0
finished training meta classifier for label 1
finished training meta classifier for label 2
finished training meta classifier for label 3
finished training meta classifier for label 4
finished training meta classifier for label 5
finished training meta classifier for label 6
finished training meta classifier for label 7
finished training meta classifier for label 8
finished training meta classifier for label 9
finished training meta classifier for label 10
finished training meta classifier for label 11
finished training meta classifier for label 12
finished training meta classifier for label 13
finished training meta classifier for label 14
finished training meta classifier for label 15
finished training meta classifier for label 16
finished training meta classifier for label 17
finished training meta classifier for label 18


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


finished training meta classifier for label 0
finished training meta classifier for label 1
finished training meta classifier for label 2
finished training meta classifier for label 3
finished training meta classifier for label 4
finished training meta classifier for label 5
finished training meta classifier for label 6
finished training meta classifier for label 7
finished training meta classifier for label 8
finished training meta classifier for label 9
finished training meta classifier for label 10
finished training meta classifier for label 11
finished training meta classifier for label 12
finished training meta classifier for label 13
finished training meta classifier for label 14
finished training meta classifier for label 15
finished training meta classifier for label 16
finished training meta classifier for label 17
finished training meta classifier for label 18


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


finished training meta classifier for label 0
finished training meta classifier for label 1
finished training meta classifier for label 2
finished training meta classifier for label 3
finished training meta classifier for label 4
finished training meta classifier for label 5
finished training meta classifier for label 6
finished training meta classifier for label 7
finished training meta classifier for label 8
finished training meta classifier for label 9
finished training meta classifier for label 10
finished training meta classifier for label 11
finished training meta classifier for label 12
finished training meta classifier for label 13
finished training meta classifier for label 14
finished training meta classifier for label 15
finished training meta classifier for label 16
finished training meta classifier for label 17
finished training meta classifier for label 18


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


finished training meta classifier for label 0
finished training meta classifier for label 1
finished training meta classifier for label 2
finished training meta classifier for label 3
finished training meta classifier for label 4
finished training meta classifier for label 5
finished training meta classifier for label 6
finished training meta classifier for label 7
finished training meta classifier for label 8
finished training meta classifier for label 9
finished training meta classifier for label 10
finished training meta classifier for label 11
finished training meta classifier for label 12
finished training meta classifier for label 13
finished training meta classifier for label 14
finished training meta classifier for label 15
finished training meta classifier for label 16
finished training meta classifier for label 17
finished training meta classifier for label 18


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


finished training meta classifier for label 0
finished training meta classifier for label 1
finished training meta classifier for label 2
finished training meta classifier for label 3
finished training meta classifier for label 4
finished training meta classifier for label 5
finished training meta classifier for label 6
finished training meta classifier for label 7
finished training meta classifier for label 8
finished training meta classifier for label 9
finished training meta classifier for label 10
finished training meta classifier for label 11
finished training meta classifier for label 12
finished training meta classifier for label 13
finished training meta classifier for label 14
finished training meta classifier for label 15
finished training meta classifier for label 16
finished training meta classifier for label 17
finished training meta classifier for label 18


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


results obtained:
Accuracy: 0.4637 ± 0.07
Hamming Loss: -0.0535 ± 0.01
F1 score: 0.0126 ± 0.01


In [43]:
# Display the nested results mapping: model name -> dataset name -> result object.
evaluation_results

{'baseline_binary_relevance_model': {'scene': <evaluation.EvaluationPipelineResult at 0x2359ed1c760>,
  'emotions': <evaluation.EvaluationPipelineResult at 0x2359b14d280>,
  'birds': <evaluation.EvaluationPipelineResult at 0x2359b1672e0>},
 'basic_stacking_model': {'scene': <evaluation.EvaluationPipelineResult at 0x2359b130130>,
  'emotions': <evaluation.EvaluationPipelineResult at 0x23596b515e0>,
  'birds': <evaluation.EvaluationPipelineResult at 0x2359ed2c730>},
 'stacking_with_f_tests_model': {'scene': <evaluation.EvaluationPipelineResult at 0x2359ed15c40>,
  'emotions': <evaluation.EvaluationPipelineResult at 0x2359ed1f760>,
  'birds': <evaluation.EvaluationPipelineResult at 0x2358a348790>}}

In [59]:
# Print the stored summary (accuracy, Hamming loss, F1) for the baseline
# Binary Relevance model on `birds`.
evaluation_results["baseline_binary_relevance_model"]["birds"].describe()

Accuracy: 0.4637 ± 0.07
Hamming Loss: -0.0535 ± 0.01
F1 score: 0.0126 ± 0.01


In [60]:
# Print the stored summary (accuracy, Hamming loss, F1) for the Basic
# Stacking model on `birds`.
evaluation_results["basic_stacking_model"]["birds"].describe()

Accuracy: 0.4637 ± 0.07
Hamming Loss: -0.0535 ± 0.01
F1 score: 0.0126 ± 0.01


In [61]:
# Print the stored summary (accuracy, Hamming loss, F1) for the Stacking
# With F-Tests model on `birds`.
evaluation_results["stacking_with_f_tests_model"]["birds"].describe()

Accuracy: 0.4637 ± 0.07
Hamming Loss: -0.0535 ± 0.01
F1 score: 0.0126 ± 0.01


## Results so far

It is very weird, but even with different numbers of folds, the results are always identical for Binary Relevance and Basic Stacking. Stacking with F-test is usually a bit better, but only by a tiny margin. I am not sure whether this is a bug, so let's check the results with a simple train/test split.

In [87]:
# Sanity check without the evaluation pipeline: fit each model on the
# predefined train split and score it on the predefined test split.
simple_evaluation_results = {}

for model_name, model in models.items():
    print(f"# running model `{model_name}`")

    simple_evaluation_results[model_name] = {}

    for dataset_name, info in datasets.items():
        # Only datasets other than `emotions` and `birds` are evaluated here.
        if dataset_name in ("emotions", "birds"):
            continue

        print(f"## running dataset `{dataset_name}`")

        model.fit(info["X_train"], info["y_train"])
        predictions = model.predict(info["X_test"])

        y_true = info["y_test"]
        simple_evaluation_results[model_name][dataset_name] = {
            "accuracy": metrics.accuracy_score(y_true, predictions),
            "f1": metrics.f1_score(y_true, predictions, average="macro"),
            "hamming_loss": metrics.hamming_loss(y_true, predictions),
        }

# running model `baseline_binary_relevance_model`
## running dataset `scene`
# running model `basic_stacking_model`
## running dataset `scene`
# running model `stacking_with_f_tests_model`
## running dataset `scene`
finished training meta classifier for label 0
finished training meta classifier for label 1
finished training meta classifier for label 2
finished training meta classifier for label 3
finished training meta classifier for label 4
finished training meta classifier for label 5


In [88]:
# Display the train/test-split metrics: model name -> dataset name -> metric name -> value.
simple_evaluation_results

{'baseline_binary_relevance_model': {'scene': {'accuracy': 0.5869565217391305,
   'f1': 0.7237789962754925,
   'hamming_loss': 0.08416945373467112}},
 'basic_stacking_model': {'scene': {'accuracy': 0.5869565217391305,
   'f1': 0.7237789962754925,
   'hamming_loss': 0.08416945373467112}},
 'stacking_with_f_tests_model': {'scene': {'accuracy': 0.6279264214046822,
   'f1': 0.761565955949257,
   'hamming_loss': 0.08235785953177258}}}