In [2]:
import os
import sys

# Make the repository root (the parent of the notebook's working directory)
# importable, so project-local packages resolve from inside the notebook.
module_path = os.path.abspath(os.pardir)
already_importable = module_path in sys.path
if not already_importable:
    sys.path.append(module_path)

In [5]:
from skmultilearn.problem_transform import BinaryRelevance
from skmultilearn.dataset import load_dataset
from sklearn.svm import SVC
from skmultilearn.base.problem_transformation import ProblemTransformationBase
from typing import List, Optional, Any, Tuple, Dict
import numpy as np
import sklearn.metrics as metrics
import json
import pandas as pd
from sklearn.feature_selection import f_classif
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
import copy

from metrics.evaluation import EvaluationPipeline
from sklearn.neighbors import KNeighborsClassifier

from skmultilearn.problem_transform import BinaryRelevance

In [7]:
# Benchmark datasets (by their scikit-multilearn names) used in this notebook.
desired_datasets = ["scene", "emotions", "birds"]

# dataset name -> feature/label matrices for every split, plus basic size info
datasets = {}
for dataset_name in desired_datasets:
    print(f"getting dataset `{dataset_name}`")

    # load_dataset returns a 4-tuple; only the first two entries
    # (features, labels) are kept, the remaining two are discarded.
    X, y, _, _ = load_dataset(dataset_name, "undivided")
    X_train, y_train, _, _ = load_dataset(dataset_name, "train")
    X_test, y_test, _, _ = load_dataset(dataset_name, "test")

    datasets[dataset_name] = dict(
        X=X,
        y=y,
        X_train=X_train,
        y_train=y_train,
        X_test=X_test,
        y_test=y_test,
        rows=X.shape[0],
        labels_count=y.shape[1],
    )


# Short per-dataset summary: number of rows and number of labels.
for name, info in datasets.items():
    print(
        "===",
        f"information for dataset `{name}`",
        f"rows: {info['rows']}, labels: {info['labels_count']}",
        sep="\n",
    )


getting dataset `scene`
scene:undivided - exists, not redownloading
scene:train - exists, not redownloading
scene:test - exists, not redownloading
getting dataset `emotions`
emotions:undivided - exists, not redownloading
emotions:train - exists, not redownloading
emotions:test - exists, not redownloading
getting dataset `birds`
birds:undivided - exists, not redownloading
birds:train - exists, not redownloading
birds:test - exists, not redownloading
===
information for dataset `scene`
rows: 2407, labels: 6
===
information for dataset `emotions`
rows: 593, labels: 6
===
information for dataset `birds`
rows: 645, labels: 19


In [20]:
# Fixed seed so the column permutation (and every result derived from it) is
# the same on each Restart & Run All.
SHUFFLE_SEED = 42

X_train = datasets["scene"]["X_train"]
X_test = datasets["scene"]["X_test"]

# One permutation of the feature (column) indices, applied to BOTH splits so
# the shuffled train and test matrices stay column-aligned with each other.
shuffled_order = np.random.default_rng(SHUFFLE_SEED).permutation(X_train.shape[1])
shuffled_X_train = X_train[:, shuffled_order]
shuffled_X_test = X_test[:, shuffled_order]

# Labels are left untouched: only the feature columns are reordered.
y_train = datasets["scene"]["y_train"]
y_test = datasets["scene"]["y_test"]

# Visual check that the columns really were reordered.
display(X_train.todense())
display(shuffled_X_train.todense())

matrix([[0.646467, 0.666435, 0.685047, ..., 0.247298, 0.014025, 0.029709],
        [0.770156, 0.767255, 0.761053, ..., 0.137833, 0.082672, 0.03632 ],
        [0.793984, 0.772096, 0.76182 , ..., 0.051125, 0.112506, 0.083924],
        ...,
        [0.85639 , 1.      , 1.      , ..., 0.019464, 0.022167, 0.043738],
        [0.805592, 0.80417 , 0.811438, ..., 0.346736, 0.231481, 0.332623],
        [0.855064, 0.858896, 0.911177, ..., 0.262119, 0.104471, 0.34728 ]])

matrix([[0.13679 , 0.354982, 0.58803 , ..., 0.124623, 0.327473, 0.204866],
        [0.234595, 0.303399, 0.638408, ..., 0.231433, 0.347137, 0.019469],
        [0.284239, 0.00346 , 0.726349, ..., 0.292222, 0.025036, 0.011205],
        ...,
        [0.34684 , 0.040355, 0.47558 , ..., 0.343549, 0.073848, 0.004937],
        [0.153156, 0.151438, 0.561282, ..., 0.129029, 0.272418, 0.02801 ],
        [0.195887, 0.164443, 0.496182, ..., 0.201372, 0.150776, 0.10174 ]])

In [21]:
def _fit_and_report(model, features_train, features_test):
    """Fit a BinaryRelevance wrapper around `model` and print test metrics.

    Shared implementation behind `run_regular_order` and `run_shuffled_order`,
    which differ only in the feature matrices they pass in.

    Args:
        model: base classifier handed to `BinaryRelevance` as `classifier`.
        features_train: feature matrix used for fitting.
        features_test: feature matrix used for prediction.

    Reads the notebook-level globals `y_train` and `y_test` for the labels.
    Prints subset accuracy, Hamming loss and macro-averaged F1, each preceded
    by a header line. Returns None.
    """
    br_model = BinaryRelevance(
        classifier=model,
        require_dense=[False, True]
    )

    br_model.fit(features_train, y_train)
    predictions = br_model.predict(features_test)

    print("accuracy")
    print(metrics.accuracy_score(y_test, predictions))

    print("hamming loss")
    print(metrics.hamming_loss(y_test, predictions))

    print("f1 score")
    print(metrics.f1_score(y_test, predictions, average="macro"))


def run_regular_order(model):
    """Evaluate `model` on the features in their original column order.

    Uses the notebook-level globals `X_train` / `X_test` (and `y_train` /
    `y_test` for the labels) and prints the metrics; returns None.
    """
    _fit_and_report(model, X_train, X_test)


def run_shuffled_order(model):
    """Evaluate `model` on the column-shuffled features.

    Uses the notebook-level globals `shuffled_X_train` / `shuffled_X_test`
    (and `y_train` / `y_test` for the labels) and prints the metrics;
    returns None.
    """
    _fit_and_report(model, shuffled_X_train, shuffled_X_test)

In [22]:
# Baseline: k-nearest-neighbours with default settings, evaluated on the
# features in their original column order.
run_regular_order(KNeighborsClassifier())

accuracy
0.596989966555184
hamming loss
0.10451505016722408
f1 score
0.6809836443612469


In [23]:
# Same default k-nearest-neighbours classifier, evaluated on the
# column-shuffled features. The recorded output below is identical to the
# regular-order run.
run_shuffled_order(KNeighborsClassifier())

accuracy
0.596989966555184
hamming loss
0.10451505016722408
f1 score
0.6809836443612469


In [None]:
# random forest classifier
# regular order

# A random forest is stochastic: without a fixed seed, two runs differ even on
# identical inputs, so feature-order effects could not be told apart from noise.
RF_SEED = 42

# Named rf_model (not knn_model): the wrapped classifier is a random forest.
rf_model = BinaryRelevance(
    classifier=RandomForestClassifier(random_state=RF_SEED),
    require_dense=[False, True]
)

rf_model.fit(X_train, y_train)
predictions = rf_model.predict(X_test)

print("accuracy")
print(metrics.accuracy_score(y_test, predictions))

print("hamming loss")
print(metrics.hamming_loss(y_test, predictions))

print("f1 score")
print(metrics.f1_score(y_test, predictions, average="macro"))