In [1]:
import os
import sys

# Make the modules in ../lib (resolved against the current working directory)
# importable from this notebook.
lib_dir = os.path.abspath(os.path.join(os.pardir, 'lib'))
sys.path.append(lib_dir)

In [2]:
import random

import numpy as np
import skml
from skml.problem_transformation import ProbabilisticClassifierChain
from skml.datasets import sample_down_label_space
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import hamming_loss
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.base import clone
from sklearn.externals import joblib

from experimental_framework import load_from_arff
# Seed every global RNG this notebook may draw from so a fresh
# "Restart & Run All" is repeatable. Seeding only `random` leaves NumPy's
# global generator unseeded.
SEED = 2018
random.seed(SEED)
np.random.seed(SEED)

In [3]:
# Read the Enron ARFF file (53 label columns, little-endian layout as passed to
# the loader), then reduce the label space with k=10.
X, y = load_from_arff('../data/enron/enron.arff', labelcount=53, endian='little')

# Both objects expose .todense(); convert features and the reduced labels to
# dense form for the cells that follow.
y = sample_down_label_space(y, k=10).todense()
X = X.todense()

In [4]:
# Reuse previously stored per-fold predictions when a cache file exists;
# otherwise start with an empty list so the next cell recomputes them.
# NOTE(review): joblib.load unpickles the file, so only load caches you trust.
try:
    results = joblib.load('../data/predictions.pkl')
except Exception:
    # Missing or unreadable cache: fall back to recomputing. Catching
    # `Exception` rather than using a bare `except:` keeps the best-effort
    # fallback but lets KeyboardInterrupt / SystemExit propagate.
    results = []

In [5]:
# Only train when no cached predictions were loaded. For every fold a fresh
# clone of the classifier chain is fitted, its hard predictions and predicted
# probabilities are stored, and the fold's metrics are printed.
if len(results) == 0:
    clf = ProbabilisticClassifierChain(LogisticRegression())
    kf = KFold()

    results = []

    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # clone() gives an unfitted copy so folds do not share fitted state.
        pcc = clone(clf)
        pcc.fit(X_train, y_train)

        y_pred = pcc.predict(X_test)
        y_pred_pp = pcc.predict_proba(X_test)
        results.append((y_pred, y_pred_pp))

        print("----------")

        print("hamming loss: ")
        print(hamming_loss(y_test, y_pred))

        print("accuracy:")
        print(accuracy_score(y_test, y_pred))

        # f1 / precision / recall are each reported micro- then macro-averaged.
        for heading, metric in (
            ("f1 score:", f1_score),
            ("precision:", precision_score),
            ("recall:", recall_score),
        ):
            print(heading)
            for average in ("micro", "macro"):
                print(average)
                print(metric(y_test, y_pred, average=average))

    # Persist the per-fold (y_pred, y_pred_pp) tuples for later sessions.
    joblib.dump(results, '../data/predictions.pkl')

In [17]:
# Toy example of the encoding explored below: '?' marks an entry without a
# definite 0/1 value; `gt` is a plain 0/1 list of the same length.
test = ['?', 1., 0., '?']
gt = [0., 1., 0., 1.]
# Scratch sample: first entry of fold 0's predicted probabilities.
# Take a copy -- a later cell overwrites entries with NaN in place, and without
# the copy that write would leak into `results` (the stored predictions).
y_test = np.array(results[0][1][0], dtype=float)

In [23]:
# Entries strictly between 1/3 and 2/3 count as '?' and are overwritten with
# np.nan, modifying y_test in place.
uncertain = (y_test > 1/3) & (y_test < 2/3)
y_test[uncertain] = np.nan

In [24]:
# Display the current contents of the scratch sample.
y_test

array([[0.96297518, 0.89641911, 0.91614432,        nan, 0.93988057,
        0.95834067, 0.88272036, 0.93995837, 0.85116763, 0.81247914]])

In [22]:
# Boolean mask of the entries lying strictly between 1/3 and 2/3.
(y_test > 1/3) & (y_test < 2/3)

array([[False, False, False,  True, False, False, False, False, False,
        False]])

In [32]:
# Dimensions of the scratch sample.
np.shape(y_test)

(1, 10)

In [30]:
# Boolean mask that is True wherever y_test holds NaN.
idxs = np.isnan(y_test)

In [31]:
# N: how many entries are NaN; L: size of y_test's second axis.
N = idxs.sum()
L = np.shape(y_test)[1]

# Reciprocals of the two counts (L - N and N).
# NOTE(review): presumably per-group weights for a loss -- confirm intent.
w0 = 1 / (L - N)
w1 = 1 / N

1

In [25]:
# Float copy of the scratch sample. The builtin `float` replaces the `np.float`
# alias, which was deprecated in NumPy 1.20 and removed in 1.24.
np.array(y_test, dtype=float)

array([[0.96297518, 0.89641911, 0.91614432,        nan, 0.93988057,
        0.95834067, 0.88272036, 0.93995837, 0.85116763, 0.81247914]])

In [26]:
# Float array built from the toy ground truth. The builtin `float` replaces the
# `np.float` alias, which was deprecated in NumPy 1.20 and removed in 1.24.
np.array(gt, dtype=float)

array([0., 1., 0., 1.])

In [28]:
# Toy inputs: '?' marks an entry without a definite 0/1 value; `gt` is a plain
# 0/1 list of the same length.
test = ['?', 1.0, 0.0, '?']
gt = [0.0, 1.0, 0.0, 1.0]

def uncertain_hamming_loss(y, y_pred):
    """Unfinished sketch of a Hamming-style loss for predictions containing NaN.

    Both arguments are converted to float arrays and the NaN positions of
    ``y_pred`` are located. The loss itself is not computed yet, so the
    function currently returns ``None``.

    Parameters
    ----------
    y : array-like
        Ground-truth values convertible to float.
    y_pred : array-like
        Predicted values convertible to float; NaN marks an uncertain entry.

    Raises
    ------
    ValueError
        If either argument cannot be converted to a float array.
    """
    # Builtin `float` instead of the removed `np.float` alias.
    y = np.array(y, dtype=float)
    # Convert the predictions themselves (previously this re-converted `y`).
    y_pred = np.array(y_pred, dtype=float)

    # Look for NaN in the argument, not in the notebook-global `y_test`.
    idxs = np.isnan(y_pred)
    # TODO: combine `y`, `y_pred` and `idxs` into the actual loss value.

In [7]:
# Re-score the stored per-fold predictions, each against its own ground truth.
# `results` only holds (y_pred, y_pred_pp) per fold, so the matching rows of `y`
# are rebuilt from the same default KFold split used for training. KFold does
# not shuffle by default, so the folds are identical on every run.
# The previous version scored every fold against one leftover global `y_test`.
eval_kf = KFold()
if len(results) != eval_kf.get_n_splits():
    raise ValueError(
        "expected %d folds of predictions, found %d"
        % (eval_kf.get_n_splits(), len(results))
    )

for (_train_index, test_index), res in zip(eval_kf.split(X), results):
    y_pred, y_pred_pp = res
    y_true = y[test_index]

    # Guard against a cache produced from a different split or dataset.
    if y_pred.shape[0] != y_true.shape[0]:
        raise ValueError(
            "fold size mismatch: %d predictions for %d test rows"
            % (y_pred.shape[0], y_true.shape[0])
        )

    print("hamming loss: ")
    print(hamming_loss(y_true, y_pred))

    print("accuracy:")
    print(accuracy_score(y_true, y_pred))

    # f1 / precision / recall are each reported micro- then macro-averaged.
    for heading, metric in (
        ("f1 score:", f1_score),
        ("precision:", precision_score),
        ("recall:", recall_score),
    ):
        print(heading)
        for average in ("micro", "macro"):
            print(average)
            print(metric(y_true, y_pred, average=average))