In [None]:
########################################################################################################################
# Filename: Binary_Relevance_Classification.ipynb
#
# Purpose: Multi-label Text-categorization via binary relevance, using k-nearest neighbors, gradient boosted trees, 
#          and support vector machines.

# Author(s): Bobby (Robert) Lumpkin, Archit Datar (kNN)
#
# Library Dependencies: numpy, pandas, scikit-learn
########################################################################################################################

In [32]:
import numpy as np 
import scipy
import random
import matplotlib.pyplot as plt
import pandas as pd
from skmultilearn.adapt import MLkNN
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.neighbors import KNeighborsClassifier as kNN
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from joblib import dump, load
import sys
sys.path.append('../../../ThresholdFunctionLearning')    ## Append path to the ThresholdFunctionLearning directory to the interpreters
                                                   ## search path
from threshold_learning import predict_test_labels_binary    ## Import the 'predict_test_labels_binary()' function 
import os
os.chdir('C:\\Users\\rober\\OneDrive\\Documents\\Multilabel-Text-Classification\\Binary Relevance Models\\kNN Based')  
## NOTE: the path above is an absolute, machine-specific location -- replace it with the appropriate working
## directory on your machine. The relative paths used later in this notebook (e.g. "../../Data/..." and
## "kNN_based_preds.npz") are resolved against whatever directory is set here.

# Multilabel Classification Using Binary Relevance Models

Arguably, the most intuitive among multilabel modeling approaches is what's referred to as "binary relevance". This approach works by decomposing the multi-label learning task into a number of independent binary learning tasks (one per class label) (Zhang et al. [2018]). Binary Relevance methods are often criticized in the literature because of their label independence assumption, producing a potential weakness of ignoring correlations among labels (Luaces et al. [2012]). In this notebook, we'll explore binary relevance models built using different base classifiers. Later, in other notebooks, we'll train more novel approaches for comparison.

Each of our models will be evaluated using the default constant threshold function of $t(x) \equiv 0.5$ in addition to using a learned threshold function. Learning threshold functions has the advantage of allowing for different instances to possess different thresholds. This can be useful when a model either cannot consistently separate true from false labels around a constant value OR when sufficient training is resource intensive. In many instances, a model may learn to separate true from false labels earlier in the training process than it learns to separate about a constant threshold.

Additionally, each of the models will be trained using both the separable principal component scores and the autoencoder encodings derived in 'Preprocessing and Dimension Reduction/tfidf_Dimension_Reduction.ipynb'. Below, we'll load the data and compute one baseline for validating our models according to Hamming Loss. Namely, since our labels are sparse, we'll compute the validation Hamming Loss associated with a constant zero classifier.

In [33]:
## Load the 'separable' PC features.
## np.load() on an .npz archive returns a lazily-read NpzFile that keeps the file open until it is
## closed, so each archive is opened in a `with` block. The arrays pulled out by key are read into
## memory at indexing time and remain valid after the archive is closed.
with np.load("../../Data/tfidf_PC_separable.npz") as sep_pcs_npz:
    X_sepPCs_train = sep_pcs_npz["X_sepPCs_train"]
    X_sepPCs_test = sep_pcs_npz["X_sepPCs_test"]

## Load the autoencoder encodings and the labels (both are read from the same archive)
with np.load("../../Data/tfidf_encoded_data.npz") as encoded_npz:
    encoded_train = encoded_npz["encoded_train"]
    encoded_test = encoded_npz["encoded_test"]

    ## Load the labels
    Y_train = encoded_npz["Y_train"]
    Y_test = encoded_npz["Y_test"]

In [34]:
## Silly baseline for sparse labels: the fraction of entries in Y_test that equal 1.
## (For 0/1 labels this is the validation Hamming Loss of a classifier that always predicts zero,
##  since such a classifier is wrong exactly on the entries that are 1.)
prop_one_bpmll = (Y_test == 1).sum() / (Y_test.shape[0] * Y_test.shape[1])
prop_one_bpmll

0.013779397151374627

## Base Classifier: k-Nearest Neighbors

In [35]:
## Implement a binary relevance model using KNN classifiers (Naive approach to be compared with ML-KNN, later)
## kNN() is constructed with its default hyperparameters here; the tuned version is built further below.
br_classifier = BinaryRelevance(
    classifier = kNN()
)

br_classifier.fit(X_sepPCs_train, Y_train)

#br_train_preds = br_classifier.predict(X_sepPCs_train).toarray() -- Making predictions takes some time. 
#br_test_preds = br_classifier.predict(X_sepPCs_test).toarray()      Instead, load the predictions from 'kNN_based_preds.npz', below.

## Load the cached predictions. The archive is opened in a `with` block so the file handle is released
## once the arrays have been read.
## NOTE(review): allow_pickle = True lets np.load unpickle object arrays, which can execute arbitrary code.
## Only load archives you produced yourself; drop the flag if the stored arrays are plain numeric arrays.
with np.load("kNN_based_preds.npz", allow_pickle = True) as knn_preds_npz:
    br_train_preds = knn_preds_npz["br_train_preds"]
    br_test_preds = knn_preds_npz["br_test_preds"]

print (f"The Hamming loss for the training data is {metrics.hamming_loss(Y_train, br_train_preds):.3f}")
print (f"The Hamming loss for the test data is {metrics.hamming_loss(Y_test, br_test_preds):.3f}")

The Hamming loss for the training data is 0.003
The Hamming loss for the test data is 0.005


In [20]:
## (CAUTION: DO NOT OVERWRITE EXISTING FILES) -- Save the kNN based predictions
## The save call is intentionally left commented out so that re-running the notebook does not clobber
## the cached predictions; uncomment it only to regenerate the archive from freshly computed
## 'br_train_preds' / 'br_test_preds'.
## NOTE: per the NumPy docs, np.savez_compressed appends '.npz' when the file name lacks it, which is
## how this name lines up with the "kNN_based_preds.npz" file loaded earlier in the notebook.
outfile = "kNN_based_preds"
#np.savez_compressed(outfile, br_train_preds = br_train_preds,
#                             br_test_preds = br_test_preds)

In [37]:
%%capture
random.seed(123)
n_neighbors_list = list(range(3, 16, 3))
n_neighbors_list.insert(0, 1)
parameters_br = {'n_neighbors': n_neighbors_list}  
# By default, the Hamming loss as an option is not provided in the scoring string options. 
# So, we first define a Hamming loss scorer and use that. 
hamming_scorer = metrics.make_scorer(metrics.hamming_loss)

clf_br = GridSearchCV(kNN(), parameters_br, scoring = hamming_scorer, cv = 5, verbose = 1)
#clf_br.fit(X_sepPCs_train, Y_train) #-- To save time, load the pre-fit grid search object in the next line.
clf_br = load("clf_br_gridSearch_object.joblib") 

In [31]:
## (CAUTION: DO NOT OVERWRITE EXISTING FILES) -- Save the fitted kNN grid search object ('clf_br')
## The dump call is intentionally left commented out so that re-running the notebook does not overwrite
## the pre-fit object that the grid search cell loads from this same file name.
outfile = "clf_br_gridSearch_object.joblib"
#dump(clf_br, outfile, compress = 3) 

['kNN Based/clf_br_gridSearch_object.joblib']

In [42]:
# Pull the per-candidate results out of the grid search object once.
cv_params_br = clf_br.cv_results_["params"]
cv_mean_scores_br = clf_br.cv_results_["mean_test_score"]

# The scores are treated as Hamming losses, so the preferred candidate is the one with the
# smallest mean score.
best_index_br = np.argmin(cv_mean_scores_br)
best_parameters_br = cv_params_br[best_index_br]

# One row per candidate parameter setting, alongside its mean cross-validated score.
df_CV_br = pd.DataFrame({
    "Params": cv_params_br,
    "Mean out-of-bag Hamming loss": cv_mean_scores_br,
})
display(df_CV_br)
print(f"Best parameters: {best_parameters_br}. Best mean out-of-bag Hamming loss: {np.min(cv_mean_scores_br)}")

Unnamed: 0,Params,Mean out-of-bag Hamming loss
0,{'n_neighbors': 1},0.005997
1,{'n_neighbors': 3},0.004788
2,{'n_neighbors': 6},0.004697
3,{'n_neighbors': 9},0.004714
4,{'n_neighbors': 12},0.004881
5,{'n_neighbors': 15},0.004877


Best parameters: {'n_neighbors': 6}. Best mean out-of-bag Hamming loss: 0.004696695591737446


In [43]:
# Threshold learning using the best parameters from the cross-validated grid search 
classifier_best_br = BinaryRelevance(
    classifier = kNN(n_neighbors = best_parameters_br['n_neighbors'])
)

classifier_best_br.fit(X_sepPCs_train, Y_train)
#Y_train_pred_best_br = classifier_best_br.predict(X_sepPCs_train) # -- These 'predict()' steps can be time costly.
#Y_train_pred_best_array_br = Y_train_pred_best_br.toarray()          # Instead, load the predictions below
#Y_test_pred_best_br = classifier_best_br.predict(X_sepPCs_test)
#Y_test_pred_best_array_br = Y_test_pred_best_br.toarray()

# Load the cached best-model predictions. The archive is opened in a `with` block so the file handle
# is released once the arrays have been read.
# NOTE(review): allow_pickle = True lets np.load unpickle object arrays, which can execute arbitrary code.
# Only load archives you produced yourself.
with np.load("kNN_bestModel_preds.npz", allow_pickle = True) as best_preds_npz:
    Y_train_pred_best_array_br = best_preds_npz["Y_train_pred_best_array_br"]
    Y_test_pred_best_array_br = best_preds_npz["Y_test_pred_best_array_br"]
    Y_train_pred_proba_array = best_preds_npz["Y_train_pred_proba_array"]
    Y_test_pred_proba_array = best_preds_npz["Y_test_pred_proba_array"]
    # The save cell for this archive also writes 'test_labels_binary'. Load it here so the final print
    # works on a fresh kernel; previously the name was never defined in this cell (NameError).
    if "test_labels_binary" in best_preds_npz.files:
        test_labels_binary = best_preds_npz["test_labels_binary"]
    else:
        test_labels_binary = None

threshold_function = load("learned_threshold_function.joblib")

print (f"Best parameters: The Hamming loss training data is {metrics.hamming_loss(Y_train, Y_train_pred_best_array_br):.3f}")
print (f"Best parameters: The Hamming loss test data is {metrics.hamming_loss(Y_test, Y_test_pred_best_array_br):.3f}")

# Learn a threshold function
#Y_train_pred_proba = classifier_best_br.predict_proba(X_sepPCs_train)
#Y_train_pred_proba_array = Y_train_pred_proba.toarray()
#Y_test_pred_proba = classifier_best_br.predict_proba(X_sepPCs_test)
#Y_test_pred_proba_array = Y_test_pred_proba.toarray()

t_range = (0, 1)

# Fall back to learning the threshold function only when the archive does not carry the cached labels.
# In that case the freshly learned threshold function replaces the one loaded from disk, so that it
# stays consistent with the recomputed labels.
if test_labels_binary is None:
    test_labels_binary, threshold_function = predict_test_labels_binary(Y_train_pred_proba_array, Y_train, Y_test_pred_proba_array, t_range)
print (f"Best parameters with threshold function learning: Hamming loss Test set is {metrics.hamming_loss(Y_test, test_labels_binary)}")

Best parameters: The Hamming loss training data is 0.004
Best parameters: The Hamming loss test data is 0.005
Best parameters with threshold function learning: Hamming loss Test set is 0.009355562916344632


In [27]:
## (CAUTION: DO NOT OVERWRITE EXISTING FILES) -- Save the kNN based best model predictions with & without a learned threshold
##                                               and save the learned threshold
## Both save calls are intentionally left commented out so that re-running the notebook does not
## overwrite the cached artifacts that the threshold-learning cell loads from these same file names.
outfile = "kNN_bestModel_preds.npz"
## Keys written to the archive: hard predictions (train/test), predicted probabilities (train/test),
## and the test labels obtained with the learned threshold function.
#np.savez_compressed(outfile, Y_train_pred_best_array_br = Y_train_pred_best_array_br,
#                             Y_test_pred_best_array_br = Y_test_pred_best_array_br,
#                             Y_train_pred_proba_array = Y_train_pred_proba_array,
#                             Y_test_pred_proba_array = Y_test_pred_proba_array,
#                             test_labels_binary = test_labels_binary)

## 'outfile' is re-pointed at the joblib file that holds the learned threshold function.
outfile = "learned_threshold_function.joblib"
#dump(threshold_function, outfile, compress = 3) 

['kNN Based/learned_threshold_function.joblib']

# References

Zhang, ML., Li, YK., Liu, XY. et al. Binary relevance for multi-label learning: an overview. Front. Comput. Sci. 12, 191–202 (2018). https://doi.org/10.1007/s11704-017-7031-7

Luaces, O., Díez, J., Barranquero, J. et al. Binary relevance efficacy for multilabel classification. Prog Artif Intell 1, 303–313 (2012). https://doi.org/10.1007/s13748-012-0030-x