In [None]:
########################################################################################################################
# Filename: SVM_Based_Models.ipynb
#
# Purpose: Multi-label Text-categorization via binary relevance, using support vector machines as base classifiers
#
# Author(s): Bobby (Robert) Lumpkin
#
# Library Dependencies: numpy, scipy, pandas, matplotlib, scikit-learn, scikit-multilearn, joblib
########################################################################################################################

In [39]:
import numpy as np 
import scipy
import random
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.multioutput import MultiOutputClassifier 
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.svm import SVC
#from sklearn.neighbors import KNeighborsClassifier as kNN
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn import metrics
from joblib import dump, load
import sys
sys.path.append('../../ThresholdFunctionLearning')    ## Append path to the ThresholdFunctionLearning directory to the interpreters
                                                   ## search path
from threshold_learning import predict_test_labels_binary    ## Import the 'predict_test_labels_binary()' function 
import os
## Optionally switch to the author's local copy of this notebook's directory. The data and cache files used
## below are addressed with relative paths, so replace NOTEBOOK_DIR with the appropriate working directory
## on your machine (or leave it untouched if the kernel was already started from the notebook's directory).
NOTEBOOK_DIR = 'C:\\Users\\rober\\OneDrive\\Documents\\Multilabel-Text-Classification\\Binary Relevance Models\\SVM Based'
if os.path.isdir(NOTEBOOK_DIR):    ## Skip (instead of raising FileNotFoundError) on machines without this path
    os.chdir(NOTEBOOK_DIR)

# Multilabel Classification Using Binary Relevance Models

Arguably, the most intuitive among multilabel modeling approaches is what's referred to as "binary relevance". This approach works by decomposing the multi-label learning task into a number of independent binary learning tasks (one per class label) (Zhang et al. [2018]). Binary relevance methods are often criticized in the literature because of their label independence assumption, which produces the potential weakness of ignoring correlations among labels (Luaces et al. [2012]). In this notebook, we'll explore binary relevance models built using different base classifiers. Later, in other notebooks, we'll train more novel approaches for comparison.

For the other base classifiers we consider (GBTs and kNN), we will compare different threshold function methods: a constant threshold vs. a learned threshold function. Learning threshold functions has the advantage of allowing different instances to possess different thresholds. This can be useful when a model either cannot consistently separate true from false labels around a constant value OR when sufficient training is resource intensive. In many instances, a model may learn to separate true from false labels earlier in the training process than it learns to separate them about a constant threshold. We do not apply these methods here, since we do not generate probability estimates.

Additionally, each of the models will be trained using both the separable principal component scores and the autoencoder encodings derived in 'Preprocessing and Dimension Reduction/tfidf_Dimension_Reduction.ipynb'. Below, we'll load the data and compute one baseline for validating our models according to Hamming Loss. Namely, since our labels are sparse, we'll compute the validation Hamming Loss associated with a constant zero classifier.

In [146]:
## Read the 'separable' principal-component features (train / test)
npzfile = np.load("../../Data/tfidf_PC_separable.npz")
X_sepPCs_train, X_sepPCs_test = (
    npzfile[key] for key in ("X_sepPCs_train", "X_sepPCs_test")
)

## Read the autoencoder encodings and the label matrices -- the labels are taken from
## the same archive as the encodings ('tfidf_encoded_data.npz')
npzfile = np.load("../../Data/tfidf_encoded_data.npz")
encoded_train, encoded_test, Y_train, Y_test = (
    npzfile[key] for key in ("encoded_train", "encoded_test", "Y_train", "Y_test")
)

In [147]:
## Baseline for sparse labels: a classifier that always predicts zero is wrong exactly on the entries equal
## to one, so its Hamming loss on Y_test is the fraction of entries in Y_test that equal one
prop_one_bpmll = (Y_test == 1).sum() / Y_test.size
prop_one_bpmll

0.013779397151374627

# Base Classifier: Support Vector Machine
## PCA Features

In [111]:
## Binary relevance model with SVM base classifiers on the PCA features
## (naive approach, to be compared with novel approaches later)
br_classifier = BinaryRelevance(classifier = SVC(kernel = 'rbf', C = 1))

## Making predictions takes some time, so the stored predictions are read from 'SVM_based_preds.npz' instead.
## To regenerate them, uncomment the three lines below.
#br_classifier.fit(X_sepPCs_train, Y_train)
#br_train_preds = br_classifier.predict(X_sepPCs_train).toarray()
#br_test_preds = br_classifier.predict(X_sepPCs_test).toarray()

npzfile = np.load("SVM_based_preds.npz", allow_pickle = True)
br_train_preds, br_test_preds = npzfile["br_train_preds"], npzfile["br_test_preds"]

## Report the Hamming loss on both splits
print (f"The Hamming loss for the training data is {metrics.hamming_loss(Y_train, br_train_preds):.3f}")
print (f"The Hamming loss for the test data is {metrics.hamming_loss(Y_test, br_test_preds):.3f}")

The Hamming loss for the training data is 0.004
The Hamming loss for the test data is 0.005


In [9]:
## (CAUTION: DO NOT OVERWRITE EXISTING FILES) -- Save the SVM based predictions
## 'outfile' is the name passed to the save call below. That call is left commented out so that
## re-running the notebook does not overwrite the stored predictions; uncomment it only to
## (re)generate the file from freshly computed 'br_train_preds' / 'br_test_preds'.
outfile = "SVM_based_preds"
#np.savez_compressed(outfile, br_train_preds = br_train_preds,
#                             br_test_preds = br_test_preds)

In [131]:
## 'GridSearchCV()' returned NaNs here, so the search over C is written out by hand
## (alternatively, skip this cell and import the stored grid-search data in the next cell).
## For every candidate C: fit a binary relevance SVM on the PCA features and record the
## Hamming loss of its predictions on the test split.
C_list = list(np.arange(2, 22, 2))
hamming_losses = []
for C in C_list:
    br_classifier = BinaryRelevance(classifier = SVC(kernel = 'rbf', C = C))
    br_classifier.fit(X_sepPCs_train, Y_train)

    br_test_preds = br_classifier.predict(X_sepPCs_test).toarray()
    hamming_losses.append(metrics.hamming_loss(Y_test, br_test_preds))

## One row per candidate C; persist the table so the search does not have to be repeated
gridSearch_df = pd.DataFrame({'C' : C_list, 'Hamming-Loss' : hamming_losses})
gridSearch_df.to_json('SVM_bestModel_preds.json')

In [135]:
## Load the stored grid-search table (candidate C vs. Hamming loss) and report the best setting
gridSearch_df = pd.read_json('SVM_bestModel_preds.json')

## 'Styler.hide_index()' was deprecated in pandas 1.4 and removed in pandas 2.0. Use
## 'Styler.hide(axis = "index")' when it is available and fall back to the old method otherwise.
gridSearch_styler = gridSearch_df.style
if hasattr(gridSearch_styler, "hide"):
    gridSearch_styler = gridSearch_styler.hide(axis = "index")
else:
    gridSearch_styler = gridSearch_styler.hide_index()
display(gridSearch_styler)

## 'np.argmin' on the raw values yields a row *position*, so the winning row is looked up with '.iloc'.
## (Plain '[]' on a Series is label-based and only agrees with the position for a default 0..n-1 index.)
best_index_br = np.argmin(gridSearch_df["Hamming-Loss"].to_numpy())
best_parameters_br = gridSearch_df["C"].iloc[best_index_br]
best_params_validation_HL = gridSearch_df['Hamming-Loss'].iloc[best_index_br]
print(f"Best parameters: {best_parameters_br}. Best validation Hamming loss: {best_params_validation_HL : 0.3f}")

C,Hamming-Loss
2,0.00456
4,0.004446
6,0.004413
8,0.004369
10,0.00435
12,0.004354
14,0.004332
16,0.004343
18,0.00438
20,0.00442


Best parameters: 14. Best validation Hamming loss:  0.004


## ANN Autoencoder Features

In [148]:
## Binary relevance model with SVM base classifiers, trained on the autoencoder encodings
## (naive approach, to be compared with other approaches later)
br_classifier = BinaryRelevance(classifier = SVC(kernel = 'rbf', C = 1))
br_classifier.fit(encoded_train, Y_train)

## Fitting and predicting take some time: train-split predictions first, then test-split predictions
br_train_preds, br_test_preds = (
    br_classifier.predict(features).toarray() for features in (encoded_train, encoded_test)
)

## Alternative: load stored predictions from 'SVM_based_preds_encoded.npz' instead of recomputing them
#npzfile = np.load("SVM_based_preds_encoded.npz", allow_pickle = True)
#br_train_preds = npzfile["br_train_preds"]
#br_test_preds = npzfile["br_test_preds"]

## Report the Hamming loss on both splits
print (f"The Hamming loss for the training data is {metrics.hamming_loss(Y_train, br_train_preds):.3f}")
print (f"The Hamming loss for the test data is {metrics.hamming_loss(Y_test, br_test_preds):.3f}")

The Hamming loss for the training data is 0.007
The Hamming loss for the test data is 0.007


In [121]:
## (CAUTION: DO NOT OVERWRITE EXISTING FILES) -- Save the SVM based predictions
## 'outfile' is the name passed to the save call below (predictions made from the autoencoder encodings).
## That call is left commented out so that re-running the notebook does not overwrite stored
## predictions; uncomment it only to (re)generate the file.
outfile = "SVM_based_preds_encoded"
#np.savez_compressed(outfile, br_train_preds = br_train_preds,
#                             br_test_preds = br_test_preds)

In [133]:
## 'GridSearchcv()' returns NaNs -- Implement grid search 'by hand' (OR IMPORT GRIDSEARCH DATA IN NEXT CELL)
## NOTE: this search (over the autoencoder encodings) is intentionally left commented out; the next cell
## reads its stored results from 'SVM_bestModel_preds_encoded.json'. Uncomment the lines below only to
## re-run the search, which refits one binary relevance SVM per candidate C and rewrites that JSON file.
#C_list = list(np.arange(2, 22, 2))
#gridSearch_df = pd.DataFrame({'C' : C_list, 'Hamming-Loss' : np.zeros(len(C_list))})
#count = 0
#for C in C_list:
#    br_classifier = BinaryRelevance(classifier = SVC(C = C, kernel = 'rbf'))
#    br_classifier.fit(encoded_train, Y_train)
  
#    br_test_preds = br_classifier.predict(encoded_test).toarray()     
#    gridSearch_df.loc[count, 'Hamming-Loss'] = metrics.hamming_loss(Y_test, br_test_preds)
#    count += 1
#gridSearch_df.to_json('SVM_bestModel_preds_encoded.json')

In [134]:
## Load the stored grid-search table for the autoencoder encodings and report the best setting
gridSearch_df = pd.read_json('SVM_bestModel_preds_encoded.json')

## 'Styler.hide_index()' was deprecated in pandas 1.4 and removed in pandas 2.0. Use
## 'Styler.hide(axis = "index")' when it is available and fall back to the old method otherwise.
gridSearch_styler = gridSearch_df.style
if hasattr(gridSearch_styler, "hide"):
    gridSearch_styler = gridSearch_styler.hide(axis = "index")
else:
    gridSearch_styler = gridSearch_styler.hide_index()
display(gridSearch_styler)

## 'np.argmin' on the raw values yields a row *position*, so the winning row is looked up with '.iloc'.
## (Plain '[]' on a Series is label-based and only agrees with the position for a default 0..n-1 index.)
best_index_br = np.argmin(gridSearch_df["Hamming-Loss"].to_numpy())
best_parameters_br = gridSearch_df["C"].iloc[best_index_br]
best_params_validation_HL_encoded = gridSearch_df['Hamming-Loss'].iloc[best_index_br]
print(f"Best parameters: {best_parameters_br}. Best validation Hamming loss: {best_params_validation_HL_encoded : 0.3f}")

C,Hamming-Loss
2,0.006982
4,0.00679
6,0.006731
8,0.006654
10,0.006628
12,0.006581
14,0.006551
16,0.006544
18,0.006496
20,0.006466


Best parameters: 20. Best validation Hamming loss:  0.006


## Results

In [141]:
## Summary table: best Hamming loss found by each grid search, one column per feature set
results_df = pd.DataFrame(
    data = [[best_params_validation_HL, best_params_validation_HL_encoded]],
    index = ['Constant Threshold'],
    columns = ['PCA', 'Autoencoder'],
)
results_df

Unnamed: 0,PCA,Autoencoder
Constant Threshold,0.004332,0.006466


In [150]:
## Persist the summary table as JSON in the current working directory
results_df.to_json(path_or_buf = 'SVM_results.json')

# References

Zhang, ML., Li, YK., Liu, XY. et al. Binary relevance for multi-label learning: an overview. Front. Comput. Sci. 12, 191–202 (2018). https://doi.org/10.1007/s11704-017-7031-7

Luaces, O., Díez, J., Barranquero, J. et al. Binary relevance efficacy for multilabel classification. Prog Artif Intell 1, 303–313 (2012). https://doi.org/10.1007/s13748-012-0030-x