In [None]:
########################################################################################################################
# Filename: Gradient_Boosted_Trees_Based_Models.ipynb
#
# Purpose: Multi-label Text-categorization via binary relevance, using gradient boosted trees as base classifiers
#
# Author(s): Bobby (Robert) Lumpkin
#
# Library Dependencies: numpy, pandas, matplotlib, scikit-learn, scikit-multilearn, lightgbm, joblib
########################################################################################################################

In [15]:
## Standard library
import os
import random
import sys

## Third-party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from joblib import dump, load
from lightgbm import LGBMClassifier
from sklearn import metrics
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from skmultilearn.problem_transform import BinaryRelevance

## Working directory: the relative paths used below (data, cache, sys.path entry) are resolved from here.
## Replace the path below with the appropriate working directory for your machine.
notebook_dir = 'C:\\Users\\rober\\OneDrive\\Documents\\Multilabel-Text-Classification\\Binary Relevance Models\\Gradient Boosted Tree Based'
if os.path.isdir(notebook_dir):
    os.chdir(notebook_dir)
else:
    ## Do not crash on machines where the hard-coded path does not exist: keep the current
    ## directory (presumably the notebook's own folder -- confirm) and report the fallback.
    print(f"Working directory '{notebook_dir}' not found; staying in '{os.getcwd()}'")

## Append the ThresholdFunctionLearning directory to the interpreter's search path.
## This must run before the 'threshold_learning' import on the next line.
sys.path.append('../../ThresholdFunctionLearning')
from threshold_learning import predict_test_labels_binary, predict_labels_binary

In [54]:
## Configuration: relative locations of the input data and of the cached artifacts

## Input archives (features and labels)
path_to_sepPCs_data = "../../Data/tfidf_PC_separable.npz"
path_to_encoded_data = "../../Data/tfidf_encoded_data.npz"

## Cached preliminary predictions, one archive per feature set
path_to_prelim_sepPC_preds = "Cache/GBT_based_prelim_preds.npz"
path_to_prelim_encoded_preds = "Cache/GBT_based_prelim_encoded_preds.npz"

## Cached grid-search object for the separable-PC features
path_to_sepPC_gridSearch_object = "Cache/sepPCs_gridSearch_object.joblib"

# Multilabel Classification Using Binary Relevance Models

Arguably, the most intuitive among multilabel modeling approaches is what's referred to as "binary relevance". This approach works by decomposing the multi-label learning task into a number of independent binary learning tasks (one per class label) (Zhang et al. [2018]). Binary Relevance methods are often criticized in the literature because of their label independence assumption, producing a potential weakness of ignoring correlations among labels (Luaces et al. [2012]). In this notebook, we'll explore binary relevance models built using different base classifiers. Later, in other notebooks, we'll train more novel approaches for comparison.

For other base classifiers, one of which we consider in another notebook (kNN), we can consider different threshold function methods: constant vs. using a learned threshold function. Learning threshold functions has the advantage of allowing for different instances to possess different thresholds. This can be useful when a model either cannot consistently separate true from false labels around a constant value OR when sufficient training is resource intensive. In many instances, a model may learn to separate true from false labels earlier in the training process than it learns to separate about a constant threshold. We do not apply these methods here, since we do not generate probability estimates.

Additionally, each of the models will be trained using both the separable principal component scores and the autoencoder encodings derived in 'Preprocessing and Dimension Reduction/tfidf_Dimension_Reduction.ipynb'. Below, we'll load the data and compute one baseline for validating our models according to Hamming Loss. Namely, since our labels are sparse, we'll compute the validation Hamming Loss associated with a constant zero classifier.

In [4]:
## Load the 'separable' PC features.
## Each archive is opened in a 'with' block so its file handle is closed once the arrays are read.
with np.load(path_to_sepPCs_data) as sepPCs_npz:
    X_sepPCs_train = sepPCs_npz["X_sepPCs_train"]
    X_sepPCs_test = sepPCs_npz["X_sepPCs_test"]

## Load the autoencoder encodings and the labels.
## The labels are read from the encoded-data archive, not from the separable-PC archive.
with np.load(path_to_encoded_data) as encoded_npz:
    encoded_train = encoded_npz["encoded_train"]
    encoded_test = encoded_npz["encoded_test"]
    Y_train = encoded_npz["Y_train"]
    Y_test = encoded_npz["Y_test"]

In [3]:
## Baseline for sparse labels: the validation Hamming Loss of a constant zero classifier,
## computed as the share of entries in Y_test that equal 1.
n_test_rows, n_label_cols = Y_test.shape[0], Y_test.shape[1]
n_positive_entries = np.sum(Y_test == 1)
prop_one_bpmll = n_positive_entries / (n_test_rows * n_label_cols)
prop_one_bpmll

0.013779397151374627

# Base Classifier: Gradient Boosted Decision Trees
## PCA Features

In [51]:
## Implement a binary relevance model using GBT classifiers (Naive approach to be compared with novel approaches, later)
params = {'boosting_type':'gbdt', 
          'num_leaves':5, 
          'learning_rate':0.01, 
          'n_estimators':250}
br_classifier = BinaryRelevance(
    classifier = LGBMClassifier(**params)
)

## Fitting and predicting take some time, so reuse the cached predictions when the cache
## file exists and only fit from scratch when it is missing. Nothing is written here;
## saving the predictions stays an explicit, separate step.
if os.path.exists(path_to_prelim_sepPC_preds):
    ## NOTE(review): allow_pickle = True permits unpickling -- only load cache files you created yourself.
    with np.load(path_to_prelim_sepPC_preds, allow_pickle = True) as preds_npz:
        br_train_preds = preds_npz["br_train_preds"]
        br_test_preds = preds_npz["br_test_preds"]
else:
    br_classifier.fit(X_sepPCs_train, Y_train)
    br_train_preds = br_classifier.predict(X_sepPCs_train).toarray()
    br_test_preds = br_classifier.predict(X_sepPCs_test).toarray()

print (f"The Hamming loss for the training data is {metrics.hamming_loss(Y_train, br_train_preds):.3f}")
print (f"The Hamming loss for the test data is {metrics.hamming_loss(Y_test, br_test_preds):.3f}")

The Hamming loss for the training data is 0.004
The Hamming loss for the test data is 0.006


In [50]:
## (CAUTION: DO NOT OVERWRITE EXISTING FILES) -- Save the GBT based predictions
#np.savez_compressed(path_to_prelim_sepPC_preds, 
#                    br_train_preds = br_train_preds,
#                    br_test_preds = br_test_preds)

In [38]:
%%capture
random.seed(123)
n_estimators_list = list(range(100, 500, 100))
num_leaves_list = list(range(2, 6))
learning_rate_list = [0.1, 0.01, 0.001]
parameters_br = {'classifier' : [LGBMClassifier()],
                 'classifier__n_estimators': n_estimators_list,
                 'classifier__num_leaves' : num_leaves_list,
                 'classifier__learning_rate' : learning_rate_list}  
# By default, the Hamming loss as an option is not provided in the scoring string options. 
# So, we first define a Hamming loss scorer and use that. 
hamming_scorer = metrics.make_scorer(metrics.hamming_loss)

#clf_br = GridSearchCV(BinaryRelevance(), parameters_br, scoring = hamming_scorer, cv = 5, verbose = 1)
#clf_br.fit(X_sepPCs_train, Y_train) #-- To save time, load the pre-fit grid search object in the next line.
clf_br = load(path_to_sepPC_gridSearch_object) 

In [33]:
## (CAUTION: DO NOT OVERWRITE EXISTING FILES) -- Save the gbm based grid search object
#dump(clf_br, path_to_sepPC_gridSearch_object, compress = 3) 

['Cache/sepPCs_gridSearch_object.joblib']

In [39]:
## Summarise the grid search: one row per parameter setting with its mean test-fold score.
## NOTE(review): the values come from cv_results_['mean_test_score'] (cross-validation folds);
## the "out-of-bag" wording in the labels looks like a misnomer -- confirm before renaming.
cv_params = clf_br.cv_results_["params"]
cv_mean_scores = clf_br.cv_results_["mean_test_score"]

df_CV_br = pd.DataFrame({"Params": cv_params,
                         "Mean out-of-bag Hamming loss": cv_mean_scores})

## The scores are Hamming losses, so the best setting is the one with the smallest mean score
best_index_br = np.argmin(cv_mean_scores)
best_parameters_br = cv_params[best_index_br]

display(df_CV_br)
print(f"Best parameters: {best_parameters_br}. Best mean out-of-bag Hamming loss: {np.min(cv_mean_scores)}")

Unnamed: 0,Params,Mean out-of-bag Hamming loss
0,{'classifier': LGBMClassifier(learning_rate=0....,0.008212
1,{'classifier': LGBMClassifier(learning_rate=0....,0.01003
2,{'classifier': LGBMClassifier(learning_rate=0....,0.009279
3,{'classifier': LGBMClassifier(learning_rate=0....,0.009521
4,{'classifier': LGBMClassifier(learning_rate=0....,0.007633
5,{'classifier': LGBMClassifier(learning_rate=0....,0.009745
6,{'classifier': LGBMClassifier(learning_rate=0....,0.008791
7,{'classifier': LGBMClassifier(learning_rate=0....,0.008916
8,{'classifier': LGBMClassifier(learning_rate=0....,0.007355
9,{'classifier': LGBMClassifier(learning_rate=0....,0.009528


Best parameters: {'classifier': LGBMClassifier(learning_rate=0.001, num_leaves=2), 'classifier__learning_rate': 0.01, 'classifier__n_estimators': 400, 'classifier__num_leaves': 5}. Best mean out-of-bag Hamming loss: 0.0051829494843017064


## ANN Autoencoder Features

In [56]:
## Implement a binary relevance model using GBT classifiers (Naive approach to be compared with novel approaches, later)
params = {'boosting_type':'gbdt', 
          'num_leaves':5, 
          'learning_rate':0.01, 
          'n_estimators':250}
br_classifier = BinaryRelevance(
    classifier = LGBMClassifier(**params)
)

## Fitting and predicting take some time, so reuse the cached predictions when the cache
## file exists and only fit from scratch when it is missing. Nothing is written here;
## saving the predictions stays an explicit, separate step.
if os.path.exists(path_to_prelim_encoded_preds):
    ## NOTE(review): allow_pickle = True permits unpickling -- only load cache files you created yourself.
    with np.load(path_to_prelim_encoded_preds, allow_pickle = True) as preds_npz:
        br_train_preds = preds_npz["br_train_preds"]
        br_test_preds = preds_npz["br_test_preds"]
else:
    br_classifier.fit(encoded_train, Y_train)
    br_train_preds = br_classifier.predict(encoded_train).toarray()
    br_test_preds = br_classifier.predict(encoded_test).toarray()

print (f"The Hamming loss for the training data is {metrics.hamming_loss(Y_train, br_train_preds):.3f}")
print (f"The Hamming loss for the test data is {metrics.hamming_loss(Y_test, br_test_preds):.3f}")

The Hamming loss for the training data is 0.007
The Hamming loss for the test data is 0.007


In [55]:
## (CAUTION: DO NOT OVERWRITE EXISTING FILES) -- Save the GBT based predictions
#np.savez_compressed(path_to_prelim_encoded_preds, 
#                    br_train_preds = br_train_preds,
#                    br_test_preds = br_test_preds)

In [None]:
%%capture
random.seed(123)
n_estimators_list = list(range(100, 500, 100))
num_leaves_list = list(range(2, 6))
learning_rate_list = [0.1, 0.01, 0.001]
parameters_br = {'classifier' : [LGBMClassifier()],
                 'classifier__n_estimators': n_estimators_list,
                 'classifier__num_leaves' : num_leaves_list,
                 'classifier__learning_rate' : learning_rate_list}  
# By default, the Hamming loss as an option is not provided in the scoring string options. 
# So, we first define a Hamming loss scorer and use that. 
hamming_scorer = metrics.make_scorer(metrics.hamming_loss)

clf_br = GridSearchCV(BinaryRelevance(), parameters_br, scoring = hamming_scorer, cv = 5, verbose = 1)
clf_br.fit(encoded_train, Y_train) #-- To save time, load the pre-fit grid search object in the next line.
#clf_br = load(path_to_sepPC_gridSearch_object) 