In [None]:
########################################################################################################################
# Filename: ML-kNN_Models.ipynb
#
# Purpose: Multi-label Text-categorization via an algorithmic adaptation of the k-nearest-neighbors
#          algorithm -- ML-kNN

# Author(s): Archit Datar, Laren Contard, Bobby (Robert) Lumpkin
#
# Library Dependencies: numpy, pandas, matplotlib, scikit-learn, skmultilearn, joblib, os, sys, threshold_learning
########################################################################################################################

# Novel Algorithmic Adaptations for Multilabel Learning

In [24]:
import numpy as np 
import matplotlib.pyplot as plt
import pandas as pd
from skmultilearn.adapt import MLkNN
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from joblib import dump, load
import os
## NOTE(review): machine-specific absolute path -- replace it with the location of the 'ML-kNN' directory on your
## machine before running. All relative paths used below are resolved against this working directory.
os.chdir('C:\\Users\\rober\\OneDrive\\Documents\\Multilabel-Text-Classification\\Novel Algorithmic Adaptations\\ML-kNN')  
import sys
sys.path.append('../../ThresholdFunctionLearning')    ## Append the ThresholdFunctionLearning directory to the interpreter's
                                                   ## module search path (relative to the working directory set above)
from threshold_learning import predict_test_labels_binary, predict_labels_binary    ## Import the threshold-learning helpers

# ML-kNN Models
## PCA Features

In [5]:
## Load the 'separable' PC features.
## 'np.load' on a .npz archive returns an NpzFile that keeps the underlying file open; the 'with' block
## closes it as soon as the required arrays have been read into memory.
with np.load("../../Data/tfidf_PC_separable.npz") as npzfile:
    X_sepPCs_train = npzfile["X_sepPCs_train"]
    X_sepPCs_test = npzfile["X_sepPCs_test"]

## Load the autoencoder encodings and the labels (both are read from 'tfidf_encoded_data.npz')
with np.load("../../Data/tfidf_encoded_data.npz") as npzfile:
    encoded_train = npzfile["encoded_train"]
    encoded_test = npzfile["encoded_test"]

    ## Load the labels
    Y_train = npzfile["Y_train"]
    Y_test = npzfile["Y_test"]

In [6]:
## Baseline for sparse labels: the fraction of entries of the test label matrix that equal 1.
## A classifier that always predicts 0 gets exactly these entries wrong, so this fraction is its Hamming loss.
## NOTE(review): the '_bpmll' suffix looks like a leftover from another notebook -- confirm before renaming.
positive_cells = (Y_test == 1).sum()
total_cells = Y_test.shape[0] * Y_test.shape[1]
prop_one_bpmll = positive_cells / total_cells
prop_one_bpmll

0.013779397151374627

In [14]:
## Fit the ML-kNN model to the separable PCs dataset and evaluate the train/test set hamming loss 
## (This can take some time -- Can read in predictions from 'MLkNN_based_preds' instead)
classifier = MLkNN(k = 3)
#classifier.fit(X_sepPCs_train, Y_train)
#Y_train_pred = classifier.predict(X_sepPCs_train)
#Y_train_pred_array = Y_train_pred.toarray()
#Y_test_pred = classifier.predict(X_sepPCs_test)
#Y_test_pred_array = Y_test_pred.toarray()

## Read the cached predictions; the 'with' block closes the archive once the arrays are in memory.
## NOTE(review): 'allow_pickle = True' lets numpy unpickle object arrays, which can execute arbitrary code.
##               Only load archives that were produced by this project.
with np.load("MLkNN_based_preds.npz", allow_pickle = True) as npzfile:
    Y_train_pred_array = npzfile["Y_train_pred_array"]
    Y_test_pred_array = npzfile["Y_test_pred_array"]

print(f"The Hamming loss for the training data is {metrics.hamming_loss(Y_train, Y_train_pred_array):.3f}")
print(f"The Hamming loss for the test data is {metrics.hamming_loss(Y_test, Y_test_pred_array):.3f}")

The Hamming loss for the training data is 0.003
The Hamming loss for the test data is 0.005


In [13]:
## (CAUTION: DO NOT OVERWRITE EXISTING FILES) -- Save the ML-kNN based predictions
## The archive is only written when it does not exist yet. Without this guard, re-running the cell
## (e.g. via "Run All", or after 'Y_train_pred_array' / 'Y_test_pred_array' have been rebound by a later cell)
## would silently replace the saved predictions.
outfile = "MLkNN_based_preds.npz"
if not os.path.exists(outfile):
    np.savez_compressed(outfile, Y_train_pred_array = Y_train_pred_array,
                                 Y_test_pred_array = Y_test_pred_array)
else:
    print(f"'{outfile}' already exists -- skipping save.")

In [17]:
%%capture
## Perform cross-validation on a grid search to tune hyperparameters 'k' & 's'
## (This can take a long time -- Load the fit GridSearch object instead)
## NOTE(review): joblib's 'load' unpickles the file, so only load artifacts that were created by this project.
clf = load('clf_MLkNN_gridSearch_object.joblib')
#k_list = list(range(3, 16, 3))
#k_list.insert(0, 1)
#parameters = {'k': k_list, 's': [1.0, 1.5, 2.5, 5.0, 10.0]}  
# The Hamming loss is not offered among the built-in scoring strings, so the Hamming loss function is wrapped in a scorer.
# NOTE(review): 'make_scorer' is called without 'greater_is_better = False', so presumably 'mean_test_score' holds the raw
#               Hamming losses and 'clf.best_params_' would point at the HIGHEST loss. The summary cell further down
#               selects the best setting with 'np.argmin' instead -- confirm before relying on 'best_params_'.
#hamming_scorer = metrics.make_scorer(metrics.hamming_loss)

#clf = GridSearchCV(MLkNN(), parameters, scoring = hamming_scorer, cv = 5, verbose = 1)
#clf.fit(X_sepPCs_train, Y_train)

In [12]:
## (CAUTION: DO NOT OVERWRITE EXISTING FILES) -- Save the kNN based grid search object
## The 'dump' call is kept commented out so that re-running the notebook cannot replace the saved object;
## uncomment it only after the grid search has been re-fit.
outfile = "clf_MLkNN_gridSearch_object.joblib"
#dump(clf, outfile, compress = 3) 

['clf_MLkNN_gridSearch_object.joblib']

In [18]:
## Summarise the grid search: 'mean_test_score' is treated as a loss here, so the best setting is the one
## with the smallest value.
cv_results = clf.cv_results_
best_index = np.argmin(cv_results["mean_test_score"])
best_parameters = cv_results["params"][best_index]

## One row per parameter combination, next to its mean cross-validated score
df_CV = pd.DataFrame({"Params": cv_results["params"],
                      "Mean out-of-bag Hamming loss": cv_results["mean_test_score"]})
display(df_CV)
print(f"Best parameters: {best_parameters}, Best Hamming loss: {np.min(clf.cv_results_['mean_test_score'])}")

# TODO: plot the mean Hamming loss against the hyperparameters.

Unnamed: 0,Params,Mean out-of-bag Hamming loss
0,"{'k': 1, 's': 1.0}",0.005997
1,"{'k': 1, 's': 1.5}",0.005997
2,"{'k': 1, 's': 2.5}",0.00599
3,"{'k': 1, 's': 5.0}",0.005962
4,"{'k': 1, 's': 10.0}",0.005912
5,"{'k': 3, 's': 1.0}",0.004827
6,"{'k': 3, 's': 1.5}",0.00479
7,"{'k': 3, 's': 2.5}",0.004783
8,"{'k': 3, 's': 5.0}",0.004793
9,"{'k': 3, 's': 10.0}",0.004807


Best parameters: {'k': 6, 's': 2.5}, Best Hamming loss: 0.004705294016304963


In [29]:
## Evaluate the model with the best cross-validated parameters: first with the labels returned by 'predict()',
## then with a learned threshold function applied to the predicted probabilities.
classifier_best = MLkNN(k = best_parameters['k'], 
                        s = best_parameters['s'])
#classifier_best.fit(X_sepPCs_train, Y_train)
#Y_train_pred_best_array = classifier_best.predict(X_sepPCs_train).toarray()    # -- These 'predict()' steps can be time costly.
#Y_test_pred_best_array = classifier_best.predict(X_sepPCs_test).toarray()         # Instead, load the predictions and learned 
                                                                                   # threshold function, below.
#Y_train_pred_proba_array = classifier_best.predict_proba(X_sepPCs_train).toarray()
#Y_test_pred_proba_array = classifier_best.predict_proba(X_sepPCs_test).toarray()

## Read the cached predictions; the 'with' block closes the archive once the arrays are in memory.
## NOTE(review): 'allow_pickle = True' (and joblib's 'load' below) unpickle data, which can execute arbitrary code.
##               Only load artifacts that were produced by this project.
with np.load("MLkNN_bestModel_preds.npz", allow_pickle = True) as npzfile:
    Y_train_pred_best_array = npzfile["Y_train_pred_best_array"]
    Y_test_pred_best_array = npzfile["Y_test_pred_best_array"]
    Y_train_pred_proba_array = npzfile["Y_train_pred_proba_array"]
    Y_test_pred_proba_array = npzfile["Y_test_pred_proba_array"]

threshold_function = load("learned_threshold_function.joblib")

## Hamming loss of the labels returned by 'predict()'
best_params_validation_HL = metrics.hamming_loss(Y_test, Y_test_pred_best_array)

print(f"Best parameters: The Hamming loss training data is {metrics.hamming_loss(Y_train, Y_train_pred_best_array) : 0.3f}")
print(f"Best parameters: The Hamming loss test data is {best_params_validation_HL : 0.3f}")

## Only needed when the threshold function is re-learned with the commented-out call below
t_range = (0, 1)

#test_labels_binary, threshold_function = predict_test_labels_binary(Y_train_pred_proba_array, Y_train, Y_test_pred_proba_array, t_range)
## Hamming loss of the labels obtained by applying the learned threshold function to the test probabilities
test_labels_binary = predict_labels_binary(Y_test_pred_proba_array, threshold_function)
best_params_validation_HL_withThreshold = metrics.hamming_loss(Y_test, test_labels_binary)

print(f"Best parameters with threshold function learning: Hamming loss Test set is {best_params_validation_HL_withThreshold : 0.3f}")

Best parameters: The Hamming loss training data is  0.003
Best parameters: The Hamming loss test data is  0.005
Best parameters with threshold function learning: Hamming loss Test set is  0.006


In [23]:
## (CAUTION: DO NOT OVERWRITE EXISTING FILES) -- Save the MLkNN based best model predictions with & without a learned threshold
##                                               and save the learned threshold
## Both save calls are kept commented out so that re-running the notebook cannot replace the saved artifacts;
## as written, this cell only rebinds 'outfile' (twice).
outfile = "MLkNN_bestModel_preds.npz"
#np.savez_compressed(outfile, Y_train_pred_best_array = Y_train_pred_best_array,
#                             Y_test_pred_best_array = Y_test_pred_best_array,
#                             Y_train_pred_proba_array = Y_train_pred_proba_array,
#                             Y_test_pred_proba_array = Y_test_pred_proba_array,
#                             test_labels_binary = test_labels_binary)

outfile = "learned_threshold_function.joblib"
#dump(threshold_function, outfile, compress = 3) 

['learned_threshold_function.joblib']

## ANN Autoencoder Features

In [32]:
## Fit the ML-kNN model to the ANN autoencoder dataset and evaluate the train/test set hamming loss 
## (This can take some time -- Can read in predictions from 'MLkNN_based_preds_encoded' instead)
classifier = MLkNN(k = 3)
#classifier.fit(encoded_train, Y_train)
#Y_train_pred = classifier.predict(encoded_train)
#Y_train_pred_array = Y_train_pred.toarray()
#Y_test_pred = classifier.predict(encoded_test)
#Y_test_pred_array = Y_test_pred.toarray()

## Read the cached predictions; the 'with' block closes the archive once the arrays are in memory.
## NOTE(review): 'allow_pickle = True' lets numpy unpickle object arrays, which can execute arbitrary code.
##               Only load archives that were produced by this project.
with np.load("MLkNN_based_preds_encoded.npz", allow_pickle = True) as npzfile:
    Y_train_pred_array = npzfile["Y_train_pred_array"]
    Y_test_pred_array = npzfile["Y_test_pred_array"]

print(f"The Hamming loss for the training data is {metrics.hamming_loss(Y_train, Y_train_pred_array):.3f}")
print(f"The Hamming loss for the test data is {metrics.hamming_loss(Y_test, Y_test_pred_array):.3f}")

The Hamming loss for the training data is 0.004
The Hamming loss for the test data is 0.007


In [31]:
## (CAUTION: DO NOT OVERWRITE EXISTING FILES) -- Save the ML-kNN based predictions
## The save call is kept commented out so that re-running the notebook cannot replace the saved archive;
## uncomment it only after the predictions have been recomputed.
outfile = "MLkNN_based_preds_encoded.npz"
#np.savez_compressed(outfile, Y_train_pred_array = Y_train_pred_array,
#                             Y_test_pred_array = Y_test_pred_array)

In [36]:
%%capture
## Perform cross-validation on a grid search to tune hyperparameters 'k' & 's'
## (This can take a long time -- Load the fit GridSearch object instead)
## NOTE(review): joblib's 'load' unpickles the file, so only load artifacts that were created by this project.
clf = load('clf_MLkNN_gridSearch_object_encoded.joblib')
#k_list = list(range(3, 16, 3))
#k_list.insert(0, 1)
#parameters = {'k': k_list, 's': [1.0, 1.5, 2.5, 5.0, 10.0]}  
# The Hamming loss is not offered among the built-in scoring strings, so the Hamming loss function is wrapped in a scorer.
# NOTE(review): 'make_scorer' is called without 'greater_is_better = False', so presumably 'mean_test_score' holds the raw
#               Hamming losses and 'clf.best_params_' would point at the HIGHEST loss. The summary cell further down
#               selects the best setting with 'np.argmin' instead -- confirm before relying on 'best_params_'.
#hamming_scorer = metrics.make_scorer(metrics.hamming_loss)

#clf = GridSearchCV(MLkNN(), parameters, scoring = hamming_scorer, cv = 5, verbose = 1)
#clf.fit(encoded_train, Y_train)

In [34]:
## (CAUTION: DO NOT OVERWRITE EXISTING FILES) -- Save the kNN based grid search object
## The 'dump' call is kept commented out so that re-running the notebook cannot replace the saved object;
## uncomment it only after the grid search has been re-fit.
outfile = "clf_MLkNN_gridSearch_object_encoded.joblib"
#dump(clf, outfile, compress = 3) 

['clf_MLkNN_gridSearch_object_encoded.joblib']

In [37]:
## Summarise the grid search: 'mean_test_score' is treated as a loss here, so the best setting is the one
## with the smallest value.
cv_results = clf.cv_results_
best_index = np.argmin(cv_results["mean_test_score"])
best_parameters = cv_results["params"][best_index]

## One row per parameter combination, next to its mean cross-validated score
df_CV = pd.DataFrame({"Params": cv_results["params"],
                      "Mean out-of-bag Hamming loss": cv_results["mean_test_score"]})
display(df_CV)
print(f"Best parameters: {best_parameters}, Best Hamming loss: {np.min(clf.cv_results_['mean_test_score'])}")

# TODO: plot the mean Hamming loss against the hyperparameters.

Unnamed: 0,Params,Mean out-of-bag Hamming loss
0,"{'k': 1, 's': 1.0}",0.00981
1,"{'k': 1, 's': 1.5}",0.00981
2,"{'k': 1, 's': 2.5}",0.009798
3,"{'k': 1, 's': 5.0}",0.009758
4,"{'k': 1, 's': 10.0}",0.009695
5,"{'k': 3, 's': 1.0}",0.007513
6,"{'k': 3, 's': 1.5}",0.007504
7,"{'k': 3, 's': 2.5}",0.007484
8,"{'k': 3, 's': 5.0}",0.007453
9,"{'k': 3, 's': 10.0}",0.00741


Best parameters: {'k': 9, 's': 2.5}, Best Hamming loss: 0.0071007987105862185


In [40]:
## Evaluate the model with the best cross-validated parameters: first with the labels returned by 'predict()',
## then with a learned threshold function applied to the predicted probabilities.
classifier_best = MLkNN(k = best_parameters['k'], 
                        s = best_parameters['s'])
#classifier_best.fit(encoded_train, Y_train)
#Y_train_pred_best_array = classifier_best.predict(encoded_train).toarray()    # -- These 'predict()' steps can be time costly.            
#Y_test_pred_best_array = classifier_best.predict(encoded_test).toarray()   # Instead, load the predictions and learned 
                                                                           # threshold function, below.
#Y_train_pred_proba_array = classifier_best.predict_proba(encoded_train).toarray()
#Y_test_pred_proba_array = classifier_best.predict_proba(encoded_test).toarray()

## Read the cached predictions; the 'with' block closes the archive once the arrays are in memory.
## NOTE(review): 'allow_pickle = True' (and joblib's 'load' below) unpickle data, which can execute arbitrary code.
##               Only load artifacts that were produced by this project.
with np.load("MLkNN_bestModel_preds_encoded.npz", allow_pickle = True) as npzfile:
    Y_train_pred_best_array = npzfile["Y_train_pred_best_array"]
    Y_test_pred_best_array = npzfile["Y_test_pred_best_array"]
    Y_train_pred_proba_array = npzfile["Y_train_pred_proba_array"]
    Y_test_pred_proba_array = npzfile["Y_test_pred_proba_array"]

threshold_function = load("learned_threshold_function_encoded.joblib")

## Hamming loss of the labels returned by 'predict()'
best_params_validation_HL_encoded = metrics.hamming_loss(Y_test, Y_test_pred_best_array)

print(f"Best parameters: The Hamming loss training data is {metrics.hamming_loss(Y_train, Y_train_pred_best_array) : 0.3f}")
print(f"Best parameters: The Hamming loss test data is {best_params_validation_HL_encoded : 0.3f}")

## Only needed when the threshold function is re-learned with the commented-out call below
t_range = (0, 1)

#test_labels_binary, threshold_function = predict_test_labels_binary(Y_train_pred_proba_array, Y_train, Y_test_pred_proba_array, t_range)
## Hamming loss of the labels obtained by applying the learned threshold function to the test probabilities
test_labels_binary = predict_labels_binary(Y_test_pred_proba_array, threshold_function)
best_params_validation_HL_withThreshold_encoded = metrics.hamming_loss(Y_test, test_labels_binary)

print(f"Best parameters with threshold function learning: Hamming loss Test set is {best_params_validation_HL_withThreshold_encoded : 0.3f}")

Best parameters: The Hamming loss training data is  0.006
Best parameters: The Hamming loss test data is  0.007
Best parameters with threshold function learning: Hamming loss Test set is  0.008


In [39]:
## (CAUTION: DO NOT OVERWRITE EXISTING FILES) -- Save the MLkNN based best model predictions with & without a learned threshold
##                                               and save the learned threshold
## Both save calls are kept commented out so that re-running the notebook cannot replace the saved artifacts;
## as written, this cell only rebinds 'outfile' (twice).
outfile = "MLkNN_bestModel_preds_encoded.npz"
#np.savez_compressed(outfile, Y_train_pred_best_array = Y_train_pred_best_array,
#                             Y_test_pred_best_array = Y_test_pred_best_array,
#                             Y_train_pred_proba_array = Y_train_pred_proba_array,
#                             Y_test_pred_proba_array = Y_test_pred_proba_array,
#                             test_labels_binary = test_labels_binary)

outfile = "learned_threshold_function_encoded.joblib"
#dump(threshold_function, outfile, compress = 3) 

['learned_threshold_function_encoded.joblib']

## Results

In [42]:
## (CAUTION: DO NOT OVERWRITE EXISTING FILES) -- Display the validation HL results and save as a .json file
## Rows: thresholding strategy. Columns: feature representation. Values: test-set Hamming loss.
constant_threshold_row = [best_params_validation_HL, best_params_validation_HL_encoded]
learned_threshold_row = [best_params_validation_HL_withThreshold, best_params_validation_HL_withThreshold_encoded]
results_df = pd.DataFrame([constant_threshold_row, learned_threshold_row],
                          index = ['Constant Threshold', 'Learned Threshold'],
                          columns = ['PCA', 'Autoencoder'])
#results_df.to_json('MLkNN_results.json')
results_df

Unnamed: 0,PCA,Autoencoder
Constant Threshold,0.005234,0.007107
Learned Threshold,0.006378,0.007817
