# Apply and Test

This script shows how the model can be applied when it has already been trained. It also shows how each target can be classified from the SDGs and an embedding model.

Further this script shows how these classification models can be tested and evaluated on unseen data to get an estiamte of the performance.

In [1]:
import torch
import wandb
import itertools
import sklearn
import logging
import pandas as pd
import numpy as np
from simpletransformers.classification import MultiLabelClassificationModel, MultiLabelClassificationArgs

In [2]:
# see GPU avaialability
cuda_available = torch.cuda.is_available()

In [3]:
# import unseen data 
data = pd.read_csv('OneHot_Combined_cln_utf8.tsv', sep='\t')
sdg_test_data = data[data['source']=='SASDG_Hub']
sdg_lst = ['SDG1','SDG2','SDG3','SDG4','SDG5','SDG6','SDG7','SDG8','SDG9','SDG10','SDG11','SDG12','SDG13','SDG14','SDG15','SDG16','SDG17']
sdg_test_data['sdg_onehot'] = sdg_test_data[sdg_lst].values.tolist()
sdg_test_data.reset_index(drop=True, inplace=True)

target_test_data = data[data['target'].notna()][['abstract', 'target']].reset_index(drop=True)
target_test_text_lst = target_test_data['abstract'].tolist()

# import target data
target_df = pd.read_csv('Targets.csv', sep=';')
targets_lst = target_df['target'].unique().tolist()
sdg_lst = list(range(1,18))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sdg_test_data['sdg_onehot'] = sdg_test_data[sdg_lst].values.tolist()


In [4]:
# define data here
eval_df = sdg_test_data[['abstract', 'sdg_onehot']]
eval_df.columns = ['text', 'labels']
eval_df.reset_index(inplace=True, drop=True)

# # use the threshold to adjust classifications (ROC curve?)
# threshold_val=0.1

# # Optional model configuration (all parameters defind in sweep config in parameter optimisation are now stated here)
# model_args = MultiLabelClassificationArgs(use_multiprocessing = True,
#                                           threshold=threshold_val, # see value above
#                                           wandb_project = 'sasdghub_ml_classify',
#                                           wandb_kwargs={
#                                               'entity':'sasdghub'
#                                                        }
#                                          )

# # import model from path (this path is the directory with all the model files)
# sdg_model = MultiLabelClassificationModel(
#         "xlnet",
#         "outputs/best_model/",  
#         num_labels=17,
#         use_cuda=cuda_available,
#         args=model_args,
#         )

# create functions for additional evaluation outputs
def sdg_acc_result(true, pred):
    pred=(pred>=wandb.config['threshold']).astype(int)
    acc_sum = 0
    for i in range(true.shape[0]):
        acc_sum += sklearn.metrics.accuracy_score(true[i], pred[i])    
    acc = acc_sum/true.shape[0]
    return acc

def sdg_f1_macro_result(true, pred):
    pred=(pred>=wandb.config['threshold']).astype(int)
    f1 = sklearn.metrics.f1_score(true, pred, average='samples')
    return f1

def sdg_cm_wandb_result(true, pred):
    pred=(pred>=wandb.config['threshold']).astype(int)
    # modify labels and fill all combinations to use wand multiclass confusion matrix visually
    d=true-pred
    t_d = (d==1)
    p_d = (d==-1)
    n_d = (d==0)
    idx_ar = np.array(range(0,true.shape[1]))
    idx = np.tile(idx_ar,true.shape[0]).reshape(true.shape[0],true.shape[1])
    n_labels = idx[n_d]
    t_lst = []
    for row in t_d:
        if row.sum()==0:
            t_lst.append(idx_ar)
        else:
            t_lst.append(idx_ar[row])
    p_lst = []
    for row in p_d:
        if row.sum()==0:
            p_lst.append(idx_ar)
        else:
            p_lst.append(idx_ar[row])
    for i in range(len(t_lst)):
        fill_ar = np.array(list(itertools.product(p_lst[i], t_lst[i])))
        t_labels = np.append(n_labels,fill_ar[:,0].tolist())
        p_labels = np.append(n_labels,fill_ar[:,1].tolist())
    wandb_cm = wandb.plot.confusion_matrix(probs=None, y_true=t_labels, preds=p_labels, class_names=sdg_lst)
    return wandb_cm

def sdg_cm_result(true, pred):
    pred=(pred>=wandb.config['threshold']).astype(int)
    cm = sklearn.metrics.multilabel_confusion_matrix(true, pred)
    return cm

def sdg_cm_avg_result(true, pred):
    pred=(pred>=wandb.config['threshold']).astype(int)
    cm = sklearn.metrics.multilabel_confusion_matrix(true, pred)
    cm_avg = cm.sum(axis=0)/true.shape[1]
    return cm_avg

# # run sdg classification test
# wandb.init()

# # model evaluation
# result, model_outputs, wrong_predictions = sdg_model.eval_model(
#     verbose=True,
#     eval_df=eval_df,
#     accuracy=sdg_acc_result,
#     f1_macro=sdg_f1_macro_result,
#     cm=sdg_cm_result,
#     cm_avg=sdg_cm_avg_result,
#     wandb_cm=sdg_cm_wandb_result
#     )

In [5]:
# use optimal parameters for function here
# preferably use precalculated target_embedding_reduced_path
sweep_config = {
    "name" : "sdg_test",
    "method": "random",
    "parameters": {
        "threshold": {"value": 0.1}, # use the threshold vlaue specified during training to make sure to get 
    },
}

# define the project and entity under which the outputs will be recorded in wandb
sweep_id = wandb.sweep(sweep_config, entity='sasdghub', project="sasdghub_ml_classify")

# Set logging
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# define the training function
def train():
    
    # Initialize a new wandb run 
    wandb.init()

    # Create a MultiLabelClassificationModel
    sdg_model = MultiLabelClassificationModel(
            "xlnet",
            "outputs/best_model/",  
            num_labels=17,
            use_cuda=cuda_available,
            sweep_config=wandb.config
            )
    
    # Evaluate the model
    result, model_outputs, wrong_predictions = sdg_model.eval_model(
        verbose=True,
        eval_df=eval_df,
        accuracy=sdg_acc_result,
        f1_macro=sdg_f1_macro_result,
        cm=sdg_cm_result,
        cm_avg=sdg_cm_avg_result,
        wandb_cm=sdg_cm_wandb_result,
        )
    
    # Sync wandb
    wandb.join()

# run the sweep and record results in wandb    
wandb.agent(sweep_id, train,count=1)

INFO:wandb.agents.pyagent:Starting sweep agent: entity=None, project=None, count=1


Create sweep with ID: pu5z05rt
Sweep URL: https://wandb.ai/sasdghub/sasdghub_ml_classify/sweeps/pu5z05rt


[34m[1mwandb[0m: Agent Starting Run: 44y0b1dl with config:
[34m[1mwandb[0m: 	threshold: 0.1
[34m[1mwandb[0m: Currently logged in as: [33mchristopher-marais[0m ([33msasdghub[0m). Use [1m`wandb login --relogin`[0m to force relogin


INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/225 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_xlnet_128_0_2


Running Evaluation:   0%|          | 0/12 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model:{'LRAP': 0.7024627409627408, 'accuracy': 0.5385620915032678, 'f1_macro': 0.23198238614956884, 'cm': array([[[115,  96],
        [  4,  10]],

       [[ 78, 127],
        [  0,  20]],

       [[ 71,  76],
        [  0,  78]],

       [[ 92, 101],
        [  1,  31]],

       [[ 86, 132],
        [  0,   7]],

       [[ 70, 142],
        [  1,  12]],

       [[136,  75],
        [  0,  14]],

       [[115,  86],
        [  1,  23]],

       [[142,  71],
        [  5,   7]],

       [[ 52, 170],
        [  0,   3]],

       [[105,  89],
        [  3,  28]],

       [[145,  77],
        [  1,   2]],

       [[156,  65],
        [  0,   4]],

       [[152,  73],
        [  0,   0]],

       [[146,  73],
        [  0,   6]],

       [[ 82, 140],
        [  0,   3]],

       [[ 69, 156],
        [  0,   0]]], dtype=int64), 'cm_avg': array([[106.58823529, 102.88235294],
       [  0.94117647,  14.58823529]]), 'wandb_cm': <wandb.viz.Cus

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

In [6]:
# function to apply model to text
def classify_sdg(text_lst):
    # see GPU avaialability
    cuda_available = torch.cuda.is_available()
    
    # import model from path (this path is the directory with all the model files)
    sdg_model = MultiLabelClassificationModel(
            "xlnet",
            "outputs/best_model/",  
            num_labels=17,
            use_cuda=cuda_available,
            )
    predictions, raw_outputs = sdg_model.predict(text_lst)
    return(predictions, raw_outputs)

In [7]:
# make target predictions
# target prediction requires SDG classification to already be done
# function to classify targets
def classify_sdg_target(text_lst,
                        sdg_predictions,
                        sdg_raw_outputs,
                        target_data_path='Targets.csv',
                        run_isomap=True, # run faster for multiple samples otherwise (target_data_path required when this is False) (do not use when only one sample)
                        target_embedding_reduced_path=None, # load a previously calculated and reduced embedding for the targets 'outputs/targets_embedded_reduced_gpt2_2D.csv'
                        isomap_dims = 2,
                        isomap_neigbors = 5, # has to be <= len(text_lst) a.k.a n_samples
                        pre_trained_model_type='gpt2', 
                        pre_trained_model_name='gpt2',
                        target_threshold_val=0.5,
                        ):
    
    

    # define and load model from hugging face
    model_args = ModelArgs(max_seq_length=1024)
    # model import
    model = RepresentationModel(
        pre_trained_model_type,
        pre_trained_model_name, #gpt2 , gpt2-large
        args=model_args,
    )    
    
    # get embeddings of text
    word_embeddings = model.encode_sentences(text_lst, combine_strategy="mean")
    
    
    # ISOMAP
    if run_isomap==True: 
        # reduce isomap_neigbors to fit the number of samples
        n_samples = len(text_lst)
        if n_samples < isomap_neigbors:
            isomap_neigbors = np.max([1,n_samples-1])
            print('Reduced isomap_n_neigbors to: ', isomap_neigbors)

        # reduce dimensions of embeddings to 2 (can be reduced to higher dimensions)
        isomap = Isomap(n_components=isomap_dims, n_neighbors=isomap_neigbors-1) # input is an array with samples x features
        word_embeddings_transformed = isomap.fit_transform(word_embeddings)

        if target_embedding_reduced_path == None:
            # load pre-calculated embeddings
            target_df = pd.read_csv(target_data_path, sep=';')
            # get sentence list from target data
            target_sentence_list = target_df['text'].tolist()
            # get embeddings of targets
            target_embeddings = model.encode_sentences(target_sentence_list, combine_strategy="mean")
            target_embeddings_transformed = isomap.fit_transform(target_embeddings)

            # add labels to reduced embeddings
            target_trans_df = pd.DataFrame(target_embeddings_transformed)
            target_trans_df['target'] = target_df['target']
            target_trans_df['sdg'] = target_df['sdg']
            target_trans_df.to_csv('outputs/targets_embedded_reduced_'+pre_trained_model_name+'_'+str(isomap_dims)+'D.csv', index=False)

            # define source and target for KNN
            Y = target_trans_df
            X = pd.DataFrame(word_embeddings_transformed)
            idx_ar = np.array(range(0,17), np.int64)
            sdg_label_lst = []
            for row in (np.array(sdg_predictions)==1):
                sdg_label_lst.append(idx_ar[row])
            X['sdg'] = sdg_label_lst
            sdg_prob_lst = []
            for row in sdg_raw_outputs:
                sdg_prob_lst.append(row)
            X['sdg_probability'] = sdg_prob_lst
            
        else:
            # import reduced embedding of targets
            target_trans_df = pd.read_csv(target_embedding_reduced_path, sep=',')
            
            # define source and target for KNN
            Y = target_trans_df
            X = pd.DataFrame(word_embeddings_transformed)
            idx_ar = np.array(range(0,17), np.int64)
            sdg_label_lst = []
            for row in (np.array(sdg_predictions)==1):
                sdg_label_lst.append(idx_ar[row])
            X['sdg'] = sdg_label_lst
            sdg_prob_lst = []
            for row in sdg_raw_outputs:
                sdg_prob_lst.append(row)
            X['sdg_probability'] = sdg_prob_lst

        # plot embeddings if they are 2D
        if isomap_dims ==2:
            trans_df = pd.DataFrame(word_embeddings_transformed)
            trans_df['target'] = target_df['target']
            trans_df['sdg'] = target_df['sdg']
            trans_df.plot.scatter(0,1,c='sdg', colormap='viridis') # colour by sdg
            plt.title('Isomap 2D plot of text embedding')
            plt.show()
    
    else:
        # load pre-calculated embeddings
        target_df = pd.read_csv(target_data_path, sep=';')
        # get sentence list from target data
        target_sentence_list = target_df['text'].tolist()
        # get embeddings of targets
        target_embeddings = model.encode_sentences(target_sentence_list, combine_strategy="mean")
        
        # define source and target for KNN
        Y = pd.DataFrame(target_embeddings)
        Y['target'] = target_df['target']
        Y['sdg'] = target_df['sdg']
        X = pd.DataFrame(word_embeddings)
        idx_ar = np.array(range(0,17), np.int64)
        sdg_label_lst = []
        for row in (np.array(sdg_predictions)==1):
            sdg_label_lst.append(idx_ar[row])
        X['sdg'] = sdg_label_lst
        sdg_prob_lst = []
        for row in sdg_raw_outputs:
            sdg_prob_lst.append(row)
        X['sdg_probability'] = sdg_prob_lst
    
    # use cosine similarity Kmeans variant to classify targets
    # define final results table
    results_df = pd.DataFrame()
    results_df['text'] = text_lst
    results_df['sdg'] = X['sdg']
    results_df['sdg_probability'] = X['sdg_probability']
    # calculate pairwise cosine similarity between targets and text list
    similarity_ar = cosine_similarity(X.loc[:, ~X.columns.isin(['sdg', 'sdg_probability'])], 
                                  Y.loc[:, ~Y.columns.isin(['sdg', 'target'])],
                                 )
    results_df['target_similarity'] = similarity_ar.tolist()
    # select targets on distance sdg and threshold
    targets_ar = np.array(Y['target'])
    targets_full_ar = np.tile(targets_ar, (len(text_lst), 1)) 
    sdg_ar = np.array(Y['sdg'])
    sdg_full_ar = np.tile(sdg_ar, (len(text_lst), 1))
    sdg_select_ar = np.array(X['sdg'])
    
    # select classified SDGs
    sdg_onehot_lst = []
    for i in range(len(sdg_full_ar)):
        isin_ar = np.isin(sdg_full_ar[i], sdg_select_ar[i])
        sdg_onehot_lst.append(isin_ar)
    sdg_onehot_ar = np.vstack(sdg_onehot_lst)
    target_onehot_ar = (similarity_ar>=target_threshold_val)*sdg_onehot_ar
    target_label_lst = (targets_full_ar*target_onehot_ar).tolist()
    results_df['target'] = [[ele for ele in sub if ele != ''] for sub in target_label_lst]
    
    return results_df

In [8]:
# evaluation metrics for targets
# create functions for additional evaluation outputs
def acc_result(true, pred, class_lst):
    one_hot = MultiLabelBinarizer(classes=class_lst)
    pred=one_hot.fit_transform(pred)
    true=one_hot.fit_transform(true)
    acc_sum = 0
    for i in range(true.shape[0]):
        acc_sum += sklearn.metrics.accuracy_score(true[i], pred[i])    
    acc = acc_sum/true.shape[0]
    return acc

def f1_macro_result(true, pred, class_lst):
    one_hot = MultiLabelBinarizer(classes=class_lst)
    pred=one_hot.fit_transform(pred)
    true=one_hot.fit_transform(true)
    f1 = sklearn.metrics.f1_score(true, pred, average='samples')
    return f1

def cm_wandb_result(true, pred, class_lst):
    one_hot = MultiLabelBinarizer(classes=class_lst)
    pred=one_hot.fit_transform(pred)
    true=one_hot.fit_transform(true)
    # modify labels and fill all combinations to use wand multiclass confusion matrix visually
    d=true-pred
    t_d = (d==1)
    p_d = (d==-1)
    n_d = (d==0)
    idx_ar = np.array(range(0,true.shape[1]))
    idx = np.tile(idx_ar,true.shape[0]).reshape(true.shape[0],true.shape[1])
    n_labels = idx[n_d]
    t_lst = []
    for row in t_d:
        if row.sum()==0:
            t_lst.append(idx_ar)
        else:
            t_lst.append(idx_ar[row])
    p_lst = []
    for row in p_d:
        if row.sum()==0:
            p_lst.append(idx_ar)
        else:
            p_lst.append(idx_ar[row])
    for i in range(len(t_lst)):
        fill_ar = np.array(list(itertools.product(p_lst[i], t_lst[i])))
        t_labels = np.append(n_labels,fill_ar[:,0].tolist())
        p_labels = np.append(n_labels,fill_ar[:,1].tolist())
    wandb_cm = wandb.plot.confusion_matrix(probs=None, y_true=t_labels, preds=p_labels, class_names=class_lst)
    return wandb_cm

def cm_result(true, pred, class_lst):
    one_hot = MultiLabelBinarizer(classes=class_lst)
    pred=one_hot.fit_transform(pred)
    true=one_hot.fit_transform(true)
    cm = sklearn.metrics.multilabel_confusion_matrix(true, pred)
    return cm

def cm_avg_result(true, pred, class_lst):
    one_hot = MultiLabelBinarizer(classes=class_lst)
    pred=one_hot.fit_transform(pred)
    true=one_hot.fit_transform(true)
    cm = sklearn.metrics.multilabel_confusion_matrix(true, pred)
    cm_avg = cm.sum(axis=0)/true.shape[1]
    return cm_avg

In [None]:
# define data here
val_data = target_test_data

# use optimal parameters for function here
# preferably use precalculated target_embedding_reduced_path
sweep_config = {
    "name" : "sdg_target_test",
    "method": "random",  # bayes, grid, random
    "parameters": {
        "run_isomap": {"value": True},
        "isomap_dims": {"value": 2},
        "isomap_neigbors": {"value": 2},
        "target_threshold_val": {"value": 0.7},
    },
}

# define the project and entity under which the outputs will be recorded in wandb
sweep_id = wandb.sweep(sweep_config, entity='sasdghub', project="sasdghub_ml_classify")

# Set logging
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# define the training function
def train():
    
    # Initialize a new wandb run 
    wandb.init()
    
    # classify SDGs as input to classify targets
    predictions, raw_outputs = classify_sdg(target_test_text_lst)
    # use optimal parameters for function here
    # preferably use precalculated target_embedding_reduced_path
    results_df = classify_sdg_target(text_lst=target_test_text_lst,
                                     sdg_predictions=predictions,
                                     sdg_raw_outputs=raw_outputs,
                                     target_data_path='Targets.csv',
                                     run_isomap=wandb.config.run_isomap, # run faster for multiple samples otherwise (target_data_path required when this is False) (do not use when only one sample)
                                     target_embedding_reduced_path=None, # load a previously calculated and reduced embedding for the targets 'outputs/targets_embedded_reduced_gpt2_2D.csv'
                                     isomap_dims = wandb.config.isomap_dims,
                                     isomap_neigbors = wandb.config.isomap_neigbors,
                                     pre_trained_model_type='gpt2', 
                                     pre_trained_model_name='gpt2',
                                     target_threshold_val=wandb.config.target_threshold_val,)

    accuracy = acc_result(true=val_data['target'], pred=results_df['target'])
    f1_macro = f1_macro_result(true=val_data['target'], pred=results_df['target'])
    cm = cm_result(true=val_data['target'], pred=results_df['target'])
    cm_avg = cm_avg_result(true=val_data['target'], pred=results_df['target'])
    # cm_wandb = cm_wandb_result(true=val_data['target'], pred=results_df['target']) #very hard to see the confusion matrix
    
    print("accuracy", accuracy)
    print("f1_macro", f1_macro)
    
    wandb.log({"accuracy": accuracy, 
               "f1_macro": f1_macro,
               "confusion_matrix": cm,
               "confusion_matrix_average": cm_avg,
               "confusion_matrix_wandb": cm_wandb,
               "run_isomap":wandb.config.run_isomap,
               "isomap_dims":wandb.config.isomap_dims,
               "isomap_neigbors":wandb.config.isomap_neigbors,
               "target_threshold_val":wandb.config.target_threshold_val
              })
    
    # Sync wandb
    wandb.join()

# run the sweep and record results in wandb    
wandb.agent(sweep_id, train, count=1)

Create sweep with ID: 5rstiqj7
Sweep URL: https://wandb.ai/sasdghub/sasdghub_ml_classify/sweeps/5rstiqj7


[34m[1mwandb[0m: Agent Starting Run: dyjsme4v with config:
[34m[1mwandb[0m: 	isomap_dims: 2
[34m[1mwandb[0m: 	isomap_neigbors: 2
[34m[1mwandb[0m: 	run_isomap: True
[34m[1mwandb[0m: 	target_threshold_val: 0.7


INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/88595 [00:00<?, ?it/s]

In [None]:
# copy the method for targets and use for the SDGs OR
# enlarge the evaluation batch size to maximum with wandb/simple trasnfoemrers