# Creating multi label classification model LookUPY

In [1]:
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import pandas as pd
from datasets import load_dataset
import torch
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction

  from .autonotebook import tqdm as notebook_tqdm


Tokenizer for BERT model

In [2]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

Reading in csv with labeled prompts \
1 means the label applies to the prompt, 0 means it does not

In [3]:
df = pd.read_csv("taggedprompts.csv")
df

Unnamed: 0,text,wherestudent,whereadmin,whereprof,whereperson,wherebuild,whereclassroom,classesprofessor,subjectprofessors,studenttutor,grouptutor,classroomoccupied,groupstudents,groupclasses,groupclassroom,careergroups,careercoord,time
0,Where can I find the student Elizabeth Ramos?,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,How do I locate student Rayner Mathew?,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,I'm trying to find the student Rayner Mathew. ...,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,I'm looking for this student: Emily Davis,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Where is Sandalio Leith currently located with...,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1317,Where is the class of the group Cybersecurity ...,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1318,Can you tell me the classroom location of Cybe...,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1319,Can you tell me the classroom location of Embe...,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1320,What is the classroom assigned to Data 4B?,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


Something the model needs is a way to transform the labels into ids and back \
Let's extract the labels

In [4]:
labels = list(df.columns)
labels.remove("text")
labels

['wherestudent',
 'whereadmin',
 'whereprof',
 'whereperson',
 'wherebuild',
 'whereclassroom',
 'classesprofessor',
 'subjectprofessors',
 'studenttutor',
 'grouptutor',
 'classroomoccupied',
 'groupstudents',
 'groupclasses',
 'groupclassroom',
 'careergroups',
 'careercoord',
 'time']

This dictionary translates ids to labels

In [5]:
id2label = {idx:label for idx,label in enumerate(labels)}
id2label

{0: 'wherestudent',
 1: 'whereadmin',
 2: 'whereprof',
 3: 'whereperson',
 4: 'wherebuild',
 5: 'whereclassroom',
 6: 'classesprofessor',
 7: 'subjectprofessors',
 8: 'studenttutor',
 9: 'grouptutor',
 10: 'classroomoccupied',
 11: 'groupstudents',
 12: 'groupclasses',
 13: 'groupclassroom',
 14: 'careergroups',
 15: 'careercoord',
 16: 'time'}

This one translates labels to ids

In [6]:
label2id = {label:idx for idx, label in enumerate(labels)}
label2id

{'wherestudent': 0,
 'whereadmin': 1,
 'whereprof': 2,
 'whereperson': 3,
 'wherebuild': 4,
 'whereclassroom': 5,
 'classesprofessor': 6,
 'subjectprofessors': 7,
 'studenttutor': 8,
 'grouptutor': 9,
 'classroomoccupied': 10,
 'groupstudents': 11,
 'groupclasses': 12,
 'groupclassroom': 13,
 'careergroups': 14,
 'careercoord': 15,
 'time': 16}

I want to get random samples from my prompts dataset \
but I want them to be balanced depending on their labeling \
that way, the model will learn equally all types of prompts I have an interest on \
Let's create a column that uniquely describes them \
This will allow me to stratify the samples.

In [7]:
# Creating a combined column for stratification
df['stratify_col'] = df['wherestudent'].astype(str) + "_" + df['whereadmin'].astype(str) + "_" + \
                     df['whereprof'].astype(str) + "_" + df['whereperson'].astype(str) + "_" + \
                     df['wherebuild'].astype(str) + "_" + df['whereclassroom'].astype(str) + "_" + \
                     df['classesprofessor'].astype(str) + "_" + df['subjectprofessors'].astype(str) + "_" + \
                     df['studenttutor'].astype(str) + "_" + df['grouptutor'].astype(str) + "_" + \
                     df['classroomoccupied'].astype(str) + "_" + df['groupstudents'].astype(str) + "_" + \
                     df['groupclasses'].astype(str) + "_" + df['groupclassroom'].astype(str) + "_" + \
                     df['careergroups'].astype(str) + "_" + df['careercoord'].astype(str) + "_" + \
                     df['time'].astype(str)

I need 3 divisions of my dataset: train, test and evaluation \
Let's divide them using our stratifying column

In [8]:
train_df, temp_df = train_test_split(df, test_size = 0.4, stratify=df["stratify_col"], random_state = 32)

In [9]:
test_df, val_df = train_test_split(temp_df, test_size = 0.5, stratify=temp_df["stratify_col"], random_state = 32)

In [10]:
train_df.head()

Unnamed: 0,text,wherestudent,whereadmin,whereprof,whereperson,wherebuild,whereclassroom,classesprofessor,subjectprofessors,studenttutor,grouptutor,classroomoccupied,groupstudents,groupclasses,groupclassroom,careergroups,careercoord,time,stratify_col
1059,What is the status of classroom G982 on next w...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0_0_0_0_0_0_0_0_0_0_1_0_0_0_0_0_1
1132,Can you list the groups associated with Cybers...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0_0_0_0_0_0_0_0_0_0_0_0_0_0_1_0_0
1156,Can you tell me the different groups in Data E...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0_0_0_0_0_0_0_0_0_0_0_0_0_0_1_0_0
412,Can you help me find our university's manager ...,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0_1_0_1_0_0_0_0_0_0_0_0_0_0_0_0_0
928,I'm looking for the tutor of the group Robotic...,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0_0_0_0_0_0_0_0_0_1_0_0_0_0_0_0_0


In [11]:
test_df.head()

Unnamed: 0,text,wherestudent,whereadmin,whereprof,whereperson,wherebuild,whereclassroom,classesprofessor,subjectprofessors,studenttutor,grouptutor,classroomoccupied,groupstudents,groupclasses,groupclassroom,careergroups,careercoord,time,stratify_col
1045,Can you tell me if classroom K744 is in use ar...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0_0_0_0_0_0_0_0_0_0_1_0_0_0_0_0_1
774,Can you list the courses taught by Smith?,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0_0_0_0_0_0_1_0_0_0_0_0_0_0_0_0_0
640,"I need to meet with Quinn, where can I find th...",0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0_0_0_1_0_0_0_0_0_0_0_0_0_0_0_0_1
13,Is Sarah Johnson in the library this morning?,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1_0_0_1_0_0_0_0_0_0_0_0_0_0_0_0_1
961,What's the name of the tutor for the Cybersecu...,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0_0_0_0_0_0_0_0_0_1_0_0_0_0_0_0_0


In [12]:
val_df.head()

Unnamed: 0,text,wherestudent,whereadmin,whereprof,whereperson,wherebuild,whereclassroom,classesprofessor,subjectprofessors,studenttutor,grouptutor,classroomoccupied,groupstudents,groupclasses,groupclassroom,careergroups,careercoord,time,stratify_col
1234,I need to know the timetable for the group Dat...,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0_0_0_0_0_0_0_0_0_0_0_0_1_0_0_0_0
436,"I need to locate our secretary David, can you ...",0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0_1_0_1_0_0_0_0_0_0_0_0_0_0_0_0_1
340,Can you help me find student Casey 9:00am?,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1_0_0_1_0_0_0_0_0_0_0_0_0_0_0_0_1
390,"Where's the office of Karen, the director?",0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0_1_0_1_0_0_0_0_0_0_0_0_0_0_0_0_0
648,"I need to meet with Jordan, where can I find t...",0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0_0_0_1_0_0_0_0_0_0_0_0_0_0_0_0_1


We don't need that column anymore

In [13]:
train_df.drop(columns = ["stratify_col"], inplace=True)
val_df.drop(columns = ["stratify_col"], inplace=True)
test_df.drop(columns = ["stratify_col"], inplace=True)

Since I am using HuggingFace transformers, I need to load the dataset using their datasets module for their own format \
Here, I save the samples into csv's, then load them using the datasets module

In [14]:
train_df.to_csv("train.csv", index=False)

In [15]:
val_df.to_csv("val.csv", index=False)
test_df.to_csv("test.csv", index=False)

In [16]:
dataset = load_dataset("csv", data_files={"train": "train.csv", "test": "test.csv", "validation": "val.csv"})

Downloading data files: 100%|██████████| 3/3 [00:00<00:00, 22753.91it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 2252.58it/s]
Generating train split: 793 examples [00:00, 34477.90 examples/s]
Generating test split: 264 examples [00:00, 48984.57 examples/s]
Generating validation split: 265 examples [00:00, 52094.61 examples/s]


In [17]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'wherestudent', 'whereadmin', 'whereprof', 'whereperson', 'wherebuild', 'whereclassroom', 'classesprofessor', 'subjectprofessors', 'studenttutor', 'grouptutor', 'classroomoccupied', 'groupstudents', 'groupclasses', 'groupclassroom', 'careergroups', 'careercoord', 'time'],
        num_rows: 793
    })
    test: Dataset({
        features: ['text', 'wherestudent', 'whereadmin', 'whereprof', 'whereperson', 'wherebuild', 'whereclassroom', 'classesprofessor', 'subjectprofessors', 'studenttutor', 'grouptutor', 'classroomoccupied', 'groupstudents', 'groupclasses', 'groupclassroom', 'careergroups', 'careercoord', 'time'],
        num_rows: 264
    })
    validation: Dataset({
        features: ['text', 'wherestudent', 'whereadmin', 'whereprof', 'whereperson', 'wherebuild', 'whereclassroom', 'classesprofessor', 'subjectprofessors', 'studenttutor', 'grouptutor', 'classroomoccupied', 'groupstudents', 'groupclasses', 'groupclassroom', '

Now I need a function to convert my dataset with the use of the tokenizer \
The model can only be trained using the information the tokenizer returns

In [18]:
def preprocess_data(textgroup):
    text = textgroup["text"]
    encoding = tokenizer(text, padding = "max_length", truncation = True, max_length = 128)
    labels_batch = {k: textgroup[k] for k in textgroup.keys() if k in labels}
    labels_matrix = np.zeros((len(text), len(labels)))
    for idx, label in enumerate(labels):
        labels_matrix[:, idx] = labels_batch[label]

    encoding["labels"] = labels_matrix.tolist()
    return encoding

Mapping the entire dataset with the tokenizer

In [19]:
encoded_dataset = dataset.map(preprocess_data, batched = True, remove_columns=dataset["train"].column_names)

Map: 100%|██████████| 793/793 [00:00<00:00, 3537.49 examples/s]
Map: 100%|██████████| 264/264 [00:00<00:00, 3805.66 examples/s]
Map: 100%|██████████| 265/265 [00:00<00:00, 4246.01 examples/s]


Concept: attention mask \
This is what the tokenizer returns to tell the model exactly what it should focus on \
Tokens are added for padding for mathematical purposes and these do not matter \
The attention mask tells the model what does matter

In [20]:
encoded_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 793
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 264
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 265
    })
})

In [21]:
encoded_dataset["train"][0].keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])

In [22]:
tokenizer.decode(encoded_dataset["train"][0]["input_ids"])

'[CLS] what is the status of classroom g982 on next week? [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

Checking if encoding was correct for the labels

In [23]:
[id2label[idx] for idx, label in enumerate(encoded_dataset["train"][0]['labels']) if label == 1.0]

['classroomoccupied', 'time']

In [24]:
encoded_dataset["train"][0]["labels"]

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0]

We format it using torch. \
This turns our labels into tensors

In [25]:
encoded_dataset.set_format("torch")

In [26]:
encoded_dataset["train"][0]["labels"]

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1.])

Loading BERT model for multi label classification and passing all arguments it needs

In [27]:
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", 
                                                           problem_type = "multi_label_classification", 
                                                           num_labels = len(labels), 
                                                           id2label = id2label,
                                                           label2id = label2id)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batch size represents the amount of data used per iteration \
f1_score is a metric commonly used in these problems in most sources I read

In [28]:
batch_size = 8
metric_name = "f1"

Setting training arguments, most of these I chose due to tutorials and recommendations \
and even after further reading on what they are, they seemed like good picks \
However, in this iteration I had more data than the previous one \
so I saw it fitting to up the number of epochs for better results.

In [29]:
args = TrainingArguments(
    f"bert-upy-questions",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=8,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name
)

While f1 is the deciding metric, it is good to have multiple metrics to have better insight \
We will define a function with multiple recommended metrics

In [30]:
def metrics(predictions, labels, threshold = 0.5):
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    y_true = labels
    f1_micro_average = f1_score(y_true = y_true, y_pred = y_pred, average = "micro")
    roc_auc = roc_auc_score(y_true, y_pred, average = "micro")
    accuracy = accuracy_score(y_true, y_pred)
    metrics = {"f1": f1_micro_average,
               "roc_auc": roc_auc,
               "acc": accuracy}
    return metrics

And another function that calculates them per evaluation

In [31]:
def calculate_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    result = metrics(predictions = preds, labels = p.label_ids)
    return result

Checking everything is in tensor format

In [32]:
encoded_dataset["train"][0]["labels"].type()

'torch.FloatTensor'

In [33]:
encoded_dataset["train"]["input_ids"][0]

tensor([ 101, 2054, 2003, 1996, 3570, 1997, 9823, 1043, 2683, 2620, 2475, 2006,
        2279, 2733, 1029,  102,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0])

Testing base model with an entry to see everything works

In [34]:
outputs = model(input_ids = encoded_dataset["train"]["input_ids"][0].unsqueeze(0), labels = encoded_dataset["train"][0]["labels"].unsqueeze(0))
outputs

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


SequenceClassifierOutput(loss=tensor(0.7623, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>), logits=tensor([[-0.0262,  0.1523,  0.0620, -0.1493, -0.1069, -0.2126,  0.5220,  0.0862,
         -0.1255,  0.6532,  0.2547, -0.2019,  0.3443,  0.0439,  0.6699,  0.0794,
         -0.4032]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

Now, a trainer instance is created using the training arguments previously created \
We pass the dataset divisions accordingly and our metrics function.

In [35]:
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer = tokenizer,
    compute_metrics=calculate_metrics
)

Training begins

In [36]:
trainer.train()

  0%|          | 0/800 [00:00<?, ?it/s]

                                                 
 12%|█▎        | 100/800 [07:29<38:38,  3.31s/it]

{'eval_loss': 0.27231353521347046, 'eval_f1': 0.0, 'eval_roc_auc': 0.5, 'eval_acc': 0.0, 'eval_runtime': 35.4148, 'eval_samples_per_second': 7.483, 'eval_steps_per_second': 0.96, 'epoch': 1.0}


                                                   
 25%|██▌       | 200/800 [15:03<33:19,  3.33s/it]

{'eval_loss': 0.20108956098556519, 'eval_f1': 0.546112115732369, 'eval_roc_auc': 0.6878109452736318, 'eval_acc': 0.09433962264150944, 'eval_runtime': 35.3291, 'eval_samples_per_second': 7.501, 'eval_steps_per_second': 0.962, 'epoch': 2.0}


                                                   
 38%|███▊      | 300/800 [22:43<27:38,  3.32s/it]

{'eval_loss': 0.16297130286693573, 'eval_f1': 0.6089965397923875, 'eval_roc_auc': 0.7189054726368159, 'eval_acc': 0.1471698113207547, 'eval_runtime': 35.3268, 'eval_samples_per_second': 7.501, 'eval_steps_per_second': 0.962, 'epoch': 3.0}


                                                   
 50%|█████     | 400/800 [30:25<26:09,  3.92s/it]

{'eval_loss': 0.1368376463651657, 'eval_f1': 0.6754530477759473, 'eval_roc_auc': 0.7549751243781094, 'eval_acc': 0.25660377358490566, 'eval_runtime': 36.7333, 'eval_samples_per_second': 7.214, 'eval_steps_per_second': 0.926, 'epoch': 4.0}


 62%|██████▎   | 500/800 [37:39<16:44,  3.35s/it]  

{'loss': 0.2265, 'learning_rate': 7.500000000000001e-06, 'epoch': 5.0}


                                                 
 62%|██████▎   | 500/800 [38:16<16:44,  3.35s/it]

{'eval_loss': 0.11812180280685425, 'eval_f1': 0.8053491827637445, 'eval_roc_auc': 0.8370646766169154, 'eval_acc': 0.5056603773584906, 'eval_runtime': 36.7887, 'eval_samples_per_second': 7.203, 'eval_steps_per_second': 0.924, 'epoch': 5.0}


                                                   
 75%|███████▌  | 600/800 [45:58<11:03,  3.32s/it]

{'eval_loss': 0.10685865581035614, 'eval_f1': 0.9090909090909091, 'eval_roc_auc': 0.9166666666666667, 'eval_acc': 0.7471698113207547, 'eval_runtime': 35.6327, 'eval_samples_per_second': 7.437, 'eval_steps_per_second': 0.954, 'epoch': 6.0}


                                                 
 88%|████████▊ | 700/800 [54:00<05:32,  3.32s/it]

{'eval_loss': 0.1001143828034401, 'eval_f1': 0.8940852819807428, 'eval_roc_auc': 0.9042288557213931, 'eval_acc': 0.7094339622641509, 'eval_runtime': 35.0853, 'eval_samples_per_second': 7.553, 'eval_steps_per_second': 0.969, 'epoch': 7.0}


                                                   
100%|██████████| 800/800 [1:01:35<00:00,  3.30s/it]

{'eval_loss': 0.09787110984325409, 'eval_f1': 0.9135135135135135, 'eval_roc_auc': 0.9203980099502487, 'eval_acc': 0.7584905660377359, 'eval_runtime': 35.8301, 'eval_samples_per_second': 7.396, 'eval_steps_per_second': 0.949, 'epoch': 8.0}


100%|██████████| 800/800 [1:01:39<00:00,  4.62s/it]

{'train_runtime': 3699.8822, 'train_samples_per_second': 1.715, 'train_steps_per_second': 0.216, 'train_loss': 0.182926983833313, 'epoch': 8.0}





TrainOutput(global_step=800, training_loss=0.182926983833313, metrics={'train_runtime': 3699.8822, 'train_samples_per_second': 1.715, 'train_steps_per_second': 0.216, 'train_loss': 0.182926983833313, 'epoch': 8.0})

Best model is saved after training \
Let's check its final score

In [37]:
trainer.evaluate()

100%|██████████| 34/34 [00:35<00:00,  1.05s/it]


{'eval_loss': 0.09787110984325409,
 'eval_f1': 0.9135135135135135,
 'eval_roc_auc': 0.9203980099502487,
 'eval_acc': 0.7584905660377359,
 'eval_runtime': 37.0883,
 'eval_samples_per_second': 7.145,
 'eval_steps_per_second': 0.917,
 'epoch': 8.0}

Seems very accurate 

In [50]:
prompt = "I would like to meet the director in the afternoon"
encoding = tokenizer(prompt, return_tensors="pt")
encoding = {k: v.to(trainer.model.device) for k,v in encoding.items()}
outputs = trainer.model(**encoding)

In [51]:
logits = outputs.logits
logits.shape

torch.Size([1, 17])

In [52]:
sigmoid = torch.nn.Sigmoid()
probs = sigmoid(logits.squeeze().cpu())
predictions = np.zeros(probs.shape)
predictions[np.where(probs >= 0.5)] = 1
# turn predicted id's into actual label names
predicted_labels = [id2label[idx] for idx, label in enumerate(predictions) if label == 1.0]
print(predicted_labels)

['whereadmin', 'whereperson', 'time']


After testing multiple prompts, it seems to get it right most of the times \
Let's save our model

In [53]:
trainer.save_model()