In [30]:
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import pandas as pd
from datasets import load_dataset
import torch
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction

In [2]:
# Tokenizer for the "bert-base-uncased" checkpoint, the same checkpoint the
# classification model is loaded from further down.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


In [3]:
# Load the tagged prompts: a "text" column plus one 0/1 column per label.
df = pd.read_csv("taggedprompts.csv")
df

Unnamed: 0,text,wherestudent,whereadmin,whereprof,whereperson,wherebuild,whereclassroom,classesprofessor,subjectprofessors,studenttutor,grouptutor,classroomoccupied,groupstudents,groupclasses,groupclassroom,careergroups,careercoord,time
0,Where can I find the student Elizabeth Ramos?,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,How do I locate student Rayner Mathew?,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,I'm trying to find the student Rayner Mathew. ...,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,I'm looking for this student: Emily Davis,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Where is Sandalio Leith currently located with...,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
267,Where does Cybersecurity 1E have their sessions?,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
268,I'm looking for the room assignment of Data 3J.,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
269,Identify the class location for Immersion 6F.,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
270,Which classroom is being used by Robotics 4K?,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


In [4]:
# Label names are every column except the input text. The helper
# "stratify_col" column (added to df further down for the stratified split)
# is excluded as well, so re-running this cell after that point still yields
# the same label list instead of silently gaining a non-numeric "label".
labels = [column for column in df.columns if column not in ("text", "stratify_col")]
labels

['wherestudent',
 'whereadmin',
 'whereprof',
 'whereperson',
 'wherebuild',
 'whereclassroom',
 'classesprofessor',
 'subjectprofessors',
 'studenttutor',
 'grouptutor',
 'classroomoccupied',
 'groupstudents',
 'groupclasses',
 'groupclassroom',
 'careergroups',
 'careercoord',
 'time']

In [5]:
# Map each integer class index to its label name (position in `labels`).
id2label = dict(enumerate(labels))
id2label

{0: 'wherestudent',
 1: 'whereadmin',
 2: 'whereprof',
 3: 'whereperson',
 4: 'wherebuild',
 5: 'whereclassroom',
 6: 'classesprofessor',
 7: 'subjectprofessors',
 8: 'studenttutor',
 9: 'grouptutor',
 10: 'classroomoccupied',
 11: 'groupstudents',
 12: 'groupclasses',
 13: 'groupclassroom',
 14: 'careergroups',
 15: 'careercoord',
 16: 'time'}

In [6]:
# Inverse lookup: label name -> integer class index (position in `labels`).
label2id = dict(zip(labels, range(len(labels))))
label2id

{'wherestudent': 0,
 'whereadmin': 1,
 'whereprof': 2,
 'whereperson': 3,
 'wherebuild': 4,
 'whereclassroom': 5,
 'classesprofessor': 6,
 'subjectprofessors': 7,
 'studenttutor': 8,
 'grouptutor': 9,
 'classroomoccupied': 10,
 'groupstudents': 11,
 'groupclasses': 12,
 'groupclassroom': 13,
 'careergroups': 14,
 'careercoord': 15,
 'time': 16}

In [7]:
# Build one stratification key per row: the label flags rendered as strings
# and joined with "_" (e.g. "0_0_0_0_0_0_0_0_0_0_1_0_0_0_0_0_1"), so every
# distinct label combination becomes its own stratum for the split.
df['stratify_col'] = df[
    [
        'wherestudent', 'whereadmin', 'whereprof', 'whereperson',
        'wherebuild', 'whereclassroom', 'classesprofessor', 'subjectprofessors',
        'studenttutor', 'grouptutor', 'classroomoccupied', 'groupstudents',
        'groupclasses', 'groupclassroom', 'careergroups', 'careercoord',
        'time',
    ]
].astype(str).agg("_".join, axis=1)

In [8]:
# Keep 60% of the rows for training and hold out 40%, stratified on the
# combined label key; the fixed random_state makes the split repeatable.
train_df, temp_df = train_test_split(df, test_size = 0.4, stratify=df["stratify_col"], random_state = 31)

In [9]:
# Split the 40% hold-out in half (again stratified) into test and validation sets.
test_df, val_df = train_test_split(temp_df, test_size = 0.5, stratify=temp_df["stratify_col"], random_state = 31)

In [10]:
# Preview the training split; the helper stratify_col column is still attached here.
train_df.head()

Unnamed: 0,text,wherestudent,whereadmin,whereprof,whereperson,wherebuild,whereclassroom,classesprofessor,subjectprofessors,studenttutor,grouptutor,classroomoccupied,groupstudents,groupclasses,groupclassroom,careergroups,careercoord,time,stratify_col
201,Can you find out if classroom H809 is being us...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0_0_0_0_0_0_0_0_0_0_1_0_0_0_0_0_1
195,Can you tell me if classroom B203 is in use th...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0_0_0_0_0_0_0_0_0_0_1_0_0_0_0_0_1
214,I want to know the members of the Embedded 2H ...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0_0_0_0_0_0_0_0_0_0_0_1_0_0_0_0_0
137,What is the current teaching timetable for Kev...,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0_0_0_0_0_0_1_0_0_0_0_0_0_0_0_0_0
153,I'm interested in the faculty teaching Computa...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0_0_0_0_0_0_0_1_0_0_0_0_0_0_0_0_0


In [11]:
# Preview the test split.
test_df.head()

Unnamed: 0,text,wherestudent,whereadmin,whereprof,whereperson,wherebuild,whereclassroom,classesprofessor,subjectprofessors,studenttutor,grouptutor,classroomoccupied,groupstudents,groupclasses,groupclassroom,careergroups,careercoord,time,stratify_col
173,I'm looking for the tutor of the Embedded 5E g...,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0_0_0_0_0_0_0_0_0_1_0_0_0_0_0_0_0
215,What are the names of students in the Cybersec...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0_0_0_0_0_0_0_0_0_0_0_1_0_0_0_0_0
249,What courses are on the timetable for Embedded...,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0_0_0_0_0_0_0_0_0_0_0_0_1_0_0_0_0
67,"I need to speak with Emily Clark, the math tea...",0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0_0_1_1_0_0_0_0_0_0_0_0_0_0_0_0_1
164,Who is the academic tutor for Chris Lee this y...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0_0_0_0_0_0_0_0_1_0_0_0_0_0_0_0_0


In [12]:
# Preview the validation split.
val_df.head()

Unnamed: 0,text,wherestudent,whereadmin,whereprof,whereperson,wherebuild,whereclassroom,classesprofessor,subjectprofessors,studenttutor,grouptutor,classroomoccupied,groupstudents,groupclasses,groupclassroom,careergroups,careercoord,time,stratify_col
255,Need information on the subjects taken by Data...,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0_0_0_0_0_0_0_0_0_0_0_0_1_0_0_0_0
154,Who are the professors involved in Network Sec...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0_0_0_0_0_0_0_1_0_0_0_0_0_0_0_0_0
39,"Is the finance manager, Alex Smith, in his off...",0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0_1_0_1_0_0_0_0_0_0_0_0_0_0_0_0_1
68,"Where is David Martinez, our chemistry profess...",0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0_0_1_1_0_0_0_0_0_0_0_0_0_0_0_0_1
196,Will classroom C304 be occupied on Monday at 9...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0_0_0_0_0_0_0_0_0_0_1_0_0_0_0_0_1


In [13]:
# Remove the helper stratification key from every split before the splits are
# written to CSV. Re-binding the names (rather than inplace=True) avoids
# mutating the frames returned by train_test_split, and errors="ignore" lets
# this cell be re-run without a KeyError once the column is already gone.
train_df = train_df.drop(columns=["stratify_col"], errors="ignore")
val_df = val_df.drop(columns=["stratify_col"], errors="ignore")
test_df = test_df.drop(columns=["stratify_col"], errors="ignore")

In [14]:
# Write the training split to CSV without the DataFrame index.
train_df.to_csv("train.csv", index=False)

In [15]:
# Write the validation and test splits to CSV without the DataFrame index.
val_df.to_csv("val.csv", index=False)
test_df.to_csv("test.csv", index=False)

In [16]:
# Load the three CSV files as one DatasetDict with train/test/validation splits.
dataset = load_dataset("csv", data_files={"train": "train.csv", "test": "test.csv", "validation": "val.csv"})

Downloading data files: 100%|██████████| 3/3 [00:00<00:00, 7864.32it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 1608.86it/s]
Generating train split: 163 examples [00:00, 11918.54 examples/s]
Generating test split: 54 examples [00:00, 6368.23 examples/s]
Generating validation split: 0 examples [00:00, ? examples/s]

Generating validation split: 55 examples [00:00, 7709.34 examples/s]


In [17]:
# Show the split names, feature columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'wherestudent', 'whereadmin', 'whereprof', 'whereperson', 'wherebuild', 'whereclassroom', 'classesprofessor', 'subjectprofessors', 'studenttutor', 'grouptutor', 'classroomoccupied', 'groupstudents', 'groupclasses', 'groupclassroom', 'careergroups', 'careercoord', 'time'],
        num_rows: 163
    })
    test: Dataset({
        features: ['text', 'wherestudent', 'whereadmin', 'whereprof', 'whereperson', 'wherebuild', 'whereclassroom', 'classesprofessor', 'subjectprofessors', 'studenttutor', 'grouptutor', 'classroomoccupied', 'groupstudents', 'groupclasses', 'groupclassroom', 'careergroups', 'careercoord', 'time'],
        num_rows: 54
    })
    validation: Dataset({
        features: ['text', 'wherestudent', 'whereadmin', 'whereprof', 'whereperson', 'wherebuild', 'whereclassroom', 'classesprofessor', 'subjectprofessors', 'studenttutor', 'grouptutor', 'classroomoccupied', 'groupstudents', 'groupclasses', 'groupclassroom', 'c

In [18]:
def preprocess_data(textgroup):
    """Tokenize a batch of prompts and attach their multi-hot label vectors.

    Args:
        textgroup: Batch mapping with a "text" entry (list of strings) and one
            entry per name in the module-level `labels` list, each holding
            that label's flag for every example in the batch.

    Returns:
        The tokenizer output (padded/truncated to 128 tokens) with an extra
        "labels" entry: one list of floats per example, ordered like `labels`.
    """
    sentences = textgroup["text"]
    encoding = tokenizer(sentences, padding = "max_length", truncation = True, max_length = 128)

    # Stack one row per label, then transpose to (n_examples, n_labels).
    per_label_rows = [textgroup[name] for name in labels]
    label_matrix = np.array(per_label_rows, dtype=float).T

    encoding["labels"] = label_matrix.tolist()
    return encoding

In [19]:
# Tokenize every split in batches; the original text and per-label columns are
# removed so only the tokenizer outputs and the "labels" vector remain.
encoded_dataset = dataset.map(preprocess_data, batched = True, remove_columns=dataset["train"].column_names)

Map: 100%|██████████| 163/163 [00:00<00:00, 2864.11 examples/s]
Map: 100%|██████████| 54/54 [00:00<00:00, 1305.10 examples/s]
Map: 100%|██████████| 55/55 [00:00<00:00, 2604.04 examples/s]


In [20]:
# Show the encoded splits and their features.
encoded_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 163
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 54
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 55
    })
})

In [21]:
# List the fields stored for a single encoded training example.
encoded_dataset["train"][0].keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])

In [22]:
# Decode the first training example back to text to sanity-check tokenization and padding.
tokenizer.decode(encoded_dataset["train"][0]["input_ids"])

'[CLS] can you find out if classroom h809 is being used this thursday at 11 : 00 am? [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

In [23]:
# Translate the first training example's multi-hot vector back into label names.
[
    id2label[position]
    for position, flag in enumerate(encoded_dataset["train"][0]['labels'])
    if flag == 1.0
]

['classroomoccupied', 'time']

In [24]:
# Inspect the label vector of the first training example (a plain list of floats at this point).
encoded_dataset["train"][0]["labels"]

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0]

In [25]:
# Have the dataset return torch tensors when examples are accessed.
encoded_dataset.set_format("torch")

In [26]:
# The same label vector, now returned as a torch tensor.
encoded_dataset["train"][0]["labels"]

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1.])

In [27]:
# Load the pretrained "bert-base-uncased" weights with a newly initialised
# classification head sized to the label set, configured for multi-label
# classification and carrying the index <-> name mappings in its config.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    problem_type="multi_label_classification",
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
# Per-device batch size, used for both training and evaluation.
batch_size = 8
# Name of the metric used to pick the best checkpoint (passed as metric_for_best_model).
metric_name = "f1"

In [29]:
# Fine-tuning configuration: evaluate and save a checkpoint once per epoch,
# then reload the checkpoint with the best `metric_name` score when training
# finishes.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version before upgrading.
args = TrainingArguments(
    output_dir="bert-upy-questions",  # plain string; there is nothing to interpolate
    evaluation_strategy="epoch",
    save_strategy="epoch",  # must match the evaluation strategy for load_best_model_at_end
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
)

In [31]:
def metrics(predictions, labels, threshold = 0.5):
    """Compute micro-averaged multi-label metrics from raw model outputs.

    Args:
        predictions: Raw logits, array-like of shape (n_samples, n_labels).
            A tuple is also accepted, in which case its first element is used.
        labels: Ground-truth multi-hot matrix with the same shape.
        threshold: Probability at or above which a label counts as predicted.

    Returns:
        dict with keys "f1" (micro-averaged F1 on the thresholded
        predictions), "roc_auc" (micro-averaged ROC AUC on the sigmoid
        probabilities) and "acc" (accuracy_score on the thresholded
        predictions).
    """
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # Logits -> independent per-label probabilities.
    probs = torch.sigmoid(torch.as_tensor(predictions, dtype=torch.float32)).numpy()
    # Binarise for the metrics that need hard decisions.
    y_pred = (probs >= threshold).astype(float)
    y_true = labels

    scores = {
        "f1": f1_score(y_true = y_true, y_pred = y_pred, average = "micro"),
        # ROC AUC is a ranking metric: it must see the continuous scores,
        # not the 0/1 decisions, otherwise the curve collapses to one point.
        "roc_auc": roc_auc_score(y_true, probs, average = "micro"),
        "acc": accuracy_score(y_true, y_pred),
    }
    return scores