A sequence classification approach - finetuned

In [None]:
import transformers
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import pandas as pd
import torch

initialisation

In [None]:
device = torch.cuda.current_device() if torch.cuda.is_available() else "cpu" # This line checks if a GPU is available and sets the device to GPU (e.g., cuda:0) or CPU.
#device = torch.device("cpu")
print(device)


# Initialise the model and tokenizer to a pre-trained model. Suggestions: facebook/opt-350m, bigscience/bloom-560m

# model is made to classify sentence into 1/3 categories -> wa is a seq classification task
model = AutoModelForSequenceClassification.from_pretrained("allenai/longformer-base-4096", num_labels=3)
tokenizer = AutoTokenizer.from_pretrained("allenai/longformer-base-4096")
model = model.to(device)

cpu


Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Data processing

In [None]:
from datasets import Dataset

fp = "../DATASETS/work_arrangements_development_set.csv"
df = pd.read_csv(fp)
df.drop("id", axis=1, inplace=True) #get rid of id column
df.rename(columns={"job_ad": "text"}, inplace=True)

#https://github.com/huggingface/notebooks/blob/main/examples/text_classification-tf.ipynb -> credit
id2label = {
    0:'Hybrid',
    1:'OnSite',
    2:'Remote'
            }
label2id = {val: key for key, val in id2label.items()}

df['labels'] = df['y_true'].map(label2id)    #convert label to enum
print(df)

                                                 text  y_true  labels
0   Job title: CEO\nAbstract: Exciting opportunity...  Remote       2
1   Job title: Home-Based Online ESL Teacher (Onli...  Remote       2
2   Job title: Safeguarding, De La Salle\nAbstract...  Hybrid       0
3   Job title: Delivery Driver\nAbstract: Pickup t...  OnSite       1
4   Job title: Store Supervisor\nAbstract: We are ...  OnSite       1
..                                                ...     ...     ...
94  Job title: Senior Pipeline Technical Director\...  Hybrid       0
95  Job title: Customer Support Administrator\nAbs...  OnSite       1
96  Job title: Remote Writing Evaluator for AI (As...  Remote       2
97  Job title: People & Culture Advisor\nAbstract:...  Hybrid       0
98  Job title: Draftsperson\nAbstract: Residential...  Hybrid       0

[99 rows x 3 columns]


tokenisation

In [28]:
#tokenised
data = Dataset.from_pandas(df)

data = data.map(lambda samples: tokenizer(samples["text"], truncation=True, padding="max_length", max_length=1024), batched=True)
data.set_format("torch")    #conversion to pyTorch tensors

Map:   0%|          | 0/99 [00:00<?, ? examples/s]

finetuning

In [29]:
import torch

#GPU sanity check

print(torch.__version__)
print("CUDA Available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("Device:", torch.cuda.get_device_name(0))
    print("Memory allocated:", torch.cuda.memory_allocated())

2.6.0
CUDA Available: False


In [30]:
import os

training_args = transformers.TrainingArguments(
    output_dir="./results",
    num_train_epochs=4,
    per_device_train_batch_size=4,  #keep at 1, otherwise crash
    logging_dir="./logs",
    report_to="none", 
    logging_strategy="steps",
    logging_steps=10,
    logging_first_step=True, 
    disable_tqdm=False, 
)

# === 6. Trainer and training ===
trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=data,
    tokenizer=tokenizer
)

os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

trainer.train()

  trainer = transformers.Trainer(


Step,Training Loss
1,1.1831
10,1.0804
20,1.0778
30,0.9914
40,0.8339
50,0.8607
60,0.5318
70,0.5678
80,0.5251
90,0.2907


TrainOutput(global_step=100, training_loss=0.7098444986343384, metrics={'train_runtime': 3415.7337, 'train_samples_per_second': 0.116, 'train_steps_per_second': 0.029, 'total_flos': 260115109208064.0, 'train_loss': 0.7098444986343384, 'epoch': 4.0})

Evaluation

In [None]:
#get test data

fp = "../DATASETS/work_arrangements_test_set.csv"
df = pd.read_csv(fp)
df.drop("id", axis=1, inplace=True) #get rid of id column
df.rename(columns={"job_ad": "text"}, inplace=True)


data = Dataset.from_pandas(df)
data = data.map(lambda samples: tokenizer(samples["text"], truncation=True, padding="max_length", max_length=512), batched=True)
data.set_format("torch")    #conversion to pyTorch tensors

print(data)


Map:   0%|          | 0/99 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'y_true', 'input_ids', 'attention_mask'],
    num_rows: 99
})


In [None]:
# Set the model in evaluation mode
model.eval()

correct = 0
predictions = []

with torch.no_grad():
    for i in range(len(data)):
        sample = data[i]

        input_ids = sample["input_ids"].unsqueeze(0).to(device)
        attention_mask = sample["attention_mask"].unsqueeze(0).to(device)
        outputs = model(input_ids=input_ids,
                        attention_mask=attention_mask)
        
        logits = outputs.logits
        #print(logits)
        pred = torch.argmax(logits, dim=1)  #returns tensor
        predicted_label = id2label[pred.item()]
        predictions.append(predicted_label)

        if predicted_label == sample['y_true']:
            correct+=1

print(f"Accuracy = {correct/len(data)}")


CORRECT: prediction == label
tensor([[-1.7949,  3.1180, -1.3399]])
CORRECT: OnSite == OnSite
tensor([[-1.8689,  3.0330, -1.1010]])
CORRECT: OnSite == OnSite
tensor([[ 1.2589, -1.5104,  1.1052]])
INCORRECT: Hybrid == Remote
tensor([[ 1.4791, -1.0479,  0.5959]])
CORRECT: Hybrid == Hybrid
tensor([[ 1.7898, -1.4003,  0.7262]])
INCORRECT: Hybrid == Remote
tensor([[-1.8158,  3.0434, -1.2447]])
CORRECT: OnSite == OnSite
tensor([[ 2.3689, -1.7348,  0.1627]])
CORRECT: Hybrid == Hybrid
tensor([[ 0.5900, -1.2542,  1.4120]])
CORRECT: Remote == Remote
tensor([[ 0.1287, -1.1434,  1.5993]])
INCORRECT: Remote == Hybrid
tensor([[-1.8989,  3.0042, -1.2466]])
INCORRECT: OnSite == Remote
tensor([[ 1.1834, -1.6790,  1.3833]])
INCORRECT: Remote == Hybrid
tensor([[ 2.1870, -1.5552,  0.3940]])
INCORRECT: Hybrid == OnSite
tensor([[-1.9725,  2.6050, -0.6393]])
INCORRECT: OnSite == Remote
tensor([[ 1.8091, -1.5738,  0.8714]])
INCORRECT: Hybrid == OnSite
tensor([[ 1.8864, -1.3016,  0.4077]])
INCORRECT: Hybrid == 

In [None]:
from eval import *

p = precision(["Remote", "Hybrid", "OnSite"], data['y_true'], predictions)
print(p)
r = recall(["Remote", "Hybrid", "OnSite"], data['y_true'], predictions)
print(r)