A sequence classification approach - finetuned

In [1]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
!pip install transformers
!pip install datasets
!pip install pandas
!pip install 'accelerate>=0.26.0'

Looking in indexes: https://download.pytorch.org/whl/cu121


In [2]:
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
import transformers
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import pandas as pd
import torch

  from .autonotebook import tqdm as notebook_tqdm


initialisation

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # This line checks if a GPU is available and sets the device to GPU (e.g., cuda:0) or CPU.
#device = torch.device("cpu")
print(device)


# Initialise the model and tokenizer to a pre-trained model. Suggestions: facebook/opt-350m, bigscience/bloom-560m

# model is made to classify sentence into 1/3 categories -> wa is a seq classification task
model = AutoModelForSequenceClassification.from_pretrained("allenai/longformer-base-4096", num_labels=3)
tokenizer = AutoTokenizer.from_pretrained("allenai/longformer-base-4096")
model = model.to(device)

cuda


Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Data processing

In [None]:
from datasets import Dataset

fp = "../../MISC/work_arrangements_development_set.csv"
df = pd.read_csv(fp)
df.drop("id", axis=1, inplace=True) #get rid of id column
df.rename(columns={"job_ad": "text"}, inplace=True)

#https://github.com/huggingface/notebooks/blob/main/examples/text_classification-tf.ipynb -> credit
id2label = {
    0:'Hybrid',
    1:'OnSite',
    2:'Remote'
            }
label2id = {val: key for key, val in id2label.items()}

df['labels'] = df['y_true'].map(label2id)    #convert label to enum
print(df)

                                                 text  y_true  labels
0   Job title: CEO\nAbstract: Exciting opportunity...  Remote       2
1   Job title: Home-Based Online ESL Teacher (Onli...  Remote       2
2   Job title: Safeguarding, De La Salle\nAbstract...  Hybrid       0
3   Job title: Delivery Driver\nAbstract: Pickup t...  OnSite       1
4   Job title: Store Supervisor\nAbstract: We are ...  OnSite       1
..                                                ...     ...     ...
94  Job title: Senior Pipeline Technical Director\...  Hybrid       0
95  Job title: Customer Support Administrator\nAbs...  OnSite       1
96  Job title: Remote Writing Evaluator for AI (As...  Remote       2
97  Job title: People & Culture Advisor\nAbstract:...  Hybrid       0
98  Job title: Draftsperson\nAbstract: Residential...  Hybrid       0

[99 rows x 3 columns]


tokenisation

In [5]:
#tokenised
data = Dataset.from_pandas(df)

data = data.map(lambda samples: tokenizer(samples["text"], truncation=True, padding="max_length", max_length=1024), batched=True)
data.set_format("torch")    #conversion to pyTorch tensors

Map: 100%|██████████| 99/99 [00:00<00:00, 1217.82 examples/s]


finetuning

In [6]:
import torch

#GPU sanity check

print(torch.__version__)
print("CUDA Available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("Device:", torch.cuda.get_device_name(0))
    print("Memory allocated:", torch.cuda.memory_allocated())

2.5.1+cu121
CUDA Available: True
Device: NVIDIA GeForce GTX 1080
Memory allocated: 595422720


In [7]:


training_args = transformers.TrainingArguments(
    output_dir="./results",
    num_train_epochs=4,
    per_device_train_batch_size=3,  #keep at 1, otherwise crash
    logging_dir="./logs",
    report_to="none", 
    logging_strategy="steps",
    logging_steps=10,
    logging_first_step=True, 
    disable_tqdm=False, 
)

# === 6. Trainer and training ===
trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=data,
    tokenizer=tokenizer
)



trainer.train()

  trainer = transformers.Trainer(
Initializing global attention on CLS token...


Step,Training Loss
1,1.0935
10,1.1142
20,1.0752
30,1.0292
40,0.9999
50,1.0789
60,1.025
70,1.1407
80,0.8131
90,0.6821


TrainOutput(global_step=132, training_loss=0.9015891759684591, metrics={'train_runtime': 203.3871, 'train_samples_per_second': 1.947, 'train_steps_per_second': 0.649, 'total_flos': 260115109208064.0, 'train_loss': 0.9015891759684591, 'epoch': 4.0})

Evaluation

In [None]:
#get test data

fp = "../../MISC/work_arrangements_test_set.csv"
df = pd.read_csv(fp)
df.drop("id", axis=1, inplace=True) #get rid of id column
df.rename(columns={"job_ad": "text"}, inplace=True)


data = Dataset.from_pandas(df)
data = data.map(lambda samples: tokenizer(samples["text"], truncation=True, padding="max_length", max_length=512), batched=True)
data.set_format("torch")    #conversion to pyTorch tensors

print(data)


Map: 100%|██████████| 99/99 [00:00<00:00, 1832.20 examples/s]

Dataset({
    features: ['text', 'y_true', 'input_ids', 'attention_mask'],
    num_rows: 99
})





In [9]:
# Set the model in evaluation mode
model.eval()

correct = 0
predictions = []

with torch.no_grad():
    for i in range(len(data)):
        sample = data[i]

        input_ids = sample["input_ids"].unsqueeze(0).to(device)
        attention_mask = sample["attention_mask"].unsqueeze(0).to(device)
        outputs = model(input_ids=input_ids,
                        attention_mask=attention_mask)
        
        logits = outputs.logits
        #print(logits)
        pred = torch.argmax(logits, dim=1)  #returns tensor
        predicted_label = id2label[pred.item()]
        predictions.append(predicted_label)

        if predicted_label == sample['y_true']:
            correct+=1

print(f"Accuracy = {correct/len(data)}")


Accuracy = 0.5656565656565656


In [10]:
from eval import *

p = precision(["Remote", "Hybrid", "OnSite"], data['y_true'], predictions)
print(p)
r = recall(["Remote", "Hybrid", "OnSite"], data['y_true'], predictions)
print(r)

{'Remote': 0.43902439024390244, 'Hybrid': 0.5714285714285714, 'OnSite': 0.6666666666666666, 'AVERAGE': 0.5590398761130468}
{'Remote': 0.6923076923076923, 'Hybrid': 0.14814814814814814, 'OnSite': 0.7391304347826086, 'AVERAGE': 0.5265287584128163}
