Heaviy adapted from Week5 tutorial

In [1]:
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from transformers import LongformerTokenizer, LongformerForSequenceClassification
import torch
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType

In [2]:
device = torch.cuda.current_device() if torch.cuda.is_available() else "cpu" # This line checks if a GPU is available and sets the device to GPU (e.g., cuda:0) or CPU.
print(device)

# Initialise the model and tokenizer to a pre-trained model. Suggestions: facebook/opt-350m, bigscience/bloom-560m

# model is made to classify sentence into 1/3 categories -> wa is a seq classification task
model = AutoModelForSequenceClassification.from_pretrained("allenai/longformer-base-4096", num_labels=2)
tokenizer = AutoTokenizer.from_pretrained("allenai/longformer-base-4096")

#model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m").to(device)
#tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")

cpu


Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Finetuning

In [3]:
#Getting PEFT model and it's tasktype

peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS, 
    inference_mode=False, 
    r=8, 
    lora_alpha=32, 
    lora_dropout=0.1,
    target_modules=["query", "value"]   #??
)

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()  #fine tuning is efficient
print(type(model))

trainable params: 887,042 || all params: 149,548,036 || trainable%: 0.5931
<class 'peft.peft_model.PeftModelForSequenceClassification'>


In [4]:
!pip install scikit-learn

Defaulting to user installation because normal site-packages is not writeable


In [5]:
import transformers
import pandas as pd
from datasets import Dataset
from sklearn.preprocessing import LabelEncoder



fp = "../DATASETS/work_arrangements_development_set.csv"
df = pd.read_csv(fp)
df.drop("id", axis=1, inplace=True) #get rid of id column

le = LabelEncoder() # more efficient/relevant enum

df["label"] = le.fit_transform(df["y_true"])
df.rename(columns={"job_ad": "text"}, inplace=True)

label_map = dict(zip(le.classes_, le.transform(le.classes_)))
print(label_map)

print(df)



{'Hybrid': 0, 'OnSite': 1, 'Remote': 2}
                                                 text  y_true  label
0   Job title: CEO\nAbstract: Exciting opportunity...  Remote      2
1   Job title: Home-Based Online ESL Teacher (Onli...  Remote      2
2   Job title: Safeguarding, De La Salle\nAbstract...  Hybrid      0
3   Job title: Delivery Driver\nAbstract: Pickup t...  OnSite      1
4   Job title: Store Supervisor\nAbstract: We are ...  OnSite      1
..                                                ...     ...    ...
94  Job title: Senior Pipeline Technical Director\...  Hybrid      0
95  Job title: Customer Support Administrator\nAbs...  OnSite      1
96  Job title: Remote Writing Evaluator for AI (As...  Remote      2
97  Job title: People & Culture Advisor\nAbstract:...  Hybrid      0
98  Job title: Draftsperson\nAbstract: Residential...  Hybrid      0

[99 rows x 3 columns]


In [6]:
#tokenised
data = Dataset.from_pandas(df)

data = data.map(lambda samples: tokenizer(samples["text"]), batched=True)
data.set_format("torch")

Map:   0%|          | 0/99 [00:00<?, ? examples/s]

In [None]:
training_args = transformers.TrainingArguments(
    output_dir="./results",
    num_train_epochs=4,
    per_device_train_batch_size=8,
    logging_dir="./logs",
    evaluation_strategy="no",  # no evaluation since test set is external
    report_to="none"
)

# === 6. Trainer and training ===
trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=data,
    tokenizer=tokenizer
)

trainer.train()

  trainer = transformers.Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
Initializing global attention on CLS token...
Input ids are automatically padded to be a multiple of `config.attention_window`: 512


Eval