ft via lora

In [None]:
# Source: https://github.com/huggingface/peft/blob/main/examples/int8_training/Finetune_opt_bnb_peft.ipynb

from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
import pandas as pd
import torch
from datasets import Dataset
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType

In [19]:
device = torch.cuda.current_device() if torch.cuda.is_available() else "cpu" # This line checks if a GPU is available and sets the device to GPU (e.g., cuda:0) or CPU.
#device = "cpu"
print(device)

# Initialise the model and tokenizer to a pre-trained model. Suggestions: facebook/opt-350m, bigscience/bloom-560m
model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m").to(device)
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")

GPU safety checks

In [20]:
import os

os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

print(torch.__version__)
print("CUDA Available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("Device:", torch.cuda.get_device_name(0))
    print("Memory allocated:", torch.cuda.memory_allocated())

2.6.0
CUDA Available: False


Lora Training

In [21]:
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1
)

model = get_peft_model(model, peft_config)

#checks
model.print_trainable_parameters()
type(model)

trainable params: 786,432 || all params: 331,982,848 || trainable%: 0.2369


peft.peft_model.PeftModelForCausalLM

Training Data

In [22]:
import pandas as pd

fp = "/Users/eddiezhang/NLP_project-2/DATASETS/work_arrangements_development_set.csv"
df = pd.read_csv(fp)
df.drop("id", axis=1, inplace=True) #get rid of id column
df.rename(columns={"job_ad": "text"}, inplace=True)

data = Dataset.from_pandas(df)

print(df)

                                                 text  y_true
0   Job title: CEO\nAbstract: Exciting opportunity...  Remote
1   Job title: Home-Based Online ESL Teacher (Onli...  Remote
2   Job title: Safeguarding, De La Salle\nAbstract...  Hybrid
3   Job title: Delivery Driver\nAbstract: Pickup t...  OnSite
4   Job title: Store Supervisor\nAbstract: We are ...  OnSite
..                                                ...     ...
94  Job title: Senior Pipeline Technical Director\...  Hybrid
95  Job title: Customer Support Administrator\nAbs...  OnSite
96  Job title: Remote Writing Evaluator for AI (As...  Remote
97  Job title: People & Culture Advisor\nAbstract:...  Hybrid
98  Job title: Draftsperson\nAbstract: Residential...  Hybrid

[99 rows x 2 columns]


Training

In [23]:
import transformers

def train_prompt_format(example):
    label = example['y_true']
    return f"{example['text']}\nWhat is the work arrangement of this job ad? You must return either Onsite, Remote or Hybrid\n Label: {label}"

data = data.map(lambda samples: {"prompt": train_prompt_format(samples)}, batched=False,remove_columns=["text", "y_true"])   #remove unneeded labels(both saved into prompt)   #remove unneeded labels(both saved into prompt)
data = data.map(lambda samples: tokenizer(samples['prompt']), batched=True)

Map:   0%|          | 0/99 [00:00<?, ? examples/s]

Map:   0%|          | 0/99 [00:00<?, ? examples/s]

In [24]:


trainer = transformers.Trainer(
    model=model,
    train_dataset=data,
    args=transformers.TrainingArguments(
        max_steps=2,
        learning_rate=2e-5,
        gradient_accumulation_steps=4,
        per_device_train_batch_size=4,
        output_dir="./outputs",
        logging_steps=50,           # Log every 50 steps to monitor progress
        disable_tqdm=False,         # Enable progress bar (optional)
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

trainer.train()

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss


IndexError: index out of range in self

eval

In [None]:
#get test data

fp = "../DATASETS/work_arrangements_test_set.csv"
df = pd.read_csv(fp)
df.drop("id", axis=1, inplace=True) #get rid of id column
df.rename(columns={"job_ad": "text"}, inplace=True)

data = Dataset.from_pandas(df)
#data = data.map(lambda samples: tokenizer(samples["text"], truncation=True, padding="max_length", max_length=512), batched=True)
#data.set_format("torch")    #conversion to pyTorch tensors

print(data)
print(data[0])

In [None]:
def prompt_format(example):
    prompt = f"{example['text']}\nWhat is the work arrangement of this job ad? You must return either Onsite, Remote or Hybrid\n Label:"
    return prompt

In [None]:
# Set the model in evaluation mode
model.eval()

valid = 0
correct = 0
answers = ["Onsite", "Remote", "Hybrid"]

with torch.no_grad():
    for i in range(len(data)):
    #for i in range(10):
        sample = prompt_format(data[i])
        input = tokenizer(sample, return_tensors="pt").to(device)
        output_tokens = model.generate(**input, do_sample=False, num_beams=5, no_repeat_ngram_size=2, early_stopping=True, max_new_tokens=10)

        out_text = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
        out_text = out_text[len(sample):]   #raw new output
        #print(out_text)

        if out_text[0] in answers:
            valid += 1
            if out_text[0] == sample['y_true']:
                correct += 1

print(f"Valid: {valid} / {len(data)} == {valid / len(data)}")
print(f"correct(Accuracy): {correct} / {len(data)} == {correct / len(data)}")
print(f"correct/valid: {correct / valid}")