In [None]:
! pip install --upgrade transformers==4.50.1 trl
! pip install 'accelerate>=0.26.0'

In [None]:
import copy
import json
import os
import pathlib
import warnings

from datasets import Dataset, load_dataset
import pandas as pd
from peft import LoraConfig, get_peft_model
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import SFTConfig, SFTTrainer
from tqdm import tqdm
from typing import Optional, Callable
import wandb

# Create Dataset

In [9]:
data = pd.read_csv('data/seniority_labelled_development_set_cleaned.csv')

In [10]:
data

Unnamed: 0,job_id,job_title,job_summary,job_ad_details,classification_name,subclassification_name,y_true
0,30765949,Electrical BIM modeller - Contract role - Sydney,Multinational consultancy seeks Electrical BIM...,About the company Multinational consultancy...,Engineering,Engineering Drafting,intermediate
1,38536238,Marketing Manager - Tourism,A leading global lifestyle & entertainment gro...,A true global leader in the lifestyle and ente...,Marketing & Communications,Event Management,senior
2,29958739,Quality Coordinator Registered Nurse - Aged care,Quality Systems/Process-Driven Registered Nurse,We are seeking to appoint a Quality System...,Healthcare & Medical,Nursing - Aged Care,lead
3,33276107,National Lean and Quality Manager,Highly influential change and quality leadersh...,Champion Real Change Strong Management Commi...,"Manufacturing, Transport & Logistics","Warehousing, Storage & Distribution",head
4,31399187,Experienced Registered Nurses: casual positions,Experienced Medical/Surgical Nurses required f...,Princess Alexandra Hospital is currently emplo...,Healthcare & Medical,Nursing - General Medical & Surgical,experienced
...,...,...,...,...,...,...,...
2747,35851466,Truck Driver - Pneumatic Tanker,Truck Drivers required for tanker work out of ...,Qube is Australia’s largest integrated provide...,"Manufacturing, Transport & Logistics",Road Transport,intermediate
2748,32610438,Senior UI Designer | Front End Developer,Tired of contracting and having to look for a ...,I’m on the hunt for a super star UI Designer a...,Information & Communication Technology,Developers/Programmers,senior
2749,35925277,Virtual Pharmaceutical Sales Representative,An exciting pharmaceutical telesales position....,Location: North Sydney Hours: Monday to Friday...,Call Centre & Customer Service,Sales - Outbound,entry-level
2750,40117762,Electrical Design and Draftsperson,We have an opportunity that has become availab...,Electrical Design and Draftsperson (Bunbury an...,Engineering,Electrical/Electronic Engineering,experienced


In [11]:
dataset = []

for i in tqdm(range(data.shape[0])):
    desc = {
        "job_title": data.iloc[i].job_title,
        "job_summary": data.iloc[i].job_summary,
        "job_ad_details": data.iloc[i].job_ad_details,
        "classification_name": data.iloc[i].classification_name,
        "subclassification_name": data.iloc[i].subclassification_name,
    }
    desc_str = str(desc)
    
    label = f'{{"seniority_label": "{data.iloc[i].y_true}"}}.'
    
    messages = []
    messages.append(
        {
            'role': 'system',
            'content': 'You are an expert job ad annotator. Your task is to infer the seniority information from job descriptions. The seniority label may be present in the set: [intermediate, senior, lead, head, experienced, entry-level, executive, assistant, senior/lead, deputy, director, trainee, associate, graduate, junior, general-manager, coordinator, student, chief, principal, apprentice, qualified, entry-level to intermediate, senior associate, standard, senior assistant, specialist, mid-level, entry level assistant, experienced assistant, manager, graduate/junior, independent, 1st year apprentice, senior-executive, junior assistant, assistant manager, supervisor, second-in-command, associate director, board, 4th year apprentice, mid-senior, regional head, middle-management, advanced, 2nd year apprentice, intermediate apprentice, level 2, assistant head, owner, post-doctoral, owner-operator, middle management, senior head, assistant director, junior-intermediate, sous, intermediate to senior, senior executive] . If not present in the set, then create a label.'
        }
    )
    messages.append(
        {
            'role': 'user',
            'content': (
                f"{desc_str} Extract seniority label from this job description. The seniority label may be present in the set: [intermediate, senior, lead, head, experienced, entry-level, executive, assistant, senior/lead, deputy, director, trainee, associate, graduate, junior, general-manager, coordinator, student, chief, principal, apprentice, qualified, entry-level to intermediate, senior associate, standard, senior assistant, specialist, mid-level, entry level assistant, experienced assistant, manager, graduate/junior, independent, 1st year apprentice, senior-executive, junior assistant, assistant manager, supervisor, second-in-command, associate director, board, 4th year apprentice, mid-senior, regional head, middle-management, advanced, 2nd year apprentice, intermediate apprentice, level 2, assistant head, owner, post-doctoral, owner-operator, middle management, senior head, assistant director, junior-intermediate, sous, intermediate to senior, senior executive] . If not present in the set, then create a label. "
                "Respond in JSON: {\"seniority_label\": \"\"}."
            )
        }
    )
    messages.append(
        {
            'role': 'assistant',
            'content': label
        }
    )
    
    dataset.append({'messages': messages})

100%|██████████| 2752/2752 [00:01<00:00, 1601.38it/s]


In [12]:
dataset[0]

{'messages': [{'role': 'system',
   'content': 'You are an expert job ad annotator. Your task is to infer the seniority information from job descriptions. The seniority label may be present in the set: [intermediate, senior, lead, head, experienced, entry-level, executive, assistant, senior/lead, deputy, director, trainee, associate, graduate, junior, general-manager, coordinator, student, chief, principal, apprentice, qualified, entry-level to intermediate, senior associate, standard, senior assistant, specialist, mid-level, entry level assistant, experienced assistant, manager, graduate/junior, independent, 1st year apprentice, senior-executive, junior assistant, assistant manager, supervisor, second-in-command, associate director, board, 4th year apprentice, mid-senior, regional head, middle-management, advanced, 2nd year apprentice, intermediate apprentice, level 2, assistant head, owner, post-doctoral, owner-operator, middle management, senior head, assistant director, junior-inte

In [13]:
with open('seniority_dataset.json', 'w') as f:
    json.dump(dataset, f)

# Finetune Gemma 3 1B

In [3]:
dataset = load_dataset("json", data_files="seniority_dataset.json")

Generating train split: 0 examples [00:00, ? examples/s]

In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['messages'],
        num_rows: 2752
    })
})

In [5]:
model_name = 'google/gemma-3-1b-it'

In [6]:
# Load model and tokenizer.
os.environ['HF_TOKEN'] = 'hf_...' # put your HF access token here if needed
# it is adviced to train Gemma3 with eager attention
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", attn_implementation="eager")
tokenizer = AutoTokenizer.from_pretrained(
   model_name, padding=True, truncation=True
)

config.json:   0%|          | 0.00/899 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.00G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

In [7]:
wandb.login()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [8]:
seed = 123

num_train_epochs = 5
max_steps = -1
bf16 = False
output_dir = 'finetune_gemma3_results'
run_name = f"{model_name.split('/')[-1]}-seniority"
output_dir_final = os.path.join(output_dir, run_name)
pathlib.Path(output_dir_final).mkdir(parents=True, exist_ok=True)

# Adjust tokenizer settings as warned by the trainer
tokenizer.padding_side = 'right'

print("Creating trainer...")
training_args = SFTConfig(
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    gradient_checkpointing=True,
    bf16=bf16,
    tf32=False, # use tf32 for faster training on Ampere GPUs or newer.
    dataloader_pin_memory=False,
    torch_compile=False,
    warmup_steps=50,
    max_steps=max_steps,
    learning_rate=1e-4,
    lr_scheduler_type="cosine",
    weight_decay=0.01,
    logging_strategy="steps",
    save_strategy="steps",
    save_steps=1500,
    save_total_limit=10,
    logging_steps=50,
    output_dir=output_dir_final,
    optim="paged_adamw_8bit",
    remove_unused_columns=True,
    seed=seed,
    run_name=run_name,
    report_to="wandb",
    push_to_hub=False,
)

trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    processing_class=tokenizer,
)

print("Training...")
trainer.train()

Creating trainer...


Converting train dataset to ChatML:   0%|          | 0/2752 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/2752 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/2752 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/2752 [00:00<?, ? examples/s]

[2025-04-22 23:41:38,848] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)


[34m[1mwandb[0m: Currently logged in as: [33mhuwarr[0m. Use [1m`wandb login --relogin`[0m to force relogin


Training...


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
50,1.5759
100,1.476
150,1.4196
200,1.4314
250,1.4098
300,1.3582
350,1.3121
400,1.0173
450,1.0011
500,1.0559


TrainOutput(global_step=1720, training_loss=0.6465190015906511, metrics={'train_runtime': 14057.1108, 'train_samples_per_second': 0.979, 'train_steps_per_second': 0.122, 'total_flos': 5.180814681598464e+16, 'train_loss': 0.6465190015906511})

Error in callback <bound method _WandbInit._pause_backend of <wandb.sdk.wandb_init._WandbInit object at 0x7fa95d3d3f90>> (for post_run_cell), with arguments args (<ExecutionResult object at 7faad1e79a10, execution_count=8 error_before_exec=None error_in_exec=None info=<ExecutionInfo object at 7faab87cff90, raw_cell="seed = 123

num_train_epochs = 5
max_steps = -1
bf.." store_history=True silent=False shell_futures=True cell_id=7b8e63c6-b5e3-4e36-a372-2a0ce14b75fa> result=TrainOutput(global_step=1720, training_loss=0.6465190015906511, metrics={'train_runtime': 14057.1108, 'train_samples_per_second': 0.979, 'train_steps_per_second': 0.122, 'total_flos': 5.180814681598464e+16, 'train_loss': 0.6465190015906511})>,),kwargs {}:


TypeError: _WandbInit._pause_backend() takes 1 positional argument but 2 were given

In [9]:
wandb.finish()

Error in callback <bound method _WandbInit._resume_backend of <wandb.sdk.wandb_init._WandbInit object at 0x7fa95d3d3f90>> (for pre_run_cell), with arguments args (<ExecutionInfo object at 7fa936f4b950, raw_cell="wandb.finish()" store_history=True silent=False shell_futures=True cell_id=4e3cb0f3-2e57-4c1e-b94b-5c95d7641424>,),kwargs {}:


TypeError: _WandbInit._resume_backend() takes 1 positional argument but 2 were given

0,1
train/epoch,▁▁▁▂▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇███
train/grad_norm,█▅▅▄▄▄▄▄▃▄▄▃▄▅▄▄▅▄▄▄▅▄▄▄▄▄▄▁▂▂▂▁▂▂
train/learning_rate,██████▇▇▇▇▇▆▆▆▅▅▅▄▄▄▃▃▃▃▂▂▂▂▁▁▁▁▁▁
train/loss,██▇▇▇▇▇▅▅▆▆▆▆▅▃▃▃▃▄▃▃▂▂▂▂▂▂▁▁▁▁▁▁▁
train/mean_token_accuracy,▁▁▂▂▂▂▂▃▃▃▃▃▃▃▅▅▅▅▄▅▅▇▇▇▇▇▇████████
train/num_tokens,▁▁▁▂▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇███

0,1
total_flos,5.180814681598464e+16
train/epoch,5.0
train/global_step,1720.0
train/grad_norm,0.87303
train/learning_rate,0.0
train/loss,0.0316
train/mean_token_accuracy,0.99225
train/num_tokens,12372460.0
train_loss,0.64652
train_runtime,14057.1108


# Inference

In [2]:
max_new_tokens = 256
model_name = 'google/gemma-3-1b-it'
checkpoint_path = 'finetune_gemma3_results/gemma-3-1b-it-seniority/checkpoint-1720/'

In [3]:
os.environ['HF_TOKEN'] = 'hf_...' # put your HF access token here if needed
model = AutoModelForCausalLM.from_pretrained(checkpoint_path, device_map="cuda:0", attn_implementation="eager")
tokenizer = AutoTokenizer.from_pretrained(
   model_name, padding=True, truncation=True
)

In [4]:
os.environ['HF_TOKEN'] = 'hf_...'
model.push_to_hub('gemma_seniority')

[2025-04-27 14:08:06,983] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)


model.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/huwar/gemma_seniority/commit/4d01a530a22410afeefacfb67cb95a10b10abd17', commit_message='Upload Gemma3ForCausalLM', commit_description='', oid='4d01a530a22410afeefacfb67cb95a10b10abd17', pr_url=None, repo_url=RepoUrl('https://huggingface.co/huwar/gemma_seniority', endpoint='https://huggingface.co', repo_type='model', repo_id='huwar/gemma_seniority'), pr_revision=None, pr_num=None)

In [5]:
tokenizer.push_to_hub('gemma_seniority')

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/huwar/gemma_seniority/commit/c791d9594730764e2509f076225ad2b1991755a9', commit_message='Upload tokenizer', commit_description='', oid='c791d9594730764e2509f076225ad2b1991755a9', pr_url=None, repo_url=RepoUrl('https://huggingface.co/huwar/gemma_seniority', endpoint='https://huggingface.co', repo_type='model', repo_id='huwar/gemma_seniority'), pr_revision=None, pr_num=None)

In [12]:
test_df = pd.read_csv('data/seniority_labelled_test_set_cleaned.csv')

Example:

In [13]:
messages_static = [
    {"role": "system", "content": "You are an expert job ad annotator. Your task is to infer the seniority information from job descriptions. The seniority label may be present in the set: [intermediate, senior, lead, head, experienced, entry-level, executive, assistant, senior/lead, deputy, director, trainee, associate, graduate, junior, general-manager, coordinator, student, chief, principal, apprentice, qualified, entry-level to intermediate, senior associate, standard, senior assistant, specialist, mid-level, entry level assistant, experienced assistant, manager, graduate/junior, independent, 1st year apprentice, senior-executive, junior assistant, assistant manager, supervisor, second-in-command, associate director, board, 4th year apprentice, mid-senior, regional head, middle-management, advanced, 2nd year apprentice, intermediate apprentice, level 2, assistant head, owner, post-doctoral, owner-operator, middle management, senior head, assistant director, junior-intermediate, sous, intermediate to senior, senior executive] . If not present in the set, then create a label."},
]

In [14]:
i = 0

desc = {
    "job_title": test_df.iloc[i].job_title,
    "job_summary": test_df.iloc[i].job_summary,
    "job_ad_details": test_df.iloc[i].job_ad_details,
    "classification_name": test_df.iloc[i].classification_name,
    "subclassification_name": test_df.iloc[i].subclassification_name,
}
desc_str = str(desc)

messages = copy.deepcopy(messages_static)
messages.append({
    "role": "user",
    "content": (
        f"{desc_str} Extract seniority label from this job description. The seniority label may be present in the set: [intermediate, senior, lead, head, experienced, entry-level, executive, assistant, senior/lead, deputy, director, trainee, associate, graduate, junior, general-manager, coordinator, student, chief, principal, apprentice, qualified, entry-level to intermediate, senior associate, standard, senior assistant, specialist, mid-level, entry level assistant, experienced assistant, manager, graduate/junior, independent, 1st year apprentice, senior-executive, junior assistant, assistant manager, supervisor, second-in-command, associate director, board, 4th year apprentice, mid-senior, regional head, middle-management, advanced, 2nd year apprentice, intermediate apprentice, level 2, assistant head, owner, post-doctoral, owner-operator, middle management, senior head, assistant director, junior-intermediate, sous, intermediate to senior, senior executive] . If not present in the set, then create a label. "
        "Respond in JSON: {\"seniority_label\": \"\"}."
    )
})

In [15]:
messages

[{'role': 'system',
  'content': 'You are an expert job ad annotator. Your task is to infer the seniority information from job descriptions. The seniority label may be present in the set: [intermediate, senior, lead, head, experienced, entry-level, executive, assistant, senior/lead, deputy, director, trainee, associate, graduate, junior, general-manager, coordinator, student, chief, principal, apprentice, qualified, entry-level to intermediate, senior associate, standard, senior assistant, specialist, mid-level, entry level assistant, experienced assistant, manager, graduate/junior, independent, 1st year apprentice, senior-executive, junior assistant, assistant manager, supervisor, second-in-command, associate director, board, 4th year apprentice, mid-senior, regional head, middle-management, advanced, 2nd year apprentice, intermediate apprentice, level 2, assistant head, owner, post-doctoral, owner-operator, middle management, senior head, assistant director, junior-intermediate, sous

In [16]:
prompt = tokenizer.apply_chat_template(messages, tokenize=False)

In [17]:
prompt

'<bos><start_of_turn>user\nYou are an expert job ad annotator. Your task is to infer the seniority information from job descriptions. The seniority label may be present in the set: [intermediate, senior, lead, head, experienced, entry-level, executive, assistant, senior/lead, deputy, director, trainee, associate, graduate, junior, general-manager, coordinator, student, chief, principal, apprentice, qualified, entry-level to intermediate, senior associate, standard, senior assistant, specialist, mid-level, entry level assistant, experienced assistant, manager, graduate/junior, independent, 1st year apprentice, senior-executive, junior assistant, assistant manager, supervisor, second-in-command, associate director, board, 4th year apprentice, mid-senior, regional head, middle-management, advanced, 2nd year apprentice, intermediate apprentice, level 2, assistant head, owner, post-doctoral, owner-operator, middle management, senior head, assistant director, junior-intermediate, sous, inter

In [18]:
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.inference_mode():
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, pad_token_id=tokenizer.eos_token_id)
outputs = outputs[:, inputs['input_ids'].shape[-1]:]
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

In [19]:
response

'model\n{"seniority_label": "senior"}.'

In [20]:
assistant_token = 'model\n'
response[response.find(assistant_token) + len(assistant_token):]

'{"seniority_label": "senior"}.'

Test set:

In [21]:
# df to store model predictions
test_pred_df = pd.DataFrame(columns=["y_pred"])

In [22]:
messages_static = [
    {"role": "system", "content": "You are an expert job ad annotator. Your task is to infer the seniority information from job descriptions. The seniority label may be present in the set: [intermediate, senior, lead, head, experienced, entry-level, executive, assistant, senior/lead, deputy, director, trainee, associate, graduate, junior, general-manager, coordinator, student, chief, principal, apprentice, qualified, entry-level to intermediate, senior associate, standard, senior assistant, specialist, mid-level, entry level assistant, experienced assistant, manager, graduate/junior, independent, 1st year apprentice, senior-executive, junior assistant, assistant manager, supervisor, second-in-command, associate director, board, 4th year apprentice, mid-senior, regional head, middle-management, advanced, 2nd year apprentice, intermediate apprentice, level 2, assistant head, owner, post-doctoral, owner-operator, middle management, senior head, assistant director, junior-intermediate, sous, intermediate to senior, senior executive] . If not present in the set, then create a label."},
]

In [23]:
for i in tqdm(range(len(test_df)), position=0, leave=True):
    desc = {
        "job_title": test_df.iloc[i].job_title,
        "job_summary": test_df.iloc[i].job_summary,
        "job_ad_details": test_df.iloc[i].job_ad_details,
        "classification_name": test_df.iloc[i].classification_name,
        "subclassification_name": test_df.iloc[i].subclassification_name,
    }
    desc_str = str(desc)

    messages = copy.deepcopy(messages_static)
    messages.append({
        "role": "user",
        "content": (
            f"{desc_str} Extract seniority label from this job description. The seniority label may be present in the set: [intermediate, senior, lead, head, experienced, entry-level, executive, assistant, senior/lead, deputy, director, trainee, associate, graduate, junior, general-manager, coordinator, student, chief, principal, apprentice, qualified, entry-level to intermediate, senior associate, standard, senior assistant, specialist, mid-level, entry level assistant, experienced assistant, manager, graduate/junior, independent, 1st year apprentice, senior-executive, junior assistant, assistant manager, supervisor, second-in-command, associate director, board, 4th year apprentice, mid-senior, regional head, middle-management, advanced, 2nd year apprentice, intermediate apprentice, level 2, assistant head, owner, post-doctoral, owner-operator, middle management, senior head, assistant director, junior-intermediate, sous, intermediate to senior, senior executive] . If not present in the set, then create a label. "
        "Respond in JSON: {\"seniority_label\": \"\"}."
        )
    })
    
    prompt = tokenizer.apply_chat_template(messages, tokenize=False)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.inference_mode():
        outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, pad_token_id=tokenizer.eos_token_id)
    outputs = outputs[:, inputs['input_ids'].shape[-1]:]
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    assistant_token = 'model\n'
    answer_str = response[response.find(assistant_token) + len(assistant_token):]

    # format the output
    try:
        answer_str_ = answer_str[answer_str.find('{'):answer_str.find('}') + 1]
        answer_str_ = answer_str_.replace('“', '"')
        answer_str_ = answer_str_.replace('”', '"')
        answer = json.loads(answer_str_)
        
        label = answer['seniority_label']

    except json.JSONDecodeError:
        print(f"Failed to parse model output as JSON: {answer_str}")
        label = "ERROR " + answer_str

    test_pred_df.loc[len(test_pred_df)] = label

100%|██████████| 689/689 [07:38<00:00,  1.50it/s]


In [24]:
# export the dataframe to a new csv file
test_pred_df.to_csv('seniority_labelled_test_set_gemma3_finetune_preds.csv', index=False)

# Metrics

In [1]:
import json
import string
import pandas as pd
import numpy as np
import ast

In [2]:
def categories(label):
    lab = str(label).lower()
    if 'entry' in lab:
        return 'Entry'
    elif 'junior' in lab or 'assistant' in lab:
        return 'Junior'
    elif 'intermediate' in lab or 'experienced' in lab or 'mid' in lab:
        return 'Mid'
    elif 'senior' in lab or 'lead' in lab:
        return 'Senior'
    elif any(x in lab for x in ['manager','director','chief','head','executive','principal']):
        return 'Leadership'
    else:
        return 'Other'
    
def process(row):
    if "error" not in row:
        row = row.strip().lower()
        if '[' in row and ']' in row:
            row = ast.literal_eval(row[row.find('['):row.find(']')+1])
            row = '/'.join(row)
        return row
    else:
        row = row.strip().lower()
        try:
            row = row[row.find('{'):row.find('}')+1]
            row_data = json.loads(row)
        except Exception:
            row = row[len('error'):]
            row = row.translate(str.maketrans('', '', string.punctuation))
            row = row.strip()
            if len(row.split(' ')) == 1:
                return row
            return row
        row_data = row_data['clue'] #if 'clue' in row_data else 
        if row_data == 'yes':
            return ""
        return row_data

def get_accuracy(path_to_preds):
    preds = pd.read_csv(path_to_preds)
    test_df = pd.read_csv('data/seniority_labelled_test_set_cleaned.csv')
    
    test_df['y_pred'] = preds.values.reshape(-1)
    test_df = test_df.fillna('')
    
    test_df['y_pred'] = test_df['y_pred'].map(process)
    test_df['y_true'] = test_df['y_true'].map(process)
    
    test_df['y_pred_cat'] = test_df['y_pred'].map(categories)
    test_df['y_true_cat'] = test_df['y_true'].map(categories)
    
    exact = (test_df['y_pred'] == test_df['y_true']).mean() * 100
    cat = ((test_df['y_pred'] != test_df['y_true']) & (test_df['y_pred_cat'] == test_df['y_true_cat'])).mean() * 100
    overall = ((test_df['y_pred'] == test_df['y_true']) | (test_df['y_pred_cat'] == test_df['y_true_cat'])).mean() * 100
    
    exact_count = (test_df['y_pred'] == test_df['y_true']).sum()
    cat_count = ((test_df['y_pred'] != test_df['y_true']) & (test_df['y_pred_cat'] == test_df['y_true_cat'])).sum()
    overall_count = ((test_df['y_pred'] == test_df['y_true']) | (test_df['y_pred_cat'] == test_df['y_true_cat'])).sum()
    
    print(f'Exact: {exact_count}/{test_df.shape[0]}')
    print(f'Similar: {cat_count}/{test_df.shape[0]}')
    print(f'Overall: {overall_count}/{test_df.shape[0]}')
    
    res = pd.DataFrame(
        {
            'Exact': round(exact, 2),
            'Similar': round(cat, 2),
            'Overall': round(overall, 2),
        },
        index=['Accuracy (%)']
    )
    
    return res

In [4]:
path_to_preds = 'seniority_labelled_test_set_gemma3_finetune_preds.csv'
get_accuracy(path_to_preds)

Exact: 449/689
Similar: 80/689
Overall: 529/689


Unnamed: 0,Exact,Similar,Overall
Accuracy (%),65.17,11.61,76.78
