# Loading

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from transformers import pipeline
import torch
import string
import re
from bs4 import BeautifulSoup
import json
import copy

In [2]:
pipe = pipeline(
    "text-generation", model="google/gemma-3-1b-it", device="cuda", torch_dtype=torch.bfloat16,
    token='' # put your HF access token here
)

Device set to use cuda


# Salary

In [3]:
train_df = pd.read_csv('data/salary_labelled_development_set_cleaned.csv')
test_df = pd.read_csv('data/salary_labelled_test_set_cleaned.csv')

In [4]:
train_df.iloc[1131]

job_id                                                             51839829
job_title                                        Senior Key Account Manager
job_ad_details            At RB, we’re driven by our fight to make acces...
nation_short_desc                                                        NZ
salary_additional_text     Competitive base + car allowance + bonus + MORE!
y_true                                                        0-0-None-None
Name: 1131, dtype: object

# RBIC Functions

### RB


In [5]:
#== step 1: role explanation
def step_1():
    messages = [
        {"role": "system", "content": "You are an expert job ad annotator. Your task is to extract structured salary information from job descriptions in the format: min-max-currency-frequency. If salary is not found, return: 0-0-None-None."},
        {"role": "user", "content": "Based on your role, can you briefly explain what constitutes a salary range, currency and frequency in a job listing?"}
    ]

    response = pipe(messages, max_new_tokens=1000)
    messages.append(response[0]["generated_text"][-1])

    return response[0]["generated_text"][-1]["content"], messages

In [6]:
#== few-shot examples
def fewshot(messages):
    # messages.append({
    #     "role": "user",
    #     "content": "I will provide you with some examples on how to accomplish your task"
    # })

    # response = pipe(messages, max_new_tokens=1000)
    # messages.append(response[0]["generated_text"][-1])

    few_shot_indices = [1030, 1041, 1025, 1022, 1011]

    for i in few_shot_indices:
        desc = {
          "job_title": train_df.iloc[i].job_title,
          "job_ad_details": train_df.iloc[i].job_ad_details,
          "nation_short_desc": train_df.iloc[i].nation_short_desc,
          "salary_additional_text": train_df.iloc[i].salary_additional_text,
        }
        desc_str = str(desc)

        # add the description
        messages.append({
          "role": "user",
          "content": desc_str
        })

        label = train_df.iloc[i].y_true
        label_str = str(label)

        # add the output
        messages.append({
            "role": "assistant",
            "content": label_str
        })

    return messages

In [7]:
#== step 2: setting sub-task --> ask for salary patterns
def step_2(messages):
    messages.append({
        "role": "user",
        "content": "As a salary extractor, what are some common phrases or numeric patterns that indicate a salary range, currency and frequency in a job description?"
    })

    response = pipe(messages, max_new_tokens=1000)
    messages.append(response[0]["generated_text"][-1])

    return response[0]["generated_text"][-1]["content"], messages

### IC

In [31]:
#== step 3: ask if salary info is in the description
def step_3(messages_static, desc_str):
    messages = copy.deepcopy(messages_static)

    messages.append({
        "role": "user",
        "content": f"{desc_str} does this job description include any salary-related information? Just respond with 'Yes' or 'No'."
    })

    response = pipe(messages, max_new_tokens=1000)
    messages.append(response[0]["generated_text"][-1])

    p3_content = response[0]["generated_text"][-1]["content"]

    # clean and check the response
    response_p3 = p3_content.translate(str.maketrans('', '', string.punctuation))
    response_p3 = response_p3.strip().lower()[:3]
    #print(f"step3 messages: {len(messages)}")
    return response_p3, messages

In [32]:
#== step 4: iterative coaching/finding clues to prevent hallucination
def step_4(response_p3, messages):
    if response_p3 == "yes":
        messages.append({
            "role": "user",
            "content": "Extract the salary range or salary-related phrases from the text verbatim. Respond in JSON: {\"Clue\": \"\"}."
        })
    elif response_p3 == "no":
        messages.append({
            "role": "user",
            "content": "Briefly explain why there is no salary information (e.g., 'No mention of pay or compensation'). Respond in JSON: {\"Clue\": \"\"}."
        })
    else:
        raise Exception(f"Unexpected model output: {response_p3}")

    response = pipe(messages, max_new_tokens=1000)
    messages.append(response[0]["generated_text"][-1])
    #print(f"step4 messages: {len(messages)}")

    return response[0]["generated_text"][-1]["content"], messages

In [50]:
#== step 5: use the clue to generate the final output
def step_5(messages):
    messages.append({
        "role": "user",
        "content": (
            "Based on the extracted salary clue, return a structured salary in JSON. "
            "Use 'nation_short_desc' to determine the correct currency. "
            "Use this format: {\"MinSalary\": \"\", \"MaxSalary\": \"\", \"Currency\": \"\", \"Frequency\": \"\"}. "
            "If not provided explicitly, output 0 for \"MinSalary\" and \"MaxSalary\", and \"None\" for \"Currency\" and \"Frequency\". "
            "If the salary is mentioned, always output a range, where MinSalary and MaxSalary can be equal. "
            "Output the currency as 3 letters. Use adverb to output frequency (annually, monthly, daily or hourly)."
        )
    })

    response = pipe(messages, max_new_tokens=1000)
    answer_str = response[0]["generated_text"][-1]["content"]

    #print(f"step5 messages: {len(messages)}")

    # format and print the output
    try:
        answer_str_ = answer_str[answer_str.find('{'):answer_str.find('}') + 1]
        answer_str_ = answer_str_.replace('“', '"')
        answer_str_ = answer_str_.replace('”', '"')
        answer = json.loads(answer_str_)

        # min_salary = int(round(float(answer['MinSalary'])))
        # max_salary = int(round(float(answer['MaxSalary'])))

        # if  min_salary == 0 and  max_salary == 0:
        #   label = "0-0-None-None"
        # else:
        #   label = f"{min_salary}-{max_salary}-{answer['Currency'].upper()}-{answer['Frequency'].upper()}"
        # label = f"{min_salary}-{max_salary}-{answer['Currency'].upper()}-{answer['Frequency'].upper()}"

        label = f"{answer['MinSalary'] if 'MinSalary' in answer else 0}-{answer['MaxSalary'] if 'MaxSalary' in answer else 0}-{answer['Currency'].upper() if 'Currency' in answer else 'None'}-{answer['Frequency'].upper() if 'Frequency' in answer else  'None'}"



    except json.JSONDecodeError:
        print(f"Failed to parse model output as JSON: {answer_str}")
        label = "ERROR " + answer_str

    return label, answer_str

In [46]:
def RBIC_static_messages(verbose=False, add_fewshot=True):
    response, messages = step_1()
    if verbose: print(f"RB step 1: {response}\n")

    response, messages = step_2(messages)
    if verbose: print(f"RB step 2: {response}\n")

    if add_fewshot:
        messages = fewshot(messages)
        if verbose: print(f"Fewshot examples added\n")
    return messages

In [47]:
def RBIC(messages, desc_str, verbose=False):
    response_p3, messages_local = step_3(messages, desc_str)
    if verbose: print(f"IC step 1: {response_p3}\n")

    response, messages_local = step_4(response_p3, messages_local)
    if verbose: print(f"IC step 2: {response}\n")

    label, answer_str = step_5(messages_local)
    if verbose: print(f"IC step 3 (Final): {label}\n")
    if verbose: print(f"IC step 3 (Final Raw): {answer_str}\n")

    return label

# Testing

### Qualitative Tests

In [13]:
messages_static = RBIC_static_messages(verbose=True, add_fewshot=True)

RB step 1: Okay, let's tackle this job description parsing task. As an expert job ad annotator, I understand the importance of accurately extracting salary data. Here’s a breakdown of what constitutes a salary range, currency, and frequency, as it's crucial for a robust and useful job posting system:

**1. Salary Range:**

*   **Definition:** A salary range indicates a plausible range of salaries for the position. It’s not a fixed number, but rather an estimated upper and lower bound.
*   **What to look for:** It's usually expressed as a range (e.g., "$60,000 - $80,000 annually," "$75,000 - $90,000," "$55,000 - $70,000").  It can also be represented as a percentage (e.g., "55-65% of average salary").
*   **Important:**  The range should be *reasonable* and reflect the expected job level and responsibilities.  Too narrow a range might be misleading, while too broad can be confusing.

**2. Currency:**

*   **Definition:** This specifies the currency in which the salary is expressed (e.g.

In [14]:
ind = 345

desc = {
    "job_title": train_df.iloc[ind].job_title,
    "job_ad_details": train_df.iloc[ind].job_ad_details,
    "nation_short_desc": train_df.iloc[ind].nation_short_desc,
    "salary_additional_text": train_df.iloc[ind].salary_additional_text,
  }
desc_str = str(desc)

label = train_df.iloc[ind].y_true
label_str = str(label)

print(f"len of messages_static {len(messages_static)}")
label_pred = RBIC(messages_static, desc_str, verbose=True)


print(f"pred = {label_pred}")
print(f"truth = {label_str}")

len of messages_static 15
IC step 1: yes

IC step 2: ```json
{"Clue": "Salary: $70049.73 - $98351.39 + SuperHours Per Week: 38"}
```

IC step 3 (Final): 70049.73-98351.39-TH-YEARLY

IC step 3 (Final Raw): {"MinSalary": "70049.73", "MaxSalary": "98351.39", "Currency": "TH", "Frequency": "yearly"}

pred = 70049.73-98351.39-TH-YEARLY
truth = 70049-98351-AUD-ANNUAL


### Quantitative Tests

In [15]:
# df to store model predictions
test_pred_df = pd.DataFrame(columns=["y_pred"])

In [16]:
test_pred_df

Unnamed: 0,y_pred


In [17]:
for i in tqdm(range(len(test_df))):
    desc = {
        "job_title": test_df.iloc[i].job_title,
        "job_ad_details": test_df.iloc[i].job_ad_details,
        "nation_short_desc": test_df.iloc[i].nation_short_desc,
        "salary_additional_text": test_df.iloc[i].salary_additional_text,
    }
    desc_str = str(desc)

    label = test_df.iloc[i].y_true
    label_str = str(label)

    label_pred = RBIC(messages_static, desc_str)

    test_pred_df.loc[len(test_pred_df)] = label_pred

# export the dataframe to a new csv file
test_pred_df.to_csv('salary_labelled_test_set_gemma3_rbic_fewshot_preds.csv', index=False)

  0%|          | 1/567 [00:04<42:10,  4.47s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 567/567 [44:34<00:00,  4.72s/it]


# No few-shot

In [41]:
messages_static = RBIC_static_messages(verbose=True, add_fewshot=False)

RB step 1: Okay, let's break down salary ranges, currencies, and frequency in a job description, as I understand it. 

**Here’s my understanding of the key components:**

* **Salary Range:** This is the core of the salary request. It’s a *range* of acceptable salaries, typically expressed as a minimum and maximum.  It's more nuanced than just a single number. It often includes a *base* salary and potentially a *plus* or *minus* amount for bonuses, stock options, or other incentives.  The range often subtly communicates the expectation – is it a high-end, competitive role, or something more modest?

* **Currency:** This clearly identifies the monetary value of the salary. Common currencies include USD, EUR, GBP, CAD, etc.  It's *essential* to be precise; vague terms like “good salary” don’t convey anything meaningful.

* **Frequency:**  This refers to how often the salary is mentioned. It’s typically expressed as “per year,” “monthly,” “weekly,” “hourly,” or “occasionally.”  More freque

In [51]:
# df to store model predictions
test_pred_df = pd.DataFrame(columns=["y_pred"])

In [52]:
for i in tqdm(range(len(test_df))):
    desc = {
        "job_title": test_df.iloc[i].job_title,
        "job_ad_details": test_df.iloc[i].job_ad_details,
        "nation_short_desc": test_df.iloc[i].nation_short_desc,
        "salary_additional_text": test_df.iloc[i].salary_additional_text,
    }
    desc_str = str(desc)

    label = test_df.iloc[i].y_true
    label_str = str(label)

    label_pred = RBIC(messages_static, desc_str)

    test_pred_df.loc[len(test_pred_df)] = label_pred

# export the dataframe to a new csv file
test_pred_df.to_csv('salary_labelled_test_set_gemma3_rbic_preds.csv', index=False)

 12%|█▏        | 67/567 [03:59<29:16,  3.51s/it] 

Failed to parse model output as JSON: ```json
{
"Clue": "Salary expectations: $X - $Y”",
"MinSalary": "0",
"MaxSalary": "0",
"Currency": "TH",
"Frequency": "None"
}
```


 19%|█▊        | 106/567 [07:19<2:04:04, 16.15s/it]

Failed to parse model output as JSON: Okay, I understand. Please provide the job description text. I will then extract the salary information and return a JSON formatted output according to your specifications.


 72%|███████▏  | 410/567 [30:20<11:37,  4.44s/it]  

Failed to parse model output as JSON: ```json
{
  "Clue": "Salary expectations: $X - $Y””
}
```

```json
{
  "MinSalary": "X - Y",
  "MaxSalary": "Y",
  "Currency": "USD",
  "Frequency": "None"
}
```


 77%|███████▋  | 439/567 [32:40<08:35,  4.03s/it]

Failed to parse model output as JSON: ```json
{
  "Clue": "“Salary is a key factor in our decision””
}
```


 86%|████████▌ | 489/567 [36:49<19:37, 15.10s/it]

Failed to parse model output as JSON: Okay, I understand. Please provide the job description text. I will then analyze it and return the structured salary information in the JSON format you’ve specified.


100%|██████████| 567/567 [41:30<00:00,  4.39s/it]


# Regular prompting

In [56]:
# df to store model predictions
test_pred_df = pd.DataFrame(columns=["y_pred"])

In [57]:
messages_static = [
    {"role": "system", "content": "You are an expert job ad annotator."},
]

In [58]:
for i in tqdm(range(len(test_df))):
    desc = {
        "job_title": test_df.iloc[i].job_title,
        "job_ad_details": test_df.iloc[i].job_ad_details,
        "nation_short_desc": test_df.iloc[i].nation_short_desc,
        "salary_additional_text": test_df.iloc[i].salary_additional_text,
    }
    desc_str = str(desc)

    messages = copy.deepcopy(messages_static)
    messages.append({
        "role": "user",
        "content": (
            f"{desc_str} Extract structured salary information from this job description in the format: min-max-currency-frequency. "
            "Respond in JSON: {\"MinSalary\": \"\", \"MaxSalary\": \"\", \"Currency\": \"\", \"Frequency\": \"\"}. "
            "If not provided explicitly, output 0 for \"MinSalary\" and \"MaxSalary\", and \"None\" for \"Currency\" and \"Frequency\". "
            "If the salary is mentioned, always output a range, where MinSalary and MaxSalary can be equal. "
            "Use 'nation_short_desc' to determine the correct currency. "
            "Output the currency as 3 letters. Use adverb to output frequency (annually, monthly, daily or hourly)."
        )
    })
    
    response = pipe(messages, max_new_tokens=1000)
    answer_str = response[0]["generated_text"][-1]["content"]

    # format the output
    try:
        answer_str_ = answer_str[answer_str.find('{'):answer_str.find('}') + 1]
        answer_str_ = answer_str_.replace('“', '"')
        answer_str_ = answer_str_.replace('”', '"')
        answer = json.loads(answer_str_)

        label = f"{answer['MinSalary'] if 'MinSalary' in answer else 0}-{answer['MaxSalary'] if 'MaxSalary' in answer else 0}-{answer['Currency'].upper() if 'Currency' in answer else 'None'}-{answer['Frequency'].upper() if 'Frequency' in answer else  'None'}"

    except json.JSONDecodeError:
        print(f"Failed to parse model output as JSON: {answer_str}")
        label = "ERROR " + answer_str

    test_pred_df.loc[len(test_pred_df)] = label

# export the dataframe to a new csv file
test_pred_df.to_csv('salary_labelled_test_set_gemma3_preds.csv', index=False)

 84%|████████▍ | 477/567 [14:19<02:38,  1.77s/it]

Failed to parse model output as JSON: Okay, I'm ready to analyze the job description and extract the salary information in the specified JSON format. Please provide the job description text.



100%|██████████| 567/567 [17:00<00:00,  1.80s/it]


# Metrics

In [5]:
import json
import string
import pandas as pd

In [6]:
def simple_post_process(row):
    if 'ERROR' not in row:
        row = row.translate(str.maketrans('', '', '"'))
        try:
            mn, mx, cur, freq = row.split('-')
        except Exception:
            row_split = row.split('-')
            mn = row_split[0]
            mx = ''.join(c for c in mn if c.isdigit())
            mx = row_split[-3]
            mx = ''.join(c for c in mx if c.isdigit())
            cur = row_split[-2]
            freq = row_split[-1]
        # 'NONE' -> 'None'
        cur = 'None' if cur == 'NONE' else cur
        freq = 'None' if freq == 'NONE' else freq
        # cast min and max salary to int
        try:
            mn = int(round(float(mn)))
        except Exception:
            mn = 0
        try:
            mx = int(round(float(mx)))
        except Exception:
            mx = 0
        return str(mn), str(mx), cur, freq, f'{mn}-{mx}-{cur}-{freq}'
    else:
        row = row.strip()
        try:
            row = row[row.find('{'):-4]
            row_data = json.loads(row)
        except Exception:
            return 0, 0, 'None', 'None', '0-0-None-None'
        row_data = row_data['Full Time'] if 'Full Time' in row_data else (row_data['Full-timer'] if 'Full-timer' in row_data else row_data)
        try:
            mn = row_data['MinSalary']
            mx = row_data['MaxSalary']
            cur = row_data['Currency']
            freq = row_data['Frequency']
        except Exception:
            return 0, 0, 'None', 'None', '0-0-None-None'
        # 'NONE' -> 'None'
        cur = 'None' if cur == 'NONE' else cur
        freq = 'None' if freq == 'NONE' else freq
        # cast min and max salary to int
        mn = int(round(float(mn)))
        mx = int(round(float(mx)))
        return str(mn), str(mx), cur, freq, f'{mn}-{mx}-{cur}-{freq}'

def split_target(row):
    mn, mx, cur, freq = row.split('-')
    return mn, mx, cur, freq

def get_accuracy(path_to_preds):
    preds = pd.read_csv(path_to_preds)
    test_df = pd.read_csv('data/salary_labelled_test_set_cleaned.csv')
    
    test_df['y_pred'] = preds.values.reshape(-1)
    
    test_df['min_salary_pred'], test_df['max_salary_pred'], test_df['currency_pred'], test_df['freq_pred'], test_df['y_pred'] = zip(*test_df['y_pred'].map(simple_post_process))
    test_df['min_salary_true'], test_df['max_salary_true'], test_df['currency_true'], test_df['freq_true'] = zip(*test_df['y_true'].map(split_target))
    
    acc_overall = (test_df['y_pred'] == test_df['y_true']).mean() * 100
    acc_min = (test_df['min_salary_pred'] == test_df['min_salary_true']).mean() * 100
    acc_max = (test_df['max_salary_pred'] == test_df['max_salary_true']).mean() * 100
    acc_curr = (test_df['currency_pred'] == test_df['currency_true']).mean() * 100
    acc_freq = (test_df['freq_pred'] == test_df['freq_true']).mean() * 100
    
    res = pd.DataFrame(
        {
            'Overall': round(acc_overall, 2),
            'Min Salary': round(acc_min, 2),
            'Max Salary': round(acc_max, 2),
            'Currency': round(acc_curr, 2),
            'Frequency': round(acc_freq, 2),
        },
        index=['Accuracy (%)']
    )
    
    return res

In [9]:
get_accuracy('salary_labelled_test_set_gemma3_rbic_fewshot_preds.csv')

Unnamed: 0,Overall,Min Salary,Max Salary,Currency,Frequency
Accuracy (%),3.17,65.78,56.79,8.47,32.8


In [10]:
get_accuracy('salary_labelled_test_set_gemma3_rbic_preds.csv')

Unnamed: 0,Overall,Min Salary,Max Salary,Currency,Frequency
Accuracy (%),4.76,81.48,72.49,21.69,58.38


In [11]:
get_accuracy('salary_labelled_test_set_gemma3_preds.csv')

Unnamed: 0,Overall,Min Salary,Max Salary,Currency,Frequency
Accuracy (%),8.99,51.85,52.91,35.8,47.44
