# Loading

In [1]:
import copy
import json
import os
import re
import string

import numpy as np
import openai
import pandas as pd
from bs4 import BeautifulSoup
from openai import OpenAI
from tqdm import tqdm

In [2]:
# Read the DeepSeek API key from the DEEPSEEK_API_KEY environment variable so the
# secret never has to be pasted into (and committed with) the notebook.
# Falls back to the previous empty-string placeholder when the variable is unset.
client = OpenAI(api_key=os.environ.get("DEEPSEEK_API_KEY", ""), base_url="https://api.deepseek.com")
model="deepseek-chat" # DeepSeek-V3

# Salary

In [3]:
# Labelled salary data: the development split supplies the few-shot examples,
# the test split is what the evaluation loops below iterate over.
train_df = pd.read_csv('data/salary_labelled_development_set_cleaned.csv')
test_df = pd.read_csv('data/salary_labelled_test_set_cleaned.csv')

In [4]:
# Inspect one development-set record (positional row 1131) to see the available columns.
train_df.iloc[1131]

job_id                                                             51839829
job_title                                        Senior Key Account Manager
job_ad_details            At RB, we’re driven by our fight to make acces...
nation_short_desc                                                        NZ
salary_additional_text     Competitive base + car allowance + bonus + MORE!
y_true                                                        0-0-None-None
Name: 1131, dtype: object

# RBIC Functions

### RB


In [5]:
#== step 1: role explanation
def step_1():
    """Open the conversation by assigning the annotator role and asking the model to explain it.

    Sends a system prompt plus one user question to the chat-completions
    endpoint using the module-level ``client`` and ``model``.

    Returns:
        tuple: ``(reply, conversation)`` where ``reply`` is the assistant's text
        and ``conversation`` is the three-message list (system, user, assistant).
    """
    system_prompt = "You are an expert job ad annotator. Your task is to extract structured salary information from job descriptions in the format: min-max-currency-frequency. If salary is not found, return: 0-0-None-None."
    question = "Based on your role, can you briefly explain what constitutes a salary range, currency and frequency in a job listing?"

    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": question},
    ]

    completion = client.chat.completions.create(model=model, messages=conversation)
    reply = completion.choices[0].message.content

    # Keep the model's own explanation in the history so later steps build on it
    conversation.append({"role": "assistant", "content": reply})
    return reply, conversation

In [6]:
#== few-shot examples
def fewshot(messages, few_shot_indices=(1030, 1041, 1025, 1022, 1011)):
    """Append labelled development-set examples to the conversation as user/assistant turns.

    For every index, the job ad fields are rendered as ``str(dict)`` in a user
    message and the ground-truth ``y_true`` label is added as the assistant reply.

    Args:
        messages: Conversation list; it is extended in place.
        few_shot_indices: Positional (``iloc``) row indices into the module-level
            ``train_df``. Defaults to the five rows the notebook originally hard-coded.

    Returns:
        The same ``messages`` list, now with two extra messages per example.
    """
    for i in few_shot_indices:
        # Look the row up once instead of once per field
        example = train_df.iloc[i]

        desc = {
          "job_title": example.job_title,
          "job_ad_details": example.job_ad_details,
          "nation_short_desc": example.nation_short_desc,
          "salary_additional_text": example.salary_additional_text,
        }

        # add the description
        messages.append({
          "role": "user",
          "content": str(desc)
        })

        # add the output
        messages.append({
            "role": "assistant",
            "content": str(example.y_true)
        })

    return messages

In [7]:
#== step 2: setting sub-task --> ask for salary patterns
def step_2(messages):
    """Ask the model which phrases and numeric patterns signal salary information.

    The question and the model's answer are both appended to ``messages``
    (the list is modified in place).

    Args:
        messages: Conversation so far.

    Returns:
        tuple: ``(reply, messages)`` with ``reply`` being the assistant's text.
    """
    question = "As a salary extractor, what are some common phrases or numeric patterns that indicate a salary range, currency and frequency in a job description?"
    messages.append({"role": "user", "content": question})

    completion = client.chat.completions.create(model=model, messages=messages)
    reply = completion.choices[0].message.content

    messages.append({"role": "assistant", "content": reply})
    return reply, messages

### IC

In [8]:
#== step 3: ask if salary info is in the description
def step_3(messages_static, desc_str):
    """Ask whether the job description contains salary information.

    Works on a deep copy of ``messages_static`` so the shared conversation
    prefix is never modified.

    Args:
        messages_static: Shared conversation prefix (left untouched).
        desc_str: Stringified job description to analyse.

    Returns:
        tuple: ``(response_p3, messages)`` where ``response_p3`` is the first
        word of the reply, lower-cased, stripped of punctuation and truncated
        to three characters (``"yes"`` / ``"no"`` for well-behaved replies,
        ``""`` for an empty reply), and ``messages`` is the per-description
        conversation including the question and the raw reply.
    """
    messages = copy.deepcopy(messages_static)

    messages.append({
        "role": "user",
        "content": f"{desc_str} does this job description include any salary-related information? Just respond with 'Yes' or 'No'."
    })

    response = client.chat.completions.create(
        model=model,
        messages=messages
    )
    p3_content = response.choices[0].message.content
    messages.append({"role": "assistant", "content": p3_content})

    # clean and check the response
    cleaned = (p3_content or "").translate(str.maketrans('', '', string.punctuation))
    # Use the first word only: slicing the whole reply turned "No, there is ..."
    # or "No\n..." into "no " / "no\n", which step_4 rejected as unexpected output.
    words = cleaned.lower().split()
    response_p3 = words[0][:3] if words else ""
    return response_p3, messages

In [9]:
#== step 4: iterative coaching/finding clues to prevent hallucination
def step_4(response_p3, messages):
    """Ask the model for a supporting clue that matches its yes/no answer.

    Args:
        response_p3: Normalised answer from step 3; must be ``"yes"`` or ``"no"``.
        messages: Per-description conversation; extended in place with the
            follow-up question and the model's reply.

    Returns:
        tuple: ``(reply, messages)`` with ``reply`` being the assistant's text.

    Raises:
        Exception: If ``response_p3`` is neither ``"yes"`` nor ``"no"``; nothing
            is appended and no request is sent in that case.
    """
    # Pick the follow-up first so there is a single append/request path below
    if response_p3 == "yes":
        follow_up = "Extract the salary range or salary-related phrases from the text verbatim. Respond in JSON: {\"Clue\": \"\"}."
    elif response_p3 == "no":
        follow_up = "Briefly explain why there is no salary information (e.g., 'No mention of pay or compensation'). Respond in JSON: {\"Clue\": \"\"}."
    else:
        raise Exception(f"Unexpected model output: {response_p3}")

    messages.append({"role": "user", "content": follow_up})

    completion = client.chat.completions.create(model=model, messages=messages)
    reply = completion.choices[0].message.content

    messages.append({"role": "assistant", "content": reply})
    return reply, messages

In [10]:
#== step 5: use the clue to generate the final output
def step_5(messages):
    """Ask for the final structured salary and turn the reply into a label string.

    The instruction is appended to ``messages`` in place; the model's reply is
    not appended.

    Args:
        messages: Per-description conversation built by the previous steps.

    Returns:
        tuple: ``(label, answer_str)``. ``label`` is
        ``"<MinSalary>-<MaxSalary>-<CURRENCY>-<FREQUENCY>"`` (currency and
        frequency upper-cased) when the reply contains a JSON object with those
        four keys, otherwise ``"ERROR <raw reply>"``. ``answer_str`` is the raw
        model reply.
    """
    messages.append({
        "role": "user",
        "content": (
            "Based on the extracted salary clue, return a structured salary in JSON. "
            "Use 'nation_short_desc' to determine the correct currency. "
            "Use this format: {\"MinSalary\": \"\", \"MaxSalary\": \"\", \"Currency\": \"\", \"Frequency\": \"\"}. "
            "If not provided explicitly, output 0 for \"MinSalary\" and \"MaxSalary\", and \"None\" for \"Currency\" and \"Frequency\". "
            "If the salary is mentioned, always output a range, where MinSalary and MaxSalary can be equal. "
            "Output the currency as 3 letters. Use adverb to output frequency (annual, monthly, daily or hourly)."
        )
    })

    response = client.chat.completions.create(
        model=model,
        messages=messages
    )
    answer_str = response.choices[0].message.content

    # format the output
    try:
        json_start = answer_str.find('{')
        if json_start == -1:
            raise ValueError("no JSON object in model output")
        # Normalise curly quotes, then decode the first complete JSON object and
        # ignore whatever follows it (closing code fence, commentary, ...).
        json_text = answer_str[json_start:].replace('“', '"').replace('”', '"')
        answer, _ = json.JSONDecoder().raw_decode(json_text)

        # str() keeps a JSON null / numeric currency or frequency from crashing .upper()
        label = (
            f"{answer['MinSalary']}-{answer['MaxSalary']}-"
            f"{str(answer['Currency']).upper()}-{str(answer['Frequency']).upper()}"
        )
    except (ValueError, KeyError, AttributeError, TypeError):
        # ValueError covers json.JSONDecodeError; the other types cover replies whose
        # JSON has the wrong shape (nested per-role objects, missing keys, lists) or
        # a missing reply. One bad reply must not abort a multi-hour evaluation run.
        print(f"Failed to parse model output as JSON: {answer_str}")
        label = f"ERROR {answer_str}"

    return label, answer_str

In [11]:
def RBIC_static_messages(verbose=False, add_fewshot=True):
    """Build the conversation prefix that is shared by every job description.

    Runs ``step_1`` and ``step_2`` and, optionally, appends the few-shot
    examples via ``fewshot``.

    Args:
        verbose: Print each intermediate reply when True.
        add_fewshot: Append the few-shot examples when True.

    Returns:
        The resulting message list.
    """
    reply, conversation = step_1()
    if verbose:
        print(f"RB step 1: {reply}\n")

    reply, conversation = step_2(conversation)
    if verbose:
        print(f"RB step 2: {reply}\n")

    if not add_fewshot:
        return conversation

    conversation = fewshot(conversation)
    if verbose:
        print("Fewshot examples added\n")
    return conversation

In [12]:
def RBIC(messages, desc_str, verbose=False):
    """Run the three IC steps for one job description and return its predicted label.

    Args:
        messages: Shared conversation prefix, passed to ``step_3``.
        desc_str: Stringified job description.
        verbose: Print the intermediate results of every step when True.

    Returns:
        The label produced by ``step_5``.
    """
    has_salary, conversation = step_3(messages, desc_str)
    if verbose:
        print(f"IC step 1: {has_salary}\n")

    clue, conversation = step_4(has_salary, conversation)
    if verbose:
        print(f"IC step 2: {clue}\n")

    label, raw_answer = step_5(conversation)
    if verbose:
        print(f"IC step 3 (Final): {label}\n")
        print(f"IC step 3 (Final Raw): {raw_answer}\n")

    return label

# Testing

### Qualitative Tests

In [13]:
# Build the shared RB conversation once (steps 1-2 plus few-shot examples);
# the cells below pass it to RBIC() for every job ad.
messages_static = RBIC_static_messages(verbose=True, add_fewshot=True)

RB step 1: Certainly! Here’s a breakdown of the key components for structured salary extraction in job listings:  

1. **Salary Range**:  
   - Typically expressed as a **min-max** (e.g., "$50,000 - $70,000" → `50000-70000`).  
   - May appear as a single figure (e.g., "Salary: €60,000" → `60000-60000`).  

2. **Currency**:  
   - Explicit symbols (e.g., $, €, £) or codes (e.g., USD, EUR, GBP).  
   - Sometimes implied by location (e.g., "£" for UK roles, "$" for US/Canada/AUD depending on context).  

3. **Frequency**:  
   - **Annual** (e.g., "per year," "annually," "yr").  
   - **Monthly** (e.g., "per month," "monthly").  
   - **Hourly** (e.g., "$20/hour," "per hour").  
   - **Other** (e.g., "daily," "weekly," or one-time "bonus").  

**Example Annotations**:  
- "$70k - $90k per year" → `70000-90000-USD-annual`  
- "€45,000 annually" → `45000-45000-EUR-annual`  
- "£15/hour" → `15-15-GBP-hourly`  

If no salary is mentioned, return `0-0-None-None`. Let me know if you'd like furt

In [14]:
# Qualitative check: run the full pipeline on one development-set record
ind = 345
example = train_df.iloc[ind]

# Only these four fields are shown to the model
desc = {
    field: example[field]
    for field in ("job_title", "job_ad_details", "nation_short_desc", "salary_additional_text")
}
desc_str = str(desc)

# Ground-truth label to compare against
label = example.y_true
label_str = str(label)

print(f"len of messages_static {len(messages_static)}")
label_pred = RBIC(messages_static, desc_str, verbose=True)


print(f"pred = {label_pred}")
print(f"truth = {label_str}")

len of messages_static 15
IC step 1: yes

IC step 2: ```json
{
  "Clue": "$70049.73 - $98351.39 + Super"
}
```

IC step 3 (Final): 70049.73-98351.39-AUD-ANNUAL

IC step 3 (Final Raw): ```json
{
  "MinSalary": "70049.73",
  "MaxSalary": "98351.39",
  "Currency": "AUD",
  "Frequency": "annual"
}
```

pred = 70049.73-98351.39-AUD-ANNUAL
truth = 70049-98351-AUD-ANNUAL


### Quantitative Tests

In [15]:
# Empty single-column frame ("y_pred") that collects the model predictions
# for the RBIC + few-shot run.
test_pred_df = pd.DataFrame(columns=["y_pred"])

In [16]:
# Display the predictions frame (still empty at this point).
test_pred_df

Unnamed: 0,y_pred


In [17]:
# Run RBIC (with few-shot examples) over the whole test set.
# Predictions are gathered in a plain list and the frame is rebuilt from it, so
# re-running this cell replaces the results instead of appending duplicates.
predictions = []
for i in tqdm(range(len(test_df))):
    row = test_df.iloc[i]
    desc = {
        "job_title": row.job_title,
        "job_ad_details": row.job_ad_details,
        "nation_short_desc": row.nation_short_desc,
        "salary_additional_text": row.salary_additional_text,
    }
    desc_str = str(desc)

    label_pred = RBIC(messages_static, desc_str)
    predictions.append(label_pred)

test_pred_df = pd.DataFrame({"y_pred": predictions})

# export the dataframe to a new csv file
test_pred_df.to_csv('salary_labelled_test_set_deepseek_rbic_fewshot_preds.csv', index=False)

100%|███████████████████████████████████████| 567/567 [2:30:05<00:00, 15.88s/it]


# No few-shot

In [18]:
# Rebuild the shared RB conversation without few-shot examples (steps 1-2 only).
messages_static = RBIC_static_messages(verbose=True, add_fewshot=False)

RB step 1: Certainly! Here’s a breakdown of the key components for structured salary extraction in job listings:  

1. **Salary Range (min-max)**:  
   - The minimum and maximum values of the offered compensation (e.g., *$50,000 - $70,000* → **50000-70000**).  
   - If only a single figure is given (e.g., *$80,000*), treat it as **80000-80000**.  

2. **Currency**:  
   - The monetary unit (e.g., USD, EUR, GBP, CAD, INR).  
   - Symbols ($, €, £) or codes (USD, EUR) should be normalized to standard ISO codes (e.g., *$* → **USD**, *€* → **EUR**).  

3. **Frequency**:  
   - The time unit for the salary (e.g., *per year*, *monthly*, *hourly*).  
   - Common terms:  
     - *yearly/annually/per year* → **yearly**  
     - *monthly/per month* → **monthly**  
     - *hourly/per hour* → **hourly**  

**Example**:  
- *"Salary: €60,000 - €75,000 per year"* → **60000-75000-EUR-yearly**  
- *"Pay: $30/hour"* → **30-30-USD-hourly**  
- No salary mentioned → **0-0-None-None**  

Let me know if yo

In [19]:
# Fresh empty "y_pred" frame for the RBIC run without few-shot examples.
test_pred_df = pd.DataFrame(columns=["y_pred"])

In [20]:
# Run RBIC (no few-shot examples) over the whole test set.
# Predictions are gathered in a plain list and the frame is rebuilt from it, so
# re-running this cell replaces the results instead of appending duplicates.
predictions = []
for i in tqdm(range(len(test_df))):
    row = test_df.iloc[i]
    desc = {
        "job_title": row.job_title,
        "job_ad_details": row.job_ad_details,
        "nation_short_desc": row.nation_short_desc,
        "salary_additional_text": row.salary_additional_text,
    }
    desc_str = str(desc)

    label_pred = RBIC(messages_static, desc_str)
    predictions.append(label_pred)

test_pred_df = pd.DataFrame({"y_pred": predictions})

# export the dataframe to a new csv file
test_pred_df.to_csv('salary_labelled_test_set_deepseek_rbic_preds.csv', index=False)

 72%|████████████████████████████▏          | 409/567 [1:43:17<41:09, 15.63s/it]

Failed to parse model output as JSON: ```json
{
  "Full Time": {
    "MinSalary": "1500",
    "MaxSalary": "1900",
    "Currency": "MYR",
    "Frequency": "monthly"
  },
  "Part Time": {
    "MinSalary": "8",
    "MaxSalary": "8",
    "Currency": "MYR",
    "Frequency": "hourly"
  },
  "Freelancer": {
    "MinSalary": "0",
    "MaxSalary": "0",
    "Currency": "None",
    "Frequency": "None"
  }
}
```


100%|███████████████████████████████████████| 567/567 [2:21:44<00:00, 15.00s/it]


# Regular prompting

In [21]:
# Fresh empty "y_pred" frame for the regular-prompting baseline.
test_pred_df = pd.DataFrame(columns=["y_pred"])

In [22]:
# Baseline conversation prefix: the system prompt only (no RB steps, no few-shot examples).
messages_static = [
    {"role": "system", "content": "You are an expert job ad annotator. Your task is to extract structured salary information from job descriptions in the format: min-max-currency-frequency. If salary is not found, return: 0-0-None-None."},
]

In [23]:
# Regular-prompting baseline: one request per job ad, no RB/IC steps.
# Predictions are gathered in a plain list and the frame is rebuilt from it, so
# re-running this cell replaces the results instead of appending duplicates.
predictions = []
for i in tqdm(range(len(test_df))):
    row = test_df.iloc[i]
    desc = {
        "job_title": row.job_title,
        "job_ad_details": row.job_ad_details,
        "nation_short_desc": row.nation_short_desc,
        "salary_additional_text": row.salary_additional_text,
    }
    desc_str = str(desc)

    messages = copy.deepcopy(messages_static)
    messages.append({
        "role": "user",
        "content": (
            f"{desc_str} Extract structured salary information from this job descriptions in the format: min-max-currency-frequency. "
            "Respond in JSON: {\"MinSalary\": \"\", \"MaxSalary\": \"\", \"Currency\": \"\", \"Frequency\": \"\"}. "
            "If not provided explicitly, output 0 for \"MinSalary\" and \"MaxSalary\", and \"None\" for \"Currency\" and \"Frequency\". "
            "If the salary is mentioned, always output a range, where MinSalary and MaxSalary can be equal. "
            "Use 'nation_short_desc' to determine the correct currency. "
            "Output the currency as 3 letters. Use adverb to output frequency (annual, monthly, daily or hourly)."
        )
    })

    response = client.chat.completions.create(
        model=model,
        messages=messages
    )
    answer_str = response.choices[0].message.content

    # format the output
    try:
        json_start = answer_str.find('{')
        if json_start == -1:
            raise ValueError("no JSON object in model output")
        # Normalise curly quotes, then decode the first complete JSON object and
        # ignore whatever follows it (closing code fence, commentary, ...).
        json_text = answer_str[json_start:].replace('“', '"').replace('”', '"')
        answer, _ = json.JSONDecoder().raw_decode(json_text)

        # str() keeps a JSON null / numeric currency or frequency from crashing .upper()
        label = (
            f"{answer['MinSalary']}-{answer['MaxSalary']}-"
            f"{str(answer['Currency']).upper()}-{str(answer['Frequency']).upper()}"
        )

    except (ValueError, KeyError, AttributeError, TypeError):
        # ValueError covers json.JSONDecodeError; the rest cover wrongly shaped JSON
        # (nested objects, missing keys) so one bad reply cannot abort the whole run.
        print(f"Failed to parse model output as JSON: {answer_str}")
        label = f"ERROR {answer_str}"

    predictions.append(label)

test_pred_df = pd.DataFrame({"y_pred": predictions})

# export the dataframe to a new csv file
test_pred_df.to_csv('salary_labelled_test_set_deepseek_preds.csv', index=False)

 40%|████████████████▌                        | 229/567 [19:34<45:34,  8.09s/it]

Failed to parse model output as JSON: ```json
{
  "Full-timer": {
    "MinSalary": "1500",
    "MaxSalary": "1700",
    "Currency": "MYR",
    "Frequency": "monthly"
  },
  "Part-timer Jadual 1": {
    "MinSalary": "400",
    "MaxSalary": "500",
    "Currency": "MYR",
    "Frequency": "monthly"
  },
  "Part-timer Jadual 2": {
    "MinSalary": "400",
    "MaxSalary": "500",
    "Currency": "MYR",
    "Frequency": "monthly"
  },
  "Part-timer Jadual 3": {
    "MinSalary": "400",
    "MaxSalary": "400",
    "Currency": "MYR",
    "Frequency": "monthly"
  },
  "Combination Example 1": {
    "MinSalary": "1200",
    "MaxSalary": "1400",
    "Currency": "MYR",
    "Frequency": "monthly"
  },
  "Combination Example 2": {
    "MinSalary": "1900",
    "MaxSalary": "2100",
    "Currency": "MYR",
    "Frequency": "monthly"
  }
}
```


 72%|█████████████████████████████▌           | 409/567 [35:08<16:06,  6.12s/it]

Failed to parse model output as JSON: ```json
{
  "Full Time": {
    "MinSalary": "1500",
    "MaxSalary": "1900",
    "Currency": "MYR",
    "Frequency": "monthly"
  },
  "Part Time": {
    "MinSalary": "8",
    "MaxSalary": "8",
    "Currency": "MYR",
    "Frequency": "hourly"
  },
  "Freelancer": {
    "MinSalary": "0",
    "MaxSalary": "0",
    "Currency": "None",
    "Frequency": "None"
  }
}
```


100%|█████████████████████████████████████████| 567/567 [48:37<00:00,  5.15s/it]


# Metrics

In [1]:
import json
import string
import pandas as pd

In [2]:
def _salary_to_int(value, allow_digit_fallback=False):
    """Round a salary token to an int; return 0 when it cannot be interpreted."""
    try:
        return int(round(float(value)))
    except (TypeError, ValueError, OverflowError):
        pass
    if allow_digit_fallback:
        # Token such as "$50,000": keep only the digits and try again
        digits = ''.join(c for c in str(value) if c.isdigit())
        try:
            return int(digits)
        except ValueError:
            pass
    return 0


def _none_label(text):
    """Map the upper-cased 'NONE' produced by the prediction step back to 'None'."""
    return 'None' if text == 'NONE' else text


def simple_post_process(row):
    """Normalise one raw prediction into (min, max, currency, frequency, label).

    Args:
        row: Raw prediction string, either ``"min-max-CUR-FREQ"`` or
            ``"ERROR <raw model reply>"``.

    Returns:
        tuple of five strings ``(min, max, currency, frequency, label)`` with the
        salaries rounded to whole numbers and ``label`` rebuilt from the parts.
        Predictions that cannot be interpreted yield
        ``('0', '0', 'None', 'None', '0-0-None-None')``.

    Notes:
        * A label with exactly four ``-`` separated parts is taken as is;
          unparseable salaries become 0.
        * Otherwise the first part is the minimum and the last three parts are
          maximum, currency and frequency. Salaries are parsed as numbers first
          and only reduced to their digits when that fails, so decimals are not
          mangled and the minimum is cleaned too (it previously was not).
        * For ``ERROR`` rows the JSON object between the first ``{`` and the last
          ``}`` is parsed, a ``'Full Time'`` / ``'Full-timer'`` entry is preferred
          when present, and currency/frequency are upper-cased like regular labels.
    """
    default = ('0', '0', 'None', 'None', '0-0-None-None')

    # e.g. NaN coming from an empty CSV cell
    if not isinstance(row, str):
        return default

    if 'ERROR' not in row:
        parts = row.replace('"', '').split('-')
        if len(parts) == 4:
            mn, mx, cur, freq = parts
            mn, mx = _salary_to_int(mn), _salary_to_int(mx)
        elif len(parts) >= 3:
            cur, freq = parts[-2], parts[-1]
            mn = _salary_to_int(parts[0], allow_digit_fallback=True)
            mx = _salary_to_int(parts[-3], allow_digit_fallback=True)
        else:
            return default
        cur, freq = _none_label(cur), _none_label(freq)
        return str(mn), str(mx), cur, freq, f'{mn}-{mx}-{cur}-{freq}'

    text = row.strip()
    start, end = text.find('{'), text.rfind('}')
    if start == -1 or end < start:
        return default
    try:
        row_data = json.loads(text[start:end + 1])
    except ValueError:
        return default

    # Multi-role answers: score the full-time figures
    if isinstance(row_data, dict):
        for key in ('Full Time', 'Full-timer'):
            if key in row_data:
                row_data = row_data[key]
                break
    try:
        mn = row_data['MinSalary']
        mx = row_data['MaxSalary']
        cur = row_data['Currency']
        freq = row_data['Frequency']
    except (KeyError, TypeError, IndexError):
        return default

    # Raw JSON values are not upper-cased yet (regular labels are)
    cur = _none_label(str(cur).upper())
    freq = _none_label(str(freq).upper())
    mn, mx = _salary_to_int(mn), _salary_to_int(mx)
    return str(mn), str(mx), cur, freq, f'{mn}-{mx}-{cur}-{freq}'

def split_target(row):
    """Split a ground-truth label ``"min-max-currency-frequency"`` into its four parts.

    Raises:
        ValueError: If the label does not consist of exactly four ``-`` separated parts.
    """
    minimum, maximum, currency, frequency = row.split('-')
    return (minimum, maximum, currency, frequency)

def get_accuracy(path_to_preds, path_to_test='data/salary_labelled_test_set_cleaned.csv'):
    """Score a predictions CSV against the labelled test set.

    Predictions are normalised with ``simple_post_process`` and the targets are
    split with ``split_target``; accuracy is the share of exact string matches.

    Args:
        path_to_preds: CSV holding one prediction per test row (all values of
            the file are flattened into a single sequence).
        path_to_test: CSV with the ground truth in a ``y_true`` column. Defaults
            to the test set used throughout the notebook.

    Returns:
        pandas.DataFrame with one row (``'Accuracy (%)'``) and the columns
        ``Overall``, ``Min Salary``, ``Max Salary``, ``Currency`` and
        ``Frequency``, each rounded to two decimals.

    Raises:
        ValueError: If the test set is empty or the number of predictions does
            not match the number of test rows.
    """
    preds = pd.read_csv(path_to_preds)
    test_df = pd.read_csv(path_to_test)

    y_pred = preds.values.reshape(-1)
    if test_df.empty:
        raise ValueError(f"{path_to_test} contains no rows to score")
    if len(y_pred) != len(test_df):
        # Typically an interrupted or duplicated prediction run
        raise ValueError(
            f"{path_to_preds} holds {len(y_pred)} predictions but "
            f"{path_to_test} has {len(test_df)} rows"
        )
    test_df['y_pred'] = y_pred

    test_df['min_salary_pred'], test_df['max_salary_pred'], test_df['currency_pred'], test_df['freq_pred'], test_df['y_pred'] = zip(*test_df['y_pred'].map(simple_post_process))
    test_df['min_salary_true'], test_df['max_salary_true'], test_df['currency_true'], test_df['freq_true'] = zip(*test_df['y_true'].map(split_target))

    # report column -> (prediction column, target column)
    compared = {
        'Overall': ('y_pred', 'y_true'),
        'Min Salary': ('min_salary_pred', 'min_salary_true'),
        'Max Salary': ('max_salary_pred', 'max_salary_true'),
        'Currency': ('currency_pred', 'currency_true'),
        'Frequency': ('freq_pred', 'freq_true'),
    }
    scores = {
        name: round((test_df[pred_col] == test_df[true_col]).mean() * 100, 2)
        for name, (pred_col, true_col) in compared.items()
    }

    return pd.DataFrame(scores, index=['Accuracy (%)'])

In [3]:
# Accuracy of the RBIC run with few-shot examples.
get_accuracy('salary_labelled_test_set_deepseek_rbic_fewshot_preds.csv')

Unnamed: 0,Overall,Min Salary,Max Salary,Currency,Frequency
Accuracy (%),88.36,91.36,90.83,97.53,97.71


In [4]:
# Accuracy of the RBIC run without few-shot examples.
get_accuracy('salary_labelled_test_set_deepseek_rbic_preds.csv')

Unnamed: 0,Overall,Min Salary,Max Salary,Currency,Frequency
Accuracy (%),87.13,91.01,90.65,97.0,96.65


In [5]:
# Accuracy of the regular-prompting baseline.
get_accuracy('salary_labelled_test_set_deepseek_preds.csv')

Unnamed: 0,Overall,Min Salary,Max Salary,Currency,Frequency
Accuracy (%),87.3,91.36,90.48,97.18,96.65
