# Loading

In [1]:
import pandas as pd
import numpy as np
import openai
from openai import OpenAI
import string
import re
from bs4 import BeautifulSoup
import json
import copy

In [None]:
# Labelled development set; the few-shot examples and qualitative checks index into it.
train_df = pd.read_csv('data/salary_labelled_development_set_cleaned.csv')

In [None]:
# Labelled test set; the quantitative run predicts a label for every row of it.
test_df = pd.read_csv('data/salary_labelled_test_set_cleaned.csv')

In [None]:
# Never paste a real key into the notebook. With no explicit api_key the
# OpenAI client reads the OPENAI_API_KEY environment variable instead.
client = OpenAI()

In [None]:
# Chat model name passed to every client.chat.completions.create call below.
model="gpt-3.5-turbo"

# RBIC Functions

### RB


In [None]:
#== step 1: role explanation
def step_1():
  """Open the conversation by assigning the annotator role to the model.

  Sends the system prompt and a first user question, then records the
  model's reply in the conversation history.

  Returns:
    (reply_text, messages): the assistant's reply and the list of system,
    user and assistant message dicts built so far.
  """
  system_turn = {
      "role": "system",
      "content": "You are an expert job ad annotator. Your task is to extract structured salary information from job descriptions in the format: min-max-currency-frequency. If salary is not found, return: 0-0-None-None.",
  }
  question_turn = {
      "role": "user",
      "content": "Based on your role, can you briefly explain what constitutes a salary range, currency and frequency in a job listing?",
  }
  messages = [system_turn, question_turn]

  completion = client.chat.completions.create(model=model, messages=messages)
  reply_text = completion.choices[0].message.content
  messages.append({"role": "assistant", "content": reply_text})

  return reply_text, messages

In [None]:
#== few-shot examples
def fewshot(messages):
  """Append labelled examples from ``train_df`` as user/assistant turn pairs.

  For each selected row, the user turn is the stringified dict of the four
  description fields and the assistant turn is the row's ``y_true`` label.
  ``messages`` is extended in place and also returned.
  """
  # Row positions in the cleaned salary training set; the appendix at the end
  # of this section explains what each example demonstrates.
  # (Alternative index set kept for reference: [1046, 945].)
  few_shot_indices = [1030, 1041, 1025, 1022, 1011]
  desc_fields = ("job_title", "job_ad_details", "nation_short_desc", "salary_additional_text")

  for idx in few_shot_indices:
    example = train_df.iloc[idx]
    desc = {field: getattr(example, field) for field in desc_fields}

    messages.extend([
        {"role": "user", "content": str(desc)},                  # the description
        {"role": "assistant", "content": str(example.y_true)},   # the expected output
    ])

  return messages

In [None]:
#== step 2: setting sub-task --> ask for salary patterns
def step_2(messages):
  """Ask the model which phrases and numeric patterns signal salary details.

  The question and the model's answer are both appended to ``messages``
  (mutated in place).

  Returns:
    (reply_text, messages): the assistant's answer and the extended history.
  """
  pattern_question = "As a salary extractor, what are some common phrases or numeric patterns that indicate a salary range, currency and frequency in a job description?"
  messages.append({"role": "user", "content": pattern_question})

  completion = client.chat.completions.create(model=model, messages=messages)
  reply_text = completion.choices[0].message.content

  messages.append({"role": "assistant", "content": reply_text})
  return reply_text, messages

### IC

In [None]:
#== step 3: ask if salary info is in the description
def step_3(messages_static, desc_str):
  """Ask the model whether the job description contains salary information.

  Works on a deep copy of ``messages_static`` so the shared role-based prefix
  is never mutated and can be reused for the next job ad.

  Args:
    messages_static: conversation prefix (list of message dicts).
    desc_str: stringified job description to classify.

  Returns:
    (answer, messages): ``answer`` is the model's reply with punctuation
    removed, surrounding whitespace stripped and lower-cased (expected to be
    "yes" or "no"); ``messages`` is the per-ad history including the raw reply.
  """
  messages = copy.deepcopy(messages_static)

  messages.append({
      "role": "user",
      "content": f"{desc_str} does this job description include any salary-related information? Just respond with 'Yes' or 'No'."
  })

  response = client.chat.completions.create(
      model=model,
      messages=messages
  )
  p3_content = response.choices[0].message.content
  messages.append({"role": "assistant", "content": p3_content})

  # Remove punctuation BEFORE stripping whitespace: stripping first leaves a
  # stray space behind for replies such as "'No' ." and the answer would then
  # fail the exact "yes"/"no" comparison in the next step.
  without_punctuation = p3_content.translate(str.maketrans('', '', string.punctuation))
  response_p3 = without_punctuation.strip().lower()
  return response_p3, messages

In [None]:
#== step 4: iterative coaching/finding clues to prevent hallucination
def step_4(response_p3, messages):
  """Ask the model for the clue that supports its yes/no answer.

  Args:
    response_p3: normalised answer from the previous step ("yes" or "no").
    messages: per-ad conversation history; extended in place.

  Returns:
    (reply_text, messages): the model's clue reply and the extended history.

  Raises:
    Exception: if ``response_p3`` is neither "yes" nor "no".
  """
  # Guard first so nothing is appended or sent for an unusable answer.
  if response_p3 not in ("yes", "no"):
      raise Exception(f"Unexpected model output: {response_p3}")

  if response_p3 == "yes":
      instruction = 'Extract the salary range or salary-related phrases from the text verbatim. Respond in JSON: {"Clue": ""}.'
  else:
      instruction = "Briefly explain why there is no salary information (e.g., 'No mention of pay or compensation'). Respond in JSON: {\"Clue\": \"\"}."
  messages.append({"role": "user", "content": instruction})

  completion = client.chat.completions.create(model=model, messages=messages)
  reply_text = completion.choices[0].message.content
  messages.append({"role": "assistant", "content": reply_text})

  return reply_text, messages

In [None]:
#== step 5: use the clue to generate the final output
def step_5(messages):
  """Ask for the structured salary and turn the reply into a label string.

  Args:
    messages: per-ad conversation history; the final instruction is appended
      in place (the model's reply is not appended).

  Returns:
    (label, answer_str): ``label`` is "min-max-CURRENCY-FREQUENCY" built from
    the JSON reply, or "ERROR " followed by the raw reply when the reply
    cannot be turned into a label; ``answer_str`` is the raw reply.
  """
  messages.append({
      "role": "user",
      "content": (
          "Based on the extracted salary clue, return a structured salary in JSON. "
          "Use 'nation_short_desc' to determine the correct currency. "
          "Use this format: {\"MinSalary\": \"\", \"MaxSalary\": \"\", \"Currency\": \"\", \"Frequency\": \"\"}. "
          "If not provided explicitly, output 0 for \"MinSalary\" and \"MaxSalary\", and \"None\" for \"Currency\" and \"Frequency\". "
          "If the salary is mentioned, always output a range, where MinSalary and MaxSalary can be equal. "
          "Output the currency as 3 letters. Use adverb to output frequency (yearly, monthly, daily or hourly)."
      )
  })

  response = client.chat.completions.create(model=model, messages=messages)
  answer_str = response.choices[0].message.content

  # format the output
  try:
      answer = json.loads(answer_str)
      # str() before upper(): the model may answer with JSON null or a number
      # for Currency/Frequency, and neither has an .upper() method.
      label = "-".join([
          str(answer["MinSalary"]),
          str(answer["MaxSalary"]),
          str(answer["Currency"]).upper(),
          str(answer["Frequency"]).upper(),
      ])
  except (json.JSONDecodeError, KeyError, TypeError) as err:
      # Malformed JSON, a missing key, or a non-object reply: keep the raw text
      # behind an "ERROR " prefix so it can be recovered during post-processing
      # instead of aborting the whole run.
      print(f"Failed to parse model output as JSON: {err!r}")
      label = "ERROR " + str(answer_str)

  return label, answer_str

In [None]:
def RBIC_static_messages(verbose=False, add_fewshot=True):
  """Build the role-based conversation prefix shared by every job ad.

  Runs step 1 and step 2, then optionally appends the few-shot examples.

  Args:
    verbose: print each intermediate model reply when True.
    add_fewshot: append the few-shot examples when True.

  Returns:
    The list of message dicts forming the prefix.
  """
  role_reply, history = step_1()
  if verbose:
    print(f"RB step 1: {role_reply}\n")

  pattern_reply, history = step_2(history)
  if verbose:
    print(f"RB step 2: {pattern_reply}\n")

  if not add_fewshot:
    return history

  history = fewshot(history)
  if verbose:
    print("Fewshot examples added\n")
  return history

In [None]:
def RBIC(messages, desc_str, verbose=False):
  """Run the three iterative-coaching steps for one job description.

  Args:
    messages: shared conversation prefix, passed to step 3.
    desc_str: stringified job description.
    verbose: print every intermediate result when True.

  Returns:
    The label string produced by step 5.
  """
  has_salary, history = step_3(messages, desc_str)
  if verbose:
    print(f"IC step 1: {has_salary}\n")

  clue, history = step_4(has_salary, history)
  if verbose:
    print(f"IC step 2: {clue}\n")

  label, answer_str = step_5(history)
  if verbose:
    print(f"IC step 3 (Final): {label}\n")
    print(f"IC step 3 (Final Raw): {answer_str}\n")

  return label

## Appendix:

* 74199064 (1030): Description in USD, but the salary output is in THB

* 68984005 (1041): Description in dollars, but the output is in Singapore dollars (so the model must infer Singapore dollars from the nationality)

* 59878787 (1025): Salary range includes decimals and also includes casual loading

* 72778335 (1022): Mentions “up to around 35k” but no other range, so that figure ends up being the final answer

* 76246949 (1011): Requires only mild inference, but it is a 0-0-None-None case with no empty fields or “-” placeholders, so the negative label rests on inference alone

* 64869750 (1046): Generic ‘-’ case

* 78593545 (945): Generic ‘empty/NaN’ 0-0-None-None case



In [None]:
# Inspect a single development-set row (position 1131) with all of its fields.
train_df.iloc[1131]

Unnamed: 0,1131
job_id,51839829
job_title,Senior Key Account Manager
job_ad_details,"At RB, we’re driven by our fight to make acces..."
nation_short_desc,NZ
salary_additional_text,Competitive base + car allowance + bonus + MORE!
y_true,0-0-None-None


# Testing

### Qualitative Tests

In [None]:
# Build the shared role-based prefix once; step_3 deep-copies it for every job ad.
messages_static = RBIC_static_messages(verbose=True)

RB step 1: Sure! In a job listing, a salary range refers to the span between the minimum and maximum amount of compensation offered for a particular position. The currency specifies the unit in which the salary is denoted, such as USD (US Dollar), EUR (Euro), GBP (British Pound), etc. The frequency indicates how often the salary is paid, which could be yearly, monthly, weekly, or hourly.

RB step 2: Some common phrases and numeric patterns that indicate salary information in a job description include:

Salary Range:
- "salary range"
- "pay scale"
- "compensation"
- numeric ranges (e.g., $50,000 - $70,000)

Currency:
- currency symbols (e.g., $, €, £)
- country names (e.g., USD, EUR, GBP)
- words like "dollar," "euro," "pound"

Frequency:
- "per year"
- "annually"
- "per month"
- "monthly"
- "per hour"
- "hourly"

Fewshot examples added



In [None]:
# Qualitative check: run the pipeline on one development-set row and compare
# the prediction against its label.
ind = 345
sample_row = train_df.iloc[ind]

desc = {
    field: getattr(sample_row, field)
    for field in ("job_title", "job_ad_details", "nation_short_desc", "salary_additional_text")
}
desc_str = str(desc)

label = sample_row.y_true
label_str = str(label)

print(f"len of messages_static {len(messages_static)}")
label_pred = RBIC(messages_static, desc_str, verbose=True)

print(f"pred = {label_pred}")
print(f"truth = {label_str}")

len of messages_static 15
step3 messages: 17
IC step 1: yes

step4 messages: 19
IC step 2: {"Clue": "Salary: $27.36 - $27.65 per hour + 25% casual loading - Casual (up to 30 September 2023) - WHA2"}

step5 messages: 20
IC step 3 (Final): 27.36-27.65-USD-HOURLY

IC step 3 (Final Raw): {"MinSalary": 27.36, "MaxSalary": 27.65, "Currency": "USD", "Frequency": "Hourly"}

pred = 27.36-27.65-USD-HOURLY
truth = 70049-98351-AUD-ANNUAL


### Quantitative Tests

In [None]:
# Empty single-column frame that will receive one predicted label per test row.
test_pred_df = pd.DataFrame(columns=["y_pred"])

In [None]:
# Display the prediction frame before it is filled.
test_pred_df

Unnamed: 0,y_pred


In [None]:
# Run the RBIC pipeline on every test-set job ad (several API calls per row).
# Predictions are collected in a list and the frame is built once, so
# re-running this cell replaces the results instead of appending duplicates.
desc_fields = ("job_title", "job_ad_details", "nation_short_desc", "salary_additional_text")
predicted_labels = []

for i in range(len(test_df)):
  row = test_df.iloc[i]
  desc = {field: getattr(row, field) for field in desc_fields}
  desc_str = str(desc)

  label_pred = RBIC(messages_static, desc_str)
  predicted_labels.append(label_pred)

test_pred_df = pd.DataFrame({"y_pred": predicted_labels})

# export the dataframe to a new csv file
# (written under data/ because that is where the post-processing cell reads it from)
test_pred_df.to_csv('data/salary_labelled_test_set_gpt3-5_preds.csv', index=False)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score

In [None]:
# Reload the labelled sets and the saved raw predictions from disk so that
# post-processing does not depend on the API run above still being in memory.
test_df = pd.read_csv('data/salary_labelled_test_set_cleaned.csv')
test_pred_df = pd.read_csv('data/salary_labelled_test_set_gpt3-5_preds.csv')
train_df = pd.read_csv('data/salary_labelled_development_set_cleaned.csv')

In [None]:
# editing this manually to be 0-0-None-None for post-processing
# Assign through a single .iloc call: the chained form
# `test_pred_df.iloc[415].y_pred = ...` writes to a temporary row Series and
# is not guaranteed to change the DataFrame.
test_pred_df.iloc[415, test_pred_df.columns.get_loc("y_pred")] = "0-0-None-None"

In [None]:
def extract_salary_format(text):
    """
    Extracts a JSON salary block from text and formats it as 'min-max-currency-frequency'.

    The text is scanned from each '{' in turn and the first position that
    decodes to a JSON object is used. Surrounding prose, code fences, nested
    objects and stray braces before the real block are therefore tolerated.

    Args:
        text: raw model output, e.g. 'ERROR ```json {...} ```'.

    Returns:
        'MinSalary-MaxSalary-Currency-Frequency'; a missing key falls back to
        0 for the salaries and "None" for currency and frequency.

    Raises:
        Exception: with message "No JSON block found" when no JSON object can
        be decoded from the text.
    """
    if isinstance(text, str):
        decoder = json.JSONDecoder()
        start = text.find("{")
        while start != -1:
            try:
                salary_data, _end = decoder.raw_decode(text, start)
            except json.JSONDecodeError:
                salary_data = None  # not valid JSON from this brace; try the next one

            if isinstance(salary_data, dict):
                min_salary = salary_data.get("MinSalary", 0)
                max_salary = salary_data.get("MaxSalary", 0)
                currency = salary_data.get("Currency", "None")
                frequency = salary_data.get("Frequency", "None")
                return f"{min_salary}-{max_salary}-{currency}-{frequency}"

            start = text.find("{", start + 1)

    raise Exception("No JSON block found")

In [None]:
import math

def _normalize_prediction(pred):
  """Return one raw prediction in canonical 'min-max-CURRENCY-FREQUENCY' form."""
  # Rows stored as "ERROR <raw model output>" still carry the JSON; recover it first.
  if re.search(r"^ERROR", pred):
    pred = extract_salary_format(pred)

  fields = pred.split('-')
  minSalary = int(round(float(fields[0])))

  # A minimum that rounds to zero is treated as "no salary", whatever follows.
  if minSalary == 0:
    return "0-0-None-None"

  maxSalary = int(round(float(fields[1])))
  currency = fields[2].upper()
  frequency = fields[3].upper()

  if currency == "NONE":
    currency = "None"

  # The model is asked for "yearly"; the labels are compared against "ANNUAL".
  if frequency == "YEARLY":
    frequency = "ANNUAL"
  elif frequency == "NONE":
    frequency = "None"

  return f"{minSalary}-{maxSalary}-{currency}-{frequency}"


# Replace the whole column in one assignment. The chained form
# `test_pred_df.iloc[i].y_pred = ...` writes to a temporary row Series and is
# not guaranteed to update the DataFrame.
test_pred_df["y_pred"] = [_normalize_prediction(pred) for pred in test_pred_df["y_pred"]]

In [None]:
# Saved under data/ because the metrics section reads the file back from there.
test_pred_df.to_csv('data/salary__clean_gpt3-5_preds.csv', index=False)

# Metrics


In [2]:
# Load the cleaned predictions and the labelled test set that they are scored against.
preds = pd.read_csv('data/salary__clean_gpt3-5_preds.csv')
test_df = pd.read_csv('data/salary_labelled_test_set_cleaned.csv')

In [3]:
# Empty frames that will hold each label next to its four separate fields
# (min, max, currency, frequency): one for predictions, one for ground truth.
preds_formatted = pd.DataFrame(columns=["y_pred", "y_pred_min", "y_pred_max", "y_pred_currency", "y_pred_frequency"])
test_formatted = pd.DataFrame(columns=["y_true", "y_true_min", "y_true_max", "y_true_currency", "y_true_frequency"])

In [4]:
def salary_info(salary_string):
  """Split a 'min-max-currency-frequency' label into its lower-cased parts.

  Args:
    salary_string: label such as "1500-1800-MYR-MONTHLY".

  Returns:
    A list ``[label, min, max, currency, frequency]`` where every element is
    a lower-cased string. Fields beyond the fourth are ignored.
  """
  lowered = salary_string.lower()
  parts = lowered.split("-")
  # Explicit indexing keeps the IndexError for labels with fewer than four fields.
  return [lowered] + [parts[position] for position in range(4)]

In [5]:
# Split every predicted and true label into its fields, one row per test example.
# Each frame is built in a single call rather than grown row by row, so
# re-running this cell replaces the rows instead of appending duplicates.
assert len(preds) == len(test_df), "predictions and test set must have the same number of rows"

preds_formatted = pd.DataFrame(
    [salary_info(label) for label in preds['y_pred']],
    columns=preds_formatted.columns,
)
test_formatted = pd.DataFrame(
    [salary_info(label) for label in test_df['y_true']],
    columns=test_formatted.columns,
)

In [6]:
# Display the ground-truth labels split into fields.
test_formatted

Unnamed: 0,y_true,y_true_min,y_true_max,y_true_currency,y_true_frequency
0,1500-1800-myr-monthly,1500,1800,myr,monthly
1,60-60-hkd-hourly,60,60,hkd,hourly
2,0-0-none-none,0,0,none,none
3,0-0-none-none,0,0,none,none
4,0-0-none-none,0,0,none,none
...,...,...,...,...,...
562,26-26-nzd-hourly,26,26,nzd,hourly
563,0-0-none-none,0,0,none,none
564,0-0-none-none,0,0,none,none
565,1500-2500-myr-monthly,1500,2500,myr,monthly


In [7]:
# Display the predicted labels split into fields.
preds_formatted

Unnamed: 0,y_pred,y_pred_min,y_pred_max,y_pred_currency,y_pred_frequency
0,1500-1800-myr-monthly,1500,1800,myr,monthly
1,60-60-hkd-hourly,60,60,hkd,hourly
2,0-0-none-none,0,0,none,none
3,0-0-none-none,0,0,none,none
4,0-0-none-none,0,0,none,none
...,...,...,...,...,...
562,26-26-nzd-hourly,26,26,nzd,hourly
563,0-0-none-none,0,0,none,none
564,0-0-none-none,0,0,none,none
565,1500-2500-myr-monthly,1500,2500,myr,monthly


In [8]:
def _pct_equal(pred_col, true_col):
  """Percentage of rows where the predicted column equals the true column."""
  matches = preds_formatted[pred_col] == test_formatted[true_col]
  return matches.mean() * 100


# Exact-match accuracy for the full label and for each of its four fields.
acc_overall = _pct_equal('y_pred', 'y_true')
acc_min = _pct_equal('y_pred_min', 'y_true_min')
acc_max = _pct_equal('y_pred_max', 'y_true_max')
acc_curr = _pct_equal('y_pred_currency', 'y_true_currency')
acc_freq = _pct_equal('y_pred_frequency', 'y_true_frequency')

In [9]:
# One-row summary table: a column per metric, the row labelled 'Accuracy (%)'.
accuracy_by_field = {
    'Overall': acc_overall,
    'Min Salary': acc_min,
    'Max Salary': acc_max,
    'Currency': acc_curr,
    'Frequency': acc_freq,
}
pd.DataFrame([accuracy_by_field], index=['Accuracy (%)'])

Unnamed: 0,Overall,Min Salary,Max Salary,Currency,Frequency
Accuracy (%),85.008818,90.123457,89.770723,96.296296,95.767196
