# Loading

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import openai
from openai import OpenAI
import string
import re
from bs4 import BeautifulSoup
import json
import copy

In [2]:
client = OpenAI(api_key="", base_url="https://api.deepseek.com")
model="deepseek-reasoner" # DeepSeek-R1

# Salary

In [3]:
train_df = pd.read_csv('data/salary_labelled_development_set_cleaned.csv')
test_df = pd.read_csv('data/salary_labelled_test_set_cleaned.csv')

In [4]:
train_df.iloc[1131]

job_id                                                             51839829
job_title                                        Senior Key Account Manager
job_ad_details            At RB, we’re driven by our fight to make acces...
nation_short_desc                                                        NZ
salary_additional_text     Competitive base + car allowance + bonus + MORE!
y_true                                                        0-0-None-None
Name: 1131, dtype: object

# Regular prompting

In [13]:
# df to store model predictions
test_pred_df = pd.DataFrame(columns=["y_pred"])

In [14]:
messages_static = [
    {"role": "system", "content": "You are an expert job ad annotator. Your task is to extract structured salary information from job descriptions in the format: min-max-currency-frequency. If salary is not found, return: 0-0-None-None."},
]

In [15]:
for i in tqdm(range(len(test_df))):
    desc = {
        "job_title": test_df.iloc[i].job_title,
        "job_ad_details": test_df.iloc[i].job_ad_details,
        "nation_short_desc": test_df.iloc[i].nation_short_desc,
        "salary_additional_text": test_df.iloc[i].salary_additional_text,
    }
    desc_str = str(desc)

    messages = copy.deepcopy(messages_static)
    messages.append({
        "role": "user",
        "content": (
            f"{desc_str} Extract structured salary information from this job descriptions in the format: min-max-currency-frequency. "
            "Respond in JSON: {\"MinSalary\": \"\", \"MaxSalary\": \"\", \"Currency\": \"\", \"Frequency\": \"\"}. "
            "If not provided explicitly, output 0 for \"MinSalary\" and \"MaxSalary\", and \"None\" for \"Currency\" and \"Frequency\". "
            "If the salary is mentioned, always output a range, where MinSalary and MaxSalary can be equal. "
            "Use 'nation_short_desc' to determine the correct currency. "
            "Output the currency as 3 letters. Use adverb to output frequency (annual, monthly, daily or hourly)."
        )
    })
    
    response = client.chat.completions.create(
        model=model,
        messages=messages
    )
    answer_str = response.choices[0].message.content

    # format the output
    try:
        answer_str_ = answer_str[answer_str.find('{'):answer_str.find('}') + 1]
        answer_str_ = answer_str_.replace('“', '"')
        answer_str_ = answer_str_.replace('”', '"')
        answer = json.loads(answer_str_)

        label = f"{answer['MinSalary']}-{answer['MaxSalary']}-{answer['Currency'].upper()}-{answer['Frequency'].upper()}"

    except json.JSONDecodeError:
        print(f"Failed to parse model output as JSON: {answer_str}")
        label = "ERROR " + answer_str

    test_pred_df.loc[len(test_pred_df)] = label

# export the dataframe to a new csv file
test_pred_df.to_csv('salary_labelled_test_set_deepseek_r1_preds.csv', index=False)

100%|███████████████████████████████████████| 567/567 [3:10:25<00:00, 20.15s/it]


# Metrics

In [2]:
import json
import string
import pandas as pd

In [4]:
def simple_post_process(row):
    if 'ERROR' not in row:
        row = row.translate(str.maketrans('', '', '"'))
        try:
            mn, mx, cur, freq = row.split('-')
        except Exception:
            row_split = row.split('-')
            mn = row_split[0]
            mx = ''.join(c for c in mn if c.isdigit())
            mx = row_split[-3]
            mx = ''.join(c for c in mx if c.isdigit())
            cur = row_split[-2]
            freq = row_split[-1]
        # 'NONE' -> 'None'
        cur = 'None' if cur == 'NONE' else cur
        freq = 'None' if freq == 'NONE' else freq
        # cast min and max salary to int
        try:
            mn = int(round(float(mn)))
        except Exception:
            mn = 0
        try:
            mx = int(round(float(mx)))
        except Exception:
            mx = 0
        return str(mn), str(mx), cur, freq, f'{mn}-{mx}-{cur}-{freq}'
    else:
        row = row.strip()
        try:
            row = row[row.find('{'):-4]
            row_data = json.loads(row)
        except Exception:
            return 0, 0, 'None', 'None', '0-0-None-None'
        row_data = row_data['Full Time'] if 'Full Time' in row_data else (row_data['Full-timer'] if 'Full-timer' in row_data else row_data)
        try:
            mn = row_data['MinSalary']
            mx = row_data['MaxSalary']
            cur = row_data['Currency']
            freq = row_data['Frequency']
        except Exception:
            return 0, 0, 'None', 'None', '0-0-None-None'
        # 'NONE' -> 'None'
        cur = 'None' if cur == 'NONE' else cur
        freq = 'None' if freq == 'NONE' else freq
        # cast min and max salary to int
        mn = int(round(float(mn)))
        mx = int(round(float(mx)))
        return str(mn), str(mx), cur, freq, f'{mn}-{mx}-{cur}-{freq}'

def split_target(row):
    mn, mx, cur, freq = row.split('-')
    return mn, mx, cur, freq

def get_accuracy(path_to_preds):
    preds = pd.read_csv(path_to_preds)
    test_df = pd.read_csv('data/salary_labelled_test_set_cleaned.csv')
    
    test_df['y_pred'] = preds.values.reshape(-1)
    
    test_df['min_salary_pred'], test_df['max_salary_pred'], test_df['currency_pred'], test_df['freq_pred'], test_df['y_pred'] = zip(*test_df['y_pred'].map(simple_post_process))
    test_df['min_salary_true'], test_df['max_salary_true'], test_df['currency_true'], test_df['freq_true'] = zip(*test_df['y_true'].map(split_target))
    
    acc_overall = (test_df['y_pred'] == test_df['y_true']).mean() * 100
    acc_min = (test_df['min_salary_pred'] == test_df['min_salary_true']).mean() * 100
    acc_max = (test_df['max_salary_pred'] == test_df['max_salary_true']).mean() * 100
    acc_curr = (test_df['currency_pred'] == test_df['currency_true']).mean() * 100
    acc_freq = (test_df['freq_pred'] == test_df['freq_true']).mean() * 100
    
    res = pd.DataFrame(
        {
            'Overall': round(acc_overall, 2),
            'Min Salary': round(acc_min, 2),
            'Max Salary': round(acc_max, 2),
            'Currency': round(acc_curr, 2),
            'Frequency': round(acc_freq, 2),
        },
        index=['Accuracy (%)']
    )
    
    return res

In [5]:
preds = pd.read_csv('salary_labelled_test_set_deepseek_r1_preds.csv')
test_df = pd.read_csv('data/salary_labelled_test_set_cleaned.csv')
test_df['y_pred'] = preds.values.reshape(-1)
test_df['min_salary_pred'], test_df['max_salary_pred'], test_df['currency_pred'], test_df['freq_pred'], test_df['y_pred'] = zip(*test_df['y_pred'].map(simple_post_process))
test_df['min_salary_true'], test_df['max_salary_true'], test_df['currency_true'], test_df['freq_true'] = zip(*test_df['y_true'].map(split_target))

acc_overall = (test_df['y_pred'] == test_df['y_true']).mean() * 100
acc_min = (test_df['min_salary_pred'] == test_df['min_salary_true']).mean() * 100
acc_max = (test_df['max_salary_pred'] == test_df['max_salary_true']).mean() * 100
acc_curr = (test_df['currency_pred'] == test_df['currency_true']).mean() * 100
acc_freq = (test_df['freq_pred'] == test_df['freq_true']).mean() * 100

preds = pd.read_csv('salary_labelled_test_set_deepseek_preds.csv')
test_df = pd.read_csv('data/salary_labelled_test_set_cleaned.csv')
test_df['y_pred'] = preds.values.reshape(-1)
test_df['min_salary_pred'], test_df['max_salary_pred'], test_df['currency_pred'], test_df['freq_pred'], test_df['y_pred'] = zip(*test_df['y_pred'].map(simple_post_process))
test_df['min_salary_true'], test_df['max_salary_true'], test_df['currency_true'], test_df['freq_true'] = zip(*test_df['y_true'].map(split_target))

acc_overall_v3 = (test_df['y_pred'] == test_df['y_true']).mean() * 100
acc_min_v3 = (test_df['min_salary_pred'] == test_df['min_salary_true']).mean() * 100
acc_max_v3 = (test_df['max_salary_pred'] == test_df['max_salary_true']).mean() * 100
acc_curr_v3 = (test_df['currency_pred'] == test_df['currency_true']).mean() * 100
acc_freq_v3 = (test_df['freq_pred'] == test_df['freq_true']).mean() * 100


pd.DataFrame(
    {
        'Overall': [round(acc_overall_v3, 2), round(acc_overall, 2)],
        'Min Salary': [round(acc_min_v3, 2), round(acc_min, 2)],
        'Max Salary': [round(acc_max_v3, 2), round(acc_max, 2)],
        'Currency': [round(acc_curr_v3, 2), round(acc_curr, 2)],
        'Frequency': [round(acc_freq_v3, 2), round(acc_freq, 2)],
        'Time (seconds)': [2917, 11425]
    },
    index=['DeepSeek-V3', 'DeepSeek-R1']
).style.set_caption('Accuracy (%), Salary, Regular prompting')

Unnamed: 0,Overall,Min Salary,Max Salary,Currency,Frequency,Time (seconds)
DeepSeek-V3,87.3,91.36,90.48,97.18,96.65,2917
DeepSeek-R1,86.77,91.01,91.18,98.59,94.36,11425
