In [None]:

from transformers import pipeline
from transformers import BertTokenizerFast, BertForQuestionAnswering, TrainingArguments, Trainer
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import sys
sys.path.append("../CODE-Baseline")  
import warnings
warnings.filterwarnings('ignore')


from salary_baseline import extract_salary_with_inference


file_path = '../../MISC/salary_labelled_development_set.csv'
test_file_path = '../../MISC/salary_labelled_test_set.csv'
df = pd.read_csv(file_path)

model_name = "deepset/roberta-base-squad2"

model = AutoModelForQuestionAnswering.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
qa_pipeline = pipeline(
    "question-answering",
    model=model,
    tokenizer=tokenizer
)

Development dataset:
Precision: 0.7811
Recall: 0.9370
F1 Score: 0.8519
Accuracy: 0.8489

Test dataset:
Precision: 0.7412
Recall: 0.9206
F1 Score: 0.8212
Accuracy: 0.8219


Device set to use mps:0


In [2]:
import re
country_currency_map = {
    "PH": "PHP", "AUS": "AUD", "NZ": "NZD", "SG": "SGD",
    "MY": "MYR", "TH": "THB", "ID": "IDR", "HK": "HKD"
}
salary_keywords=['待遇', 'salary', 'wage', 'compensation', 'remuneration', 'gaji', 'bermula', 'basic', 'pokok',
                      'income']
def convert_k_to_number(text):
    # 匹配形如 20k、16.5k、30K 的数字
    def replace(match):
        num = float(match.group(1))
        return str(int(num * 1000))

    return re.sub(r'(\d+(?:\.\d+)?)k', replace, text, flags=re.IGNORECASE)
# html -> text
def clean_html_tags(html_text):
    soup = BeautifulSoup(html_text, 'html.parser')
    for tag in soup(["script", "style"]):
        tag.decompose()
    text = soup.get_text(separator="\n", strip=True)
    
    # formate text
    text = re.sub(r"[•●▪►◆★♦✓✔⬤❖]", "", text)
    text = text.replace(",", "")
    for _, value in country_currency_map.items():
      text = text.replace(value, "$")
    text = text.replace("RM", "$")
    text = text.replace("฿", "$")
    text = text.replace("AU", "$")
    text = text.replace("$$", "$")
    text = text.replace("  ", " ")
    text = re.sub(r'\b[Tt][Oo]\b', '-', text)
    text = text.replace("and", "-")
    text = text.replace("至", "-")
    text = text.replace("hingga ke", "-")
    text = text.replace("hingga", "-")
    text = text.replace("Hingga", "-")
    text = text.replace("HINGGA", "-")
    text = convert_k_to_number(text)
    # 构建正则表达式，\b表示单词边界，|表示“或”
    pattern = r'\b(?:' + '|'.join(re.escape(word) for word in salary_keywords) + r')\b'

    # 替换为 compensation，flags=re.IGNORECASE 表示不区分大小写
    text = re.sub(pattern, 'compensation', text, flags=re.IGNORECASE)
    return text
  

def get_period(text):
    unit_patterns = {
        "HOURLY": r'(per\s*hour|hourly|hr\b|/hr\b|/hour\b|時薪|每小時|每小時薪資|每節)',
        "DAILY": r'(per\s*day|daily|/day\b|日薪|每天|每日薪資)',
        "WEEKLY": r'(per\s*week|weekly|/week\b|週薪|每週|每周薪資|周薪)',
        "MONTHLY": r'(per\s*month|monthly|/month\b|/Mth\b|月薪|每月|每月薪資|sebulan)',
        "ANNUAL": r'(per\s*year|yearly|annually|remuneration|super|annum|p\.a\.|p/a|/year\b|年薪|每年|年度薪資)'
    }

    text = text.lower()
    for period, pattern in unit_patterns.items():
        if re.search(pattern, text, re.IGNORECASE):
            return period
    return None

In [4]:
salary_question = "what is the salary?"
pay_freq_question = "Is the salary paid monthly, hourly, yearly, weekly, or daily?"

def get_salary_using_nFT_RoBerta(text,nation_code):
  context=clean_html_tags(text)
  salary_result = qa_pipeline({"context": context, "question": salary_question})
  t_result=extract_salary_with_inference(salary_result["answer"],nation_code)
  # print(t_result)
  if t_result!='0-0-None-None':
    pay_freq_result = qa_pipeline({"context": context, "question": pay_freq_question})
    period = get_period(pay_freq_result["answer"])
    if period is not None:
        pattern = r"(\d+)-(\d+)-([A-Z]+)-([A-Z]+)"
        match = re.match(pattern, t_result)
        if not match:
            return '0-0-None-None'
          
        min_salary, max_salary, currency, _ = match.groups()
        return f"{min_salary}-{max_salary}-{currency}-{period}"
    
    
  return t_result

    
    
    
df['predicted_salary'] = df.apply(
    lambda row: get_salary_using_nFT_RoBerta(
        f"{row['job_title']} {row['job_ad_details']}",
        row['nation_short_desc']
    ),
    axis=1
)
    


In [5]:
# TP, FP, TN, FN
TP = np.sum((df['predicted_salary'] == df['y_true']) & (df['y_true'] != "0-0-None-None"))
FP = np.sum((df['predicted_salary'] != df['y_true']) & (df['predicted_salary'] != "0-0-None-None"))
FN = np.sum((df['predicted_salary'] == "0-0-None-None") & (df['y_true'] != "0-0-None-None"))
TN = np.sum((df['predicted_salary'] == "0-0-None-None") & (df['y_true'] == "0-0-None-None"))

precision = TP / (TP + FP) if (TP + FP) != 0 else 0
recall = TP / (TP + FN) if (TP + FN) != 0 else 0
f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0
accuracy = (TP + TN) / (FP + FN + TP + TN)

# Print prediction vs ground truth
# print("\n🔍 Prediction vs Ground Truth:\n")
# for i, row in df.iterrows():
#     predicted = row['predicted_salary']
#     expected = row['y_true']
#     if predicted != expected:
#         print(f"[{i}] ❌ Predicted: {predicted} | Expected: {expected}")
#         print(f"{row['job_id']} {row['job_title']} {row['job_ad_details']}")
#         print()
    # else:
    #     print(f"[{i}] ✅ Matched:   {predicted}")

print("Development dataset:")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print()

'''
Development dataset:
Precision: 0.7220
Recall: 0.7746
F1 Score: 0.7474
Accuracy: 0.7653
'''

Development dataset:
Precision: 0.6990
Recall: 0.7819
F1 Score: 0.7382
Accuracy: 0.7587



'\nDevelopment dataset:\nPrecision: 0.7220\nRecall: 0.7746\nF1 Score: 0.7474\nAccuracy: 0.7653\n'

In [6]:
# test set
df = pd.read_csv(test_file_path)
df['predicted_salary'] = df.apply(
    lambda row: get_salary_using_nFT_RoBerta(
        f"{row['job_title']} {row['job_ad_details']}",
        row['nation_short_desc']
    ),
    axis=1
)

TP = np.sum((df['predicted_salary'] == df['y_true']) & (df['y_true'] != "0-0-None-None"))
FP = np.sum((df['predicted_salary'] != df['y_true']) & (df['predicted_salary'] != "0-0-None-None"))
FN = np.sum((df['predicted_salary'] == "0-0-None-None") & (df['y_true'] != "0-0-None-None"))
TN = np.sum((df['predicted_salary'] == "0-0-None-None") & (df['y_true'] == "0-0-None-None"))


precision = TP / (TP + FP) if (TP + FP) != 0 else 0
recall = TP / (TP + FN) if (TP + FN) != 0 else 0
f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0
accuracy = (TP + TN) / (FP + FN + TP + TN)

print("Test dataset:")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Accuracy: {accuracy:.4f}")

'''
Test dataset:
Precision: 0.6903
Recall: 0.7520
F1 Score: 0.7198
Accuracy: 0.7460
'''

Test dataset:
Precision: 0.6728
Recall: 0.7625
F1 Score: 0.7148
Accuracy: 0.7425


'\nTest dataset:\nPrecision: 0.6903\nRecall: 0.7520\nF1 Score: 0.7198\nAccuracy: 0.7460\n'