In [1]:

from transformers import pipeline
from transformers import BertTokenizerFast, BertForQuestionAnswering, TrainingArguments, Trainer
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
from datasets import load_dataset, Dataset
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import sys
import json
import evaluate
import torch
from sklearn.metrics import f1_score
sys.path.append("../CODE-Baseline")  
import warnings
warnings.filterwarnings('ignore')
from salary_baseline import infer_period_by_amount




Development dataset:
Precision: 0.8357
Recall: 0.9524
F1 Score: 0.8902
Accuracy: 0.8889

Test dataset:
Precision: 0.7412
Recall: 0.9206
F1 Score: 0.8212
Accuracy: 0.8219


In [2]:
import re
country_currency_map = {
    "PH": "PHP", "AUS": "AUD", "NZ": "NZD", "SG": "SGD",
    "MY": "MYR", "TH": "THB", "ID": "IDR", "HK": "HKD"
}
salary_keywords=['待遇', 'salary', 'wage', 'compensation', 'remuneration', 'gaji', 'bermula', 'basic', 'pokok',
                      'income']
def convert_k_to_number(text):
    # 匹配形如 20k、16.5k、30K 的数字
    def replace(match):
        num = float(match.group(1))
        return str(int(num * 1000))

    return re.sub(r'(\d+(?:\.\d+)?)k', replace, text, flags=re.IGNORECASE)
# html -> text
def clean_html_tags(html_text):
    soup = BeautifulSoup(html_text, 'html.parser')
    for tag in soup(["script", "style"]):
        tag.decompose()
    text = soup.get_text(separator="\n", strip=True)
    
    # formate text
    text = re.sub(r"[•●▪►◆★♦✓✔⬤❖]", "", text)
    text = text.replace(",", "")
    for _, value in country_currency_map.items():
      text = text.replace(value, "$")
    text = text.replace("RM", "$")
    text = text.replace("฿", "$")
    text = text.replace("AU", "$")
    text = text.replace("$$", "$")
    text = text.replace("  ", " ")
    text = re.sub(r'\b[Tt][Oo]\b', '-', text)
    text = text.replace("and", "-")
    text = text.replace("&", "-")
    text = text.replace("至", "-")
    text = text.replace("hingga ke", "-")
    text = text.replace("hingga", "-")
    text = text.replace("Hingga", "-")
    text = text.replace("HINGGA", "-")
    text = convert_k_to_number(text)
    # 构建正则表达式，\b表示单词边界，|表示“或”
    pattern = r'\b(?:' + '|'.join(re.escape(word) for word in salary_keywords) + r')\b'

    # 替换为 compensation，flags=re.IGNORECASE 表示不区分大小写
    text = re.sub(pattern, 'compensation', text, flags=re.IGNORECASE)
    return text

In [3]:
'''
{
  "context": "Financial Account - Call Center Agent - Up - 34k\n...（省略）...",
  "question": "what is fixed compensation",
  "answers": {
    "text": ["17500"],
    "answer_start": [1257]
  },
  "is_impossible": false
}
'''
# from labelled json -> Squad2 json 
with open('../DATASETS/1900_labelled.json', "r") as f:
    raw_data = json.load(f)
    
processed = []
for item in raw_data:
    context = item["raw_text"]

    # fixed salary
    if "answer_1" in item and item["answer_1"]:
        processed.append({
            "context": context,
            "question": "what is fixed compensation?",
            "answers": {
                "text": [item["answer_1"][0]["text"]],
                "answer_start": [item["answer_1"][0]["start"]]
            },
            "is_impossible": False
        })
    else:
        processed.append({
            "context": context,
            "question": "what is fixed compensation?",
            "answers": {
                "text": [],
                "answer_start": []
            },
            "is_impossible": True
        })

    # salary range
    if "answer_2" in item and item["answer_2"]:
        processed.append({
            "context": context,
            "question": "what is compensation range?",
            "answers": {
                "text": [item["answer_2"][0]["text"]],
                "answer_start": [item["answer_2"][0]["start"]]
            },
            "is_impossible": False
        })
    else:
        processed.append({
            "context": context,
            "question": "what is compensation range?",
            "answers": {
                "text": [],
                "answer_start": []
            },
            "is_impossible": True
        })

    # pay period
    if "answer_3" in item and item["answer_3"]:
        processed.append({
            "context": context,
            "question": "what is pay period?",
            "answers": {
                "text": [item["answer_3"][0]["text"]],
                "answer_start": [item["answer_3"][0]["start"]]
            },
            "is_impossible": False
        })
    else:
        processed.append({
            "context": context,
            "question": "what is pay period?",
            "answers": {
                "text": [],
                "answer_start": []
            },
            "is_impossible": True
        })


with open("qa_dataset_squad2-1900.json", "w", encoding="utf-8") as f:
    json.dump(processed, f, indent=2, ensure_ascii=False)

In [None]:
model_name = "deepset/roberta-base-squad2"

model = AutoModelForQuestionAnswering.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

with open("qa_dataset_squad2-1900.json", "r", encoding="utf-8") as f:
    squad_data = json.load(f)

# Step 2: 转换为 HuggingFace Dataset
dataset = Dataset.from_list(squad_data)
dataset = dataset.train_test_split(test_size=0.1,seed=38)  

# Step 3: Tokenizer 处理函数（包含 offset 映射）
def preprocess(example):
    tokenized = tokenizer(
        example["question"],
        example["context"],
        truncation="only_second",
        max_length=512,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length"
    )

    sample_mapping = tokenized.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized.pop("offset_mapping")

    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        sequence_ids = tokenized.sequence_ids(i)
        sample_index = sample_mapping[i]
        answers = example["answers"][sample_index]
        if example["is_impossible"][sample_index]:
            start_positions.append(cls_index)
            end_positions.append(cls_index)
        else:
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            # 找到 token 的 start & end
            token_start_index = 0
            while sequence_ids[token_start_index] != 1:
                token_start_index += 1
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != 1:
                token_end_index -= 1

            # 找到实际的 token span
            start_idx = cls_index
            end_idx = cls_index
            for idx in range(token_start_index, token_end_index + 1):
                if offsets[idx] is None:
                    continue
                start, end = offsets[idx]
                if start <= start_char < end:
                    start_idx = idx
                if start < end_char <= end:
                    end_idx = idx
                    break

            start_positions.append(start_idx)
            end_positions.append(end_idx)

    tokenized["start_positions"] = start_positions
    tokenized["end_positions"] = end_positions
    return tokenized

# Step 4: 预处理数据
tokenized_datasets = dataset.map(preprocess, batched=True, remove_columns=dataset["train"].column_names)

def compute_metrics(p):
    predictions, labels = p
    
    # 将预测和标签转换为 Tensor
    start_preds = torch.tensor(predictions[0])
    end_preds = torch.tensor(predictions[1])
    start_labels = torch.tensor(labels[0])
    end_labels = torch.tensor(labels[1])
    
    # 获取 start 和 end 的预测位置
    start_pred = torch.argmax(start_preds, dim=1)
    end_pred = torch.argmax(end_preds, dim=1)
    
    # 计算损失
    start_loss = torch.nn.CrossEntropyLoss()(start_preds, start_labels)
    end_loss = torch.nn.CrossEntropyLoss()(end_preds, end_labels)
    total_loss = start_loss + end_loss
    
    # 计算 exact match
    exact_match = (start_pred == start_labels).float().mean() * (end_pred == end_labels).float().mean()

    # F1 score
    # 将 start_pred, start_labels, end_pred, end_labels 转换为 numpy 数组
    start_pred_np = start_pred.cpu().numpy()
    start_labels_np = start_labels.cpu().numpy()
    end_pred_np = end_pred.cpu().numpy()
    end_labels_np = end_labels.cpu().numpy()

    # 计算 F1 score
    f1_start = f1_score(start_labels_np, start_pred_np, average="micro")
    f1_end = f1_score(end_labels_np, end_pred_np, average="micro")

    # 计算总的 F1 score（可以根据需求自己决定如何结合 start 和 end 的 F1 分数）
    f1 = (f1_start + f1_end) / 2

    return {
        "start_loss": start_loss.item(),
        "end_loss": end_loss.item(),
        "total_loss": total_loss.item(),
        "exact_match": exact_match.item(),
        "f1": f1,
    }
  
# Step 5: 训练参数
training_args = TrainingArguments(
    output_dir="./roberta-qa",
    eval_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=8,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir="./logs",
    load_best_model_at_end=True,  
    metric_for_best_model="f1",  
    greater_is_better=True,  
)

# Step 6: Trainer 初始化
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics  
)

# Step 7: 开始训练
trainer.train()

In [None]:
trainer.save_model("./roberta-qa-1900")
tokenizer.save_pretrained("./roberta-qa-1900")

In [4]:
import re

def get_fixed_salary(text):
    text = re.sub(r'[^\d\-.]', '', text)
    pattern = r'(\d+(?:\.\d+)?)\s*'  # 匹配数字或小数
    match = re.search(pattern, text)
    if match:
        salary = round(float(match.group(1)))
        return salary
    return None  
  
def get_salary_range(text):
    text = text.replace("\n", "-")
    text = re.sub(r'[^\d\-.]', '', text)
    pattern = r'(\d+(?:\.\d+)?)(?:[-–—]+)(\d+(?:\.\d+)?)?'
    match = re.search(pattern, text)
    if match:
        min_salary = match.group(1)
        if match.group(2):
            is_range = True
        else:
          return None
        max_salary = match.group(2) if match.group(2) else match.group(1)

        min_salary = round(float(min_salary))
        max_salary = round(float(max_salary))

        if min_salary > max_salary:
            return None
        return (min_salary,max_salary)
    return None 

import re

def get_period(text):
    unit_patterns = {
        "HOURLY": r'(per\s*hour|hourly|hr\b|/hr\b|/hour\b|時薪|每小時|每小時薪資|每節)',
        "DAILY": r'(per\s*day|daily|/day\b|日薪|每天|每日薪資)',
        "WEEKLY": r'(per\s*week|weekly|/week\b|週薪|每週|每周薪資|周薪)',
        "MONTHLY": r'(per\s*month|monthly|/month\b|/Mth\b|月薪|每月|每月薪資|sebulan|bulanan)',
        "ANNUAL": r'(per\s*year|yearly|annually|remuneration|super|annum|p\.a\.|p/a|/year\b|年薪|每年|年度薪資)'
    }

    text = text.lower()
    for period, pattern in unit_patterns.items():
        if re.search(pattern, text, re.IGNORECASE):
            return period
    return None


In [11]:
# 加载你 fine-tuned 的模型和 tokenizer
model_path = "./roberta-qa-1900-seed48"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForQuestionAnswering.from_pretrained(model_path)

# 构造 pipeline
qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)

questions = [
    "What is fixed compensation?",
    "What is compensation range?",
    "What is pay period?"
]
file_path = '../DATASETS/salary_labelled_development_set.csv'
test_file_path = '../DATASETS/salary_labelled_test_set.csv'

country_currency_map = {
    "PH": "PHP", "AUS": "AUD", "NZ": "NZD", "SG": "SGD",
    "MY": "MYR", "TH": "THB", "ID": "IDR", "HK": "HKD"
}

def get_salary_using_FT_RoBerta(text, nation_code=None):
    context = clean_html_tags(text)
    currency = country_currency_map.get(nation_code, "None")
    questions = [
        "What is fixed compensation?",
        "What is compensation range?",
        "What is pay period?"
    ]

    results = []
    for q in questions:
        output = qa_pipeline(question=q, context=context)
        answer = output['answer']
        score = output['score']
        results.append((q, answer, score))


    fixed_salary = get_fixed_salary(results[0][1])
    salary_range = get_salary_range(results[1][1])
    if salary_range:
        min_salary, max_salary = salary_range
    else:
        min_salary, max_salary = None, None
    period = get_period(results[2][1])
    # period = None
    if period is None and (salary_range or fixed_salary) :
      if fixed_salary:
        if salary_range:
          if min_salary < fixed_salary < max_salary:
            avg = fixed_salary
          else:
            avg = (int(min_salary) + int(max_salary)) / 2
        else:
          avg = fixed_salary
      else:
        if salary_range:
          avg = (int(min_salary) + int(max_salary)) / 2
      period = infer_period_by_amount(avg, currency)



    if fixed_salary:
      if salary_range:
        if min_salary < fixed_salary < max_salary:
          return f"{fixed_salary}-{fixed_salary}-{currency}-{period}"
        return f"{min_salary}-{max_salary}-{currency}-{period}"
      else:
        return f"{fixed_salary}-{fixed_salary}-{currency}-{period}"
    else:
      if salary_range:
        return f"{min_salary}-{max_salary}-{currency}-{period}"
      else:
        return "0-0-None-None"
    return "0-0-None-None"
    

Device set to use mps:0


In [6]:
def fuzzy_equal(predict, y_true):
    pattern = r"(\d+)-(\d+)-([A-Z]+)-([A-Z]+)"

    if predict=="0-0-None-None" and y_true=="0-0-None-None":
      return True
    match_pred = re.match(pattern, predict)
    match_true = re.match(pattern, y_true)

    if not match_pred or not match_true:
      return False  

    min_pred, max_pred, currency_pred, period_pred = match_pred.groups()
    min_true, max_true, currency_true, period_true = match_true.groups()

    # 转为整数
    min_pred, max_pred = int(min_pred), int(max_pred)
    min_true, max_true = int(min_true), int(max_true)

    # 允许1的误差
    min_ok = abs(min_pred - min_true) <= 1
    max_ok = abs(max_pred - max_true) <= 1

    # 货币和周期必须严格相等
    currency_ok = currency_pred == currency_true
    period_ok = period_pred == period_true

    return min_ok and max_ok and currency_ok and period_ok

In [7]:

df = pd.read_csv(file_path)

    
    
df['predicted_salary'] = df.apply(
    lambda row: get_salary_using_FT_RoBerta(
        f"{row['job_title']} {row['job_ad_details']}",
        row['nation_short_desc']
    ),
    axis=1
)

# example = df.iloc[2]
# predicted_salary = get_salary_using_FT_RoBerta(example['job_ad_details'],example['nation_short_desc'])


# TP, FP, TN, FN
TP = np.sum((df['predicted_salary'] == df['y_true']) & (df['y_true'] != "0-0-None-None"))
FP = np.sum((df['predicted_salary'] != df['y_true']) & (df['predicted_salary'] != "0-0-None-None"))
FN = np.sum((df['predicted_salary'] == "0-0-None-None") & (df['y_true'] != "0-0-None-None"))
TN = np.sum((df['predicted_salary'] == "0-0-None-None") & (df['y_true'] == "0-0-None-None"))

precision = TP / (TP + FP) if (TP + FP) != 0 else 0
recall = TP / (TP + FN) if (TP + FN) != 0 else 0
f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0
accuracy = (TP + TN) / (FP + FN + TP + TN)

# # Print prediction vs ground truth
# print("\n🔍 Prediction vs Ground Truth:\n")
# for i, row in df.iterrows():
#     predicted = row['predicted_salary']
#     expected = row['y_true']
#     if predicted != expected:
#         print(f"[{i}] ❌ Predicted: {predicted} | Expected: {expected}")
#     else:
#         print(f"[{i}] ✅ Matched:   {predicted}")

print("Development dataset:")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print()

'''
roberta-qa-1145 seed1145
Development dataset:
Precision: 0.7799
Recall: 0.9545
F1 Score: 0.8584
Accuracy: 0.8474

roberta-qa-1463 seed 1463
Development dataset:
Precision: 0.7284
Recall: 0.9797
F1 Score: 0.8356
Accuracy: 0.8156

roberta-qa-1463 seed 1611
Development dataset:
Precision: 0.8161
Recall: 0.9586
F1 Score: 0.8816
Accuracy: 0.8738

roberta-qa-1900 seed 2000 
Development dataset:
Precision: 0.7982
Recall: 0.9696
F1 Score: 0.8756
Accuracy: 0.8641

roberta-qa-1900 seed 48
Development dataset:
Precision: 0.8266
Recall: 0.9784
F1 Score: 0.8961
Accuracy: 0.8888
'''

# the id whose y_true is wrong
ignore_id_path = '../DATASETS/err_salary_develpment.csv'
df_ignore = pd.read_csv(ignore_id_path)
# some y_true is impossible
df = df[~df['job_id'].isin(df_ignore['job_id'])]
df['is_positive'] = df['predicted_salary'] != "0-0-None-None"
# fuzzy 匹配
df['is_fuzzy_match'] = df.apply(lambda row: fuzzy_equal(row['predicted_salary'], row['y_true']), axis=1)

# 计算 TP / FP / FN / TN
TP = np.sum(df['is_fuzzy_match'] & df['is_positive'])
FP = np.sum(~df['is_fuzzy_match'] & df['is_positive'])
FN = np.sum(~df['is_fuzzy_match'] & ~df['is_positive'])
TN = np.sum(df['is_fuzzy_match'] & ~df['is_positive'])


precision = TP / (TP + FP) if (TP + FP) != 0 else 0
recall = TP / (TP + FN) if (TP + FN) != 0 else 0
f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0
accuracy = (TP + TN) / (FP + FN + TP + TN)

print("Development dataset:")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Accuracy: {accuracy:.4f}")


Development dataset:
Precision: 0.8266
Recall: 0.9784
F1 Score: 0.8961
Accuracy: 0.8888

Development dataset:
Precision: 0.8899
Recall: 0.9944
F1 Score: 0.9393
Accuracy: 0.9353


In [10]:
# test set
df = pd.read_csv(test_file_path)

# Apply extractor
df['predicted_salary'] = df.apply(
    lambda row: get_salary_using_FT_RoBerta(
        f"{row['job_title']} {row['job_ad_details']}",
        row['nation_short_desc']
    ),
    axis=1
)


TP = np.sum((df['predicted_salary'] == df['y_true']) & (df['y_true'] != "0-0-None-None"))
FP = np.sum((df['predicted_salary'] != df['y_true']) & (df['predicted_salary'] != "0-0-None-None"))
FN = np.sum((df['predicted_salary'] == "0-0-None-None") & (df['y_true'] != "0-0-None-None"))
TN = np.sum((df['predicted_salary'] == "0-0-None-None") & (df['y_true'] == "0-0-None-None"))


precision = TP / (TP + FP) if (TP + FP) != 0 else 0
recall = TP / (TP + FN) if (TP + FN) != 0 else 0
f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0
accuracy = (TP + TN) / (FP + FN + TP + TN)

print("Test dataset:")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Accuracy: {accuracy:.4f}")
'''
roberta-qa-1145 seed 1145
Test dataset:
Precision: 0.7599
Recall: 0.9294
F1 Score: 0.8361
Accuracy: 0.8272

roberta-qa-1463 seed 1145
Test dataset:
Precision: 0.7265
Recall: 0.9623
F1 Score: 0.8279
Accuracy: 0.8131

roberta-qa-1463 seed 1611
Test dataset:
Precision: 0.7636
Recall: 0.9582
F1 Score: 0.8499
Accuracy: 0.8430

roberta-qa-1900 seed 2000 
Test dataset:
Precision: 0.7529
Recall: 0.9774
F1 Score: 0.8506
Accuracy: 0.8395

roberta-qa-1900 seed 48
Test dataset:
Precision: 0.7879
Recall: 0.9665
F1 Score: 0.8681
Accuracy: 0.8607
'''


Test dataset:
Precision: 0.7364
Recall: 0.9643
F1 Score: 0.8351
Accuracy: 0.8307


'\nroberta-qa-1145 seed 1145\nTest dataset:\nPrecision: 0.7599\nRecall: 0.9294\nF1 Score: 0.8361\nAccuracy: 0.8272\n\nroberta-qa-1463 seed 1145\nTest dataset:\nPrecision: 0.7265\nRecall: 0.9623\nF1 Score: 0.8279\nAccuracy: 0.8131\n\nroberta-qa-1463 seed 1611\nTest dataset:\nPrecision: 0.7636\nRecall: 0.9582\nF1 Score: 0.8499\nAccuracy: 0.8430\n\nroberta-qa-1900 seed 2000 \nTest dataset:\nPrecision: 0.7529\nRecall: 0.9774\nF1 Score: 0.8506\nAccuracy: 0.8395\n\nroberta-qa-1900 seed 48\nTest dataset:\nPrecision: 0.7879\nRecall: 0.9665\nF1 Score: 0.8681\nAccuracy: 0.8607\n'

In [None]:
# # Print prediction vs ground truth
# print("\n🔍 Prediction vs Ground Truth:\n")
# for i, row in df.iterrows():
#     predicted = row['predicted_salary']
#     expected = row['y_true']
#     if predicted != expected:
#         print(f"[{i}] ❌ Predicted: {predicted} | Expected: {expected}")
#     else:
#         print(f"[{i}] ✅ Matched:   {predicted}")

In [None]:
df['is_positive'] = df['predicted_salary'] != "0-0-None-None"

# fuzzy 匹配
df['is_fuzzy_match'] = df.apply(lambda row: fuzzy_equal(row['predicted_salary'], row['y_true']), axis=1)

# 计算 TP / FP / FN / TN
TP = np.sum(df['is_fuzzy_match'] & df['is_positive'])
FP = np.sum(~df['is_fuzzy_match'] & df['is_positive'])
FN = np.sum(~df['is_fuzzy_match'] & ~df['is_positive'])
TN = np.sum(df['is_fuzzy_match'] & ~df['is_positive'])


precision = TP / (TP + FP) if (TP + FP) != 0 else 0
recall = TP / (TP + FN) if (TP + FN) != 0 else 0
f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0
accuracy = (TP + TN) / (FP + FN + TP + TN)

print("Test dataset:")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Accuracy: {accuracy:.4f}")