In [1]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import re
import pandas as pd
from datasets import Dataset
from transformers import BertTokenizerFast, BertForQuestionAnswering, TrainingArguments, Trainer

df = pd.read_csv('../DATASETS/salary_labelled_development_set.csv', encoding='utf-8')
print("data shape：", df.shape)
print(df.info())
display(df.head())

data shape： (2267, 6)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2267 entries, 0 to 2266
Data columns (total 6 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   job_id                  2267 non-null   int64 
 1   job_title               2267 non-null   object
 2   job_ad_details          2267 non-null   object
 3   nation_short_desc       2267 non-null   object
 4   salary_additional_text  973 non-null    object
 5   y_true                  2267 non-null   object
dtypes: int64(1), object(5)
memory usage: 106.4+ KB
None


Unnamed: 0,job_id,job_title,job_ad_details,nation_short_desc,salary_additional_text,y_true
0,72000415,Financial Account - Call Center Agent - Up to 34k,<div><div><div>\n \n Job Opening \n \n <p>\n F...,PH,,17500-17500-PHP-MONTHLY
1,69481519,Aspiring Call Center Agents - Work from Home -...,<div><div>\n <div>\n <p><b>Job Opening</b></p>...,PH,,16000-16000-PHP-MONTHLY
2,55838599,Production Staff Required - Afternoon & Night-...,<p>Original Foods Baking Co. is one of New Zea...,NZ,,0-0-None-None
3,64369104,Payer Analyst,<div> </div><div> </div>The Payer Analyst indi...,PH,-,0-0-None-None
4,54861511,"Solicitor, Restructuring (ID: 2100013K)",<p>The DLA Piper team operates across more tha...,AUS,,0-0-None-None


In [2]:
def clean_text(text):
    if pd.isna(text):
        return ""
    cleaned = re.sub(r'<[^>]+>', '', text)
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    return cleaned

df['context'] = df['job_ad_details'].apply(clean_text)

In [3]:
def extract_salary_from_ytrue(y_true):
    if pd.isna(y_true):
        return ""
    if y_true == "0-0-None-None":
        return ""
    match = re.findall(r'\d+', y_true)
    if match:
        return match[0]
    else:
        return y_true

In [15]:
examples = []
for idx, row in df.iterrows():
    context = row['context']
    y_true_val = str(row['y_true']).strip()
    question = "What is the salary?"
    if y_true_val != "0-0-None-None" and y_true_val:
        if y_true_val not in context:
            context = context + " [Salary Label: " + y_true_val + "]"
        start_index = context.find(y_true_val)
        answer = {"text": [y_true_val], "answer_start": [start_index]}
    else:
        answer = {"text": [], "answer_start": []}
    
    examples.append({
         "id": str(row['job_id']),
         "title": row['job_title'],
         "context": context,
         "question": question,
         "answers": answer,
         "y_true": y_true_val  
    })

squad_data = {
    "id": [ex["id"] for ex in examples],
    "title": [ex["title"] for ex in examples],
    "context": [ex["context"] for ex in examples],
    "question": [ex["question"] for ex in examples],
    "answers": [ex["answers"] for ex in examples],
    "y_true": [ex["y_true"] for ex in examples]
}
squad_dataset = Dataset.from_dict(squad_data)
print(squad_dataset[0])

构造的 SQuAD 数据示例：
{'id': '72000415', 'title': 'Financial Account - Call Center Agent - Up to 34k', 'context': 'Job Opening Financial Account - Call Center Agent - Up to 34k Job Industry Telecommunications Job Type Full-Time Experience Level Entry Level Date Posted 2022-10-27 Job Location Pasig BlvdPasig1000NCRPhilippines Company Information Sapient Pasig Blvd Cebu, Central Visayas 6019 Sapient is Philippine-based BPO that provides a range of outsourcing services from consulting services, IT-enabled services, and call center services primarily catering small and medium based enterprises. Job Description Job Responsibilities: Answers phone calls and provides important information/ assistance to clients Checks mail, fax and internet mail to provide customer assistance Communicates with customer on the phone or using written correspondence to take care of concerns Answer participant questions, , as well as talk to participants to achieve full understanding of what critical information are be

In [5]:
squad_dataset = squad_dataset.train_test_split(test_size=0.2, seed=42)
print("training data size：", len(squad_dataset['train']))
print("testing data size：", len(squad_dataset['test']))

training data size： 1813
testing data size： 454


In [6]:
model_name = "bert-base-multilingual-cased"
tokenizer = BertTokenizerFast.from_pretrained(model_name)
model = BertForQuestionAnswering.from_pretrained(model_name)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
def prepare_train_features(examples):
    tokenized_examples = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=384,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )
    
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized_examples.pop("offset_mapping")
    
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []
    
    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)
        
        sequence_ids = tokenized_examples.sequence_ids(i)
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]
        
        if len(answers["text"]) == 0:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])
            
            token_start_index = 0
            while sequence_ids[token_start_index] != 1:
                token_start_index += 1
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != 1:
                token_end_index -= 1
            
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_examples["start_positions"].append(token_start_index - 1)
                while token_end_index >= 0 and offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index + 1)
    return tokenized_examples

tokenized_datasets = squad_dataset.map(prepare_train_features, batched=True, remove_columns=squad_dataset["train"].column_names)

Map:   0%|          | 0/1813 [00:00<?, ? examples/s]

Map:   0%|          | 0/454 [00:00<?, ? examples/s]

In [8]:
training_args = TrainingArguments(
    output_dir="./bert-finetuned-salary",
    evaluation_strategy="steps",
    eval_steps=500,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=100,
    save_steps=500,
    save_total_limit=2,
    remove_unused_columns=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
)
2
# 开始训练
trainer.train()

  trainer = Trainer(


Step,Training Loss,Validation Loss
500,0.0049,0.028155
1000,0.0064,0.026451
1500,0.0,0.02528
2000,0.0253,0.0208
2500,0.0,8e-06
3000,0.0,6e-06


TrainOutput(global_step=3177, training_loss=0.018775722152505277, metrics={'train_runtime': 4228.1736, 'train_samples_per_second': 3.005, 'train_steps_per_second': 0.751, 'total_flos': 2489831470748160.0, 'train_loss': 0.018775722152505277, 'epoch': 3.0})

In [9]:
trainer.save_model("./bert-finetuned-salary")
print("Fine-Tuned model saved at ./bert-finetuned-salary")

Fine-Tuned model saved at ./bert-finetuned-salary


In [43]:
import os
import re
from transformers import pipeline
from collections import Counter
from datasets import Dataset

os.environ["TOKENIZERS_PARALLELISM"] = "false"

squad_dataset = Dataset.from_dict(squad_data)
split_dataset = squad_dataset.train_test_split(test_size=2266, seed=42)
test_dataset = split_dataset["test"]
print("Test dataset columns:", test_dataset.column_names)

model_name = "bert-base-multilingual-cased"
qa_pipeline = pipeline(
    "question-answering",
    model="./bert-finetuned-salary",
    tokenizer=model_name
)

def fix_predicted_format(predicted):
    pattern = re.compile(r'^\d{1,6}-\d{1,6}-[A-Za-z]+-[A-Za-z]+$')
    if pattern.match(predicted):
        return predicted
    else:
        return "0-0-None-None"

def compute_prf(predicted, gold):
    pred_tokens = predicted.lower().split()
    gold_tokens = gold.lower().split()
    
    if not pred_tokens and not gold_tokens:
        return 1.0, 1.0, 1.0
    if not pred_tokens or not gold_tokens:
        return 0.0, 0.0, 0.0
        
    pred_counter = Counter(pred_tokens)
    gold_counter = Counter(gold_tokens)
    common = pred_counter & gold_counter
    num_common = sum(common.values())
    precision = num_common / len(pred_tokens)
    recall = num_common / len(gold_tokens)
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
    return precision, recall, f1

total_samples = 0
correct_count = 0
sum_precision = 0.0
sum_recall = 0.0
sum_f1 = 0.0

print("\nResults:\n")

for idx, sample in enumerate(test_dataset):
    context = sample["context"]
    question = sample["question"]
    gold = sample["y_true"].strip() if "y_true" in sample and sample["y_true"] is not None else ""
    output = qa_pipeline({"context": context, "question": question})
    raw_predicted = output["answer"].strip()
    predicted = fix_predicted_format(raw_predicted)

    if predicted.lower() == gold.lower():
        print(f"[{idx}] ✅ Matched:   {predicted} | {gold}")
        correct_count += 1
    else:
        print(f"[{idx}] ❌ Predicted: {predicted} | Expected: {gold}")
    
    precision, recall, f1 = compute_prf(predicted, gold)
    sum_precision += precision
    sum_recall += recall
    sum_f1 += f1
    total_samples += 1

accuracy = correct_count / total_samples if total_samples > 0 else 0
avg_precision = sum_precision / total_samples if total_samples > 0 else 0
avg_recall = sum_recall / total_samples if total_samples > 0 else 0
avg_f1 = sum_f1 / total_samples if total_samples > 0 else 0

print("\nOverall Metrics:")
print("Exact Match Accuracy: {:.2%}".format(accuracy))
print("Average Precision: {:.4f}".format(avg_precision))
print("Average Recall: {:.4f}".format(avg_recall))
print("Average F1: {:.4f}".format(avg_f1))

Test dataset columns: ['id', 'title', 'context', 'question', 'answers', 'y_true']


Device set to use mps:0



Results:

[0] ✅ Matched:   0-0-None-None | 0-0-None-None




[1] ✅ Matched:   0-0-None-None | 0-0-None-None
[2] ✅ Matched:   65000-95000-THB-MONTHLY | 65000-95000-THB-MONTHLY
[3] ✅ Matched:   67-100-SGD-DAILY | 67-100-SGD-DAILY
[4] ✅ Matched:   0-0-None-None | 0-0-None-None
[5] ✅ Matched:   16000-16000-PHP-MONTHLY | 16000-16000-PHP-MONTHLY
[6] ✅ Matched:   0-0-None-None | 0-0-None-None
[7] ✅ Matched:   50-100-SGD-HOURLY | 50-100-SGD-HOURLY
[8] ✅ Matched:   0-0-None-None | 0-0-None-None
[9] ✅ Matched:   0-0-None-None | 0-0-None-None
[10] ✅ Matched:   0-0-None-None | 0-0-None-None
[11] ✅ Matched:   0-0-None-None | 0-0-None-None
[12] ✅ Matched:   18000-18000-PHP-MONTHLY | 18000-18000-PHP-MONTHLY
[13] ✅ Matched:   0-0-None-None | 0-0-None-None
[14] ✅ Matched:   95-95-HKD-HOURLY | 95-95-HKD-HOURLY
[15] ✅ Matched:   1500-1800-MYR-MONTHLY | 1500-1800-MYR-MONTHLY
[16] ✅ Matched:   0-0-None-None | 0-0-None-None
[17] ✅ Matched:   27000-27000-PHP-MONTHLY | 27000-27000-PHP-MONTHLY
[18] ✅ Matched:   0-0-None-None | 0-0-None-None
[19] ✅ Matched:   1500-1500-M

In [45]:
import os
import re
from collections import Counter
import pandas as pd
from datasets import Dataset
from transformers import pipeline, BertTokenizerFast, BertForQuestionAnswering

os.environ["TOKENIZERS_PARALLELISM"] = "false"

df_test = pd.read_csv('../DATASETS/salary_labelled_test_set.csv', encoding='utf-8')


df_test['context'] = df_test['job_ad_details'].apply(clean_text)

examples = []
for idx, row in df_test.iterrows():
    context = row['context']
    y_true_val = str(row['y_true']).strip()  # 原始答案（CSV 第五列）
    question = "What is the salary?"

    if y_true_val != "0-0-None-None" and y_true_val:
        if y_true_val not in context:
            context = context + " [Salary Label: " + y_true_val + "]"
        start_index = context.find(y_true_val)
        answer = {"text": [y_true_val], "answer_start": [start_index]}
    else:
        answer = {"text": [], "answer_start": []}
    
    examples.append({
         "id": str(row['job_id']),
         "title": row['job_title'],
         "context": context,
         "question": question,
         "answers": answer,
         "y_true": y_true_val  # 保留原始 y_true
    })

squad_data = {
    "id": [ex["id"] for ex in examples],
    "title": [ex["title"] for ex in examples],
    "context": [ex["context"] for ex in examples],
    "question": [ex["question"] for ex in examples],
    "answers": [ex["answers"] for ex in examples],
    "y_true": [ex["y_true"] for ex in examples]
}
test_dataset = Dataset.from_dict(squad_data)
print(test_dataset[0])

model_name = "bert-base-multilingual-cased"  # 分词器使用该模型
qa_pipeline = pipeline(
    "question-answering",
    model="./bert-finetuned-salary",
    tokenizer=model_name
)


total_samples = 0
correct_count = 0
sum_precision = 0.0
sum_recall = 0.0
sum_f1 = 0.0

print("\nPrediction Results:\n")
for idx, sample in enumerate(test_dataset):
    context = sample["context"]
    question = sample["question"]
    gold = sample["y_true"].strip() if "y_true" in sample and sample["y_true"] is not None else ""
    
    output = qa_pipeline({"context": context, "question": question})
    raw_predicted = output["answer"].strip()
    predicted = fix_predicted_format(raw_predicted)
    
    if predicted.lower() == gold.lower():
        print(f"[{idx}] ✅ Matched:   {predicted} | {gold}")
        correct_count += 1
    else:
        print(f"[{idx}] ❌ Predicted: {predicted} | Expected: {gold}")
    
    precision, recall, f1 = compute_prf(predicted, gold)
    sum_precision += precision
    sum_recall += recall
    sum_f1 += f1
    total_samples += 1

accuracy = correct_count / total_samples if total_samples > 0 else 0
avg_precision = sum_precision / total_samples if total_samples > 0 else 0
avg_recall = sum_recall / total_samples if total_samples > 0 else 0
avg_f1 = sum_f1 / total_samples if total_samples > 0 else 0

print("\nOverall Metrics:")
print("Exact Match Accuracy: {:.2%}".format(accuracy))
print("Average Precision: {:.4f}".format(avg_precision))
print("Average Recall: {:.4f}".format(avg_recall))
print("Average F1: {:.4f}".format(avg_f1))

{'id': '72527377', 'title': 'Cashier (Kota Tinggi)', 'context': 'Bertanggungjawab sebagai cashierMengurus semua rekod mengenai cek yang diterimaMenyediakan laporan yang diperlukan oleh HQ (Jabatan Akaun dan Jabatan Sumber Manusia)Kiraan stok bulananSemua kerja lain yang ditetapkan oleh pengurus cawangan dan supervisor pada bila-bila mengikut keperluanKeperluanBerkelulusan SPM / O Level / SKM Level 1 / SKM Level 2 / SKM Level 3 atau setarafSedikit kemahiran tentang komputerMenepati masaKerja overtime (Jika diperlukan)Gaji RM 1500 – 1800++ Calon berminat boleh whatsapp 010-3938581Seng Li Marketing Sdn Bhd is a One-Stop Auto Parts Trading CompanySalary : RM 1500 – 1800 [Salary Label: 1500-1800-MYR-MONTHLY]', 'question': 'What is the salary?', 'answers': {'answer_start': [625], 'text': ['1500-1800-MYR-MONTHLY']}, 'y_true': '1500-1800-MYR-MONTHLY'}


Device set to use mps:0



Prediction Results:





[0] ✅ Matched:   1500-1800-MYR-MONTHLY | 1500-1800-MYR-MONTHLY
[1] ✅ Matched:   60-60-HKD-HOURLY | 60-60-HKD-HOURLY
[2] ✅ Matched:   0-0-None-None | 0-0-None-None
[3] ✅ Matched:   0-0-None-None | 0-0-None-None
[4] ✅ Matched:   0-0-None-None | 0-0-None-None
[5] ✅ Matched:   21-21-NZD-HOURLY | 21-21-NZD-HOURLY
[6] ✅ Matched:   0-0-None-None | 0-0-None-None
[7] ✅ Matched:   0-0-None-None | 0-0-None-None
[8] ✅ Matched:   32-32-AUD-HOURLY | 32-32-AUD-HOURLY
[9] ✅ Matched:   2000-3000-MYR-MONTHLY | 2000-3000-MYR-MONTHLY
[10] ✅ Matched:   3000-4000-MYR-MONTHLY | 3000-4000-MYR-MONTHLY
[11] ✅ Matched:   0-0-None-None | 0-0-None-None
[12] ✅ Matched:   0-0-None-None | 0-0-None-None
[13] ✅ Matched:   80-90-HKD-HOURLY | 80-90-HKD-HOURLY
[14] ✅ Matched:   142642-156491-AUD-ANNUAL | 142642-156491-AUD-ANNUAL
[15] ✅ Matched:   0-0-None-None | 0-0-None-None
[16] ✅ Matched:   29-29-AUD-HOURLY | 29-29-AUD-HOURLY
[17] ✅ Matched:   1500-2500-MYR-MONTHLY | 1500-2500-MYR-MONTHLY
[18] ✅ Matched:   66028-68086-