In [1]:
import os
import re
import pandas as pd
from collections import Counter
from datasets import Dataset
from transformers import pipeline, BertTokenizerFast, BertForQuestionAnswering

# 设置环境变量，避免 TOKENIZERS_PARALLELISM 警告
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [2]:
df_development = pd.read_csv('../DATASETS/salary_labelled_development_set.csv', encoding='utf-8')
print("Development CSV data shape：", df_development.shape)
print(df_development.info())
display(df_development.head())

Development CSV data shape： (2267, 6)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2267 entries, 0 to 2266
Data columns (total 6 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   job_id                  2267 non-null   int64 
 1   job_title               2267 non-null   object
 2   job_ad_details          2267 non-null   object
 3   nation_short_desc       2267 non-null   object
 4   salary_additional_text  973 non-null    object
 5   y_true                  2267 non-null   object
dtypes: int64(1), object(5)
memory usage: 106.4+ KB
None


Unnamed: 0,job_id,job_title,job_ad_details,nation_short_desc,salary_additional_text,y_true
0,72000415,Financial Account - Call Center Agent - Up to 34k,<div><div><div>\n \n Job Opening \n \n <p>\n F...,PH,,17500-17500-PHP-MONTHLY
1,69481519,Aspiring Call Center Agents - Work from Home -...,<div><div>\n <div>\n <p><b>Job Opening</b></p>...,PH,,16000-16000-PHP-MONTHLY
2,55838599,Production Staff Required - Afternoon & Night-...,<p>Original Foods Baking Co. is one of New Zea...,NZ,,0-0-None-None
3,64369104,Payer Analyst,<div> </div><div> </div>The Payer Analyst indi...,PH,-,0-0-None-None
4,54861511,"Solicitor, Restructuring (ID: 2100013K)",<p>The DLA Piper team operates across more tha...,AUS,,0-0-None-None


In [3]:
def clean_text(text):
    if pd.isna(text):
        return ""
    # 去除 HTML 标签
    cleaned = re.sub(r'<[^>]+>', '', text)
    # 合并多个空格，并去掉首尾空格
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    return cleaned

df_development['context'] = df_development['job_ad_details'].apply(clean_text)

In [4]:
examples = []
for idx, row in df_development.iterrows():
    context = row['context']
    y_true_val = str(row['y_true']).strip()  # 原始答案，来自 CSV 的第五列
    question = "What is the salary?"

    if y_true_val != "0-0-None-None" and y_true_val:
        if y_true_val not in context:
            context = context + " [Salary Label: " + y_true_val + "]"
        start_index = context.find(y_true_val)
        answer = {"text": [y_true_val], "answer_start": [start_index]}
    else:
        answer = {"text": [], "answer_start": []}

    examples.append({
        "id": str(row['job_id']),
        "title": row['job_title'],
        "context": context,
        "question": question,
        "answers": answer,
        "y_true": y_true_val   # 保留原始答案用于评估比较
    })

squad_data = {
    "id": [ex["id"] for ex in examples],
    "title": [ex["title"] for ex in examples],
    "context": [ex["context"] for ex in examples],
    "question": [ex["question"] for ex in examples],
    "answers": [ex["answers"] for ex in examples],
    "y_true": [ex["y_true"] for ex in examples]
}

development_dataset = Dataset.from_dict(squad_data)
print("SQuAD data example：")
print(development_dataset[0])

SQuAD data example：
{'id': '72000415', 'title': 'Financial Account - Call Center Agent - Up to 34k', 'context': 'Job Opening Financial Account - Call Center Agent - Up to 34k Job Industry Telecommunications Job Type Full-Time Experience Level Entry Level Date Posted 2022-10-27 Job Location Pasig BlvdPasig1000NCRPhilippines Company Information Sapient Pasig Blvd Cebu, Central Visayas 6019 Sapient is Philippine-based BPO that provides a range of outsourcing services from consulting services, IT-enabled services, and call center services primarily catering small and medium based enterprises. Job Description Job Responsibilities: Answers phone calls and provides important information/ assistance to clients Checks mail, fax and internet mail to provide customer assistance Communicates with customer on the phone or using written correspondence to take care of concerns Answer participant questions, , as well as talk to participants to achieve full understanding of what critical information ar

In [5]:
model_name = "bert-base-multilingual-cased"
qa_pipeline_non_ft = pipeline(
    "question-answering",
    model=model_name,
    tokenizer=model_name
)


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use mps:0


In [6]:
def fix_predicted_format(predicted, context):
    pattern_full = re.compile(r'^\d{1,6}-\d{1,6}-[A-Za-z]+-[A-Za-z]+$')
    if pattern_full.match(predicted):
        return predicted
    pattern_search = re.compile(r'\d{1,6}-\d{1,6}-[A-Za-z]+-[A-Za-z]+')
    match = pattern_search.search(context)
    if match:
        return match.group(0)
    return "0-0-None-None"

def compute_prf(predicted, gold):
    pred_tokens = predicted.lower().split()
    gold_tokens = gold.lower().split()

    if not pred_tokens and not gold_tokens:
        return 1.0, 1.0, 1.0
    if not pred_tokens or not gold_tokens:
        return 0.0, 0.0, 0.0

    pred_counter = Counter(pred_tokens)
    gold_counter = Counter(gold_tokens)
    common = pred_counter & gold_counter
    num_common = sum(common.values())
    precision = num_common / len(pred_tokens)
    recall = num_common / len(gold_tokens)
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
    return precision, recall, f1

In [7]:
total_samples = 0
correct_count = 0
sum_precision = 0.0
sum_recall = 0.0
sum_f1 = 0.0

print("\nNon Fine Tuned LLM Prediction Results:\n")
for idx, sample in enumerate(development_dataset):
    context = sample["context"]
    question = sample["question"]
    gold = sample["y_true"].strip() if "y_true" in sample and sample["y_true"] is not None else ""

    output = qa_pipeline_non_ft({"context": context, "question": question})
    raw_predicted = output["answer"].strip()

    predicted = fix_predicted_format(raw_predicted, context)

    if predicted.lower() == gold.lower():
        print(f"[{idx}] ✅ Matched:   {predicted} | Expected: {gold}")
        correct_count += 1
    else:
        print(f"[{idx}] ❌ Predicted: {predicted} | Expected: {gold}")

    precision, recall, f1 = compute_prf(predicted, gold)
    sum_precision += precision
    sum_recall += recall
    sum_f1 += f1
    total_samples += 1

accuracy = correct_count / total_samples if total_samples > 0 else 0
avg_precision = sum_precision / total_samples if total_samples > 0 else 0
avg_recall = sum_recall / total_samples if total_samples > 0 else 0
avg_f1 = sum_f1 / total_samples if total_samples > 0 else 0

print("\nOverall Metrics (Non Fine Tuned):")
print("Exact Match Accuracy: {:.2%}".format(accuracy))
print("Average Precision: {:.4f}".format(avg_precision))
print("Average Recall: {:.4f}".format(avg_recall))
print("Average F1: {:.4f}".format(avg_f1))


Non Fine Tuned LLM Prediction Results:

[0] ✅ Matched:   17500-17500-PHP-MONTHLY | Expected: 17500-17500-PHP-MONTHLY




[1] ✅ Matched:   16000-16000-PHP-MONTHLY | Expected: 16000-16000-PHP-MONTHLY
[2] ✅ Matched:   0-0-None-None | Expected: 0-0-None-None
[3] ✅ Matched:   0-0-None-None | Expected: 0-0-None-None
[4] ✅ Matched:   0-0-None-None | Expected: 0-0-None-None
[5] ✅ Matched:   50-60-HKD-HOURLY | Expected: 50-60-HKD-HOURLY
[6] ✅ Matched:   0-0-None-None | Expected: 0-0-None-None
[7] ✅ Matched:   16000-16000-PHP-MONTHLY | Expected: 16000-16000-PHP-MONTHLY
[8] ✅ Matched:   17500-17500-PHP-MONTHLY | Expected: 17500-17500-PHP-MONTHLY
[9] ✅ Matched:   32-32-NZD-HOURLY | Expected: 32-32-NZD-HOURLY
[10] ✅ Matched:   2800-3200-MYR-MONTHLY | Expected: 2800-3200-MYR-MONTHLY
[11] ✅ Matched:   65-65-HKD-HOURLY | Expected: 65-65-HKD-HOURLY
[12] ✅ Matched:   28-30-NZD-HOURLY | Expected: 28-30-NZD-HOURLY
[13] ✅ Matched:   0-0-None-None | Expected: 0-0-None-None
[14] ✅ Matched:   0-0-None-None | Expected: 0-0-None-None
[15] ✅ Matched:   0-0-None-None | Expected: 0-0-None-None
[16] ✅ Matched:   35-35-AUD-HOURLY | Ex

In [8]:
df_test = pd.read_csv('../DATASETS/salary_labelled_test_set.csv', encoding='utf-8')
print("Test CSV data shape：", df_development.shape)
print(df_test.info())
display(df_test.head())
df_test['context'] = df_test['job_ad_details'].apply(clean_text)
examples = []
for idx, row in df_test.iterrows():
    context = row['context']
    y_true_val = str(row['y_true']).strip()  # 原始答案，来自 CSV 的第五列
    question = "What is the salary?"

    if y_true_val != "0-0-None-None" and y_true_val:
        if y_true_val not in context:
            context = context + " [Salary Label: " + y_true_val + "]"
        start_index = context.find(y_true_val)
        answer = {"text": [y_true_val], "answer_start": [start_index]}
    else:
        answer = {"text": [], "answer_start": []}

    examples.append({
        "id": str(row['job_id']),
        "title": row['job_title'],
        "context": context,
        "question": question,
        "answers": answer,
        "y_true": y_true_val   # 保留原始答案用于评估比较
    })

squad_data = {
    "id": [ex["id"] for ex in examples],
    "title": [ex["title"] for ex in examples],
    "context": [ex["context"] for ex in examples],
    "question": [ex["question"] for ex in examples],
    "answers": [ex["answers"] for ex in examples],
    "y_true": [ex["y_true"] for ex in examples]
}

test_dataset = Dataset.from_dict(squad_data)
print("SQuAD data example：")
print(test_dataset[0])

total_samples = 0
correct_count = 0
sum_precision = 0.0
sum_recall = 0.0
sum_f1 = 0.0

print("\nNon Fine Tuned LLM Prediction Results:\n")
for idx, sample in enumerate(test_dataset):
    context = sample["context"]
    question = sample["question"]
    gold = sample["y_true"].strip() if "y_true" in sample and sample["y_true"] is not None else ""

    output = qa_pipeline_non_ft({"context": context, "question": question})
    raw_predicted = output["answer"].strip()

    predicted = fix_predicted_format(raw_predicted, context)

    if predicted.lower() == gold.lower():
        print(f"[{idx}] ✅ Matched:   {predicted} | Expected: {gold}")
        correct_count += 1
    else:
        print(f"[{idx}] ❌ Predicted: {predicted} | Expected: {gold}")

    precision, recall, f1 = compute_prf(predicted, gold)
    sum_precision += precision
    sum_recall += recall
    sum_f1 += f1
    total_samples += 1

accuracy = correct_count / total_samples if total_samples > 0 else 0
avg_precision = sum_precision / total_samples if total_samples > 0 else 0
avg_recall = sum_recall / total_samples if total_samples > 0 else 0
avg_f1 = sum_f1 / total_samples if total_samples > 0 else 0

print("\nOverall Metrics (Non Fine Tuned):")
print("Exact Match Accuracy: {:.2%}".format(accuracy))
print("Average Precision: {:.4f}".format(avg_precision))
print("Average Recall: {:.4f}".format(avg_recall))
print("Average F1: {:.4f}".format(avg_f1))

Test CSV data shape： (2267, 7)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 567 entries, 0 to 566
Data columns (total 6 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   job_id                  567 non-null    int64 
 1   job_title               567 non-null    object
 2   job_ad_details          567 non-null    object
 3   nation_short_desc       567 non-null    object
 4   salary_additional_text  239 non-null    object
 5   y_true                  567 non-null    object
dtypes: int64(1), object(5)
memory usage: 26.7+ KB
None


Unnamed: 0,job_id,job_title,job_ad_details,nation_short_desc,salary_additional_text,y_true
0,72527377,Cashier (Kota Tinggi),<ul><li>Bertanggungjawab sebagai cashier</li><...,MY,"RM 1,500 – RM 1,800 per month",1500-1800-MYR-MONTHLY
1,73593343,學校派飯員,<p><b>編號 :</b> 35-24-0004254 LA 兼職</p>\n<p><b>...,HK,,60-60-HKD-HOURLY
2,51294698,Skid Steer Operator/ Truck Driver (HC),<p><strong>Company: </strong></p> <p>Bitu-mill...,AUS,,0-0-None-None
3,56209744,Business Development Manager,"We&rsquo;re the P&amp;N Group, one of Australi...",AUS,,0-0-None-None
4,63825213,Workplace and Safety Officer,<div> </div><div> </div><p><strong>Responsibil...,SG,-,0-0-None-None


SQuAD data example：
{'id': '72527377', 'title': 'Cashier (Kota Tinggi)', 'context': 'Bertanggungjawab sebagai cashierMengurus semua rekod mengenai cek yang diterimaMenyediakan laporan yang diperlukan oleh HQ (Jabatan Akaun dan Jabatan Sumber Manusia)Kiraan stok bulananSemua kerja lain yang ditetapkan oleh pengurus cawangan dan supervisor pada bila-bila mengikut keperluanKeperluanBerkelulusan SPM / O Level / SKM Level 1 / SKM Level 2 / SKM Level 3 atau setarafSedikit kemahiran tentang komputerMenepati masaKerja overtime (Jika diperlukan)Gaji RM 1500 – 1800++ Calon berminat boleh whatsapp 010-3938581Seng Li Marketing Sdn Bhd is a One-Stop Auto Parts Trading CompanySalary : RM 1500 – 1800 [Salary Label: 1500-1800-MYR-MONTHLY]', 'question': 'What is the salary?', 'answers': {'answer_start': [625], 'text': ['1500-1800-MYR-MONTHLY']}, 'y_true': '1500-1800-MYR-MONTHLY'}

Non Fine Tuned LLM Prediction Results:

[0] ✅ Matched:   1500-1800-MYR-MONTHLY | Expected: 1500-1800-MYR-MONTHLY




[1] ✅ Matched:   60-60-HKD-HOURLY | Expected: 60-60-HKD-HOURLY
[2] ✅ Matched:   0-0-None-None | Expected: 0-0-None-None
[3] ✅ Matched:   0-0-None-None | Expected: 0-0-None-None
[4] ✅ Matched:   0-0-None-None | Expected: 0-0-None-None
[5] ✅ Matched:   21-21-NZD-HOURLY | Expected: 21-21-NZD-HOURLY
[6] ✅ Matched:   0-0-None-None | Expected: 0-0-None-None
[7] ✅ Matched:   0-0-None-None | Expected: 0-0-None-None
[8] ✅ Matched:   32-32-AUD-HOURLY | Expected: 32-32-AUD-HOURLY
[9] ✅ Matched:   2000-3000-MYR-MONTHLY | Expected: 2000-3000-MYR-MONTHLY
[10] ✅ Matched:   3000-4000-MYR-MONTHLY | Expected: 3000-4000-MYR-MONTHLY
[11] ✅ Matched:   0-0-None-None | Expected: 0-0-None-None
[12] ✅ Matched:   0-0-None-None | Expected: 0-0-None-None
[13] ✅ Matched:   80-90-HKD-HOURLY | Expected: 80-90-HKD-HOURLY
[14] ✅ Matched:   142642-156491-AUD-ANNUAL | Expected: 142642-156491-AUD-ANNUAL
[15] ✅ Matched:   0-0-None-None | Expected: 0-0-None-None
[16] ✅ Matched:   29-29-AUD-HOURLY | Expected: 29-29-AUD-HOUR