In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq, EarlyStoppingCallback
from datasets import Dataset
import pandas as pd
import re
from bs4 import BeautifulSoup
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np

df = pd.read_csv('../../MISC/salary_labelled_development_set.csv', encoding='utf-8')
df.columns = ["job_id", "job_title", "job_ad_details", "nation_short_desc", "salary_additional_text", "y_true"]
df = df[df['y_true'].notna()]
df['y_true'] = (
    df['y_true']
    .fillna("0-0-None-None")
    .str.strip()
    .str.replace(
        r'(\d+)-(\d+)-([A-Za-z]+)-([A-Za-z]+)',
        r'\1 \2 \3 \4',
        regex=True
    )
)

def clean_html_tags(html_text):
    soup = BeautifulSoup(html_text, 'html.parser')
    for tag in soup(["script", "style"]):
        tag.decompose()
    return soup.get_text(separator="\n", strip=True)

examples = []
for _, row in df.iterrows():
    context = str(row["job_ad_details"])
    question = (
        "Extract the salary info from the job ad below and return it as: "
        '"MinimumSalary MaximumSalary Currency PayPeriod"'
    )
    input_text = f"{question}\n\n{clean_html_tags(context)}"
    output_text = row["y_true"] if row["y_true"] else "0 0 None None"
    examples.append({"input_text": input_text, "target_text": output_text})

dataset = Dataset.from_list(examples)
split = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = split["train"]
eval_dataset = split["test"]


model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)


max_input_length = 512
max_target_length = 32

def preprocess(example):
    model_inputs = tokenizer(example["input_text"], max_length=max_input_length, padding="max_length", truncation=True)
    labels = tokenizer(example["target_text"], max_length=max_target_length, padding="max_length", truncation=True)
    labels["input_ids"] = [l if l != tokenizer.pad_token_id else -100 for l in labels["input_ids"]]
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

FORMAT_RE = re.compile(r'^\s*\d+\s+\d+\s+[A-Za-z]+\s+[A-Za-z]+\s*$')
PARSE_RE  = re.compile(r'^\s*(\d+)\s+(\d+)\s+([A-Za-z]+)\s+([A-Za-z]+)\s*$')

def validate_pred(pred: str) -> str:
    return pred if FORMAT_RE.match(pred) else "0 0 None None"

def parse_salary(s: str):
    m = PARSE_RE.match(s)
    if not m:
        return None
    return (
        int(m.group(1)),
        int(m.group(2)),
        m.group(3).lower(),
        m.group(4).lower()
    )

def compute_metrics(eval_preds):
    preds_ids, labels_ids = eval_preds
    vocab_size = tokenizer.vocab_size

    # 1. 过滤并解码预测 ID
    filtered_preds = []
    for seq in preds_ids:
        seq = seq.tolist() if hasattr(seq, "tolist") else seq
        filtered_preds.append([tok for tok in seq if 0 <= tok < vocab_size])

    filtered_labels = []
    for seq in labels_ids:
        seq = seq.tolist() if hasattr(seq, "tolist") else seq
        filtered_labels.append([
            tok if tok != -100 else tokenizer.pad_token_id
            for tok in seq
        ])

    decoded_preds  = tokenizer.batch_decode(filtered_preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(filtered_labels, skip_special_tokens=True)

    preds = [validate_pred(p.strip()) for p in decoded_preds]
    trues = [l.strip() for l in decoded_labels]

    parsed_preds = [parse_salary(p.lower()) for p in preds]
    parsed_trues = [parse_salary(t.lower()) for t in trues]

    default = (0, 0, 'none', 'none')
    TP = FP = FN = TN = 0

    for p, t in zip(parsed_preds, parsed_trues):
        is_def_p = (p == default)
        is_def_t = (t == default)
        match    = (
            p is not None and t is not None
            and not is_def_p and not is_def_t
            and p[0] == t[0]
            and p[1] == t[1]
            and p[2] == t[2]
            and p[3] == t[3]
        )

        if match:
            TP += 1
        elif not is_def_p:
            FP += 1
        elif not is_def_t:
            FN += 1
        else:
            TN += 1

    precision = TP / (TP+FP) if TP+FP else 0.0
    recall    = TP / (TP+FN) if TP+FN else 0.0
    f1        = 2*precision*recall/(precision+recall) if precision+recall else 0.0
    accuracy  = (TP+TN)/(TP+FP+FN+TN)

    return {"precision": precision,
            "recall":    recall,
            "f1":        f1,
            "accuracy":  accuracy}



tokenized_train = train_dataset.map(preprocess, batched=True)
tokenized_eval = eval_dataset.map(preprocess, batched=True)


data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=-100
)

training_args = Seq2SeqTrainingArguments(
    output_dir="./t5-finetuned-salary",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=3e-5,
    weight_decay=0.01,
    num_train_epochs=15,

    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,

    predict_with_generate=True,
    generation_num_beams=4,
    generation_max_length=32,

    label_smoothing_factor=0.1,
    warmup_ratio=0.1,
    lr_scheduler_type="linear",

    logging_dir="./logs",
    logging_steps=50,
    save_total_limit=2,
    fp16=True,

    seed=42
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model),
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

trainer.train()
trainer.save_model("t5-finetuned-salary/best")
tokenizer.save_pretrained("t5-finetuned-salary/best")
best_ckpt = trainer.state.best_model_checkpoint
print("Best checkpoint dir:", best_ckpt)
best_dir = "./t5-finetuned-salary/best"
trainer.model.save_pretrained(best_dir)
tokenizer.save_pretrained(best_dir)

metrics = trainer.evaluate()
print(metrics)

Map:   0%|          | 0/2040 [00:00<?, ? examples/s]

Map:   0%|          | 0/227 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,2.1652,1.828198,0.444444,0.067797,0.117647,0.471366
2,1.6366,1.543275,0.686441,0.880435,0.771429,0.788546
3,1.5643,1.500295,0.696721,0.913978,0.790698,0.801762
4,1.5341,1.487391,0.756098,0.93,0.834081,0.837004
5,1.5219,1.476565,0.736,0.938776,0.825112,0.828194
6,1.5016,1.468378,0.746032,0.949495,0.835556,0.837004
7,1.4917,1.458851,0.770492,0.930693,0.843049,0.845815
8,1.4834,1.462233,0.801653,0.932692,0.862222,0.863436
9,1.4832,1.453942,0.772358,0.95,0.852018,0.854626
10,1.476,1.452573,0.772358,0.95,0.852018,0.854626


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Best checkpoint dir: ./t5-finetuned-salary\checkpoint-2040


{'eval_loss': 1.462233066558838, 'eval_precision': 0.8016528925619835, 'eval_recall': 0.9326923076923077, 'eval_f1': 0.8622222222222222, 'eval_accuracy': 0.8634361233480177, 'eval_runtime': 15.0191, 'eval_samples_per_second': 15.114, 'eval_steps_per_second': 1.931, 'epoch': 11.0}


In [3]:
pred_out = trainer.predict(tokenized_eval)

decoded_preds = tokenizer.batch_decode(
    pred_out.predictions, skip_special_tokens=True
)
decoded_labels = tokenizer.batch_decode(
    np.where(pred_out.label_ids != -100, pred_out.label_ids, tokenizer.pad_token_id),
    skip_special_tokens=True
)

default = (0, 0, 'none', 'none')
TP = FP = FN = TN = 0

print(" idx | Prediction                | True                       |Flag")
print("-" * 70)

for i, (raw_p, raw_t) in enumerate(zip(decoded_preds, decoded_labels)):
    p_str = validate_pred(raw_p.strip())
    t_str = raw_t.strip()

    p_parsed = parse_salary(p_str.lower())
    t_parsed = parse_salary(t_str.lower())

    is_def_pred = (p_parsed == default)
    is_def_true = (t_parsed == default)

    match = (
        not is_def_pred and not is_def_true
        and p_parsed[0] == t_parsed[0]
        and p_parsed[1] == t_parsed[1]
        and p_parsed[2] == t_parsed[2]
        and p_parsed[3] == t_parsed[3]
    )

    if match:
        TP += 1
    elif not is_def_pred and (is_def_true or not match):
        FP += 1
    elif is_def_pred and not is_def_true:
        FN += 1
    else:
        TN += 1

    flag = "✔" if p_str == t_str else "✘"
    print(f"[{i:3d}] | {p_str:<25} | {t_str:<25} | {flag}")

print("\n🔢 Confusion Matrix Counts:")
print(f"  TP: {TP}")
print(f"  FP: {FP}")
print(f"  FN: {FN}")
print(f"  TN: {TN}")

precision = TP / (TP + FP) if TP + FP else 0.0
recall    = TP / (TP + FN) if TP + FN else 0.0
f1        = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
accuracy  = (TP + TN) / (TP + FP + FN + TN)

print("\n📊 Metrics:")
print(f"  Precision: {precision:.4f}")
print(f"  Recall:    {recall:.4f}")
print(f"  F1 Score:  {f1:.4f}")
print(f"  Accuracy:  {accuracy:.4f}")

 idx | Prediction                | True                       |Flag
----------------------------------------------------------------------
[  0] | 0 0 None None             | 0 0 None None             | ✔
[  1] | 0 0 None None             | 0 0 None None             | ✔
[  2] | 65000 95000 THB MONTHLY   | 65000 95000 THB MONTHLY   | ✔
[  3] | 67 100 SGD DAILY          | 67 100 SGD DAILY          | ✔
[  4] | 0 0 None None             | 0 0 None None             | ✔
[  5] | 16000 16000 PHP MONTHLY   | 16000 16000 PHP MONTHLY   | ✔
[  6] | 0 0 None None             | 0 0 None None             | ✔
[  7] | 0 0 None None             | 50 100 SGD HOURLY         | ✘
[  8] | 0 0 None None             | 0 0 None None             | ✔
[  9] | 0 0 None None             | 0 0 None None             | ✔
[ 10] | 0 0 None None             | 0 0 None None             | ✔
[ 11] | 0 0 None None             | 0 0 None None             | ✔
[ 12] | 18000 18000 PHP MONTHLY   | 18000 18000 PHP MONTHLY   | ✔
[ 1

In [None]:
import pandas as pd
import numpy as np
from datasets import Dataset

df_test = pd.read_csv(
    "../../MISC/salary_labelled_test_set.csv",
    encoding="utf-8"
)
df_test.columns = [
    "job_id","job_title","job_ad_details",
    "nation_short_desc","salary_additional_text","y_true"
]
df_test['y_true'] = (
    df_test['y_true']
      .fillna("0-0-None-None")
      .str.strip()
      .str.replace(
          r'(\d+)-(\d+)-([A-Za-z]+)-([A-Za-z]+)',
          r'\1 \2 \3 \4',
          regex=True
      )
)

test_examples = []
PREFIX =("Extract the salary info from the job ad below and return it as: "
        '"MinimumSalary MaximumSalary Currency PayPeriod"')
for _, row in df_test.iterrows():
    inp = PREFIX + clean_html_tags(str(row["job_ad_details"]))
    gold = row["y_true"]
    test_examples.append({"input_text": inp, "target_text": gold})

test_ds = Dataset.from_list(test_examples)
tokenized_test = test_ds.map(preprocess, batched=True)

pred_out = trainer.predict(tokenized_test)

decoded_preds  = tokenizer.batch_decode(
    pred_out.predictions, skip_special_tokens=True
)
decoded_labels = [ex["target_text"] for ex in test_examples]

default = parse_salary("0 0 None None")
TP = FP = FN = TN = 0
print(" idx | Prediction                | True                       | ✔/✘")
print("-"*70)
for i, (rp, rt) in enumerate(zip(decoded_preds, decoded_labels)):
    p_str = validate_pred(rp.strip())
    t_str = rt.strip()
    pp = parse_salary(p_str.lower())
    tp = parse_salary(t_str.lower())
    is_def_p = (pp == default)
    is_def_t = (tp == default)
    match = (
        not is_def_p and not is_def_t
        and pp[0]==tp[0]
        and pp[1]==tp[1]
        and pp[2]==tp[2]
        and pp[3]==tp[3]
    )
    if match:        TP += 1
    elif not is_def_p: FP += 1
    elif not is_def_t: FN += 1
    else:             TN += 1

    flag = "✔" if p_str == t_str else "✘"
    print(f"[{i:3d}] | {p_str:<25} | {t_str:<25} | {flag}")

print(f"\n🔢 TP: {TP}   FP: {FP}   FN: {FN}   TN: {TN}")
precision = TP/(TP+FP) if TP+FP else 0.0
recall    = TP/(TP+FN) if TP+FN else 0.0
f1        = 2*precision*recall/(precision+recall) if precision+recall else 0.0
accuracy  = (TP+TN)/(TP+FP+FN+TN)
print("\n📊 Precision: {:.4f}".format(precision))
print("    Recall:    {:.4f}".format(recall))
print("    F1 Score:  {:.4f}".format(f1))
print("    Accuracy:  {:.4f}".format(accuracy))


Map:   0%|          | 0/567 [00:00<?, ? examples/s]

 idx | Prediction                | True                       | ✔/✘
----------------------------------------------------------------------
[  0] | 1500 1800 MYR MONTHLY     | 1500 1800 MYR MONTHLY     | ✔
[  1] | 60 60 HKD HOURLY          | 60 60 HKD HOURLY          | ✔
[  2] | 0 0 None None             | 0 0 None None             | ✔
[  3] | 0 0 None None             | 0 0 None None             | ✔
[  4] | 0 0 None None             | 0 0 None None             | ✔
[  5] | 21 21 AUD HOURLY          | 21 21 NZD HOURLY          | ✘
[  6] | 0 0 None None             | 0 0 None None             | ✔
[  7] | 0 0 None None             | 0 0 None None             | ✔
[  8] | 32 32 AUD HOURLY          | 32 32 AUD HOURLY          | ✔
[  9] | 1500 2000 MYR MONTHLY     | 2000 3000 MYR MONTHLY     | ✘
[ 10] | 3000 4000 MYR MONTHLY     | 3000 4000 MYR MONTHLY     | ✔
[ 11] | 0 0 None None             | 0 0 None None             | ✔
[ 12] | 0 0 None None             | 0 0 None None             | ✔
[ 1