In [1]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
from datasets import Dataset
import pandas as pd

df = pd.read_csv('../DATASETS/salary_labelled_development_set.csv', encoding='utf-8')
df.columns = ["job_id", "job_title", "job_ad_details", "nation_short_desc", "salary_additional_text", "y_true"]
df = df[df['y_true'].notna()]  # 去掉缺失标注
df['y_true'] = df['y_true'].apply(lambda x: x.strip())


examples = []
for _, row in df.iterrows():
    context = str(row["job_ad_details"])
    question = (
        "Extract the salary info from the job ad below and return it as: "
        '"MinimumSalary-MaximumSalary-Currency-PayPeriod"'
    )
    input_text = f"{question}\n\n{context}"
    output_text = row["y_true"] if row["y_true"] else "0-0-None-None"
    examples.append({"input_text": input_text, "target_text": output_text})

dataset = Dataset.from_list(examples)
split = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = split["train"]
eval_dataset = split["test"]


model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)


max_input_length = 512
max_target_length = 32

def preprocess(example):
    model_inputs = tokenizer(example["input_text"], max_length=max_input_length, padding="max_length", truncation=True)
    labels = tokenizer(example["target_text"], max_length=max_target_length, padding="max_length", truncation=True)
    labels["input_ids"] = [l if l != tokenizer.pad_token_id else -100 for l in labels["input_ids"]]
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_train = train_dataset.map(preprocess, batched=True)
tokenized_eval = eval_dataset.map(preprocess, batched=True)


training_args = Seq2SeqTrainingArguments(
    output_dir="./t5-finetuned-salary",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=3e-5,
    weight_decay=0.01,
    num_train_epochs=50,
    predict_with_generate=True,
    logging_dir='./logs',
    logging_steps=50,
    save_total_limit=2,
    fp16=True,
    seed=42
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model),
)

trainer.train()


  from .autonotebook import tqdm as notebook_tqdm
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Map: 100%|██████████| 2040/2040 [00:01<00:00, 1073.33 examples/s]
Map: 100%|██████████| 227/227 [00:00<00:00, 1014.77 examples/s]
  trainer = Seq2SeqTrainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
50,4.1656
100,0.4015
150,0.1933
200,0.1746
250,0.1436
300,0.1214
350,0.1087
400,0.1205
450,0.1148
500,0.0985


TrainOutput(global_step=12750, training_loss=0.049108741229655695, metrics={'train_runtime': 2703.1821, 'train_samples_per_second': 37.733, 'train_steps_per_second': 4.717, 'total_flos': 6.211370483712e+16, 'train_loss': 0.049108741229655695, 'epoch': 50.0})

In [2]:
from transformers import T5ForConditionalGeneration

trainer.save_model("./t5-finetuned-salary")
tokenizer.save_pretrained("./t5-finetuned-salary")


('./t5-finetuned-salary\\tokenizer_config.json',
 './t5-finetuned-salary\\special_tokens_map.json',
 './t5-finetuned-salary\\spiece.model',
 './t5-finetuned-salary\\added_tokens.json')

In [10]:
import re
import pandas as pd
import torch
from tqdm import tqdm
import numpy as np
from transformers import T5Tokenizer, T5ForConditionalGeneration

FORMAT_RE = re.compile(r'^\s*\d+\s*-\s*\d+\s*-[A-Za-z]+\s*-[A-Za-z]+\s*$', re.IGNORECASE)
PARSE_RE  = re.compile(r'^\s*(\d+)\s*-\s*(\d+)\s*-\s*([A-Za-z]+)\s*-\s*([A-Za-z]+)\s*$', re.IGNORECASE)

def validate_pred(pred: str) -> str:
    return pred if FORMAT_RE.match(pred) else "0-0-None-None"

def parse_salary(s: str):
    m = PARSE_RE.match(s)
    if not m:
        return None
    return (int(m.group(1)), int(m.group(2)),
            m.group(3).lower(), m.group(4).lower())

tokenizer = T5Tokenizer.from_pretrained("t5-base")
model     = T5ForConditionalGeneration.from_pretrained("./t5-finetuned-salary")
device    = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device).eval()

df_raw = pd.read_csv("../DATASETS/salary_labelled_development_set.csv", encoding="utf-8")
df_raw.columns = ["job_id","job_title","job_ad_details","nation_short_desc",
                  "salary_additional_text","y_true"]

PREFIX = ("Extract the salary info from the job ad below and return it as: "
          "\"MinimumSalary-MaximumSalary-Currency-PayPeriod\"\n\n")
eval_dataset = []
for _, row in df_raw.iterrows():
    inp  = PREFIX + str(row["job_ad_details"])
    gold = row["y_true"].strip() if pd.notna(row["y_true"]) else "0-0-None-None"
    eval_dataset.append({
        "job_id":      row["job_id"],
        "job_title":   row["job_title"],
        "input_text":  inp,
        "gold":        gold
    })

print(f"Loaded {len(eval_dataset)} examples for evaluation.\n")

results = []
for sample in tqdm(eval_dataset, desc="Evaluating"):
    inputs  = tokenizer(sample["input_text"],
                        truncation=True, max_length=512,
                        return_tensors="pt").to(device)
    raw_ids = model.generate(**inputs, max_new_tokens=32, num_beams=4)[0]
    raw_pred = tokenizer.decode(raw_ids, skip_special_tokens=True).strip()
    pred = validate_pred(raw_pred)
    results.append({
        "job_id": sample["job_id"],
        "job_title": sample["job_title"],
        "prediction": pred,
        "gold":       sample["gold"]
    })

df = pd.DataFrame(results)

for i, row in df.iterrows():
    pred = row['prediction']
    true = row['gold']
    p = parse_salary(pred)
    g = parse_salary(true)
    match = False
    if p and g \
       and abs(p[0]-g[0]) <= 1 \
       and abs(p[1]-g[1]) <= 1 \
       and p[2]==g[2] \
       and p[3]==g[3]:
        match = True
    symbol = "√" if match else "×"
    print(f"[{i:03d}] {symbol} Pred: {pred:<20}  True: {true}")


df = df.rename(columns={'prediction':'predicted_salary','gold':'y_true'})
df['p_parsed'] = df['predicted_salary'].apply(lambda x: parse_salary(x.strip().lower()))
df['g_parsed'] = df['y_true'].apply(lambda x: parse_salary(x.strip().lower()))

TP = df.dropna(subset=['g_parsed']).apply(
    lambda row: (
        row['p_parsed'] is not None
        and abs(row['p_parsed'][0] - row['g_parsed'][0]) <= 1
        and abs(row['p_parsed'][1] - row['g_parsed'][1]) <= 1
        and row['p_parsed'][2] == row['g_parsed'][2]
        and row['p_parsed'][3] == row['g_parsed'][3]
    ),
    axis=1
).sum()

FP = np.sum(
    (df['predicted_salary'] != df['y_true']) &
    (df['predicted_salary'] != "0-0-None-None")
)

FN = np.sum(
    (df['predicted_salary'] == "0-0-None-None") &
    (df['y_true'] != "0-0-None-None")
)

TN = np.sum(
    (df['predicted_salary'] == "0-0-None-None") &
    (df['y_true'] == "0-0-None-None")
)

precision = TP / (TP + FP) if (TP + FP) else 0.0
recall    = TP / (TP + FN) if (TP + FN) else 0.0
f1        = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
accuracy  = (TP + TN) / (TP + FP + TN + FN)

print("\n🔢 Confusion Matrix Counts:")
print(f"  TP: {TP}")
print(f"  FP: {FP}")
print(f"  FN: {FN}")
print(f"  TN: {TN}")

print("\n📊 Metrics:")
print(f"  Precision: {precision:.4f}")
print(f"  Recall:    {recall:.4f}")
print(f"  F1 Score:  {f1:.4f}")
print(f"  Accuracy:  {accuracy:.4f}")


Loaded 2267 examples for evaluation.



Evaluating: 100%|██████████| 2267/2267 [05:44<00:00,  6.58it/s]

[000] √ Pred: 17500-17500-PHP-MONTHLY  True: 17500-17500-PHP-MONTHLY
[001] × Pred: 18000-18000-PHP-MONTHLY  True: 16000-16000-PHP-MONTHLY
[002] √ Pred: 0-0-None-None         True: 0-0-None-None
[003] √ Pred: 0-0-None-None         True: 0-0-None-None
[004] √ Pred: 0-0-None-None         True: 0-0-None-None
[005] √ Pred: 50-60-HKD-HOURLY      True: 50-60-HKD-HOURLY
[006] √ Pred: 0-0-None-None         True: 0-0-None-None
[007] √ Pred: 16000-16000-PHP-MONTHLY  True: 16000-16000-PHP-MONTHLY
[008] × Pred: 25000-25000-PHP-MONTHLY  True: 17500-17500-PHP-MONTHLY
[009] √ Pred: 32-32-NZD-HOURLY      True: 32-32-NZD-HOURLY
[010] √ Pred: 2800-3200-MYR-MONTHLY  True: 2800-3200-MYR-MONTHLY
[011] √ Pred: 65-65-HKD-HOURLY      True: 65-65-HKD-HOURLY
[012] √ Pred: 28-30-NZD-HOURLY      True: 28-30-NZD-HOURLY
[013] √ Pred: 0-0-None-None         True: 0-0-None-None
[014] √ Pred: 0-0-None-None         True: 0-0-None-None
[015] √ Pred: 0-0-None-None         True: 0-0-None-None
[016] √ Pred: 35-35-AUD-HOURLY 




In [11]:
import re
import pandas as pd
import torch
from tqdm import tqdm
import numpy as np
from transformers import T5Tokenizer, T5ForConditionalGeneration

FORMAT_RE = re.compile(r'^\s*\d+\s*-\s*\d+\s*-[A-Za-z]+\s*-[A-Za-z]+\s*$', re.IGNORECASE)
PARSE_RE  = re.compile(r'^\s*(\d+)\s*-\s*(\d+)\s*-\s*([A-Za-z]+)\s*-\s*([A-Za-z]+)\s*$', re.IGNORECASE)

def validate_pred(pred: str) -> str:
    return pred if FORMAT_RE.match(pred) else "0-0-None-None"

def parse_salary(s: str):
    m = PARSE_RE.match(s)
    if not m:
        return None
    return (int(m.group(1)), int(m.group(2)),
            m.group(3).lower(), m.group(4).lower())

tokenizer = T5Tokenizer.from_pretrained("t5-base")
model     = T5ForConditionalGeneration.from_pretrained("./t5-finetuned-salary")
device    = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device).eval()

df_raw = pd.read_csv("../DATASETS/salary_labelled_test_set.csv", encoding="utf-8")
df_raw.columns = ["job_id","job_title","job_ad_details","nation_short_desc",
                  "salary_additional_text","y_true"]

PREFIX = ("Extract the salary info from the job ad below and return it as: "
          "\"MinimumSalary-MaximumSalary-Currency-PayPeriod\"\n\n")
eval_dataset = []
for _, row in df_raw.iterrows():
    inp  = PREFIX + str(row["job_ad_details"])
    gold = row["y_true"].strip() if pd.notna(row["y_true"]) else "0-0-None-None"
    eval_dataset.append({
        "job_id":      row["job_id"],
        "job_title":   row["job_title"],
        "input_text":  inp,
        "gold":        gold
    })

print(f"Loaded {len(eval_dataset)} examples for evaluation.\n")

results = []
for sample in tqdm(eval_dataset, desc="Evaluating"):
    inputs  = tokenizer(sample["input_text"],
                        truncation=True, max_length=512,
                        return_tensors="pt").to(device)
    raw_ids = model.generate(**inputs, max_new_tokens=32, num_beams=4)[0]
    raw_pred = tokenizer.decode(raw_ids, skip_special_tokens=True).strip()
    pred = validate_pred(raw_pred)
    results.append({
        "job_id": sample["job_id"],
        "job_title": sample["job_title"],
        "prediction": pred,
        "gold":       sample["gold"]
    })

df = pd.DataFrame(results)

print("\n🔍 每条预测 vs 真实 比较：\n")
for i, row in df.iterrows():
    pred = row['prediction']
    true = row['gold']
    p = parse_salary(pred)
    g = parse_salary(true)
    match = False
    if p and g \
       and abs(p[0]-g[0]) <= 1 \
       and abs(p[1]-g[1]) <= 1 \
       and p[2]==g[2] \
       and p[3]==g[3]:
        match = True
    symbol = "√" if match else "×"
    print(f"[{i:03d}] {symbol} Pred: {pred:<20}  True: {true}")

df = df.rename(columns={'prediction':'predicted_salary','gold':'y_true'})
df['p_parsed'] = df['predicted_salary'].apply(lambda x: parse_salary(x.strip().lower()))
df['g_parsed'] = df['y_true'].apply(lambda x: parse_salary(x.strip().lower()))

TP = df.dropna(subset=['g_parsed']).apply(
    lambda row: (
        row['p_parsed'] is not None
        and abs(row['p_parsed'][0] - row['g_parsed'][0]) <= 1
        and abs(row['p_parsed'][1] - row['g_parsed'][1]) <= 1
        and row['p_parsed'][2] == row['g_parsed'][2]
        and row['p_parsed'][3] == row['g_parsed'][3]
    ),
    axis=1
).sum()

FP = np.sum(
    (df['predicted_salary'] != df['y_true']) &
    (df['predicted_salary'] != "0-0-None-None")
)

FN = np.sum(
    (df['predicted_salary'] == "0-0-None-None") &
    (df['y_true'] != "0-0-None-None")
)

TN = np.sum(
    (df['predicted_salary'] == "0-0-None-None") &
    (df['y_true'] == "0-0-None-None")
)

precision = TP / (TP + FP) if (TP + FP) else 0.0
recall    = TP / (TP + FN) if (TP + FN) else 0.0
f1        = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
accuracy  = (TP + TN) / (TP + FP + TN + FN)

print("\n🔢 Confusion Matrix Counts:")
print(f"  TP: {TP}")
print(f"  FP: {FP}")
print(f"  FN: {FN}")
print(f"  TN: {TN}")

print("\n📊 Metrics:")
print(f"  Precision: {precision:.4f}")
print(f"  Recall:    {recall:.4f}")
print(f"  F1 Score:  {f1:.4f}")
print(f"  Accuracy:  {accuracy:.4f}")


Loaded 567 examples for evaluation.



Evaluating: 100%|██████████| 567/567 [01:26<00:00,  6.59it/s]


🔍 每条预测 vs 真实 比较：

[000] √ Pred: 1500-1800-MYR-MONTHLY  True: 1500-1800-MYR-MONTHLY
[001] √ Pred: 60-60-HKD-HOURLY      True: 60-60-HKD-HOURLY
[002] × Pred: 75000-85000-AUD-ANNUAL  True: 0-0-None-None
[003] √ Pred: 0-0-None-None         True: 0-0-None-None
[004] √ Pred: 0-0-None-None         True: 0-0-None-None
[005] × Pred: 21-23-AUD-HOURLY      True: 21-21-NZD-HOURLY
[006] √ Pred: 0-0-None-None         True: 0-0-None-None
[007] √ Pred: 0-0-None-None         True: 0-0-None-None
[008] √ Pred: 32-32-AUD-HOURLY      True: 32-32-AUD-HOURLY
[009] × Pred: 1500-2000-MYR-MONTHLY  True: 2000-3000-MYR-MONTHLY
[010] √ Pred: 3000-4000-MYR-MONTHLY  True: 3000-4000-MYR-MONTHLY
[011] √ Pred: 0-0-None-None         True: 0-0-None-None
[012] √ Pred: 0-0-None-None         True: 0-0-None-None
[013] √ Pred: 80-90-HKD-HOURLY      True: 80-90-HKD-HOURLY
[014] √ Pred: 142642-156491-AUD-ANNUAL  True: 142642-156491-AUD-ANNUAL
[015] √ Pred: 0-0-None-None         True: 0-0-None-None
[016] × Pred: 55-65-AUD-HOURL


