In [18]:
%pip install protobuf
import transformers
from transformers import pipeline
from transformers import BertTokenizerFast, BertForQuestionAnswering, TrainingArguments, Trainer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments
from transformers import MT5ForConditionalGeneration, MT5Tokenizer
import pandas as pd
import numpy as np
import torch
from bs4 import BeautifulSoup
from datasets import Dataset
from nltk.tokenize import word_tokenize

print(torch.backends.mps.is_available())

import sys
sys.path.append("../CODE-Baseline")  
import warnings
warnings.filterwarnings('ignore')


from salary_baseline import extract_salary_with_inference


file_path = '../DATASETS/salary_labelled_development_set.csv'
test_file_path = '../DATASETS/salary_labelled_test_set.csv'
ignore_id_path = '../DATASETS/err_salary_develpment.csv'

df = pd.read_csv(file_path)
df_ignore = pd.read_csv(ignore_id_path)
# some y_true is impossible
df = df[~df['job_id'].isin(df_ignore['job_id'])]


model_name = "google/mt5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# qa_pipeline = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=-1)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.
True


In [19]:
# 清理 HTML 标签
def clean_html_tags(html_text):
    soup = BeautifulSoup(html_text, 'html.parser')
    for tag in soup(["script", "style"]):
        tag.decompose()
    return soup.get_text(separator="\n", strip=True)

# 构造输入输出
def row_to_input_output(row):
    context = clean_html_tags(f"{row['job_title']} {row['job_ad_details']}")
    prompt = (
        "Extract the salary info from the job ad below and return it as: "
        "\"MinimumSalary MaximumSalary Currency PayPeriod\"\n\n"
    )
    input_text = prompt + context
    output_text = str(row["y_true"]).strip().replace("-", " ")
    return {"input": input_text, "output": output_text}

# 预处理函数
def preprocess(example):
    model_inputs = tokenizer(example["input"], max_length=512, truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(example["output"], max_length=64, truncation=True, padding="max_length")["input_ids"]
    labels = [token if token != tokenizer.pad_token_id else -100 for token in labels]
    model_inputs["labels"] = labels
    return model_inputs

In [20]:
# tokenized_dataset = squad_dataset
squad_dataset = Dataset.from_list([row_to_input_output(row) for _, row in df.iterrows()])
tokenized_dataset = squad_dataset.map(preprocess)
final_dataset = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
print("training data size：", len(final_dataset['train']))
print("testing data size：", len(final_dataset['test']))



Map:   0%|          | 0/2133 [00:00<?, ? examples/s]

training data size： 1706
testing data size： 427


In [21]:
# # 查看训练集中的两个样本
# for i in range(10):
#     print(f"\nExample {i+1}:")
#     print("Input:", final_dataset["train"][i])
#     print("Input:", final_dataset["train"][i]["input"])
#     print("Output:", final_dataset["train"][i]["output"])

In [22]:
# print(transformers.__version__)
def compute_metrics(p):
    preds, labels = p

    # 有些模型的 preds 是 tuple，取第一个元素
    if isinstance(preds, tuple):
        preds = preds[0]

    # 转成 list 并移除 -100
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # 对于 labels，要先把 -100 替换为 pad_token_id，防止 decode 出错
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # 计算 F1 分数
    f1_scores = []
    for pred, label in zip(decoded_preds, decoded_labels):
        pred_tokens = word_tokenize(pred.lower())
        label_tokens = word_tokenize(label.lower())

        common = set(pred_tokens) & set(label_tokens)
        if len(pred_tokens) == 0 or len(label_tokens) == 0:
            f1 = 0
        else:
            precision = len(common) / len(pred_tokens)
            recall = len(common) / len(label_tokens)
            f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        f1_scores.append(f1)

    return {
        "f1": np.mean(f1_scores)
    }
    
training_args = Seq2SeqTrainingArguments(
    output_dir="./mt5-base-salary",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=35,
    weight_decay=0.01,
    logging_dir="./logs",
    predict_with_generate=True,
    # save_total_limit=2,
    save_strategy="epoch",
    load_best_model_at_end=True,  
    metric_for_best_model="f1",  
    greater_is_better=True,  
)


trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=final_dataset["train"],
    eval_dataset=final_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics  
)

trainer.train()


Epoch,Training Loss,Validation Loss,F1
1,No log,0.94992,0.214156
2,6.323700,0.330867,0.598184
3,1.059200,0.28591,0.627384
4,0.600000,0.237781,0.649855
5,0.484400,0.231004,0.650859
6,0.350800,0.239855,0.654734
7,0.350800,0.261861,0.648192
8,0.319200,0.221385,0.65808
9,0.259400,0.208767,0.662763
10,0.253400,0.188928,0.659836


SafetensorError: Error while serializing: IoError(Os { code: 28, kind: StorageFull, message: "No space left on device" })

In [None]:
trainer.save_model("./mt5-base-salary")
model.eval()

In [None]:
model_name = "./mt5-base-salary"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model.to('cpu')
qa_pipeline = pipeline("text2text-generation", model=model, tokenizer=tokenizer)

test_samples = [final_dataset["test"][i] for i in range(100)]

# # 生成并打印结果
for i, example in enumerate(test_samples):
    sample = example['input']
    print(f"Test Sample {i + 1}: {sample}")
    output = qa_pipeline(sample)
    print(f"Prediction: {output[0]['generated_text']}")
    print(f"Answer: {example['output']}\n")
