In [1]:
%pip install protobuf
import transformers
from transformers import pipeline
from transformers import BertTokenizerFast, BertForQuestionAnswering, TrainingArguments, Trainer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments
from transformers import MT5ForConditionalGeneration, MT5Tokenizer
import pandas as pd
import numpy as np
import torch
from bs4 import BeautifulSoup
from datasets import Dataset

print(torch.backends.mps.is_available())

import sys
sys.path.append("../CODE-Baseline")  
import warnings
warnings.filterwarnings('ignore')


from salary_baseline import extract_salary_with_inference


file_path = '../DATASETS/salary_labelled_development_set.csv'
test_file_path = '../DATASETS/salary_labelled_test_set.csv'
df = pd.read_csv(file_path)

model_name = "google/mt5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

qa_pipeline = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=-1)

Note: you may need to restart the kernel to use updated packages.
True
Development dataset:
Precision: 0.7811
Recall: 0.9370
F1 Score: 0.8519
Accuracy: 0.8489

Test dataset:
Precision: 0.7412
Recall: 0.9206
F1 Score: 0.8212
Accuracy: 0.8219


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Device set to use cpu


In [2]:
# html -> context 
def clean_html_tags(html_text):
    soup = BeautifulSoup(html_text, 'html.parser')
    for tag in soup(["script", "style"]):
        tag.decompose()
    text = soup.get_text(separator=" ", strip=True)
    return text

In [3]:
def row_to_squad(row):
    text = str(row['job_title']) + " " + str(row["job_ad_details"])
    context = clean_html_tags(text)

    # prompt & answer
    Q = (
        "Extract the salary info from the job ad below and return it as: "
        "\"MinimumSalary-MaximumSalary-Currency-PayPeriod\"\n\n"
    )
    answer_text = str(row["y_true"]).strip()
    answer_text = answer_text.replace('-',' ')
    return {
        "input": Q+context,
        "output": answer_text
    }



squad_data = [row_to_squad(row) for _, row in df.iterrows()]
squad_dataset = Dataset.from_list(squad_data)

In [4]:
# Tokenize
def preprocess(example):
    inputs = tokenizer(example["input"], max_length=512, truncation=True, padding="max_length")
    targets = tokenizer(example["output"], max_length=64, truncation=True, padding="max_length")
    # print(targets)
    inputs["labels"] = targets["input_ids"]
    return inputs

tokenized_dataset = squad_dataset.map(preprocess)

Map:   0%|          | 0/2267 [00:00<?, ? examples/s]

In [5]:
# tokenized_dataset = squad_dataset
final_dataset = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
print("training data size：", len(final_dataset['train']))
print("testing data size：", len(final_dataset['test']))



training data size： 1813
testing data size： 454


In [6]:
# 查看训练集中的两个样本
for i in range(10):
    print(f"\nExample {i+1}:")
    print("Input:", final_dataset["train"][i])
    print("Input:", final_dataset["train"][i]["input"])
    print("Output:", final_dataset["train"][i]["output"])


Example 1:
Input: {'input': 'Extract the salary info from the job ad below and return it as: "MinimumSalary-MaximumSalary-Currency-PayPeriod"\n\n顧客服務代表- 熱線中心 (Customer Service Representative- Call Centre) 我們的客戶正在尋找顧客服務代表（Call Centre） 職責: 客戶服務熱線運作，包括解答查詢和處理投訴 建立有關客戶互動和投訴的報告 提出改善客戶體驗的建議 跟進及確保部門運作順利 提供行政支援和協助 Requirements: 中五或以上任何學科 1 年客戶服務接線經驗，包括投訴處理（電器行業相關優先） 良好的人際溝通與溝通技巧 能夠承受工作壓力 經驗較少者亦會考慮 可立即上班優先 具經驗者會考慮為顧客服務高級主任 Interested candidates please send your MS Word version resume with expected salary and notice period to ivan.cheng@manpowergrc.hk.', 'output': '0 0 None None', 'input_ids': [67893, 287, 163829, 276, 4933, 702, 287, 8185, 1341, 13702, 305, 4649, 609, 527, 267, 313, 72347, 229971, 276, 264, 83964, 229971, 276, 264, 124054, 264, 66838, 43203, 311, 259, 115561, 79585, 19539, 264, 259, 15862, 10361, 11834, 274, 145684, 5124, 259, 111240, 264, 10633, 10747, 271, 259, 68426, 493, 209051, 31324, 152645, 24281, 115561, 79585, 19539, 312, 56523, 10747, 271, 259, 36074, 99126, 267, 259

In [7]:
# print(transformers.__version__)
training_args = Seq2SeqTrainingArguments(
    output_dir="./mt5-base-salary",
    eval_strategy="steps",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=20,
    weight_decay=0.01,
    logging_dir="./logs",
    predict_with_generate=True,
    save_total_limit=2,
    save_strategy="epoch",
)


trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=final_dataset["train"],
    eval_dataset=final_dataset["test"],
    tokenizer=tokenizer,
)

trainer.train()


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss,Validation Loss
500,22.3465,2.881392
1000,1.6447,0.370893
1500,1.7373,2.851604
2000,1.9172,1.449995
2500,1.5722,0.160239
3000,0.855,0.163423
3500,0.6488,0.08911
4000,0.3418,0.083962
4500,0.3263,0.042207
5000,0.2745,0.035972


TrainOutput(global_step=9080, training_loss=1.7981705485724142, metrics={'train_runtime': 10706.9416, 'train_samples_per_second': 3.387, 'train_steps_per_second': 0.848, 'total_flos': 4.347741273587712e+16, 'train_loss': 1.7981705485724142, 'epoch': 20.0})

In [8]:
# import os
# # train start or 
# continue_train = False
# model_path = "./mt5-base-salary"


# if os.path.exists(model_path):
#     if continue_train:
#         trainer.train(resume_from_checkpoint=True)
#     else:
#         print("Model already exists. Skipping training.")
# else:
#     trainer.train()


In [9]:
trainer.save_model("./mt5-base-salary")
model.eval()

MT5ForConditionalGeneration(
  (shared): Embedding(250112, 768)
  (encoder): MT5Stack(
    (embed_tokens): Embedding(250112, 768)
    (block): ModuleList(
      (0): MT5Block(
        (layer): ModuleList(
          (0): MT5LayerSelfAttention(
            (SelfAttention): MT5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): MT5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): MT5LayerFF(
            (DenseReluDense): MT5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
         

In [10]:
model_name = "./mt5-base-salary"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model.to('cpu')
qa_pipeline = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=-1)

test_samples = [final_dataset["test"][i] for i in range(100)]

# # 生成并打印结果
for i, example in enumerate(test_samples):
    sample = example['input']
    print(f"Test Sample {i + 1}: {sample}")
    output = qa_pipeline(sample)
    print(f"Prediction: {output[0]['generated_text']}")
    print(f"Answer: {example['output']}\n")


Device set to use cpu


Test Sample 1: Extract the salary info from the job ad below and return it as: "MinimumSalary-MaximumSalary-Currency-PayPeriod"

Digital Marketing Officer Commitment to Safety Competitive Salary & Benefit Package Cabuyao, Laguna James Hardie Industries is the world’s #1 producer and marketer of high-performance fibre cement and fibre gypsum building solutions. As a local manufacturer, trusted innovator and industry leader James Hardie empowers homeowners across Asia Pacific to achieve their dream home with premium quality exterior cladding solutions. The company has transformed to become A NEW JAMES HARDIE that consistently provides value to our customers, employees and shareholders. Our vision is to be a PREMIER, CONSUMER BRANDED COMPANY, that offers ENDLESS DESIGN POSSIBILITIES to EXTERIORS and INTERIORS of the home, transforming the way the world builds. Our business is fast-paced, and our people are smart in the way they solve problems, driven in their pursuit of results, and real 