In [1]:
%pip install protobuf
import transformers
from transformers import pipeline
from transformers import BertTokenizerFast, BertForQuestionAnswering, TrainingArguments, Trainer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments
from transformers import MT5ForConditionalGeneration, MT5Tokenizer
import pandas as pd
import numpy as np
import torch
from bs4 import BeautifulSoup
from datasets import Dataset
from nltk.tokenize import word_tokenize

print(torch.backends.mps.is_available())

import sys
sys.path.append("../CODE-Baseline")  
import warnings
warnings.filterwarnings('ignore')


from salary_baseline import extract_salary_with_inference


file_path = '../DATASETS/salary_labelled_development_set.csv'
test_file_path = '../DATASETS/salary_labelled_test_set.csv'
ignore_id_path = '../DATASETS/err_salary_develpment.csv'

df = pd.read_csv(file_path)
df_ignore = pd.read_csv(ignore_id_path)
# some y_true is impossible
df = df[~df['job_id'].isin(df_ignore['job_id'])]


model_name = "google/mt5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# qa_pipeline = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=-1)

Note: you may need to restart the kernel to use updated packages.
True
Development dataset:
Precision: 0.8357
Recall: 0.9524
F1 Score: 0.8902
Accuracy: 0.8889

Test dataset:
Precision: 0.7412
Recall: 0.9206
F1 Score: 0.8212
Accuracy: 0.8219


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [2]:
# Function to remove HTML tags from job ad text
def clean_html_tags(html_text):
    soup = BeautifulSoup(html_text, 'html.parser')
    # Remove <script> and <style> tags completely
    for tag in soup(["script", "style"]):
        tag.decompose()
    # Return clean text with line breaks between blocks
    return soup.get_text(separator="\n", strip=True)

# Function to convert a DataFrame row into input-output format
def row_to_input_output(row):
    # Clean and concatenate job title and ad details as context
    context = clean_html_tags(f"{row['job_title']} {row['job_ad_details']}")
    # Define the instruction prompt
    prompt = (
        "Extract the salary info from the job ad below and return it as: "
        "\"MinimumSalary MaximumSalary Currency PayPeriod\"\n\n"
    )
    input_text = prompt + context
    # Normalize the label by converting to string and replacing dashes with spaces
    output_text = str(row["y_true"]).strip().replace("-", " ")
    return {"input": input_text, "output": output_text}

# Function to tokenize input-output pairs for training
def preprocess(example):
    # Tokenize the input with padding and truncation (max 512 tokens)
    model_inputs = tokenizer(example["input"], max_length=512, truncation=True, padding="max_length")
    
    # Tokenize the target output sequence (label)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(example["output"], max_length=64, truncation=True, padding="max_length")["input_ids"]
    
    # Replace padding token IDs with -100 so they are ignored in the loss computation
    labels = [token if token != tokenizer.pad_token_id else -100 for token in labels]
    
    # Add the labels to the model input dictionary
    model_inputs["labels"] = labels
    return model_inputs

In [3]:
# tokenized_dataset = squad_dataset
squad_dataset = Dataset.from_list([row_to_input_output(row) for _, row in df.iterrows()])
tokenized_dataset = squad_dataset.map(preprocess)
final_dataset = tokenized_dataset.train_test_split(test_size=0.2, seed=24)
print("training data size：", len(final_dataset['train']))
print("testing data size：", len(final_dataset['test']))



Map:   0%|          | 0/2133 [00:00<?, ? examples/s]

training data size： 1706
testing data size： 427


In [4]:
# # 查看训练集中的两个样本
# for i in range(10):
#     print(f"\nExample {i+1}:")
#     print("Input:", final_dataset["train"][i])
#     print("Input:", final_dataset["train"][i]["input"])
#     print("Output:", final_dataset["train"][i]["output"])

In [5]:
# print(transformers.__version__)
def compute_metrics(p):
    preds, labels = p

    # 有些模型的 preds 是 tuple，取第一个元素
    if isinstance(preds, tuple):
        preds = preds[0]

    # 转成 list 并移除 -100
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # 对于 labels，要先把 -100 替换为 pad_token_id，防止 decode 出错
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # 计算 F1 分数
    f1_scores = []
    for pred, label in zip(decoded_preds, decoded_labels):
        pred_tokens = word_tokenize(pred.lower())
        label_tokens = word_tokenize(label.lower())

        common = set(pred_tokens) & set(label_tokens)
        if len(pred_tokens) == 0 or len(label_tokens) == 0:
            f1 = 0
        else:
            precision = len(common) / len(pred_tokens)
            recall = len(common) / len(label_tokens)
            f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        f1_scores.append(f1)

    return {
        "f1": np.mean(f1_scores)
    }
    
training_args = Seq2SeqTrainingArguments(
    output_dir="./mt5-base-salary",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=35,
    weight_decay=0.01,
    logging_dir="./logs",
    predict_with_generate=True,
    save_total_limit=20,
    save_strategy="epoch",
    load_best_model_at_end=True,  
    metric_for_best_model="f1",  
    greater_is_better=True,  
)


trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=final_dataset["train"],
    eval_dataset=final_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics  
)

trainer.train()


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.975163,0.213115
2,7.100700,0.349029,0.590192
3,1.101100,0.296242,0.63243
4,0.665100,0.235849,0.645199
5,0.429100,0.23419,0.656295
6,0.329800,0.245269,0.66176
7,0.329800,0.236968,0.669204
8,0.257200,0.233077,0.670542
9,0.224700,0.215025,0.675096
10,0.200600,0.211215,0.675096


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


TrainOutput(global_step=14945, training_loss=0.4210079449640544, metrics={'train_runtime': 19096.5779, 'train_samples_per_second': 3.127, 'train_steps_per_second': 0.783, 'total_flos': 7.159504452452352e+16, 'train_loss': 0.4210079449640544, 'epoch': 35.0})

In [6]:
trainer.save_model("./mt5-base-salary")
model.eval()

MT5ForConditionalGeneration(
  (shared): Embedding(250112, 768)
  (encoder): MT5Stack(
    (embed_tokens): Embedding(250112, 768)
    (block): ModuleList(
      (0): MT5Block(
        (layer): ModuleList(
          (0): MT5LayerSelfAttention(
            (SelfAttention): MT5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): MT5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): MT5LayerFF(
            (DenseReluDense): MT5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
         

In [7]:
model_name = "./mt5-base-salary"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
# model.to('cpu')
qa_pipeline = pipeline("text2text-generation", model=model, tokenizer=tokenizer)

test_samples = [final_dataset["test"][i] for i in range(100)]

# # 生成并打印结果
for i, example in enumerate(test_samples):
    sample = example['input']
    print(f"Test Sample {i + 1}: {sample}")
    output = qa_pipeline(sample)
    print(f"Prediction: {output[0]['generated_text']}")
    print(f"Answer: {example['output']}\n")


Device set to use mps:0


Test Sample 1: Extract the salary info from the job ad below and return it as: "MinimumSalary MaximumSalary Currency PayPeriod"

ADMIN SECRETARY
Full time administrative secretary
Position Monthly Salary : RM3000-RM4000
JOIN OUR TEAM NOW.
Job Requirements:
• Got experience in used cars (If not, training will be provided)
• With knowledge of MS Office and Website communication software
• Responsible and Self-motivated
• Willing to learn, Active and Hardworking
• Can start immediately
Working Hour : Monday to Friday (9: 30am - 6: 00pm)
Saturday (9: 30am - 1: 00pm)
Working Location: 7-G, Multilevel Car Park, Jalan Equine 10E, Taman
Equine, 43300 Seri Kembangan Selangor Malaysia
Prediction: 3000 4000 MYR MONTHLY
Answer: 3000 4000 MYR MONTHLY

Test Sample 2: Extract the salary info from the job ad below and return it as: "MinimumSalary MaximumSalary Currency PayPeriod"

IT Engineers - Software, Apps and SQL Support / OJT Provided
Information
$2600 to $3400 +Bonuses
SGX Listed Establishment
