In [None]:
%pip install protobuf
import transformers
from transformers import pipeline
from transformers import BertTokenizerFast, BertForQuestionAnswering, TrainingArguments, Trainer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments
from transformers import MT5ForConditionalGeneration, MT5Tokenizer
import pandas as pd
import numpy as np
import torch
from bs4 import BeautifulSoup
from datasets import Dataset
import re
print(torch.backends.mps.is_available())

import sys
sys.path.append("../CODE-Baseline")  
import warnings
warnings.filterwarnings('ignore')


from salary_baseline import extract_salary_with_inference


file_path = '../../MISC/salary_labelled_development_set.csv'
test_file_path = '../../MISC/salary_labelled_test_set.csv'
df = pd.read_csv(file_path)
tdf=pd.read_csv(test_file_path)
model_name = "./mt5-base-salary"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
# model.to('cpu')
qa_pipeline = pipeline("text2text-generation", model=model, tokenizer=tokenizer)



Note: you may need to restart the kernel to use updated packages.
True


Device set to use mps:0


In [4]:
# 清理 HTML 标签
def clean_html_tags(html_text):
    soup = BeautifulSoup(html_text, 'html.parser')
    for tag in soup(["script", "style"]):
        tag.decompose()
    return soup.get_text(separator="\n", strip=True)

# 构造输入输出
def row_to_input_output(row):
    context = clean_html_tags(f"{row['job_title']} {row['job_ad_details']}")
    prompt = (
        "Extract the salary info from the job ad below and return it as: "
        "\"MinimumSalary MaximumSalary Currency PayPeriod\"\n\n"
    )
    input_text = prompt + context
    output_text = str(row["y_true"]).strip().replace("-", " ")
    return {"input": input_text, "output": output_text}

# 预处理函数
def preprocess(example):
    model_inputs = tokenizer(example["input"], max_length=512, truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(example["output"], max_length=64, truncation=True, padding="max_length")["input_ids"]
    labels = [token if token != tokenizer.pad_token_id else -100 for token in labels]
    model_inputs["labels"] = labels
    return model_inputs



In [14]:
import re

def generate_prediction(row):
    input_text = row_to_input_output(row)["input"]
    output = qa_pipeline(input_text)
    predict = output[0]["generated_text"].strip()

    # 匹配格式：数字 数字 字母 字母
    pattern = r"^(\d+(?:\.\d+)?)\s+(\d+(?:\.\d+)?)\s+([A-Za-z]+)\s+([A-Za-z]+)$"
    match = re.match(pattern, predict)

    if match:
        min_pred, max_pred, currency_pred, period_pred = match.groups()
        # 四舍五入并转为整数
        min_pred = str(round(float(min_pred)))
        max_pred = str(round(float(max_pred)))
        return f"{min_pred}-{max_pred}-{currency_pred}-{period_pred}"
    else:
        return "0-0-None-None"

# def generate_new_y_true(row):
#     output_text = row_to_input_output(row)["output"]

#     return output_text
  
  
def fuzzy_equal(predict, y_true):
    pattern = r"(\d+)-(\d+)-([A-Z]+)-([A-Z]+)"

    if predict=="0-0-None-None" and y_true=="0-0-None-None":
      return True
    match_pred = re.match(pattern, predict)
    match_true = re.match(pattern, y_true)

    if not match_pred or not match_true:
      return False  

    min_pred, max_pred, currency_pred, period_pred = match_pred.groups()
    min_true, max_true, currency_true, period_true = match_true.groups()

    # 转为整数
    min_pred, max_pred = int(min_pred), int(max_pred)
    min_true, max_true = int(min_true), int(max_true)

    # 允许1的误差
    min_ok = abs(min_pred - min_true) <= 1
    max_ok = abs(max_pred - max_true) <= 1

    # 货币和周期必须严格相等
    currency_ok = currency_pred == currency_true
    period_ok = period_pred == period_true

    return min_ok and max_ok and currency_ok and period_ok


In [16]:

df["predicted_salary"] = df.apply(generate_prediction, axis=1)
# df["y_true"] = df.apply(generate_y_true, axis=1)

TP = np.sum((df['predicted_salary'] == df['y_true']) & (df['y_true'] != "0-0-None-None"))
FP = np.sum((df['predicted_salary'] != df['y_true']) & (df['predicted_salary'] != "0-0-None-None"))
FN = np.sum((df['predicted_salary'] == "0-0-None-None") & (df['y_true'] != "0-0-None-None"))
TN = np.sum((df['predicted_salary'] == "0-0-None-None") & (df['y_true'] == "0-0-None-None"))


precision = TP / (TP + FP) if (TP + FP) != 0 else 0
recall = TP / (TP + FN) if (TP + FN) != 0 else 0
f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0
accuracy = (TP + TN) / (FP + FN + TP + TN)

print("Development dataset:")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Accuracy: {accuracy:.4f}")

'''
Development dataset:
Precision: 0.8826
Recall: 0.9803
F1 Score: 0.9289
Accuracy: 0.9228
'''


Development dataset:
Precision: 0.8826
Recall: 0.9803
F1 Score: 0.9289
Accuracy: 0.9228


'\nDevelopment dataset:\nPrecision: 0.8539\nRecall: 0.9734\nF1 Score: 0.9098\nAccuracy: 0.9038\n'

In [13]:
tdf=pd.read_csv(test_file_path)
tdf["predicted_salary"] = tdf.apply(generate_prediction, axis=1)
# tdf["y_true"] = tdf.apply(generate_y_true, axis=1)

TP = np.sum((tdf['predicted_salary'] == tdf['y_true']) & (tdf['y_true'] != "0-0-None-None"))
FP = np.sum((tdf['predicted_salary'] != tdf['y_true']) & (tdf['predicted_salary'] != "0-0-None-None"))
FN = np.sum((tdf['predicted_salary'] == "0-0-None-None") & (tdf['y_true'] != "0-0-None-None"))
TN = np.sum((tdf['predicted_salary'] == "0-0-None-None") & (tdf['y_true'] == "0-0-None-None"))


precision = TP / (TP + FP) if (TP + FP) != 0 else 0
recall = TP / (TP + FN) if (TP + FN) != 0 else 0
f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0
accuracy = (TP + TN) / (FP + FN + TP + TN)

print("Test dataset:")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Accuracy: {accuracy:.4f}")

'''
Test dataset:
Precision: 0.7987
Recall: 0.9549
F1 Score: 0.8699
Accuracy: 0.8660
'''

Test dataset:
Precision: 0.7987
Recall: 0.9549
F1 Score: 0.8699
Accuracy: 0.8660


'\nTest dataset:\nPrecision: 0.7918\nRecall: 0.9544\nF1 Score: 0.8655\nAccuracy: 0.8624\n'

In [17]:
df['is_positive'] = df['predicted_salary'] != "0-0-None-None"
# fuzzy 匹配
df['is_fuzzy_match'] = df.apply(lambda row: fuzzy_equal(row['predicted_salary'], row['y_true']), axis=1)

# 计算 TP / FP / FN / TN
TP = np.sum(df['is_fuzzy_match'] & df['is_positive'])
FP = np.sum(~df['is_fuzzy_match'] & df['is_positive'])
FN = np.sum(~df['is_fuzzy_match'] & ~df['is_positive'])
TN = np.sum(df['is_fuzzy_match'] & ~df['is_positive'])


precision = TP / (TP + FP) if (TP + FP) != 0 else 0
recall = TP / (TP + FN) if (TP + FN) != 0 else 0
f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0
accuracy = (TP + TN) / (FP + FN + TP + TN)

print("Development dataset:")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Accuracy: {accuracy:.4f}")

'''
Development dataset:
Precision: 0.9081
Recall: 0.9808
F1 Score: 0.9431
Accuracy: 0.9374
'''

Development dataset:
Precision: 0.9081
Recall: 0.9808
F1 Score: 0.9431
Accuracy: 0.9374


'\nDevelopment dataset:\nPrecision: 0.8959\nRecall: 0.9746\nF1 Score: 0.9336\nAccuracy: 0.9277\n'

In [15]:
tdf['is_positive'] = tdf['predicted_salary'] != "0-0-None-None"
# fuzzy 匹配
tdf['is_fuzzy_match'] = tdf.apply(lambda row: fuzzy_equal(row['predicted_salary'], row['y_true']), axis=1)

# 计算 TP / FP / FN / TN
TP = np.sum(tdf['is_fuzzy_match'] & tdf['is_positive'])
FP = np.sum(~tdf['is_fuzzy_match'] & tdf['is_positive'])
FN = np.sum(~tdf['is_fuzzy_match'] & ~tdf['is_positive'])
TN = np.sum(tdf['is_fuzzy_match'] & ~tdf['is_positive'])


precision = TP / (TP + FP) if (TP + FP) != 0 else 0
recall = TP / (TP + FN) if (TP + FN) != 0 else 0
f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0
accuracy = (TP + TN) / (FP + FN + TP + TN)

print("Test dataset:")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Accuracy: {accuracy:.4f}")
'''
Test dataset:
Precision: 0.8396
Recall: 0.9570
F1 Score: 0.8945
Accuracy: 0.8889
'''

Test dataset:
Precision: 0.8396
Recall: 0.9570
F1 Score: 0.8945
Accuracy: 0.8889


'\nTest dataset:\nPrecision: 0.8360\nRecall: 0.9567\nF1 Score: 0.8923\nAccuracy: 0.8871\n'