In [48]:
import json
from pathlib import Path

In [3]:
# Working directory for every file this notebook writes or reads back
# (batch request file, model output, samples and train/test splits).
sub_dir = Path("../../data") / "resume"
# parents=True: also create "../../data" when it does not exist yet instead of
# failing with FileNotFoundError; exist_ok=True keeps the cell re-runnable.
sub_dir.mkdir(parents=True, exist_ok=True)

## 生成prompt文件

In [None]:
from uuid import uuid4

In [52]:
# Load the raw résumés, one per line.
# Each line is stripped and blank lines are dropped: the lines are later joined
# into a bullet list inside the prompt, where a trailing "\n" would double the
# line breaks and an empty line would become an empty "- " bullet.
with open("../../data/resume_data.txt", "r", encoding="utf-8") as f:
    texts = [stripped for line in f if (stripped := line.strip())]
len(texts)

4679

In [53]:
# Prompt template for the résumé-normalisation request.
# - It is filled with str.format(): the single "{}" at the end receives the
#   bullet list of résumés, and literal JSON braces are escaped as "{{" / "}}".
# - Despite its name, the formatted text is sent as a *user* message when the
#   batch tasks are built.
# - The date format reads "YYYY.mm-YYYY.mm" (dots on both sides), matching the
#   example entries and the pattern the post-processing step parses.
# - The example is wrapped in a list so it agrees with the declared output
#   format, which is a list of person objects.
system_prompt = r"""
## 任务描述
你需要浏览每一条人员简历，根据简历中人员的在职时间和职位名称构造人员职位晋升数据集，不要回答用户问题
对于其中不明确的部分，你必须进行编造，保证你的回答符合输出格式
每个人员的简历中，必须包含起始年月，职位名称中必须包含完整的公司名称和职位名称

## 输出格式
[
  {{
    "name": "<姓名>",
    "resumes": [
      "<在职起始年月，格式YYYY.mm-YYYY.mm> <职位全称>",
      ...
    ]
  }},
  ...
]

# 输出示例
[
  {{
    "name": "张明",
    "resumes": [
      "2010.07-2015.03 上海市徐汇区发改委科员",
      "2015.04-2018.06 上海市徐汇区发改委副主任",
      "2018.07-2021.11 上海市虹口区政府办公室主任",
      "2021.12-至今 上海市虹口区副区长"
    ]
  }}
]

# 人员简历
{}
"""

In [54]:
# Number of résumé lines packed into a single prompt / batch request.
batch_size = 3

In [55]:
# Cut the résumé lines into consecutive groups of `batch_size` (the last group
# may be shorter), then render one prompt per group with the résumés as a
# "- " bullet list.
batch_starts = range(0, len(texts), batch_size)
batch_texts = [texts[start:start + batch_size] for start in batch_starts]
batch_prompt = [
    system_prompt.format("\n".join("- " + line for line in chunk))
    for chunk in batch_texts
]
len(batch_prompt)

1560

In [56]:
# Build one batch-API request per prompt. Every request gets a fresh random
# 32-character hex id so its result can be matched back afterwards.
tasks = []
for prompt in batch_prompt:
    request_body = {
        "model": "glm-4-flash",
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.5
    }
    tasks.append({
        "custom_id": uuid4().hex,  # same text as str(uuid4()) without the dashes
        "method": "POST",
        "url": "/v4/chat/completions",
        "body": request_body
    })

In [57]:
# Write the requests as JSON Lines: one JSON object per line, non-ASCII
# characters kept as-is rather than \u-escaped.
with open(sub_dir / "resume_data.jsonl", "w", encoding="utf-8") as f:
    f.writelines(json.dumps(task, ensure_ascii=False) + "\n" for task in tasks)

## 处理结果文件

In [57]:
import random
import re
from datetime import datetime
import json_repair
from dateutil.relativedelta import relativedelta

In [99]:
def loader():
    """Lazily yield the records stored in ``output.jsonl`` under ``sub_dir``.

    Each line of the file is parsed with ``json_repair.loads`` and yielded in
    file order. The file stays open only while the generator is being consumed.
    """
    result_path = sub_dir / "output.jsonl"
    with open(result_path, "r", encoding="utf-8") as f:
        yield from map(json_repair.loads, f)

In [112]:
# Collect every person object returned by the model into one flat list.
all_data = []
for record in loader():
    # The model's answer is JSON text nested inside the batch response envelope.
    content = record["response"]["body"]["choices"][0]["message"]["content"]
    parsed = json_repair.loads(content)
    # The prompt asks for a list of person objects, but a reply may be a single
    # object (the prompt's example shows one) or not JSON at all. Extending with
    # a dict would add its *keys* as "people", and extending with a string would
    # add single characters, so normalise the shape first.
    if isinstance(parsed, dict):
        parsed = [parsed]
    elif not isinstance(parsed, list):
        parsed = []
    all_data.extend(person for person in parsed if isinstance(person, dict))
len(all_data)

4680

In [101]:
# Peek at the first parsed person record to check its structure.
all_data[0]

{'name': '周文香', 'resumes': ['1989.09-2008.10 个体工商户', '2008.11-2016.03 有限公司监事']}

### 标准化

In [102]:
def generate_end_date(start_date: datetime) -> datetime:
    """Return a random end date that lies 1 to 36 months after ``start_date``.

    Args:
        start_date: Beginning of the interval.

    Returns:
        ``start_date`` shifted forward by a uniformly drawn whole number of
        months (both bounds inclusive).
    """
    months_in_role = random.randint(1, 36)
    return start_date + relativedelta(months=months_in_role)

In [103]:
def generate_date_interval() -> str:
    """Make up a random employment interval formatted as ``YYYY.mm-YYYY.mm``.

    The start is the first day of a random month between 2000 and 2020
    (inclusive); the end is produced by ``generate_end_date`` from that start.

    Returns:
        The interval string, e.g. ``"2015.09-2017.07"``.
    """
    year = random.randint(2000, 2020)
    month = random.randint(1, 12)
    begin = datetime(year, month, 1)
    finish = generate_end_date(begin)
    return f"{begin:%Y.%m}-{finish:%Y.%m}"

In [63]:
# Smoke test: returns a random "YYYY.mm-YYYY.mm" interval (the value differs between runs).
generate_date_interval()

'2015.09-2017.07'

In [113]:
def _normalize_month(month: int) -> int:
    """Return ``month`` when it is a valid calendar month (1-12), else a random one."""
    return month if 1 <= month <= 12 else random.randint(1, 12)


def process_resume(person_resume: str) -> str:
    """Normalise one résumé entry to ``"YYYY.mm-YYYY.mm <title>"``.

    The entry is expected to look like ``"<interval> <title>"``. Anything that
    cannot be parsed is replaced with made-up but well-formed values:

    * no space-separated interval token, or a token that does not start with
      ``YYYY.mm-``: a completely random interval is used;
    * a month outside 1-12: replaced by a random month;
    * an end that is not ``YYYY.mm`` (e.g. ``至今``), or one that would fall
      before the start: replaced by a random end date after the start.

    Spaces inside the title are removed.

    Args:
        person_resume: Raw entry text as returned by the model.

    Returns:
        The normalised entry.

    Raises:
        ValueError: If a parsed year is outside the range ``datetime`` supports
            (e.g. ``0000``).
    """
    # Strip first so stray leading/trailing whitespace is not taken for the
    # separator between the interval and the title.
    parts = person_resume.strip().split(" ", maxsplit=1)
    if len(parts) == 1:
        # Only a title was given: invent the whole interval.
        return f"{generate_date_interval()} {parts[0]}"

    period, title = parts
    title = title.replace(" ", "")

    start_match = re.match(r"(\d{4})\.(\d{2})-(.*)", period)
    if start_match is None:
        return f"{generate_date_interval()} {title}"

    # Repair the start. December (12) is a valid month and must be kept.
    start_month = _normalize_month(int(start_match.group(2)))
    start_date = datetime(int(start_match.group(1)), start_month, 1)

    # Repair the end.
    end_date = None
    if end_match := re.match(r"(\d{4})\.(\d{2})", start_match.group(3)):
        end_month = _normalize_month(int(end_match.group(2)))
        end_date = datetime(int(end_match.group(1)), end_month, 1)
    if end_date is None or end_date < start_date:
        # Open-ended or unparsable end, or an end that precedes the start
        # (possibly produced by the month repair above): make one up.
        end_date = generate_end_date(start_date)

    return f"{start_date.strftime('%Y.%m')}-{end_date.strftime('%Y.%m')} {title}"

In [114]:
# The time token has no parsable start month, so the whole interval is replaced by a random one.
process_resume("至今 xxxx")

'2013.06-2014.08 xxxx'

In [115]:
# Start month "00" is invalid and the end is open-ended ("至今"): both are filled in randomly.
process_resume("2017.00-至今 xxxx")

'2017.03-2017.10 xxxx'

In [116]:
# A well-formed interval is kept unchanged; spaces inside the title are removed.
process_resume("2017.02-2018.05 xxxx ssss")

'2017.02-2018.05 xxxxssss'

In [117]:
# No space-separated time token at all: a random interval is put in front of the title.
process_resume("xxxx")

'2009.04-2009.08 xxxx'

In [118]:
# Both months are out of range ("00" and "13"): each one is replaced by a random month.
process_resume("2017.00-2020.13 xxx")

'2017.01-2020.03 xxx'

In [119]:
# Normalise every résumé entry in place.
# process_resume() fills gaps with random values, so seed the generator first;
# without it the resulting dataset differs on every run. 200 is the same value
# the notebook uses as `seed` for the train/test split (that variable is
# defined in a later cell, so it cannot be referenced here).
random.seed(200)
for person in all_data:
    resumes = person["resumes"]
    for i in range(len(resumes)):
        try:
            resumes[i] = process_resume(resumes[i])
        except Exception as e:
            # Best effort: report the offending record and carry on, leaving
            # that entry unchanged.
            print(person)
            print(e)

### 过滤

In [120]:
# Keep only the people who have at least three résumé entries.
all_data = list(filter(lambda person: len(person["resumes"]) >= 3, all_data))
len(all_data)

4153

### 按起始时间排序

In [92]:
def datetime_key(resume: str) -> datetime:
    """Sort key for a résumé entry: its start month as a ``datetime``.

    Args:
        resume: Entry text; everything before the first ``-`` must be a
            ``YYYY.mm`` start month.

    Returns:
        The first day of the start month.

    Raises:
        ValueError: If the text before the first ``-`` is not ``YYYY.mm``.
    """
    start_text, _, _ = resume.partition("-")
    return datetime.strptime(start_text, "%Y.%m")

In [121]:
# Order each person's entries chronologically by start month.
for person in all_data:
    try:
        person["resumes"] = sorted(person["resumes"], key=datetime_key)
    except Exception as e:
        # An entry whose start month cannot be parsed makes the key function
        # raise; report the record and the reason, and leave its order as is.
        # (A bare `except:` would also swallow KeyboardInterrupt/SystemExit.)
        print(person)
        print(e)

### 输出

In [122]:
# Persist the cleaned records. The `with` block guarantees the file is flushed
# and closed before it is read back further down.
with open(sub_dir / "samples.json", "w", encoding="utf-8") as f:
    json.dump(all_data, f, ensure_ascii=False, indent=2)

## 划分数据集

In [50]:
from sklearn.model_selection import train_test_split

In [123]:
# Reload the cleaned samples from disk; the `with` block closes the file handle.
with open(sub_dir / "samples.json", "r", encoding="utf-8") as f:
    data = json.load(f)
len(data)

4153

In [124]:
# Random state passed to train_test_split so the train/test split is repeatable.
seed = 200

In [125]:
# Shuffle the samples and split them 80% train / 20% test, using the fixed
# seed as random state.
train, test = train_test_split(
    data,
    train_size=0.8,
    shuffle=True,
    random_state=seed,
)
(len(train), len(test))

(3322, 831)

In [126]:
# Write the training split; the `with` block flushes and closes the file.
with open(sub_dir / "train.json", "w", encoding="utf-8") as f:
    json.dump(train, f, ensure_ascii=False, indent=2)

In [127]:
# Write the test split; the `with` block flushes and closes the file.
with open(sub_dir / "test.json", "w", encoding="utf-8") as f:
    json.dump(test, f, ensure_ascii=False, indent=2)