In [50]:
import json
from pathlib import Path
from uuid import uuid4

import json_repair

In [51]:
# Working directory for the request file written below and the result files read back later.
sub_dir = Path("../../data") / "resume"
# parents/exist_ok keep this cell re-runnable: a bare mkdir() raises FileExistsError
# on the second run and FileNotFoundError when ../../data has not been created yet.
sub_dir.mkdir(parents=True, exist_ok=True)

## 生成prompt文件

In [52]:
# Load the raw resume file as a list of lines; each entry keeps its line ending.
with Path("../../data/resume_data.txt").open(mode="r", encoding="utf-8") as resume_file:
    texts = list(resume_file)
len(texts)

4679

In [53]:
# Prompt template rendered with str.format: the single `{}` at the end receives the
# bullet list of resumes, and every literal brace of the JSON schema is doubled
# (`{{` / `}}`) so that format() emits it as a plain brace.
# Despite its name, the rendered text is sent as a "user" message in the request body.
# The date-range spec uses dots on both sides (YYYY.mm-YYYY.mm) so that it agrees
# with the worked example further down in the template.
system_prompt = r"""
## 任务描述
你需要浏览每一条人员简历，根据简历中人员的在职时间和职位名称构造人员职位晋升数据集，不要回答用户问题
对于其中不明确的部分，你必须进行编造，保证你的回答符合输出格式
每个人员的简历中，必须包含起始年月，职位名称中必须包含完整的公司名称和职位名称

## 输出格式
[
  {{
    "name": "<姓名>",
    "resumes": [
      "<在职起始年月，格式YYYY.mm-YYYY.mm> <职位全称>",
      ...
    ]
  }},
  ...
]

# 输出示例
{{
  "name": "张明",
  "resumes": [
    "2010.07-2015.03 上海市徐汇区发改委科员",
    "2015.04-2018.06 上海市徐汇区发改委副主任",
    "2018.07-2021.11 上海市虹口区政府办公室主任",
    "2021.12-至今 上海市虹口区副区长"
  ]
}}

# 人员简历
{}
"""

In [54]:
# Number of resume lines grouped into a single prompt (one request per group).
batch_size = 3

In [55]:
# Split the resume lines into consecutive chunks of `batch_size`; the final chunk
# may be shorter when the line count is not a multiple of the batch size.
batch_texts = [texts[i:i + batch_size] for i in range(0, len(texts), batch_size)]
# Render one prompt per chunk as a "- <resume>" bullet list. Lines read with
# readlines() still carry their trailing newline, so each one is stripped first;
# otherwise the "\n".join would leave an empty line between bullets.
batch_prompt = [
    system_prompt.format("\n".join(f"- {text.strip()}" for text in batch))
    for batch in batch_texts
]
len(batch_prompt)

1560

In [56]:
# One request record per rendered prompt. Each record gets a fresh random
# 32-character hex id (a UUID4 without dashes) as its `custom_id`.
tasks = [
    {
        "custom_id": uuid4().hex,
        "method": "POST",
        "url": "/v4/chat/completions",
        "body": {
            "model": "glm-4-flash",
            "messages": [{"role": "user", "content": prompt}],
            "temperature": 0.5,
        },
    }
    for prompt in batch_prompt
]

In [57]:
# Persist the request records as JSON Lines: one JSON object per line,
# with non-ASCII characters written as-is rather than \u-escaped.
with (sub_dir / "resume_data.jsonl").open("w", encoding="utf-8") as jsonl_file:
    jsonl_file.writelines(
        json.dumps(task, ensure_ascii=False) + "\n" for task in tasks
    )

## 处理结果文件

In [58]:
def loader():
    """Yield one parsed record per non-blank line of the result file.

    Reads ``sub_dir / "output.jsonl"`` lazily, line by line, and parses each
    line with ``json_repair.loads``.

    Yields:
        The object parsed from each non-blank line.
    """
    with open(sub_dir / "output.jsonl", "r", encoding="utf-8") as f:
        for line in f:
            # A blank line (e.g. a stray newline at the end of the file) holds no
            # record; skip it instead of handing a non-record value to the caller.
            if not line.strip():
                continue
            yield json_repair.loads(line)

In [59]:
# Flatten every model reply into one list of {"name": ..., "resumes": [...]} dicts.
all_data = []
for record in loader():
    content = record["response"]["body"]["choices"][0]["message"]["content"]
    parsed = json_repair.loads(content)
    # The prompt's worked example shows a single object, so a reply may come back
    # as a bare dict; list.extend() on a dict would add its *keys* as strings.
    if isinstance(parsed, dict):
        parsed = [parsed]
    # Anything that is still not a list (e.g. an unparseable reply) has no entries.
    if not isinstance(parsed, list):
        continue
    # Keep only object entries so the collected samples have a uniform shape.
    all_data.extend(item for item in parsed if isinstance(item, dict))
len(all_data)

4680

In [60]:
# Spot-check the first collected record.
all_data[0]

{'name': '周文香', 'resumes': ['1989.09-2008.10 个体工商户', '2008.11-2016.03 有限公司监事']}

In [61]:
# Save the collected samples as pretty-printed JSON. The context manager makes
# sure the file is flushed and closed; passing a bare open(...) to json.dump
# leaves the handle open until it happens to be garbage-collected.
with open(sub_dir / "samples.json", "w", encoding="utf-8") as f:
    json.dump(all_data, f, ensure_ascii=False, indent=2)