In [3]:
# python 3.9.2
# python包 lightgbm 3.1.1
# python包 numpy 1.20.1
# python包 pandas 1.2.2
#
# 输入：
#   trainset/person.csv
#   trainset/person_cv.csv
#   trainset/person_job_hist.csv
#   trainset/person_pro_cert.csv
#   trainset/person_project.csv
#   trainset/recruit.csv
#   trainset/recruit_folder.csv
#   testset/recruit_folder.csv
#
# 输出：
#   result.csv
#
# 0.8516
#
import pandas as pd
import lightgbm as lgb

# 读取数据
person_df = pd.read_csv("trainset/person.csv", header=0, names=["求职者编号", "性别", "工作年限", "最高学历", "应聘者专业", "年龄", "最近工作岗位", "最近所在行业", "当前工作所在地", "语言能力", "专业特长"], encoding='utf-8')
person_cv_df = pd.read_csv("trainset/person_cv.csv", header=0, names=["求职者编号", "自荐信", "岗位类别", "工作地点", "所在行业", "可到职天数", "其他说明"])
person_job_hist_df = pd.read_csv("trainset/person_job_hist.csv", header=0, names=["求职者编号", "岗位类别", "单位所在地", "单位所属行业", "主要业绩"])
person_pro_cert_df = pd.read_csv("trainset/person_pro_cert.csv", header=0, names=["求职者编号", "专业证书名称", "备注"])
person_project_df = pd.read_csv("trainset/person_project.csv", header=0, names=["求职者编号", "项目名称", "项目说明", "职责说明", "关键技术"])
recruit_df = pd.read_csv("trainset/recruit.csv", header=0, names=["岗位编号", "招聘对象代码", "招聘对象", "招聘职位", "对应聘者的专业要求", "岗位最低学历", "岗位工作地点", "岗位工作年限", "具体要求"])
train_folder_df = pd.read_csv("trainset/recruit_folder.csv", header=0, names=["岗位编号", "求职者编号", "标签"])
test_folder_df = pd.read_csv("trainset/recruit_folder.csv", header=0, names=["岗位编号", "求职者编号", "标签"])

# 数据预处理
person_df["性别"] = (person_df["性别"] == "女").astype(float)
person_df["最高学历"] = person_df["最高学历"].map({"其它": 0, "中专": 1, "高中（职高、技校）": 2, "大专": 3, "大学本科": 4, "硕士研究生": 5, "博士研究生": 6, "博士后": 7})
person_df["应聘者专业"] = person_df["应聘者专业"].astype("category")
person_cv_df["自荐信字数"] = person_cv_df["自荐信"].str.len()
person_job_hist_df["主要业绩字数"] = person_job_hist_df["主要业绩"].str.len()
recruit_df["招聘对象代码"] = recruit_df["招聘对象代码"].fillna(-1).astype("category")
recruit_df["招聘对象"] = recruit_df["招聘对象"].astype("category")
recruit_df["岗位最低学历"] = recruit_df["岗位最低学历"].map({"其它": 0, "中专": 1, "高中（职高、技校）": 2, "大专": 3, "大学本科": 4, "硕士研究生": 5, "博士研究生": 6, "博士后": 7})
recruit_df["岗位工作年限"] = recruit_df["岗位工作年限"].map({"不限": -1, "应届毕业生": 0, "0至1年": 0, "1至2年": 1, "3至5年": 3, "5年以上": 5})
recruit_df["具体要求字数"] = recruit_df["具体要求"].str.len()

# 聚合工作经历和项目经验数据
job_hist_agg_df = person_job_hist_df.groupby("求职者编号").agg({"岗位类别": "count", "主要业绩字数": ["mean", "sum"]}).reset_index()
job_hist_agg_df.columns = ["求职者编号", "工作经历数", "平均主要业绩字数", "总主要业绩字数"]
project_agg_df = person_project_df.groupby("求职者编号").agg({"项目名称": "count"}).reset_index()
project_agg_df.columns = ["求职者编号", "项目经验数"]

# 训练和测试数据的合并
train_test_df = pd.concat([train_folder_df, test_folder_df], ignore_index=True)
person_agg_df = train_test_df.groupby("求职者编号").agg({"岗位编号": "count"}).reset_index()
person_agg_df.columns = ["求职者编号", "求职者数"]
recruit_agg_df = train_test_df.groupby("岗位编号").agg({"求职者编号": "count"}).reset_index()
recruit_agg_df.columns = ["岗位编号", "岗位数"]

# 特征合并函数
def merge_features(base_df, feature_df):
    feature_person_agg_df = feature_df.groupby("求职者编号").agg({"标签": "mean"}).reset_index()
    feature_person_agg_df.columns = ["求职者编号", "求职者平均标签"]
    feature_recruit_agg_df = feature_df.groupby("岗位编号").agg({"标签": "mean"}).reset_index()
    feature_recruit_agg_df.columns = ["岗位编号", "岗位平均标签"]
    
    base_df = base_df.merge(person_df, on="求职者编号", how="left")
    base_df = base_df.merge(person_cv_df, on="求职者编号", how="left")
    base_df = base_df.merge(recruit_df, on="岗位编号", how="left")
    base_df = base_df.merge(project_agg_df, on="求职者编号", how="left")
    base_df = base_df.merge(job_hist_agg_df, on="求职者编号", how="left")
    base_df = base_df.merge(person_agg_df, on="求职者编号", how="left")
    base_df = base_df.merge(recruit_agg_df, on="岗位编号", how="left")
    base_df = base_df.merge(feature_person_agg_df, on="求职者编号", how="left")
    base_df = base_df.merge(feature_recruit_agg_df, on="岗位编号", how="left")
    base_df["工作地点符合否"] = (base_df["工作地点"] == base_df["岗位工作地点"]).astype(float)
    
    features = ["岗位编号", "求职者编号", "标签", "性别", "工作年限", "最高学历", "应聘者专业", "年龄", "自荐信字数", "可到职天数",
                "项目经验数", "工作经历数", "平均主要业绩字数", "总主要业绩字数", "招聘对象代码", "招聘对象", "岗位最低学历",
                "岗位工作年限", "具体要求字数", "工作地点符合否", "求职者数", "岗位数", "求职者平均标签", "岗位平均标签"]
    base_df = base_df[features]
    return base_df

# K折交叉验证
num_folds = 4
train_data = None

for fold in range(num_folds):
    fold_train_df = train_folder_df[train_folder_df.index % num_folds != fold].reset_index(drop=True)
    fold_valid_df = train_folder_df[train_folder_df.index % num_folds == fold].reset_index(drop=True)
    
    fold_data = merge_features(fold_valid_df, fold_train_df)
    train_data = pd.concat([train_data, fold_data], ignore_index=True)

# 训练 LightGBM 模型
lgb_train_data = lgb.Dataset(train_data.iloc[:, 3:], label=train_data["标签"])
lgb_params = {
    "objective": "binary",
    "learning_rate": 0.03,
    "max_depth": 6,
    "num_leaves": 32,
    "verbose": -1,
    "bagging_fraction": 0.8,
    "feature_fraction": 0.8
}

model = lgb.train(lgb_params, lgb_train_data, num_boost_round=10)

# 测试数据
test_data = merge_features(test_folder_df, train_folder_df)
predictions_df = test_data.loc[:, ["岗位编号", "求职者编号"]]
predictions_df["预测打分"] = model.predict(test_data.iloc[:, 3:])
predictions_df = predictions_df.sort_values("预测打分", ascending=False, ignore_index=True)
predictions_df["预测"] = 0
predictions_df.loc[:int(0.30 * len(predictions_df)), ["预测"]] = 1

# 保存结果
result_df = predictions_df.loc[:, ["岗位编号", "求职者编号", "预测"]]
result_df.columns = ["RECRUIT_ID", "PERSON_ID", "LABEL"]
result_df.to_csv("result.csv", index=False)

# # Evaluation
# y_true = test_df.label
# y_pred = 预测表["prediction"]

# accuracy = accuracy_score(y_true, y_pred)
# precision = precision_score(y_true, y_pred)
# recall = recall_score(y_true, y_pred)
# f1 = f1_score(y_true, y_pred)

# print(f"Accuracy: {accuracy:.4f}")
# print(f"Precision: {precision:.4f}")
# print(f"Recall: {recall:.4f}")
# print(f"F1 Score: {f1:.4f}")

In [3]:
result_df.head()

Unnamed: 0,RECRUIT_ID,PERSON_ID,LABEL
0,42786680,316772922,1
1,44039674,317239937,1
2,44108339,3274119,1
3,42491720,6118671,1
4,43386658,319097915,1


In [17]:
# fold_data.shape
result_df.shape

(35291, 3)

In [4]:
from sklearn.metrics import classification_report
import pandas as pd

# # 加载数据
# submit_df = pd.read_csv('submit.csv')
# train_df = pd.read_csv('trainset/recruit_folder.csv')

# # 确保数据对齐，这里假设我们需要对齐的列是'column_to_align'
# aligned_submit_df = submit_df[submit_df['column_to_align'].isin(train_df['column_to_align'])]
# aligned_train_df = train_df[train_df['column_to_align'].isin(submit_df['column_to_align'])]

# 加载 trainset/recruit_folder.csv
train_df = pd.read_csv('trainset/recruit_folder.csv')

# 根据 'RECRUIT_ID' 和 'PERSON_ID' 对齐两个表
df = pd.merge(result_df, train_df, on=['RECRUIT_ID', 'PERSON_ID'])

# 提取标签
y_true = df['LABEL_y']  # 假设 train_df 中的标签列名为 'LABEL'
y_pred = df['LABEL_x']  # 假设 submit_df 中的标签列名为 'LABEL'

# 打印分类报告
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.83      0.91     29670
           1       0.52      0.99      0.69      5621

    accuracy                           0.86     35291
   macro avg       0.76      0.91      0.80     35291
weighted avg       0.92      0.86      0.87     35291



In [7]:
# 数据预处理
person_df["性别"] = (person_df["性别"] == "女").astype(float)
person_df["最高学历"] = person_df["最高学历"].map({"其它": 0, "中专": 1, "高中（职高、技校）": 2, "大专": 3, "大学本科": 4, "硕士研究生": 5, "博士研究生": 6, "博士后": 7})
person_df["应聘者专业"] = person_df["应聘者专业"].astype("category")
person_cv_df["自荐信字数"] = person_cv_df["自荐信"].str.len()
person_job_hist_df["主要业绩字数"] = person_job_hist_df["主要业绩"].str.len()
recruit_df["招聘对象代码"] = recruit_df["招聘对象代码"].fillna(-1).astype("category")
recruit_df["招聘对象"] = recruit_df["招聘对象"].astype("category")
recruit_df["岗位最低学历"] = recruit_df["岗位最低学历"].map({"其它": 0, "中专": 1, "高中（职高、技校）": 2, "大专": 3, "大学本科": 4, "硕士研究生": 5, "博士研究生": 6, "博士后": 7})
recruit_df["岗位工作年限"] = recruit_df["岗位工作年限"].map({"不限": -1, "应届毕业生": 0, "0至1年": 0, "1至2年": 1, "3至5年": 3, "5年以上": 5})
recruit_df["具体要求字数"] = recruit_df["具体要求"].str.len()

# 聚合工作经历和项目经验数据
job_hist_agg_df = person_job_hist_df.groupby("求职者编号").agg({"岗位类别": "count", "主要业绩字数": ["mean", "sum"]}).reset_index()
job_hist_agg_df.columns = ["求职者编号", "工作经历数", "平均主要业绩字数", "总主要业绩字数"]
project_agg_df = person_project_df.groupby("求职者编号").agg({"项目名称": "count"}).reset_index()
project_agg_df.columns = ["求职者编号", "项目经验数"]

# 训练和测试数据的合并
train_test_df = pd.concat([train_folder_df, test_folder_df], ignore_index=True)
person_agg_df = train_test_df.groupby("求职者编号").agg({"岗位编号": "count"}).reset_index()
person_agg_df.columns = ["求职者编号", "求职者数"]
recruit_agg_df = train_test_df.groupby("岗位编号").agg({"求职者编号": "count"}).reset_index()
recruit_agg_df.columns = ["岗位编号", "岗位数"]

# 特征合并函数
def merge_features2(base_df, feature_df):
    feature_person_agg_df = feature_df.groupby("求职者编号").agg({"标签": "mean"}).reset_index()
    feature_person_agg_df.columns = ["求职者编号", "求职者平均标签"]
    feature_recruit_agg_df = feature_df.groupby("岗位编号").agg({"标签": "mean"}).reset_index()
    feature_recruit_agg_df.columns = ["岗位编号", "岗位平均标签"]
    
    base_df = base_df.merge(person_df, on="求职者编号", how="left")
    base_df = base_df.merge(person_cv_df, on="求职者编号", how="left")
    base_df = base_df.merge(recruit_df, on="岗位编号", how="left")
    base_df = base_df.merge(project_agg_df, on="求职者编号", how="left")
    base_df = base_df.merge(job_hist_agg_df, on="求职者编号", how="left")
    base_df = base_df.merge(person_agg_df, on="求职者编号", how="left")
    base_df = base_df.merge(recruit_agg_df, on="岗位编号", how="left")
    base_df = base_df.merge(feature_person_agg_df, on="求职者编号", how="left")
    base_df = base_df.merge(feature_recruit_agg_df, on="岗位编号", how="left")
    base_df["工作地点符合否"] = (base_df["工作地点"] == base_df["岗位工作地点"]).astype(float)
    
    features = ["岗位编号", "求职者编号", "标签", "性别", "最高学历", "应聘者专业", "自荐信",  "岗位最低学历","具体要求", "工作地点符合否"]
    base_df = base_df[features]
    return base_df


# 在此处合并所有数据
all_data = merge_features2(train_test_df, train_folder_df)

all_data = all_data.drop_duplicates(subset=["岗位编号", "求职者编号"])

In [13]:
all_data.shape
all_data['自荐信'] = all_data['自荐信'].fillna('')
all_data['具体要求'] = all_data['具体要求'].fillna('')
all_data.sample(10)

Unnamed: 0,岗位编号,求职者编号,标签,性别,最高学历,应聘者专业,自荐信,岗位最低学历,具体要求,工作地点符合否
34959,781116,6139939,0,0.0,,职业技术教育学,本人乐观向上、诚实守信，有较强的集体荣誉感。工作态度认真、积极。掌握了应有的技****...,,****-****岁，初中以上学历，1年以上相关工作经验。,1.0
19837,770813,6009583,0,0.0,,人力资源管理,头脑灵活，思维敏捷，对事物有敏锐的洞察力；能很好的与人沟通，具有团队合作精神；对负责的...,,"五官端正,性格开朗大方,英语水平在4级以上.有市场信息统计和分析经验![注:请将应聘简****",0.0
12453,44233690,321137979,1,0.0,,会计学,为人诚恳塌实，待人真诚热情，开朗而不失内敛；勤奋好学，勇于迎接新挑战；做事沉稳，认真负责，适...,,1、大学专科以上，会计专业，初级职称（含）以上，具备良好的职业道德；2、熟练增值税发票，了解...,1.0
4087,807757,4607869,0,0.0,,会计学,助理会计师 多年的财务工作经验 熟悉税法 曾从事过一般纳税人工业企业及其商业企业 小规模...,,1、性别女，大专以上学历。2、全日制经济类财务会计专业及审计专业生。3、懂全盘账务，对财务软...,0.0
577,819877,4699080,1,0.0,,日语,属学习，实干型的职员。工作认真负责.善于创新.敢于迎接挑战.敢于承担责任.有较强的精力投入...,,"1)有开发海外客户经验者2)日语二级以上,口语佳3)形象好,气质佳4)有责任心5)无经验者勿投",1.0
4205,42390694,5577525,1,0.0,,,自信乐观、性格随和、诚实守信，进取心强，适应能力强。敢于尝试，自信能做好各种工作。热切寻觅一...,,要求： 1、****-****岁。身体健康。 2、富有敬业精神。工作认真负责。有良好的沟通能...,1.0
30948,792037,6135500,0,0.0,,金融学（含保险学）,本人熟悉外贸业务跟单及QC的基本操作流程。同时熟悉各种材质的产品品质控制。3年多的工厂业务跟...,,"1)有一定的产品检验经验,能及时发现问题.2)认真仔细,安排事情妥当!3)能够很好的安排工作...",0.0
33995,791584,6127387,0,0.0,,文秘,本人性格随和大方、为人真诚质朴，具备良好的人际交往能力和表达沟通能力。积极上进、乐意迎接挑战...,,⒈懂白话。⒉打字****字/分钟以上，能熟练操作EXCEL和WORD。⒊有财务经验。,0.0
4658,812237,2605779,0,0.0,,计算机网络与多媒体,1. 熟悉台资、日资企业文化，能较快的融入新工作中；能熟练应用EXCEL、WORD...,,1.从事过生产现场管理且有生产主管经验2年以上经验2.能够全盘操作生产各项事宜，跟进生产计划...,0.0
10999,810290,6110979,0,0.0,,高等教育学,性格坦诚、自信、乐观。积极向上，具有敏锐的观察能力。能迅速的适应各种环境，有较强的时间观念虚...,,要求体貌端正，身体健康，工作要有吃苦耐劳的精神，有合作团队精神。要求外语要好，如有贸易跟单工...,0.0


In [11]:
# 定义一个函数来转换文件的编码
def convert_file_encoding(file_path, new_encoding='utf-8'):
    with open(file_path, 'r', encoding='ansi') as file_in:
        content = file_in.read()
        
    with open(file_path, 'w', encoding=new_encoding) as file_out:
        file_out.write(content)

# 列出需要转换的文件路径
file_paths = [
    'trainset/person.csv',
    'trainset/person_cv.csv',
    'trainset/person_job_hist.csv',
    'trainset/person_project.csv',
    'trainset/person_pro_cert.csv',
    'trainset/recruit.csv',
    'trainset/recruit_folder.csv'
]

# 遍历文件路径列表并转换每个文件的编码
for file_path in file_paths:
    convert_file_encoding(file_path)

print("所有文件已成功转换为UTF-8编码。")


所有文件已成功转换为UTF-8编码。


In [5]:
# 技能匹配度计算函数
def calculate_match_score(resume_skills, job_skills):
    score = 0
    matched_skills = [skill for skill in resume_skills if skill in job_skills]
    if matched_skills:
        score = len(matched_skills) / len(job_skills)  # 基本匹配度为匹配技能数量与岗位技能数量之比
    return score

def calculate_match_score(resume_skills, job_skills):
    proficiency_scale = {'了解': 1, '掌握': 2, '熟悉': 3, '精通': 4}
    total_score = 0
    max_score = sum(proficiency_scale[job_skill] for job_skill in job_skills.values())  # Max possible score based on required proficiency

    for skill, resume_proficiency in resume_skills.items():
        if skill in job_skills:
            resume_score = proficiency_scale[resume_proficiency]
            job_score = proficiency_scale[job_skills[skill]]
            total_score += (resume_score / job_score) * job_score  # Weighted by required proficiency

    if max_score > 0:
        score = total_score / max_score
    else:
        score = 0

    return score

def calculate_overall_match(resume, job_requirements):
    skill_score = calculate_match_score(resume.get('skills', {}), job_requirements.get('skills', {}))
    location_match = 1 if resume.get('location', None) == job_requirements.get('location', None) else 0
    education_match = 1 if resume.get('education_level', 0) >= job_requirements.get('min_education_level', 0) else 0
    qualities_match = len([q for q in resume.get('personal_qualities', []) if q in job_requirements.get('required_qualities', [])])

    # Adjust for cases where required qualities might be zero
    qualities_max = len(job_requirements.get('required_qualities', []))
    qualities_score = qualities_match / qualities_max if qualities_max > 0 else 0

    overall_score = (0.5 * skill_score + 0.2 * location_match + 0.2 * education_match + 0.1 * qualities_score)

    return overall_score

In [17]:
import json
import requests
import time

def parse_text(text):
    url = "http://127.0.0.1:8190/taskflow/uie"
    headers = {"Content-Type": "application/json"}
    data = {"data": {"text": [text]}}
    
    # 发送 POST 请求
    response = requests.post(url=url, headers=headers, data=json.dumps(data))
    
    # 解析响应数据
    parsed_data = json.loads(response.text)
    
    return parsed_data

# 示例调用
text_input = "姓名：陈梦|电话号码：13812345678|邮箱：chenmeng@outlook.com|专业：信息系统|学历：本科|技能：精通系统分析和设计，熟悉SQL及数据库应用开发，曾在电信行业内部署和优化企业级信息系统|个人素质：具有良好的逻辑思维能力和项目管理经验，能够迅速适应新环境并解决复杂问题|个人经历：毕业于西安电子科技大学信息系统专业，曾为系内多个研究项目编写和维护代码，参与校内软件开发竞赛并获奖|个人性格：细致入微，决策力强，具备良好的沟通和协调能力。"

parsed_data = parse_text(text_input)

print(parsed_data)

{'result': [{'姓名': [{'text': '陈梦', 'start': 3, 'end': 5, 'probability': 0.9954159302207621}], '邮箱': [{'text': 'chenmeng@outlook.com', 'start': 26, 'end': 46, 'probability': 0.6801257965137921}], '电话号码': [{'text': '13812345678', 'start': 11, 'end': 22, 'probability': 0.978821837318975}], '学历': [{'text': '本科', 'start': 58, 'end': 60, 'probability': 0.9842219457495958}], '专业': [{'text': '信息系统', 'start': 50, 'end': 54, 'probability': 0.9837346721411677}], '技能': [{'text': '设计', 'start': 71, 'end': 73, 'probability': 0.5801237827244563}, {'text': '精通系统分析', 'start': 64, 'end': 70, 'probability': 0.4449831070826207}]}]}


In [18]:
print(parsed_data['result'][0]['技能'])

[{'text': '设计', 'start': 71, 'end': 73, 'probability': 0.5801237827244563}, {'text': '精通系统分析', 'start': 64, 'end': 70, 'probability': 0.4449831070826207}]


In [14]:
print(all_data['自荐信'][1])
print(all_data['具体要求'][1])

热情大方，责任心强，工作认真负责，有较强的适应能力和学习能力，有扎实的专业功底
岗位职责：电话沟通客户,机票订购服务 任职条件：中专及以上学历，年龄****-****岁。    1.计算机操作熟练,打字速度****字/分以上    2.普通话标准,声音圆润,记忆力好,有良好服务意识    3.能吃苦耐劳,性格大方,善于通过电话与人沟通,责任感强    4.工作积极主动、认真、负责有良好的团队合作精神和沟通理解能力


In [19]:
import pandas as pd
import json
resume_data = []
job_data = []
# Apply the matching function to each row in the DataFrame
resume_data.append(parse_text(all_data['自荐信'][1]))
job_data.append(parse_text(all_data['具体要求'][1]))

In [21]:
score = calculate_overall_match(resume_data[0], job_data[0])
print(score)

0.4


In [25]:
# Apply the parse_text function to each row in the '自荐信' and '具体要求' columns
all_data['parsed_自荐信'] = all_data['自荐信'].apply(parse_text)
all_data['parsed_具体要求'] = all_data['具体要求'].apply(parse_text)

# Calculate the overall match for each row
all_data['score'] = all_data.apply(lambda row: calculate_overall_match(row['parsed_自荐信'], row['parsed_具体要求']), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  first_10_rows['parsed_自荐信'] = first_10_rows['自荐信'].apply(parse_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  first_10_rows['parsed_具体要求'] = first_10_rows['具体要求'].apply(parse_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  first_10_rows['score'] = first_10_rows.apply(lambda row: calculat

In [55]:
# 添加新的列 "predict"
all_data["预测"] = all_data["score"].apply(lambda x: 1 if x > 0.5 else 0)

In [56]:
all_data.head(10)

Unnamed: 0,岗位编号,求职者编号,标签,性别,最高学历,应聘者专业,自荐信,岗位最低学历,具体要求,工作地点符合否,预测,score
0,825081,6256839,0,0.0,,,,,要求懂2D，3D等绘图软件，对非标设备有了解。,0.0,0,0.02
1,772899,5413605,0,0.0,,文秘,热情大方，责任心强，工作认真负责，有较强的适应能力和学习能力，有扎实的专业功底,,"岗位职责：电话沟通客户,机票订购服务 任职条件：中专及以上学历，年龄****-****岁。 ...",0.0,0,0.25
2,795668,5219796,0,0.0,,财政学（含税收学）,"本人诚恳、踏实、忠于信用、责任心强,工作勤奋认真、有较强的沟通、分析能力，并具有与同事良好协...",,此招聘信息由系统根据单位搜索条件自动生成！,1.0,0,0.3
3,769754,5700693,0,0.0,,计算机应用技术,为人乐观、严谨务实，以诚待人，团队协作能力强；虚心好学、积极上进、有耐心；吃苦耐劳，工作上有...,,我公司急招聘：维修，测试组，装技术人员 2名 年龄****-**** 懂电子电路PCB电...,0.0,0,0.07
4,773645,6208645,0,0.0,,计算机应用技术,,,"****-****岁，持B牌或以上驾驶证，3年以上驾龄，有1年以上拖车驾驶经验,有深户担保。",1.0,0,0.37
5,813938,1391289,1,0.0,,计算机科学与技术,自评价: 诚实守信，成熟稳重，朴素大方，活泼开朗，从一而终，善于交际，勇于挑战，思维灵敏，富...,,此招聘信息由系统根据单位搜索条件自动生成！,1.0,1,0.53
6,795526,6196384,0,0.0,,文秘,自信，有责任感，有耐心，有干劲。善于总结和思考问题。对生活和工作有明确的目标和计划，并能够为...,,1、中专或大专以上学历；2、性格文静，稳重，反应灵敏，工作主动性强，吃苦耐劳，能承受工作压力...,0.0,0,0.32
7,781773,1340058,0,0.0,,机械制造及其自动化,对产品有创新能力;完成新产品设计可行分析;独立主导新产品设计工作。,,"1, 会PRO-e,UG,2, 懂汽车多媒体结构设计特点,3, 5年以上经验,4, 会cor...",0.0,0,0.24
8,820496,5869866,1,0.0,,电子商务,,,该岗****公司3G无线****公司将区域划分，每月按区域进行指标考核。,0.0,1,0.79
9,838614,6228495,0,0.0,,,"是个上进心强,能吃苦耐劳,服从上级安排、与同事和睦相处，并在 工作中不断提高自己，在职中对工...",,"1年以上工作.****-****岁女性.身高1.6米以上,形象好，做事勤快.为人诚实.操作电...",0.0,0,0.34


In [60]:
# 保存结果
result_df2 = all_data.loc[:, ["岗位编号", "求职者编号", "预测"]]
result_df2.columns = ["RECRUIT_ID", "PERSON_ID", "LABEL"]
result_df2.to_csv("result2.csv", index=False)


# 根据 'RECRUIT_ID' 和 'PERSON_ID' 对齐两个表
df = pd.merge(result_df2, train_df, on=['RECRUIT_ID', 'PERSON_ID'])

# 提取标签
y_true = df['LABEL_y']  # 假设 train_df 中的标签列名为 'LABEL'
y_pred = df['LABEL_x']  # 假设 submit_df 中的标签列名为 'LABEL'

# 打印分类报告
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.95      0.97     29670
           1       0.78      0.97      0.86      5621

    accuracy                           0.95     35291
   macro avg       0.88      0.96      0.92     35291
weighted avg       0.96      0.95      0.95     35291

