In [1]:
import pandas as pd
import gc
import warnings
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, roc_auc_score, classification_report
from tqdm import tqdm
import lightgbm as lgb
import catboost as ctb
import xgboost as xgb
import os
import numpy as np


warnings.simplefilter('ignore')
pd.set_option('max_columns', None)
pd.set_option('max_rows', 500)
pd.options.display.max_colwidth = 10000

In [2]:
seed = 2021

In [3]:
df_train = pd.read_csv('data/trainset/recruit_folder.csv')
df_test = pd.read_csv('data/testset/recruit_folder.csv')
df_test['LABEL'] = np.nan

In [4]:
df_feature = df_train.append(df_test, sort=False)

In [5]:
df_feature['LABEL'].mean() #正样本占比0.159

0.15927573602334874

In [6]:
df_feature.shape

(106065, 3)

# 求职者基本信息

In [7]:
df_person = pd.read_csv('data/trainset/person.csv')

In [8]:
#wf
# df_person['LANGUAGE_REMARK_LEN'] = df_person.LANGUAGE_REMARK.str.len()
# df_person['SPECILTY_LEN'] = df_person.SPECILTY.str.len()

In [9]:
df_person = df_person.drop(columns=['LANGUAGE_REMARK', 'SPECILTY'], axis=1)

In [10]:
df_person.rename(columns={'MAJOR': 'PERSON_MAJOR'}, inplace=True)

In [11]:
df_person.head()

Unnamed: 0,PERSON_ID,GENDER,WORK_YEARS,HIGHEST_EDU,PERSON_MAJOR,AGE,LAST_POSITION,LAST_INDUSTRY,CURR_LOC
0,33291,男,15,大专,计算机应用技术,37,网络管理/信息安全管理,,深圳市
1,2985277,男,12,大专,计算机应用技术,35,*公关/营销/业务类,文化体育行业,深圳市
2,2982066,女,10,大专,金融学（含保险学）,32,出纳,医药销售行业,南山区
3,3010866,男,10,中专,物理电子学,34,营销代表/销售顾问,珠宝玉石行业,深圳市
4,316816964,女,15,中专,学前教育学,34,小学教育/幼儿教育/保育,行业组织,福田区


In [12]:
edu_map = {
    '其它': 0,
    '中专': 1,
    '高中（职高、技校）': 2,
    '大专': 3,
    '大学本科': 4,
    '硕士研究生': 5,
    '博士后': 6
}

df_person['HIGHEST_EDU'] = df_person['HIGHEST_EDU'].map(edu_map)

In [13]:
def major_clean(x):
    if type(x) == float:
        return x

    x = x.replace('【', '').replace('】', '')
    return x


df_person['PERSON_MAJOR'] = df_person['PERSON_MAJOR'].apply(major_clean)

In [14]:
df_feature = df_feature.merge(df_person, how='left', on='PERSON_ID')
df_feature.shape

(106065, 11)

# 求职意向

In [15]:
df_person_cv = pd.read_csv('data/trainset/person_cv.csv')

In [16]:
#wf
# df_person_cv['SELF_COMMENT_LEN'] = df_person_cv.SELF_COMMENT.str.len()
# df_person_cv['PERSON_CV_REMARK_LEN'] = df_person_cv.REMARK.str.len()

In [17]:
df_person_cv = df_person_cv.drop(columns=['REMARK'], axis=1)    #'SELF_COMMENT'这边先保留，留作后面拼接embedding

In [18]:
df_person_cv.rename(columns={'LOCATION': 'CV_LOCATION'}, inplace=True)

In [19]:
df_person_cv.head()

Unnamed: 0,PERSON_ID,SELF_COMMENT,POSITION,CV_LOCATION,INDUSTRY,AVAILABLE_IN_DAYS
0,2985277,面谈或电话，并提供资料,导游,深圳市,餐饮旅游娱乐行业,
1,4736088,工作方面：积极主动，责任心较强，有团队协作精神；学习方面：谦虚好学，积极进取；心理方面：乐观开朗，能承受压力与挫折；,*机械类,深圳市,,14.0
2,3016588,"本人性格开朗,品行端正,对环境适应能力强，做事踏实，细心，工作认真负责，工作责任心强，有上进心，善于交际，人际关系融洽、专业知识扎实并很热衷于财务工作。****公司录用，将****公司的辉煌事业献上一份薄力。",*财务类/审计类,宝安区,,7.0
3,2981299,在过去一段时间的社会实践及工作经历磨练了的求实作风、上进之心和坚忍不拔的个性。以本人良好的敬业精神以及个人的工作经验，一定能够胜任所从事的工作，创造良好的工作业绩！ 本人真诚地希望****公司工作，发挥自己的****公司贡献全部的才能。谢谢您的关注！,*电子/通讯类*,深圳市,能源/光电/电器行业,7.0
4,2983790,１：本人为全国注册监理工程师，职称为高级工程师，先后从事过建筑施工、房地产开发、以及监理行业。专业技术扎实，现场管理经验及手段丰富，在工程开展前能迅速做出各项计划并发现工程的难点及特点，工程进展的同时能依据自己的经验找到工程中需解决的核心问题并能迅速找到解决的办法和持续改进；对整个项目的建设有良好的大局观。 2：为人踏实上进，沟通协调能力强，善于处理人际关系，善于团队建设和管理；有良好的职业道德。3：身体健康，性格开朗，会驾驶。 4：具有良好的品牌意识，善于承受各种压力并能承受外地工作等。,结构技术,深圳市,建筑房地产行业,7.0


In [20]:
df_feature = df_feature.merge(df_person_cv, how='left', on='PERSON_ID')
df_feature.shape

(106065, 16)

# 工作经历

In [21]:
df_person_job_hist = pd.read_csv('data/trainset/person_job_hist.csv')

In [22]:
df_person_job_hist.head()

Unnamed: 0,PERSON_ID,POSITION,LOCATION,INDUSTRY,ACHIEVEMENT
0,1281276,行政管理,罗湖区,其它,1.协助总经理处理好日常事务及和外部公共关系；2.负责协助起草总经理各类工作往来文件，并负责有关文件的呈报、催办、归档等管理事宜； 3.协助****公司企业文化、企业战略发展的规划； 4.协助****公司来宾的接待工作；****公司各个项目以及相关日常事务的执行情况，定期跟踪、汇报； 6.兼管行政人事、财务等事务。
1,980158,售前/售后服务,,,"本人主要负责万佳,天虹,岁宝,民润等重要客户的品牌分类管理,收集竞争对手信息与反馈,店内执行评估,货架,助销,价格等,建立和维护重点终端客户及kA市场实践经验．"
2,3016108,培训管理,福田区,信息行业（IT/通讯/互联网）,从事学生管理工作.并负责分校区的学生心理辅导和职业指导工作.
3,3016108,培训管理,福田区,信息行业（IT/通讯/互联网）,从事心理学的教学工作，并担任学校的心理辅导老师，负责了学校心理咨询中心的组建和日常咨询工作的开展，接受咨询需求****人次以上，获得了良好的社会效益。
4,2980989,产品开发,宝安区,电子行业,"1.工程师对产品进行设计及开发 2.处理3D图和2D图,同时制作相关的资料(如BOM\技术文件等)3.修改及更新旧产品结构及性能 4.制作产品的加工工艺及流程5.处理产品的结构及工艺问题"


In [23]:
df_tmp = df_person_job_hist.groupby(['PERSON_ID']).size().reset_index()
df_tmp.columns = ['PERSON_ID', 'JOB_HIST_CNT']

In [24]:
#wf 项目详细经历的总字数以及平均字数和经历中岗位类别数
df_person_job_hist['ACHIEVEMENT_LEN'] = df_person_job_hist.ACHIEVEMENT.str.len()
df_tmp_jobhist = df_person_job_hist.groupby(['PERSON_ID']).aggregate({"POSITION": "count", "ACHIEVEMENT_LEN": ["mean", "sum"]}).reset_index()

df_tmp_jobhist.columns = ['PERSON_ID','POSITION_COUNT','ACHIEVEMENT_LEN_MEAN','ACHIEVEMENT_LEN_SUM']
df_tmp_jobhist.head()

Unnamed: 0,PERSON_ID,POSITION_COUNT,ACHIEVEMENT_LEN_MEAN,ACHIEVEMENT_LEN_SUM
0,488,4,95.5,382.0
1,2477,10,96.5,965.0
2,3929,3,9.0,18.0
3,4706,2,108.0,216.0
4,5813,1,,0.0


In [25]:
df_feature = df_feature.merge(df_tmp, how='left', on='PERSON_ID')
df_feature = df_feature.merge(df_tmp_jobhist, how='left', on='PERSON_ID')
df_feature.shape

(106065, 20)

# 招聘岗位信息

In [26]:
df_recruit = pd.read_csv('data/trainset/recruit.csv')

In [27]:
#wf
df_recruit['DETAIL_LEN'] = df_recruit.DETAIL.str.len()

In [28]:
df_recruit = df_recruit.drop(columns=['DETAIL'], axis=1)

In [29]:
df_recruit.rename(columns={
    'LOCATION': 'RECRUIT_LOCATION',
    'MAJOR': 'RECRUIT_MAJOR'
},
                  inplace=True)

In [30]:
df_recruit.head()

Unnamed: 0,RECRUIT_ID,PERSON_TYPE_CODE,PERSON_TYPE,JOB_TITLE,RECRUIT_MAJOR,LOWER_EDU,RECRUIT_LOCATION,WORK_YEARS_RANGE,DETAIL_LEN
0,135144,,,业务员,,高中（职高、技校）,深圳市,应届毕业生,
1,137045,,,电子技术支持工程师,电子信息工程学,中专,龙岗区,0至1年,70.0
2,146798,,,仓管,【工商管理】,中专,龙岗区,0至1年,41.0
3,436321,2.0,社会无职,销售代表,,中专,深圳市,应届毕业生,43.0
4,440725,99.0,不限,造价员,工民建,中专,深圳市,3至5年,109.0


In [31]:
def major_clean(x):
    if type(x) == float:
        return x

    x = x.replace('【', '').replace('】', '')
    return x


df_recruit['RECRUIT_MAJOR'] = df_recruit['RECRUIT_MAJOR'].apply(major_clean)

In [32]:
df_recruit['LOWER_EDU'] = df_recruit['LOWER_EDU'].map(edu_map)

In [33]:
work_year_range_map = {
    '应届毕业生': 0,
    '0至1年': 1,
    '1至2年': 2,
    '3至5年': 3,
    '5年以上': 4,
    '不限': 5
}
df_recruit['WORK_YEARS_RANGE'] = df_recruit['WORK_YEARS_RANGE'].map(
    work_year_range_map)

In [34]:
person_type_range_map = {
    'nan': 0,
    '不限': 1,
    '社会无职': 2,
    '应届毕业': 3,
    '社会在职': 4
}
df_recruit["PERSON_TYPE"] = df_recruit["PERSON_TYPE"].map(person_type_range_map)

In [35]:
df_feature = df_feature.merge(df_recruit, how='left', on='RECRUIT_ID')
df_feature.shape

(106065, 28)

## person的项目资料信息

In [36]:
#wf  计算每个人的项目个数
#df_person_project = pd.read_csv('data/trainset/person_project.csv')
#df_person_project.head()
# df_tmp_project = df_person_project.groupby(['PERSON_ID'])['PROJECT_NAME'].count().reset_index()
# df_tmp_project.columns = ['PERSON_ID','PROJECT_COUNT']
# print(df_tmp_project.shape)  #(1838,2)  可以看出，有项目资料的样本数比较少。大约只占所有样本的1/20
# df_feature = df_feature.merge(df_tmp_project,how='left',on='PERSON_ID')
# df_feature.shape

# 添加后几乎没效果

## 每个人的证书

In [37]:
#wf  每个人证书中remark长度均值和总长,证书数量
# df_person_cert = pd.read_csv('data/trainset/person_pro_cert.csv')
# df_person_cert['PRO_CERT_DSP'].unique()
#df_person_cert['CERT_REMARK_LEN'] = df_person_cert.REMARK.str.len()
# df_person_cert.drop(['REMARK'],axis=1,inplace=True)
# df_tmp_cert = df_person_cert.groupby(['PERSON_ID']).aggregate({"PRO_CERT_DSP": "count", "CERT_REMARK_LEN": ["mean", "sum"]}).reset_index()
# df_tmp_cert.columns = ['PERSON_ID','CERT_COUNT','CERT_REMARK_LEN_MEAN','CERT_REMARK_LEN_SUM']
# df_tmp_cert.head()

In [38]:
# tmp1 = df_person_cert.groupby(['PERSON_ID'])['PRO_CERT_DSP'].apply(lambda x:x.str.cat(sep=' ')).reset_index()   #证书名称这个文本特征要留着，用于后面拼接它的embedding，，错了，这里groupby了，制作embedding的时候没有groupby，因此拼不上
# tmp1.columns = ['PERSON_ID','PRO_CERT_DSP']
# df_tmp_cert = df_tmp_cert.merge(tmp1,how='left',on='PERSON_ID')

In [39]:
# df_feature = df_feature.merge(df_tmp_cert,how='left',on='PERSON_ID')
# df_feature.shape

# embedding 特征

In [40]:
df_feature.columns

Index(['RECRUIT_ID', 'PERSON_ID', 'LABEL', 'GENDER', 'WORK_YEARS',
       'HIGHEST_EDU', 'PERSON_MAJOR', 'AGE', 'LAST_POSITION', 'LAST_INDUSTRY',
       'CURR_LOC', 'SELF_COMMENT', 'POSITION', 'CV_LOCATION', 'INDUSTRY',
       'AVAILABLE_IN_DAYS', 'JOB_HIST_CNT', 'POSITION_COUNT',
       'ACHIEVEMENT_LEN_MEAN', 'ACHIEVEMENT_LEN_SUM', 'PERSON_TYPE_CODE',
       'PERSON_TYPE', 'JOB_TITLE', 'RECRUIT_MAJOR', 'LOWER_EDU',
       'RECRUIT_LOCATION', 'WORK_YEARS_RANGE', 'DETAIL_LEN'],
      dtype='object')

In [41]:
job_title_embeddings = pd.read_pickle('data/embedding/job_title.pkl')
df_feature = df_feature.merge(job_title_embeddings, how='left', on='JOB_TITLE')
print(df_feature.shape)
del df_feature['JOB_TITLE']

(106065, 58)


In [42]:
job_title_embeddings.shape

(972, 31)

In [45]:
position = pd.read_pickle('data/position_score.pkl')
df_feature = df_feature.merge(position,how="left")

In [46]:
# person_major_embeddings = pd.read_pickle('data/embedding/person_major.pkl')
# person_major_embeddings.rename(columns={
#     'MAJOR': 'PERSON_MAJOR'
# },inplace=True)
# print(person_major_embeddings.head())
# df_feature = df_feature.merge(person_major_embeddings, how='left', on='PERSON_MAJOR')
# del df_feature['JOB_TITLE']

In [47]:
#wf 加入了person的证书embedding###   embeding特征当时没有groupby，因此这里拼不了  然后我在embedding之前做了groupby
# person_cert_embeddings = pd.read_pickle('data/embedding/person_cert.pkl')
# df_feature = df_feature.merge(person_cert_embeddings, how='left', on='PRO_CERT_DSP')
#del df_feature['PRO_CERT_DSP']
#添加后几乎没效果

In [48]:
#wf 加入了person_cv的自我介绍的embedding### 该特征降分！！！
# person_selfcomment_embeddings = pd.read_pickle('data/embedding/person_selfcomment.pkl')
# print(person_selfcomment_embeddings.shape)
# df_feature = df_feature.merge(person_selfcomment_embeddings, how='left', on='SELF_COMMENT')
# del df_feature['SELF_COMMENT']
# df_feature.shape

# 交叉特征

In [49]:
# 简单统计
def stat(df, df_merge, group_by, agg):
    group = df.groupby(group_by).agg(agg)

    columns = []
    for on, methods in agg.items():
        for method in methods:
            columns.append('{}_{}_{}'.format('_'.join(group_by), on, method))
    group.columns = columns
    group.reset_index(inplace=True)
    df_merge = df_merge.merge(group, on=group_by, how='left')

    del (group)
    gc.collect()

    return df_merge


def statis_feat(df_know, df_unknow):
    for f in tqdm([['CV_LOCATION'], ['RECRUIT_ID']]):
        df_unknow = stat(df_know, df_unknow, f, {'LABEL': ['mean']})

    return df_unknow


# 5折交叉
df_train = df_feature[~df_feature['LABEL'].isnull()]
df_train = df_train.reset_index(drop=True)
df_test = df_feature[df_feature['LABEL'].isnull()]

df_stas_feat = None
kfold = StratifiedKFold(n_splits=5, random_state=seed, shuffle=True)
for train_index, val_index in kfold.split(df_train, df_train['LABEL']):
    df_fold_train = df_train.iloc[train_index]
    df_fold_val = df_train.iloc[val_index]

    df_fold_val = statis_feat(df_fold_train, df_fold_val)
    df_stas_feat = pd.concat([df_stas_feat, df_fold_val], axis=0)

    del (df_fold_train)
    del (df_fold_val)
    gc.collect()

df_test = statis_feat(df_train, df_test)
df_feature = pd.concat([df_stas_feat, df_test], axis=0)

del (df_stas_feat)
del (df_train)
del (df_test)
gc.collect()

100%|██████████| 2/2 [00:00<00:00, 17.27it/s]
100%|██████████| 2/2 [00:00<00:00, 18.11it/s]
100%|██████████| 2/2 [00:00<00:00, 18.27it/s]
100%|██████████| 2/2 [00:00<00:00, 18.02it/s]
100%|██████████| 2/2 [00:00<00:00, 18.30it/s]
100%|██████████| 2/2 [00:00<00:00,  7.70it/s]


0

In [50]:
df_score = pd.read_pickle('data/score.pkl')
df_feature = df_feature.merge(df_score, how='left')

In [51]:
df_score.head()

Unnamed: 0,RECRUIT_ID,PERSON_ID,recruit_person_MAJOR_score
0,825081,6256839,-1.0
1,772899,5413605,0.026797
2,795668,5219796,-1.0
3,769754,5700693,-1.0
4,773645,6208645,0.261549


In [52]:
# count,一个人投了几个简历，一个职位有几个人投
for f in [['PERSON_ID'], ['POSITION']]:
    df_feature['{}_cnt'.format(
        '_'.join(f))] = df_feature.groupby(f)['PERSON_ID'].transform('count')

In [53]:
# nunique
for f1, f2 in [['RECRUIT_ID', 'POSITION'], ['RECRUIT_ID', 'PERSON_MAJOR']]:
    df_feature[f'{f1}_{f2}_nunique'] = df_feature.groupby(
        [f1])[f2].transform('nunique')

In [54]:
# 连续变量统计
for f1, f2 in [['RECRUIT_ID', 'WORK_YEARS']]:
    df_temp = df_feature.groupby(f1)[f2].agg([
        (f'{f1}_{f2}_mean'.format(f),'mean'),
        (f'{f1}_{f2}_max'.format(f),'max'),
        (f'{f1}_{f2}_min'.format(f),'min'),
        (f'{f1}_{f2}_std'.format(f),'std'),
        ]).reset_index()
    df_feature = df_feature.merge(df_temp, how='left')

In [55]:
df_feature['CV_RECRUIT_LOCATION_equal'] = df_feature[[
    'RECRUIT_LOCATION', 'CV_LOCATION'
]].apply(lambda x: x['RECRUIT_LOCATION'] == x['CV_LOCATION'], axis=1)

df_feature['LOWER_EDU_HIGHEST_EDU_higher'] = df_feature[[
    'LOWER_EDU', 'HIGHEST_EDU'
]].apply(lambda x: x['LOWER_EDU'] > x['HIGHEST_EDU'], axis=1)

In [56]:
df_feature.head()

Unnamed: 0,RECRUIT_ID,PERSON_ID,LABEL,GENDER,WORK_YEARS,HIGHEST_EDU,PERSON_MAJOR,AGE,LAST_POSITION,LAST_INDUSTRY,CURR_LOC,SELF_COMMENT,POSITION,CV_LOCATION,INDUSTRY,AVAILABLE_IN_DAYS,JOB_HIST_CNT,POSITION_COUNT,ACHIEVEMENT_LEN_MEAN,ACHIEVEMENT_LEN_SUM,PERSON_TYPE_CODE,PERSON_TYPE,RECRUIT_MAJOR,LOWER_EDU,RECRUIT_LOCATION,WORK_YEARS_RANGE,DETAIL_LEN,JOB_TITLE_ernie_emb_0,JOB_TITLE_ernie_emb_1,JOB_TITLE_ernie_emb_2,JOB_TITLE_ernie_emb_3,JOB_TITLE_ernie_emb_4,JOB_TITLE_ernie_emb_5,JOB_TITLE_ernie_emb_6,JOB_TITLE_ernie_emb_7,JOB_TITLE_ernie_emb_8,JOB_TITLE_ernie_emb_9,JOB_TITLE_ernie_emb_10,JOB_TITLE_ernie_emb_11,JOB_TITLE_ernie_emb_12,JOB_TITLE_ernie_emb_13,JOB_TITLE_ernie_emb_14,JOB_TITLE_ernie_emb_15,JOB_TITLE_ernie_emb_16,JOB_TITLE_ernie_emb_17,JOB_TITLE_ernie_emb_18,JOB_TITLE_ernie_emb_19,JOB_TITLE_ernie_emb_20,JOB_TITLE_ernie_emb_21,JOB_TITLE_ernie_emb_22,JOB_TITLE_ernie_emb_23,JOB_TITLE_ernie_emb_24,JOB_TITLE_ernie_emb_25,JOB_TITLE_ernie_emb_26,JOB_TITLE_ernie_emb_27,JOB_TITLE_ernie_emb_28,JOB_TITLE_ernie_emb_29,recruit_person_POSITION_score,CV_LOCATION_LABEL_mean,RECRUIT_ID_LABEL_mean,recruit_person_MAJOR_score,PERSON_ID_cnt,POSITION_cnt,RECRUIT_ID_POSITION_nunique,RECRUIT_ID_PERSON_MAJOR_nunique,RECRUIT_ID_WORK_YEARS_mean,RECRUIT_ID_WORK_YEARS_max,RECRUIT_ID_WORK_YEARS_min,RECRUIT_ID_WORK_YEARS_std,CV_RECRUIT_LOCATION_equal,LOWER_EDU_HIGHEST_EDU_higher
0,772899,5413605,0.0,女,0,3.0,文秘,29,人力资源管理,通讯行业,广东省,热情大方，责任心强，工作认真负责，有较强的适应能力和学习能力，有扎实的专业功底,行政管理,深圳市,,30.0,2.0,2.0,26.5,53.0,1.0,4.0,旅游管理,1.0,福田区,2,168.0,-0.020093,-0.262906,-0.056557,0.194056,-0.11533,-0.099092,0.278635,-0.135564,-0.144507,0.282973,-0.190195,0.049953,0.125888,-0.37462,-0.016451,-0.065058,-0.025949,0.1739,0.355954,-0.235681,0.126304,-0.126924,0.041683,0.106642,-0.003075,0.228822,-0.081131,0.172357,0.190737,-0.294608,-1.0,0.147684,0.020161,0.026797,2,2240.0,121,124,6.5321,28,0,5.060721,False,False
1,795668,5219796,0.0,男,13,3.0,财政学（含税收学）,36,会计,医疗器械行业,宝安区,"本人诚恳、踏实、忠于信用、责任心强,工作勤奋认真、有较强的沟通、分析能力，并具有与同事良好协作的能力",会计,深圳市,制造业,7.0,2.0,2.0,158.0,316.0,,,,3.0,深圳市,3,21.0,-0.217036,-0.074155,-0.090927,-0.140628,0.054602,-0.104413,0.045719,0.267473,0.150557,-0.245592,-0.066818,0.031661,0.088511,0.080997,-0.120754,-0.14712,-0.119424,0.017087,-0.072285,0.087759,-0.076689,0.234719,0.33583,0.390114,0.238854,-0.333758,-0.151096,0.111547,-0.024619,0.367446,-1.0,0.147684,0.032258,-1.0,8,7194.0,11,15,17.38843,43,0,8.374935,True,False
2,773645,6208645,0.0,男,0,1.0,计算机应用技术,28,,,南山区,,汽车修理,深圳市,信息行业（IT/通讯/互联网）,7.0,1.0,1.0,101.0,101.0,2.0,2.0,汽车工程,2.0,深圳市,3,46.0,-0.30936,-0.023115,0.012245,-0.09055,0.07009,0.308661,-0.226925,-0.207737,0.019689,-0.060863,0.057828,0.148603,0.235077,-0.086942,-0.06163,0.221807,-0.025869,0.147118,-0.205001,0.042192,-0.150689,0.004426,0.212918,0.22135,-0.074678,0.110514,0.331468,0.313623,0.265447,-0.274928,-1.0,0.147684,0.0,0.261549,1,83.0,26,44,9.927461,33,0,8.40851,True,True
3,795526,6196384,0.0,女,6,3.0,文秘,29,客户服务,,罗湖区,自信，有责任感，有耐心，有干劲。善于总结和思考问题。对生活和工作有明确的目标和计划，并能够为之不断努力奋斗。成功就是把未来变成梦想，把梦想变为现实，一直在这样做。百度李彦宏说，生活可以走直线，深信不疑!不是最聪明的，但会用心去做的工作！有足够的信心帮你解决问题！,国际贸易/涉外业务,深圳市,商业零售行业,14.0,1.0,1.0,110.0,110.0,,,,1.0,福田区,1,81.0,-0.290974,-0.250667,-0.099687,0.224765,-0.070474,-0.102311,0.304896,0.286902,-0.111283,0.355839,-0.180672,-0.188631,0.11547,0.164261,-0.344643,0.21237,-0.020526,0.076332,-0.000754,-0.272077,-0.031857,-0.192921,0.051502,0.090361,-0.064331,0.032852,0.125393,0.026824,-0.196383,0.0189,-1.0,0.147684,0.005208,-1.0,5,6018.0,88,97,6.677188,22,0,4.694759,False,False
4,817993,5868175,0.0,男,13,1.0,电子信息科学与技术,34,售前/售后服务,,南山区,为人随和正直，积极乐观，有思想，有梦想。善于人际交往，有很强的团队合作意识。,售前/售后服务,深圳市,商业零售行业,7.0,1.0,1.0,89.0,89.0,,,,3.0,福田区,0,110.0,-0.256669,-0.044442,0.093839,-0.471277,0.131238,-0.013545,-0.368669,-0.019363,-0.035097,-0.00727,-0.12651,-0.225958,0.039176,0.226642,-0.029324,-0.133674,0.217693,0.158621,-0.2622,-0.137221,0.111625,-0.000705,0.300331,-0.075424,0.078619,0.007605,0.356642,0.069532,-0.027976,-0.006046,-1.0,0.147684,0.25,-1.0,1,1657.0,17,26,10.860465,26,0,5.792365,False,True


# 建模

In [57]:
for f in df_feature.select_dtypes('object'):
    le = LabelEncoder()
    print(f)
    df_feature[f] = le.fit_transform(df_feature[f].astype('str'))

GENDER
PERSON_MAJOR
LAST_POSITION
LAST_INDUSTRY
CURR_LOC
SELF_COMMENT
POSITION
CV_LOCATION
INDUSTRY
RECRUIT_MAJOR
RECRUIT_LOCATION


In [58]:
df_train = df_feature[df_feature['LABEL'].notnull()]
df_test = df_feature[df_feature['LABEL'].isnull()]

In [59]:
df_train.shape, df_test.shape

((35291, 71), (70774, 71))

In [60]:
ycol = 'LABEL'
feature_names = list(filter(lambda x: x not in [ycol], df_train.columns))

oof = []
lgbprediction = df_test[['RECRUIT_ID', 'PERSON_ID']]
lgbprediction['pred'] = 0
df_importance_list = []

model = lgb.LGBMClassifier(num_leaves=64,
                           max_depth=10,
                           learning_rate=0.08,
                           n_estimators=1000000,
                           subsample=0.8,
                           feature_fraction=0.8,
                           reg_alpha=0.5,
                           reg_lambda=0.5,
                           random_state=2021,
                           metric='auc')

kfold = StratifiedKFold(n_splits=10, random_state=seed, shuffle=True)
for fold_id, (trn_idx, val_idx) in enumerate(
        kfold.split(df_train[feature_names], df_train[ycol])):
    X_train = df_train.iloc[trn_idx][feature_names]
    Y_train = df_train.iloc[trn_idx][ycol]

    X_val = df_train.iloc[val_idx][feature_names]
    Y_val = df_train.iloc[val_idx][ycol]

    print('\nFold_{} Training ================================\n'.format(
        fold_id + 1))

    lgb_model = model.fit(X_train,
                          Y_train,
                          eval_names=['valid'],
                          eval_set=[(X_val, Y_val)],
                          verbose=100,
                          eval_metric='auc',
                          early_stopping_rounds=200)

    pred_val = lgb_model.predict_proba(X_val)
    df_oof = df_train.iloc[val_idx][['RECRUIT_ID', 'PERSON_ID', ycol]].copy()
    df_oof['pred'] = pred_val[:, 1]
    oof.append(df_oof)

    pred_test = lgb_model.predict_proba(df_test[feature_names])
    lgbprediction['pred'] += pred_test[:, 1]
tmplgb = lgbprediction




Training until validation scores don't improve for 200 rounds
[100]	valid's auc: 0.982601
[200]	valid's auc: 0.982202
Early stopping, best iteration is:
[82]	valid's auc: 0.982891


Training until validation scores don't improve for 200 rounds
[100]	valid's auc: 0.981722
[200]	valid's auc: 0.982555
[300]	valid's auc: 0.98248
[400]	valid's auc: 0.982745
[500]	valid's auc: 0.982453
Early stopping, best iteration is:
[376]	valid's auc: 0.982775


Training until validation scores don't improve for 200 rounds
[100]	valid's auc: 0.984484
[200]	valid's auc: 0.984372
Early stopping, best iteration is:
[83]	valid's auc: 0.984752


Training until validation scores don't improve for 200 rounds
[100]	valid's auc: 0.981773
[200]	valid's auc: 0.983095
[300]	valid's auc: 0.982962
[400]	valid's auc: 0.982682
Early stopping, best iteration is:
[250]	valid's auc: 0.983376


Training until validation scores don't improve for 200 rounds
[100]	valid's auc: 0.977055
[200]	valid's auc: 0.978047
[300]	valid

In [61]:
#检查特征重要性
df_importance = pd.DataFrame({
        'column': feature_names,
        'importance': lgb_model.feature_importances_,
    })
df_importance_list.append(df_importance)

df_importance = pd.concat(df_importance_list)
df_importance = df_importance.groupby(['column'])['importance'].agg('mean').sort_values(ascending=False).reset_index()
df_importance   

Unnamed: 0,column,importance
0,PERSON_ID,479
1,RECRUIT_ID_LABEL_mean,392
2,RECRUIT_ID,261
3,ACHIEVEMENT_LEN_MEAN,255
4,ACHIEVEMENT_LEN_SUM,223
5,PERSON_ID_cnt,215
6,POSITION_cnt,212
7,SELF_COMMENT,201
8,WORK_YEARS,194
9,POSITION,178


In [62]:
df_oof = pd.concat(oof)
df_oof.sort_values(['pred'], inplace=True, ascending=False)
df_oof.reset_index(drop=True, inplace=True)
df_oof['pred_label'] = 0
df_oof.loc[:int(0.153 * len(df_oof)), 'pred_label'] = 1
f1_lgb = f1_score(df_oof[ycol], df_oof['pred_label'])
auc = roc_auc_score(df_oof[ycol], df_oof['pred'])
f1_lgb, auc

(0.8697940295798929, 0.9799267660328369)

In [63]:
lgbprediction.sort_values(['pred'], inplace=True, ascending=False)
lgbprediction.reset_index(drop=True, inplace=True)
lgbprediction['LABEL'] = 0
lgbprediction.loc[:int(0.153 * len(lgbprediction)), 'LABEL'] = 1
lgbprediction['LABEL'].value_counts()

0    59945
1    10829
Name: LABEL, dtype: int64

In [64]:
os.makedirs('sub', exist_ok=True)
lgbprediction[['RECRUIT_ID', 'PERSON_ID', 'LABEL']].to_csv('submit/lgb{}.csv'.format(f1_lgb),
                                                        index=False)
lgbprediction[['RECRUIT_ID', 'PERSON_ID', 'LABEL']].to_csv('submit/submission.csv',
                                                        index=False)