In [1]:
pip install transformers

Looking in indexes: https://pypi.doubanio.com/simple/, https://mirrors.aliyun.com/pypi/simple/, https://pypi.tuna.tsinghua.edu.cn/simple/
Requirement already up-to-date: transformers in /root/.local/lib/python3.6/site-packages (4.6.1)
Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install pandarallel

Looking in indexes: https://pypi.doubanio.com/simple/, https://mirrors.aliyun.com/pypi/simple/, https://pypi.tuna.tsinghua.edu.cn/simple/
Requirement already up-to-date: pandarallel in /root/.local/lib/python3.6/site-packages (1.5.2)
Note: you may need to restart the kernel to use updated packages.


In [1]:
from transformers import AutoTokenizer
from transformers import AutoModel
import torch
import pandas as pd
from tqdm import tqdm
import numpy as np
import os
import pickle
from pandarallel import pandarallel
import warnings

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pandarallel.initialize()
warnings.filterwarnings('ignore')

INFO: Pandarallel will run on 48 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
os.makedirs('data/tmp', exist_ok=True)
os.makedirs('data/embedding', exist_ok=True)

In [3]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
class Encoder():
    def __init__(self,
                 sentences,
                 embeddings,
                 key_name,
                 prefix,
                 keys_sentences_map=None):
        self.sentences = sentences
        self.embeddings = embeddings
        self.key_name = key_name
        self.prefix = prefix

        if keys_sentences_map is not None:
            self.keys_sentences_map = keys_sentences_map
        else:
            self.keys_sentences_map = dict(zip(sentences, sentences))

        sentences_embeddings_map = dict(zip(sentences, embeddings))
        self.keys_embeddings_map = {}
        for key, sentence in self.keys_sentences_map.items():
            self.keys_embeddings_map[key] = sentences_embeddings_map[sentence]

    def get_embeddings(self, normalize=False):
        if normalize and self.keys_normalize_embeddings_map is not None:
            keys_embeddings_map = self.keys_normalize_embeddings_map
        else:
            keys_embeddings_map = self.keys_embeddings_map

        emb_size = len(list(keys_embeddings_map.values())[0])

        data_list = []
        for key, embedding in keys_embeddings_map.items():
            data_list.append([key] + list(embedding))

        df_emb = pd.DataFrame(data_list)
        df_emb.columns = [self.key_name] + [
            '{}_emb_{}'.format(self.prefix, i) for i in range(emb_size)
        ]

        return df_emb

    def get_embedding(self, key, normalize=False):
        try:
            if normalize and self.keys_normalize_embeddings_map is not None:
                return self.keys_normalize_embeddings_map[key]
            else:
                return self.keys_embeddings_map[key]
        except Exception:
            return None

    def transform_and_normalize(self, kernel, bias, n_components=None):
        """应用变换，然后标准化
        """
        if n_components is not None:
            kernel = kernel[:, :n_components]

        if not (kernel is None or bias is None):
            vecs = (self.embeddings + bias).dot(kernel)
        else:
            vecs = vecs

        vecs = vecs / (vecs**2).sum(axis=1, keepdims=True)**0.5

        sentences_embeddings_map = dict(zip(self.sentences, vecs))
        self.keys_normalize_embeddings_map = {}
        for key, sentence in self.keys_sentences_map.items():
            self.keys_normalize_embeddings_map[key] = sentences_embeddings_map[
                sentence]

In [5]:
def build_model(path):
    tokenizer = AutoTokenizer.from_pretrained(path)
    model = AutoModel.from_pretrained(path)
    model = model.to(DEVICE)
    return tokenizer, model

In [6]:
def sent_to_vec(sent, tokenizer, model, pooling, max_length):
    with torch.no_grad():
        inputs = tokenizer(sent,
                           return_tensors="pt",
                           padding=True,
                           truncation=True,
                           max_length=max_length)
        inputs['input_ids'] = inputs['input_ids'].to(DEVICE)
        inputs['token_type_ids'] = inputs['token_type_ids'].to(DEVICE)
        inputs['attention_mask'] = inputs['attention_mask'].to(DEVICE)

        hidden_states = model(**inputs,
                              return_dict=True,
                              output_hidden_states=True).hidden_states

        if pooling == 'first_last_avg':
            output_hidden_state = (hidden_states[-1] +
                                   hidden_states[1]).mean(dim=1)
        elif pooling == 'last_avg':
            output_hidden_state = (hidden_states[-1]).mean(dim=1)
        elif pooling == 'last2avg':
            output_hidden_state = (hidden_states[-1] +
                                   hidden_states[-2]).mean(dim=1)
        elif pooling == 'cls':
            output_hidden_state = (hidden_states[-1])[:, 0, :]
        else:
            raise Exception("unknown pooling {}".format(POOLING))

        vec = output_hidden_state.cpu().numpy()[0]
    return vec

In [7]:
def sents_to_vecs(sents, tokenizer, model, pooling, max_length, verbose=True):
    vecs = []
    if verbose:
        sents = tqdm(sents)
    for sent in sents:
        vec = sent_to_vec(sent, tokenizer, model, pooling, max_length)
        vecs.append(vec)
    assert len(sents) == len(vecs)
    vecs = np.array(vecs)
    return vecs

In [8]:
def compute_kernel_bias(vecs):
    """计算kernel和bias
    最后的变换：y = (x + bias).dot(kernel)
    """
    vecs = np.concatenate(vecs, axis=0)
    mu = vecs.mean(axis=0, keepdims=True)
    cov = np.cov(vecs.T)
    u, s, vh = np.linalg.svd(cov)
    W = np.dot(u, np.diag(s**0.5))
    W = np.linalg.inv(W.T)
    return W, -mu

In [9]:
# 加载模型
path = 'data/pretrain_models/ernie'
tokenizer, model = build_model(path)

Some weights of the model checkpoint at data/pretrain_models/ernie were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# 生成embedding

In [10]:
vecs_list = []
pooling = 'cls'
max_length = 128

In [16]:
# 招聘岗位信息的招聘职位
def get_job_title_encoder():
    try:
        vecs = np.load('data/tmp/job_title_vecs.npy')
        with open('data/tmp/job_title_encoder.txt', 'rb') as f:
            job_title_encoder = pickle.load(f)

    except Exception:
        df_recruit = pd.read_csv('data/trainset/recruit.csv')
        sentences = df_recruit['JOB_TITLE'].values.tolist()
        sentences = list(set(sentences))
        vecs = sents_to_vecs(sentences, tokenizer, model, pooling, max_length)
        job_title_encoder = Encoder(sentences, vecs, 'JOB_TITLE',
                                    'JOB_TITLE_ernie')

        np.save('data/tmp/job_title_vecs.npy', vecs)
        with open('data/tmp/job_title_encoder.txt', 'wb') as f:
            pickle.dump(job_title_encoder, f)

    return vecs, job_title_encoder


vecs, job_title_encoder = get_job_title_encoder()
vecs_list.append(vecs)

In [17]:
def major_clean(x):
    if type(x) == float:
        return x

    x = x.replace('【', '').replace('】', '')
    return x

In [18]:
# 招聘岗位信息的对应聘者的专业要求
def get_recruit_major_encoder():
    try:
        vecs = np.load('data/tmp/recruit_major_vecs.npy')
        with open('data/tmp/recruit_major_encoder.txt', 'rb') as f:
            recruit_major_encoder = pickle.load(f)

    except Exception:
        df_recruit = pd.read_csv('data/trainset/recruit.csv')
        df_recruit['MAJOR'].fillna('', inplace=True)
        df_recruit['MAJOR'] = df_recruit['MAJOR'].apply(major_clean)
        sentences = df_recruit['MAJOR'].values.tolist()
        sentences = list(set(sentences))
        vecs = sents_to_vecs(sentences, tokenizer, model, pooling, max_length)
        recruit_major_encoder = Encoder(sentences, vecs, 'MAJOR',
                                        'recruit_MAJOR_ernie')

        np.save('data/tmp/recruit_major_vecs.npy', vecs)
        with open('data/tmp/recruit_major_encoder.txt', 'wb') as f:
            pickle.dump(recruit_major_encoder, f)

    return vecs, recruit_major_encoder


vecs, recruit_major_encoder = get_recruit_major_encoder()
vecs_list.append(vecs)

In [19]:
#wf
def detail_clean(x):
    if type(x) == float:
        return x

    x = x.replace('****', '').replace('~', '').replace('-','').replace('')
    return x

In [20]:
#wf 招聘岗位信息的工作职责
def get_recruit_detail_encoder():
    try:
        vecs = np.load('data/tmp/recruit_detail_vecs.npy')
        with open('data/tmp/recruit_detail_encoder.txt', 'rb') as f:
            recruit_major_encoder = pickle.load(f)

    except Exception:
        df_recruit = pd.read_csv('data/trainset/recruit.csv')
        df_recruit['DETAIL'].fillna('', inplace=True)
        df_recruit['DETAIL'] = df_recruit['DETAIL'].apply(detail_clean)
        sentences = df_recruit['DETAIL'].values.tolist()
        sentences = list(set(sentences))
        vecs = sents_to_vecs(sentences, tokenizer, model, pooling, max_length)
        recruit_major_encoder = Encoder(sentences, vecs, 'DETAIL',
                                        'recruit_DETAIL_ernie')

        np.save('data/tmp/recruit_major_vecs.npy', vecs)
        with open('data/tmp/recruit_major_encoder.txt', 'wb') as f:
            pickle.dump(recruit_major_encoder, f)

    return vecs, recruit_major_encoder


vecs, recruit_major_encoder = get_recruit_major_encoder()
vecs_list.append(vecs)

In [21]:
# 求职者基本信息的应聘者专业
def get_person_major_encoder():
    try:
        vecs = np.load('data/tmp/person_major_vecs.npy')
        with open('data/tmp/person_major_encoder.txt', 'rb') as f:
            person_major_encoder = pickle.load(f)

    except Exception:
        df_person = pd.read_csv('data/trainset/person.csv')
        df_person['MAJOR'].fillna('', inplace=True)
        df_person['MAJOR'] = df_person['MAJOR'].apply(major_clean)
        sentences = df_person['MAJOR'].values.tolist()
        sentences = list(set(sentences))
        vecs = sents_to_vecs(sentences, tokenizer, model, pooling, max_length)
        person_major_encoder = Encoder(sentences, vecs, 'MAJOR',
                                       'person_MAJOR_ernie')

        np.save('data/tmp/person_major_vecs.npy', vecs)
        with open('data/tmp/person_major_encoder.txt', 'wb') as f:
            pickle.dump(person_major_encoder, f)

    return vecs, person_major_encoder


vecs, person_major_encoder = get_person_major_encoder()
vecs_list.append(vecs)

In [22]:
#wf
def lastposition_clean(x):
    if type(x) == float:
        return x

    x = x.replace('*', '').replace('/', '')
    return x

In [23]:
# wf求职者基本信息的应聘者上一份工作岗位
def get_person_lastposition_encoder():
    try:
        vecs = np.load('data/tmp/person_lastposition_vecs.npy')
        with open('data/tmp/person_lastposition_encoder.txt', 'rb') as f:
            person_lastposition_encoder = pickle.load(f)

    except Exception:
        df_person = pd.read_csv('data/trainset/person.csv')
        df_person['LAST_POSITION'].fillna('', inplace=True)
        df_person['LAST_POSITION'] = df_person['LAST_POSITION'].apply(lastposition_clean)
        sentences = df_person['LAST_POSITION'].values.tolist()
        sentences = list(set(sentences))
        vecs = sents_to_vecs(sentences, tokenizer, model, pooling, max_length)
        person_lastposition_encoder = Encoder(sentences, vecs, 'LAST_POSITION',
                                       'person_LAST_POSITION_ernie')

        np.save('data/tmp/person_lastposition_vecs.npy', vecs)
        with open('data/tmp/person_lastposition_encoder.txt', 'wb') as f:
            pickle.dump(person_lastposition_encoder, f)

    return vecs, person_lastposition_encoder


vecs, person_lasposition_encoder = get_person_lastposition_encoder()
vecs_list.append(vecs)

In [24]:
#wf
def cvposition_clean(x):
    if type(x) == float:
        return x

    x = x.replace('*', '').replace('/', '')
    return x

In [25]:
# wf求职者简历信息的投递职位
def get_person_cvposition_encoder():
    try:
        vecs = np.load('data/tmp/person_cvposition_vecs.npy')
        with open('data/tmp/person_cvposition_encoder.txt', 'rb') as f:
            person_cvposition_encoder = pickle.load(f)

    except Exception:
        df_person = pd.read_csv('data/trainset/person_cv.csv')
        df_person['POSITION'].fillna('', inplace=True)
        df_person['POSITION'] = df_person['POSITION'].apply(cvposition_clean)
        tmp = df_person.groupby('PERSON_ID')['POSITION'].apply(lambda x:x.str.cat(sep='')).reset_index()
        sentences = tmp.values.tolist()
        sentences = list(set(sentences))
        vecs = sents_to_vecs(sentences, tokenizer, model, pooling, max_length)
        person_cvposition_encoder = Encoder(sentences, vecs, 'POSITION',
                                       'person_POSITION_ernie')

        np.save('data/tmp/person_cvposition_vecs.npy', vecs)
        with open('data/tmp/person_cvposition_encoder.txt', 'wb') as f:
            pickle.dump(person_cvposition_encoder, f)

    return vecs, person_cvposition_encoder


vecs, person_cvposition_encoder = get_person_cvposition_encoder()
vecs_list.append(vecs)

In [26]:
#求职者投递简历的自我介绍
# wf
def get_person_selfcomment_encoder():
    try:
        vecs = np.load('data/tmp/person_selfcomment_vecs.npy')
        with open('data/tmp/person_selfcomment_encoder.txt', 'rb') as f:
            person_selfcomment_encoder = pickle.load(f)

    except Exception:
        df_person = pd.read_csv('data/trainset/person_cv.csv')
        
        #tmp = df_person.groupby(['PERSON_ID'])['PRO_CERT_DSP'].apply(lambda x:x.str.cat(sep=' ')).reset_index()
        #tmp.columns = ['PERSON_ID','PRO_CERT_DSP']
        #tmp['PRO_CERT_DSP'].fillna('', inplace=True)
        #sentences = tmp['PRO_CERT_DSP'].values.tolist()
        df_person['SELF_COMMENT'].fillna('', inplace=True)
        sentences = df_person['SELF_COMMENT'].values.tolist()
        sentences = list(set(sentences))

        vecs = sents_to_vecs(sentences, tokenizer, model, pooling, max_length)
        person_selfcomment_encoder = Encoder(sentences, vecs, 'SELF_COMMENT',
                                       'person_SELF_COMMENT_ernie')

        np.save('data/tmp/person_selfcomment_vecs.npy', vecs)
        with open('data/tmp/person_selfcomment_encoder.txt', 'wb') as f:
            pickle.dump(person_selfcomment_encoder, f)

    return vecs, person_selfcomment_encoder


vecs, person_selfcomment_encoder = get_person_selfcomment_encoder()
#vecs_list.append(vecs)

In [27]:
vecs_list

[array([[ 8.1258273e-01, -1.4553578e-01, -5.2823836e-01, ...,
         -5.5799925e-01, -1.1621797e+00,  2.7167436e-01],
        [ 1.4431961e+00,  4.8142123e-01, -8.5353512e-01, ...,
         -1.3246323e-01, -5.6695729e-01,  3.9856920e-01],
        [ 1.2136361e-01,  4.9559134e-01, -8.1867927e-01, ...,
         -3.1611624e-01, -9.8271406e-01, -4.7156900e-01],
        ...,
        [ 7.3841190e-01,  2.6713213e-01, -6.7786264e-01, ...,
          1.6303787e-01, -1.2209930e+00,  1.4365354e-01],
        [ 7.5565451e-01,  1.1729981e-02,  8.7025249e-01, ...,
         -3.5121053e-01, -5.8748102e-01, -7.0887886e-02],
        [ 8.9640278e-01,  8.4413867e-04, -4.0254822e-01, ...,
          2.4465663e-02, -9.9738163e-01,  8.2489550e-01]], dtype=float32),
 array([[-0.68353266,  0.0651826 , -0.6007641 , ..., -0.0736664 ,
         -0.14644288,  0.84311026],
        [ 0.44478586, -0.28655863, -0.10807729, ..., -0.90000695,
         -0.52228886,  0.54808503],
        [ 1.0355208 ,  0.40055448,  0.22379814

# BERT-whitening
https://kexue.fm/archives/8321

In [50]:
kernel, bias = compute_kernel_bias(vecs_list)

# 保存embedding

In [51]:
job_title_encoder.transform_and_normalize(kernel, bias, 30)
job_title_embeddings = job_title_encoder.get_embeddings(True)
job_title_embeddings.to_pickle('data/embedding/job_title.pkl')

In [52]:
recruit_major_encoder.transform_and_normalize(kernel, bias, 30)
recruit_major_embeddings = recruit_major_encoder.get_embeddings(True)
recruit_major_embeddings.to_pickle('data/embedding/recruit_major.pkl')

In [53]:
person_major_encoder.transform_and_normalize(kernel, bias, 30)
person_major_embeddings = person_major_encoder.get_embeddings(True)
person_major_embeddings.to_pickle('data/embedding/person_major.pkl')

In [54]:
person_major_embeddings.head()

Unnamed: 0,MAJOR,person_MAJOR_ernie_emb_0,person_MAJOR_ernie_emb_1,person_MAJOR_ernie_emb_2,person_MAJOR_ernie_emb_3,person_MAJOR_ernie_emb_4,person_MAJOR_ernie_emb_5,person_MAJOR_ernie_emb_6,person_MAJOR_ernie_emb_7,person_MAJOR_ernie_emb_8,person_MAJOR_ernie_emb_9,person_MAJOR_ernie_emb_10,person_MAJOR_ernie_emb_11,person_MAJOR_ernie_emb_12,person_MAJOR_ernie_emb_13,person_MAJOR_ernie_emb_14,person_MAJOR_ernie_emb_15,person_MAJOR_ernie_emb_16,person_MAJOR_ernie_emb_17,person_MAJOR_ernie_emb_18,person_MAJOR_ernie_emb_19,person_MAJOR_ernie_emb_20,person_MAJOR_ernie_emb_21,person_MAJOR_ernie_emb_22,person_MAJOR_ernie_emb_23,person_MAJOR_ernie_emb_24,person_MAJOR_ernie_emb_25,person_MAJOR_ernie_emb_26,person_MAJOR_ernie_emb_27,person_MAJOR_ernie_emb_28,person_MAJOR_ernie_emb_29
0,,0.013635,0.020516,-0.057895,-0.023926,0.175738,0.164491,0.016124,-0.110692,0.045256,0.148158,0.066279,-0.289184,-0.115,-0.091854,0.140595,-0.233214,-0.377075,0.047243,0.001328,-0.053626,0.026738,0.321371,-0.043652,-0.307297,0.060946,-0.27302,0.047547,-0.126086,0.102405,-0.512496
1,海洋生物学,0.210712,0.30761,-0.223881,-0.1088,0.065418,-0.162131,0.227934,-0.04488,-0.280816,0.1057,0.145195,-0.073067,0.178802,0.053919,-0.089757,-0.002913,0.160066,0.160978,0.330783,-0.09473,0.493963,-0.118945,-0.048793,0.127638,-0.246648,0.155616,0.008727,-0.082394,0.114455,-0.05768
2,水力学及河流动力学,0.427427,0.193288,-0.25996,-0.169964,0.056329,-0.019772,0.016125,-0.008364,-0.187139,0.393858,0.095704,-0.066989,0.215284,0.280064,0.132729,0.114047,-0.141971,0.052865,0.099448,0.055683,-0.131634,0.185809,-0.011383,-0.038974,-0.143077,0.277509,0.250184,-0.150335,-0.11045,0.189922
3,中国哲学,0.08137,0.195754,-0.173301,-0.339289,0.155924,0.241601,0.040983,0.115527,-0.484167,-0.147083,-0.125517,-0.006267,0.065417,0.121235,-0.098301,0.218872,0.101693,-0.120968,0.266713,0.215141,-0.087461,-0.330836,0.029399,0.063886,0.050128,-0.02068,0.285808,-0.117418,-0.019004,-0.024318
4,电子自动化,-0.031067,0.260592,-0.05397,0.256489,-0.023287,-0.02974,-0.042982,0.142532,0.331002,0.047029,-0.033678,0.051545,-0.363207,-0.11446,0.261854,0.139233,-0.23106,-0.069551,-0.181619,0.09989,0.256754,-0.228839,0.375351,-0.018511,-0.017693,0.240307,-0.188084,-0.128614,-0.037417,-0.134949


In [55]:
person_lasposition_encoder.transform_and_normalize(kernel, bias, 30)
person_lasposition_embeddings = person_lasposition_encoder.get_embeddings(True)
person_lasposition_embeddings.to_pickle('data/embedding/person_lasposition.pkl')

In [56]:
person_cvposition_encoder.transform_and_normalize(kernel, bias, 30)
person_cvposition_embeddings = person_cvposition_encoder.get_embeddings(True)
person_cvposition_embeddings.to_pickle('data/embedding/person_cvposition.pkl')

In [57]:
# person_selfcomment_encoder.transform_and_normalize(kernel, bias, 30)
# person_selfcomment_embeddings = person_selfcomment_encoder.get_embeddings(True)
# person_selfcomment_embeddings.to_pickle('data/embedding/person_selfcomment.pkl')

# 计算匹配度

In [58]:
df_train = pd.read_csv('data/trainset/recruit_folder.csv')
df_test = pd.read_csv('data/testset/recruit_folder.csv')
df_test['LABEL'] = np.nan
df_feature = df_train.append(df_test, sort=False)
df_recruit = pd.read_csv('data/trainset/recruit.csv')
df_feature = df_feature.merge(df_recruit[['RECRUIT_ID', 'MAJOR']],
                              how='left',
                              on='RECRUIT_ID')
df_feature.rename({'MAJOR': 'recruit_MAJOR'}, axis=1, inplace=True)
df_person = pd.read_csv('data/trainset/person.csv')
df_feature = df_feature.merge(df_person[['PERSON_ID', 'MAJOR']],
                              how='left',
                              on='PERSON_ID')
df_feature.rename({'MAJOR': 'person_MAJOR'}, axis=1, inplace=True)

In [59]:
df_feature.head()

Unnamed: 0,RECRUIT_ID,PERSON_ID,LABEL,recruit_MAJOR,person_MAJOR
0,825081,6256839,0.0,工业自动化,
1,772899,5413605,0.0,旅游管理,文秘
2,795668,5219796,0.0,,财政学（含税收学）
3,769754,5700693,0.0,,计算机应用技术
4,773645,6208645,0.0,汽车工程,计算机应用技术


In [60]:
def consine(vector1, vector2):
    if type(vector1) != np.ndarray or type(vector2) != np.ndarray:
        return -1
    distance = np.dot(vector1, vector2) / \
        (np.linalg.norm(vector1)*(np.linalg.norm(vector2)))
    return distance

In [61]:
df_feature['recruit_person_MAJOR_score'] = df_feature[[
    'recruit_MAJOR', 'person_MAJOR'
]].apply(lambda x: consine(
    recruit_major_encoder.get_embedding(x['recruit_MAJOR'], True),
    person_major_encoder.get_embedding(x['person_MAJOR'], True)),
         axis=1)

In [62]:
df_feature[['RECRUIT_ID', 'PERSON_ID',
            'recruit_person_MAJOR_score']].to_pickle('data/score.pkl')

In [63]:
#wf 计算上一份工作职位和当前工作职位的匹配度
df_train = pd.read_csv('data/trainset/recruit_folder.csv')
df_test = pd.read_csv('data/testset/recruit_folder.csv')
df_test['LABEL'] = np.nan
df_feature = df_train.append(df_test, sort=False)
df_recruit = pd.read_csv('data/trainset/recruit.csv')
df_feature = df_feature.merge(df_recruit[['RECRUIT_ID', 'JOB_TITLE']],
                              how='left',
                              on='RECRUIT_ID')
#df_feature.rename({'MAJOR': 'recruit_MAJOR'}, axis=1, inplace=True)
df_person = pd.read_csv('data/trainset/person.csv')
df_feature = df_feature.merge(df_person[['PERSON_ID', 'LAST_POSITION']],
                              how='left',
                              on='PERSON_ID')
#df_feature.rename({'MAJOR': 'person_MAJOR'}, axis=1, inplace=True)


In [64]:
df_feature.head()

Unnamed: 0,RECRUIT_ID,PERSON_ID,LABEL,JOB_TITLE,LAST_POSITION
0,825081,6256839,0.0,非标结构工程师,机械设计
1,772899,5413605,0.0,航空机票售票员,人力资源管理
2,795668,5219796,0.0,会计主管,会计
3,769754,5700693,0.0,普工/技术人员,*公关/营销/业务类
4,773645,6208645,0.0,拖车司机,


In [65]:
df_feature['recruit_person_POSITION_score'] = df_feature[[
    'JOB_TITLE', 'LAST_POSITION'
]].apply(lambda x: consine(
    recruit_major_encoder.get_embedding(x['JOB_TITLE'], True),
    person_lasposition_encoder.get_embedding(x['LAST_POSITION'], True)),
         axis=1)

In [66]:
df_feature[['RECRUIT_ID', 'PERSON_ID',
            'recruit_person_POSITION_score']].to_pickle('data/position_score.pkl')

In [67]:
#wf 计算上一份工作职位和投递的工作职位的匹配度
df_train = pd.read_csv('data/trainset/recruit_folder.csv')
df_test = pd.read_csv('data/testset/recruit_folder.csv')
df_test['LABEL'] = np.nan
df_feature = df_train.append(df_test, sort=False)
df_recruit = pd.read_csv('data/trainset/person_cv.csv')
df_feature = df_feature.merge(df_recruit[['PERSON_ID', 'POSITION']],
                              how='left',
                              on='PERSON_ID')
#df_feature.rename({'MAJOR': 'recruit_MAJOR'}, axis=1, inplace=True)
df_person = pd.read_csv('data/trainset/person.csv')
df_feature = df_feature.merge(df_person[['PERSON_ID', 'LAST_POSITION']],
                              how='left',
                              on='PERSON_ID')

In [68]:
df_feature.head()

Unnamed: 0,RECRUIT_ID,PERSON_ID,LABEL,POSITION,LAST_POSITION
0,825081,6256839,0.0,*机械类,机械设计
1,772899,5413605,0.0,行政管理,人力资源管理
2,795668,5219796,0.0,会计,会计
3,769754,5700693,0.0,技术支持,*公关/营销/业务类
4,773645,6208645,0.0,汽车修理,


In [69]:
df_feature['personcv_person_POSITION_score'] = df_feature[[
    'POSITION', 'LAST_POSITION'
]].apply(lambda x: consine(
    person_cvposition_encoder.get_embedding(x['POSITION'], True),
    person_lasposition_encoder.get_embedding(x['LAST_POSITION'], True)),
         axis=1)

In [70]:
df_feature[['PERSON_ID',
            'personcv_person_POSITION_score']].to_pickle('data/peroncv_person_position_score.pkl')

In [72]:
print(df_feature.head())

   RECRUIT_ID  PERSON_ID  LABEL POSITION LAST_POSITION  \
0      825081    6256839    0.0     *机械类          机械设计   
1      772899    5413605    0.0     行政管理        人力资源管理   
2      795668    5219796    0.0      会计            会计    
3      769754    5700693    0.0     技术支持    *公关/营销/业务类   
4      773645    6208645    0.0     汽车修理           NaN   

   personcv_person_POSITION_score  
0                       -1.000000  
1                        0.656679  
2                        1.000000  
3                       -1.000000  
4                       -1.000000  
