In [1]:
import os
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
import numpy as np
import random
# Seed Python's built-in `random` module, which the random.sample calls later in
# this notebook draw from.
# NOTE(review): NumPy's global generator is not seeded here.
random.seed(2023)

In [2]:
def _read_table(path):
    """Load one tab-separated table and replace every missing value with ''."""
    table = pd.read_csv(path, sep='\t')
    table.fillna('', inplace=True)
    return table


# The three raw tables; missing cells become empty strings so that later cells
# can call string methods such as .strip()/.split() on them directly.
df_user = _read_table('table1_user_processed.txt')
df_jd = _read_table('table2_jd_processed.txt')
df_action = _read_table('table3_action_processed.txt')

In [6]:
# Output directories: `kg` receives the integer-id node/edge CSVs and meta.yaml,
# `kg_text` receives the human-readable entity names.
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create
# race of `if not os.path.exists(...): os.mkdir(...)`.
kg_out_dir = 'kg'
os.makedirs(kg_out_dir, exist_ok=True)

kg_text_out_dir = 'kg_text'
os.makedirs(kg_text_out_dir, exist_ok=True)

### User

city

In [4]:
# Raw city id -> dense node index, assigned in order of first appearance.
city2index = {}


def _city_node(raw_city_id):
    """Return the node index of a city id, registering it on first sight."""
    return city2index.setdefault(int(raw_city_id), len(city2index))


def _desired_city_nodes(raw_city_ids):
    """Convert a comma-separated list of city ids into comma-separated node indices.

    Empty entries and the '-' placeholder are skipped.
    """
    wanted = [token for token in raw_city_ids.split(',') if token not in ('', '-')]
    return ','.join(str(_city_node(token)) for token in wanted)


# Order matters: ids are handed out while scanning the user's current city, then
# the user's desired cities, then the job city.
df_user['KG_cur_city'] = df_user['live_city_id'].apply(_city_node)
df_user['KG_desire_city'] = df_user['desire_jd_city_id'].apply(_desired_city_nodes)
df_jd['KG_job_city'] = df_jd['city'].apply(_city_node)
    

industry


In [5]:
# Industry name -> dense node index, assigned in order of first appearance.
industry2index = {}


def _industry_node(raw_name):
    """Return the node index of one industry name, or -1 when the cell is blank."""
    name = raw_name.strip()
    if name == '':
        return -1
    return industry2index.setdefault(name, len(industry2index))


def _desired_industry_nodes(raw_names):
    """Convert a comma-separated industry list into comma-separated node indices.

    A blank cell yields the '-1' placeholder; blank entries inside a list are skipped.
    """
    raw_names = raw_names.strip()
    if raw_names == '':
        return '-1'
    names = (part.strip() for part in raw_names.split(','))
    return ','.join(
        str(industry2index.setdefault(name, len(industry2index)))
        for name in names
        if name != ''
    )


# Current industry is indexed before the desired industries so ids follow the
# same first-seen order as a row-by-row scan of each column in turn.
df_user['KG_cur_industry'] = df_user['cur_industry_id'].apply(_industry_node)
df_user['KG_desire_industry'] = df_user['desire_jd_industry_id'].apply(_desired_industry_nodes)

salary:
0000000000	面议
0000001000	1000元以下
0100002000	1000-2000元/月
0200104000	2001-4000元/月
0400106000	4001-6000元/月
0600108000	6001-8000元/月
0800110000	8001-10000元/月
100001150000	100000元以上
1000115000	10001-15000元/月
1500120000	15000-20000元
1500125000	15000-25000元/月
2000130000	20000-30000元
2500199999	25000元/月以上
3000150000	30000-50000元
3500150000	35000-50000元/月
5000170000	50000-70000元/月
70001100000	70000-100000元/月
2500135000	25000-35000元/月

In [6]:
# Salary-bucket code -> (lower bound, upper bound), transcribed from the salary
# table listed in the notes above this cell. The code is read as an integer, so
# leading zeros are dropped (e.g. "0100002000" -> 100002000).
# Code 0 ("negotiable") is intentionally absent: lookups treat codes that are
# not in these dicts as "no salary information".
# The buckets 1500120000, 2000130000 and 3000150000 appear in that table but
# were missing from the lookup, which silently dropped rows using them.
_SALARY_BUCKETS = {
    1000: (0, 1000),
    100002000: (1000, 2000),
    200104000: (2000, 4000),
    400106000: (4000, 6000),
    600108000: (6000, 8000),
    800110000: (8000, 10000),
    1000115000: (10000, 15000),
    1500120000: (15000, 20000),
    1500125000: (15000, 25000),
    2000130000: (20000, 30000),
    2500135000: (25000, 35000),
    2500199999: (25000, 99999),
    3000150000: (30000, 50000),
    3500150000: (35000, 50000),
    5000170000: (50000, 70000),
    70001100000: (70000, 100000),
    100001150000: (100000, 150000),
}

# Both public lookups are derived from the single table so they cannot drift apart.
min_salary_dict = {code: bounds[0] for code, bounds in _SALARY_BUCKETS.items()}
max_salary_dict = {code: bounds[1] for code, bounds in _SALARY_BUCKETS.items()}



In [7]:
# Salary amount -> dense node index, assigned in order of first appearance.
salary2index = {}


def _salary_node(amount):
    """Return the node index of a salary amount, registering it on first sight."""
    return salary2index.setdefault(amount, len(salary2index))


def _bucket_nodes(raw_code):
    """Map a raw salary-bucket code to (min_node, max_node).

    Blank and '-' cells are read as code 0; any code missing from
    min_salary_dict yields (-1, -1).
    """
    code = 0 if raw_code in {'-', ''} else int(raw_code)
    if code not in min_salary_dict:
        return -1, -1
    # The lower bound is registered before the upper bound.
    low_node = _salary_node(min_salary_dict[code])
    high_node = _salary_node(max_salary_dict[code])
    return low_node, high_node


def _round_to_thousand(amount):
    """Round half up to the nearest multiple of 1000."""
    return int(amount / 1000 + 0.5) * 1000


# Users: current salary bucket first, then desired salary bucket.
_cur_nodes = [_bucket_nodes(code) for code in df_user['cur_salary_id']]
df_user['KG_cur_min_salary'] = [pair[0] for pair in _cur_nodes]
df_user['KG_cur_max_salary'] = [pair[1] for pair in _cur_nodes]

_desire_nodes = [_bucket_nodes(code) for code in df_user['desire_jd_salary_id']]
df_user['KG_desire_min_salary'] = [pair[0] for pair in _desire_nodes]
df_user['KG_desire_max_salary'] = [pair[1] for pair in _desire_nodes]

print(f"num of salary in df_user: {len(salary2index)}")

# Jobs: rows whose max_salary is 0 keep -1 in both columns; otherwise the
# rounded maximum is registered before the rounded minimum.
_job_min_nodes = []
_job_max_nodes = []
for job_min, job_max in zip(df_jd['min_salary'], df_jd['max_salary']):
    if job_max == 0:
        _job_min_nodes.append(-1)
        _job_max_nodes.append(-1)
        continue
    high_node = _salary_node(_round_to_thousand(job_max))
    low_node = _salary_node(_round_to_thousand(job_min))
    _job_min_nodes.append(low_node)
    _job_max_nodes.append(high_node)

df_jd['KG_min_salary'] = _job_min_nodes
df_jd['KG_max_salary'] = _job_max_nodes

print(f"num of salary in df_user+df_jd: {len(salary2index)}")
    

num of salary in df_user: 15
num of salary in df_user+df_jd: 53


type

In [8]:
# Job-type name -> dense node index, assigned in order of first appearance.
type2index = {}


def _jdtype_node(raw_name, missing_markers=('',)):
    """Return the node index of one job-type name, or -1 for a missing marker."""
    name = raw_name.strip()
    if name in missing_markers:
        return -1
    return type2index.setdefault(name, len(type2index))


def _desired_jdtype_nodes(raw_names):
    """Convert a comma-separated job-type list into comma-separated node indices.

    A blank cell yields the '-1' placeholder; blank entries inside a list are skipped.
    """
    raw_names = raw_names.strip()
    if raw_names == '':
        return '-1'
    names = (part.strip() for part in raw_names.split(','))
    return ','.join(
        str(type2index.setdefault(name, len(type2index)))
        for name in names
        if name != ''
    )


df_user['KG_cur_jdtype'] = df_user['cur_jd_type'].apply(_jdtype_node)
df_user['KG_desire_jdtype'] = df_user['desire_jd_type_id'].apply(_desired_jdtype_nodes)

print(f"num of job types in df_user: {len(type2index)}")

# In the job table the literal string '\N' also marks a missing value.
df_jd['KG_job_jdtype'] = df_jd['jd_sub_type'].apply(
    lambda raw_name: _jdtype_node(raw_name, ('', '\\N'))
)

print(f"num of job types in df_user+df_jd: {len(type2index)}")

num of job types in df_user: 558
num of job types in df_user+df_jd: 678


degree

In [9]:
# Degree labels treated as "no information".
noisy_degree = {'其他', '请选择', '\\N', 'na', ''}

# Degree name -> dense node index, assigned in order of first appearance.
degree2index = {}


def _degree_node(raw_label):
    """Return the node index of a degree label, or -1 when the label is noise."""
    label = raw_label.strip()
    if label in noisy_degree:
        return -1
    return degree2index.setdefault(label, len(degree2index))


# Users are scanned before jobs, which fixes the id order.
df_user['KG_cur_degree'] = df_user['cur_degree_id'].apply(_degree_node)
df_jd['KG_require_degree'] = df_jd['min_edu_level'].apply(_degree_node)

year

In [10]:
# Raw `min_years` code -> required years of experience.
# NOTE(review): the codes look like packed "from/to" ranges (e.g. 305 -> 4,
# 510 -> 7) — confirm against the data dictionary.
min_year_dict = {
    -1: 0,
    0: 0,
    1: 1,
    103: 2,
    110: 1,
    199: 1,
    299: 2,
    305: 4,
    399: 4,
    510: 7,
    599: 7,
    1099: 10,
}

# Years of work experience relative to 2019; a '-' start date is read as 2018
# (one year), and the result is capped at 10.
_start_year = df_user["start_work_date"].apply(lambda x: 2018 if x == "-" else int(x))
df_user['KG_cur_year'] = (2019 - _start_year).clip(upper=10)


In [11]:
# Required experience per job: translate the raw `min_years` code through
# min_year_dict, treating unknown codes as 0 years.
df_jd['KG_require_year'] = df_jd['min_years'].apply(lambda code: min_year_dict.get(code, 0))

# Year value -> dense node index, assigned in order of first appearance.
year2index = {}


def _year_node(year):
    """Return the node index of a year value, registering it on first sight."""
    return year2index.setdefault(year, len(year2index))


# Users first (negative values are clamped to 0), then jobs.
# NOTE: both columns are overwritten with node indices, so this cell must only
# be run once after the columns are (re)computed.
df_user['KG_cur_year'] = [_year_node(0 if year < 0 else year) for year in df_user['KG_cur_year']]
df_jd['KG_require_year'] = [_year_node(year) for year in df_jd['KG_require_year']]

experience

In [12]:
from gensim.models import KeyedVectors

# Location of the pre-trained word vectors (binary word2vec format).
# The path can be overridden with the TENCENT_EMBEDDING_PATH environment
# variable so the notebook runs on other machines; the default is the
# original hard-coded location.
TENCENT_EMBEDDING_PATH = os.environ.get(
    'TENCENT_EMBEDDING_PATH',
    '/data1/laikaihuang/word_embedding/tencent-ailab-embedding-zh-d200-v0.2.0/tencent-ailab-embedding-zh-d200-v0.2.0_refine.bin',
)
wv_from_text = KeyedVectors.load_word2vec_format(TENCENT_EMBEDDING_PATH, binary=True)

In [13]:
def _clean_experience(raw, max_terms=30):
    """Normalise one '|'-separated experience string.

    Steps: strip each term and drop blanks, de-duplicate, keep only terms that
    exist in the word-vector vocabulary, then draw at most `max_terms` of them
    with random.sample.

    De-duplication uses dict.fromkeys (first-seen order) instead of set():
    iteration order of a set of strings changes between interpreter runs, which
    made the sample irreproducible even with random.seed set.
    """
    stripped = (term.strip() for term in raw.split('|'))
    unique_terms = dict.fromkeys(term for term in stripped if term != '')
    known_terms = [term for term in unique_terms if term in wv_from_text.key_to_index]
    # if more than max_terms, randomly sample max_terms (shorter lists are shuffled)
    return '|'.join(random.sample(known_terms, min(max_terms, len(known_terms))))


df_user['experience'] = df_user['experience'].apply(_clean_experience)

In [14]:
# Experience terms that carry no information.
noisy_exp = {'互联网参考模型osi七层', '其他', '请选择', '\\N', 'na', ''}
# Add every single ASCII letter, lower- and upper-case.
noisy_exp.update(
    chr(first_letter + offset)
    for first_letter in (ord('a'), ord('A'))
    for offset in range(26)
)
    
def is_number(s):
    """Return True when `s` looks like a number.

    A string counts as a number if str.isnumeric() accepts it or if float()
    can parse it (so signs, decimals and exponents are accepted too).
    """
    if s.isnumeric():
        return True
    try:
        float(s)
    except ValueError:
        return False
    return True

import json

# Assumes city.json maps each key to a non-empty list of records that carry
# 'province' and 'name' fields — this is what the indexing below requires.
with open('city.json', 'r', encoding='utf-8') as city_file:
    city_data = json.load(city_file)

provinces = list({records[0]['province'] for records in city_data.values()})
cities = [record['name'] for records in city_data.values() for record in records]

# Place names plus a few generic words that should never become keywords.
noisy = set(provinces) | set(cities)
# Also add every name with its last character removed
# (presumably to drop a trailing suffix character — TODO confirm).
noisy |= {name[:-1] for name in noisy}
noisy |= {'经营范围', '市值', '排名', '其他', '要求', '学历', '待遇', '处理'}

def is_location(s):
    """Return True when `s` ends with '市', '省' or '县', or is listed in `noisy`."""
    return s.endswith(('市', '省', '县')) or s in noisy

# Experience term -> dense node index, assigned in order of first appearance.
exp2index = {}


def _user_experience_nodes(raw):
    """Convert a '|'-separated experience string into comma-separated node indices.

    Terms that are noise words, numbers or locations are dropped. Duplicates
    are removed with dict.fromkeys (first-seen order) rather than set(): set
    iteration order for strings varies between interpreter runs, which made
    the experience node ids differ from run to run.
    """
    stripped = (term.strip() for term in raw.split('|'))
    kept = dict.fromkeys(
        term
        for term in stripped
        if term not in noisy_exp and not is_number(term) and not is_location(term)
    )
    return ','.join(str(exp2index.setdefault(term, len(exp2index))) for term in kept)


# Users without any usable term get '' (not '-1') in this column.
df_user['KG_cur_experience'] = df_user['experience'].apply(_user_experience_nodes)

print(f"exp in users: {len(exp2index)}")
user_exp = set(exp2index.keys())

exp in users: 5507


In [15]:
# Job table with the GPT-extracted `skill_keyword` column; missing cells become ''.
df_jd_gpt = pd.read_csv('table2_jd_processed_gpt.txt', sep='\t').fillna('')
# Post-process the keywords extracted by GPT.
# Step 1: normalise the separators.
def unify_keyword(keyword):
    """Replace each of the separators '、', '，', '；', ';' and '&' with '|'."""
    separators = '、，；;&'
    return keyword.translate(str.maketrans(separators, '|' * len(separators)))

# Store the separator-normalised keywords in a new `keyword_proc` column.
df_jd_gpt['keyword_proc'] = df_jd_gpt['skill_keyword'].progress_apply(unify_keyword)

# Step 2: drop company names and malformed fragments.
def clean_keyword(keyword):
    """Filter a '|'-separated keyword string.

    Removes blank and numeric keywords, keywords ending in '公司', '集团' or
    '县', and keywords containing '。', ':', '：', '【' or '】'. Duplicates are
    removed while keeping first-seen order; the previous set()-based
    de-duplication returned the keywords in an order that changed between
    interpreter runs.
    """
    banned_suffixes = ('公司', '集团', '县')
    banned_chars = ('。', ':', '：', '【', '】')
    stripped = (k.strip() for k in keyword.split('|'))
    kept = [
        k
        for k in stripped
        if k != ''
        and not is_number(k)
        and not k.endswith(banned_suffixes)
        and not any(ch in k for ch in banned_chars)
    ]
    return '|'.join(dict.fromkeys(kept))

# Apply the company-name / fragment filter to every row.
df_jd_gpt['keyword_proc'] = df_jd_gpt['keyword_proc'].progress_apply(clean_keyword)

# Step 3: drop place names and a few meaningless words.
def clean_noisy(keyword):
    """Drop keywords listed in `noisy` or with 10 or more characters.

    Duplicates are removed while keeping first-seen order; the previous
    set()-based de-duplication produced an order that changed between
    interpreter runs.
    """
    kept = [k for k in keyword.split('|') if k not in noisy and len(k) < 10]
    return '|'.join(dict.fromkeys(kept))

# Apply the place-name / noise-word filter to every row.
df_jd_gpt['keyword_proc'] = df_jd_gpt['keyword_proc'].progress_apply(clean_noisy)

# Step 4: drop keywords that do not occur as a substring of the job description.
df_jd_gpt['keyword_proc'] = df_jd_gpt.progress_apply(lambda x: '|'.join([k for k in x['keyword_proc'].split('|') if k in x['job_description']]), axis=1)

100%|██████████| 19114/19114 [00:00<00:00, 631323.89it/s]
100%|██████████| 19114/19114 [00:00<00:00, 48784.01it/s]
100%|██████████| 19114/19114 [00:00<00:00, 226775.57it/s]
100%|██████████| 19114/19114 [00:00<00:00, 20530.56it/s]


In [16]:
# Add the user-side experience terms to jieba's dictionary, then segment the
# cleaned job keywords with it.
import jieba

for word in exp2index:
    jieba.add_word(word)


def _segment_keywords(text):
    """Segment `text` with jieba and return the unique tokens joined by '|'.

    Tokens of length 1 and numeric tokens are dropped. Duplicates are removed
    in first-seen order instead of via set(): set iteration order for strings
    changes between interpreter runs, and the result is later fed to
    random.sample, so the old form was not reproducible.
    """
    tokens = (tok for tok in jieba.cut(text) if len(tok) > 1 and not is_number(tok))
    return '|'.join(dict.fromkeys(tokens))


df_jd_gpt['keyword_jieba'] = df_jd_gpt['keyword_proc'].progress_apply(_segment_keywords)

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache


Loading model cost 1.215 seconds.
Prefix dict has been built successfully.
100%|██████████| 19114/19114 [00:02<00:00, 7179.68it/s]


In [17]:
def _is_usable_job_keyword(token):
    """Return True for a keyword that may become a job experience node."""
    if token == '' or token in noisy:
        return False
    if token.endswith(('K', '-', '%')):
        return False
    return not is_number(token)


job_exp = set()
df_jd['KG_require_experience'] = '-1'
# NOTE(review): rows of df_jd_gpt are written into df_jd by index label, which
# assumes both frames are row-aligned — confirm.
for row_label, gpt_row in tqdm(df_jd_gpt.iterrows()):
    candidates = [tok for tok in gpt_row['keyword_jieba'].split('|') if _is_usable_job_keyword(tok)]

    # Keep at most 30 keywords per job.
    sampled = random.sample(candidates, min(30, len(candidates)))
    node_ids = []
    for token in sampled:
        # Only keywords with a word vector become nodes.
        if token not in wv_from_text.key_to_index:
            continue
        job_exp.add(token)
        node_ids.append(exp2index.setdefault(token, len(exp2index)))
    df_jd.loc[row_label, 'KG_require_experience'] = ','.join(map(str, node_ids))

print(f"exp in users+df_jd: {len(exp2index)}")
word_both = user_exp & job_exp
print(f"exp exist in both user and job: {len(word_both)}")

19114it [00:03, 5552.76it/s]

exp in users+df_jd: 10716
exp exist in both user and job: 3148





In [18]:
# Number of experience nodes attached to each user.
# Users without any experience hold '' in KG_cur_experience, and
# ''.split(',') == [''], so counting the split parts reported 1 for them.
# Count only real ids instead.
df_user['cur_experience_num'] = df_user['KG_cur_experience'].apply(
    lambda x: sum(1 for node_id in x.split(',') if node_id not in ('', '-1'))
)

In [19]:
# Summary statistics of the per-user experience count.
df_user['cur_experience_num'].describe()

count    4500.000000
mean       22.411556
std         9.160443
min         1.000000
25%        15.000000
50%        29.000000
75%        30.000000
max        30.000000
Name: cur_experience_num, dtype: float64

### Export KG
Entity:
- user
- city
- industry
- salary
- jdtype
- degree
- year
- experience
- job
- company

Relation:
- user, cur_city, city
- user, desire_city, city
- user, cur_industry, industry
- user, desire_industry, industry
- user, cur_salary, salary
- user, desire_salary, salary
- user, cur_jdtype, jdtype
- user, desire_jdtype, jdtype
- user, cur_degree, degree
- user, cur_year, year
- user, cur_experience, experience
- user, desire_job, job
  
(reverse)
 
- job, job_user, user
- job, job_city, city 
- job, job_jdtype, jdtype
- job, require_degree, degree
- job, job_salary, salary
- job, require_year, year
- job, require_experience, experience  

(reverse)

- degree, higher_degree, degree
- degree, lower_degree, degree
- salary, higher_salary, salary
- salary, lower_salary, salary
- year, higher_year, year
- year, lower_year, year
- experience, similar_experience, experience

In [33]:
# Manifest of every node and edge file that makes up the graph.
# File names follow `entity_<ntype>.csv` and `r_<src>_<relation>_<dst>.csv`.
node_type_list = ['user', 'job', 'city', 'industry', 'salary', 'jdtype', 'degree', 'year', 'experience']

edge_type_list = [
    # user -> attribute
    ('user', 'cur_city', 'city'),
    ('user', 'desire_city', 'city'),
    ('user', 'cur_industry', 'industry'),
    ('user', 'desire_industry', 'industry'),
    ('user', 'cur_min_salary', 'salary'),
    ('user', 'cur_max_salary', 'salary'),
    ('user', 'desire_min_salary', 'salary'),
    ('user', 'desire_max_salary', 'salary'),
    ('user', 'cur_jdtype', 'jdtype'),
    ('user', 'desire_jdtype', 'jdtype'),
    ('user', 'cur_degree', 'degree'),
    ('user', 'cur_year', 'year'),
    ('user', 'cur_experience', 'experience'),
    ('user', 'desire_job', 'job'),
    # attribute -> user / job (reverse edges)
    ('city', 'job_city_rev', 'job'),
    # The relation export also writes r_city_cur_city_rev_user.csv; this entry
    # was missing although every other reverse relation is listed.
    ('city', 'cur_city_rev', 'user'),
    ('city', 'desire_city_rev', 'user'),
    ('industry', 'cur_industry_rev', 'user'),
    ('industry', 'desire_industry_rev', 'user'),
    ('salary', 'cur_min_salary_rev', 'user'),
    ('salary', 'cur_max_salary_rev', 'user'),
    ('salary', 'desire_min_salary_rev', 'user'),
    ('salary', 'desire_max_salary_rev', 'user'),
    ('jdtype', 'cur_jdtype_rev', 'user'),
    ('jdtype', 'desire_jdtype_rev', 'user'),
    ('degree', 'cur_degree_rev', 'user'),
    ('year', 'cur_year_rev', 'user'),
    ('experience', 'cur_experience_rev', 'user'),
    # job -> user / attribute
    ('job', 'job_user', 'user'),
    ('job', 'job_city', 'city'),
    ('job', 'min_salary', 'salary'),
    ('job', 'max_salary', 'salary'),
    ('job', 'job_jdtype', 'jdtype'),
    ('job', 'require_degree', 'degree'),
    ('job', 'require_year', 'year'),
    ('job', 'require_experience', 'experience'),
    # attribute -> job (reverse edges)
    ('salary', 'min_salary_rev', 'job'),
    ('salary', 'max_salary_rev', 'job'),
    ('jdtype', 'job_jdtype_rev', 'job'),
    ('degree', 'require_degree_rev', 'job'),
    ('year', 'require_year_rev', 'job'),
    ('experience', 'require_experience_rev', 'job'),
    # attribute <-> attribute
    ('degree', 'higher_degree', 'degree'),
    ('degree', 'lower_degree', 'degree'),
    ('salary', 'higher_salary', 'salary'),
    ('salary', 'lower_salary', 'salary'),
    ('year', 'higher_year', 'year'),
    ('year', 'lower_year', 'year'),
    ('experience', 'similar_experience', 'experience'),
]

meta_data = {
    'dataset_name': 'zhaopin',
    'node_data': [
        {'file_name': f'entity_{node_type}.csv', 'ntype': node_type}
        for node_type in node_type_list
    ],
    'edge_data': [
        {'file_name': f'r_{src}_{relation}_{dst}.csv', 'etype': [src, relation, dst]}
        for src, relation, dst in edge_type_list
    ],
}

def write_meta_data(meta_data, file_path):
    """Write the graph manifest as a small YAML document.

    Parameters
    ----------
    meta_data : dict
        Must contain 'dataset_name' (str), 'edge_data' (list of dicts with
        'file_name' and a 3-item 'etype' list) and 'node_data' (list of dicts
        with 'file_name' and 'ntype').
    file_path : str
        Destination path; an existing file is overwritten.

    The file is opened in a `with` block so it is flushed and closed even if
    writing fails (the handle was previously never closed).
    """
    with open(file_path, 'w', encoding='utf-8') as f:
        # dataset_name
        f.write(f"dataset_name: {meta_data['dataset_name']}\n")
        # edge_data
        f.write("edge_data:\n")
        for edge in meta_data['edge_data']:
            f.write(f"- file_name: {edge['file_name']}\n")
            f.write(f"  etype: [{','.join(edge['etype'])}]\n")
        # node_data
        f.write("node_data:\n")
        for node in meta_data['node_data']:
            f.write(f"- file_name: {node['file_name']}\n")
            f.write(f"  ntype: {node['ntype']}\n")
        
# Write the manifest to <kg_out_dir>/meta.yaml.
write_meta_data(meta_data, os.path.join(kg_out_dir, 'meta.yaml'))

In [34]:
# Export entities: for every node type write the readable names to kg_text and
# the integer node ids to kg.
def _write_node_ids(node_type, node_ids):
    """Write the `node_id` column of one node type into the kg directory."""
    pd.DataFrame({'node_id': node_ids}).to_csv(
        os.path.join(kg_out_dir, f'entity_{node_type}.csv'), index=False
    )


# user — node id is the row position in df_user
entity_user = df_user[['user_id']]
entity_user.to_csv(os.path.join(kg_text_out_dir, 'entity_user.csv'), index=False)
_write_node_ids('user', range(len(entity_user)))

# job — node id is the row position in df_jd
entity_job = df_jd[['jd_title']]
entity_job.to_csv(os.path.join(kg_text_out_dir, 'entity_job.csv'), index=False)
_write_node_ids('job', range(len(entity_job)))

# attribute nodes: (node type, text column name, name -> node id mapping)
_attribute_tables = [
    ('city', 'city_id', city2index),
    ('industry', 'industry_name', industry2index),
    ('salary', 'salary', salary2index),
    ('jdtype', 'type_name', type2index),
    ('degree', 'degree_name', degree2index),
    ('year', 'year', year2index),
    ('experience', 'experience', exp2index),
]
for node_type, text_column, mapping in _attribute_tables:
    pd.DataFrame({text_column: list(mapping.keys())}).to_csv(
        os.path.join(kg_text_out_dir, f'entity_{node_type}.csv'), index=False
    )
    _write_node_ids(node_type, list(mapping.values()))

In [35]:
# Export relations: every `KG_<relation>` column becomes one forward edge file
# and one reversed edge file.
def _relation_edges(column):
    """Expand one KG_ column into a (src_id, dst_id) edge table.

    The row label is the source id; each cell holds one id or a
    comma-separated list of ids. Blank entries and the '-1' placeholder are
    skipped. Edges are collected in a list and turned into a DataFrame once;
    growing a DataFrame with pd.concat per edge is quadratic.
    """
    edges = []
    for src_id, cell in column.astype(str).items():
        for dst_id in cell.split(','):
            if dst_id == '' or dst_id == '-1':
                continue
            edges.append((src_id, int(dst_id)))
    return pd.DataFrame(edges, columns=['src_id', 'dst_id'])


for dataframe in [df_user, df_jd]:
    # Source node type comes from the first column name minus its last three
    # characters, with 'jd' mapped to 'job'.
    src_type = dataframe.columns[0][:-3]
    src_type = 'job' if src_type == 'jd' else src_type
    for column_name in dataframe.columns:
        if not column_name.startswith('KG_'):
            continue
        relation_type = column_name[3:]
        # The destination node type is the last '_'-separated word of the relation.
        dst_type = relation_type.split('_')[-1]
        print(f"processing relation for {src_type} - {relation_type} - {dst_type}")

        df_out = _relation_edges(dataframe[column_name])
        df_out.to_csv(os.path.join(kg_out_dir, f'r_{src_type}_{relation_type}_{dst_type}.csv'), index=False)

        # reverse: swap the two columns, keeping the src_id,dst_id header order
        df_rev = df_out.rename(columns={'src_id': 'dst_id', 'dst_id': 'src_id'})[['src_id', 'dst_id']]
        df_rev.to_csv(os.path.join(kg_out_dir, f'r_{dst_type}_{relation_type}_rev_{src_type}.csv'), index=False)

processing relation for user - cur_city - city
processing relation for user - desire_city - city
processing relation for user - cur_industry - industry
processing relation for user - desire_industry - industry
processing relation for user - cur_min_salary - salary
processing relation for user - cur_max_salary - salary
processing relation for user - desire_min_salary - salary
processing relation for user - desire_max_salary - salary
processing relation for user - cur_jdtype - jdtype
processing relation for user - desire_jdtype - jdtype
processing relation for user - cur_degree - degree
processing relation for user - cur_year - year
processing relation for user - cur_experience - experience
processing relation for job - job_city - city
processing relation for job - min_salary - salary
processing relation for job - max_salary - salary
processing relation for job - job_jdtype - jdtype
processing relation for job - require_degree - degree
processing relation for job - require_year - year
pr

In [36]:
# degree_higher_degree_degree: one edge from each degree to every degree that
# comes later in `degree_grade`.
degree_grade = ['初中', '中专', '高中', '中技', '大专', '本科', '硕士', '博士']
_grade_ids = [degree2index[name] for name in degree_grade]
_degree_pairs = [
    (_grade_ids[low], _grade_ids[high])
    for low in range(len(_grade_ids))
    for high in range(low + 1, len(_grade_ids))
]
df_out = pd.DataFrame(_degree_pairs, columns=['src_id', 'dst_id'])
df_out.to_csv(os.path.join(kg_out_dir, 'r_degree_higher_degree_degree.csv'), index=False)

# degree_lower_degree_degree: the same pairs with source and destination swapped.
df_out = df_out.rename(columns={'src_id': 'dst_id', 'dst_id': 'src_id'})[['src_id', 'dst_id']]
df_out.to_csv(os.path.join(kg_out_dir, 'r_degree_lower_degree_degree.csv'), index=False)

In [37]:
# salary_higher_salary_salary: one edge from each salary node to every node
# with a larger amount.
salarys = sorted(salary2index.keys())
_salary_pairs = [
    (salary2index[salarys[low]], salary2index[salarys[high]])
    for low in range(len(salarys))
    for high in range(low + 1, len(salarys))
]
df_out = pd.DataFrame(_salary_pairs, columns=['src_id', 'dst_id'])
df_out.to_csv(os.path.join(kg_out_dir, 'r_salary_higher_salary_salary.csv'), index=False)

# salary_lower_salary_salary: the same pairs with source and destination swapped.
df_out = df_out.rename(columns={'src_id': 'dst_id', 'dst_id': 'src_id'})[['src_id', 'dst_id']]
df_out.to_csv(os.path.join(kg_out_dir, 'r_salary_lower_salary_salary.csv'), index=False)

In [38]:
# year_higher_year_year: one edge from each year node to every node with a
# larger year value.
years = sorted(year2index.keys())
_year_pairs = [
    (year2index[years[low]], year2index[years[high]])
    for low in range(len(years))
    for high in range(low + 1, len(years))
]
df_out = pd.DataFrame(_year_pairs, columns=['src_id', 'dst_id'])
df_out.to_csv(os.path.join(kg_out_dir, 'r_year_higher_year_year.csv'), index=False)

# year_lower_year_year: the same pairs with source and destination swapped.
df_out = df_out.rename(columns={'src_id': 'dst_id', 'dst_id': 'src_id'})[['src_id', 'dst_id']]
df_out.to_csv(os.path.join(kg_out_dir, 'r_year_lower_year_year.csv'), index=False)

In [None]:
# user_desire_job_job: training interactions between users and jobs.
# Row 0 of train_links is read as user ids and row 1 as job ids.
train_links = np.load('train_links.npy')
train_users, train_items = train_links[0], train_links[1]

# Node id of a user/job is its row position in df_user/df_jd.
_user_ids = df_user['user_id'].tolist()
_job_ids = df_jd['jd_no'].tolist()
user_id2index = dict(zip(_user_ids, range(len(_user_ids))))
item_id2index = dict(zip(_job_ids, range(len(_job_ids))))

train_user_index = np.array([user_id2index[user_id] for user_id in train_users])
train_item_index = np.array([item_id2index[item_id] for item_id in train_items])

_user_nodes = train_user_index.tolist()
_job_nodes = train_item_index.tolist()

df_out = pd.DataFrame({'src_id': _user_nodes, 'dst_id': _job_nodes})
df_out.to_csv(os.path.join(kg_out_dir, 'r_user_desire_job_job.csv'), index=False)

# Reverse direction: job -> user.
df_out = pd.DataFrame({'src_id': _job_nodes, 'dst_id': _user_nodes})
df_out.to_csv(os.path.join(kg_out_dir, 'r_job_job_user_user.csv'), index=False)

In [40]:
# exp_sim_exp: connect experience terms whose word vectors are similar.
from sklearn.metrics.pairwise import cosine_similarity

# Two terms are linked when their cosine similarity exceeds this value.
SIMILARITY_THRESHOLD = 0.75

# Row i holds the vector of the term whose node id is i. The width comes from
# the loaded model instead of a hard-coded 200.
exp_vectors = np.zeros((len(exp2index), wv_from_text.vector_size))
for exp, i in exp2index.items():
    exp_vectors[i] = wv_from_text[exp]

# Pairwise cosine similarity between all terms.
similarity_matrix = cosine_similarity(exp_vectors)

# Keep pairs (i, j) with i < j above the threshold. np.triu(..., k=1) masks the
# diagonal and lower triangle; np.nonzero returns the hits in row-major order,
# i.e. the same order as a nested `for i ... for j > i` loop, without the
# Python-level double loop and per-edge pd.concat.
src_ids, dst_ids = np.nonzero(np.triu(similarity_matrix > SIMILARITY_THRESHOLD, k=1))

# Forward pairs followed by the mirrored pairs, so the relation is symmetric.
df_out = pd.DataFrame({
    'src_id': np.concatenate([src_ids, dst_ids]),
    'dst_id': np.concatenate([dst_ids, src_ids]),
})
df_out.to_csv(os.path.join(kg_out_dir, 'r_experience_similar_experience_experience.csv'), index=False)

In [41]:
# Attach the readable term to both ends of every similarity edge.
# Node ids were assigned as len(exp2index) at insertion time, so the position
# of a key in insertion order equals its node id. The lookup list is built
# once; rebuilding list(exp2index.keys()) for every row made this quadratic.
_index2exp = list(exp2index.keys())
df_out['src_word'] = df_out['src_id'].apply(lambda x: _index2exp[x])
df_out['dst_word'] = df_out['dst_id'].apply(lambda x: _index2exp[x])