In [1]:
import json
import json
import random
from datetime import datetime
import os
import math

def read_json_file(file_path):
    """Read a UTF-8 encoded JSON file and return the decoded Python object.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file content.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return json.load(handle)



def clean_data_jobName(data):
    """Remove internship postings from a list of job records.

    A record is dropped when its ``jobName`` contains "实习", or contains the
    English word "intern" / "interns" / "internship(s)" in any letter case.
    The English match must not be directly surrounded by other Latin letters,
    so titles such as "International Sales" or "Internal Auditor" are kept
    (a plain substring test would discard them).

    Args:
        data: Iterable of dicts, each having a string ``jobName`` entry.

    Returns:
        A new list with the non-internship records, in their original order.

    Raises:
        KeyError: If a record has no ``jobName`` key.
    """
    import re  # local import so this block does not depend on the file's import cell

    # Lookarounds on ASCII letters are used instead of \b because CJK characters
    # count as word characters, which would hide "intern" in e.g. "暑期intern".
    intern_word = re.compile(r'(?<![a-z])intern(?:s|ships?)?(?![a-z])', re.IGNORECASE)

    return [
        item for item in data
        if "实习" not in item["jobName"] and not intern_word.search(item["jobName"])
    ]
    





def transform_salary_description(salary_desc):
    """Rewrite an "X千及以下" / "X万及以下" salary cap as an explicit range.

    The lower bound is taken as 60% of the cap (rounded to 5 decimals), so
    "5千及以下" becomes "3.0-5千". Any text that follows the marker, such as
    "/年" or "·13薪", is kept so later steps can still recognise annual or
    N-month salaries. Descriptions without either marker are returned unchanged.

    Args:
        salary_desc: Salary description string.

    Returns:
        The rewritten range string, or ``salary_desc`` itself if no marker is found.

    Raises:
        ValueError: If the text before the marker is not a number.
    """
    # "千" is checked before "万", matching the original precedence.
    for unit in ('千', '万'):
        value, marker, suffix = salary_desc.partition(unit + '及以下')
        if marker:
            min_value = round(float(value) * 0.6, 5)
            # Re-attach the suffix instead of dropping it.
            return f"{min_value}-{value}{unit}{suffix}"

    # Neither marker present: nothing to convert.
    return salary_desc


def parse_and_average_salary(salary):
    """Convert a salary string such as "4-7千" or "8千-1万" to an average in yuan.

    If the string contains "万", every figure is read in units of 万 (a figure
    suffixed with "千" is scaled by 0.1); otherwise every figure is read in
    units of 千. A two-part "min-max" string yields the mean of the two bounds;
    any other shape uses the first part only.

    The figures are parsed with ``float`` rather than ``eval``, so arbitrary
    text coming from the scraped data is never executed.

    Args:
        salary: Salary string without period or bonus suffixes.

    Returns:
        The average salary in yuan, rounded to 2 decimals.

    Raises:
        ValueError: If a figure cannot be parsed as a number.
    """
    in_wan = "万" in salary
    yuan_per_unit = 10000 if in_wan else 1000

    def _amount(token):
        # Numeric value of one bound, expressed in the base unit chosen above.
        token = token.strip()
        if token.endswith('万'):
            return float(token[:-1])
        if token.endswith('千'):
            number = float(token[:-1])
            return number * 0.1 if in_wan else number
        return float(token)

    bounds = salary.split('-')
    if len(bounds) == 2:
        average_salary = ((_amount(bounds[0]) + _amount(bounds[1])) * yuan_per_unit) / 2
    else:
        average_salary = _amount(bounds[0]) * yuan_per_unit
    return round(average_salary, 2)


def calculate_monthly_salary(salary_string):
    """Estimate a monthly salary in yuan from a salary description.

    The description is first passed through ``transform_salary_description``.
    Then, in this order:

    * contains "年": "/年" is removed, the average is parsed and divided by 12;
    * contains "薪": the text is split on "·" into a range and an "N薪" part,
      and the result is ``average * N / 12``;
    * contains "-": the parsed average is returned as is;
    * otherwise the description is printed and ``None`` is returned.

    Args:
        salary_string: Raw salary description.

    Returns:
        The monthly salary as a number, or ``None`` if the format is not handled.
    """
    normalized = transform_salary_description(salary_string)

    # Annual figure: spread over 12 months.
    if '年' in normalized:
        yearly_average = parse_and_average_salary(normalized.replace('/年', ''))
        return round(yearly_average / 12, 2)

    # "range·N薪": N monthly payments per year.
    if '薪' in normalized:
        pay_range, month_part = normalized.split('·')
        per_payment = parse_and_average_salary(pay_range)
        payments_per_year = float(month_part.replace('薪', ''))
        return round(per_payment * payments_per_year / 12, 2)

    # Plain range: already monthly.
    if '-' in normalized:
        return parse_and_average_salary(normalized)

    # Unhandled format: show it for manual inspection.
    print(normalized)
    return None



def random_split_json_array(file_path, num_files=6, seed=None):
    """Shuffle a JSON array and write it out as ``num_files`` balanced parts.

    The parts are written next to the source file as ``random_part_1.json`` to
    ``random_part_<num_files>.json`` (existing files with those names are
    overwritten). Part sizes differ by at most one item; a part is empty only
    when the array has fewer items than ``num_files``.

    Args:
        file_path: Path of a JSON file whose top-level value is an array.
        num_files: Number of part files to write (at least 1).
        seed: Optional seed for a reproducible shuffle. When ``None`` the
            module-level ``random`` state is used.

    Raises:
        ValueError: If ``num_files`` is smaller than 1.
    """
    if num_files < 1:
        raise ValueError("num_files must be at least 1")

    # Read the source array.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Shuffle in place; a private generator keeps a seeded run from touching
    # the global random state.
    if seed is None:
        random.shuffle(data)
    else:
        random.Random(seed).shuffle(data)

    # The first `remainder` parts get one extra item so the sizes stay balanced
    # (ceil-sized chunks would leave the last parts short or empty).
    base_size, remainder = divmod(len(data), num_files)

    # Parts are saved in the same directory as the source file.
    original_dir = os.path.dirname(file_path)

    start_index = 0
    for i in range(num_files):
        end_index = start_index + base_size + (1 if i < remainder else 0)
        split_data = data[start_index:end_index]
        start_index = end_index

        new_file_path = os.path.join(original_dir, f'random_part_{i+1}.json')
        with open(new_file_path, 'w', encoding='utf-8') as split_file:
            json.dump(split_data, split_file, indent=4, ensure_ascii=False)





In [2]:
# Replace the paths below with your actual file locations.
# NOTE(review): both are absolute, machine-specific Windows paths, so the
# notebook only runs as-is on the original author's machine.
input_file = r'C:\Users\Xue\Documents\GitHub\gender-discrimination-hiring\修正\data_part_all.json'  # input file path
output_file = r'C:\Users\Xue\OneDrive\0_Code\data\final_processed_jobs.json'  # output file path

In [3]:
# Load the job data from input_file into `data`.
data=read_json_file(input_file)

In [5]:
# Print postings whose title mentions fresh graduates / students ("应届", "在校")
# although their experience field is not '在校生/应届生'.
for i in range(1, 7):
    for job in data[f'random_part_{i}']:
        if '在校生/应届生' in job['workYearString']:
            continue
        if any(keyword in job['jobName'] for keyword in ("应届", "在校")):
            print(job)

{'jobId': '153565757', 'jobName': '座椅测试工程师（应届毕业生）', 'jobAreaString': '上海', 'provideSalaryString': '4-7千', 'issueDateString': '2024-03-12 23:32:06', 'workYearString': '无需经验', 'degreeString': '本科', 'fullCompanyName': '上海沿浦金属制品股份有限公司', 'companyTypeString': '已上市', 'companySizeString': '150-500人', 'jobHref': 'https://jobs.51job.com/shanghai-mhq/153565757.html?s=sou_sou_soulb&t=0_0&req=6e8579562be21f3f2169f3387c386276', 'termStr': '全职', 'jobWelfareCodeDataList': ['带薪年假', '五险一金', '包住宿', '绩效奖金', '全勤奖', '节日福利'], 'monthly_salary': 5500.0, 'competition': '45'}
{'jobId': '131220715', 'jobName': '部门助理（双休、可招应届生）', 'jobAreaString': '深圳', 'provideSalaryString': '5-8千', 'issueDateString': '2024-04-01 11:52:39', 'workYearString': '无需经验', 'degreeString': '本科', 'fullCompanyName': '昭星实业（深圳）有限公司', 'companyTypeString': '民营', 'companySizeString': '50-150人', 'jobHref': 'https://jobs.51job.com/shenzhen-gmq/131220715.html?s=sou_sou_soulb&t=0_0&req=a51d7c6820bef931b2a9ff141c8df2d8', 'termStr': '全职', 'jobWelfareCo

In [7]:
# Print postings whose title contains "可招", "接受" or "亦可"
# although their experience field is not '在校生/应届生'.
for i in range(1, 7):
    for job in data[f'random_part_{i}']:
        if '在校生/应届生' in job['workYearString']:
            continue
        if any(keyword in job['jobName'] for keyword in ("可招", "接受", "亦可")):
            print(job)

{'jobId': '131220715', 'jobName': '部门助理（双休、可招应届生）', 'jobAreaString': '深圳', 'provideSalaryString': '5-8千', 'issueDateString': '2024-04-01 11:52:39', 'workYearString': '无需经验', 'degreeString': '本科', 'fullCompanyName': '昭星实业（深圳）有限公司', 'companyTypeString': '民营', 'companySizeString': '50-150人', 'jobHref': 'https://jobs.51job.com/shenzhen-gmq/131220715.html?s=sou_sou_soulb&t=0_0&req=a51d7c6820bef931b2a9ff141c8df2d8', 'termStr': '全职', 'jobWelfareCodeDataList': ['五险一金', '员工旅游', '绩效奖金', '节日福利', '带薪年假', '周末双休', '全勤奖', '餐饮补贴'], 'monthly_salary': 6500.0, 'competition': '181'}
{'jobId': '40623618', 'jobName': 'PHP软件开发工程师（应届生亦可）', 'jobAreaString': '上海', 'provideSalaryString': '8千-1万', 'issueDateString': '2024-04-05 11:25:36', 'workYearString': '无需经验', 'degreeString': '本科', 'fullCompanyName': '亿泰和信息技术（上海）有限公司', 'companyTypeString': '外资（欧美）', 'companySizeString': '50-150人', 'jobHref': 'https://jobs.51job.com/shanghai-ptq/40623618.html?s=sou_sou_soulb&t=0_0&req=5020d73e3772c10c0e47b47c470917eb', 'termSt

In [11]:
# Run the job-title cleaning step.
# The inspection cells index `data` as a dict keyed 'random_part_1'..'random_part_6',
# while clean_data_jobName() iterates over job dicts. Flatten the parts first so
# that a fresh "Restart & Run All" does not fail here; a list is left untouched.
if isinstance(data, dict):
    data = [job for part_no in range(1, 7) for job in data[f'random_part_{part_no}']]
data = clean_data_jobName(data)

In [15]:

# Reference date. A posting is kept when it was issued no more than 30 days
# before this date (postings issued after it also pass, because the day
# difference is then negative).
start_date_str = '2024-4-7'
start_date = datetime.strptime(start_date_str, '%Y-%m-%d')

# Retained postings, grouped by full company name.
company_jobs = {}
for job in data:
    # Keep only the text before the first '·' (the field is overwritten in place).
    job["jobAreaString"] = job["jobAreaString"].split('·')[0]
    if job["jobAreaString"] not in ["北京","上海","广州","深圳"]:
        continue

    # Days between the posting's issue date and the reference date.
    issue_date = datetime.strptime(job["issueDateString"], "%Y-%m-%d %H:%M:%S")
    date_diff = (start_date - issue_date).days

    # -1 marks a salary description that could not be converted.
    # NOTE(review): -1 also satisfies the `< 40000` test below, so such
    # postings are retained - confirm this is intended.
    monthly_salary = calculate_monthly_salary(job["provideSalaryString"])
    job["monthly_salary"] = -1 if monthly_salary is None else monthly_salary

    if date_diff <= 30 and job["monthly_salary"] < 40000:
        company_jobs.setdefault(job["fullCompanyName"], []).append(job)

# Pick one posting at random for every company.
randomized_data = [random.choice(jobs) for jobs in company_jobs.values()]

# Save the selected postings to the output JSON file.
with open(output_file, 'w', encoding='utf-8') as new_file:
    json.dump(randomized_data, new_file, ensure_ascii=False, indent=4)













































































































































































































































































































































































In [13]:
# Randomly split the processed data again into 6 parts (the default num_files);
# the part files are written to the directory of output_file.
random_split_json_array(output_file)
