In [26]:
from genderize import Genderize
import pandas as pd
import numpy as np
import os
from tqdm import tqdm

# genderize.io API key. A key exported as GENDERIZE_API_KEY takes precedence so
# the credential can live outside the notebook.
# FIXME(security): the literal fallback is a secret committed to source; rotate
# it and delete the fallback once the environment variable is set everywhere.
apikey = os.environ.get('GENDERIZE_API_KEY', 'f4143e73e7517e9f7ea0f9e8dc62ca52')

# Directory that receives the cached genderize results and the per-keyword
# cleaned CSV files.
exportfolder = r"/group/geog_pyloo/XY/01_gender/_data/_clean"

In [27]:
def get_firstname(x):
    xlen = len(x.split(", "))
    if xlen == 1:
        xlen2 = len(x.split(" "))
        if xlen2 == 1:
            return x.strip()
        else:
            return x.split(" ")[1].strip()
    elif xlen >= 2:
        return x.split(", ")[1].strip()
    else:
        return None

def get_surname(x):
    """Return the text before the first comma of ``x``, stripped.

    A name that contains no comma is returned whole (stripped).
    """
    before_comma, _sep, _rest = x.partition(",")
    return before_comma.strip()

def get_corresponding(row):
    """Return the text before the first "/" of ``row['ORCIDs']``.

    Falls back to ``row['author_first']`` when the ORCIDs value is not a
    string (for example a missing cell, which pandas represents as NaN).

    Args:
        row: Mapping-like record (dict or pandas Series) providing the keys
            ``'ORCIDs'`` and ``'author_first'``.

    Returns:
        The leading "/"-delimited segment of the ORCIDs string, or the
        ``author_first`` value when ORCIDs is not a string.
    """
    orcids = row['ORCIDs']
    # Check for "is a string" instead of ``type(...) == float``: the exact-type
    # comparison misses numpy.float64 NaN and None, which then fail on .split().
    if not isinstance(orcids, str):
        return row['author_first']
    return orcids.split("/")[0]

def clean(DF):
    """Derive first-author and corresponding-author name columns.

    Rows without 'Author Full Names' or 'Reprint Addresses' are dropped and the
    index is renumbered. The corresponding author is taken from the text that
    precedes '(corresponding author)' in 'Reprint Addresses' (first
    ';'-separated entry), located in the 'Authors' list, and mapped to the
    entry at the same position in 'Author Full Names'. When that lookup fails
    the first author is used instead, and the number of such rows is printed.

    Args:
        DF: DataFrame with string columns 'Author Full Names',
            'Reprint Addresses' and 'Authors' (column names look like a
            bibliographic database export; TODO confirm the source).

    Returns:
        A new DataFrame with these columns added: author_corresponding,
        author_corresponding_n1, author_first, author_first_len, author_ls,
        author_corresponding_full, author_first_first_name,
        author_first_surname, author_corresponding_first_name and
        author_corresponding_surname.
    """
    DF = DF.dropna(subset=['Author Full Names', 'Reprint Addresses']).reset_index(drop=True)

    DF['author_corresponding'] = DF['Reprint Addresses'].apply(
        lambda x: x.split('(corresponding author)')[0].strip())
    DF['author_corresponding_n1'] = DF['author_corresponding'].apply(
        lambda x: x.split(';')[0].strip())
    DF['author_first'] = DF['Author Full Names'].apply(lambda x: x.split(";")[0])
    DF['author_first_len'] = DF['author_first'].apply(lambda x: len(x.split(",")))
    DF['author_ls'] = DF['Authors'].apply(lambda x: x.split("; "))

    def _full_name_of_corresponding(row):
        """Return the full name aligned with the corresponding author, or None."""
        full_names = row['Author Full Names'].split("; ")
        try:
            position = row['author_ls'].index(row['author_corresponding_n1'])
            return full_names[position]
        except (ValueError, IndexError):
            # ValueError: the abbreviated name is not in the 'Authors' list.
            # IndexError: 'Authors' has more entries than 'Author Full Names'.
            return None

    DF['author_corresponding_full'] = DF.apply(_full_name_of_corresponding, axis=1)
    DF['author_first_first_name'] = DF['author_first'].apply(get_firstname)
    DF['author_first_surname'] = DF['author_first'].apply(get_surname)

    n_missing = int(DF['author_corresponding_full'].isnull().sum())
    print(n_missing, " out of ", DF.shape[0], " are missing corresponding author info")

    # Unresolved corresponding authors default to the first author.
    DF['author_corresponding_full'] = DF['author_corresponding_full'].fillna(DF['author_first'])
    DF['author_corresponding_first_name'] = DF['author_corresponding_full'].apply(get_firstname)
    DF['author_corresponding_surname'] = DF['author_corresponding_full'].apply(get_surname)
    return DF

def get_remain_names(DF, result_path=None):
    """List the first names that still need a gender prediction.

    Collects the unique first names of first authors followed by those
    corresponding-author first names that are not already among them, then
    removes every name already present in the cached result file.

    Args:
        DF: DataFrame with the columns 'author_first_first_name' and
            'author_corresponding_first_name'.
        result_path: CSV file holding earlier predictions (needs a 'name'
            column). Defaults to "genderize_result.csv" inside the
            module-level ``exportfolder``.

    Returns:
        A tuple ``(toadd, finished)``: ``toadd`` is the list of names absent
        from the cache, in first-seen order; ``finished`` is the cached
        DataFrame, or an empty frame with the columns name / gender /
        probability when the file does not exist.
    """
    namels1 = DF['author_first_first_name'].unique()
    print("Number of unique first name for first authors: ", len(namels1))
    namels2 = DF['author_corresponding_first_name'].unique()
    print("Number of unique first name for corresponding authors: ", len(namels2))

    # Set lookups instead of scanning the array once per candidate name.
    first_author_names = set(namels1)
    namels2_no1 = [x for x in namels2 if x not in first_author_names]
    print("Number of unique first name for corresponding authors not in first authors: ", len(namels2_no1))

    namels_sent = namels1.tolist() + namels2_no1

    if result_path is None:
        result_path = os.path.join(exportfolder, "genderize_result.csv")
    if os.path.exists(result_path):
        finished = pd.read_csv(result_path)
    else:
        finished = pd.DataFrame(columns=['name', 'gender', 'probability'])

    # Build the cached-name set once; the list comprehension keeps the order.
    finished_names = set(finished['name'])
    toadd = [x for x in namels_sent if x not in finished_names]
    print("Number of names to detect: ", len(toadd))
    return toadd, finished


def load_and_clean_data(folder, keyword):
    """Read every Excel export in ``folder`` and return the cleaned table.

    Files ending in '.xls' or '.xlsx' are read in sorted filename order (so the
    row order does not depend on the directory listing), stacked into a single
    DataFrame with a fresh index, and passed through ``clean``.

    Args:
        folder: Directory containing the Excel files.
        keyword: Label of the data set; only used in the error message.

    Returns:
        The DataFrame returned by ``clean`` for the concatenated records.

    Raises:
        ValueError: If ``folder`` contains no '.xls'/'.xlsx' file.
    """
    excel_files = sorted(
        f for f in os.listdir(folder) if f.endswith(('.xls', '.xlsx'))
    )
    if not excel_files:
        # pd.concat([]) would also raise ValueError, but without saying where.
        raise ValueError(
            f"No .xls/.xlsx files found in {folder} (keyword: {keyword})")

    frames = [pd.read_excel(os.path.join(folder, f)) for f in excel_files]
    DF = pd.concat(frames).reset_index(drop=True)
    return clean(DF)

# Keyword -> folder that is scanned for that keyword's Excel files.
# The 'ai' set ('/group/geog_pyloo/XY/01_gender/_data/ai') is currently disabled.
_DATA_ROOT = '/group/geog_pyloo/XY/01_gender/_data'
folder_dict = {
    theme: f'{_DATA_ROOT}/Smart {theme}'
    for theme in (
        'living',
        'mobility',
        'people',
        'economy',
        'governance',
        'environment',
        'city',
    )
}

# Client used to query gender predictions, authenticated with `apikey`.
genderize = Genderize(
    user_agent='GenderizeDocs/0.0',
    api_key=apikey,
    timeout=5.0)

# Number of names handed to genderize.get() per progress-bar step.
CHUNK_SIZE = 1000
# Predictions with a lower probability are discarded (gender stays empty).
MIN_PROBABILITY = 0.75
# Cache of all predictions fetched so far; shared by every keyword.
genderize_result_path = os.path.join(exportfolder, "genderize_result.csv")

# Columns written to each per-keyword output file. The intermediate author
# name columns produced by clean() are not exported.
final_columns = [
    'Publication Date',
    'Journal Abbreviation',
    'Publication Year',
    'Research Areas',
    'Authors',
    'Abstract',
    'Article Title',
    'Author Keywords',
    'gender_first',
    'probability_first',
    'gender_corresponding',
    'probability_corresponding'
]


def _attach_gender(frame, predictions, name_column, role):
    """Left-join predictions onto `frame` as gender_<role> / probability_<role>."""
    return (
        frame
        .merge(predictions[['name', 'gender', 'probability']],
               left_on=name_column, right_on='name', how='left')
        .rename(columns={'gender': f'gender_{role}',
                         'probability': f'probability_{role}'})
        .drop(columns=['name'])
    )


for keyword, folder in folder_dict.items():
    DF = load_and_clean_data(folder, keyword)
    toadd, finished = get_remain_names(DF)

    print(f"Processing {len(toadd)} names for keyword: {keyword}")
    # Stepping by CHUNK_SIZE yields no trailing empty chunk, so nothing is
    # requested when `toadd` is empty or an exact multiple of the chunk size.
    result_ls = []
    for start in tqdm(range(0, len(toadd), CHUNK_SIZE)):
        result_ls.extend(genderize.get(toadd[start:start + CHUNK_SIZE]))

    if result_ls:
        result = pd.DataFrame(result_ls)
        result_update = pd.concat([finished, result]).reset_index(drop=True)
    else:
        result_update = finished
    result_update.to_csv(genderize_result_path, index=False)

    # Reload from disk so cached and freshly fetched rows are parsed alike.
    result_update = pd.read_csv(genderize_result_path)
    result_update = result_update[result_update['probability'] >= MIN_PROBABILITY]
    # One row per name, otherwise the left merges below would multiply papers.
    result_update = result_update.drop_duplicates(subset='name')

    DF = _attach_gender(DF, result_update, 'author_first_first_name', 'first')
    DF = _attach_gender(DF, result_update, 'author_corresponding_first_name', 'corresponding')

    DF = DF[final_columns]
    DF.to_csv(os.path.join(exportfolder, f"cleandata_{keyword}.csv"), index=False)

2  out of  15990  are missing corresponding author info
Number of unique first name for first authors:  9436
Number of unique first name for corresponding authors:  9241
Number of unique first name for corresponding authors not in first authors:  2352
Number of names to detect:  10013
Processing 10013 names for keyword: living


100%|██████████| 11/11 [06:06<00:00, 33.30s/it]


2  out of  12156  are missing corresponding author info
Number of unique first name for first authors:  6899
Number of unique first name for corresponding authors:  6644
Number of unique first name for corresponding authors not in first authors:  1670
Number of names to detect:  5206
Processing 5206 names for keyword: mobility


100%|██████████| 6/6 [03:13<00:00, 32.18s/it]


2  out of  21850  are missing corresponding author info
Number of unique first name for first authors:  12274
Number of unique first name for corresponding authors:  11954
Number of unique first name for corresponding authors not in first authors:  2450
Number of names to detect:  7373
Processing 7373 names for keyword: people


100%|██████████| 8/8 [04:27<00:00, 33.46s/it]


3  out of  13801  are missing corresponding author info
Number of unique first name for first authors:  7483
Number of unique first name for corresponding authors:  7395
Number of unique first name for corresponding authors not in first authors:  1718
Number of names to detect:  4595
Processing 4595 names for keyword: economy


100%|██████████| 5/5 [02:54<00:00, 34.90s/it]


1  out of  6255  are missing corresponding author info
Number of unique first name for first authors:  3985
Number of unique first name for corresponding authors:  3969
Number of unique first name for corresponding authors not in first authors:  790
Number of names to detect:  0
Processing 0 names for keyword: governance


100%|██████████| 1/1 [00:00<00:00, 6853.44it/s]


2  out of  23246  are missing corresponding author info
Number of unique first name for first authors:  12687
Number of unique first name for corresponding authors:  11994
Number of unique first name for corresponding authors not in first authors:  4306
Number of names to detect:  8148
Processing 8148 names for keyword: environment


100%|██████████| 9/9 [05:00<00:00, 33.43s/it]


2  out of  17172  are missing corresponding author info
Number of unique first name for first authors:  9192
Number of unique first name for corresponding authors:  8714
Number of unique first name for corresponding authors not in first authors:  2374
Number of names to detect:  3036
Processing 3036 names for keyword: city


100%|██████████| 4/4 [01:50<00:00, 27.57s/it]


In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt

# Folder holding the per-keyword cleaned CSV files.
exportfolder = r"/group/geog_pyloo/XY/01_gender/_data/_clean"

# Keyword -> name of its cleaned CSV inside `exportfolder`.
folder_dict = {
    topic: f'cleandata_{topic}.csv'
    for topic in (
        'ai',
        'living',
        'mobility',
        'people',
        'economy',
        'governance',
        'environment',
        'city',
    )
}

# Label for every analysable (first author, corresponding author) gender
# pairing. Keys are frozensets, so the order of the two roles does not matter
# and a pairing of two equal values collapses to a one-element set.
_GENDER_PAIR_LABELS = {
    frozenset({'female'}): 'Female only',
    frozenset({'male'}): 'Male only',
    frozenset({'unknown'}): 'Both Unknown',
    frozenset({'female', 'male'}): 'Female & Male',
    frozenset({'female', 'unknown'}): 'Female & Unknown',
    frozenset({'male', 'unknown'}): 'Male & Unknown',
}


def gender_combination(row):
    """Classify a paper by the genders of its first and corresponding author.

    Args:
        row: Mapping-like record (dict or pandas Series) with the keys
            'gender_first' and 'gender_corresponding'; recognised values are
            'female', 'male' and 'unknown'.

    Returns:
        One of 'Female only', 'Male only', 'Female & Male',
        'Female & Unknown', 'Male & Unknown', 'Both Unknown', or
        'Non-Analyzable' when either value is not a recognised gender
        (including the case of one 'unknown' paired with an unexpected
        value, which previously fell through and returned None).
    """
    pair = frozenset((row['gender_first'], row['gender_corresponding']))
    return _GENDER_PAIR_LABELS.get(pair, 'Non-Analyzable')

# Coarser gender label derived from the gender combination.
def gender_label_2(combination):
    """Collapse a gender combination into one of three labels.

    Returns 'withFemale' for 'Female only', 'Female & Unknown' and
    'Female & Male'; 'Male only' for 'Male only'; and 'other' for every
    remaining value (including 'Both Unknown').
    """
    if combination == 'Male only':
        return 'Male only'
    with_female = ('Female only', 'Female & Unknown', 'Female & Male')
    return 'withFemale' if combination in with_female else 'other'

# Per-keyword distribution of gender combinations, keyed by keyword.
results = {}

# For each keyword file: label every paper, record the share of each gender
# combination, and write the labelled table back.
for keyword, filename in folder_dict.items():
    filepath = os.path.join(exportfolder, filename)
    df = pd.read_csv(filepath)
    # Missing genders become the explicit category 'unknown'.
    for gender_col in ('gender_first', 'gender_corresponding'):
        df[gender_col] = df[gender_col].fillna('unknown')
    df['gender_combination'] = df.apply(gender_combination, axis=1)
    df['gender_label_2'] = df['gender_combination'].apply(gender_label_2)
    result = df['gender_combination'].value_counts(normalize=True)
    results[keyword] = result
    # Overwrite the file that was just read with the labelled version.
    df.to_csv(filepath, index=False)

# One column per keyword; combinations absent from a keyword count as 0.
result_df = pd.DataFrame(results).fillna(0)



In [2]:
# Count the papers in each gender combination for every keyword.
for keyword, filename in folder_dict.items():
    filepath = os.path.join(exportfolder, filename)
    df = pd.read_csv(filepath)
    # Missing genders become the explicit category 'unknown'.
    for gender_col in ('gender_first', 'gender_corresponding'):
        df[gender_col] = df[gender_col].fillna('unknown')
    df['gender_combination'] = df.apply(gender_combination, axis=1)
    result = df['gender_combination'].value_counts()
    results[keyword] = result
    # Overwrite the file that was just read.
    df.to_csv(filepath, index=False)

# Row order used for presentation; reindex keeps only the combinations listed.
order = [
    'Male only',
    'Male & Unknown',
    'Both Unknown',
    'Female only',
    'Female & Unknown',
    'Female & Male',
]

# One column per keyword; combinations absent from a keyword count as 0.
result_df = pd.DataFrame(results).fillna(0).reindex(order)

# Show the table.
result_df

Unnamed: 0_level_0,ai,living,mobility,people,economy,governance,environment,city
gender_combination,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Male only,70802,8133,6706,10887,6321,3110,11608,9064
Male & Unknown,10950,1080,870,1403,880,262,2428,1478
Both Unknown,14573,1979,1345,3038,1628,556,2329,1750
Female only,20441,3353,2130,4727,3584,1770,3687,3014
Female & Unknown,3082,333,216,431,385,123,705,421
Female & Male,11033,1112,889,1364,1003,434,2489,1445
