In [2]:
import pandas as pd
import numpy as np
import glob
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.spatial.distance import euclidean
from fastdtw import fastdtw
from concurrent.futures import ProcessPoolExecutor
import warnings
import tqdm
warnings.filterwarnings('ignore')

In [2]:
# Setting the default figure parameters to de-frame the upper x-axis, left y-axis, and right y-axis
plt.rcParams['axes.spines.top'] = False
plt.rcParams['axes.spines.left'] = False
plt.rcParams['axes.spines.right'] = False
plt.rcParams['ytick.left'] = False
plt.rcParams['axes.grid'] = True  # Ensure grid lines are enabled
plt.rcParams['axes.grid.which'] = 'major'  # Apply grid lines only to major ticks
plt.rcParams['axes.grid.axis'] = 'y'  # Only show horizontal grid lines
plt.rcParams['grid.linestyle'] = '--'  # Set grid line style to dashed
plt.rcParams['grid.alpha'] = 0.3  # Set grid line transparency

In [3]:
control_df_sel=pd.read_csv('../data/control_df_sel_gender_0609.csv')
journals_merged_sel=pd.read_csv('../data/journals_merged_sel_gender_0609.csv')

In [4]:
journal_filtered_sel=journals_merged_sel[journals_merged_sel.journal_id=='jour.1346339'] # this is Science
journal_filtered_sel = journal_filtered_sel[journal_filtered_sel['first_publish_year'] % 5 == 0]

In [5]:
journal_filtered_sel.columns

Index(['author_id', 'year', 'publication_count', 'corresponding_count',
       'first_year', 'career_age', 'total_citations', 'funding_count',
       'average_funding', 'cum_publication_count', 'cum_corresponding_count',
       'cum_total_citations', 'cum_funding_count', 'cum_average_funding',
       'career_stage', 'author_first_name', 'author_last_name', 'journal_id',
       'journal_title', 'first_publish_year', 'to_year', 'id', 'Gender',
       'researcher_id', 'affiliation_country_codes'],
      dtype='object')

In [6]:
journal_filtered_sel.head()

Unnamed: 0,author_id,year,publication_count,corresponding_count,first_year,career_age,total_citations,funding_count,average_funding,cum_publication_count,...,author_first_name,author_last_name,journal_id,journal_title,first_publish_year,to_year,id,Gender,researcher_id,affiliation_country_codes
1755,ur.010000144736.26,1992,1.0,0.0,1992,0,0.0,0.0,0.0,1.0,...,Norman,Loeb,jour.1346339,Science,2005,-13,ur.010000144736.26,0,ur.010000144736.26,CA
1756,ur.010000144736.26,1993,0.0,0.0,1992,1,0.0,0.0,0.0,1.0,...,Norman,Loeb,jour.1346339,Science,2005,-12,ur.010000144736.26,0,ur.010000144736.26,CA
1757,ur.010000144736.26,1994,0.0,0.0,1992,2,0.0,0.0,0.0,1.0,...,Norman,Loeb,jour.1346339,Science,2005,-11,ur.010000144736.26,0,ur.010000144736.26,CA
1758,ur.010000144736.26,1995,0.0,0.0,1992,3,0.0,0.0,0.0,1.0,...,Norman,Loeb,jour.1346339,Science,2005,-10,ur.010000144736.26,0,ur.010000144736.26,CA
1759,ur.010000144736.26,1996,1.0,0.0,1992,4,1.0,0.0,0.0,2.0,...,Norman,Loeb,jour.1346339,Science,2005,-9,ur.010000144736.26,0,ur.010000144736.26,CA


In [7]:
nature=journals_merged_sel[journals_merged_sel.journal_id=='jour.1346339']

In [8]:
nature.shape

(1029670, 25)

In [9]:
def filter_authors_by_publication_window(df, year_col, first_publish_year_col):
    df['to_year'] = df[year_col] - df[first_publish_year_col]
    valid_authors = df.groupby(['author_id','first_publish_year']).filter(lambda x: (x['to_year'].min() <= -5) and (x['to_year'].max() >= 10))['author_id'].unique()
    filtered_df = df[df['author_id'].isin(valid_authors)]
    final_filtered_df = filtered_df[(filtered_df['to_year'] >= -5) & (filtered_df['to_year'] <= 10)]    
    return final_filtered_df
journal_filtered_sel = filter_authors_by_publication_window(journal_filtered_sel,'year','first_publish_year')

In [10]:
valid_author_ids = control_df_sel[control_df_sel['career_age'] >= 15]['author_id'].unique()
control_df_sel = control_df_sel[control_df_sel['author_id'].isin(valid_author_ids)]

control_df_sel=control_df_sel[['author_id','cum_publication_count', 'cum_corresponding_count',
       'cum_total_citations', 'cum_funding_count','career_stage','first_year','year','Gender','affiliation_country_codes']]

In [11]:
def calculate_distance(series1, series2):
    distance, _ = fastdtw(series1, series2, dist=euclidean)
    return distance
def find_matches(treated_group, control_df, num_matches=3):
    matched_control = pd.DataFrame()
    i=0
    for author_id, group in treated_group.groupby(['author_id','first_publish_year']):
        author_distances = {}
        potential_matches = control_df[
            (control_df['first_year'] == group.iloc[0]['first_year'])&(control_df['Gender'] == group.iloc[0]['Gender'])&(
                control_df['affiliation_country_codes'] == group.iloc[0]['affiliation_country_codes'])
        ]
        potential_authors = potential_matches[potential_matches.year==group.iloc[0][
            'first_publish_year']]['author_id']
        potential_matches = potential_matches[potential_matches.author_id.isin(potential_authors)]
        potential_matches['first_publish_year'] = group.iloc[0]['first_publish_year']
        #potential_matches['to_year']=potential_matches['year']-potential_matches['first_publish_year']
        potential_matches=filter_authors_by_publication_window(potential_matches,'year','first_publish_year')
        control_dis = potential_matches[potential_matches['to_year'] < 0]

        for author_id_potential, group_potential in control_dis.groupby('author_id'):
            distance_cum_prod = calculate_distance(group['cum_publication_count'].to_numpy(), group_potential['cum_publication_count'].to_numpy())
            distance_cum_cor = calculate_distance(group['cum_corresponding_count'].to_numpy(), group_potential['cum_corresponding_count'].to_numpy())
            distance_cum_cit = calculate_distance(group['cum_total_citations'].to_numpy(), group_potential['cum_total_citations'].to_numpy())
            distance_cum_fund = calculate_distance(group['cum_funding_count'].to_numpy(), group_potential['cum_funding_count'].to_numpy())
            #distance_avg_fund = calculate_distance(group['F'], group_potential['F'])
            distance = distance_cum_prod + distance_cum_cor + distance_cum_cit + distance_cum_fund
            author_distances[author_id_potential] = distance
        closest_matches_ids = sorted(author_distances, key=author_distances.get)[:num_matches]
        closest_matches = potential_matches[potential_matches['author_id'].isin(closest_matches_ids)]
        matched_control = pd.concat([matched_control, closest_matches])
        if i%100 == 0:
            print('Successfully done ' + str(i) +' times')
        i = i + 1
    return matched_control.reset_index(drop=True)

In [12]:
len(set(journal_filtered_sel.author_id))

2407

In [13]:
journal_filtered_sel=filter_authors_by_publication_window(journal_filtered_sel,'year','first_publish_year')
treated_group_journal_sel = journal_filtered_sel[journal_filtered_sel.to_year<0]
matched_control_journal_sel = find_matches(treated_group_journal_sel, control_df_sel)

Successfully done 0 times
Successfully done 100 times
Successfully done 200 times
Successfully done 300 times
Successfully done 400 times
Successfully done 500 times
Successfully done 600 times
Successfully done 700 times
Successfully done 800 times
Successfully done 900 times
Successfully done 1000 times
Successfully done 1100 times
Successfully done 1200 times
Successfully done 1300 times
Successfully done 1400 times
Successfully done 1500 times
Successfully done 1600 times
Successfully done 1700 times
Successfully done 1800 times
Successfully done 1900 times
Successfully done 2000 times
Successfully done 2100 times
Successfully done 2200 times
Successfully done 2300 times
Successfully done 2400 times
Successfully done 2500 times


In [14]:
treated_group_journal_sel.to_csv('../data/treated_group_science_decade_gender_5y.csv',header=True,index=False)
matched_control_journal_sel.to_csv('../data/matched_control_science_decade_gender_5y.csv',header=True,index=False)

In [15]:
def filter_authors_by_publication_window_updated(df, year_col, first_publish_year_col):
    df['to_year'] = df[year_col] - df[first_publish_year_col]
    df['author_year_id'] = df['author_id'].astype(str) + "_" + df[first_publish_year_col].astype(str)
    valid_author_year_ids = df.groupby('author_year_id').filter(lambda x: (x['to_year'].min() <= -5) and (x['to_year'].max() >= 10))['author_year_id'].unique()
    # Filter the original DataFrame to only include rows with valid author_year_id combinations
    filtered_df = df[df['author_year_id'].isin(valid_author_year_ids)]
    final_filtered_df = filtered_df[(filtered_df['to_year'] >= -5) & (filtered_df['to_year'] <= 10)]
    final_filtered_df = final_filtered_df.drop(columns=['author_year_id'])
    return final_filtered_df

In [16]:
def filter_authors(df):
    # Function to apply the filtering logic to each group
    def filter_group(group):
        # Sort group by 'year' to ensure chronological processing
        group = group.sort_values(by='year')
        # Check if all 'to_year' values have 'is_journal' == 0
        if group['is_journal'].eq(0).all():
            return group
        # Check for mixed condition: 'to_year' < 0 with 'is_journal' == 0, then 'is_journal' becomes 1 and stays 1
        before_zero = group[group['to_year'] < 0]['is_journal'].eq(0).all()
        after_zero = group[group['to_year'] >= 0]['is_journal'].eq(1).all()
        if before_zero and after_zero:
            # Ensure it does not revert back to 0
            if (group['is_journal'].diff() == -1).any():
                return None  # Exclude groups that revert back to 0
            return group
        return None
    
    # Apply the filter function to each group and drop None results
    filtered_groups = [filter_group(group) for _, group in df.groupby('author_id')]
    filtered_groups = [group for group in filtered_groups if group is not None]
    
    # Concatenate the filtered groups back into a DataFrame
    if filtered_groups:
        return pd.concat(filtered_groups)
    else:
        return pd.DataFrame(columns=df.columns)

In [17]:
treated = journal_filtered_sel[journal_filtered_sel.author_id.isin(treated_group_journal_sel.author_id)]
treated = treated[['author_id', 'cum_publication_count', 'cum_corresponding_count',
       'cum_total_citations', 'cum_funding_count', 'career_stage',
       'first_year', 'year', 'first_publish_year', 'to_year','Gender','affiliation_country_codes']]
treated['is_journal']=0
treated.loc[treated.to_year>=0,'is_journal']=1
matched=matched_control_journal_sel
matched['is_journal']=0
matched_pair=pd.concat([treated,matched],axis=0)
matched_pair=matched_pair.sort_values('is_journal',ascending=False)
matched_pair=filter_authors_by_publication_window_updated(matched_pair,'year','first_publish_year')
matched_pair=matched_pair.sort_values('Gender').drop_duplicates(subset=['author_id','first_publish_year','to_year'])
matched_pair=matched_pair.sort_values(['author_id','year'],ascending=True)
matched_pair['researcher_id']=matched_pair['author_id'].apply(lambda x: int(x.split('.')[1]))

In [18]:
filtered_matched_pair = filter_authors(matched_pair)
matched_pair_grouped=filter_authors_by_publication_window_updated(filtered_matched_pair,'year','first_publish_year').drop_duplicates()
matched_pair_grouped.to_csv('../data/matches/matched_pair_science_grouped_gender_4k.csv',header=True,index=False)

In [19]:
matched_pair_pnas_grouped_gender_newid=matched_pair_grouped.copy()
matched_pair_pnas_grouped_gender_newid['researcher_id']=matched_pair_pnas_grouped_gender_newid['researcher_id'].astype(str)+matched_pair_pnas_grouped_gender_newid['first_publish_year'].astype(str)
matched_pair_pnas_grouped_gender_newid.to_csv('../data/matches/matched_pair_science_grouped_gender_newid.csv',header=True,index=False)


In [35]:
corresponding=pd.read_csv('../data/physics_top_authors_gp_0610_corresponding.csv')
corresponding_sub=corresponding[corresponding.journal_id=='jour.1346339']

In [36]:
corresponding_sub.head()

Unnamed: 0,author_id,author_first_name,author_last_name,journal_id,journal_title,is_corresponding,first_publish_year
46,ur.0777763726.43,Kenneth P.,Murphy,jour.1346339,Science,False,1990
56,ur.0777761054.61,Arnold J. M.,Driessen,jour.1346339,Science,False,2007
98,ur.0777750503.75,Anne,Thoul,jour.1346339,Science,False,2008
99,ur.0777750503.75,A.,Thoul,jour.1346339,Science,False,2003
152,ur.07777344557.98,J F,Chang,jour.1346339,Science,False,2023


In [37]:
corresponding_sub.value_counts('is_corresponding')

is_corresponding
False    57459
True        10
dtype: int64

In [21]:
matched_pair_pnas_grouped_gender_newid.shape

(155680, 14)

In [26]:
matched_pair_pnas_grouped_gender_newid.head()

Unnamed: 0,author_id,cum_publication_count,cum_corresponding_count,cum_total_citations,cum_funding_count,career_stage,first_year,year,first_publish_year,to_year,Gender,affiliation_country_codes,is_journal,researcher_id
29616,ur.0100000521.21,11.0,0.0,257.0,0.0,late-career,1965,1995,2000,-5,0,GB,0,1000005212000
29617,ur.0100000521.21,11.0,0.0,267.0,0.0,late-career,1965,1996,2000,-4,0,GB,0,1000005212000
29618,ur.0100000521.21,11.0,0.0,278.0,0.0,late-career,1965,1997,2000,-3,0,GB,0,1000005212000
29619,ur.0100000521.21,11.0,0.0,287.0,0.0,late-career,1965,1998,2000,-2,0,GB,0,1000005212000
29620,ur.0100000521.21,11.0,0.0,295.0,0.0,late-career,1965,1999,2000,-1,0,GB,0,1000005212000


In [29]:
a=pd.merge(matched_pair_pnas_grouped_gender_newid,corresponding_sub[[
    'author_id','first_publish_year','is_corresponding']],how='left',on=['author_id','first_publish_year'])
a=a.fillna(False)

In [30]:
a.value_counts('is_corresponding')

is_corresponding
False    156512
dtype: int64

## Consider first/last authors

In [8]:
import pandas as pd
import glob

path = '../data/authorship/all_first/*.csv'
all_files = glob.glob(path)
column_names = ['id', 'doi', 'title', 'first_name', 'last_name', 'researcher_id', 
                'is_corresponding', 'is_first', 'journal_id', 'journal_title', 'year']
df_combined_first = pd.concat(
    (pd.read_csv(file, header=None, names=column_names) for file in all_files),
    ignore_index=True
)

In [11]:
df_combined_first.to_csv('../data/df_combined_first.csv',header=True,index=False)

In [9]:
df_combined_first.head()

Unnamed: 0,id,doi,title,first_name,last_name,researcher_id,is_corresponding,is_first,journal_id,journal_title,year
0,pub.1150576243,10.1080/00150193.2022.2079463,AuNPs labeled antisera improve the visualizati...,Shu,Jiang,ur.016367506242.97,False,1,jour.1045310,Ferroelectrics,2022.0
1,pub.1157121999,10.25130/tjps.v23i3.510,Study the Spectral Properties of the Molecule ...,,Galsan T. Kamal,,False,1,jour.1301866,Tikrit Journal of Pure Science,2018.0
2,pub.1164371987,10.1103/physrevd.108.050001,Editorial: To Review Is to Be,Randall D.,Kamien,ur.01342711413.09,False,1,jour.1320496,Physical Review D,2023.0
3,pub.1156092598,10.48550/arxiv.2303.04982,The Robustness Verification of Linear Sound Qu...,Su,Bonan,,False,1,jour.1371339,arXiv,2023.0
4,pub.1142418768,10.4236/ojer.2021.104009,Magnetic Control of the Earthquakes,L. Buchachenko,A.,,False,1,jour.1151234,Open Journal of Earthquake Research,2021.0


In [13]:
path = '../data/authorship/all_last/*.csv'
all_files = glob.glob(path)
column_names = ['id', 'doi', 'title', 'first_name', 'last_name', 'researcher_id', 
                'is_corresponding', 'is_last', 'journal_id', 'journal_title', 'year']
df_combined_last = pd.concat(
    (pd.read_csv(file, header=None, names=column_names) for file in all_files),
    ignore_index=True
)

In [14]:
df_combined_last.to_csv('../data/df_combined_last.csv',header=True,index=False)

## first/last authors

In [3]:
matched_pair_pnas_grouped_gender_newid=pd.read_csv('../data/matches/matched_pair_science_grouped_gender_newid.csv')

In [4]:
first=pd.read_csv('../data/df_combined_first.csv')
last=pd.read_csv('../data/df_combined_last.csv')
first_sub=first[first.journal_id=='jour.1346339']
first_sub.value_counts('is_first')
first_sub.columns=['id', 'doi', 'title', 'first_name', 'last_name', 'author_id',
       'is_corresponding', 'is_first', 'journal_id', 'journal_title',
                   'first_publish_year']

last_sub=last[last.journal_id=='jour.1346339']
last_sub.value_counts('is_last')
last_sub.columns=['id', 'doi', 'title', 'first_name', 'last_name', 'author_id',
       'is_corresponding', 'is_last', 'journal_id', 'journal_title',
                   'first_publish_year']

In [5]:
a=pd.merge(matched_pair_pnas_grouped_gender_newid,first_sub[[
    'author_id','first_publish_year','is_first']].drop_duplicates(
    subset=['author_id','first_publish_year']),
           how='left',on=['author_id','first_publish_year'])
a=a.fillna(0)
a=a.sort_values('is_first',ascending=False).drop_duplicates(['researcher_id','to_year'])
a.to_csv('../data/matches/matched_pair_science_grouped_gender_first.csv',header=True,index=False)
a.value_counts('is_first')

is_first
0.0    147104
1.0      8576
dtype: int64

In [6]:
b=pd.merge(matched_pair_pnas_grouped_gender_newid,last_sub[[
    'author_id','first_publish_year','is_last']].drop_duplicates(
    subset=['author_id','first_publish_year']),
           how='left',on=['author_id','first_publish_year'])
b=b.fillna(0)
b=b.sort_values('is_last',ascending=False).drop_duplicates(['researcher_id','to_year'])
b.to_csv('../data/matches/matched_pair_science_grouped_gender_last.csv',header=True,index=False)
b.value_counts('is_last')

is_last
0.0    149664
1.0      6016
dtype: int64