In [1]:
import pandas as pd
import os
# Working directory of the kernel, as an absolute path.
absolutepath = os.path.abspath(os.getcwd())
# One and two levels above the working directory.
fileDirectory = os.path.dirname(absolutepath)
parentDirectory = os.path.dirname(fileDirectory)
# Processed and raw data folders sit under the directory one level above
# the working directory.
dataprocDirectory, datarawDirectory = (
    os.path.join(fileDirectory, data_subdir)
    for data_subdir in ('data/proc', 'data/raw')
)

# load spells
# load treatment timing

In [2]:
# Author pool table, read from the processed-data directory.
author_pool_path = os.path.join(dataprocDirectory, 'author_pool.csv')
spells = pd.read_csv(author_pool_path)

In [3]:
# Treatment-timing tables at the 2-digit and 4-digit subfield level.
# The header read from each CSV is overwritten positionally with the column
# names used in the rest of this notebook, so the number of names must match
# the number of columns in the file.
timing_2digit_path = os.path.join(dataprocDirectory,
                                  'afid_subfield_timing_2digit.csv')
afid_subfield_timing_2digit = pd.read_csv(timing_2digit_path)
afid_subfield_timing_2digit.columns = [
    'afid',
    'subfield_most_frequent_two_digit',
    'year',
    'never_returned',
    'JTTP_authid',
    'reneger_treatment_2digit',
]

timing_4digit_path = os.path.join(dataprocDirectory,
                                  'afid_subfield_timing_4digit.csv')
afid_subfield_timing_4digit = pd.read_csv(timing_4digit_path)
afid_subfield_timing_4digit.columns = [
    'afid',
    'subfield_most_frequent',
    'subfield_most_frequent_two_digit',
    'year',
    'never_returned',
    'JTTP_authid',
    'reneger_treatment_4digit',
]

# merge spell and treatment timing
# each row is (author X potential treatment timing)
# 1. define presence
# presence if jttp year <  max year 
# 2. define treatment
# treated if jttp year <  max year & jttp year >=  min year + 1 

In [4]:
# Pair every spell row with every candidate timing row that shares its
# (afid, 2-digit subfield).
# A left join is used instead of an outer join: timing rows with no matching
# spell have a missing authid and were dropped immediately afterwards anyway,
# so the surviving rows are the same, but the unmatched rows are never
# materialised and authid is not upcast to float by merge-introduced NaNs.
spellsXtiming_2digit = spells.merge(afid_subfield_timing_2digit,
                                    how = 'left',
                                    on = ['afid','subfield_most_frequent_two_digit'])
# Kept as a guard: discards any row whose authid is missing in `spells` itself.
spellsXtiming_2digit = spellsXtiming_2digit.dropna(subset = ['authid'])

In [5]:
# Compare the timing year of each row with the spell's first and last year.
# Each comparison is written out explicitly (not as a negation of another)
# so that rows with missing values evaluate to False, and stay unflagged.
timing_year = spellsXtiming_2digit.year
first_year_plus_one = spellsXtiming_2digit.afid_min_year + 1
before_last_year = timing_year < spellsXtiming_2digit.afid_max_year
from_second_year_on = timing_year >= first_year_plus_one
up_to_first_year = timing_year < first_year_plus_one

# presence
# 1 : present (timing year is before the spell's last year); NaN otherwise
spellsXtiming_2digit.loc[before_last_year, 'present_2digit'] = 1

# treatment
# 1 : treated (timing year is at least first year + 1 and before last year)
spellsXtiming_2digit.loc[from_second_year_on & before_last_year,
                         'treatment_2digit'] = 1
# 2 : join after treatment (timing year is below first year + 1 and
#     before last year)
spellsXtiming_2digit.loc[up_to_first_year & before_last_year,
                         'treatment_2digit'] = 2
print(spellsXtiming_2digit.shape)

(724099, 15)


In [6]:
# Row counts per treatment code (rows with no code are excluded by value_counts).
spellsXtiming_2digit['treatment_2digit'].value_counts()

treatment_2digit
1.0    266333
2.0    170717
Name: count, dtype: int64

## merge in 4-digit treatment status

In [7]:
# Flag every row of the 4-digit timing table, then left-join the flag onto the
# 2-digit panel: four_digit_match is 1 where the same (afid, year, subfield)
# combination exists in the 4-digit table and NaN where it does not.
# NOTE(review): re-running this cell merges the flag a second time and
# produces suffixed duplicate columns - run it once per kernel session.
afid_subfield_timing_4digit['four_digit_match'] = 1
four_digit_keys = ['afid','year',
                   'subfield_most_frequent',
                   'subfield_most_frequent_two_digit']
digit_vars = ['afid','year','subfield_most_frequent',
              'subfield_most_frequent_two_digit','four_digit_match']
four_digit_flags = afid_subfield_timing_4digit[digit_vars]
spellsXtiming_2digit = pd.merge(spellsXtiming_2digit,
                                four_digit_flags,
                                how = 'left',
                                on = four_digit_keys)

# for those treated
# among all possible treatment dates
# the earliest one is the assigned

In [8]:
# Author-level key and the columns carried into the author-level summary.
groupbyvar = ['authid','subfield_most_frequent','subfield_most_frequent_two_digit']
groupbyvar_outcome = groupbyvar+['year','reneger_treatment_2digit','JTTP_authid','four_digit_match']

# Rows where the author counts as treated (treatment_2digit == 1).
spellsXtiming_2digit_treated = spellsXtiming_2digit[spellsXtiming_2digit.treatment_2digit==1].copy()

# Earliest candidate treatment year within each author group.
# `year` is selected before transforming so that only that column is
# aggregated; transforming the whole frame computed a minimum for every
# column (slow, and it can fail on columns whose values cannot be ordered)
# only to keep `year`.
spellsXtiming_2digit_treated['min_treat_year'] = (
    spellsXtiming_2digit_treated.groupby(groupbyvar)['year'].transform('min')
)

# Keep only the rows at that earliest year and collapse them to one row per
# author: min of year and reneger_treatment_2digit, sum of JTTP_authid,
# max of four_digit_match.
is_earliest_year = (spellsXtiming_2digit_treated.year ==
                    spellsXtiming_2digit_treated.min_treat_year)
spellsXtiming_2digit_authid = (
    spellsXtiming_2digit_treated.loc[is_earliest_year, groupbyvar_outcome]
    .groupby(groupbyvar)
    .agg({'year':'min','reneger_treatment_2digit':'min',
          'JTTP_authid':'sum','four_digit_match':'max'})
    .reset_index()
)

In [9]:
# Positional rename of the author-level summary: the three key columns keep
# their names, `year` becomes treat_year and `JTTP_authid` becomes jttp_count.
author_summary_columns = [
    'authid',
    'subfield_most_frequent',
    'subfield_most_frequent_two_digit',
    'treat_year',
    'reneger_treatment_2digit',
    'jttp_count',
    'four_digit_match',
]
spellsXtiming_2digit_authid.columns = author_summary_columns

# for those present but not treated - 
# ie those who join after jttp arrival
# label them

In [10]:
# Distinct authors with at least one row coded 2 (join after treatment).
is_joiner_row = spellsXtiming_2digit['treatment_2digit'] == 2
all_joiners = set(spellsXtiming_2digit.loc[is_joiner_row, 'authid'])

In [11]:
# Number of distinct authid values with at least one treatment_2digit == 2 row.
len(all_joiners)

159321

In [12]:
# Distinct authors with at least one row coded 1 (treated).
is_treated_row = spellsXtiming_2digit['treatment_2digit'] == 1
all_treated = set(spellsXtiming_2digit.loc[is_treated_row, 'authid'])

In [13]:
# Number of distinct authid values with at least one treatment_2digit == 1 row.
len(all_treated)

253824

In [14]:
# Authors coded 2 in some row and never coded 1 in any row.
only_joiner = all_joiners.difference(all_treated)

In [15]:
# Number of authors that appear in all_joiners but not in all_treated.
len(only_joiner)

142558

In [16]:
# Non-null count per column of the author-level treatment summary.
spellsXtiming_2digit_authid.count()

authid                              253824
subfield_most_frequent              253824
subfield_most_frequent_two_digit    253824
treat_year                          253824
reneger_treatment_2digit            253824
jttp_count                          253824
four_digit_match                     72739
dtype: int64

# combine to get authid level dataset

In [17]:
# Distinct (author, 4-digit subfield, 2-digit subfield) combinations in spells.
author_key_columns = ['authid',
                      'subfield_most_frequent',
                      'subfield_most_frequent_two_digit']
authid_level_dataset = spells.loc[:, author_key_columns].drop_duplicates()

In [18]:
# Attach the author-level treatment summary; authors without a treated row
# keep NaN in the summary columns.
author_merge_keys = ['authid',
                     'subfield_most_frequent',
                     'subfield_most_frequent_two_digit']
authid_level_dataset = pd.merge(authid_level_dataset,
                                spellsXtiming_2digit_authid,
                                how = 'left',
                                on = author_merge_keys)

In [19]:
# 1 when the author is in the only_joiner set, 0 otherwise.
authid_level_dataset['only_joiner'] = authid_level_dataset.authid.apply(
    lambda author_id: int(author_id in only_joiner)
)

In [20]:
# (rows, columns) of the author-level dataset at this point.
authid_level_dataset.shape

(594481, 8)

# authid x cohort dataset
# each cohort includes
# 1. those treated that year
# 2. those who are active the year before at one of the affiliations

In [21]:
# Per author: earliest afid_min_year and latest afid_max_year across all of
# the author's rows in spells, reduced to one row per distinct combination.
spells_by_author = spells.groupby('authid')
spells_authid_first_last_year = (
    spells[['authid']]
    .assign(
        spell_min_year=spells_by_author['afid_min_year'].transform('min'),
        spell_max_year=spells_by_author['afid_max_year'].transform('max'),
    )
    .drop_duplicates()
)

In [22]:
# Attach each author's first and last spell year to the author-level dataset.
authid_level_dataset = pd.merge(authid_level_dataset,
                                spells_authid_first_last_year,
                                how = 'left',
                                on = 'authid')

In [24]:
# Inspect the author-level dataset after attaching spell_min_year / spell_max_year.
authid_level_dataset

Unnamed: 0,authid,subfield_most_frequent,subfield_most_frequent_two_digit,treat_year,reneger_treatment_2digit,jttp_count,four_digit_match,only_joiner,spell_min_year,spell_max_year
0,6503846240,_2210,_22,,,,,1,2008.0,2016.0
1,6503847505,_2506,_25,2013.0,0.0,1.0,,0,2003.0,2020.0
2,6503862195,_2406,_24,,,,,0,2016.0,2017.0
3,6503867928,_1303,_13,2011.0,0.0,1.0,,0,1997.0,2016.0
4,6503871635,_2208,_22,2011.0,0.0,1.0,,0,2004.0,2020.0
...,...,...,...,...,...,...,...,...,...,...
594476,57219369439,_3104,_31,2011.0,0.0,3.0,1.0,0,1993.0,2020.0
594477,57219373749,_2705,_27,2013.0,0.0,1.0,,0,1996.0,2020.0
594478,57219373848,_2208,_22,,,,,0,1992.0,2020.0
594479,57219397075,_2504,_25,2011.0,0.0,1.0,,0,1997.0,2020.0


In [25]:
# Build one author-level frame per cohort year and collect them for stacking.
# NOTE(review): 2014 is absent from the cohort years - confirm this is intentional.
df_by_cohort_list = []
for c in [2011,2012,2013,2015,2016,2017]:
    print(c)
    print(authid_level_dataset.shape)
    # cohort membership: spell_min_year + 1 <= c and spell_max_year > c
    starts_in_time = authid_level_dataset.spell_min_year + 1 <= c
    ends_later = authid_level_dataset.spell_max_year > c
    cohort_c = authid_level_dataset[starts_in_time & ends_later].copy()

    # treatment status within the cohort (0 is only the initial default):
    #   1 = treated in this cohort year
    #   2 = treated in a different year (already treated or not yet treated)
    #   3 = never treated (no treat_year)
    treat_year = cohort_c.treat_year
    status_rules = [
        (treat_year == c, 1),
        (treat_year.notna() & (treat_year != c), 2),
        (treat_year.isna(), 3),
    ]
    cohort_c['treatment_status'] = 0
    for rule_mask, status_code in status_rules:
        cohort_c.loc[rule_mask, 'treatment_status'] = status_code
    print(cohort_c.treatment_status.value_counts())

    cohort_c['cohort'] = c
    print(cohort_c.shape)
    df_by_cohort_list.append(cohort_c)

2011
(594481, 10)
treatment_status
2    131815
3     97418
1     56100
Name: count, dtype: int64
(285333, 12)
2012
(594481, 10)
treatment_status
2    144263
3    116438
1     62835
Name: count, dtype: int64
(323536, 12)
2013
(594481, 10)
treatment_status
2    181335
3    145014
1     35165
Name: count, dtype: int64
(361514, 12)
2015
(594481, 10)
treatment_status
3    207652
2    184281
1     44093
Name: count, dtype: int64
(436026, 12)
2016
(594481, 10)
treatment_status
3    234346
2    210136
1     16537
Name: count, dtype: int64
(461019, 12)
2017
(594481, 10)
treatment_status
3    256186
2    183365
1     39094
Name: count, dtype: int64
(478645, 12)


In [26]:
# Stack the per-cohort frames row-wise; each frame keeps its own index labels.
df_by_cohort = pd.concat(df_by_cohort_list, axis=0, ignore_index=False)

# drop the jttp authors themselves

In [29]:
import ast

# NOTE(review): `jttp_df` is not created in any cell visible in this notebook
# and the execution counts skip from 26 to 29 - it appears to come from a
# removed cell. Load it explicitly before this cell so that
# Restart & Run All works.

# Each `author_id` entry is expected to be the string form of a Python
# literal holding author ids; parse every entry and pool the ids.
jttp_authid_list = []
for l in jttp_df.author_id.values:
    # Non-string entries (e.g. missing values) cannot be parsed; report and skip.
    if not isinstance(l, str):
        print(f"Skipping non-string value: {l}")
        continue
    try:
        parsed_ids = ast.literal_eval(l)
    except (ValueError, SyntaxError) as e:
        # Malformed literal: report it and move on to the next entry.
        print(f"Error evaluating: {l}. Error: {e}")
    else:
        jttp_authid_list.extend(parsed_ids)

# 1 when the row's authid is one of the pooled jttp author ids, 0 otherwise.
jttp_authid_set = set(jttp_authid_list)
df_by_cohort['is_jttp'] = df_by_cohort.authid.map(
    lambda author_id: int(author_id in jttp_authid_set)
)


Skipping non-string value: nan
Skipping non-string value: nan
Skipping non-string value: nan
Skipping non-string value: nan
Skipping non-string value: nan
Skipping non-string value: nan
Skipping non-string value: nan
Skipping non-string value: nan
Skipping non-string value: nan
Skipping non-string value: nan
Skipping non-string value: nan
Skipping non-string value: nan
Skipping non-string value: nan
Skipping non-string value: nan
Skipping non-string value: nan
Skipping non-string value: nan
Skipping non-string value: nan
Skipping non-string value: nan
Skipping non-string value: nan
Skipping non-string value: nan
Skipping non-string value: nan
Skipping non-string value: nan
Skipping non-string value: nan
Skipping non-string value: nan
Skipping non-string value: nan


In [30]:
# Keep only the rows that are not flagged as jttp authors.
not_jttp = df_by_cohort['is_jttp'] != 1
df_by_cohort = df_by_cohort.loc[not_jttp]

In [31]:
# Write the stacked author x cohort table to the processed-data directory.
stacked_output_path = os.path.join(dataprocDirectory,
                                   'author_treatment_assignment_stacked.csv')
df_by_cohort.to_csv(stacked_output_path, index=False)