In [1]:
import pandas as pd
import os
# Resolve the project directories from the current working directory:
# `fileDirectory` is the parent of the working directory and
# `parentDirectory` is one level above that. The data folders are
# looked up under `fileDirectory`.
absolutepath = os.path.abspath(os.getcwd())
fileDirectory, parentDirectory = (
    os.path.dirname(absolutepath),
    os.path.dirname(os.path.dirname(absolutepath)),
)
dataprocDirectory, datarawDirectory = (
    os.path.join(fileDirectory, subdir) for subdir in ('data/proc', 'data/raw')
)

# The potential pool of scholars consists of
# all scholars who satisfy a), b), and c):
## a) in a two-digit subfield that is ever treated across all cohorts
## b) at a jttp-receiving school
## c) active at any time between 2010 and 2016

In [2]:
import psycopg2
import logging

In [5]:
def return_candiate_pool(host=None, dbname=None, user=None, password=None):
    """Query the database for author x affiliation spells and filter them.

    Each row of the result is one (authid, afid) spell, restricted by the
    query to rows where ``scopus_affiliation.jttp_school = 1`` and
    ``jttp_field = 1``. Only spells whose last year is after 2011 and whose
    first year is before 2017 are kept.

    Connection settings are taken, in order of precedence, from the keyword
    arguments, from the ``PGHOST`` / ``PGDATABASE`` / ``PGUSER`` /
    ``PGPASSWORD`` environment variables, and finally from the built-in
    defaults for host, database and user. The password has no default: if
    neither the argument nor ``PGPASSWORD`` is set, it is requested
    interactively via ``getpass``.

    NOTE: a password used to be hardcoded here. It must be considered
    leaked and should be rotated.

    Parameters
    ----------
    host, dbname, user, password : str, optional
        Overrides for the PostgreSQL connection settings.

    Returns
    -------
    pandas.DataFrame
        Columns ``authid``, ``afid``, ``subfield_most_frequent``,
        ``subfield_most_frequent_two_digit``, ``afid_max_year``,
        ``afid_min_year`` and ``spell_length``
        (``afid_max_year - afid_min_year + 1``).
    """
    import getpass  # local import: only needed for the interactive fallback

    # each row contains a authid X afid spell
    sql_text = """
    select authid,scopus_authors_aff_year_filled_chinese.afid,
    subfield_most_frequent,
    subfield_most_frequent_two_digit, 
    max(year) as afid_max_year, 
    min(year) as afid_min_year
    from scopus_authors_aff_year_filled_chinese
    left join scopus_affiliation on 
    scopus_authors_aff_year_filled_chinese.afid = scopus_affiliation.afid
    where scopus_affiliation.jttp_school = 1 and
    jttp_field = 1
    group by authid, 
    scopus_authors_aff_year_filled_chinese.afid, 
    subfield_most_frequent, 
    subfield_most_frequent_two_digit;
    """

    host = host or os.environ.get('PGHOST', 'id-hdb-psgr-cp7.ethz.ch')
    dbname = dbname or os.environ.get('PGDATABASE', 'led')
    user = user or os.environ.get('PGUSER', 'lixiang')
    # Never keep the password in the notebook: read it from the caller or
    # the environment, and prompt only as a last resort.
    password = (password
                or os.environ.get('PGPASSWORD')
                or getpass.getpass(f'Password for {user}@{host}/{dbname}: '))

    con = psycopg2.connect(host=host,
                           dbname=dbname,
                           user=user,
                           password=password)
    try:
        cur = con.cursor()
        cur.execute(sql_text)
        rows = cur.fetchall()
    finally:
        # Release the connection even when the query fails.
        con.close()

    columns = ['authid', 'afid',
               'subfield_most_frequent',
               'subfield_most_frequent_two_digit',
               'afid_max_year',
               'afid_min_year']
    spells = pd.DataFrame(rows, columns=columns)

    # keep all spells that terminated after 2011 and started before 2017
    # TODO(review): the original note here reads "fix - needs to be
    # min year <= 2017"; the bounds are left unchanged until the intended
    # window is confirmed.
    in_window = (spells['afid_max_year'] > 2011) & (spells['afid_min_year'] < 2017)
    # Take an explicit copy so that adding a column below operates on an
    # independent frame instead of a slice of `spells`.
    spells = spells.loc[in_window].copy()
    spells['spell_length'] = spells['afid_max_year'] - spells['afid_min_year'] + 1
    return spells

In [6]:
# Run the query and keep the filtered author x affiliation spells in `data`.
data = return_candiate_pool()

In [7]:
# Distribution of spell lengths: number of rows per distinct value.
data['spell_length'].value_counts()

spell_length
5.0     204075
6.0     177244
7.0     172169
8.0     152651
9.0     131329
1.0     120415
10.0    116480
11.0     99230
12.0     90775
13.0     81615
14.0     74006
15.0     70381
16.0     64588
4.0      61614
3.0      55580
2.0      48743
17.0     44388
20.0     34768
18.0     31141
19.0     28025
21.0     23785
22.0     19846
23.0     19629
24.0     17325
25.0     16312
28.0     13091
26.0     12998
27.0     11093
30.0      9666
29.0      9645
31.0      9300
32.0        24
35.0        17
37.0        13
34.0        12
33.0        12
38.0        11
36.0         7
77.0         1
96.0         1
Name: count, dtype: int64

# keeping only:
# authors whose summed spell length across jttp institutions is larger than 1

In [8]:
# Per-author total: sum `spell_length` over all rows sharing an `authid`
# and broadcast that sum back onto each of the author's rows.
data['total_spell'] = data.groupby('authid')['spell_length'].transform('sum')

In [9]:
# Distribution of the per-author totals.
data['total_spell'].value_counts()

total_spell
5.0      178728
7.0      168230
6.0      166317
8.0      156063
9.0      141131
          ...  
138.0        14
127.0        13
135.0        12
128.0        12
92.0         11
Name: count, Length: 214, dtype: int64

In [10]:
# Drop rows whose per-author total spell length is not greater than 1.
data = data[data['total_spell'] > 1]

In [11]:
# Write the filtered pool to the processed-data folder, without the index.
author_pool_path = os.path.join(dataprocDirectory, 'author_pool_unrestricted.csv')
data.to_csv(author_pool_path, index=False)