In [1]:
import pandas as pd
import numpy as np
import requests
import xml.etree.ElementTree as ET
from tqdm.notebook import tqdm
import time

In [2]:
scientists = pd.read_excel('./Input/DataScientists.xls')

In [3]:
scientists.head()

Unnamed: 0,name,country,institution,dblp,expertise
0,aaron elmore,united states,university of chicago,https://dblp.org/pers/e/Elmore:Aaron_J=.html,
1,abdalghani abujabal,germany,amazon alexa,https://dblp.org/pers/a/Abujabal:Abdalghani.html,
2,abdul quamar,united states,ibm research almaden,https://dblp.org/pers/q/Quamar:Abdul.html,
3,abdulhakim qahtan,netherlands,utrecht university,https://dblp.org/pid/121/4198.html,
4,abhijnan chakraborty,germany,max planck institute for software systems,https://dblp.org/pers/c/Chakraborty:Abhijnan.html,


In [4]:
scientists.shape

(1220, 5)

In [5]:
scientists.drop_duplicates(subset='dblp', inplace=True, ignore_index=True) #remove duplicates by url

In [6]:
scientists.shape

(1079, 5)

# Crawling for Q1 & Q2 (All collaborations, collaborations in yearly granularity)

In [7]:
def get_author_year_series(root):
    
    if root == None:
        return None
    
    author = root.attrib['pid']

    year_coauthor_dict = dict()
    for i in range(0, len(root)): 
    
        if root[i].tag == 'r': #only look at article entries
        
            publish_work = root[i][0].attrib['key']
            publish_year = root[i][0].attrib['mdate'][:4] #year
        
            current_year_coauthor_list = []
        
            for j in range(0, len(root[i][0])):
                if root[i][0][j].tag == 'author': 
                    current_year_coauthor_list.append(root[i][0][j].attrib['pid'])
        
            if publish_year not in year_coauthor_dict:
                year_coauthor_dict[publish_year] = current_year_coauthor_list
            
            else: year_coauthor_dict[publish_year] = year_coauthor_dict[publish_year] + current_year_coauthor_list
        
    for year in year_coauthor_dict:
        year_coauthor_dict[year] = [year_coauthor_dict[year]]
        
    year_coauthor_series = pd.Series(year_coauthor_dict, name = author)
    return year_coauthor_series

def get_author_root(url):
    try:
        r = requests.get(url[:-4] + 'xml').text
        root = ET.fromstring(r)
    except:
        return None
    
    return root

In [9]:
collab_network_list = []
problem_list = []
join_series_list = []

for i in range(0, len(scientists)):
    url = scientists.iloc[i]['dblp']    
    r = requests.get(url[:-4] + 'xml').text
    
    try:
        root = ET.fromstring(r)
    
    except:
        problem_list.append([root.attrib['pid'], url]) #track problematic entries
        continue
    
    author_pid = root.attrib['pid'] #figure out the pid of the author
    author_name = root.attrib['name']
    
    coauthors = []
    
    for j in range(0, len(root)): 
        if root[j].tag == 'r': #only look at article entries
            #print(root[i][0].attrib['key']) #article name
            for k in range(0, len(root[j][0])):
                if root[j][0][k].tag == 'author': #coauthors
                    #print(root[i][0][j].tag, "{0:<30}".format(root[i][0][j].text), 'pid: ' + root[i][0][j].attrib['pid'])
                    coauthors.append(root[j][0][k].attrib['pid'])
    
    coauthors = set(coauthors) #remove duplicates
    collab_network_list.append([author_name, author_pid, coauthors])

    join_series = get_author_year_series(root)
    join_series_list.append(join_series)
    
    if i%100 == 0: print('progress: ', i)
    
collab_network_csv = pd.DataFrame(collab_network_list, columns = ['author_name', 'author_pid', 'coauthors_list'])
problem_list_csv = pd.DataFrame(problem_list, columns = ['problem_pid', 'url'])

progress:  0
progress:  100
progress:  200
progress:  300
progress:  400
progress:  500
progress:  600
progress:  700
progress:  800
progress:  900
progress:  1000


In [12]:
print(len(collab_network_list), len(problem_list))
print(len(join_series_list))

976 103
976


In [13]:
collab_network_csv.to_csv('output/collab_network_csv.csv')
problem_list_csv.to_csv('output/problem_list_csv.csv')

In [14]:
join_series_list[:10]

[2024    [[42/469, 206/1695, 372/1694, 372/1979, 75/943...
 2023    [[163/0545, 347/1729, 75/9436, f/CFaloutsos, f...
 2022    [[163/0545, 147/1343, 194/6154, 287/9707, 288/...
 2021    [[50/7680-2, 155/5478, 117/1897-2, 75/9436, 14...
 2020    [[42/469-2, 147/1189, 75/9436, f/MJFranklin, 1...
 2019    [[191/0540, 163/0545, 147/1189, 75/9436, f/MJF...
 2018    [[228/8021, 228/7838, 10/6661, 75/9436, 131/41...
 2017    [[a/DivyakantAgrawal, a/AmrElAbbadi, 71/5974, ...
 Name: 75/9436, dtype: object,
 2023    [[226/4894, 40/4987, 162/9092, 295/3643, 295/3...
 2021    [[162/9092, 162/9092, 25/11105, 162/9092, 96/7...
 2019    [[250/2789, 96/7128, 162/9092, 54/7420, w/Gerh...
 2018    [[06/180, 162/9092, 96/7128, 28/8510, w/Gerhar...
 Name: 162/9092, dtype: object,
 2023    [[k/GeorgiaKoutrika, y/JunYang1, 82/7402, 76/3...
 2022    [[127/6195, 87/11444, 87/10957, o/FatmaOzcan, ...
 2021    [[o/FatmaOzcan, 87/10957, 127/6195, 87/11444, ...
 2020    [[127/6195, 257/5495, 27/3375, 87/10957, 24

In [17]:
year_granularity_df = pd.concat(join_series_list, axis=1, join='outer')

In [19]:
year_granularity_df.shape

(23, 976)

In [20]:
year_granularity_df.to_csv('output/year_granularity_df.csv')