In [1]:
import pandas as pd
import numpy as np
import requests
import xml.etree.ElementTree as ET
from tqdm.notebook import tqdm
import time

In [2]:
# Load the scientist list from the Excel sheet; the `dblp` column of each row
# is used by the crawling cells below as the author's page URL.
scientists = pd.read_excel('DataScientists.xls')

In [3]:
# Preview the first rows of the loaded table.
scientists.head()

Unnamed: 0,name,country,institution,dblp,expertise
0,aaron elmore,united states,university of chicago,https://dblp.org/pers/e/Elmore:Aaron_J=.html,
1,abdalghani abujabal,germany,amazon alexa,https://dblp.org/pers/a/Abujabal:Abdalghani.html,
2,abdul quamar,united states,ibm research almaden,https://dblp.org/pers/q/Quamar:Abdul.html,
3,abdulhakim qahtan,netherlands,utrecht university,https://dblp.org/pid/121/4198.html,
4,abhijnan chakraborty,germany,max planck institute for software systems,https://dblp.org/pers/c/Chakraborty:Abhijnan.html,


In [4]:
# (rows, columns) of the table as loaded.
scientists.shape

(1220, 5)

In [5]:
# Keep one row per dblp URL and renumber the index 0..n-1.
scientists = scientists.drop_duplicates(subset='dblp', ignore_index=True)

In [6]:
# (rows, columns) of the table at this point in the notebook.
scientists.shape

(1079, 5)

# Crawling for Q1 (All collaborations)

In [7]:
collab_network_list = []
problem_list = []


def _coauthor_pids(person_root):
    """Return the set of author pids found on the publication records of a person tree.

    Only children tagged 'r' are inspected; the first child of each 'r' is
    treated as the publication and its 'author' children are collected.
    Presumably the result also contains the person's own pid, since they are
    listed as an author on their own records -- verify against the data.
    """
    pids = set()
    for record in person_root:
        if record.tag != 'r' or len(record) == 0:  # skip non-publication / empty wrappers
            continue
        for field in record[0]:
            if field.tag == 'author':
                pids.add(field.attrib['pid'])
    return pids


for url in tqdm(scientists['dblp']):
    try:
        # The XML export lives at the same address with 'html' swapped for 'xml'.
        r = requests.get(url[:-4] + 'xml').text
        root = ET.fromstring(r)
        author_pid = root.attrib['pid']  # pid of the author this page belongs to
        author_name = root.attrib['name']
    except (requests.RequestException, ET.ParseError, KeyError):
        # The pid cannot be known when the page failed to download or parse,
        # so only the URL identifies the problematic entry.
        problem_list.append([None, url])
        time.sleep(0.5)  # keep throttling even after a failure
        continue

    coauthors = _coauthor_pids(root)  # a set, so repeated collaborators appear once
    collab_network_list.append([author_name, author_pid, coauthors])

    time.sleep(0.5)  # pause between requests


collab_network_csv = pd.DataFrame(collab_network_list, columns = ['author_name', 'author_pid', 'coauthors_list'])
problem_list_csv = pd.DataFrame(problem_list, columns = ['problem_pid', 'url'])

  0%|          | 0/1079 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Debug aid: `url` still holds the value from the last loop iteration that ran,
# i.e. the page being processed when the crawl above stopped.
print(url)

In [None]:
# Number of authors crawled successfully vs. number of entries recorded as problematic.
print(len(collab_network_list), len(problem_list))

In [None]:
# Persist both crawl results as CSV.
# NOTE(review): assumes the 'output/' directory already exists and that the crawl
# cell ran to completion (both frames are only built after its loop) -- confirm.
collab_network_csv.to_csv('output/collab_network_csv.csv')
problem_list_csv.to_csv('output/problem_list_csv.csv')

# Crawling for Q2 (Collaboration at yearly granularity)

In [None]:
def get_author_year_series(root):
    """Group the author pids on a person's publications by publication year.

    Parameters
    ----------
    root : xml.etree.ElementTree.Element or None
        Root element of a dblp person XML page (must carry a 'pid' attribute),
        or None when the page could not be loaded.

    Returns
    -------
    pandas.Series or None
        None if `root` is None. Otherwise a Series named after the person's pid,
        indexed by year (as a string). Each value is a one-element list wrapping
        the list of author pids that appear on that year's publications
        (duplicates are kept, in document order).
    """
    if root is None:
        return None

    author = root.attrib['pid']

    year_coauthor_dict = dict()
    for record in root:
        if record.tag != 'r' or len(record) == 0:  # only look at publication entries
            continue

        publication = record[0]

        # The publication year is stored in the <year> child element. The 'mdate'
        # attribute is the date the record was last modified, so it is only used
        # as a fallback when no <year> is present.
        year_element = publication.find('year')
        if year_element is not None and year_element.text:
            publish_year = year_element.text.strip()
        else:
            publish_year = publication.attrib['mdate'][:4]

        current_year_coauthor_list = [
            field.attrib['pid'] for field in publication if field.tag == 'author'
        ]
        year_coauthor_dict.setdefault(publish_year, []).extend(current_year_coauthor_list)

    # Wrap each year's list in an outer list so every Series cell holds [[pid, ...]].
    for year in year_coauthor_dict:
        year_coauthor_dict[year] = [year_coauthor_dict[year]]

    year_coauthor_series = pd.Series(year_coauthor_dict, name = author)
    return year_coauthor_series

def get_author_root(url):
    """Download the XML version of a dblp author page and parse it.

    The XML address is built by replacing the last four characters of `url`
    with 'xml' (so a URL ending in 'html' ends in 'xml').

    Returns the parsed root element, or None when the download or the parse
    fails for any reason (best effort: callers treat None as "no data").
    """
    try:
        r = requests.get(url[:-4] + 'xml').text
        root = ET.fromstring(r)
    except Exception:
        # `except Exception` rather than a bare `except` so that KeyboardInterrupt
        # and SystemExit still propagate and the long crawl can be stopped.
        return None

    return root

# One entry per scientist, in table order: a per-year Series, or None when the
# author's page could not be loaded.
join_series_list = []

for url in tqdm(scientists['dblp']):
    author_root = get_author_root(url)
    join_series_list.append(get_author_year_series(author_root))

    time.sleep(0.5)  # pause between requests
    
    

In [None]:
# Display the collected per-author results (one entry per crawled scientist).
join_series_list