In [1]:
import pandas as pd
import numpy as np
import requests
import xml.etree.ElementTree as ET

In [2]:
scientists = pd.read_excel('DataScientists.xls')

In [3]:
scientists.head()

Unnamed: 0,name,country,institution,dblp,expertise
0,aaron elmore,united states,university of chicago,https://dblp.org/pers/e/Elmore:Aaron_J=.html,
1,abdalghani abujabal,germany,amazon alexa,https://dblp.org/pers/a/Abujabal:Abdalghani.html,
2,abdul quamar,united states,ibm research almaden,https://dblp.org/pers/q/Quamar:Abdul.html,
3,abdulhakim qahtan,netherlands,utrecht university,https://dblp.org/pid/121/4198.html,
4,abhijnan chakraborty,germany,max planck institute for software systems,https://dblp.org/pers/c/Chakraborty:Abhijnan.html,


In [4]:
scientists.shape

(1220, 5)

In [5]:
duplicate_frame = pd.DataFrame(columns = scientists.columns)

for i in range(0, len(scientists[scientists.duplicated(subset='dblp')])):
    duplicate = scientists[scientists.duplicated(subset='dblp')].iloc[i]['dblp']
    small_frame = scientists[scientists['dblp'] == duplicate]
    duplicate_frame = pd.concat([duplicate_frame, small_frame])

In [6]:
duplicate_frame

Unnamed: 0,name,country,institution,dblp,expertise
7,abolfazl asudeh,united states,university of illinois at chicago,https://dblp.org/pid/04/7892.html,
8,abolfazl asudeh,united states,university of michifan,https://dblp.org/pid/04/7892.html,
15,alfons kemper,germany,technical university of munich,https://dblp.org/pid/k/AlfonsKemper.html,
16,alfons kemper,germany,tu munich,https://dblp.org/pid/k/AlfonsKemper.html,
24,alon halevy,united states,facebook,https://dblp.uni-trier.de/pers/h/Halevy:Alon_Y...,
...,...,...,...,...,...
1200,zechao shang,united states,univesity of chicago,https://dblp.org/pers/s/Shang:Zechao.html,
1209,zhongle xie,china,nus ai innovation and commercialisation center,https://dblp.org/pid/169/3419.html,
1210,zhongle xie,china,zhejiang university,https://dblp.org/pid/169/3419.html,
1216,zoi kaoudi,qatar,qatar computing research institute,https://dblp.org/pers/k/Kaoudi:Zoi.html,


In [7]:
scientists.drop_duplicates(subset='dblp', inplace=True, ignore_index=True) #remove duplicates by url

In [8]:
scientists.shape

(1079, 5)

In [27]:
url = scientists.iloc[4]['dblp']
r = requests.get(url[:-4] + 'xml').text
root = ET.fromstring(r)

author = root.attrib['pid']

year_coauthor_dict = dict()
for i in range(0, len(root)): 
    
    if root[i].tag == 'r': #only look at article entries
        
        publish_work = root[i][0].attrib['key']
        publish_year = root[i][0].attrib['mdate'][:4] #year
        
        current_year_coauthor_list = []
        
        for j in range(0, len(root[i][0])):
            if root[i][0][j].tag == 'author': 
                current_year_coauthor_list.append(root[i][0][j].attrib['pid'])
        
        if publish_year not in year_coauthor_dict:
            year_coauthor_dict[publish_year] = current_year_coauthor_list
            
        else: year_coauthor_dict[publish_year] = year_coauthor_dict[publish_year] + current_year_coauthor_list
        
for year in year_coauthor_dict:
    year_coauthor_dict[year] = [year_coauthor_dict[year]]
        
year_coauthor_df = pd.DataFrame.from_dict(year_coauthor_dict, orient='index', columns = [author])
year_coauthor_df

Unnamed: 0,116/1678
2024,"[349/0104, 141/4293, 116/1678, 67/370, 345/640..."
2023,"[188/5728, 116/1678, 06/900-1, m/AnimeshMukher..."
2022,"[48/10293, 73/10042, 116/1678, 141/9244, 57/49..."
2021,"[240/9044, 294/1305, 40/7987, g/PKrishnaGummad..."
2020,"[240/9044, 116/1678, 130/0373, g/PKrishnaGumma..."
2019,"[48/10293, 116/1678, 73/10042, 141/9244, 57/49..."
2018,"[116/1678, 116/1678, 230/4496, 52/6987, g/PKri..."
2017,"[116/1678, 48/3394, p/VenkataNPadmanabhan, 97/..."


In [11]:
collab_network_list = []
problem_list = []

for i in range(0, len(scientists)):
    url = scientists.iloc[i]['dblp']    
    r = requests.get(url[:-4] + 'xml').text
    
    try:
        root = ET.fromstring(r)
    
    except:
        print(url) #problematic entries
        problem_list.append([root.attrib['pid'], url])
        continue
    
    author_pid = root.attrib['pid'] #figure out the pid of the author
    author_name = root.attrib['name']
    
    coauthors = []
    
    for j in range(0, len(root)): 
        if root[j].tag == 'r': #only look at article entries
            #print(root[i][0].attrib['key']) #article name
            for k in range(0, len(root[j][0])):
                if root[j][0][k].tag == 'author': #coauthors
                    #print(root[i][0][j].tag, "{0:<30}".format(root[i][0][j].text), 'pid: ' + root[i][0][j].attrib['pid'])
                    coauthors.append(root[j][0][k].attrib['pid'])
    
    coauthors = set(coauthors) #remove duplicates
    collab_network_list.append([author_name, author_pid, coauthors])

collab_network_csv = pd.DataFrame(collab_network_list, columns = ['author_name', 'author_pid', 'coauthors_list'])
problem_list_csv = pd.DataFrame(problem_list, columns = ['problem_pid', 'url'])

https://dblp.org/pid/39/1380.html
https://dblp.org/pers/hd/g/Gupta:Amit
https://dblp.org/pers/hd/m/Mukherjee_0001:Animesh
https://dblp.uni-trier.de/pers/c/Chakraborty:Anirban.html
https://dblp.org/pers/hd/p/Pal_0001:Arindam
https://dblp.uni-trier.de/pers/hd/y/Yang_0002:Bin
https://dblp.uni-trier.de/pers/hd/j/J=oacute=nsson_0001:Bj=ouml=rn_=THORN==oacute=r
https://dblp.uni-trier.de/pers/hd/g/Glavic:Boris
https://dblp.uni-trier.de/pers/hd/m/Mohan_0001:C=
https://dblp.org/pid/92/2769.html
https://dblp.org/pid/148/7268.html
https://dblp.org/pers/hd/c/Chen:Xiuying
https://dblp.org/pers/hd/z/Za=in:Choiru
https://dblp.org/pers/hd/y/Yu_0001:Cong
https://dblp.uni-trier.de/pers/hd/w/Wang:Daheng
https://dblp.org/pers/hd/p/P_0001:Deepak
https://dblp.uni-trier.de/pers/b/Barbosa:Denilson.html
https://dblp.org/pers/g/Georgakopoulos:Dimitrios.html
https://dblp.org/pers/hd/p/Papadias:Dimitris
https://dblp.org/pid/161/0102.html
https://dblp.uni-trier.de/pers/hd/k/Kim:Donghyun
https://dblp.org/pers/m/Man

In [12]:
collab_network_csv #lost 244 rows bruhh

Unnamed: 0,author_name,author_pid,coauthors_list
0,Aaron J. Elmore,75/9436,"{01/3996, 82/75, f/AlanDavidFekete, a/Divyakan..."
1,Abdalghani Abujabal,162/9092,"{57/4545, 61/11469, t/MartinTheobald, 70/8150,..."
2,Abdul Quamar,127/6195,"{54/4272, c/LauraChiticariu, f/AlanDavidFekete..."
3,Abdulhakim Ali Qahtan,121/4198,"{e/AKElmagarmid, 09/7444, 25/7045-12, 131/4202..."
4,Abhijnan Chakraborty,116/1678,"{58/657, 138/1229, 372/7472, 10/7062, 139/5123..."
...,...,...,...
971,Zi Huang,70/6862,"{117/5520, s/DiveshSrivastava, 217/2275, 73/39..."
972,Ziawasch Abedjan,38/8707,"{131/4767, 184/2435, 231/9857, 131/4202, 371/2..."
973,Zoi Kaoudi,65/1457,"{207/2074, m/IoanaManolescu, e/AKElmagarmid, 9..."
974,Zsolt István,50/10703,"{181/2612, 223/0854, 98/6277, 251/6665, 15/145..."


In [13]:
collab_network_csv.to_csv('collab_network_csv.csv')
problem_list_csv.to_csv('problem_list_csv.csv')