In [1]:
Example_out = {
  "paperId": "649def34f8be52c8b66281af98ae884c09aef38b",
  "externalIds": {
    "ArXiv": "...",
    "DBLP": "...",
    "PubMedCentral": "..."
  },
  "url": "https://www.semanticscholar.org/paper/649def34f8be52c8b66281af98ae884c09aef38b",
  "title": "Construction of the Literature Graph in Semantic Scholar",
  "abstract": "We describe a deployed scalable system for organizing published scientific literature into a heterogeneous graph to facilitate algorithmic manipulation and discovery.",
  "venue": "International Conference on Software Engineering",
  "publicationVenue": {
    "id": "a36dc29e-4ea1-4567-b0fe-1c06daf8bee8",
    "name": "International Conference on Software Engineering",
    "type": "conference",
    "alternate_names": [
      "IEEE Int Conf Semicond Electron",
      "IEEE International Conference on Semiconductor Electronics",
      "ICSE",
      "Int Conf Softw Eng"
    ],
    "url": "http://www.icse-conferences.org/"
  },
  "year": 2018,
  "referenceCount": 321,
  "citationCount": 987,
  "influentialCitationCount": 654,
  "isOpenAccess": True,
  "openAccessPdf": {
    "url": "https://www.aclweb.org/anthology/N18-3011.pdf",
    "status": "HYBRID"
  },
  "fieldsOfStudy": [
    "Computer Science"
  ],
  "s2FieldsOfStudy": [
    {
      "category": "Computer Science",
      "source": "external"
    },
    {
      "category": "Computer Science",
      "source": "s2-fos-model"
    },
    {
      "category": "Mathematics",
      "source": "s2-fos-model"
    }
  ],
  "publicationTypes": [
    "Journal Article",
    "Review"
  ],
  "publicationDate": "2015-01-17",
  "journal": {
    "name": "Remote Sensing of Environment",
    "pages": "255-271",
    "volume": "176"
  },
  "citationStyles": {
    "bibtex": "@['JournalArticle', 'Conference']{Ammar2018ConstructionOT,\n author = {Waleed Ammar and Dirk Groeneveld and Chandra Bhagavatula and Iz Beltagy and Miles Crawford and Doug Downey and Jason Dunkelberger and Ahmed Elgohary and Sergey Feldman and Vu A. Ha and Rodney Michael Kinney and Sebastian Kohlmeier and Kyle Lo and Tyler C. Murray and Hsu-Han Ooi and Matthew E. Peters and Joanna L. Power and Sam Skjonsberg and Lucy Lu Wang and Christopher Wilhelm and Zheng Yuan and Madeleine van Zuylen and Oren Etzioni},\n booktitle = {NAACL},\n pages = {84-91},\n title = {Construction of the Literature Graph in Semantic Scholar},\n year = {2018}\n}\n"
  },
  "authors": [
    {
      "authorId": "1741101",
      "name": "Oren Etzioni"
    }
  ]
}

### Get papers for every id:

In [2]:
import pandas as pd
ids = open("data/IDs_real.txt").read().splitlines()
print(len(ids))
id_groups = [ids[x:x+100] for x in range(0, len(ids), 100)]
print(len(id_groups))

2057
21


In [3]:
import requests
BASE_URL = "https://api.semanticscholar.org/graph/"
VERSION = "v1/"
RESOURCE = "author/batch"
URL = BASE_URL + VERSION + RESOURCE

# params2 = {"ids": ids_groups[0][:5]}

params = {"fields": "authorId,name,aliases,citationCount,papers.fieldsOfStudy,papers.authors,papers.title,papers.year,papers.externalIds,papers.citationCount,papers.abstract"}

In [5]:
content = []
for i in range(len(id_groups)):
    json_data = {"ids": id_groups[i]}
    r = requests.post(URL, params = params, json = json_data)
    print(i)
    current_content = r.json()
    content += current_content

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20


### Finding neighbors

In [7]:
neighbors = []
for i in range(len(content)):
    authorDict = content[i]
    if authorDict == "message":
        continue
    paperDicts = authorDict["papers"]
    for paperDict in paperDicts:
        neighbors += [author["authorId"] for author in paperDict["authors"]]

In [8]:
true_neighbor_ids = set(neighbors + ids) - set(ids)
len(true_neighbor_ids), list(true_neighbor_ids)[:100]

(107793,
 ['2170073650',
  '31472086',
  '2110978546',
  '1393909859',
  '2110739435',
  '144333997',
  '152666047',
  '2068806092',
  '145271904',
  '2157418607',
  '86954831',
  '144150258',
  '4548083',
  '2060364398',
  '144355808',
  '103114264',
  '51127126',
  '145266047',
  '2169282994',
  '20774932',
  '2085548328',
  '1763640',
  '46714697',
  '145782546',
  '2104853',
  '1410176686',
  '31728784',
  '50580383',
  '1381347486',
  '92908105',
  '2109676973',
  '144277037',
  '50331281',
  '2869970',
  '2071839203',
  '10756122',
  '2118605714',
  '11325091',
  '1387468405',
  '2108046508',
  '2189262',
  '145687745',
  '2060356854',
  '120894401',
  '27121461',
  '2069577274',
  '120052601',
  '2164123465',
  '2149384030',
  '1393699478',
  '1409288763',
  '4710424',
  '40462390',
  '49969088',
  '49859114',
  '14278808',
  '11467102',
  '5117557',
  '5845498',
  '2059273032',
  '2056273317',
  '32551959',
  '5674770',
  '2141985',
  '4473727',
  '2100854986',
  '2697030',
  '

#### Save neighbors

In [9]:
neighbor_groups = [list(true_neighbor_ids)[x:x+100] for x in range(0, len(true_neighbor_ids), 100)]
neighbor_content = []
print(len(neighbor_groups))

1078


In [13]:
import time
start_time = time.time()

for i in range(127, len(neighbor_groups)):
    json_data = {"ids": neighbor_groups[i]}
    r = requests.post(URL, params = params, json = json_data)
    print(i)
    current_content = r.json()
    neighbor_content += current_content

40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127


KeyboardInterrupt: 

In [21]:
current_content

{'message': 'Internal server error'}

##### Need to do web scraping for all neighbors and include in content

### New Webscraping with neighbors

In [283]:
import time

start_time = time.time()
time.sleep(5)
if (time.time() - start_time) > 5.:
    print("yep")

yep


### Creating author dataset

In [16]:
content = neighbor_content

In [17]:
header = ["AuthorID", "Name", "Aliases", "Citation Count", "Field"]
author_dataset = []
skipped = 0
for authorDict in content:
    current_list = []
    if authorDict == "message":
        skipped += 1
        print(skipped)
        continue
    current_list.append(authorDict["authorId"])
    current_list.append(authorDict["name"])
    current_list.append(authorDict["aliases"])
    current_list.append(authorDict["citationCount"])
    if len([str(paper["fieldsOfStudy"]) for paper in authorDict["papers"]]) > 1:
        current_list.append(max([str(paper["fieldsOfStudy"]) for paper in authorDict["papers"]]))
    else:
        current_list.append("None")
    author_dataset.append(current_list)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109


In [15]:
author_dataframe = pd.DataFrame(data=author_dataset, columns=header)
author_dataframe

Unnamed: 0,AuthorID,Name,Aliases,Citation Count,Field
0,2107389568,Z. He,"[Z He, Zhongyang He]",40,['Mathematics']
1,1643347009,Jessy Xinyi Han,,26,['Computer Science']
2,47979799,William R. Hobbs,"[William Hobbs, William R. Hobbs, Will Hobbs, ...",753,['Sociology']
3,2068360159,I. Weaver,"[I. Weaver, Iain Weaver]",0,['Sociology']
4,47559215,Yutao Chen,[Yutao Chen],211,['Psychology']
...,...,...,...,...,...
1952,32174870,Ksenia D. Mukhina,"[Ksenia D Mukhina, Ksenia Mukhina]",136,"['Sociology', 'Computer Science']"
1953,1696916,C. Veenman,"[C. J. Veenman, C.j. Veenman, Cor J. Veenman, ...",3579,['Medicine']
1954,2071105981,Nick S. Jones,"[Nick Jones, Nick S Jones]",25,['Medicine']
1955,35618632,Chico Q. Camargo,"[Chico Camargo, Chico Q Camargo]",276,"['Physics', 'Computer Science', 'Geography', '..."


### Creating paper dataset

In [267]:
content[0]["papers"][0]["externalIds"]["DOI"]

'10.1109/SPMB52430.2021.9672258'

In [268]:
header_paper = ["paperId", "title", "year", "externalId.DOI", "Citation Count", "fields", "authorIds", "authorNames"]
paper_dataset = []
for authorDict in content:
    paperDicts = authorDict["papers"]
    for paperDict in paperDicts:
        current_list = [paperDict["paperId"], paperDict["title"], paperDict["year"]]
        if "DOI" in paperDict["externalIds"].keys():
            current_list.append(paperDict["externalIds"]["DOI"])
        else:
            current_list.append("None")
        current_list.append(paperDict["citationCount"])
        current_list.append(paperDict["fieldsOfStudy"])
        current_list.append([author["authorId"] for author in paperDict["authors"]])
        current_list.append([author["name"] for author in paperDict["authors"]])
        paper_dataset.append(current_list)
    

In [269]:
paper_dataframe = pd.DataFrame(data=paper_dataset, columns=header_paper)
paper_dataframe

Unnamed: 0,paperId,title,year,externalId.DOI,Citation Count,fields,authorIds,authorNames
0,cd29131082e7cc7f4d9c333afed8b58d4cf44ce5,Feature Reconstruction Based Channel Selection...,2021.0,10.1109/SPMB52430.2021.9672258,1,,"[2150507237, 2107389568, 2110194494]","[J. R. Msonda, Z. He, C. Lu]"
1,33b43646408161ad66a80ba8079d6e1aee244b65,Wavelet multipliers and signals,1999.0,10.1017/S0334270000010523,38,[Mathematics],"[2107389568, 11214739]","[Z. He, M. W. Wong]"
2,3bda43967a06cf4611c5e5c8636477c230663201,Principal Component Feature for Speech Recogni...,1999.0,,1,[Mathematics],[2107389568],[Z. He]
3,3972ac23a6eaf45dea73cab154706088498018b8,Chasm in Hegemony: Explaining and Reproducing ...,2021.0,10.1145/3460083,3,"[Computer Science, Sociology]","[2108343919, 1643347009, 2051502400, 120446735...","[Yiguang Zhang, Jessy Xinyi Han, Ilica Mahajan..."
4,5dabe81f9d6b53e75b0078322b3f0741f2788041,Chasm in Hegemony,2021.0,10.1145/3543516.3460109,2,[Computer Science],"[2108343919, 1643347009, 2051502400, 120446735...","[Yiguang Zhang, Jessy Xinyi Han, Ilica Mahajan..."
...,...,...,...,...,...,...,...,...
28371,f204e92575b3c37a67a9103d6ed00292578978b3,Mechanism of action of f 3-bungarotoxin on syn...,,,0,,"[2113780702, 145366990, 2173858]","[Indira Sen, P. Grantham, J. Cooper]"
28372,13d1782fc7384202e39af2aa9b3b77df2395bfe3,"Risk Analysis of Dam Failure, Floods and Growi...",2018.0,10.2139/ssrn.3254595,0,,[2059392121],[Binny Mathew]
28373,c633303e309a59baeb0c62e80e22b01f0b44e101,Asura: A Tale of Vanquished,2018.0,10.2139/SSRN.3270075,1,[History],[2059392121],[Binny Mathew]
28374,d2e119f3eab633aabf0e787df3540fa726145e87,Inequality of Gender-Based Victimization In Ma...,2018.0,10.2139/ssrn.3254517,0,[Sociology],[2059392121],[Binny Mathew]


### Paper Abstract Dataset

In [270]:
header_paper_abstract = ["PaperId", "Abstract"]
paper_abstract_dataset = []
for authorDict in content:
    paperDicts = authorDict["papers"]
    for paperDict in paperDicts:
        current_list = []
        current_list.append(paperDict["paperId"])
        current_list.append(paperDict["abstract"])
        paper_abstract_dataset.append(current_list)

In [275]:
paper_abstract_dataframe = pd.DataFrame(data=paper_abstract_dataset, columns=header_paper_abstract)
paper_abstract_dataframe

Unnamed: 0,PaperId,Abstract
0,cd29131082e7cc7f4d9c333afed8b58d4cf44ce5,There has been a surge in the use of consumer ...
1,33b43646408161ad66a80ba8079d6e1aee244b65,Abstract The Schatten-von Neumann property of ...
2,3bda43967a06cf4611c5e5c8636477c230663201,Using curve fitting and principalcom ponentana...
3,3972ac23a6eaf45dea73cab154706088498018b8,In networks with a minority and a majority com...
4,5dabe81f9d6b53e75b0078322b3f0741f2788041,In networks with a minority and a majority com...
...,...,...
28371,f204e92575b3c37a67a9103d6ed00292578978b3,The neurochemical activity of fl-bungarotoxin ...
28372,13d1782fc7384202e39af2aa9b3b77df2395bfe3,People have exploited water resource to a leve...
28373,c633303e309a59baeb0c62e80e22b01f0b44e101,
28374,d2e119f3eab633aabf0e787df3540fa726145e87,"Mahesh Dattani being a director, directed many..."


### Save dataframes

In [281]:
author_dataframe.to_csv("data/AuthorDataframe.csv", index=False)
paper_dataframe.to_csv("data/PaperDataset.csv", index=False)
paper_abstract_dataframe.to_csv("data/PaperAbstractDataset.csv", index=False)