In [1]:
# Sample paper record kept as a reference for the shape of the data handled
# below (nested ids, venue, fields of study, authors, ...).
# NOTE(review): looks like a Semantic Scholar paper payload (see the "url"
# value); no other cell visible in this notebook reads Example_out — confirm
# whether it is still needed.
Example_out = {
  "paperId": "649def34f8be52c8b66281af98ae884c09aef38b",
  "externalIds": {
    "ArXiv": "...",
    "DBLP": "...",
    "PubMedCentral": "..."
  },
  "url": "https://www.semanticscholar.org/paper/649def34f8be52c8b66281af98ae884c09aef38b",
  "title": "Construction of the Literature Graph in Semantic Scholar",
  "abstract": "We describe a deployed scalable system for organizing published scientific literature into a heterogeneous graph to facilitate algorithmic manipulation and discovery.",
  "venue": "International Conference on Software Engineering",
  "publicationVenue": {
    "id": "a36dc29e-4ea1-4567-b0fe-1c06daf8bee8",
    "name": "International Conference on Software Engineering",
    "type": "conference",
    "alternate_names": [
      "IEEE Int Conf Semicond Electron",
      "IEEE International Conference on Semiconductor Electronics",
      "ICSE",
      "Int Conf Softw Eng"
    ],
    "url": "http://www.icse-conferences.org/"
  },
  "year": 2018,
  "referenceCount": 321,
  "citationCount": 987,
  "influentialCitationCount": 654,
  "isOpenAccess": True,
  "openAccessPdf": {
    "url": "https://www.aclweb.org/anthology/N18-3011.pdf",
    "status": "HYBRID"
  },
  "fieldsOfStudy": [
    "Computer Science"
  ],
  "s2FieldsOfStudy": [
    {
      "category": "Computer Science",
      "source": "external"
    },
    {
      "category": "Computer Science",
      "source": "s2-fos-model"
    },
    {
      "category": "Mathematics",
      "source": "s2-fos-model"
    }
  ],
  "publicationTypes": [
    "Journal Article",
    "Review"
  ],
  "publicationDate": "2015-01-17",
  "journal": {
    "name": "Remote Sensing of Environment",
    "pages": "255-271",
    "volume": "176"
  },
  "citationStyles": {
    "bibtex": "@['JournalArticle', 'Conference']{Ammar2018ConstructionOT,\n author = {Waleed Ammar and Dirk Groeneveld and Chandra Bhagavatula and Iz Beltagy and Miles Crawford and Doug Downey and Jason Dunkelberger and Ahmed Elgohary and Sergey Feldman and Vu A. Ha and Rodney Michael Kinney and Sebastian Kohlmeier and Kyle Lo and Tyler C. Murray and Hsu-Han Ooi and Matthew E. Peters and Joanna L. Power and Sam Skjonsberg and Lucy Lu Wang and Christopher Wilhelm and Zheng Yuan and Madeleine van Zuylen and Oren Etzioni},\n booktitle = {NAACL},\n pages = {84-91},\n title = {Construction of the Literature Graph in Semantic Scholar},\n year = {2018}\n}\n"
  },
  "authors": [
    {
      "authorId": "1741101",
      "name": "Oren Etzioni"
    }
  ]
}

### Get authors (and their papers) for every id:

In [2]:
import pandas as pd
# Number of ids placed in each request batch.
ID_BATCH_SIZE = 50

# Read one id per line; the context manager closes the file handle as soon as
# the content has been read.
with open("IDs_real.txt") as id_file:
    ids = id_file.read().splitlines()
print(len(ids))

# Consecutive slices of at most ID_BATCH_SIZE ids each.
id_groups = [ids[x:x+ID_BATCH_SIZE] for x in range(0, len(ids), ID_BATCH_SIZE)]
print(len(id_groups))
print(id_groups[0])

2057
42
['2107389568', '1643347009', '47979799', '2068360159', '47559215', '2027186576', '6437306', '2144708089', '2109842138', '1796260814', '144490412', '116693203', '47526637', '1729586298', '2095141092', '145963427', '1831395', '1838478', '40504742', '2115496186', '2558997', '119218337', '146089746', '2141510269', '1778398', '8169994', '39613066', '10807072', '1470825038', '2064358', '123861276', '1712865', '2058192976', '40196311', '102718695', '31270684', '1878375', '51290625', '2168321727', '33978150', '144937884', '51034107', '32670167', '89974507', '102788905', '3178240', '2058460749', '1589493589', '5315446', '5704024']


In [3]:
import requests
# Endpoint of the batched author lookup, assembled from its three parts.
BASE_URL = "https://api.semanticscholar.org/graph/"
VERSION = "v1/"
RESOURCE = "author/batch"
URL = f"{BASE_URL}{VERSION}{RESOURCE}"

# Query string sent with every request: author-level fields plus the nested
# per-paper fields, joined into one comma-separated value.
params = {
    "fields": ",".join(
        [
            "authorId",
            "name",
            "aliases",
            "citationCount",
            "papers.fieldsOfStudy",
            "papers.authors",
            "papers.title",
            "papers.year",
            "papers.externalIds",
            "papers.citationCount",
            "papers.abstract",
        ]
    )
}

In [60]:
def make_author_dataset(lst):
    """Add one row per author in ``lst`` to ``author_dataset.csv``.

    Parameters
    ----------
    lst : iterable
        Author records: dicts with the keys ``authorId``, ``name``,
        ``aliases``, ``citationCount`` and ``papers`` (each paper carrying
        ``fieldsOfStudy``). Entries equal to the string ``"message"`` are
        counted, reported with ``print`` and skipped.

    Returns
    -------
    pandas.DataFrame
        The new rows followed by the rows already stored in
        ``author_dataset.csv`` (if that file exists). The same frame is
        written back to the file without its index.
    """
    header = ["AuthorID", "Name", "Aliases", "Citation Count", "Field"]
    try:
        previous = pd.read_csv("author_dataset.csv")
    except FileNotFoundError:
        # First run: nothing has been stored yet, so only the new rows exist.
        previous = None

    author_dataset = []
    skipped = 0
    for authorDict in lst:
        if authorDict == "message":
            skipped += 1
            print(skipped)
            continue
        # String form of every paper's field list, built once per author.
        fields = [str(paper["fieldsOfStudy"]) for paper in authorDict["papers"]]
        author_dataset.append([
            authorDict["authorId"],
            authorDict["name"],
            authorDict["aliases"],
            authorDict["citationCount"],
            # max() over strings picks the lexicographically greatest entry;
            # an author with a single paper now gets that paper's fields
            # instead of "None".
            # NOTE(review): the most frequent field may have been intended
            # here rather than the lexicographic maximum — confirm.
            max(fields) if fields else "None",
        ])
    df_author = pd.DataFrame(data=author_dataset, columns=header)
    if previous is not None:
        df_author = pd.concat([df_author, previous])
    df_author.to_csv("author_dataset.csv", index=False)
    return df_author

In [61]:
def make_paper_dataset(lst):
    """Add one row per paper of every author in ``lst`` to ``paper_dataset.csv``.

    Parameters
    ----------
    lst : iterable
        Author records: dicts whose ``papers`` entry is a list of paper dicts
        with the keys ``paperId``, ``title``, ``year``, ``externalIds``,
        ``citationCount``, ``fieldsOfStudy`` and ``authors``. Entries equal to
        the string ``"message"`` are skipped, as in ``make_author_dataset``.

    Returns
    -------
    pandas.DataFrame
        The new rows followed by the rows already stored in
        ``paper_dataset.csv`` (if that file exists). The same frame is written
        back to the file without its index.
    """
    header_paper = ["paperId", "title", "year", "externalId.DOI", "Citation Count", "fields", "authorIds", "authorNames"]
    try:
        previous = pd.read_csv("paper_dataset.csv")
    except FileNotFoundError:
        # First run: nothing has been stored yet, so only the new rows exist.
        previous = None

    paper_dataset = []
    for authorDict in lst:
        if authorDict == "message":
            continue
        # NOTE: papers are NOT de-duplicated; a paper listed under several
        # authors of ``lst`` produces one row per author.
        for paperDict in authorDict["papers"]:
            # Treat a missing (None) externalIds mapping like an empty one.
            external_ids = paperDict["externalIds"] or {}
            paper_authors = paperDict["authors"]
            paper_dataset.append([
                paperDict["paperId"],
                paperDict["title"],
                paperDict["year"],
                # The string "None" marks papers without a DOI.
                external_ids.get("DOI", "None"),
                paperDict["citationCount"],
                paperDict["fieldsOfStudy"],
                [author["authorId"] for author in paper_authors],
                [author["name"] for author in paper_authors],
            ])
    df_paper = pd.DataFrame(data=paper_dataset, columns=header_paper)
    if previous is not None:
        df_paper = pd.concat([df_paper, previous])
    df_paper.to_csv("paper_dataset.csv", index=False)
    return df_paper
    

In [62]:
def make_paper_abstract(lst):
    """Add one (paper id, abstract) row per paper to ``paper_abstract_dataset.csv``.

    Parameters
    ----------
    lst : iterable
        Author records: dicts whose ``papers`` entry is a list of paper dicts
        with the keys ``paperId`` and ``abstract``. Entries equal to the
        string ``"message"`` are skipped, as in ``make_author_dataset``.

    Returns
    -------
    pandas.DataFrame
        The new rows followed by the rows already stored in
        ``paper_abstract_dataset.csv`` (if that file exists). The same frame
        is written back to the file without its index.
    """
    header_paper_abstract = ["PaperId", "Abstract"]
    try:
        previous = pd.read_csv("paper_abstract_dataset.csv")
    except FileNotFoundError:
        # First run: nothing has been stored yet, so only the new rows exist.
        previous = None

    paper_abstract_dataset = [
        [paperDict["paperId"], paperDict["abstract"]]
        for authorDict in lst
        if authorDict != "message"
        for paperDict in authorDict["papers"]
    ]
    df_abstract = pd.DataFrame(data=paper_abstract_dataset, columns=header_paper_abstract)
    if previous is not None:
        df_abstract = pd.concat([df_abstract, previous])
    df_abstract.to_csv("paper_abstract_dataset.csv", index=False)
    return df_abstract

In [63]:
def get_data_in_batch(json_data, internal_error, i=None, update=False):
    """POST one batch of ids to ``URL`` and optionally persist the records.

    Parameters
    ----------
    json_data : dict
        Request body, e.g. ``{"ids": [...]}``.
    internal_error : list
        Collects the index of every batch whose response was not a list of
        records. It is mutated in place.
    i : int, optional
        Index of the current batch, appended to ``internal_error`` when the
        batch fails. When omitted, a failure is only printed.
    update : bool, optional
        When true, the received records are passed to ``make_author_dataset``,
        ``make_paper_dataset`` and ``make_paper_abstract``.

    Returns
    -------
    tuple or list
        With ``update=True``: ``(df1, df2, df3, internal_error)``, the three
        frames returned by the ``make_*`` helpers plus the error list.
        Otherwise: the list of record dicts received (empty when the batch
        failed).
    """
    lst = []
    r = requests.post(URL, params = params, json = json_data)
    try:
        data = r.json()
    except ValueError:
        # The body was not JSON (requests signals this with a ValueError
        # subclass); report it through the same path as an error payload.
        data = {"message": "non-JSON response (HTTP %s)" % r.status_code}

    if isinstance(data, list):
        for m in data:
            if isinstance(m, dict):
                lst.append(m)
            else:
                # Entries that are not records (e.g. None) are reported and dropped.
                print("error1", m)
    else:
        # Anything but a list is treated as a failed batch.
        print("error2", data)
        if i is not None:
            internal_error.append(i)

    if update:
        df1 = make_author_dataset(lst)
        df2 = make_paper_dataset(lst)
        df3 = make_paper_abstract(lst)
        return df1, df2, df3, internal_error
    return lst


In [29]:
# List handed to get_data_in_batch as its second argument by the fetch loop
# and later passed to find_neighbors.
content = []

In [31]:
# Request every batch of ids in turn, printing the batch index as progress.
# NOTE(review): this call passes only two arguments and no batch index;
# confirm it matches the signature of get_data_in_batch in the running kernel.
for i, id_group in enumerate(id_groups):
    print(i)
    json_data = {"ids": id_group}
    get_data_in_batch(json_data, content)

0
1
2
3
4
5
6
7
8
error1 None
9
10
11
12
13
14
15
16
error2 {'message': 'Internal server error'}
17
18
error1 None
19
20
21
error1 None
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
error1 None
38
39
40
41


### Finding neighbors

In [48]:
def get_neighbors_from_csv():
    """Return the ``neighbors`` column of ``neighbors.csv`` as a list.

    Rows with a missing value in any column are discarded before the column
    is extracted.
    """
    complete_rows = pd.read_csv("neighbors.csv").dropna()
    neighbor_values = complete_rows["neighbors"].values
    return list(neighbor_values)

In [55]:
# Load the stored neighbor ids and cast each one to int; both names refer to
# the same list afterwards.
true_neighbor_ids = list(map(int, get_neighbors_from_csv()))
neighbors = true_neighbor_ids

In [32]:
def find_neighbors(lst):
    """Collect the ids of all paper authors that are not already in ``ids``.

    Parameters
    ----------
    lst : iterable
        Author records: dicts whose ``papers`` entry is a list of paper dicts,
        each with an ``authors`` list of dicts holding ``authorId``. Entries
        that are not dicts (the ``"message"`` placeholder, ``None``) are
        skipped.

    Returns
    -------
    set
        Author ids found on any paper, minus those in the module-level
        ``ids`` list. Authors whose ``authorId`` is ``None`` are left out.
    """
    known_ids = set(ids)
    neighbors = set()
    for authorDict in lst:
        if not isinstance(authorDict, dict):
            continue
        for paperDict in authorDict["papers"]:
            for author in paperDict["authors"]:
                author_id = author["authorId"]
                # An author without an id cannot be requested by id later.
                if author_id is not None and author_id not in known_ids:
                    neighbors.add(author_id)
    return neighbors

In [36]:
# Derive the neighbor ids from the fetched records, then show how many there
# are together with the first five as a sample.
true_neighbor_ids = find_neighbors(content)
len(true_neighbor_ids), list(true_neighbor_ids)[:5]

(108597, ['51226150', '2293749', '4998049', '4107174', '121803596'])

#### Save neighbors

In [68]:
# Reload the batch indices stored in column "0" of internal_error_groups.csv.
error_frame = pd.read_csv("internal_error_groups.csv")
internal_error_groups = list(error_frame["0"].values)
print(internal_error_groups)

[22, 23, 25, 32, 34, 43, 55, 70, 72, 75, 76, 78, 83, 94]


In [57]:
# Number of neighbor ids placed in each request batch.
NEIGHBOR_BATCH_SIZE = 30

# Materialise the ids once so that slicing does not rebuild the whole list
# for every batch.
# NOTE(review): when true_neighbor_ids is a set of strings, its iteration
# order can differ between kernel sessions, so batch indices may not refer to
# the same ids after a restart — confirm before resuming by index.
neighbor_id_list = list(true_neighbor_ids)
neighbor_groups = [
    neighbor_id_list[x:x+NEIGHBOR_BATCH_SIZE]
    for x in range(0, len(neighbor_id_list), NEIGHBOR_BATCH_SIZE)
]
print(len(neighbor_groups))

3620


In [64]:
import time
start_time = time.time()

# Fetch neighbor batches 37..99 and store their records through
# get_data_in_batch(update=True).
for i in range(37, 100):
    print(i)
    json_data = {"ids": neighbor_groups[i]}
    df1, df2, df3, internal_error_groups = get_data_in_batch(json_data=json_data, internal_error=internal_error_groups, i=i, update=True)
    # Rewritten after every batch. index=False keeps the file to the single
    # "0" column, the same layout the final save cell writes.
    pd.DataFrame(internal_error_groups).to_csv("internal_error_groups.csv", index=False)

37
38
39
40
41
42
43
error2 {'message': 'Internal server error'}
44
45
46
47
48
49
50
51
52
53
54
55
error2 {'message': 'Internal server error'}
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
error2 {'message': 'Internal server error'}
71
72
error2 {'message': 'Endpoint request timed out'}
73
74
75
error2 {'message': 'Internal server error'}
76
error2 {'message': 'Internal server error'}
77
78
error2 {'message': 'Internal server error'}
79
80
81
82
83
error2 {'message': 'Internal server error'}
84
85
86
87
88
89
90
91
92
93
94
error2 {'message': 'Internal server error'}
95
96
97
98
99


In [67]:
# Show the failed batch indices and store them as a single column (named "0"
# by pandas), without the row index.
print(internal_error_groups)
pd.DataFrame(internal_error_groups).to_csv("internal_error_groups.csv", index=False)

[22, 23, 25, 32, 34, 43, 55, 70, 72, 75, 76, 78, 83, 94]


In [52]:
# Author frame (first value returned by get_data_in_batch with update=True).
df1

Unnamed: 0,AuthorID,Name,Aliases,Citation Count,Field
0,51226150,Charlotte Wehking,,59,['Computer Science']
1,2293749,J. Arribas,"[Jose Arribas, J. R. Arribas, José Ramon Arrib...",12617,"['Psychology', 'Medicine']"
2,4998049,S. Siabani,"[S A Siabani, S Siabani, Soraya Siabani]",29215,"['Political Science', 'Medicine']"
3,4107174,A. Iannielli,"[A Iannielli, Angelo Iannielli]",463,['Medicine']
4,121803596,S. Bernstein,"[S Bernstein, Sh Bernstein, Shawnee Bernstein,...",366,['Medicine']
...,...,...,...,...,...
295,50780388,Ruben Martins,[Ruben Carlos Goncalves Martins],1375,['Mathematics']
296,114296274,Molly W. Andolina,[Molly Andolina],1700,['Sociology']
297,8391894,Dongil Chung,,245,['Psychology']
298,144174680,C. Vignesh,"[C. Vignesh, Chakravarthy Vignesh, C Vignesh, ...",51,['Medicine']


In [53]:
# Paper frame (second value returned by get_data_in_batch with update=True).
df2

Unnamed: 0,paperId,title,year,externalId.DOI,Citation Count,fields,authorIds,authorNames
0,0f3db475f8b4f31e36683bccc737e21b48fcf06d,Process Automation at Generali CEE Holding: A ...,2021.0,10.1007/978-3-662-63047-1_2,0,[Computer Science],"[2099186844, 1396050817, 51226150]","[Jan Marek, Kurt Blümlein, Charlotte Wehking]"
1,199b54e89a983240c75407581ca4f1bd9958e877,Approaching Digitalization at an SME Manufactu...,2021.0,10.1007/978-3-030-80003-1_14,2,,"[2136009567, 51226150, 2059819586, 2091906597,...","[Michael Reiner Kamm, Charlotte Wehking, L. Ka..."
2,499f176be0e62e94dd4adba93a924b5842116c58,Orchestration of employees' creativity: A phas...,2021.0,10.24251/HICSS.2021.603,0,[Computer Science],"[51226150, 2059819586, 93264014, 1695746]","[Charlotte Wehking, L. Kaiser, Bernd Schenk, J..."
3,7f276498e32afad3a6ca80983bfc818437874ae2,How to Organize Digital Innovation? The Role o...,2021.0,10.1142/S0219877021500103,0,[Business],"[2059819586, 51226150, 1695746]","[L. Kaiser, Charlotte Wehking, J. Brocke]"
4,93dbc56340f2e8e75bfc3769ff3400bcdf188477,More than experience? - On the unique opportun...,2021.0,10.1016/J.IHEDUC.2021.100804,28,[Computer Science],"[11746271, 2119955, 51226150, 2423134, 1739074...","[Jennifer Fromm, Jaziar Radianti, Charlotte We..."
...,...,...,...,...,...,...,...,...
23751,491d225ee33c0c59d5e5a86aec44b7636c0a69ce,Lost in Space,1999.0,10.1177/0022002799043006006,19,[Economics],"[145157410, 48987554]","[Michael Shin, M. Ward]"
23752,98bf97c5b50b97373ba135591e3f37797db2160d,"The Diffusion of Democracy, 1946–1994",1998.0,10.1111/0004-5608.00112,313,[Economics],"[1400041070, 48987554, 2614162, 39629841, 2115...","[J. O’Loughlin, M. Ward, Corey Lofdahl, Jordin..."
23753,25c1d5119c2222e1efe29c8f98be58d8efc069ff,The Ironies of Affirmative Action: Empirical A...,1997.0,,4,[Sociology],"[97133679, 121917287, 1573557754, 2061237083, ...","[H. Fukurai, D. Davies, Anne-Marie Shin, dionn..."
23754,d9b98f7551239c93ecfff25df1252c48a4a414f9,A Comment on R. K. Ormrod and D. B. Cole's “To...,1997.0,10.1111/0033-0124.00064,1,[Sociology],[145157410],[Michael Shin]


In [54]:
# Abstract frame (third value returned by get_data_in_batch with update=True).
df3

Unnamed: 0,PaperId,Abstract
0,0f3db475f8b4f31e36683bccc737e21b48fcf06d,
1,199b54e89a983240c75407581ca4f1bd9958e877,
2,499f176be0e62e94dd4adba93a924b5842116c58,Digital innovation is a promising but challeng...
3,7f276498e32afad3a6ca80983bfc818437874ae2,"Innovation, especially digital innovation, is ..."
4,93dbc56340f2e8e75bfc3769ff3400bcdf188477,
...,...,...
23751,491d225ee33c0c59d5e5a86aec44b7636c0a69ce,This study examines the political geography of...
23752,98bf97c5b50b97373ba135591e3f37797db2160d,We examine the relationship between the tempor...
23753,25c1d5119c2222e1efe29c8f98be58d8efc069ff,The 1990s may be said to have been tumultuous ...
23754,d9b98f7551239c93ecfff25df1252c48a4a414f9,
