In [7]:
import os

import requests
from neo4j import GraphDatabase
from scholarly import scholarly, ProxyGenerator
# Connection URI of the local Neo4j server; passed to GraphDatabase.driver()
# by the database helpers defined further down in this notebook.
URI = "neo4j://localhost:7687"


In [8]:
# Route scholarly's requests through a ScraperAPI proxy.
# The API key is read from the SCRAPERAPI_KEY environment variable so the
# secret never lives in the notebook (or in its saved outputs / git history).
# NOTE: any key that was previously committed in this cell should be treated
# as leaked and rotated.
pg = ProxyGenerator()
# Alternative without an API key: success = pg.FreeProxies()
scraper_api_key = os.environ.get("SCRAPERAPI_KEY")
if not scraper_api_key:
    print("SCRAPERAPI_KEY is not set.")
# Short-circuit so ScraperAPI() is never called with an empty/missing key.
success = bool(scraper_api_key) and pg.ScraperAPI(scraper_api_key)
if success:
    scholarly.use_proxy(pg)
else:
    print("No Proxy found.")

In [9]:
# Example lookup: search Google Scholar for one author by "name, email domain".
author_query = 'Masahiro Suzuki, weblab.t.u-tokyo.ac.jp'
# search_author() hands back an iterator over matching author records.
search_query = scholarly.search_author(author_query)
# Keep only the first match; next() raises StopIteration when there is none.
first_author_result = next(search_query)

In [58]:
# Ask scholarly to fill only the "indices" section of the author found above
# (first_author_result comes from the previous search cell).
more_info = scholarly.fill(first_author_result, sections=["indices"])

In [59]:
# Display the filled author record returned by scholarly.fill().
more_info

{'container_type': 'Author',
 'filled': ['indices'],
 'source': <AuthorSource.SEARCH_AUTHOR_SNIPPETS: 'SEARCH_AUTHOR_SNIPPETS'>,
 'scholar_id': 'r2nt5kUAAAAJ',
 'url_picture': 'https://scholar.google.com/citations?view_op=medium_photo&user=r2nt5kUAAAAJ',
 'name': 'Masahiro Suzuki',
 'affiliation': 'The University of Tokyo',
 'email_domain': '@weblab.tu-tokyo.ac.jp',
 'interests': ['Artificial intelligence', 'Deep learning'],
 'citedby': 588,
 'citedby5y': 490,
 'hindex': 9,
 'hindex5y': 9,
 'i10index': 9,
 'i10index5y': 8}

In [10]:
def run_cypher_match(neo4j_driver, db_name, query, params=None):
    """Run a Cypher query and return all of its records as a list.

    Args:
        neo4j_driver: Driver object exposing ``session(database=...)``.
        db_name: Name of the database the session is opened against.
        query: Cypher text to execute.
        params: Optional query parameters, forwarded to ``session.run``
            through its ``parameters`` keyword.

    Returns:
        list: Every record yielded by the result, collected while the
        session is still open.
    """
    db_session = neo4j_driver.session(database=db_name)
    try:
        # Materialise the result before the session is closed below.
        records = list(db_session.run(query, parameters=params))
    finally:
        # Always release the session, even if the query raised.
        db_session.close()
    return records

def get_author_batch(size=50):
    """Fetch up to ``size`` authors that have no ``scholar_id`` property yet.

    Args:
        size: Maximum number of authors to return. Must be convertible to a
            non-negative integer.

    Returns:
        list: Records exposing the ``id`` and ``name`` of each matched
        ``Author`` node, as returned by ``run_cypher_match``.

    Raises:
        ValueError: If ``size`` is negative.
    """
    limit = int(size)
    if limit < 0:
        raise ValueError(f"size must be non-negative, got {size!r}")

    # The limit travels as a query parameter instead of being formatted into
    # the Cypher text, so the query string is constant and nothing can be
    # injected through ``size``.
    query = (
        "MATCH (a:Author) WHERE a.scholar_id IS NULL "
        "RETURN a.id AS id, a.name AS name LIMIT $size"
    )
    # NOTE(review): database credentials are hard-coded here; consider moving
    # them to configuration / environment variables.
    with GraphDatabase.driver(URI, auth=("neo4j", "openreview")) as driver:
        # run_cypher_match returns a fully materialised list, so the result
        # stays usable after the driver is closed.
        batch = run_cypher_match(
            driver,
            "open-review-data",
            query,
            {"size": limit},
        )
    return batch


def find_author_info_gscholar(id, name):
    """Look an author up on Google Scholar and return their profile data.

    When ``id`` looks like an e-mail address, the search is first attempted
    with ``"<name>, <domain>"`` and falls back to the bare name. For any other
    id only the name is searched. The first hit is filled with the "indices"
    section and returned.

    Args:
        id: Author identifier from the graph; may be an e-mail address.
            (The name shadows the ``id`` builtin but is kept for callers.)
        name: Author's display name.

    Returns:
        dict | None: The filled author record without the bookkeeping keys
        ``container_type``, ``name``, ``source``, ``url_picture`` and
        ``filled``; ``None`` when no author matched or the fill step failed.

    Note:
        Exceptions raised by the search itself are not caught here and
        propagate to the caller.
    """
    # Candidate queries, most specific first, each paired with its log line.
    candidates = []
    if "@" in id:
        domain = id.split("@")[1]
        candidates.append((f'{name}, {domain}', "found with name and domain."))
    candidates.append((f'{name}', "found with name."))

    first_author_result = None
    for query, found_message in candidates:
        # next(..., None) yields None instead of raising StopIteration.
        first_author_result = next(scholarly.search_author(query), None)
        if first_author_result is not None:
            print(found_message)
            break
    if first_author_result is None:
        return None

    try:
        more_info = scholarly.fill(first_author_result, sections=["indices"])
    except Exception:
        # Best effort: a failed fill is reported as "no data" to the caller,
        # but KeyboardInterrupt / SystemExit are no longer swallowed.
        print("Couldn't extract hindex")
        return None

    print(more_info)
    # Drop keys that should not be stored on the graph node. pop() with a
    # default tolerates records lacking optional keys such as url_picture.
    for key in ("container_type", "name", "source", "url_picture", "filled"):
        more_info.pop(key, None)
    return more_info

def set_author_gscholar_info(id, gscholar_properties):
    """Write Google Scholar properties onto the ``Author`` node with this id.

    Args:
        id: Value matched against the ``id`` property of ``Author`` nodes.
        gscholar_properties: Mapping of property names to values; applied
            with ``SET a += ...`` on the matched node.

    Returns:
        None. A driver and a session on the "open-review-data" database are
        opened for the call and closed again before returning.
    """
    cypher = """
    MATCH (a:Author {id: $id})
    SET a += $newProperties
    RETURN a
    """

    def _apply_properties(tx):
        # Unit of work handed to execute_write.
        return tx.run(cypher, id=id, newProperties=gscholar_properties)

    credentials = ("neo4j", "openreview")
    with GraphDatabase.driver(URI, auth=credentials) as driver, \
            driver.session(database="open-review-data") as session:
        session.execute_write(_apply_properties)
        

In [11]:
def main():
    """Enrich authors with Google Scholar data, batch by batch.

    Repeatedly fetches a batch via ``get_author_batch`` and, for every author
    in it, looks the author up with ``find_author_info_gscholar`` and stores
    the outcome with ``set_author_gscholar_info``. Authors that cannot be
    found are stored with ``scholar_id`` set to "na". The loop stops when a
    fetched batch is empty.
    """
    pending = get_author_batch()
    while pending:
        for record in pending:
            print(record)
            author_id = record["id"]
            gscholar_info = find_author_info_gscholar(author_id, record["name"])
            if gscholar_info is None:
                print("author not found on gscholar")
                # Sentinel value stored in place of real Scholar data.
                gscholar_info = {"scholar_id": "na"}
            set_author_gscholar_info(author_id, gscholar_info)
        pending = get_author_batch()

In [12]:
# Run the enrichment loop; it keeps going until get_author_batch() returns an
# empty batch, issuing Google Scholar lookups and Neo4j writes along the way.
main()

<Record id='sylvaingelly@google.com' name='Sylvain Gelly'>
found with name.
{'container_type': 'Author', 'filled': ['indices'], 'source': <AuthorSource.SEARCH_AUTHOR_SNIPPETS: 'SEARCH_AUTHOR_SNIPPETS'>, 'scholar_id': 'm7LvuTkAAAAJ', 'url_picture': 'https://scholar.google.com/citations?view_op=medium_photo&user=m7LvuTkAAAAJ', 'name': 'Sylvain Gelly', 'affiliation': 'Google Brain Zurich', 'email_domain': '@m4x.org', 'interests': ['Machine Learning', 'Artificial Intelligence', 'Reinforcement Learning'], 'citedby': 48358, 'citedby5y': 45288, 'hindex': 45, 'hindex5y': 40, 'i10index': 72, 'i10index5y': 59}
<Record id='bs@tuebingen.mpg.de' name='Bernhard Sch ̈olkopf'>
author not found on gscholar
<Record id='abdu@weblab.t.u-tokyo.ac.jp' name='Abdul Rahman Abdul Ghani'>
found with name.
{'container_type': 'Author', 'filled': ['indices'], 'source': <AuthorSource.SEARCH_AUTHOR_SNIPPETS: 'SEARCH_AUTHOR_SNIPPETS'>, 'scholar_id': '8ery-IUAAAAJ', 'url_picture': 'https://scholar.google.com/citations?

MaxTriesExceededException: Cannot Fetch from Google Scholar.