# TODO

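- [ ] Fix the arrays in the data ingestion (list fields such as `fieldsOfStudy` are currently stored as their string representation)
- [ ] Fix the datatypes in general (e.g. `year` and `publicationDate` are currently stored as strings)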
- [X] Fix the connection pooling issue (ClientError: failed to obtain a connection from the pool within 60.0s (timeout))

# Neo4j

In [1]:
import neo4j
import os
import json

In [2]:
# Directory scanned by the ingestion functions below; every entry in it is
# opened and parsed as the JSON document of one paper.
SEMANTIC_PATH = './semanticscholar_raw_data'

In [3]:
def execute(query: str, parameters: "dict | None" = None) -> None:
    """
    Executes a Cypher @query against the local Neo4j instance.

    @parameters is an optional mapping bound to the `$name` placeholders of
    the query; omit it for queries without placeholders.

    The session and the driver are always released, even when the query
    fails, so no connection is left dangling in the pool. Errors raised by
    the driver are propagated to the caller.

    TODO: It might be possible to avoid opening multiple connections to the database.
    """
    with neo4j.GraphDatabase.driver(uri="neo4j://localhost") as driver:
        with driver.session() as session:
            # Consume the result so the statement has fully run (and any
            # server-side error has surfaced) before the session is closed.
            session.run(query, parameters).consume()

In [4]:
def delete_graph() -> None:
    """
    Wipes the graph: removes every node together with the edges attached to it.
    """
    execute("""
        MATCH (n)
        DETACH DELETE n;
    """)

In [5]:
# Remove all existing nodes and edges before running the ingestion cells.
delete_graph()

In [6]:
def parse_journal_name(paper: dict) -> "str | None":
    """
    Return the journal name of @paper, or None when it is not available.

    Not every file has a field 'journal' in the json, and when present it
    may be empty/null or lack a 'name' entry. All of those cases yield None.
    """
    journal = paper.get('journal')
    if not journal:
        # Key missing, or present but null/empty.
        return None
    return journal.get('name')

In [7]:
def create_papers() -> None:
    """
    Create the nodes of label `Paper`, one per file found in SEMANTIC_PATH.

    Every property is stored as the string form of the JSON value (so a
    missing value is stored as "None" and a list as its textual
    representation). Each value is rendered as an escaped Cypher string
    literal, so quotes or backslashes in e.g. a title cannot break or
    alter the query.
    """
    for fname in os.listdir(SEMANTIC_PATH):
        print(f'Creating paper of {fname}')

        with open(os.path.join(SEMANTIC_PATH, fname), encoding='utf-8') as f:
            paper = json.load(f)

        properties = {
            'paper_id': paper['paperId'],
            'publication_venue': paper['publicationVenue'],
            'title': paper['title'],
            'venue': paper['venue'],
            'year': paper['year'],
            'fieldsOfStudy': paper['fieldsOfStudy'],
            'publicationDate': paper['publicationDate'],
            'journal_name': parse_journal_name(paper),
        }

        # json.dumps yields a double-quoted literal whose escapes (\", \\,
        # \n, \uXXXX, ...) are all valid inside a Cypher string literal.
        rendered = ',\n            '.join(
            f'{key}: {json.dumps(str(value), ensure_ascii=False)}'
            for key, value in properties.items()
        )

        query = f"""
        CREATE (n:Paper {{
            {rendered}
        }})
        """

        execute(query)

In [8]:
# Create one `Paper` node per file in SEMANTIC_PATH (prints one line per file).
create_papers()

Creating paper of 1226d86b1333a93141b76d88e3c3c9c5c8ac1fb0.json
Creating paper of a3a1a28874469b96efec78de0de1b49a6dc1c976.json
Creating paper of e919bcc14e08cc7bf2f844934d04ebfd7ad100f4.json
Creating paper of fe82515510f5810731f8e0bdd40520029372c0dc.json
Creating paper of 5611b3273898a7a5e9b73e6283211f9fd2ca452d.json
Creating paper of 9eab9d0931883ec8f40e21ebf042793d21c74363.json
Creating paper of 344de6cbbd40ee5fd56d06e6670fdcf0dc891ad4.json
Creating paper of bb135ecae2eac09bb4415e75eb71c947bfc146e5.json
Creating paper of d1e17d79fa406c44dc57c7f2e0a293549d4bdf3a.json
Creating paper of c367dc3b1efd1b32a3532d3f6334cf6560784342.json
Creating paper of d8b3b2d2c5b9de45b9dc5e3c12f22ae7e5ad232c.json
Creating paper of e661bc3eed70d9e3228f503ff45f66297100b14d.json
Creating paper of c64655d0f667e256123b7860cc7a0e2ce2680988.json
Creating paper of 35edd926ea34cb70cc1dfee35819dad67b1cad19.json
Creating paper of 9c042c2c64c2abc926d0d31668bc3ba10b19bc1d.json
Creating paper of b1a2ce46197a700d835657

In [9]:
def create_authors() -> None:
    """
    For each paper, generate a node with label `Author` for every author of that paper.
    We are using the MERGE here since we don't want to duplicate authors.

    Name and id are stored as strings and rendered as escaped Cypher string
    literals, so a name containing quotes or backslashes cannot break or
    alter the query.
    """
    for fname in os.listdir(SEMANTIC_PATH):
        print(f'Creating the authors of {fname}')

        with open(os.path.join(SEMANTIC_PATH, fname), encoding='utf-8') as f:
            paper = json.load(f)

        for author in paper['authors']:
            # json.dumps yields a double-quoted literal whose escapes are
            # all valid inside a Cypher string literal.
            name = json.dumps(str(author['name']), ensure_ascii=False)
            author_id = json.dumps(str(author['authorId']), ensure_ascii=False)

            query = f"""
            MERGE (n:Author {{
                name: {name},
                author_id: {author_id}
            }})
            """
            execute(query)

In [10]:
# Create the `Author` nodes; link_author_to_paper() later looks them up with
# MATCH, so this has to run before the linking step.
create_authors()

Creating the authors of 1226d86b1333a93141b76d88e3c3c9c5c8ac1fb0.json
Creating the authors of a3a1a28874469b96efec78de0de1b49a6dc1c976.json
Creating the authors of e919bcc14e08cc7bf2f844934d04ebfd7ad100f4.json
Creating the authors of fe82515510f5810731f8e0bdd40520029372c0dc.json
Creating the authors of 5611b3273898a7a5e9b73e6283211f9fd2ca452d.json
Creating the authors of 9eab9d0931883ec8f40e21ebf042793d21c74363.json
Creating the authors of 344de6cbbd40ee5fd56d06e6670fdcf0dc891ad4.json
Creating the authors of bb135ecae2eac09bb4415e75eb71c947bfc146e5.json
Creating the authors of d1e17d79fa406c44dc57c7f2e0a293549d4bdf3a.json
Creating the authors of c367dc3b1efd1b32a3532d3f6334cf6560784342.json
Creating the authors of d8b3b2d2c5b9de45b9dc5e3c12f22ae7e5ad232c.json
Creating the authors of e661bc3eed70d9e3228f503ff45f66297100b14d.json
Creating the authors of c64655d0f667e256123b7860cc7a0e2ce2680988.json
Creating the authors of 35edd926ea34cb70cc1dfee35819dad67b1cad19.json
Creating the authors

In [11]:
def link_author_to_paper() -> None:
    """
    Create the edge `Wrote` and `IsCorrespondingAuthor`, linking Authors and Papers.
    The first author is considered the corresponding author.

    Every author of a paper gets a `Wrote` edge; the first one additionally
    gets an `IsCorrespondingAuthor` edge (created before its `Wrote` edge).
    Author and Paper nodes are looked up with MATCH, so they must already
    exist. Edges are made with CREATE, so running this twice duplicates them.
    """
    for fname in os.listdir(SEMANTIC_PATH):
        print(f'Linking authors of file {fname}')

        # Read the file fully and close it before talking to the database.
        with open(os.path.join(SEMANTIC_PATH, fname), encoding='utf-8') as f:
            paper = json.load(f)

        # json.dumps yields an escaped, double-quoted literal that is a valid
        # Cypher string, so the ids cannot break or alter the query.
        paper_id = json.dumps(str(paper['paperId']), ensure_ascii=False)

        for position, author in enumerate(paper['authors']):
            author_id = json.dumps(str(author['authorId']), ensure_ascii=False)

            # The first author is the main corresponding author.
            if position == 0:
                edge_labels = ('IsCorrespondingAuthor', 'Wrote')
            else:
                edge_labels = ('Wrote',)

            for label in edge_labels:
                query = f"""
                    MATCH (a:Author {{author_id: {author_id}}})
                    WITH a
                    MATCH (p:Paper {{paper_id: {paper_id}}})
                    WITH a, p
                    CREATE (a)-[e:{label}]->(p);
                """
                execute(query)

In [12]:
# Connect the existing `Author` and `Paper` nodes with `Wrote` /
# `IsCorrespondingAuthor` edges (prints one line per file).
link_author_to_paper()

Linking authors of file 1226d86b1333a93141b76d88e3c3c9c5c8ac1fb0.json
Linking authors of file a3a1a28874469b96efec78de0de1b49a6dc1c976.json
Linking authors of file e919bcc14e08cc7bf2f844934d04ebfd7ad100f4.json
Linking authors of file fe82515510f5810731f8e0bdd40520029372c0dc.json
Linking authors of file 5611b3273898a7a5e9b73e6283211f9fd2ca452d.json
Linking authors of file 9eab9d0931883ec8f40e21ebf042793d21c74363.json
Linking authors of file 344de6cbbd40ee5fd56d06e6670fdcf0dc891ad4.json
Linking authors of file bb135ecae2eac09bb4415e75eb71c947bfc146e5.json
Linking authors of file d1e17d79fa406c44dc57c7f2e0a293549d4bdf3a.json
Linking authors of file c367dc3b1efd1b32a3532d3f6334cf6560784342.json
Linking authors of file d8b3b2d2c5b9de45b9dc5e3c12f22ae7e5ad232c.json
Linking authors of file e661bc3eed70d9e3228f503ff45f66297100b14d.json
Linking authors of file c64655d0f667e256123b7860cc7a0e2ce2680988.json
Linking authors of file 35edd926ea34cb70cc1dfee35819dad67b1cad19.json
Linking authors of f