# TODO

- [ ] Fix the arrays in the data ingestion
- [ ] Fix the datatypes in general
- [X] Fix the connection pooling issue (ClientError: failed to obtain a connection from the pool within 60.0s (timeout))

# Libraries and constants

In [1]:
# !pip install tqdm

In [2]:
import os
import json
import random

import neo4j
import numpy as np
from tqdm import tqdm

In [3]:
# Directory (relative to this notebook) holding the raw JSON files; each file is parsed as one paper.
SEMANTIC_PATH = '../semanticscholar_raw_data'

# Seed used for both `random` and NumPy so the synthetic data is reproducible.
SEED = 13

In [4]:
# Seed both generators: the synthetic review data draws from `random` and from `np.random`.
random.seed(SEED)
np.random.seed(SEED)

# Neo4j

In [5]:
# Single driver shared by every query in this notebook.
# NOTE(review): no credentials are passed — assumes authentication is disabled on the local server.
DRIVER = neo4j.GraphDatabase.driver(uri="neo4j://localhost")
# Usage: DRIVER.execute_query(query)

In [6]:
def execute(query: str, parameters=None):
    """
    Executes a Cypher @query through the shared DRIVER and returns its result.

    @parameters is an optional dict of Cypher query parameters, referenced as
    `$name` inside @query. Prefer it over formatting values into the query
    text: parameter values need no quoting or escaping.

    Returns whatever `DRIVER.execute_query` returns; callers in this notebook
    read the records from index 0 of that result.
    """
    return DRIVER.execute_query(query, parameters_=parameters)

In [7]:
def delete_graph() -> None:
    """
    Wipes the database: removes all nodes together with their relationships.
    """
    execute("""
        MATCH (n)
        DETACH DELETE n;
    """)

In [8]:
# Start from an empty database: the loaders below use CREATE, so re-running them on old data duplicates it.
# WARNING: this removes everything in the database DRIVER points at.
delete_graph()

## Papers

In [9]:
def parse_journal_name(paper) -> "str | None":
    """
    Return the journal name of @paper, or None when it has none.

    Not every file has a field 'journal' in the json, and the field can be
    present but empty/null. Both cases, as well as a journal entry without a
    'name', yield None.
    """
    journal = paper.get('journal')
    if not journal:
        return None

    return journal.get('name')

In [10]:
def create_papers():
    """
    Create the nodes of label `Paper`, one per JSON file found in SEMANTIC_PATH.

    Values are sent as Cypher query parameters instead of being formatted into
    the query text, so a quote or backslash in e.g. a title cannot break (or
    alter) the statement.

    Every property is stored as the string form of the JSON value (a missing
    value becomes the string "None"); converting them to native types is
    still an open TODO.
    """
    query = """
        CREATE (n:Paper {
            paper_id: $paper_id,
            publication_venue: $publication_venue,
            title: $title,
            venue: $venue,
            year: $year,
            fieldsOfStudy: $fields_of_study,
            publicationDate: $publication_date
        })
    """

    for fname in tqdm(os.listdir(SEMANTIC_PATH)):
        with open(os.path.join(SEMANTIC_PATH, fname), encoding='utf-8') as f:
            paper = json.load(f)

        parameters = {
            'paper_id': str(paper['paperId']),
            'publication_venue': str(paper['publicationVenue']),
            'title': str(paper['title']),
            'venue': str(paper['venue']),
            'year': str(paper['year']),
            'fields_of_study': str(paper['fieldsOfStudy']),
            'publication_date': str(paper['publicationDate']),
        }

        # Parameters are handed to the driver so values are never spliced into the Cypher text.
        DRIVER.execute_query(query, parameters_=parameters)

In [11]:
%%time
# Load one `Paper` node per JSON file in SEMANTIC_PATH.
create_papers()

100%|████████████████████████████████████████| 739/739 [00:03<00:00, 230.53it/s]

CPU times: user 599 ms, sys: 137 ms, total: 736 ms
Wall time: 3.21 s





In [12]:
def create_paper__paper_id__range_index():
    """
    Ensures a range index exists on the `paper_id` property of `Paper` nodes.
    The statement does nothing when the index is already there (IF NOT EXISTS).
    """
    execute("""
        CREATE RANGE INDEX paper__paper_id__range_index IF NOT EXISTS
        FOR (n:Paper)
        ON (n.paper_id)
    """)

In [13]:
%%time
# Index `Paper.paper_id`, the property the later MATCH queries look papers up by.
create_paper__paper_id__range_index()

CPU times: user 2.53 ms, sys: 1.05 ms, total: 3.58 ms
Wall time: 4.18 ms


## Authors

In [14]:
def create_authors() -> None:
    """
    For each paper, generate a node with label `Author` for every author of that paper.
    We are using the MERGE here since we don't want to duplicate authors.

    An author is identified by the pair (name, author_id). Both are stored as
    the string form of the JSON value, so a missing id becomes "None".
    Values are sent as Cypher query parameters, so special characters in a
    name cannot break the statement.
    """
    query = """
        MERGE (n:Author {
            name: $name,
            author_id: $author_id
        })
    """

    for fname in tqdm(os.listdir(SEMANTIC_PATH)):
        with open(os.path.join(SEMANTIC_PATH, fname), encoding='utf-8') as f:
            paper = json.load(f)

        for author in paper['authors']:
            parameters = {
                'name': str(author['name']),
                'author_id': str(author['authorId']),
            }
            DRIVER.execute_query(query, parameters_=parameters)

In [15]:
%%time
# Load the `Author` nodes listed in the papers' `authors` field.
create_authors()

100%|█████████████████████████████████████████| 739/739 [00:11<00:00, 66.94it/s]

CPU times: user 2.81 s, sys: 503 ms, total: 3.31 s
Wall time: 11 s





In [16]:
def create_author__author_id__range_index():
    """
    Ensures a range index exists on the `author_id` property of `Author` nodes.
    The statement does nothing when the index is already there (IF NOT EXISTS).
    """
    execute("""
        CREATE RANGE INDEX author__author_id__range_index IF NOT EXISTS
        FOR (n:Author)
        ON (n.author_id)
    """)

In [17]:
%%time
# Index `Author.author_id`, used by the MATCH queries that link authors to papers.
create_author__author_id__range_index()

CPU times: user 2.34 ms, sys: 0 ns, total: 2.34 ms
Wall time: 5.14 ms


In [18]:
def link_author_to_paper() -> None:
    """
    Create the edge `Wrote` and `IsCorrespondingAuthor`, linking Authors and Papers.
    The first author is considered the corresponding author.

    Authors are looked up by (name, author_id), the same pair `create_authors`
    merges on. Looking them up by `author_id` alone would also hit every other
    Author node sharing that id (e.g. all authors whose missing id was stored
    as "None") and attach them to the paper as well.

    Edges are made with CREATE, so running this twice duplicates them.
    """
    # Relationship types cannot be parameterised in Cypher, hence two queries.
    corresponding_query = """
        MATCH (a:Author {name: $name, author_id: $author_id})
        MATCH (p:Paper {paper_id: $paper_id})
        CREATE (a)-[e:IsCorrespondingAuthor]->(p)
    """
    wrote_query = """
        MATCH (a:Author {name: $name, author_id: $author_id})
        MATCH (p:Paper {paper_id: $paper_id})
        CREATE (a)-[e:Wrote]->(p)
    """

    for fname in tqdm(os.listdir(SEMANTIC_PATH)):
        # Close the file before talking to the database.
        with open(os.path.join(SEMANTIC_PATH, fname), encoding='utf-8') as f:
            paper = json.load(f)

        for position, author in enumerate(paper['authors']):
            parameters = {
                'name': str(author['name']),
                'author_id': str(author['authorId']),
                'paper_id': str(paper['paperId']),
            }

            if position == 0:
                # The first author is the main corresponding author.
                DRIVER.execute_query(corresponding_query, parameters_=parameters)

            DRIVER.execute_query(wrote_query, parameters_=parameters)

In [19]:
%%time
# Add the `Wrote` / `IsCorrespondingAuthor` edges; the Paper and Author nodes must already exist.
link_author_to_paper()

100%|█████████████████████████████████████████| 739/739 [00:26<00:00, 28.23it/s]

CPU times: user 3.12 s, sys: 545 ms, total: 3.67 s
Wall time: 26.2 s





## Citations

In [20]:
def link_citations_between_papers() -> None:
    """
    Generate the edge `Cites` linking a Paper to a Paper.

    Every entry of a paper's `citations` list is treated as a paper that cites
    it, so the edge goes (entry)-[:Cites]->(paper). The edge is only created
    when both papers exist as `Paper` nodes; other entries match nothing.

    Edges are made with CREATE, so running this twice duplicates them.
    """
    query = """
        MATCH (a:Paper {paper_id: $citing_id}), (p:Paper {paper_id: $cited_id})
        CREATE (a)-[e:Cites]->(p)
    """

    for fname in tqdm(os.listdir(SEMANTIC_PATH)):
        with open(os.path.join(SEMANTIC_PATH, fname), encoding='utf-8') as f:
            paper = json.load(f)

        # `or []` also covers a `citations` field that is present but null.
        for citation in paper.get('citations') or []:
            parameters = {
                'citing_id': str(citation['paperId']),
                'cited_id': str(paper['paperId']),
            }
            DRIVER.execute_query(query, parameters_=parameters)

In [21]:
%%time
# Add the `Cites` edges between papers that are both present in the graph.
link_citations_between_papers()

100%|█████████████████████████████████████████| 739/739 [01:12<00:00, 10.15it/s]

CPU times: user 10.9 s, sys: 1.72 s, total: 12.7 s
Wall time: 1min 12s





## Journals

In [1]:
def create_journals() -> None:
    """
    Create the nodes of label `Journal`, one per distinct (year, journal name) pair.
    We are using the MERGE here since we don't want to duplicate journals.

    Papers for which `parse_journal_name` finds no journal are skipped, so no
    placeholder journal named "None" is created. The year is stored as the
    string form of the JSON value.
    """
    query = """
        MERGE (n:Journal {
            year: $year,
            journal_name: $journal_name
        })
    """

    for fname in tqdm(os.listdir(SEMANTIC_PATH)):
        with open(os.path.join(SEMANTIC_PATH, fname), encoding='utf-8') as f:
            paper = json.load(f)

        journal_name = parse_journal_name(paper)
        if not journal_name:
            continue

        parameters = {
            'year': str(paper['year']),
            'journal_name': journal_name,
        }
        DRIVER.execute_query(query, parameters_=parameters)

In [None]:
%%time
# Create the `Journal` nodes.
create_journals()

In [None]:
def link_journals() -> None:
    """
    Create the edge `publishedin`, linking each Paper to its Journal.

    Journal nodes are keyed by (year, journal_name), so the lookup uses both:
    matching on the name alone would link a paper to that journal's node of
    every year. Papers without a journal are skipped.

    Edges are made with CREATE, so running this twice duplicates them.
    """
    query = """
        MATCH (p:Paper {paper_id: $paper_id})
        MATCH (j:Journal {year: $year, journal_name: $journal_name})
        CREATE (p)-[e:publishedin]->(j)
    """

    for fname in tqdm(os.listdir(SEMANTIC_PATH)):
        with open(os.path.join(SEMANTIC_PATH, fname), encoding='utf-8') as f:
            paper = json.load(f)

        journal_name = parse_journal_name(paper)
        if not journal_name:
            continue

        parameters = {
            'paper_id': str(paper['paperId']),
            'year': str(paper['year']),
            'journal_name': journal_name,
        }
        DRIVER.execute_query(query, parameters_=parameters)

In [None]:
%%time
# Link each paper to its journal with a `publishedin` edge; the Journal nodes must already exist.
link_journals()

## Reviews

We will have to generate synthetic data here to represent the reviews.

Typically, each paper has 3 reviewers, who are usually relevant authors.
An author cannot review their own paper.

The strategy that we will be using is to select between 1 and 4 authors (most often 3) who:
1. wrote papers cited by the paper in question; and
2. didn't write the paper itself.

In [22]:
def get_possible_reviewers():
    """
    Auxiliary function that returns an aggregation of all possible reviewers of a paper.
    The logic of a "possible reviewer" is to select an author who:
    1. wrote paper(s) cited by the paper in question; and who
    2. didn't write the paper itself.

    The result has one record per paper with at least one candidate, holding
    `paper_id` and `possible_reviewer_ids`. Each candidate id appears once,
    however many cited papers that author wrote.
    """
    # `WITH DISTINCT mp, cp` drops the citing paper's authors again; they are only
    # matched to require that the paper has one, and would otherwise repeat every row.
    query = """
        MATCH (a:Author)-[:Wrote]->(mp:Paper)-[:Cites]->(cp:Paper)
        WITH DISTINCT mp, cp
        MATCH (wcp:Author)-[:Wrote]->(cp)
        WHERE NOT (wcp)-[:Wrote]->(mp)
        RETURN mp.paper_id AS paper_id, collect(DISTINCT wcp.author_id) AS possible_reviewer_ids
    """

    return execute(query)

In [23]:
def link_reviewer_to_paper() -> None:
    """
    This function generates synthetic data: it picks random reviewers for each
    paper among its possible reviewers and creates a `Reviewed` edge for each.

    The picks come from `random` and `np.random`, so they are reproducible only
    when both generators were seeded beforehand.
    """
    query = """
        MATCH (a:Author {author_id: $author_id}), (p:Paper {paper_id: $paper_id})
        CREATE (a)-[e:Reviewed]->(p)
    """

    result = get_possible_reviewers()

    # Index 0 of the result holds the records; each unpacks to (paper_id, candidate ids).
    for paper_id, possible_reviewers in tqdm(result[0]):
        # Drop repeated ids (keeping order) so the same author is never sampled twice.
        candidates = list(dict.fromkeys(possible_reviewers))

        # Papers can have a different amount of reviewers, varying from 1 to 4, following the distribution specified by `p`.
        # A paper never gets more reviewers than it has candidates. Papers with no
        # candidate at all are not part of `result`, so they get no reviewers.
        reviewer_qty = min(
            int(np.random.choice(np.arange(1, 5), p=[0.1, 0.3, 0.5, 0.1])),
            len(candidates)
        )

        for reviewer in random.sample(candidates, reviewer_qty):
            parameters = {'author_id': reviewer, 'paper_id': paper_id}
            DRIVER.execute_query(query, parameters_=parameters)

# Generate the synthetic `Reviewed` edges; needs the `Wrote` and `Cites` edges to be in place.
link_reviewer_to_paper()

100%|█████████████████████████████████████████| 736/736 [00:09<00:00, 80.58it/s]


## Keywords

# Querying

<b>Query 1</b>
<p>Find the top 3 most cited papers of each conference.</p>
<p>MATCH (p:Paper)-[:cited]->(cited:Paper)
WITH p.journal_name AS journal, p.title AS title, COUNT(*) AS num_citations
ORDER BY journal, num_citations DESC
WITH journal, COLLECT({title: title, num_citations: num_citations}) AS papers
WITH journal, papers, [i IN RANGE(1, SIZE(papers)) | i] AS ranks
UNWIND ranks AS rank
WITH journal, papers[rank - 1].title AS title, papers[rank - 1].num_citations AS num_citations, rank
WHERE rank <= 3
RETURN journal, title, num_citations, rank
ORDER BY journal, rank
<p><b>Manual test</b></p>
<p>MATCH (p:Paper)-[c:cited]->(cited:Paper)
WITH p.journal_name AS journal, p.title AS title, COUNT(c) AS num_citations
WHERE journal = '2014 IEEE International Conference on Big Data (Big Data)'
RETURN journal, title, num_citations
ORDER BY num_citations DESC


<b>Query 4</b>
<p>Find the h-indexes of the authors in your graph.</p>

<p>MATCH (a:Author)-[:Wrote]->(p:Paper)-[:cited]->(cited:Paper)
WITH a, p, COUNT(*) AS num_citations
ORDER BY num_citations DESC
WITH a, COLLECT(num_citations) AS citation_counts
WITH a, [i IN RANGE(1, SIZE(citation_counts)) | CASE WHEN citation_counts[i - 1] >= i THEN i ELSE 0 END] AS h_values
WITH a, MAX(h_values) AS h_index
WITH a, MAX(REDUCE(s = 0, h IN h_index | CASE WHEN h > s THEN h ELSE s END)) AS max_h_index
RETURN a.author_id AS author_id, a.name AS author_name, max_h_index
</p>

<b>Manual test</b>
<p>MATCH (a:Author)-[:Wrote]->(p:Paper)-[:cited]->(cited:Paper)
WITH a, p, COUNT(*) AS num_citations
ORDER BY num_citations DESC
WHERE a.name = 'M. Mokbel'
return a.name, p.title, num_citations</p>

# TODO

- [ ] Add the abstract of the paper;