# TODO

- [ ] Fix the arrays in the data ingestion
- [ ] Fix the datatypes in general
- [X] Fix the connection pooling issue (ClientError: failed to obtain a connection from the pool within 60.0s (timeout))

# Libraries and constants

In [1]:
# !pip install tqdm
# !pip install spacy
# !pip install git+https://github.com/LIAAD/yake

In [2]:
import os
import json
import random

import neo4j
import yake
import numpy as np
from tqdm import tqdm

In [3]:
# Directory with the raw input: every entry in it is opened and parsed as the JSON of one paper.
SEMANTIC_PATH = '../semanticscholar_raw_data'

# Placeholder returned by parse_journal_name() when a paper has no usable journal.
DEFAULT_JOURNAL_NAME = 'Unknown'

# Seed applied to both `random` and `numpy.random` in the next cell.
SEED = 13

In [4]:
# Both generators are seeded because link_reviewer_to_paper() draws from
# np.random.choice (number of reviewers) and random.sample (which reviewers).
random.seed(SEED)
np.random.seed(SEED)

# Neo4j

In [5]:
# Single driver shared by every query in this notebook (see execute()).
# NOTE(review): no `auth` argument is passed — presumably the local server has
# authentication disabled; confirm before pointing this at another instance.
DRIVER = neo4j.GraphDatabase.driver(uri="neo4j://localhost")

In [6]:
def execute(query: str, parameters: "dict | None" = None):
    """
    Executes a Cypher @query and returns its result.

    @parameters is an optional mapping from Cypher parameter names to values,
    referenced as `$name` inside @query. Prefer it over formatting values into
    the query text: the driver sends the values separately from the statement,
    so quotes or backslashes in the data cannot break or alter the query.

    Returns whatever `DRIVER.execute_query` returns.
    """
    return DRIVER.execute_query(query, parameters_=parameters)

In [7]:
def delete_graph() -> None:
    """
    Wipes the graph: every node is matched and detach-deleted, which also
    removes the relationships attached to it.
    """
    execute("""
        MATCH (n)
        DETACH DELETE n;
    """)

In [8]:
# Destructive: empties the whole database. The paper loader below uses CREATE,
# so loading into a non-empty graph would duplicate the Paper nodes.
delete_graph()

## Papers

Our dataset does not include keywords for the papers.
Therefore, we extract them from each abstract using an external library called [Yake](https://liaad.github.io/yake/).

In [9]:
def parse_journal_name(paper) -> str:
    """
    Returns the journal name of @paper, or DEFAULT_JOURNAL_NAME when there is none.

    Not every file has a field 'journal' in the json, and when it does the
    field can be empty or lack a usable 'name'. All of those cases fall back
    to DEFAULT_JOURNAL_NAME, so callers never receive None or ''.
    """
    journal = paper.get('journal')
    if not journal:
        return DEFAULT_JOURNAL_NAME

    # `or` also covers a 'name' that is present but None/empty, which
    # `.get('name', default)` alone would hand back unchanged.
    return journal.get('name') or DEFAULT_JOURNAL_NAME

In [10]:
def sanitize_abstract(abstract: str) -> str:
    """
    Prepares @abstract for being embedded in a double-quoted Cypher string.

    Double quotes become single quotes and every backslash is doubled.
    A falsy @abstract (None or '') is returned untouched.
    """
    if not abstract:
        return abstract

    without_double_quotes = abstract.replace('"', "'")
    return without_double_quotes.replace('\\', '\\\\')

In [11]:
def create_papers():
    """
    Create the nodes of label `Paper`, one per file found in SEMANTIC_PATH.

    The values are sent to Neo4j as query parameters instead of being pasted
    into the Cypher text, so quotes or backslashes in a title or abstract
    cannot break the statement.

    Stored properties:
    - `keywords` and `fieldsOfStudy` are stored as lists of strings;
    - `abstract` is stored unmodified, and left unset when the paper has none;
    - every other property is stored as the string form of the JSON value
      (so a missing value is the text 'None').

    A paper whose insertion fails is reported and skipped, so the rest of the
    load can continue.
    """
    # This is used to extract the keywords from the abstract.
    kw_extractor = yake.KeywordExtractor(
        lan='en',
        n=3,  # Max n-gram size
        top=5  # Number of keywords
    )

    query = """
        CREATE (n:Paper {
            paper_id: $paper_id,
            publication_venue: $publication_venue,
            title: $title,
            venue: $venue,
            year: $year,
            fieldsOfStudy: $fields_of_study,
            publicationDate: $publication_date,
            abstract: $abstract,
            keywords: $keywords
        })
    """

    for fname in tqdm(os.listdir(SEMANTIC_PATH)):
        # Explicit encoding: the platform default is not always UTF-8.
        with open(os.path.join(SEMANTIC_PATH, fname), encoding='utf-8') as f:
            paper = json.load(f)

        abstract = paper.get('abstract')
        # Each extracted entry carries the keyword text in its first position.
        keywords = (
            [entry[0].lower() for entry in kw_extractor.extract_keywords(abstract) if entry]
            if abstract else []
        )

        parameters = {
            'paper_id': str(paper['paperId']),
            # str() because the venue may be a nested JSON object, which cannot
            # be stored as a node property.
            'publication_venue': str(paper.get('publicationVenue')),
            'title': str(paper.get('title')),
            'venue': str(paper.get('venue')),
            # Kept as text so it stays comparable with Journal.year, which the
            # journal cells also store as text.
            'year': str(paper.get('year')),
            # assumes fieldsOfStudy is a flat list of strings or null — TODO confirm
            'fields_of_study': paper.get('fieldsOfStudy') or [],
            'publication_date': str(paper.get('publicationDate')),
            'abstract': abstract,
            'keywords': keywords,
        }

        try:
            # Called on the driver directly so the values travel as parameters.
            DRIVER.execute_query(query, parameters_=parameters)
        except Exception as error:
            # Best-effort load: report the failing paper and keep going.
            print(f"Could not create paper {parameters['paper_id']} ({fname}): {error}")

In [12]:
%%time
create_papers()

100%|█████████████████████████████████████████| 244/244 [00:02<00:00, 89.07it/s]

CPU times: user 2.08 s, sys: 53.7 ms, total: 2.13 s
Wall time: 2.75 s





In [13]:
def create_paper__paper_id__range_index():
    """
    Creates the range index `paper__paper_id__range_index` on `Paper.paper_id`.
    The statement uses IF NOT EXISTS, so running it again is harmless.
    """
    create_index_statement = """
        CREATE RANGE INDEX paper__paper_id__range_index IF NOT EXISTS
        FOR (n:Paper)
        ON (n.paper_id)
    """
    execute(create_index_statement)

In [14]:
%%time
create_paper__paper_id__range_index()

CPU times: user 1.59 ms, sys: 0 ns, total: 1.59 ms
Wall time: 2.32 ms


## Authors

In [15]:
def create_authors() -> None:
    """
    For each paper, generate a node with label `Author` for every author of that paper.
    We are using the MERGE here since we don't want to duplicate authors.

    Name and id are sent as query parameters, so a quote or backslash in an
    author name cannot break the statement.
    """
    query = """
        MERGE (n:Author {
            name: $name,
            author_id: $author_id
        })
    """

    for fname in tqdm(os.listdir(SEMANTIC_PATH)):
        # Explicit encoding: the platform default is not always UTF-8.
        with open(os.path.join(SEMANTIC_PATH, fname), encoding='utf-8') as f:
            paper = json.load(f)

        for author in paper['authors']:
            parameters = {
                # str() keeps both values as text: MERGE rejects null property
                # values, and the linking cells look authors up by a text id.
                'name': str(author['name']),
                'author_id': str(author['authorId']),
            }
            # Called on the driver directly so the values travel as parameters.
            DRIVER.execute_query(query, parameters_=parameters)

In [16]:
%%time
create_authors()

100%|█████████████████████████████████████████| 244/244 [00:03<00:00, 64.84it/s]

CPU times: user 1.01 s, sys: 172 ms, total: 1.19 s
Wall time: 3.77 s





In [17]:
def create_author__author_id__range_index():
    """
    Creates the range index `author__author_id__range_index` on `Author.author_id`.
    The statement uses IF NOT EXISTS, so running it again is harmless.
    """
    create_index_statement = """
        CREATE RANGE INDEX author__author_id__range_index IF NOT EXISTS
        FOR (n:Author)
        ON (n.author_id)
    """
    execute(create_index_statement)

In [18]:
%%time
create_author__author_id__range_index()

CPU times: user 2.43 ms, sys: 0 ns, total: 2.43 ms
Wall time: 4.11 ms


In [19]:
def link_author_to_paper() -> None:
    """
    Create the edge `Wrote` and `IsCorrespondingAuthor`, linking Authors and Papers.
    The first author is considered the corresponding author, and gets both edges.

    Ids are sent as query parameters, and each file is closed before its
    queries run.
    """
    corresponding_query = """
        MATCH (a:Author {author_id: $author_id}), (p:Paper {paper_id: $paper_id})
        CREATE (a)-[e:IsCorrespondingAuthor]->(p);
    """
    wrote_query = """
        MATCH (a:Author {author_id: $author_id}), (p:Paper {paper_id: $paper_id})
        CREATE (a)-[e:Wrote]->(p);
    """

    for fname in tqdm(os.listdir(SEMANTIC_PATH)):
        # Explicit encoding: the platform default is not always UTF-8.
        with open(os.path.join(SEMANTIC_PATH, fname), encoding='utf-8') as f:
            paper = json.load(f)

        for position, author in enumerate(paper['authors']):
            # NOTE(review): a missing authorId becomes the text 'None', which
            # matches every Author stored with that same placeholder id.
            parameters = {
                'author_id': str(author['authorId']),
                'paper_id': str(paper['paperId']),
            }

            if position == 0:
                # The first author is the main corresponding author.
                DRIVER.execute_query(corresponding_query, parameters_=parameters)

            # Called on the driver directly so the values travel as parameters.
            DRIVER.execute_query(wrote_query, parameters_=parameters)

In [20]:
%%time
link_author_to_paper()

100%|█████████████████████████████████████████| 244/244 [00:08<00:00, 29.89it/s]

CPU times: user 924 ms, sys: 173 ms, total: 1.1 s
Wall time: 8.16 s





## Citations

In [21]:
def link_citations_between_papers() -> None:
    """
    Generate the edge `Cites` linking a Paper to a Paper.

    For every entry in a paper's 'citations' list, the edge goes from the
    listed paper to the paper of the file: (citation)-[:Cites]->(paper).
    Nothing is created when either paper has no node in the graph.
    """
    query = """
        MATCH (a:Paper {paper_id: $citing_id}), (p:Paper {paper_id: $cited_id})
        CREATE (a)-[e:Cites]->(p);
    """

    for fname in tqdm(os.listdir(SEMANTIC_PATH)):
        # Explicit encoding: the platform default is not always UTF-8.
        with open(os.path.join(SEMANTIC_PATH, fname), encoding='utf-8') as f:
            paper = json.load(f)

        # `or []` also covers a 'citations' key that is present but null.
        for citation in paper.get('citations') or []:
            citing_id = citation.get('paperId')
            if citing_id is None:
                # No Paper node can match a missing id; skip the round trip.
                continue

            parameters = {
                'citing_id': str(citing_id),
                'cited_id': str(paper['paperId']),
            }
            # Called on the driver directly so the values travel as parameters.
            DRIVER.execute_query(query, parameters_=parameters)

In [22]:
%%time
link_citations_between_papers()

100%|█████████████████████████████████████████| 244/244 [00:08<00:00, 27.71it/s]

CPU times: user 1.48 s, sys: 185 ms, total: 1.66 s
Wall time: 8.81 s





## Journals

In [23]:
def create_journals() -> None:
    """
    Create the Journal nodes, one per distinct (journal name, year) pair.

    Papers whose journal resolves to DEFAULT_JOURNAL_NAME are skipped, so no
    placeholder journal is created. Name and year are sent as query
    parameters, so quotes in a journal name cannot break the statement.
    """
    query = """
        MERGE (n:Journal {
            year: $year,
            journal_name: $journal_name
        })
    """

    for fname in tqdm(os.listdir(SEMANTIC_PATH)):
        # Explicit encoding: the platform default is not always UTF-8.
        with open(os.path.join(SEMANTIC_PATH, fname), encoding='utf-8') as f:
            paper = json.load(f)

        journal_name = parse_journal_name(paper)
        if journal_name == DEFAULT_JOURNAL_NAME:
            continue

        parameters = {
            # str() keeps both values as text: MERGE rejects null property
            # values, and link_journals() looks journals up by text values.
            'year': str(paper['year']),
            'journal_name': str(journal_name),
        }
        # Called on the driver directly so the values travel as parameters.
        DRIVER.execute_query(query, parameters_=parameters)

In [24]:
%%time
create_journals()

100%|████████████████████████████████████████| 244/244 [00:00<00:00, 280.00it/s]

CPU times: user 269 ms, sys: 36.6 ms, total: 305 ms
Wall time: 875 ms





In [25]:
def link_journals() -> None:
    """
    Create the edge `PublishedIn` from each Paper to the Journal node that has
    the paper's journal name and year.

    Name, year and id are sent as query parameters, so an apostrophe in a
    journal name cannot break the statement. Papers whose journal resolves to
    DEFAULT_JOURNAL_NAME are skipped: create_journals() makes no node for them,
    so there is nothing to link to.
    """
    query = """
        MATCH (p:Paper {paper_id: $paper_id})
            , (j:Journal {journal_name: $journal_name, year: $year})
        WITH p, j
        CREATE (p)-[e:PublishedIn]->(j);
    """

    for fname in tqdm(os.listdir(SEMANTIC_PATH)):
        # Explicit encoding: the platform default is not always UTF-8.
        with open(os.path.join(SEMANTIC_PATH, fname), encoding='utf-8') as f:
            paper = json.load(f)

        journal_name = parse_journal_name(paper)
        if journal_name == DEFAULT_JOURNAL_NAME:
            continue

        parameters = {
            'paper_id': str(paper['paperId']),
            # Text values, matching how the Journal nodes store them.
            'journal_name': str(journal_name),
            'year': str(paper['year']),
        }
        # Called on the driver directly so the values travel as parameters.
        DRIVER.execute_query(query, parameters_=parameters)

In [27]:
%%time
link_journals()

100%|████████████████████████████████████████| 244/244 [00:01<00:00, 173.45it/s]

CPU times: user 180 ms, sys: 41.7 ms, total: 222 ms
Wall time: 1.41 s





## Reviews

We will have to generate synthetic data here to represent the reviews.

Typically, each paper has 3 reviewers, who are usually relevant authors.
An author cannot review their own paper.

The strategy that we will be using is to select between 1 and 4 authors (fewer if there are not enough candidates) who:
1. wrote papers cited by the paper in question; and
2. didn't write the paper itself.

In [29]:
def get_possible_reviewers():
    """
    Auxiliary function that returns an aggregation of all possible reviewers of a paper.
    The logic of a "possible reviewer" is to select an author who:
    1. wrote paper(s) cited by the paper in question; and who
    2. didn't write the paper itself.

    The result has one row per paper with `paper_id` and `possible_reviewer_ids`.
    Each candidate id appears once per paper; papers without any candidate
    produce no row.
    """
    # The first MATCH yields one row per author of `mp`, so the (mp, cp) pairs
    # are de-duplicated before looking up the authors of the cited paper.
    # collect(DISTINCT ...) then covers authors who wrote several cited papers.
    query = """
        MATCH (a:Author)-[w1:Wrote]->(mp:Paper)-[c:Cites]->(cp:Paper)
        WITH DISTINCT mp, cp
        MATCH (wcp:Author)-[w2:Wrote]->(cp)
        WHERE NOT (wcp)-[:Wrote]->(mp)
        RETURN mp.paper_id AS paper_id, collect(DISTINCT wcp.author_id) AS possible_reviewer_ids;
    """

    return execute(query)

In [30]:
def link_reviewer_to_paper() -> None:
    """
    This function generates synthetic data.

    For every paper returned by get_possible_reviewers(), a random subset of
    its candidate reviewers is chosen and linked with a `Reviewed` edge.
    A reviewer is never linked twice to the same paper by one run.
    """
    query = """
        MATCH (a:Author {author_id: $author_id}), (p:Paper {paper_id: $paper_id})
        CREATE (a)-[e:Reviewed]->(p);
    """

    result = get_possible_reviewers()

    for paper_id, possible_reviewers in tqdm(result[0]):
        # Drop repeated ids (keeping order), so sampling cannot pick the same
        # reviewer twice.
        candidates = list(dict.fromkeys(possible_reviewers))

        # Papers can have a different amount of reviewers, varying from 1 to 4, following the distribution specified by `p`.
        # int() turns the numpy integer into a plain int for random.sample.
        wanted = int(np.random.choice(np.arange(1, 5), p=[0.1, 0.3, 0.5, 0.1]))
        # Edge case: with fewer candidates than wanted, all of them are used.
        # Papers with no candidate at all are not in `result`.
        reviewer_qty = min(wanted, len(candidates))

        for reviewer in random.sample(candidates, reviewer_qty):
            parameters = {'author_id': reviewer, 'paper_id': paper_id}
            # Called on the driver directly so the values travel as parameters.
            DRIVER.execute_query(query, parameters_=parameters)

In [31]:
%%time
link_reviewer_to_paper()

100%|█████████████████████████████████████████| 243/243 [00:03<00:00, 74.15it/s]

CPU times: user 533 ms, sys: 86.3 ms, total: 619 ms
Wall time: 3.31 s





## Keywords

Our dataset does not include keywords for the papers.
Therefore, we extract them from each abstract. This is currently done in `create_papers` (see the Papers section) using Yake; [spacy](https://spacy.io/) is listed in the setup cell but is not used by the code above.

# Queries

<b>Query 1</b>

Find the top 3 most cited papers of each conference.

MATCH (p:Paper)-[:cited]->(cited:Paper) WITH p.journal_name AS journal, p.title AS title, COUNT(*) AS num_citations ORDER BY journal, num_citations DESC WITH journal, COLLECT({title: title, num_citations: num_citations}) AS papers WITH journal, papers, [i IN RANGE(1, SIZE(papers)) | i] AS ranks UNWIND ranks AS rank WITH journal, papers[rank - 1].title AS title, papers[rank - 1].num_citations AS num_citations, rank WHERE rank <= 3 RETURN journal, title, num_citations, rank ORDER BY journal, rank

<b>Manually Test</b>

MATCH (p:Paper)-[c:cited]->(cited:Paper) WITH p.journal_name AS journal, p.title AS title, COUNT(c) AS num_citations WHERE journal = '2014 IEEE International Conference on Big Data (Big Data)' RETURN journal, title, num_citations ORDER BY num_citations DESC

<b>Query 4</b>

Find the h-indexes of the authors in your graph

MATCH (a:Author)-[:Wrote]->(p:Paper)-[:cited]->(cited:Paper) WITH a, p, COUNT(*) AS num_citations ORDER BY num_citations DESC WITH a, COLLECT(num_citations) AS citation_counts WITH a, [i IN RANGE(1, SIZE(citation_counts)) | CASE WHEN citation_counts[i - 1] >= i THEN i ELSE 0 END] AS h_values WITH a, MAX(h_values) AS h_index WITH a, MAX(REDUCE(s = 0, h IN h_index | CASE WHEN h > s THEN h ELSE s END)) AS max_h_index RETURN a.author_id AS author_id, a.name AS author_name, max_h_index

<b>Manually test</b>

MATCH (a:Author)-[:Wrote]->(p:Paper)-[:cited]->(cited:Paper) WITH a, p, COUNT(*) AS num_citations ORDER BY num_citations DESC WHERE a.name = 'M. Mokbel' return a.name, p.title, num_citations


# TODO

- [ ] Add the abstract of the paper;

# Known limitations

1. Our data source doesn't always provide the Journal name.
When we don't have it, we are not creating a node for the journal where that paper was published.
2. Papers without abstracts won't have their keywords extracted.