# Laboratory 1: Property Graphs
### Luis Alfredo Leon Villapún
### Liliia Aliakberova

# Part A.2 Instantiating / Loading
* * *
In this section we are asked to load the data into our desired graph. To do this, we are going to use mainly the Semantic Scholar dataset. To test, it is recommended to use the sample data comprising 100 papers.  
Please note that:  
- these datasets are in JSON format, so we will have to use the <i>apoc</i> library in our Neo4j installation.
- these datasets contain modified information to suit this task's requirements, so even though part of the data is real, some relationships will be fake.

## Creating the connector
Let's first create the connector to handle the messages with Neo4j.

In [1]:
# Install if needed
# !pip install neo4j

In [2]:
"""
Simple connector script. Creates a connection to a Neo4j server.
References:
   - https://neo4j.com/developer/python/
   - https://towardsdatascience.com/create-a-graph-database-in-neo4j-using-python-4172d40f89c4
"""


from neo4j import GraphDatabase
import logging


class Neo4jConnector:
    """Small wrapper around a Neo4j driver used to run Cypher statements.

    The instance owns a driver (``self.driver``) created from the given
    credentials. Failures inside :meth:`query` are written to
    ``connector.log`` through the root logger.
    """

    def __init__(self, uri, user, password):
        """Create the driver for ``uri`` authenticating as ``user``/``password``."""
        self.driver = GraphDatabase.driver(uri, auth=(user, password))
        # INFO level is required for the success message of drop() to be
        # recorded; with the default root level (WARNING) it is discarded.
        # basicConfig does nothing if the root logger already has handlers.
        logging.basicConfig(filename="connector.log", level=logging.INFO)

    def close(self):
        """Close the underlying driver."""
        self.driver.close()

    def query(self, query, parameters=None):
        """Run a Cypher statement and return its records as a list.

        Args:
            query: Cypher text to execute.
            parameters: Optional mapping of query parameters passed to the
                driver together with ``query``.

        Returns:
            The list of records, or ``None`` when the statement failed (the
            error is logged instead of raised, as callers of this method
            expect a best-effort call).
        """
        try:
            # The context manager closes the session on every exit path.
            with self.driver.session() as session:
                return list(session.run(query, parameters))
        except Exception as exc:
            # Only ordinary errors are absorbed here; KeyboardInterrupt and
            # other BaseException subclasses propagate to the caller.
            logging.error("Query Failed: %s", exc)
            return None

    def drop(self):
        """Delete every node and relationship in the database."""
        # DETACH DELETE removes each node together with its relationships in
        # a single statement.
        if self.query(query="MATCH (n) DETACH DELETE n") is None:
            logging.error("Could not delete database.")
            return
        logging.info("Deleted database. Success.")

In [3]:
from getpass import getpass

# Connection settings for the Neo4j server (local instance, default user name).
uri = "neo4j://localhost:7687"
user = "neo4j"
# The password is prompted for interactively so it is never stored in the notebook.
password = getpass("Input your password to connect")
# Connector shared by all the loading cells below.
conn = Neo4jConnector(uri, user, password)

Input your password to connect········


In [4]:
# Drops the whole database (you will have to rerun the loading cells).
# Comment this call out to keep the data that is already loaded.
conn.drop()

## Authors Json File
First, we will load the authors json file to extract data about the authors in this file.

In [5]:
def load_authors(conn):
    """Create or update one ``Author`` node per record of the authors sample.

    Authors are merged on their name; the id, citation count, h-index and
    paper count from the file are then written onto the node.

    Args:
        conn: Connector exposing a Neo4j driver as ``conn.driver``.

    Raises:
        Whatever the driver raises when the statement fails; the session is
        closed in that case as well.
    """
    query = """
        CALL apoc.load.json('file:///samples/authors/authors-sample.jsonl')
        YIELD value
        MERGE (a:Author {name: value.name})
        SET a.author_id = value.authorid,
            a.citationcount = value.citationcount,
            a.hindex = value.hindex,
            a.papercount = value.papercount
    """
    # The context manager guarantees the session is released even when the
    # statement raises.
    with conn.driver.session() as session:
        # Exhaust the result so the statement has fully run before reporting.
        list(session.run(query))
    print("Success")

In [6]:
# Import the authors sample file through the connector created earlier.
load_authors(conn)

Success


## Publication Venues Json File
This file contains extra data on conferences and journals. We will use it to create Event nodes for the conferences and Document nodes for the journals.

In [7]:
def load_publication_venues(conn):
    """Create ``Event`` nodes for conferences and ``Document`` nodes for journals.

    The venues file is read once per venue type. A single statement that
    filters on ``type = 'conference'`` and afterwards on ``type = 'journal'``
    can never reach the journal part, because no row satisfies both filters.

    Args:
        conn: Connector exposing a Neo4j driver as ``conn.driver``.

    Raises:
        Whatever the driver raises when a statement fails; the session is
        closed in that case as well.
    """
    conference_query = """
    CALL apoc.load.json('file:///samples/publication-venues/publication-venues-sample.jsonl')
    YIELD value
    WITH value
    WHERE value.type = 'conference'
    MERGE (e: Event {event_name: value.name})
    SET e.event_id = value.id,
        e.url = value.url,
        e.type= 'conference',
        e.edition = apoc.coll.randomItem(['I', 'II', 'III', 'IV', 'V'])
    """
    # document_id is assigned only on creation so that rerunning the cell
    # does not give an existing journal a new identifier.
    journal_query = """
    CALL apoc.load.json('file:///samples/publication-venues/publication-venues-sample.jsonl')
    YIELD value
    WITH value
    WHERE value.type = 'journal'
    MERGE (j: Document {name: value.name})
    ON CREATE SET j.document_id = apoc.create.uuid()
    SET j.type = 'journal'
    """
    # The context manager releases the session even when a statement raises.
    with conn.driver.session() as session:
        for query in (conference_query, journal_query):
            # Exhaust each result before sending the next statement.
            list(session.run(query))
    print("Success")

In [8]:
# Import the publication venues sample file through the connector created earlier.
load_publication_venues(conn)

Success


## Papers Json File
This file contains data about several nodes and edges of interest. Here we create Article nodes for the papers, extract their authors and create the 'WRITEN_BY' edges, and then do the same for the journals related to the papers ('PUBLISHED_AT' edges). We also add some random data to create the 'REVIEWED_BY', 'PRESENTED_AT', 'CITED_BY' and 'RELATES_TO' edges.

In [9]:
import random, json

articles_path = "/Users/alfredo.leon/Desktop/SDMLab1/neo4jcontainerdata/samples/papers/"

# Pools from which the synthetic relationships are drawn.
_CITABLE_ARTICLE_IDS = [196432386, 211536971, 188341797, 220921189, 227616401]
_REVIEWER_IDS = ["143973205", "152609652", "149287618", "118712934", "134641171",
                 "121778794", "145466716", "116765497", "103479334", "146897159",
                 "122972940", "146578621"]
_TOPICS = ["data management", "indexing", "data modeling", "big data", "data processing",
           "data storage", "data querying", "biology", "gene therapy", "proteins", "crispr"]
_CONFERENCE_IDS = ["30a20a37-c3ce-47f2-8a81-e59c4f39502c",
                   "b41a8ed2-0658-4761-b625-32bf9c5fbf69",
                   "32700c08-d377-4a78-9532-370593d65166"]
_EDITIONS = ['I', 'II', 'III', 'IV', 'V']

# Every statement re-reads the generated file and is independent of the
# others: an empty list or a failed MATCH in one step can then no longer
# remove an article from the following steps, and rows are not multiplied by
# a previous UNWIND.
_LOAD_PAPERS = """
        CALL apoc.load.json('file:///samples/papers/papers-sample-modified.jsonl')
        YIELD value
"""

_ARTICLE_QUERIES = (
    # 1. Article nodes. Run first so that later steps can MATCH any article.
    _LOAD_PAPERS + """
        MERGE (article: Article {title: value.title})
        SET article.article_id = value.corpusid,
            article.content = value.url
    """,
    # 2. Authors and WRITEN_BY edges. coalesce keeps an existing author_id
    #    when the paper carries no id for that author.
    _LOAD_PAPERS + """
        MATCH (article: Article {title: value.title})
        UNWIND value.authors AS author
        WITH article, author
        WHERE author.name IS NOT NULL
        MERGE (auth: Author {name: author.name})
        SET auth.author_id = coalesce(author.authorId, auth.author_id)
        MERGE (auth)<-[:WRITEN_BY]-(article)
    """,
    # 3. Journals and PUBLISHED_AT edges. document_id is only set on creation
    #    so a journal keeps one identifier across articles and reruns.
    _LOAD_PAPERS + """
        MATCH (article: Article {title: value.title})
        UNWIND value.journal AS journal
        WITH article, journal
        WHERE journal.name IS NOT NULL
        MERGE (j: Document {name: journal.name})
        ON CREATE SET j.document_id = apoc.create.uuid()
        SET j.type = 'journal',
            j.volume = journal.volume
        MERGE (article)-[:PUBLISHED_AT]->(j)
    """,
    # 4. CITED_BY edges between already created articles.
    _LOAD_PAPERS + """
        MATCH (article: Article {title: value.title})
        UNWIND value.citations AS citation
        MATCH (cited: Article {article_id: citation.article_id})
        MERGE (cited)-[:CITED_BY]->(article)
    """,
    # 5. REVIEWED_BY edges towards existing authors.
    _LOAD_PAPERS + """
        MATCH (article: Article {title: value.title})
        UNWIND value.reviewers AS review
        MATCH (reviewer: Author {author_id: review.reviewer_id})
        MERGE (article)-[:REVIEWED_BY]->(reviewer)
    """,
    # 6. PRESENTED_AT edges and the proceedings document of each edition.
    #    The uuid is kept out of the MERGE pattern: a fresh uuid never matches
    #    and would create a new Document for every processed row.
    _LOAD_PAPERS + """
        MATCH (article: Article {title: value.title})
        UNWIND value.presented_at AS presented_at
        MATCH (event: Event {event_id: presented_at.conference_id})
        MERGE (article)-[:PRESENTED_AT {edition: presented_at.edition}]->(event)
        MERGE (d: Document {type: 'proceeding', name: event.event_name})-[:EDITED_BY {edition: presented_at.edition}]->(event)
        ON CREATE SET d.document_id = apoc.create.uuid()
    """,
    # 7. Topic nodes and RELATES_TO edges.
    _LOAD_PAPERS + """
        MATCH (article: Article {title: value.title})
        UNWIND value.topics AS topic
        MERGE (t: Topic {keyword: topic.topic})
        MERGE (article)-[:RELATES_TO]->(t)
    """,
)


def _augment_paper(paper):
    """Add random citations, reviewers, topics and a conference to ``paper`` in place."""
    # Papers that belong to the citable pool never cite, as before.
    num_citations = random.randint(0, 5)
    citations = []
    if num_citations > 0 and paper['corpusid'] not in _CITABLE_ARTICLE_IDS:
        citations = [{"article_id": cited_id}
                     for cited_id in random.sample(_CITABLE_ARTICLE_IDS, num_citations)]
    paper['citations'] = citations

    # Authors are removed from the reviewer pool up front, so one unlucky draw
    # no longer discards every reviewer of the paper.
    author_ids = {author.get('authorId') for author in paper.get('authors') or []}
    candidates = [reviewer_id for reviewer_id in _REVIEWER_IDS if reviewer_id not in author_ids]
    num_reviews = min(random.randint(0, 3), len(candidates))
    paper['reviewers'] = [{"reviewer_id": reviewer_id}
                          for reviewer_id in random.sample(candidates, num_reviews)]

    # One entry per topic. A dict comprehension with the constant key "topic"
    # would keep only the last sampled topic.
    num_topics = random.randint(1, len(_TOPICS))
    paper['topics'] = [{"topic": topic} for topic in random.sample(_TOPICS, num_topics)]

    paper['presented_at'] = {
        'conference_id': random.choice(_CONFERENCE_IDS),
        'edition': random.choice(_EDITIONS),
    }


def _regenerate_papers(path):
    """Write ``papers-sample-modified.jsonl`` from ``papers-sample.jsonl`` found in ``path``."""
    import os  # local import: only needed when the file is regenerated

    # os.path.join accepts ``path`` with or without a trailing separator.
    source = os.path.join(path, "papers-sample.jsonl")
    target = os.path.join(path, "papers-sample-modified.jsonl")
    with open(source, encoding="utf-8") as papers, \
            open(target, "w", encoding="utf-8") as papers_modified:
        for line in papers:
            if not line.strip():
                continue  # tolerate blank lines in the input
            paper = json.loads(line)
            _augment_paper(paper)
            papers_modified.write(json.dumps(paper) + "\n")


def load_articles(conn, path=articles_path, regenerate=False):
    """Load articles and their relationships from the modified papers file.

    Args:
        conn: Connector exposing a Neo4j driver as ``conn.driver``.
        path: Local directory holding ``papers-sample.jsonl``; only used when
            ``regenerate`` is true. The database itself always reads
            ``file:///samples/papers/papers-sample-modified.jsonl``.
        regenerate: When true, rebuild the modified file with freshly drawn
            random citations, reviewers, topics and conference before loading.

    Raises:
        Whatever the driver raises when a statement fails; the session is
        closed in that case as well.
    """
    if regenerate:
        _regenerate_papers(path)

    # The context manager releases the session even when a statement raises.
    with conn.driver.session() as session:
        for query in _ARTICLE_QUERIES:
            # Exhaust each result before sending the next statement.
            list(session.run(query))
    print("Success")

In [10]:
# Load the papers with the default arguments: regenerate stays False, so the
# existing papers-sample-modified.jsonl is imported without being rebuilt.
load_articles(conn)

Success
