In [2]:
from neo4j import GraphDatabase
class Neo4jConnection:
    """Small convenience wrapper around a Neo4j driver.

    The driver is created eagerly in ``__init__``; if that fails the error is
    printed and the instance is left without a driver, in which case
    ``query`` refuses to run.
    """

    def __init__(self, uri, user, pwd):
        """Store the connection settings and try to open a driver.

        Args:
            uri: Address handed to ``GraphDatabase.driver``.
            user: User name, first element of the ``auth`` tuple.
            pwd: Password, second element of the ``auth`` tuple.
        """
        self.__uri, self.__user, self.__pwd = uri, user, pwd
        self.__driver = None
        try:
            credentials = (self.__user, self.__pwd)
            self.__driver = GraphDatabase.driver(self.__uri, auth=credentials)
        except Exception as e:
            # Best effort: report the problem and keep the driver unset.
            print("Failed to create the driver:", e)

    def close(self):
        """Close the underlying driver, if one was created."""
        if self.__driver is None:
            return
        self.__driver.close()

    def query(self, query, parameters=None, db='pwcgraph'):
        """Run one Cypher statement and return its records as a list.

        Args:
            query: Cypher text passed to ``session.run``.
            parameters: Query parameters passed to ``session.run`` unchanged.
            db: Database name for the session; ``None`` opens a session
                without a ``database`` argument.

        Returns:
            The list of records produced by the statement, or ``None`` when
            opening the session or running the statement raised (the error
            is printed, not re-raised).

        Raises:
            AssertionError: If no driver is available.
        """
        assert self.__driver is not None, "Driver not initialized!"
        session = None
        records = None
        try:
            if db is None:
                session = self.__driver.session()
            else:
                session = self.__driver.session(database=db)
            # Materialise the result while the session is still open.
            records = list(session.run(query, parameters))
        except Exception as err:
            print("Query failed:", err)
        finally:
            if session is not None:
                session.close()
        return records



In [7]:
import os

# Connection settings come from the environment so that real credentials do
# not have to be committed with the notebook. The fallbacks are the values
# this notebook used before, so it still runs unchanged on a local setup.
conn = Neo4jConnection(
    uri=os.environ.get("NEO4J_URI", "bolt://localhost:7687"),
    user=os.environ.get("NEO4J_USER", "neo4j"),
    pwd=os.environ.get("NEO4J_PASSWORD", "0000"),
)

In [4]:
import time



In [4]:
# Clear the database: DETACH DELETE removes every node together with its
# relationships. No `db` argument is given, so conn.query's default database
# is the one that gets wiped.
conn.query("MATCH (n) DETACH DELETE n")

[]

#  Write Nodes 

In [5]:
# One uniqueness constraint on `id` per node label, in the order:
# paper, author, task, evaluation, dataset, code, method.
# NOTE(review): `ON ... ASSERT` looks like the legacy constraint syntax; newer
# Neo4j releases use `FOR ... REQUIRE` - confirm against the server version.
constraint_queries = [
    'CREATE CONSTRAINT papers IF NOT EXISTS ON (p:Paper)     ASSERT p.id IS UNIQUE',
    'CREATE CONSTRAINT authors IF NOT EXISTS ON (a:Author)     ASSERT a.id IS UNIQUE',
    'CREATE CONSTRAINT tasks IF NOT EXISTS ON (t:Task)         ASSERT t.id IS UNIQUE',
    'CREATE CONSTRAINT evaluations IF NOT EXISTS ON (e:Evaluation) ASSERT e.id IS UNIQUE',
    'CREATE CONSTRAINT datasets IF NOT EXISTS ON (d:Dataset)     ASSERT d.id IS UNIQUE',
    'CREATE CONSTRAINT codes IF NOT EXISTS ON (c:Code)         ASSERT c.id IS UNIQUE',
    'CREATE CONSTRAINT methods IF NOT EXISTS ON (m:Method)     ASSERT m.id IS UNIQUE',
]
constraint_results = [conn.query(statement) for statement in constraint_queries]
# Show the result of the last statement, as the cell did before.
constraint_results[-1]






[]

In [6]:
import pandas as pd
# Load the node tables from ../data/knowledge_graph/ and clean each one as it
# is read.
def _load_node_table(csv_path):
    """Read one node CSV and return it with a unique, non-null ``id`` column."""
    table = pd.read_csv(csv_path)
    # Drop duplicate ids, preserving the first occurrence.
    table = table.drop_duplicates(subset=['id'], keep='first')
    # Remove rows whose id is null/NaN (should not happen).
    return table.dropna(subset=['id'])


papers = _load_node_table("../data/knowledge_graph/papers_nodes.csv")
authors = _load_node_table("../data/knowledge_graph/authors_nodes.csv")
tasks = _load_node_table("../data/knowledge_graph/tasks_nodes.csv")
evaluations = _load_node_table("../data/knowledge_graph/evaluations_nodes.csv")
datasets = _load_node_table("../data/knowledge_graph/datasets_nodes.csv")
codes = _load_node_table("../data/knowledge_graph/codes_nodes.csv")
methods = _load_node_table("../data/knowledge_graph/methods_nodes.csv")







In [7]:
# Keep only the columns that are written to the graph.
tasks = tasks[['id','name',]]

def add_tasks(tasks):
    """Upsert one Task node per row of ``tasks``.

    Args:
        tasks: DataFrame with ``id`` and ``name`` columns; its rows are sent
            to Neo4j as the ``$rows`` parameter.

    Returns:
        Whatever ``conn.query`` returns for the statement (the records
        holding the ``total`` row count).
    """
    # Merge on the constrained key only and SET the remaining property.
    # Merging on the full {id, name} map would try to create a second node
    # with the same id whenever the stored name does not compare equal
    # (changed value, NaN), which the uniqueness constraint on id rejects.
    query = '''
            UNWIND $rows AS row
            MERGE (c:Task {id: row.id})
            SET c.name = row.name
            RETURN count(*) as total
            '''
    return conn.query(query, parameters = {'rows':tasks.to_dict('records')})

# write
add_tasks(tasks)


[<Record total=2877>]

In [None]:
import numpy as np
# add papers
# NOTE(review): 'abstract' and 'label' are selected here but the query below
# does not write them - confirm whether that is intentional.
attrs = ['id', 'title', 'abstract', 'year', 'url', 'pwc_id', 'label',]
papers = papers[attrs]

def add_papers(papers):
    """Upsert one Paper node per row of ``papers``.

    Args:
        papers: DataFrame providing at least ``id``, ``title``, ``year``,
            ``url`` and ``pwc_id``; its rows are sent to Neo4j as ``$rows``.

    Returns:
        Whatever ``conn.query`` returns for the statement (the records
        holding the ``total`` row count).
    """
    # Merge on the constrained key only and SET the other properties.
    # Merging on the full property map would try to create a second node with
    # the same id whenever any stored property does not compare equal
    # (changed value, NaN), which the uniqueness constraint on id rejects.
    query = '''
            UNWIND $rows AS row
            MERGE (p:Paper {id: row.id})
            SET p.title = row.title, p.year = row.year, p.url = row.url, p.pwc_id = row.pwc_id
            RETURN count(*) as total
            '''

    return conn.query(query, parameters = {'rows':papers.to_dict('records')})

# write
# divide into batches so that each request carries only a slice of the table
papers_batches = np.array_split(papers, 100)
for batch in papers_batches:
    add_papers(batch)
    time.sleep(1)
    print("Papers added:", len(batch))




    


In [9]:
# Inspect which columns the author table has before selecting the ones to write.
authors.columns

Index(['name', 'id', 'label', 'source'], dtype='object')

In [None]:
# add authors
attrs = ['id', 'name',]
authors = authors[attrs]

def add_authors(authors):
    """Upsert one Author node per row of ``authors``.

    Args:
        authors: DataFrame with ``id`` and ``name`` columns; its rows are
            sent to Neo4j as the ``$rows`` parameter.

    Returns:
        Whatever ``conn.query`` returns for the statement (the records
        holding the ``total`` row count).
    """
    # Merge on the constrained key only and SET the remaining property.
    # Merging on the full {id, name} map would try to create a second node
    # with the same id whenever the stored name does not compare equal
    # (changed value, NaN), which the uniqueness constraint on id rejects.
    query = '''
            UNWIND $rows AS row
            MERGE (a:Author {id: row.id})
            SET a.name = row.name
            RETURN count(*) as total
            '''

    return conn.query(query, parameters = {'rows':authors.to_dict('records')})

# write
# divide into batches so that each request carries only a slice of the table
authors_batches = np.array_split(authors, 100)
for batch in authors_batches:
    add_authors(batch)
    time.sleep(1)
    print("Authors added:", len(batch))

In [11]:
# add methods
attrs = ['id', 'name',]
methods = methods[attrs]

def add_methods(methods):
    """Upsert one Method node per row of ``methods``.

    Args:
        methods: DataFrame with ``id`` and ``name`` columns; its rows are
            sent to Neo4j as the ``$rows`` parameter.

    Returns:
        Whatever ``conn.query`` returns for the statement (the records
        holding the ``total`` row count).
    """
    # Merge on the constrained key only and SET the remaining property.
    # Merging on the full {id, name} map would try to create a second node
    # with the same id whenever the stored name does not compare equal
    # (changed value, NaN), which the uniqueness constraint on id rejects.
    query = '''
            UNWIND $rows AS row
            MERGE (m:Method {id: row.id})
            SET m.name = row.name
            RETURN count(*) as total
            '''

    return conn.query(query, parameters = {'rows':methods.to_dict('records')})

# write
add_methods(methods)


[<Record total=1948>]

In [12]:
# add datasets
attrs = ['id', 'name',]
datasets = datasets[attrs]

def add_datasets(datasets):
    """Upsert one Dataset node per row of ``datasets``.

    Args:
        datasets: DataFrame with ``id`` and ``name`` columns; its rows are
            sent to Neo4j as the ``$rows`` parameter.

    Returns:
        Whatever ``conn.query`` returns for the statement (the records
        holding the ``total`` row count).
    """
    # Merge on the constrained key only and SET the remaining property.
    # Merging on the full {id, name} map would try to create a second node
    # with the same id whenever the stored name does not compare equal
    # (changed value, NaN), which the uniqueness constraint on id rejects.
    query = '''
            UNWIND $rows AS row
            MERGE (d:Dataset {id: row.id})
            SET d.name = row.name
            RETURN count(*) as total
            '''

    return conn.query(query, parameters = {'rows':datasets.to_dict('records')})

# write
add_datasets(datasets)


[<Record total=3611>]

In [13]:
import ast

# Turn the textual `metrics` values into Python objects with ast.literal_eval.
# Values that are not strings (e.g. already parsed by an earlier run of this
# cell) are passed through, so re-running the cell no longer raises.
evaluations['metrics'] = evaluations['metrics'].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)



In [14]:
# Split every parsed metrics mapping into two parallel list columns:
# one with its keys and one with its values.
parsed_metrics = evaluations['metrics']
evaluations['metrics_key'] = parsed_metrics.apply(lambda mapping: [*mapping.keys()])
evaluations['metrics_value'] = parsed_metrics.apply(lambda mapping: [*mapping.values()])

In [15]:
def extract_key(x):
    """Return the first element of ``x``, or ``None`` if it has none.

    Args:
        x: Usually a list of metric names; anything that cannot be indexed
            with ``0`` (empty list, ``None``, NaN, empty dict) yields ``None``.

    Returns:
        ``x[0]`` when that lookup succeeds, otherwise ``None``.
    """
    try:
        return x[0]
    # Only lookup failures mean "no first element"; a bare ``except`` would
    # also swallow KeyboardInterrupt / SystemExit.
    except (IndexError, KeyError, TypeError):
        return None
def extract_value(x):
    """Return the first element of ``x``, or ``None`` if it has none.

    Args:
        x: Usually a list of metric values; anything that cannot be indexed
            with ``0`` (empty list, ``None``, NaN, empty dict) yields ``None``.

    Returns:
        ``x[0]`` when that lookup succeeds, otherwise ``None``.
    """
    try:
        return x[0]
    # Only lookup failures mean "no first element"; a bare ``except`` would
    # also swallow KeyboardInterrupt / SystemExit.
    except (IndexError, KeyError, TypeError):
        return None
# Reduce the list columns to scalar columns holding only their first entry
# (extract_key / extract_value give None when there is no first entry).
for _scalar_col, _list_col, _pick_first in (
    ('metric_key', 'metrics_key', extract_key),
    ('metric_value', 'metrics_value', extract_value),
):
    evaluations[_scalar_col] = evaluations[_list_col].apply(_pick_first)

In [None]:
attrs = ['id', 'metric_key', 'metric_value','model_name']
evaluations = evaluations[attrs]

def add_evaluations(evaluations):
    """Upsert one Evaluation node per row of ``evaluations``.

    Args:
        evaluations: DataFrame with ``id``, ``metric_key``, ``metric_value``
            and ``model_name`` columns; its rows are sent to Neo4j as
            ``$rows``.

    Returns:
        Whatever ``conn.query`` returns for the statement (the records
        holding the ``total`` row count).
    """
    # Merge on the constrained key only and SET the other properties.
    # metric_key / metric_value are None for evaluations without metrics, and
    # MERGE refuses null property values, so merging on the full property map
    # made the whole batch fail. SET with a null value simply leaves the
    # property unset.
    query = '''
            UNWIND $rows AS row
            MERGE (e:Evaluation {id: row.id})
            SET e.metric_key = row.metric_key, e.metric_value = row.metric_value, e.model_name = row.model_name
            RETURN count(*) as total
            '''

    return conn.query(query, parameters = {'rows':evaluations.to_dict('records')})

# write
# divide into batches so that each request carries only a slice of the table
evaluations_batches = np.array_split(evaluations, 100)
for batch in evaluations_batches:
    add_evaluations(batch)
    time.sleep(1)
    print("evaluations added:", len(batch))

# Write edges 

In [24]:
#(method) -> (paper)
papers_methods = pd.read_csv('../data/knowledge_graph/method_paper.csv')


def add_edges(papers_methods):
    """Write FIRST_MENTIONED_IN relationships from Method to Paper nodes.

    Args:
        papers_methods: DataFrame whose ``source`` column is matched against
            ``Method.id`` and whose ``target`` column is matched against
            ``Paper.id``. Rows for which either node is missing add nothing.

    Returns:
        Whatever ``conn.query`` returns; the statement has no RETURN clause.
    """
    # MERGE rather than CREATE: re-running the cell (or a duplicated CSV row)
    # must not stack a second identical relationship between the same nodes.
    query = '''
            UNWIND $rows AS row
            MATCH (p:Paper {id: row.target}), (m:Method {id: row.source})
            MERGE (m)-[:FIRST_MENTIONED_IN]->(p)
            '''

    return conn.query(query, parameters = {'rows':papers_methods.to_dict('records')})

# WRITE
add_edges(papers_methods)





[]

In [None]:
# (author) -> (paper)
papers_authors = pd.read_csv('../data/knowledge_graph/authors_papers.csv')

def add_edges(papers_authors):
    """Write AUTHOR_OF relationships from Author to Paper nodes.

    Args:
        papers_authors: DataFrame whose ``source`` column is matched against
            ``Author.id`` and whose ``target`` column is matched against
            ``Paper.id``. Rows for which either node is missing add nothing.

    Returns:
        Whatever ``conn.query`` returns; the statement has no RETURN clause.
    """
    # MERGE rather than CREATE: re-running the cell (or a duplicated CSV row)
    # must not stack a second identical relationship between the same nodes.
    query = '''
            UNWIND $rows AS row
            MATCH (a:Author {id: row.source}), (p:Paper {id: row.target})
            MERGE (a)-[:AUTHOR_OF]->(p)
            '''

    return conn.query(query, parameters = {'rows':papers_authors.to_dict('records')})

# write batches
papers_authors_batches = np.array_split(papers_authors, 100)
for batch in papers_authors_batches:
    add_edges(batch)
    time.sleep(1)
    print("edges added:", len(batch))


In [None]:
# (task) -> (paper)
task_papers = pd.read_csv('../data/knowledge_graph/tasks_papers.csv')

def add_edges(task_papers):
    """Write BENCHMARKED_IN relationships from Task to Paper nodes.

    Args:
        task_papers: DataFrame whose ``source`` column is matched against
            ``Task.id`` and whose ``target`` column is matched against
            ``Paper.id``. Rows for which either node is missing add nothing.

    Returns:
        Whatever ``conn.query`` returns; the statement has no RETURN clause.
    """
    # MERGE rather than CREATE: re-running the cell (or a duplicated CSV row)
    # must not stack a second identical relationship between the same nodes.
    query = '''
            UNWIND $rows AS row
            MATCH (d:Task {id: row.source}), (p:Paper {id: row.target})
            MERGE (d)-[:BENCHMARKED_IN]->(p)
            '''

    return conn.query(query, parameters = {'rows':task_papers.to_dict('records')})

# write batches
task_papers_batches = np.array_split(task_papers, 100)
for batch in task_papers_batches:
    add_edges(batch)
    time.sleep(1)
    print("edges added:", len(batch))

In [34]:
# (EVALUATION) -> (PAPER)
evaluation_papers = pd.read_csv('../data/knowledge_graph/evals_papers.csv')

def add_edges(evaluation_papers):
    """Write SUBMITED_BY relationships from Evaluation to Paper nodes.

    Args:
        evaluation_papers: DataFrame whose ``source`` column is matched
            against ``Evaluation.id`` and whose ``target`` column is matched
            against ``Paper.id``. Rows for which either node is missing add
            nothing.

    Returns:
        Whatever ``conn.query`` returns; the statement has no RETURN clause.
    """
    # MERGE rather than CREATE: re-running the cell (or a duplicated CSV row)
    # must not stack a second identical relationship between the same nodes.
    # NOTE(review): the relationship type is spelled SUBMITED_BY (single T);
    # left untouched because other queries may depend on that exact name.
    query = '''
            UNWIND $rows AS row
            MATCH (e:Evaluation {id: row.source}), (p:Paper {id: row.target})
            MERGE (e)-[:SUBMITED_BY]->(p)
            '''

    return conn.query(query, parameters = {'rows':evaluation_papers.to_dict('records')})

# write batches
evaluation_papers_batches = np.array_split(evaluation_papers, 100)
for batch in evaluation_papers_batches:
    add_edges(batch)
    time.sleep(1)
    print("edges added:", len(batch))

edges added: 313
edges added: 313
edges added: 313
edges added: 313
edges added: 313
edges added: 313
edges added: 313
edges added: 313
edges added: 313
edges added: 313
edges added: 313
edges added: 313
edges added: 313
edges added: 313
edges added: 313
edges added: 313
edges added: 312
edges added: 312
edges added: 312
edges added: 312
edges added: 312
edges added: 312
edges added: 312
edges added: 312
edges added: 312
edges added: 312
edges added: 312
edges added: 312
edges added: 312
edges added: 312
edges added: 312
edges added: 312
edges added: 312
edges added: 312
edges added: 312
edges added: 312
edges added: 312
edges added: 312
edges added: 312
edges added: 312
edges added: 312
edges added: 312
edges added: 312
edges added: 312
edges added: 312
edges added: 312
edges added: 312
edges added: 312
edges added: 312
edges added: 312
edges added: 312
edges added: 312
edges added: 312
edges added: 312
edges added: 312
edges added: 312
edges added: 312
edges added: 312
edges added: 3

In [36]:
# (task) -> (eval)
task_evaluations = pd.read_csv('../data/knowledge_graph/evals_tasks.csv')

def add_edges(task_evaluations):
    """Write SCORED_IN relationships from Task to Evaluation nodes.

    Args:
        task_evaluations: DataFrame whose ``source`` column is matched
            against ``Task.id`` and whose ``target`` column is matched
            against ``Evaluation.id``. Rows for which either node is missing
            add nothing.

    Returns:
        Whatever ``conn.query`` returns; the statement has no RETURN clause.
    """
    # MERGE rather than CREATE: re-running the cell (or a duplicated CSV row)
    # must not stack a second identical relationship between the same nodes.
    query = '''
            UNWIND $rows AS row
            MATCH (d:Task {id: row.source}), (e:Evaluation {id: row.target})
            MERGE (d)-[:SCORED_IN]->(e)
            '''

    return conn.query(query, parameters = {'rows':task_evaluations.to_dict('records')})

# WRITE BATCHES
task_evaluations_batches = np.array_split(task_evaluations, 15)
for batch in task_evaluations_batches:
    add_edges(batch)
    time.sleep(1)
    print("edges added:", len(batch))

edges added: 2078
edges added: 2078
edges added: 2078
edges added: 2078
edges added: 2078
edges added: 2078
edges added: 2078
edges added: 2078
edges added: 2078
edges added: 2078
edges added: 2077
edges added: 2077
edges added: 2077
edges added: 2077
edges added: 2077


In [9]:
# (dataset) -> (eval)
import pandas as pd
import numpy as np
evals_datasets = pd.read_csv('../data/knowledge_graph/evals_datasets.csv')

def add_edges(evals_datasets):
    """Write USED_IN relationships from Dataset to Evaluation nodes.

    Args:
        evals_datasets: DataFrame whose ``source`` column is matched against
            ``Dataset.id`` and whose ``target`` column is matched against
            ``Evaluation.id``. Rows for which either node is missing add
            nothing.

    Returns:
        Whatever ``conn.query`` returns; the statement has no RETURN clause.
    """
    # MERGE rather than CREATE: re-running the cell (or a duplicated CSV row)
    # must not stack a second identical relationship between the same nodes.
    query = '''
            UNWIND $rows AS row
            MATCH (d:Dataset {id: row.source}), (e:Evaluation {id: row.target})
            MERGE (d)-[:USED_IN]->(e)
            '''

    return conn.query(query, parameters = {'rows':evals_datasets.to_dict('records')})

# WRITE BATCHES
dataset_evaluations_batches = np.array_split(evals_datasets, 15)
for batch in dataset_evaluations_batches:
    add_edges(batch)
    time.sleep(1)
    print("edges added:", len(batch))


edges added: 2082
edges added: 2081
edges added: 2081
edges added: 2081
edges added: 2081
edges added: 2081
edges added: 2081
edges added: 2081
edges added: 2081
