In [1]:
import pandas as pd
import requests
import time
from neo4j import GraphDatabase

In [2]:
NEO4J_URI = "bolt://localhost:7687"
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "adminadmin"

In [3]:
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

In [4]:
def create_nodes_and_relationships(paper, set):
    with driver.session() as session:
        print(paper)
        title = paper.get('title', "Unknown Title")
        year = paper.get('year')
        authors = paper.get('authors')
        arxiv = paper.get('arxiv')
        doi = paper.get('doi')
        venue = paper.get('venue') if paper.get('venue') else "Unknown Venue"
        journal = paper.get('journal') if paper.get('journal') else {'name': 'Unkown Journal'}
        journal_name = journal.get('name', "Unknown Journal")
        
        # Create nodes
        session.run("OPTIONAL MATCH (p:Paper {title: $title}) "
                    "WITH p WHERE p IS NULL "
                    "CREATE (:Paper {title: $title, set: $set, selected: $selected, doi: $doi, arxiv: $arxiv, referenceCount: $referenceCount, citationCount: $citationCount, url: $url})", 
                        title=title, set=set, selected=False, doi=doi, arxiv=arxiv, referenceCount=paper.get('referenceCount'), citationCount=paper.get('citationCount'), url=paper.get('url'))
        session.run("MERGE (y:PublishedYear {year: $year})", year=year)
        session.run("MERGE (p:PublicationVenue {venue: $venue})", venue=venue)
        session.run("MERGE (l:Journal {name: $name})", name=journal_name)

        # Create relationships
        session.run("MATCH (t:Paper {title: $title}), (y:PublishedYear {year: $year}) "
                    "MERGE (t)-[:IN_YEAR]->(y)", title=title, year=year)

        session.run("MATCH (t:Paper {title: $title}), (p:PublicationVenue {venue: $venue}) "
                    "MERGE (t)-[:PUBLISHED_AT]->(p)", title=title, venue=venue)

        session.run("MATCH (t:Paper {title: $title}), (l:Journal {name: $name}) "
                    "MERGE (t)-[:PUBLISHED_IN]->(l)", title=title, name=journal_name)

        # Create Author nodes and relationships
        for author in authors:
            session.run("MERGE (a:Author {name: $name, id: $id})", name=author.get('name'), id=author.get('authorId') if author.get('authorId') else author.get('name'))
            session.run("MATCH (t:Paper {title: $title}), (a:Author {name: $name}) "
                        "MERGE (t)-[:AUTHORED_BY]->(a)", title=title, name=author.get('name'))

In [5]:
def fetch_paper_details(paper_id, id_type):
    id_type = id_type.upper()
    url = 'https://api.semanticscholar.org/graph/v1/paper/'+id_type+':'+paper_id
    query_params = {'fields': 'title,url,venue,year,authors,referenceCount,citationCount,publicationTypes,journal'}
    res = requests.get(url, query_params)
    if(res.status_code == 200):
        return res.json()
    else:
        print(f"Error while fetching paper details for Id: {paper_id} and Id_Type: {id_type}")

In [6]:
def create_graph_for_set(set):
    file_name = '../datafiles/set'+str(set)+'_ids.csv'
    ids = pd.read_csv(file_name)
    all_papers = []

    for i, row in ids.iterrows():
        paper = fetch_paper_details(row['id'], row['id_type'])
        if paper:
            paper[row['id_type']] = row['id']
            all_papers.append(paper)
            create_nodes_and_relationships(paper, set)
            time.sleep(2)

    df = pd.DataFrame(all_papers)
    file = '../datafiles/set'+str(set)+'_papers.csv'
    df.to_csv(file)

    print(f"Graph created for set {set} and papers saved in file {file}")

In [7]:
def get_selected_paper_ids(driver, set):
    ids = []
    with driver.session() as session:
        result = session.run("MATCH (p:Paper {set: $set, selected:$selected}) RETURN p", set=set, selected=True)
        for record in list(result):
            data = record.data().get('p')
            doi = data.get('doi')
            arxiv_id = data.get('arxiv')
            if doi:
                ids.append({'id_type': 'doi', 'id': doi, 'title': data.get('title')})
            elif arxiv_id:
                ids.append({'id_type': 'arxiv', 'id': arxiv_id, 'title': data.get('title')})

    return ids

In [8]:
def get_references_or_citations(id_type, paper_id, data_type):
    id_type = id_type.upper()
    url = 'https://api.semanticscholar.org/graph/v1/paper/'+id_type+':'+paper_id+'/'+data_type
    query_params = {'fields': 'externalIds,title,url,venue,year,authors,referenceCount,citationCount,publicationTypes,journal'}
    res = requests.get(url, query_params)
    if(res.status_code == 200):
        return res.json().get('data')
    else:
        print(f"Error while fetching {data_type} for Id: {paper_id}")

In [9]:
def get_refs_and_cites_selected_papers(id_type, paper_id):
    references = get_references_or_citations(id_type, paper_id, 'references')
    citations = get_references_or_citations(id_type, paper_id, 'citations')
    time.sleep(2)
    return {'references': references, 'citations': citations}

In [10]:
def fetch_next_set_papers(selected_paper_ids):
    next_set_papers = []
    for paper in selected_paper_ids:
        ref_cites = get_refs_and_cites_selected_papers(paper['id_type'], paper['id'])
        references = ref_cites['references'] if ref_cites['references'] else []
        citations = ref_cites['citations'] if ref_cites['citations'] else []
        data = {'main_paper': paper['title'], 'cited': references, 'citing': citations}
        next_set_papers.append(data)
    
    return next_set_papers

In [11]:
def create_cite_relation(main_title, child_title, relation_type):
    with driver.session() as session:
        if relation_type == 'cited':
            session.run("MATCH (p:Paper {title: $main_title}), (c:Paper {title: $child_title}) "
                    "MERGE (p)-[:CITED_PAPER]->(c)", main_title=main_title, child_title=child_title)
        else:
            session.run("MATCH (p:Paper {title: $main_title}), (c:Paper {title: $child_title}) "
                    "MERGE (p)-[:CITING_PAPER]->(c)", main_title=main_title, child_title=child_title)

In [12]:
def process_next_set_papers(set):
    selected_papers = get_selected_paper_ids(driver, set-1)
    print(len(selected_papers))

    all_papers = []
    next_set_papers = fetch_next_set_papers(selected_papers)
    for paper_set in next_set_papers:
        for key in ['cited', 'citing']:
            for paper in paper_set[key]:
                if key == 'cited':
                    paper = paper.get('citedPaper')
                else:
                    paper = paper.get('citingPaper')

                paper_ids = paper.get('externalIds')
                if paper_ids and (paper_ids.get('DOI') or paper_ids.get('ArXiv')):
                    doi = paper_ids.get('DOI')
                    arxiv = paper_ids.get('ArXiv')
                    paper['doi'] = doi
                    paper['arxiv'] = arxiv

                    create_nodes_and_relationships(paper, set)
                    create_cite_relation(paper_set['main_paper'], paper['title'], key)
                    all_papers.append(paper)

    papers_df = pd.DataFrame(all_papers)
    file_name = '../datafiles/set'+str(set)+'_papers.csv'
    papers_df.to_csv(file_name)
    print(f'Set {set} papers saved in {file_name}')

In [16]:
set = 1

In [None]:
create_graph_for_set(set)

In [13]:
# set = set+1
process_next_set_papers(3)

5
{'paperId': 'cc9a4a1e04942207ded92745e83e49c268d9d147', 'externalIds': {'ArXiv': '1903.09916', 'DBLP': 'conf/cns/HuLZYTW19', 'MAG': '2969448887', 'DOI': '10.1109/CNS.2019.8802632', 'CorpusId': 85500468}, 'url': 'https://www.semanticscholar.org/paper/cc9a4a1e04942207ded92745e83e49c268d9d147', 'title': 'Characterizing Location-based Mobile Tracking in Mobile Ad Networks', 'venue': 'IEEE Conference on Communications and Network Security', 'year': 2019, 'referenceCount': 25, 'citationCount': 14, 'publicationTypes': ['JournalArticle', 'Conference'], 'journal': {'pages': '223-231', 'name': '2019 IEEE Conference on Communications and Network Security (CNS)'}, 'authors': [{'authorId': '1872598', 'name': 'Boyang Hu'}, {'authorId': '51200762', 'name': 'Qicheng Lin'}, {'authorId': '46323367', 'name': 'Yao Zheng'}, {'authorId': '2480351', 'name': 'Qiben Yan'}, {'authorId': '83334170', 'name': 'Matthew Troglia'}, {'authorId': '2115979102', 'name': 'Qingyang Wang'}], 'doi': '10.1109/CNS.2019.88026