In [None]:
%pip install neo4j

In [3]:
from neo4j import GraphDatabase
import pke
import json
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
import os
import pickle

  from .autonotebook import tqdm as notebook_tqdm


### Node Types
#### Paper Node
- Title
- paperId
- Full Abstract
#### Entity Node
- entity_name
#### Author Node
- authorId
- Name
#### Sentence Node
- Sentence text

#### Topic Node
- name

### Edge Types
- (Sentence)-[:PREVIOUS_SENTENCE]->(Sentence)
- (Sentence)-[:NEXT_SENTENCE]->(Sentence)
- (Author)-[:AUTHORED]->(Paper)
- (Paper)-[:HAS_KEYWORD]->(Entity)
- (Entity)-[:RELATION]->(Entity)
- (Paper)-[:IN_TOPIC]->(Topic)

In [None]:
def savePickle(data, save_path) -> None:
    """Pickle ``data`` into the file at ``save_path``.

    Args:
        data: Object to serialize with ``pickle.dump``.
        save_path: Destination path; the file is opened in binary write mode.

    Raises:
        Exception: If opening the file or pickling fails; the message carries
            the original error and the target path.
    """
    try:
        sink = open(save_path, "wb")
        with sink:
            pickle.dump(data, sink)
    except Exception as err:
        message = f"Error: {err} with trying to save pickle at: {save_path}"
        raise Exception(message)


def loadPickle(load_path) -> object:
    """Load and return the object pickled at ``load_path``.

    Args:
        load_path: Path of the pickle file to read.

    Returns:
        The object reconstructed by ``pickle.load``.

    Raises:
        Exception: If the file cannot be opened or unpickled; the message
            carries the original error and the path.
    """
    try:
        # NOTE(security): unpickling can execute arbitrary code, so only load
        # files produced by this notebook or another trusted source.
        with open(load_path, "rb") as f:
            return pickle.load(f)
    except Exception as e:
        raise Exception(f"Error: {e} with trying to load pickle at: {load_path}")


In [4]:
root_path = "./data/sample"
sample_data = [
    "weather_CO2.jsonl",
    "paleoclimate.jsonl",
    "rewilding.jsonl",
    "rockfish.jsonl",
    "arctic.jsonl",
    "climate.jsonl",
    "shark_climate.jsonl",
]

# Gather the paper records of every sample file into one flat list.
abstracts = []
for data_path in sample_data:
    with open(f'{root_path}/{data_path}', 'r') as json_file:
        json_list = json_file.readlines()

    # NOTE(review): only the first line of each file is parsed; any further
    # lines in the .jsonl file are ignored - confirm this is intended.
    result = json.loads(json_list[0])
    abstracts.extend(result["data"])

len(abstracts)

300

In [6]:
# Keep only the records whose "abstract" field is not None.
data = [paper for paper in abstracts if paper["abstract"] is not None]
len(data)

185

In [None]:
# Vectorizer for the topic representations: unigrams and bigrams, with
# English stop words removed.
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 2))

# Fit BERTopic on the abstract texts of the filtered records.
model = BERTopic(
    language='english',
    vectorizer_model=vectorizer_model,
    calculate_probabilities=True,
    verbose=True,
)
topics, probs = model.fit_transform([paper['abstract'] for paper in data])

In [None]:
# Initialize the keyphrase extraction model: pke's unsupervised TopicRank.
# The same extractor instance is reused for every abstract.
extractor = pke.unsupervised.TopicRank()

In [None]:
# Annotate every record in place with its topic label ('topic') and its
# top keyphrases ('ents').
for item, topicId in zip(data, topics):
    # Topic label: first element of each entry returned by get_topic, joined by "_".
    item['topic'] = '_'.join([x[0] for x in model.get_topic(topicId)])

    abstract = item['abstract']

    if abstract is None:
        # Defensive guard; keep the 'ents' key present so every record has
        # the same shape.
        print("Found a none!")
        item['ents'] = []
    else:
        extractor.load_document(input=abstract, language='en')
        extractor.candidate_selection()
        extractor.candidate_weighting()
        # Store a plain list of the first element of each of the 3 best
        # candidates. The previous form wrapped a generator object in a list,
        # which held no keyphrases and cannot be pickled.
        item['ents'] = [candidate[0] for candidate in extractor.get_n_best(n=3)]

In [None]:
# Persist the annotated `data` list to a checkpoint file on disk.
savePickle(data, './data/checkpoint.pickle')

In [None]:
# Connection settings are read from the environment so real credentials do not
# have to be committed with the notebook. The fallbacks are the values this
# notebook used before (local development instance).
uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
neo4j_user = os.environ.get("NEO4J_USER", "neo4j")
neo4j_password = os.environ.get("NEO4J_PASSWORD", "hackathon")
driver = GraphDatabase.driver(uri, auth=(neo4j_user, neo4j_password))

In [None]:
def create_paper_node(tx, paperId: str, title: str, abstract: str) -> None:
    """Create a ``Paper`` node with ``paperId``, ``title`` and ``abstract`` properties.

    Args:
        tx: Object exposing ``run(query, **parameters)``.
        paperId: Value for the node's ``paperId`` property.
        title: Value for the node's ``title`` property.
        abstract: Value for the node's ``abstract`` property.
    """
    cypher = "CREATE (a:Paper {paperId: $paperId, title: $title, abstract: $abstract})"
    properties = {"paperId": paperId, "title": title, "abstract": abstract}
    tx.run(cypher, **properties)

def create_entity_node(tx, entity_name: str) -> None:
    """Create an ``Entity`` node whose ``entity_name`` property is ``entity_name``.

    Args:
        tx: Object exposing ``run(query, **parameters)``.
        entity_name: Value for the node's ``entity_name`` property.
    """
    cypher = "CREATE (a:Entity {entity_name: $entity_name})"
    properties = {"entity_name": entity_name}
    tx.run(cypher, **properties)

def create_topic_node(tx, topic_name: str) -> None:
    """Create a ``Topic`` node whose ``name`` property is ``topic_name``.

    Args:
        tx: Object exposing ``run(query, **parameters)``.
        topic_name: Value for the node's ``name`` property.
    """
    # The keyword must match the `$topic_name` placeholder in the query;
    # passing it as `name` left the placeholder without a value.
    tx.run("CREATE (a:Topic {name: $topic_name})", topic_name=topic_name)

def create_author_node(tx, authorId: str, author_name: str) -> None:
    """Create an ``Author`` node with ``authorId`` and ``author_name`` properties.

    Args:
        tx: Object exposing ``run(query, **parameters)``.
        authorId: Value for the node's ``authorId`` property.
        author_name: Value for the node's ``author_name`` property.
    """
    cypher = "CREATE (a:Author {authorId: $authorId, author_name: $author_name})"
    properties = {"authorId": authorId, "author_name": author_name}
    tx.run(cypher, **properties)

In [None]:
def create_authored_relationship(tx, paperId, authorId):
    """Create an ``AUTHORED`` relationship from an ``Author`` node to a ``Paper`` node.

    The author is matched on its ``authorId`` property and the paper on its
    ``paperId`` property.

    Args:
        tx: Object exposing ``run(query, **parameters)``.
        paperId: ``paperId`` of the paper to match.
        authorId: ``authorId`` of the author to match.
    """
    # Adjacent string literals are concatenated verbatim, so each clause needs
    # its own trailing space; without it the query contained
    # "$authorIdMATCH" and "$paperIdCREATE".
    query = (
        "MATCH (a:Author) WHERE a.authorId = $authorId "
        "MATCH (b:Paper) WHERE b.paperId = $paperId "
        "CREATE (a)-[:AUTHORED]->(b)"
    )
    tx.run(query, authorId=authorId, paperId=paperId)

def create_entity_relationship(tx, paperId, entity_name):
    """Create a ``HAS_KEYWORD`` relationship from a ``Paper`` node to an ``Entity`` node.

    The entity is matched on its ``entity_name`` property and the paper on its
    ``paperId`` property.

    Args:
        tx: Object exposing ``run(query, **parameters)``.
        paperId: ``paperId`` of the paper to match.
        entity_name: ``entity_name`` of the entity to match.
    """
    # Adjacent string literals are concatenated verbatim, so each clause needs
    # its own trailing space; without it the query contained
    # "$entity_nameMATCH" and "$paperIdCREATE".
    query = (
        "MATCH (a:Entity) WHERE a.entity_name = $entity_name "
        "MATCH (b:Paper) WHERE b.paperId = $paperId "
        "CREATE (b)-[:HAS_KEYWORD]->(a)"
    )
    tx.run(query, entity_name=entity_name, paperId=paperId)

def create_in_topic_relationship(tx, paperId, topic_name):
    """Create an ``IN_TOPIC`` relationship from a ``Paper`` node to a ``Topic`` node.

    The topic is matched on its ``name`` property and the paper on its
    ``paperId`` property.

    Args:
        tx: Object exposing ``run(query, **parameters)``.
        paperId: ``paperId`` of the paper to match.
        topic_name: ``name`` of the topic to match.
    """
    # Adjacent string literals are concatenated verbatim, so each clause needs
    # its own trailing space; without it the query contained
    # "$topic_nameMATCH" and "$paperIdCREATE".
    query = (
        "MATCH (topic:Topic) WHERE topic.name = $topic_name "
        "MATCH (paper:Paper) WHERE paper.paperId = $paperId "
        "CREATE (paper)-[:IN_TOPIC]->(topic)"
    )
    tx.run(query, topic_name=topic_name, paperId=paperId)

In [None]:
# Open a session on the driver. The body is still a placeholder, so no
# queries are issued here yet.
with driver.session() as session:
    pass

# Shut the driver down once the session block has finished.
driver.close()