In [None]:
# Install the Neo4j Python driver (the `neo4j` package imported below) via the %pip magic.
%pip install neo4j

In [1]:
from neo4j import GraphDatabase
import pke
import json
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
import os
import pickle

  from .autonotebook import tqdm as notebook_tqdm


### Node Types
#### Paper Node
- Title
- paperId
- Full Abstract
#### Entity Node
- entity_name
#### Author Node
- authorId
- Name
#### Sentence Node
- Sentence text

#### Topic Node
- topic_name

### Edge Types
- (Sentence)-[:PREVIOUS_SENTENCE]->(Sentence)
- (Sentence)-[:NEXT_SENTENCE]->(Sentence)
- (Author)-[:AUTHORED]->(Paper)
- (Paper)-[:HAS_KEYWORD]->(Entity)
- (Entity)-[:RELATION]->(Entity)
- (Paper)-[:IN_TOPIC]->(Topic)

In [2]:
def savePickle(data, save_path) -> None:
    """Pickle ``data`` and write it to the file at ``save_path``.

    Args:
        data: The object to serialize.
        save_path: Destination path; the file is opened in binary write mode.

    Raises:
        Exception: If opening the file or pickling fails. The message carries
            the original error text and the target path.
    """
    try:
        with open(save_path, "wb") as sink:
            pickle.dump(data, sink)
    except Exception as err:
        message = f"Error: {err} with trying to save pickle at: {save_path}"
        raise Exception(message)


def loadPickle(load_path) -> object:
    """Read the file at ``load_path`` and return the object unpickled from it.

    Args:
        load_path: Path of the pickle file; opened in binary read mode.

    Returns:
        Whatever object was stored in the pickle file.

    Raises:
        Exception: If opening the file or unpickling fails. The message carries
            the original error text and the path, and the original exception
            is attached as ``__cause__``.
    """
    try:
        with open(load_path, "rb") as f:
            # NOTE: unpickling can execute arbitrary code; only load files
            # that come from a trusted source.
            return pickle.load(f)
    except Exception as e:
        raise Exception(f"Error: {e} with trying to load pickle at: {load_path}") from e


In [3]:
# Collect the paper records stored under the "data" key of each sample file.
abstracts = []
root_path = "./data/sample"
sample_data = [
    "weather_CO2.jsonl",
    "paleoclimate.jsonl",
    "rewilding.jsonl",
    "rockfish.jsonl",
    "arctic.jsonl",
    "climate.jsonl",
    "shark_climate.jsonl",
]

for data_path in sample_data:
    # Read as UTF-8 explicitly rather than relying on the platform's default
    # text encoding.
    with open(f'{root_path}/{data_path}', 'r', encoding='utf-8') as json_file:
        # Only the first line of each .jsonl file is used, so read just that
        # line instead of loading the whole file into memory.
        first_line = json_file.readline()

    result = json.loads(first_line)
    abstracts.extend(result["data"])

len(abstracts)

350

In [4]:
# Keep only the records whose "abstract" field is not None.
data = [record for record in abstracts if record["abstract"] is not None]
len(data)

222

In [5]:
# Count unigrams and bigrams, with English stop words removed.
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 2))

model = BERTopic(
    language='english',
    vectorizer_model=vectorizer_model,
    calculate_probabilities=True,
    verbose=True,
)

# Fit on the abstract texts; `topics` is later zipped with `data`, so it is
# expected to hold one entry per record, in the same order.
abstract_texts = [record['abstract'] for record in data]
topics, probs = model.fit_transform(abstract_texts)

Batches: 100%|██████████| 7/7 [00:09<00:00,  1.32s/it]
2022-09-20 12:12:53,370 - BERTopic - Transformed documents to Embeddings
2022-09-20 12:13:01,493 - BERTopic - Reduced dimensionality
2022-09-20 12:13:01,515 - BERTopic - Clustered reduced embeddings


In [6]:
# initialize keyphrase extraction model, here TopicRank
# (this single instance is reused for every abstract in the extraction loop)
extractor = pke.unsupervised.TopicRank()

In [18]:
# Annotate every record in place with a topic label ('topic') and its three
# best-ranked keyphrases ('ents').
for item, topicId in zip(data, topics):
    # The label is the first element of each entry returned by
    # model.get_topic, joined by "_" (presumably the topic's top words).
    item['topic'] = '_'.join([entry[0] for entry in model.get_topic(topicId)])

    abstract = item['abstract']

    if abstract is None:
        print("Found a none!")
        # Still give the record an (empty) keyphrase list so that code
        # iterating over item['ents'] later does not fail with a KeyError.
        item['ents'] = []
        continue

    # NOTE(review): the same extractor object is reused for every document;
    # confirm that the installed pke version resets its state in load_document.
    extractor.load_document(input=abstract, language='en')
    extractor.candidate_selection()
    extractor.candidate_weighting()
    item['ents'] = [best[0] for best in extractor.get_n_best(n=3)]

In [20]:
# Checkpoint the records to disk so they can be reloaded without recomputing.
savePickle(data, './data/checkpoint.pkl')

In [19]:
# Spot-check the first record.
data[0]

{'paperId': '1a8013583dd77f9dd4cc7d54fad50312c7685143',
 'title': 'Marine CO2 Patterns in the Northern Salish Sea',
 'abstract': 'Marine carbon dioxide (CO2) system data has been collected from December 2014 to June 2018 in the northern Salish Sea (NSS; British Columbia, Canada) and consisted of continuous measurements at two sites as well as spatially- and seasonally-distributed discrete seawater samples. The array of CO2 observing activities included high-resolution CO2 partial pressure (pCO2) and pHT (total scale) measurements made at the Hakai Institute’s Quadra Island Field Station (QIFS) and from an Environment Canada weather buoy, respectively, as well as discrete seawater measurements of pCO2 and total dissolved inorganic carbon (TCO2) obtained during a number of field campaigns. A relationship between NSS alkalinity and salinity was developed with the discrete datasets and used with the continuous measurements to highly resolve the marine CO2 system. Collectively, these datase

In [21]:
# Spot-check a record from further into the list.
data[100]

{'paperId': '39eecef1392d65f97de6d7d541025d05240a1306',
 'title': 'The impact of acute thermal stress on the metabolome of the black rockfish (Sebastes schlegelii)',
 'abstract': 'Acute change in water temperature causes heavy economic losses in the aquaculture industry. The present study investigated the metabolic and molecular effects of acute thermal stress on black rockfish (Sebastes schlegelii). Gas chromatography time-of-flight mass spectrometry (GC-TOF-MS)-based metabolomics was used to investigate the global metabolic response of black rockfish at a high water temperature (27°C), low water temperature (5°C) and normal water temperature (16°C). Metabolites involved in energy metabolism and basic amino acids were significantly increased upon acute exposure to 27°C (P < 0.05), and no change in metabolite levels occurred in the low water temperature group. However, certain fatty acid levels were elevated after cold stress (P < 0.05), and this effect was not observed in the 27°C gro

In [22]:
# Read the connection settings from the environment so real credentials do not
# have to be committed with the notebook. The fallbacks are the local
# development values that were previously hard-coded here.
uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
neo4j_user = os.environ.get("NEO4J_USER", "neo4j")
neo4j_password = os.environ.get("NEO4J_PASSWORD", "hackathon")
driver = GraphDatabase.driver(uri, auth=(neo4j_user, neo4j_password))

In [26]:
def create_paper_node(tx, paperId: str, title: str, abstract: str) -> None:
    """Create the Paper node for ``paperId``, or update it if it already exists.

    MERGE is used instead of CREATE so that re-running the notebook, or
    meeting the same paper twice, does not produce duplicate Paper nodes.
    ``paperId`` must not be None: MERGE rejects null property values.

    Args:
        tx: Transaction-like object exposing ``run(query, **params)``.
        paperId: Identifier stored on the node and used as the merge key.
        title: Stored as the node's ``title`` property.
        abstract: Stored as the node's ``abstract`` property.
    """
    tx.run(
        "MERGE (a:Paper {paperId: $paperId}) "
        "SET a.title = $title, a.abstract = $abstract",
        paperId=paperId, title=title, abstract=abstract,
    )

def create_entity_node(tx, entity_name: str) -> None:
    """Ensure an Entity node with the given ``entity_name`` exists.

    MERGE is used instead of CREATE so that running this more than once for
    the same name does not produce duplicate Entity nodes.

    Args:
        tx: Transaction-like object exposing ``run(query, **params)``.
        entity_name: Value of the node's ``entity_name`` property (merge key).
    """
    tx.run("MERGE (a:Entity {entity_name: $entity_name})", entity_name=entity_name)

def create_topic_node(tx, topic_name: str) -> None:
    """Ensure a Topic node with the given ``topic_name`` exists.

    MERGE is used instead of CREATE so that running this more than once for
    the same name does not produce duplicate Topic nodes.

    Args:
        tx: Transaction-like object exposing ``run(query, **params)``.
        topic_name: Value of the node's ``topic_name`` property (merge key).
    """
    tx.run("MERGE (a:Topic {topic_name: $topic_name})", topic_name=topic_name)

def create_author_node(tx, authorId: str, author_name: str) -> None:
    """Create the Author node for ``authorId``, or update its name if it exists.

    MERGE is used instead of CREATE so that running this more than once for
    the same author does not produce duplicate Author nodes. When ``authorId``
    is None the original CREATE statement is kept, because MERGE rejects null
    property values.

    Args:
        tx: Transaction-like object exposing ``run(query, **params)``.
        authorId: Identifier stored on the node and used as the merge key.
        author_name: Stored as the node's ``author_name`` property.
    """
    if authorId is None:
        # No key to merge on: fall back to the previous behaviour unchanged.
        tx.run(
            "CREATE (a:Author {authorId: $authorId, author_name: $author_name})",
            authorId=authorId, author_name=author_name,
        )
        return

    tx.run(
        "MERGE (a:Author {authorId: $authorId}) "
        "SET a.author_name = $author_name",
        authorId=authorId, author_name=author_name,
    )

In [29]:
def create_authored_relationship(tx, paperId, authorId):
    """Ensure an ``(Author)-[:AUTHORED]->(Paper)`` relationship exists.

    Both nodes are looked up by their ids; if either is missing the query
    matches nothing and no relationship is written. MERGE is used instead of
    CREATE so repeated calls do not add parallel AUTHORED relationships.

    Args:
        tx: Transaction-like object exposing ``run(query, **params)``.
        paperId: ``paperId`` of the Paper node.
        authorId: ``authorId`` of the Author node.
    """
    tx.run(
        "MATCH (a:Author) WHERE a.authorId = $authorId "
        "MATCH (b:Paper) WHERE b.paperId = $paperId "
        "MERGE (a)-[:AUTHORED]->(b)",
        authorId=authorId, paperId=paperId,
    )

def create_entity_relationship(tx, paperId, entity_name):
    """Ensure a ``(Paper)-[:HAS_KEYWORD]->(Entity)`` relationship exists.

    Both nodes are looked up by their key property; if either is missing the
    query matches nothing and no relationship is written. MERGE is used
    instead of CREATE so repeated calls do not add parallel HAS_KEYWORD
    relationships.

    Args:
        tx: Transaction-like object exposing ``run(query, **params)``.
        paperId: ``paperId`` of the Paper node.
        entity_name: ``entity_name`` of the Entity node.
    """
    tx.run(
        "MATCH (a:Entity) WHERE a.entity_name = $entity_name "
        "MATCH (b:Paper) WHERE b.paperId = $paperId "
        "MERGE (b)-[:HAS_KEYWORD]->(a)",
        entity_name=entity_name, paperId=paperId,
    )

def create_in_topic_relationship(tx, paperId, topic_name):
    """Ensure a ``(Paper)-[:IN_TOPIC]->(Topic)`` relationship exists.

    Both nodes are looked up by their key property; if either is missing the
    query matches nothing and no relationship is written. MERGE is used
    instead of CREATE so repeated calls do not add parallel IN_TOPIC
    relationships.

    Args:
        tx: Transaction-like object exposing ``run(query, **params)``.
        paperId: ``paperId`` of the Paper node.
        topic_name: ``topic_name`` of the Topic node.
    """
    tx.run(
        "MATCH (topic:Topic) WHERE topic.topic_name = $topic_name "
        "MATCH (paper:Paper) WHERE paper.paperId = $paperId "
        "MERGE (paper)-[:IN_TOPIC]->(topic)",
        topic_name=topic_name, paperId=paperId,
    )

In [30]:
# Write every record to Neo4j: one Paper node per record, plus its Topic,
# Entity and Author nodes and the relationships linking them to the paper.
# The seen_* sets ensure each node is created only once within this run.
seen_papers = set()
seen_authors = set()
seen_ents = set()
seen_topics = set()

try:
    with driver.session() as session:
        for item in data:
            # A paperId that occurs more than once would otherwise get a second
            # Paper node and a second set of relationships.
            if item['paperId'] in seen_papers:
                continue
            seen_papers.add(item['paperId'])

            # Create the core paper node
            session.execute_write(create_paper_node, item['paperId'], item['title'], item['abstract'])

            if item['topic'] not in seen_topics:
                seen_topics.add(item['topic'])
                session.execute_write(create_topic_node, item['topic'])

            session.execute_write(create_in_topic_relationship, item['paperId'], item['topic'])

            # Create the entity nodes; records without keyphrases are tolerated
            for ent in item.get('ents', []):
                if ent not in seen_ents:
                    seen_ents.add(ent)
                    session.execute_write(create_entity_node, ent)

                session.execute_write(create_entity_relationship, item['paperId'], ent)

            # Create the author nodes and relationships
            for author in item.get('authors') or []:
                if author['authorId'] not in seen_authors:
                    seen_authors.add(author['authorId'])
                    session.execute_write(create_author_node, author['authorId'], author['name'])

                session.execute_write(create_authored_relationship, item['paperId'], author['authorId'])
finally:
    # Close the driver even when one of the writes above raises.
    driver.close()