In [7]:
from neo4j import GraphDatabase
import json
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
import os
import pickle
from typing import List
from SentenceGraph.core import SentenceGraph, Format, TextNodeType
from keybert import KeyBERT
from transformers.pipelines import pipeline
from transformers import AutoTokenizer

In [2]:
def savePickle(data, save_path) -> None:
    """Serialize ``data`` to ``save_path`` using :mod:`pickle`.

    Args:
        data: Any picklable object.
        save_path: Destination file path; opened in binary write mode.

    Raises:
        Exception: If opening the file or pickling fails. The message embeds
            the original error and the path; the original exception is
            attached as ``__cause__``.
    """
    try:
        with open(save_path, "wb") as f:
            pickle.dump(data, f)
    except Exception as e:
        # Chain explicitly so the root cause is reported as the direct cause.
        raise Exception(f"Error: {e} with trying to save pickle at: {save_path}") from e


def loadPickle(load_path) -> object:
    """Load and return the object pickled at ``load_path``.

    Args:
        load_path: Path of the pickle file; opened in binary read mode.

    Returns:
        Whatever object was stored in the file.

    Raises:
        Exception: If the file cannot be opened or unpickled. The message
            embeds the original error and the path; the original exception is
            attached as ``__cause__``.
    """
    try:
        with open(load_path, "rb") as f:
            # NOTE: unpickling can execute arbitrary code; only load files
            # written by this project.
            return pickle.load(f)
    except Exception as e:
        raise Exception(f"Error: {e} with trying to load pickle at: {load_path}") from e


In [3]:
# Sample exports to ingest, all located under `root_path`.
root_path = "./data/sample"
sample_data = [
    "weather_CO2.jsonl",
    "paleoclimate.jsonl",
    "rewilding.jsonl",
    "rockfish.jsonl",
    "arctic.jsonl",
    "climate.jsonl",
    "shark_climate.jsonl",
]

# Flat list of every record found under the "data" key of each file.
abstracts = []

for data_path in sample_data:
    with open(f'{root_path}/{data_path}', 'r') as json_file:
        json_list = json_file.readlines()

    # Only the first line of each file is parsed.
    result = json.loads(json_list[0])
    abstracts.extend(result["data"])

len(abstracts)

350

In [4]:
# Keep only the records whose "abstract" field is populated.
data = [paper for paper in abstracts if paper["abstract"] is not None]
len(data)

222

In [8]:
# Vectorizer handed to BERTopic: unigrams and bigrams, with English stop words
# removed.
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english")

# Embedding backend: a Hugging Face feature-extraction pipeline over SciBERT,
# with the tokenizer's maximum length set to 512.
# TODO: https://huggingface.co/allenai/specter might also be a good option
tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_cased", model_max_length=512)
topic_embedder = pipeline("feature-extraction", model="allenai/scibert_scivocab_cased", tokenizer=tokenizer)

topic_model = BERTopic(
    embedding_model=topic_embedder,
    vectorizer_model=vectorizer_model,
    language='english', calculate_probabilities=True,
    verbose=True
)
# Fit on the abstract of every record in `data`; `topics` is zipped against
# `data` by the main processing loop further down.
topics, probs = topic_model.fit_transform([x['abstract'] for x in data])

Some weights of the model checkpoint at allenai/scibert_scivocab_cased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 222/222 [01:54<00:00,  1.94it/s]
2023-02-04 22:20:54,384 - BERTopic - Transformed documents t

In [None]:
# spaCy pipeline; used further down to split abstracts into sentences.
nlp = spacy.load("en_core_web_md")
# Keyword extractor backed by the gsarti/scibert-nli model.
kw_extractor = KeyBERT(model="gsarti/scibert-nli")

In [18]:
# Experiment: KeyBERT with gsarti/scibert-nli and default extraction settings,
# applied to a single sample abstract (data[4]).
kw_extractor = KeyBERT(model="gsarti/scibert-nli")
keywords = kw_extractor.extract_keywords(data[4]['abstract'])
keywords

[('planetary', 0.207),
 ('summer', 0.1884),
 ('meteorological', 0.1599),
 ('daily', 0.1496),
 ('cumulus', 0.1469)]

In [22]:
# Experiment: KeyBERT with sentence-transformers/allenai-specter on the same
# sample abstract, asking for 1-3 word keyphrases, English stop-word removal,
# use_maxsum over 20 candidates, and the top 7 results.
kw_extractor = KeyBERT(model="sentence-transformers/allenai-specter")
keywords = kw_extractor.extract_keywords(data[4]['abstract'], keyphrase_ngram_range=(1, 3), stop_words='english',use_maxsum=True, nr_candidates=20, top_n=7)
keywords

[('transport', 0.601),
 ('cumulus', 0.6023),
 ('ensembles', 0.6156),
 ('winds', 0.6606),
 ('microphysics', 0.6634)]

In [None]:
# Experiment: the SPECTER-backed extractor with default extraction settings
# plus explicit English stop-word removal.
# `stop_words` is an option of `extract_keywords`; the KeyBERT constructor
# does not accept it, so it is passed at extraction time.
kw_extractor = KeyBERT(model="sentence-transformers/allenai-specter")
keywords = kw_extractor.extract_keywords(data[4]['abstract'], stop_words='english')
keywords

In [14]:
# Experiment: KeyBERT constructed with no arguments (its default model) on the
# same sample abstract. This is the last assignment to `kw_extractor` before
# the main processing loop when the notebook is run top to bottom.
kw_extractor = KeyBERT()
keywords = kw_extractor.extract_keywords(data[4]['abstract'])
keywords

[('meteorological', 0.4599),
 ('atmospheric', 0.3993),
 ('weather', 0.3982),
 ('co2', 0.381),
 ('winds', 0.3064)]

In [10]:
# Maps a normalized topic name to every keyword extracted from its papers.
topic_ent_dict = {}

# Main processing loop: annotate each record in place with its topic name
# ('topic'), its top keywords ('ents') and its sentences ('sentences').
for item, topicId in zip(data, topics):
    # A topic's name is its representative terms joined by underscores.
    topicNormalized = '_'.join([x[0] for x in topic_model.get_topic(topicId)])
    item['topic'] = topicNormalized

    abstract = item['abstract']

    if abstract is None:
        print("Found a none!")
        # Leave empty collections so later cells that read these keys do not
        # hit a KeyError.
        item['ents'] = []
        item['sentences'] = []
        continue

    # `kw_extractor` is a KeyBERT instance; extract_keywords returns
    # (keyword, score) pairs. Keep the two best keywords, lowercased.
    ents_keep = [kw.lower() for kw, _score in kw_extractor.extract_keywords(abstract, top_n=2)]
    item['ents'] = ents_keep

    # Accumulate into a list owned by the dict. Storing `ents_keep` itself
    # would alias item['ents'], and later extends for the same topic would
    # silently add other papers' keywords to this paper.
    topic_ent_dict.setdefault(topicNormalized, []).extend(ents_keep)

    # Process with spaCy
    doc = nlp(abstract)

    # TODO: Clean text and remove newlines/etc
    # doc.sents is a generator, so materialize the sentence texts as a list.
    item['sentences'] = [sent.text for sent in doc.sents]

In [49]:
# Run spaCy on one sample abstract after replacing newlines with spaces and
# stripping trailing whitespace, to inspect the cleaned text.
test = nlp(data[4]['abstract'].replace("\n", " ").rstrip())
test

Abstract. Atmospheric transport model errors are one of the main contributors to the uncertainty affecting CO2 inverse flux estimates. In this study, we determine the leading causes of transport errors over the US upper Midwest with a large set of simulations generated with the Weather Research and Forecasting (WRF) mesoscale model. The various WRF simulations are performed using different meteorological driver datasets and physical parameterizations including planetary boundary layer (PBL) schemes, land surface models (LSMs), cumulus parameterizations and microphysics parameterizations. All the different model configurations were coupled to CO2 fluxes and lateral boundary conditions from the CarbonTracker inversion system to simulate atmospheric CO2 mole fractions. PBL height, wind speed, wind direction, and atmospheric CO2 mole fractions are compared to observations during a month in the summer of 2008, and statistical analyses were performed to evaluate the impact of both physics pa

In [30]:
# Checkpoint the topic -> keywords mapping to disk. This runs before the
# de-duplication step in the next cell, so the saved lists may hold repeats.
savePickle(topic_ent_dict, './data/topic_ent_dict_checkpoint.pkl')

In [31]:
# Collapse each topic's keyword list to its unique values. Going through a set
# does not preserve the original keyword order.
entity_labels = topic_ent_dict.keys()
for topic, topic_keywords in topic_ent_dict.items():
    topic_ent_dict[topic] = list(set(topic_keywords))

# Creating the graph

In [33]:
# Connection settings are read from the environment so real credentials do not
# have to live in the notebook. The fallbacks are the local development values
# this notebook has always used.
uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
driver = GraphDatabase.driver(
    uri,
    auth=(
        os.environ.get("NEO4J_USER", "neo4j"),
        os.environ.get("NEO4J_PASSWORD", "hackathon"),
    ),
)

## Cypher 

### Node Creation queries


In [34]:
def create_paper_node(tx, paperId: str, title: str, abstract: str) -> None:
    """Create a ``Paper`` node with the given id, title and abstract.

    Args:
        tx: Transaction-like object exposing ``run(query, **params)``.
        paperId: Value stored in the node's ``paperId`` property.
        title: Value stored in the node's ``title`` property.
        abstract: Value stored in the node's ``abstract`` property.
    """
    query = "CREATE (a:Paper {paperId: $paperId, title: $title, abstract: $abstract})"
    params = {"paperId": paperId, "title": title, "abstract": abstract}
    tx.run(query, **params)

def create_keyword_node(tx, entity_name: str, label_name: str) -> None:
    """Create a ``Keyword`` node with ``entity_name`` and ``label_name`` properties.

    Args:
        tx: Transaction-like object exposing ``run(query, **params)``.
        entity_name: Keyword text stored on the node.
        label_name: Label stored alongside the keyword.
    """
    query = "CREATE (a:Keyword {entity_name: $entity_name, label_name: $label_name})"
    params = {"entity_name": entity_name, "label_name": label_name}
    tx.run(query, **params)

def create_topic_node(tx, topic_name: str) -> None:
    """Create a ``Topic`` node whose ``topic_name`` property is ``topic_name``.

    Args:
        tx: Transaction-like object exposing ``run(query, **params)``.
        topic_name: Name stored on the node.
    """
    query = "CREATE (a:Topic {topic_name: $topic_name})"
    params = {"topic_name": topic_name}
    tx.run(query, **params)
    
def create_sentence_node(tx, paperId: str, sentence_id: str, sentence_txt: str) -> None:
    """Create a ``Sentence`` node for one sentence of a paper.

    Args:
        tx: Transaction-like object exposing ``run(query, **params)``.
        paperId: Id of the paper, stored on the node.
        sentence_id: Identifier stored in the node's ``sentence_id`` property.
        sentence_txt: Sentence text stored on the node.
    """
    query = "CREATE (s:Sentence {paperId: $paperId, sentence_id: $sentence_id, sentence_txt: $sentence_txt})"
    params = {
        "paperId": paperId,
        "sentence_id": sentence_id,
        "sentence_txt": sentence_txt,
    }
    tx.run(query, **params)

def create_author_node(tx, authorId: str, author_name: str) -> None:
    """Create an ``Author`` node with ``authorId`` and ``author_name`` properties.

    Args:
        tx: Transaction-like object exposing ``run(query, **params)``.
        authorId: Identifier stored on the node.
        author_name: Display name stored on the node.
    """
    query = "CREATE (a:Author {authorId: $authorId, author_name: $author_name})"
    params = {"authorId": authorId, "author_name": author_name}
    tx.run(query, **params)

### Keyword Relationships

TODO: Let's start switching over to using "Keyword" instead of "Entity"; it is more user-friendly.

In [35]:
def create_paper_has_keyword_relationship(tx, paperId, entity_name):
    """Link a paper to a keyword with a ``HAS_KEYWORD`` edge (paper -> keyword).

    Args:
        tx: Transaction-like object exposing ``run(query, **params)``.
        paperId: ``paperId`` of the ``Paper`` node to match.
        entity_name: ``entity_name`` of the ``Keyword`` node to match.
    """
    clauses = (
        "MATCH (kw:Keyword) WHERE kw.entity_name = $entity_name",
        "MATCH (paper:Paper) WHERE paper.paperId = $paperId",
        "CREATE (paper)-[:HAS_KEYWORD]->(kw)",
    )
    tx.run(" ".join(clauses), entity_name=entity_name, paperId=paperId)

def create_keyword_in_paper_relationship(tx, paperId, entity_name):
    """Link a keyword to a paper with an ``IN_PAPER`` edge (keyword -> paper).

    Args:
        tx: Transaction-like object exposing ``run(query, **params)``.
        paperId: ``paperId`` of the ``Paper`` node to match.
        entity_name: ``entity_name`` of the ``Keyword`` node to match.
    """
    clauses = (
        "MATCH (kw:Keyword) WHERE kw.entity_name = $entity_name",
        "MATCH (paper:Paper) WHERE paper.paperId = $paperId",
        "CREATE (kw)-[:IN_PAPER]->(paper)",
    )
    tx.run(" ".join(clauses), entity_name=entity_name, paperId=paperId)

### Topic Relationships

In [36]:
def create_paper_in_topic_relationship(tx, paperId, topic_name):
    """Link a paper to a topic with an ``IN_TOPIC`` edge (paper -> topic).

    Args:
        tx: Transaction-like object exposing ``run(query, **params)``.
        paperId: ``paperId`` of the ``Paper`` node to match.
        topic_name: ``topic_name`` of the ``Topic`` node to match.
    """
    clauses = (
        "MATCH (topic:Topic) WHERE topic.topic_name = $topic_name",
        "MATCH (paper:Paper) WHERE paper.paperId = $paperId",
        "CREATE (paper)-[:IN_TOPIC]->(topic)",
    )
    tx.run(" ".join(clauses), topic_name=topic_name, paperId=paperId)

def create_topic_has_paper_relationship(tx, paperId, topic_name):
    """Link a topic to a paper with a ``HAS_PAPER`` edge (topic -> paper).

    Args:
        tx: Transaction-like object exposing ``run(query, **params)``.
        paperId: ``paperId`` of the ``Paper`` node to match.
        topic_name: ``topic_name`` of the ``Topic`` node to match.
    """
    clauses = (
        "MATCH (topic:Topic) WHERE topic.topic_name = $topic_name",
        "MATCH (paper:Paper) WHERE paper.paperId = $paperId",
        "CREATE (topic)-[:HAS_PAPER]->(paper)",
    )
    tx.run(" ".join(clauses), topic_name=topic_name, paperId=paperId)

def create_keyword_in_topic_relationship(tx, entity_name, topic_name):
    """Link a keyword to a topic with an ``IN_TOPIC`` edge (keyword -> topic).

    Args:
        tx: Transaction-like object exposing ``run(query, **params)``.
        entity_name: ``entity_name`` of the ``Keyword`` node to match.
        topic_name: ``topic_name`` of the ``Topic`` node to match.
    """
    clauses = (
        "MATCH (topic:Topic) WHERE topic.topic_name = $topic_name",
        "MATCH (kw:Keyword) WHERE kw.entity_name = $entity_name",
        "CREATE (kw)-[:IN_TOPIC]->(topic)",
    )
    tx.run(" ".join(clauses), topic_name=topic_name, entity_name=entity_name)

def create_topic_has_keyword_relationship(tx, entity_name, topic_name):
    """Link a topic to a keyword with a ``HAS_KEYWORD`` edge (topic -> keyword).

    Args:
        tx: Transaction-like object exposing ``run(query, **params)``.
        entity_name: ``entity_name`` of the ``Keyword`` node to match.
        topic_name: ``topic_name`` of the ``Topic`` node to match.
    """
    clauses = (
        "MATCH (topic:Topic) WHERE topic.topic_name = $topic_name",
        "MATCH (kw:Keyword) WHERE kw.entity_name = $entity_name",
        "CREATE (topic)-[:HAS_KEYWORD]->(kw)",
    )
    tx.run(" ".join(clauses), topic_name=topic_name, entity_name=entity_name)

### Sentence Relationships

In [37]:
def create_paper_has_sentence_relationship(tx, sentence_id, paperId):
    """Link a paper to a sentence with a ``HAS_SENTENCE`` edge (paper -> sentence).

    Args:
        tx: Transaction-like object exposing ``run(query, **params)``.
        sentence_id: ``sentence_id`` of the ``Sentence`` node to match.
        paperId: ``paperId`` of the ``Paper`` node to match.
    """
    clauses = (
        "MATCH (s:Sentence) WHERE s.sentence_id = $sentence_id",
        "MATCH (p:Paper) WHERE p.paperId = $paperId",
        "CREATE (p)-[:HAS_SENTENCE]->(s)",
    )
    tx.run(" ".join(clauses), sentence_id=sentence_id, paperId=paperId)

def create_sentence_in_paper_relationship(tx, sentence_id, paperId):
    """Link a sentence to a paper with an ``IN_PAPER`` edge (sentence -> paper).

    Args:
        tx: Transaction-like object exposing ``run(query, **params)``.
        sentence_id: ``sentence_id`` of the ``Sentence`` node to match.
        paperId: ``paperId`` of the ``Paper`` node to match.
    """
    clauses = (
        "MATCH (s:Sentence) WHERE s.sentence_id = $sentence_id",
        "MATCH (p:Paper) WHERE p.paperId = $paperId",
        "CREATE (s)-[:IN_PAPER]->(p)",
    )
    tx.run(" ".join(clauses), sentence_id=sentence_id, paperId=paperId)

def create_semantic_sentence_relationship(tx, sentence_id1, sentence_id2, score):
    """Link two sentences with a ``SIMILAR`` edge carrying a ``score`` property.

    The edge is directed from the first sentence to the second.

    Args:
        tx: Transaction-like object exposing ``run(query, **params)``.
        sentence_id1: ``sentence_id`` of the source ``Sentence`` node.
        sentence_id2: ``sentence_id`` of the target ``Sentence`` node.
        score: Value stored in the edge's ``score`` property.
    """
    clauses = (
        "MATCH (s1:Sentence) WHERE s1.sentence_id = $sentence_id1",
        "MATCH (s2:Sentence) WHERE s2.sentence_id = $sentence_id2",
        "CREATE (s1)-[:SIMILAR {score: $score}]->(s2)",
    )
    tx.run(
        " ".join(clauses),
        sentence_id1=sentence_id1,
        sentence_id2=sentence_id2,
        score=score,
    )

def create_keyword_in_sentence_relationship(tx, entity_name, sentence_id):
    """Link a keyword to a sentence with an ``IN_SENTENCE`` edge (keyword -> sentence).

    Args:
        tx: Transaction-like object exposing ``run(query, **params)``.
        entity_name: ``entity_name`` of the ``Keyword`` node to match.
        sentence_id: ``sentence_id`` of the ``Sentence`` node to match.
    """
    # The sentence is bound as `sent`, so the WHERE clause must filter on
    # `sent.sentence_id`; filtering on `s` referenced an undefined variable.
    tx.run("MATCH (sent:Sentence) WHERE sent.sentence_id = $sentence_id "
            "MATCH (kw:Keyword) WHERE kw.entity_name = $entity_name "
            "CREATE (kw)-[:IN_SENTENCE]->(sent)",
           sentence_id=sentence_id, entity_name=entity_name)

def create_sentence_has_keyword_relationship(tx, entity_name, sentence_id):
    """Link a sentence to a keyword with a ``HAS_KEYWORD`` edge (sentence -> keyword).

    Args:
        tx: Transaction-like object exposing ``run(query, **params)``.
        entity_name: ``entity_name`` of the ``Keyword`` node to match.
        sentence_id: ``sentence_id`` of the ``Sentence`` node to match.
    """
    # The sentence is bound as `sent`, so the WHERE clause must filter on
    # `sent.sentence_id`; filtering on `s` referenced an undefined variable.
    tx.run("MATCH (sent:Sentence) WHERE sent.sentence_id = $sentence_id "
            "MATCH (kw:Keyword) WHERE kw.entity_name = $entity_name "
            "CREATE (sent)-[:HAS_KEYWORD]->(kw)",
           sentence_id=sentence_id, entity_name=entity_name)

In [38]:
def create_authored_relationship(tx, paperId, authorId):
    """Create ``AUTHORED`` edges in both directions between an author and a paper.

    Args:
        tx: Transaction-like object exposing ``run(query, **params)``.
        paperId: ``paperId`` of the ``Paper`` node to match.
        authorId: ``authorId`` of the ``Author`` node to match.
    """
    # Each fragment ends with a space so the two CREATE clauses stay separate
    # tokens once the string literals are concatenated.
    # NOTE(review): the paper -> author edge is also typed AUTHORED; a distinct
    # type may read better, but that is a schema change — confirm with consumers.
    tx.run("MATCH (a:Author) WHERE a.authorId = $authorId "
            "MATCH (b:Paper) WHERE b.paperId = $paperId "
            "CREATE (a)-[:AUTHORED]->(b) "
            "CREATE (b)-[:AUTHORED]->(a)",
           authorId=authorId, paperId=paperId)

## Populating the graph (from scratch)
Run this code only during local development or if you're recreating a graph from scratch. It is not meant to touch the production graph.

In [8]:
def get_sentence_id(paperId: str, count: int) -> str:
    """Build a sentence identifier of the form ``"<paperId>-<count>"``.

    Args:
        paperId: Id of the paper the sentence belongs to.
        count: Position of the sentence within that paper.

    Returns:
        The combined identifier string.
    """
    # The original built the f-string but never returned it, so every caller
    # received None.
    return f"{paperId}-{count}"

In [39]:
# Track what has already been written so that shared nodes (authors, keywords,
# topics) are created only once even when several papers reference them.
seen_authors = set()
seen_keywords = set()
seen_topics = set()

with driver.session() as session:
    for item in data:
        # Create the core paper node
        session.execute_write(create_paper_node, item['paperId'], item['title'], item['abstract'])

        # Create the topic node on first sight, then link paper <-> topic.
        if item['topic'] not in seen_topics:
            seen_topics.add(item['topic'])
            session.execute_write(create_topic_node, item['topic'])

        session.execute_write(create_paper_in_topic_relationship, item['paperId'], item['topic'])
        session.execute_write(create_topic_has_paper_relationship, item['paperId'], item['topic'])

        # Create the keyword nodes
        for ent in item['ents']:
            if ent not in seen_keywords:
                seen_keywords.add(ent)
                # A keyword is labelled with the topic of the first paper it
                # was seen in.
                label_name = f"{item['topic']}"
                session.execute_write(create_keyword_node, ent, label_name)
                session.execute_write(create_keyword_in_topic_relationship, ent, item['topic'])
                session.execute_write(create_topic_has_keyword_relationship, ent, item['topic'])

            session.execute_write(create_paper_has_keyword_relationship, item['paperId'], ent)
            session.execute_write(create_keyword_in_paper_relationship, item['paperId'], ent)

        # Create the author nodes and relationships
        for author in item['authors']:
            if author['authorId'] not in seen_authors:
                seen_authors.add(author['authorId'])
                session.execute_write(create_author_node, author['authorId'], author['name'])

            session.execute_write(create_authored_relationship, item['paperId'], author['authorId'])

        # Create one node per sentence and link it to its paper in both directions.
        for count, sentence in enumerate(item['sentences']):
            sentence_id = get_sentence_id(item['paperId'], count)
            session.execute_write(create_sentence_node, item['paperId'], sentence_id, sentence)
            session.execute_write(create_paper_has_sentence_relationship, sentence_id, item['paperId'])
            session.execute_write(create_sentence_in_paper_relationship, sentence_id, item['paperId'])

# The driver is intentionally NOT closed here: the "Creating Sentence-Keyword
# relationships" cell below opens another session on the same `driver`.
# Call driver.close() once every graph-writing cell has run.

Transaction failed and will be retried in 0.9465203664762873s (Couldn't connect to localhost:7687 (resolved to ()):
Failed to establish connection to ResolvedIPv6Address(('::1', 7687, 0, 0)) (reason [WinError 10061] No connection could be made because the target machine actively refused it)
Failed to establish connection to ResolvedIPv4Address(('127.0.0.1', 7687)) (reason [WinError 10061] No connection could be made because the target machine actively refused it))
Transaction failed and will be retried in 1.9095661851587753s (Couldn't connect to localhost:7687 (resolved to ()):
Failed to establish connection to ResolvedIPv6Address(('::1', 7687, 0, 0)) (reason [WinError 10061] No connection could be made because the target machine actively refused it)
Failed to establish connection to ResolvedIPv4Address(('127.0.0.1', 7687)) (reason [WinError 10061] No connection could be made because the target machine actively refused it))


KeyboardInterrupt: 

### Creating Sentence-Keyword relationships
After we populate the graph with all the keywords, we can create any keyword/sentence relationships to enrich the graph with more granular mappings of where keywords might be located in the text. 

In [None]:
# For every sentence of every paper, connect it (in both directions) to each
# known keyword whose text occurs in the sentence.
with driver.session() as session:
    for item in data:
        for count, sentence in enumerate(item['sentences']):
            sentence_id = get_sentence_id(item['paperId'], count)
            # Keywords were lowercased when they were first collected, so only
            # the sentence needs lowering before the substring check.
            sentence_lower = sentence.lower()
            for keyword in seen_keywords:
                if keyword not in sentence_lower:
                    continue
                session.execute_write(create_keyword_in_sentence_relationship, keyword, sentence_id)
                session.execute_write(create_sentence_has_keyword_relationship, keyword, sentence_id)

### Creating the semantic  graph
Using the SentenceGraph package (😉) we can also create a similarity matrix between all the paper abstracts.

TODO: This was originally planned with sentence-level similarity in mind, but we should measure the compute cost first, since that creates far more edges than computing the similarities between abstracts.

In [5]:
from collections import namedtuple
# One TextNodeType per paper, built from its paperId and abstract; records
# without an abstract are skipped.
paper_nodes = []

for count, item in enumerate(data):
    if item['abstract'] is not None:
        paper_nodes.append(TextNodeType(item['paperId'], item['abstract']))

In [6]:
# SentenceGraph instance configured with the gsarti/scibert-nli model.
sentenceGraph = SentenceGraph(model_name="gsarti/scibert-nli")

In [7]:
# Build the graph over all paper nodes, requesting the Pandas output format,
# and display the result.
sim_graph = sentenceGraph.createGraph(paper_nodes, format=Format.Pandas)
sim_graph

Unnamed: 0,1a8013583dd77f9dd4cc7d54fad50312c7685143,cf2e18da6885b7f44e72216dde95a45efc04221a,dff3b7616297f52f37619b9cfef86a6624e97469,6e76815933e4ee7cb93cc175fac2e7a92798228a,055e7dad4b75e174807ef9b21502128a7d80141f,d3cce3624459f1cbb0e91790bf719ea855c466b3,2ce5bf06ebcbedc7d4be3186c79d3f4f9907fce3,c2bf66f4ea310d52f7c97834bb83d15eb9e3f260,7a2e8b1a5dc675ecf07e883644c0898289e77aea,1d7d686e743630b1c3225aea5361b014d0e00d8a,...,029629bf585bca44db93de26230c810ea7f09754,ae68029e9c3bae9fc719bd382c87a75471b05584,8755ef9fb736d8e74808dbcc51fb59d2229a5bdb,787688d21b11c067a51dc635682d98a078b536be,9fc38632ec56da09955f4f65fd4ff9c5ed5c22e7,5e3890002106dcfdc85eae179d05698182471b1f,8a8be0b7565d6ff1a9e2003686f037cf161434e0,8e2e7d7446568064eb75454c03fecfb6f02687fe,042167a71d7f16f3dde05a93c19728eb13cd08c3,b8a200589c50aa909e13a7bb2b05d92c07b6ec92
1a8013583dd77f9dd4cc7d54fad50312c7685143,1.000000,0.459637,0.263743,0.590329,0.422801,0.331429,0.452401,0.448926,0.479724,0.167367,...,0.120076,0.344435,0.264619,0.401102,0.338029,0.415100,0.139466,0.266890,0.399270,0.338335
cf2e18da6885b7f44e72216dde95a45efc04221a,0.459637,1.000000,0.218518,0.489537,0.466331,0.394165,0.561479,0.588183,0.349300,0.292236,...,0.189778,0.237999,0.082025,0.450878,0.233646,0.238506,0.076023,0.124299,0.375527,0.233427
dff3b7616297f52f37619b9cfef86a6624e97469,0.263743,0.218518,1.000000,0.347183,0.308128,0.278324,0.354708,0.277958,0.217595,0.299843,...,0.254710,0.050628,0.038151,0.196835,0.092076,0.203671,-0.109848,0.017265,0.035105,0.094935
6e76815933e4ee7cb93cc175fac2e7a92798228a,0.590329,0.489537,0.347183,1.000000,0.601934,0.525450,0.622337,0.623303,0.368770,0.444974,...,0.267220,0.270101,0.200949,0.433413,0.320499,0.385130,0.119686,0.213913,0.383534,0.319923
055e7dad4b75e174807ef9b21502128a7d80141f,0.422801,0.466331,0.308128,0.601934,1.000000,0.696281,0.692240,0.573306,0.332204,0.411865,...,0.241378,0.196287,0.130070,0.317937,0.258612,0.315173,0.115656,0.114490,0.334248,0.259629
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5e3890002106dcfdc85eae179d05698182471b1f,0.415100,0.238506,0.203671,0.385130,0.315173,0.255663,0.377126,0.235571,0.233394,0.150517,...,0.252593,0.555229,0.585951,0.500209,0.669019,1.000000,0.141863,0.637868,0.579914,0.669180
8a8be0b7565d6ff1a9e2003686f037cf161434e0,0.139466,0.076023,-0.109848,0.119686,0.115656,0.169218,0.145915,0.109430,0.259576,-0.036670,...,-0.048377,0.245453,0.168234,0.229477,0.267905,0.141863,1.000000,0.304684,0.220064,0.263994
8e2e7d7446568064eb75454c03fecfb6f02687fe,0.266890,0.124299,0.017265,0.213913,0.114490,0.019775,0.182836,0.151602,0.187214,0.032307,...,0.158997,0.565494,0.533360,0.501249,0.783880,0.637868,0.304684,1.000000,0.564081,0.782991
042167a71d7f16f3dde05a93c19728eb13cd08c3,0.399270,0.375527,0.035105,0.383534,0.334248,0.288436,0.393009,0.361973,0.268137,0.212846,...,0.145558,0.806661,0.434421,0.722260,0.693657,0.579914,0.220064,0.564081,1.000000,0.693238
