In [2]:
from neo4j import GraphDatabase
import pke
import json
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
import os
import pickle
import concise_concepts
from SentenceGraph.core import SentenceGraph, Format

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def savePickle(data, save_path) -> None:
    """Serialize ``data`` to ``save_path`` using :mod:`pickle`.

    Args:
        data: Any picklable object.
        save_path: Destination file path; it is opened in binary write mode.

    Raises:
        Exception: If the file cannot be opened or ``data`` cannot be pickled.
            The underlying error is attached as ``__cause__``.
    """
    try:
        with open(save_path, "wb") as f:
            pickle.dump(data, f)
    except Exception as e:
        # Chain explicitly so the root cause is reported as the direct cause
        # instead of as an error that happened "during handling".
        raise Exception(f"Error: {e} with trying to save pickle at: {save_path}") from e


def loadPickle(load_path) -> object:
    """Load and return the object pickled at ``load_path``.

    Only use this on files you trust: unpickling can execute arbitrary code.

    Args:
        load_path: Path of the pickle file; it is opened in binary read mode.

    Returns:
        The unpickled object.

    Raises:
        Exception: If the file cannot be opened or unpickled. The underlying
            error is attached as ``__cause__``.
    """
    try:
        with open(load_path, "rb") as f:
            return pickle.load(f)
    except Exception as e:
        # Chain explicitly so the root cause stays attached to the wrapper.
        raise Exception(f"Error: {e} with trying to load pickle at: {load_path}") from e


In [4]:
# Collect the paper records from every sample file into one flat list.
abstracts = []
root_path = "./data/sample"
sample_data = [
    "weather_CO2.jsonl",
    "paleoclimate.jsonl",
    "rewilding.jsonl",
    "rockfish.jsonl",
    "arctic.jsonl",
    "climate.jsonl",
    "shark_climate.jsonl",
]

for data_path in sample_data:
    with open(f'{root_path}/{data_path}', 'r') as json_file:
        json_list = list(json_file)

    # Only the first line of each file is parsed; its "data" entry holds the records.
    result = json.loads(json_list[0])
    abstracts.extend(result["data"])

len(abstracts)

350

In [5]:
# Keep only the records whose "abstract" field is not None.
data = [record for record in abstracts if record["abstract"] is not None]
len(data)

222

In [6]:
# CountVectorizer configured to drop English stop words and to use 1- and 2-grams.
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 2))

model = BERTopic(
    language='english',
    vectorizer_model=vectorizer_model,
    calculate_probabilities=True,
    verbose=True,
)

# Fit on the abstract texts; `topics` lines up one-to-one with `data`.
abstract_texts = [record['abstract'] for record in data]
topics, probs = model.fit_transform(abstract_texts)

Batches: 100%|██████████| 7/7 [00:20<00:00,  2.96s/it]
2022-12-16 15:13:28,775 - BERTopic - Transformed documents to Embeddings
2022-12-16 15:13:40,101 - BERTopic - Reduced dimensionality
2022-12-16 15:13:40,123 - BERTopic - Clustered reduced embeddings


In [10]:
# Initialize the keyphrase extraction model (pke's unsupervised TopicRank).
# NOTE(review): this single extractor instance is reused for every abstract in the
# processing loop — confirm the installed pke version resets its state in load_document.
extractor = pke.unsupervised.TopicRank()
# spaCy pipeline; the processing loop uses it to split each abstract into sentences.
nlp = spacy.load("en_core_web_md")

In [11]:
# Maps each normalized topic label to every keyphrase extracted from its papers.
topic_ent_dict = {}
# Main processing loop: label each paper with its topic, extract its top
# keyphrases, and split its abstract into sentences.
for item, topicId in zip(data, topics):
    # Topic label: first element of each entry returned by get_topic, joined with "_".
    topicNormalized = '_'.join([x[0] for x in model.get_topic(topicId)])
    item['topic'] = topicNormalized

    abstract = item['abstract']

    if abstract is None:
        print("Found a none!")
        continue

    # Extract the two best-ranked keyphrases for this abstract.
    extractor.load_document(input=abstract, language='en')
    extractor.candidate_selection()
    extractor.candidate_weighting()
    ents_keep = [x[0] for x in extractor.get_n_best(n=2)]
    item['ents'] = ents_keep

    # Accumulate into a list owned by the dict. Storing `ents_keep` itself would
    # alias item['ents'], so later extends for the same topic would silently grow
    # the first paper's keyphrase list.
    topic_ent_dict.setdefault(topicNormalized, []).extend(ents_keep)

    # Tokenize sentences; doc.sents is a generator, so materialize it.
    doc = nlp(abstract)
    item['sentences'] = list(doc.sents)

In [11]:
# Checkpoint the topic -> keyphrases mapping built by the processing loop.
# NOTE(review): this runs before the de-duplication cell that follows, so the
# pickle may contain repeated keyphrases — confirm that is intended.
savePickle(topic_ent_dict, './data/topic_ent_dict_checkpoint.pkl')

In [12]:
# De-duplicate the keyphrases of every topic (order is not preserved by set()).
entity_labels = topic_ent_dict.keys()
for topic, topic_keyphrases in topic_ent_dict.items():
    # Only existing keys are reassigned, so the dict size never changes mid-iteration.
    topic_ent_dict[topic] = list(set(topic_keyphrases))

In [19]:
# Checkpoint the per-paper records, which the processing loop extended with
# 'topic', 'ents' and 'sentences'.
# NOTE(review): item['sentences'] holds the objects yielded by spaCy's doc.sents;
# spaCy documents that spans cannot be pickled — confirm this save succeeds.
savePickle(data, './data/data_checkpoint.pkl')

# Creating the graph

In [22]:
# Connection settings can be overridden through environment variables so that
# credentials do not have to be edited into the notebook. The fallbacks are the
# original local-development values.
uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
driver = GraphDatabase.driver(
    uri,
    auth=(
        os.environ.get("NEO4J_USER", "neo4j"),
        os.environ.get("NEO4J_PASSWORD", "hackathon"),
    ),
)

## Populating the graph

### Node Creation queries


In [29]:
def create_paper_node(tx, paperId: str, title: str, abstract: str) -> None:
    """Create a ``Paper`` node with the given id, title and abstract.

    Issues a single parameterized Cypher ``CREATE`` through ``tx.run``.
    """
    query = "CREATE (a:Paper {paperId: $paperId, title: $title, abstract: $abstract})"
    params = {"paperId": paperId, "title": title, "abstract": abstract}
    tx.run(query, **params)

def create_entity_node(tx, entity_name: str, label_name: str) -> None:
    """Create an ``Entity`` node with the given ``entity_name`` and ``label_name``.

    Issues a single parameterized Cypher ``CREATE`` through ``tx.run``.
    """
    query = "CREATE (a:Entity {entity_name: $entity_name, label_name: $label_name})"
    params = {"entity_name": entity_name, "label_name": label_name}
    tx.run(query, **params)

def create_topic_node(tx, topic_name: str) -> None:
    """Create a ``Topic`` node whose ``topic_name`` property is ``topic_name``.

    Issues a single parameterized Cypher ``CREATE`` through ``tx.run``.
    """
    query = "CREATE (a:Topic {topic_name: $topic_name})"
    params = {"topic_name": topic_name}
    tx.run(query, **params)
    
def create_sentence_node(tx, paperId: str, sentence_id: str, sentence_txt: str) -> None:
    """Create a ``Sentence`` node with its paper id, sentence id and text.

    Issues a single parameterized Cypher ``CREATE`` through ``tx.run``.
    """
    query = (
        "CREATE (s:Sentence {paperId: $paperId, "
        "sentence_id: $sentence_id, sentence_txt: $sentence_txt})"
    )
    params = {
        "paperId": paperId,
        "sentence_id": sentence_id,
        "sentence_txt": sentence_txt,
    }
    tx.run(query, **params)

def create_author_node(tx, authorId: str, author_name: str) -> None:
    """Create an ``Author`` node with the given ``authorId`` and ``author_name``.

    Issues a single parameterized Cypher ``CREATE`` through ``tx.run``.
    """
    query = "CREATE (a:Author {authorId: $authorId, author_name: $author_name})"
    params = {"authorId": authorId, "author_name": author_name}
    tx.run(query, **params)

### Keyword Relationships

TODO: Let's start switching over to using `Keyword` instead of `Entity`; it is more user-friendly, IMO.

In [None]:
def create_paper_has_keyword_relationship(tx, paperId, entity_name):
    """Link a paper to a keyword: ``(paper)-[:HAS_KEYWORD]->(kw)``.

    The ``Entity`` node is matched on ``entity_name`` and the ``Paper`` node
    on ``paperId``.
    """
    query = " ".join((
        "MATCH (kw:Entity) WHERE kw.entity_name = $entity_name",
        "MATCH (paper:Paper) WHERE paper.paperId = $paperId",
        "CREATE (paper)-[:HAS_KEYWORD]->(kw)",
    ))
    params = {"entity_name": entity_name, "paperId": paperId}
    tx.run(query, **params)

def create_keyword_in_paper_relationship(tx, paperId, entity_name):
    """Link a keyword to a paper: ``(kw)-[:IN_PAPER]->(paper)``.

    The ``Entity`` node is matched on ``entity_name`` and the ``Paper`` node
    on ``paperId``.
    """
    query = " ".join((
        "MATCH (kw:Entity) WHERE kw.entity_name = $entity_name",
        "MATCH (paper:Paper) WHERE paper.paperId = $paperId",
        "CREATE (kw)-[:IN_PAPER]->(paper)",
    ))
    params = {"entity_name": entity_name, "paperId": paperId}
    tx.run(query, **params)

### Topic Relationships

In [None]:
def create_paper_in_topic_relationship(tx, paperId, topic_name):
    """Link a paper to its topic: ``(paper)-[:IN_TOPIC]->(topic)``.

    The ``Topic`` node is matched on ``topic_name`` and the ``Paper`` node
    on ``paperId``.
    """
    query = " ".join((
        "MATCH (topic:Topic) WHERE topic.topic_name = $topic_name",
        "MATCH (paper:Paper) WHERE paper.paperId = $paperId",
        "CREATE (paper)-[:IN_TOPIC]->(topic)",
    ))
    params = {"topic_name": topic_name, "paperId": paperId}
    tx.run(query, **params)

def create_topic_has_paper_relationship(tx, paperId, topic_name):
    """Link a topic to a paper: ``(topic)-[:HAS_PAPER]->(paper)``.

    The ``Topic`` node is matched on ``topic_name`` and the ``Paper`` node
    on ``paperId``.
    """
    query = " ".join((
        "MATCH (topic:Topic) WHERE topic.topic_name = $topic_name",
        "MATCH (paper:Paper) WHERE paper.paperId = $paperId",
        "CREATE (topic)-[:HAS_PAPER]->(paper)",
    ))
    params = {"topic_name": topic_name, "paperId": paperId}
    tx.run(query, **params)

def create_keyword_in_topic_relationship(tx, entity_name, topic_name):
    """Link a keyword to a topic: ``(kw)-[:IN_TOPIC]->(topic)``.

    The ``Topic`` node is matched on ``topic_name`` and the ``Entity`` node
    on ``entity_name``.
    """
    query = " ".join((
        "MATCH (topic:Topic) WHERE topic.topic_name = $topic_name",
        "MATCH (kw:Entity) WHERE kw.entity_name = $entity_name",
        "CREATE (kw)-[:IN_TOPIC]->(topic)",
    ))
    params = {"topic_name": topic_name, "entity_name": entity_name}
    tx.run(query, **params)

def create_topic_has_keyword_relationship(tx, entity_name, topic_name):
    """Link a topic to a keyword: ``(topic)-[:HAS_KEYWORD]->(kw)``.

    The ``Topic`` node is matched on ``topic_name`` and the ``Entity`` node
    on ``entity_name``.
    """
    query = " ".join((
        "MATCH (topic:Topic) WHERE topic.topic_name = $topic_name",
        "MATCH (kw:Entity) WHERE kw.entity_name = $entity_name",
        "CREATE (topic)-[:HAS_KEYWORD]->(kw)",
    ))
    params = {"topic_name": topic_name, "entity_name": entity_name}
    tx.run(query, **params)

### Sentence Relationships

In [15]:
def create_paper_has_sentence_relationship(tx, sentence_id, paperId):
    """Link a paper to one of its sentences: ``(p)-[:HAS_SENTENCE]->(s)``.

    The ``Sentence`` node is matched on ``sentence_id`` and the ``Paper``
    node on ``paperId``.
    """
    query = " ".join((
        "MATCH (s:Sentence) WHERE s.sentence_id = $sentence_id",
        "MATCH (p:Paper) WHERE p.paperId = $paperId",
        "CREATE (p)-[:HAS_SENTENCE]->(s)",
    ))
    params = {"sentence_id": sentence_id, "paperId": paperId}
    tx.run(query, **params)

def create_sentence_in_paper_relationship(tx, sentence_id, paperId):
    """Link a sentence to its paper: ``(s)-[:IN_PAPER]->(p)``.

    The ``Sentence`` node is matched on ``sentence_id`` and the ``Paper``
    node on ``paperId``.
    """
    query = " ".join((
        "MATCH (s:Sentence) WHERE s.sentence_id = $sentence_id",
        "MATCH (p:Paper) WHERE p.paperId = $paperId",
        "CREATE (s)-[:IN_PAPER]->(p)",
    ))
    params = {"sentence_id": sentence_id, "paperId": paperId}
    tx.run(query, **params)

def create_semantic_sentence_relationship(tx, sentence_id1, sentence_id2, score):
    """Link two sentences: ``(s1)-[:SIMILAR {score: $score}]->(s2)``.

    Both ``Sentence`` nodes are matched on ``sentence_id``; ``score`` is
    stored as a property of the relationship.
    """
    query = " ".join((
        "MATCH (s1:Sentence) WHERE s1.sentence_id = $sentence_id1",
        "MATCH (s2:Sentence) WHERE s2.sentence_id = $sentence_id2",
        "CREATE (s1)-[:SIMILAR {score: $score}]->(s2)",
    ))
    params = {
        "sentence_id1": sentence_id1,
        "sentence_id2": sentence_id2,
        "score": score,
    }
    tx.run(query, **params)

def create_keyword_in_sentence_relationship(tx, entity_name, sentence_id):
    """Link a keyword to a sentence: ``(kw)-[:IN_SENTENCE]->(sent)``.

    The ``Sentence`` node is matched on ``sentence_id`` and the ``Entity``
    node on ``entity_name``.

    Args:
        tx: Object exposing ``run(query, **parameters)``.
        entity_name: Value compared against ``Entity.entity_name``.
        sentence_id: Value compared against ``Sentence.sentence_id``.
    """
    # The WHERE clause must use the variable bound by MATCH (`sent`); the
    # previous query filtered on `s`, which is never bound in this query.
    tx.run("MATCH (sent:Sentence) WHERE sent.sentence_id = $sentence_id "
            "MATCH (kw:Entity) WHERE kw.entity_name = $entity_name "
            "CREATE (kw)-[:IN_SENTENCE]->(sent)",
           sentence_id=sentence_id, entity_name=entity_name)

def create_sentence_has_keyword_relationship(tx, entity_name, sentence_id):
    """Link a sentence to a keyword: ``(sent)-[:HAS_KEYWORD]->(kw)``.

    The ``Sentence`` node is matched on ``sentence_id`` and the ``Entity``
    node on ``entity_name``.

    Args:
        tx: Object exposing ``run(query, **parameters)``.
        entity_name: Value compared against ``Entity.entity_name``.
        sentence_id: Value compared against ``Sentence.sentence_id``.
    """
    # The WHERE clause must use the variable bound by MATCH (`sent`); the
    # previous query filtered on `s`, which is never bound in this query.
    tx.run("MATCH (sent:Sentence) WHERE sent.sentence_id = $sentence_id "
            "MATCH (kw:Entity) WHERE kw.entity_name = $entity_name "
            "CREATE (sent)-[:HAS_KEYWORD]->(kw)",
           sentence_id=sentence_id, entity_name=entity_name)

In [None]:
def create_authored_relationship(tx, paperId, authorId):
    """Link an author to a paper: ``(a)-[:AUTHORED]->(b)``.

    The ``Author`` node is matched on ``authorId`` and the ``Paper`` node
    on ``paperId``.
    """
    query = " ".join((
        "MATCH (a:Author) WHERE a.authorId = $authorId",
        "MATCH (b:Paper) WHERE b.paperId = $paperId",
        "CREATE (a)-[:AUTHORED]->(b)",
    ))
    params = {"authorId": authorId, "paperId": paperId}
    tx.run(query, **params)

In [16]:
# Track what has already been created so that each author, keyword and topic
# gets exactly one node during this run.
seen_authors = set()
seen_ents = set()
seen_topics = set()

with driver.session() as session:
    for item in data:
        paper_id = item['paperId']

        # Paper node first; every relationship below attaches to it.
        session.execute_write(create_paper_node, paper_id, item['title'], item['abstract'])

        # One Topic node per distinct topic label, then link the paper to it.
        topic_name = item['topic']
        if topic_name not in seen_topics:
            seen_topics.add(topic_name)
            session.execute_write(create_topic_node, topic_name)

        session.execute_write(create_paper_in_topic_relationship, paper_id, topic_name)

        # Keyword (Entity) nodes: a new entity is labelled with the topic of the
        # first paper it is seen in.
        for ent in item['ents']:
            if ent not in seen_ents:
                seen_ents.add(ent)
                label_name = f"{topic_name}"
                session.execute_write(create_entity_node, ent, label_name)

            session.execute_write(create_paper_has_keyword_relationship, paper_id, ent)

        # Author nodes and AUTHORED relationships.
        for author in item['authors']:
            author_id = author['authorId']
            if author_id not in seen_authors:
                seen_authors.add(author_id)
                session.execute_write(create_author_node, author_id, author['name'])

            session.execute_write(create_authored_relationship, paper_id, author_id)


driver.close()

NameError: name 'driver' is not defined