In [1]:
import json
import os
import pickle

import spacy
from bertopic import BERTopic
from keybert import KeyBERT
from neo4j import GraphDatabase
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# %%capture
# %pip install --upgrade joblib==1.1.0

In [3]:
def savePickle(data, save_path) -> None:
    """Serialize ``data`` with pickle and write it to ``save_path``.

    Args:
        data: Any picklable object.
        save_path: Destination file path; the file is opened in binary write mode.

    Raises:
        Exception: If opening the file or pickling fails. The original error is
            attached as ``__cause__`` so the full traceback stays available.
    """
    try:
        with open(save_path, "wb") as f:
            pickle.dump(data, f)
    except Exception as e:
        # Chain explicitly so the underlying error is reported as the direct cause.
        raise Exception(f"Error: {e} with trying to save pickle at: {save_path}") from e


def loadPickle(load_path) -> object:
    """Read ``load_path`` and return the object unpickled from it.

    Args:
        load_path: Path of the pickle file; opened in binary read mode.

    Returns:
        Whatever object was stored in the file.

    Raises:
        Exception: If opening the file or unpickling fails. The original error
            is attached as ``__cause__``.
    """
    try:
        with open(load_path, "rb") as f:
            # NOTE(review): unpickling can execute arbitrary code; only load
            # files produced by this project.
            return pickle.load(f)
    except Exception as e:
        # Chain explicitly so the underlying error is reported as the direct cause.
        raise Exception(f"Error: {e} with trying to load pickle at: {load_path}") from e


In [4]:
# Gather the paper records stored in each sample file under ./data/sample.
root_path = "./data/sample"
sample_data = [
    "weather_CO2.jsonl",
    "paleoclimate.jsonl",
    "rewilding.jsonl",
    "rockfish.jsonl",
    "arctic.jsonl",
    "climate.jsonl",
    "shark_climate.jsonl",
]

abstracts = []
for data_path in sample_data:
    with open(f'{root_path}/{data_path}', 'r') as json_file:
        json_list = json_file.readlines()

    # Only the first line of each file is parsed; its "data" entry holds the records.
    result = json.loads(json_list[0])
    abstracts.extend(result["data"])

len(abstracts)

350

In [5]:
# Keep only the records whose "abstract" field is populated.
data = [record for record in abstracts if record["abstract"] is not None]
len(data)

222

In [6]:
# Sentence encoder used as BERTopic's embedding model.
abstract_encoder = SentenceTransformer("sentence-transformers/allenai-specter")
# Custom vectorizer so English stop words are removed; it counts uni- and bigrams.
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 2))

topic_model = BERTopic(
    language='english',
    embedding_model=abstract_encoder,
    vectorizer_model=vectorizer_model,
    calculate_probabilities=True,
    verbose=True,
)

# Each abstract is flattened to a single line (newlines -> spaces, trailing
# whitespace stripped) before being handed to the topic model.
topics, probs = topic_model.fit_transform(
    [paper['abstract'].replace("\n", " ").rstrip() for paper in data]
)

Batches: 100%|██████████| 7/7 [01:45<00:00, 15.07s/it]
2023-03-27 18:30:59,827 - BERTopic - Transformed documents to Embeddings
2023-03-27 18:31:07,855 - BERTopic - Reduced dimensionality
2023-03-27 18:31:08,060 - BERTopic - Clustered reduced embeddings


In [7]:
# Persist the fitted topic model under models/.
topic_model.save("models/climate_scholar_specter")

In [8]:
# spaCy pipeline; in the cells shown here it is referenced only by the
# commented-out sentence-splitting code.
nlp = spacy.load("en_core_web_md")
# Keyword extractor that reuses the sentence encoder given to the topic model.
kw_extractor = KeyBERT(model=abstract_encoder)

In [9]:
topic_ent_dict = {}
# This loop is the main processing loop: it annotates every paper dict in
# `data` in place with a topic label, keywords and an abstract embedding, and
# accumulates the keywords seen for each topic in `topic_ent_dict`.
for item, topicId in zip(data, topics):
    # Topic label = the first element of each entry returned for the topic,
    # joined with underscores.
    topicNormalized = '_'.join([x[0] for x in topic_model.get_topic(topicId)])
    item['topic'] = topicNormalized

    # Same flattening that was applied to the abstracts before fitting.
    # (`data` only contains records with a non-None abstract, so no None check
    # is needed here; a None would already fail on `.replace`.)
    abstract = item['abstract'].replace("\n", " ").rstrip()

    # Extract keywords
    keywords = kw_extractor.extract_keywords(abstract, keyphrase_ngram_range=(1, 2), stop_words='english',use_maxsum=True, nr_candidates=20, top_n=3,  use_mmr=True, diversity=0.5)
    item['keywords'] = [x[0].lower() for x in keywords]

    # Accumulate into a list owned by the dict. Storing `item['keywords']`
    # itself (as before) aliased the first paper's list, so every later
    # `.extend` for the same topic also grew that paper's own keywords.
    topic_ent_dict.setdefault(topicNormalized, []).extend(item['keywords'])

    item['embeddings'] = abstract_encoder.encode(abstract).tolist()

    # We could map a paper to multiple topics using sentences!
    # TODO: Unsure if we're actually going to do anything regarding sentences. Might be adding too much noise
    # Process with spaCy
    # doc = nlp(item['abstract'])
    # item['sentences'] = [sent.text for sent in doc.sents]

In [10]:
# Checkpoint the topic -> keywords mapping to disk.
savePickle(topic_ent_dict, './data/topic_ent_dict_checkpoint.pkl')

# Creating the graph

In [20]:
# Connection settings can be overridden through environment variables so real
# credentials never need to be written into the notebook. The fallbacks are
# the original local-development values.
uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
driver = GraphDatabase.driver(
    uri,
    auth=(
        os.environ.get("NEO4J_USER", "neo4j"),
        os.environ.get("NEO4J_PASSWORD", "hackathon"),
    ),
)

## Cypher 

### Node Creation queries


In [21]:
# def create_paper_node(tx, paperId: str, title: str, abstract: str, embedding) -> None:
#     tx.run("CREATE (a:Paper {paperId: $paperId, title: $title, abstract: $abstract, embedding: $embedding})", paperId=paperId, title=title, abstract=abstract, embedding=embedding)

def create_paper_node(tx, paperId: str, title: str, abstract: str) -> None:
    """Create a ``Paper`` node with the given id, title and abstract.

    ``tx`` is the transaction object whose ``run`` method executes the query.
    """
    query = "CREATE (a:Paper {paperId: $paperId, title: $title, abstract: $abstract})"
    params = {"paperId": paperId, "title": title, "abstract": abstract}
    tx.run(query, **params)

def create_keyword_node(tx, entity_name: str) -> None:
    """Create a ``Keyword`` node whose ``entity_name`` property is the given name."""
    query = "CREATE (a:Keyword {entity_name: $entity_name})"
    tx.run(query, entity_name=entity_name)

def create_topic_node(tx, topic_name: str) -> None:
    """Create a ``Topic`` node whose ``topic_name`` property is the given name."""
    query = "CREATE (a:Topic {topic_name: $topic_name})"
    tx.run(query, topic_name=topic_name)
    
def create_sentence_node(tx, paperId: str, sentence_id: str, sentence_txt: str) -> None:
    """Create a ``Sentence`` node holding its paper id, sentence id and text."""
    query = (
        "CREATE (s:Sentence {paperId: $paperId, sentence_id: $sentence_id, "
        "sentence_txt: $sentence_txt})"
    )
    params = {
        "paperId": paperId,
        "sentence_id": sentence_id,
        "sentence_txt": sentence_txt,
    }
    tx.run(query, **params)

def create_author_node(tx, authorId: str, author_name: str) -> None:
    """Create an ``Author`` node with the given author id and display name."""
    query = "CREATE (a:Author {authorId: $authorId, author_name: $author_name})"
    params = {"authorId": authorId, "author_name": author_name}
    tx.run(query, **params)

### Keyword Relationships

TODO: Let's start switching over to using Keyword instead of Entity. It is more user-friendly, imo.

In [22]:
def create_paper_has_keyword_relationship(tx, paperId, entity_name):
    """Create a ``(Paper)-[:HAS_KEYWORD]->(Keyword)`` relationship.

    The paper is matched on ``paperId`` and the keyword on ``entity_name``.
    """
    query = " ".join([
        "MATCH (kw:Keyword) WHERE kw.entity_name = $entity_name",
        "MATCH (paper:Paper) WHERE paper.paperId = $paperId",
        "CREATE (paper)-[:HAS_KEYWORD]->(kw)",
    ])
    tx.run(query, entity_name=entity_name, paperId=paperId)

def create_keyword_in_paper_relationship(tx, paperId, entity_name):
    """Create a ``(Keyword)-[:IN_PAPER]->(Paper)`` relationship.

    The keyword is matched on ``entity_name`` and the paper on ``paperId``.
    """
    query = " ".join([
        "MATCH (kw:Keyword) WHERE kw.entity_name = $entity_name",
        "MATCH (paper:Paper) WHERE paper.paperId = $paperId",
        "CREATE (kw)-[:IN_PAPER]->(paper)",
    ])
    tx.run(query, entity_name=entity_name, paperId=paperId)

def create_keyword_cooccurence_relationship(tx, entity_name1, entity_name2):
    """Create ``COOCCURS_WITH`` relationships in both directions between two keywords.

    Both keywords are matched on their ``entity_name`` property.
    """
    # NOTE(review): the two CREATE clauses are concatenated without a
    # separating space; the query text is kept exactly as it was.
    query = (
        "MATCH (kw1:Keyword) WHERE kw1.entity_name = $entity_name1 "
        "MATCH (kw2:Keyword) WHERE kw2.entity_name = $entity_name2 "
        "CREATE (kw1)-[:COOCCURS_WITH]->(kw2)"
        "CREATE (kw2)-[:COOCCURS_WITH]->(kw1)"
    )
    params = {"entity_name1": entity_name1, "entity_name2": entity_name2}
    tx.run(query, **params)

### Topic Relationships

In [23]:
def create_paper_in_topic_relationship(tx, paperId, topic_name):
    """Create a ``(Paper)-[:IN_TOPIC]->(Topic)`` relationship.

    The paper is matched on ``paperId`` and the topic on ``topic_name``.
    """
    query = " ".join([
        "MATCH (topic:Topic) WHERE topic.topic_name = $topic_name",
        "MATCH (paper:Paper) WHERE paper.paperId = $paperId",
        "CREATE (paper)-[:IN_TOPIC]->(topic)",
    ])
    tx.run(query, topic_name=topic_name, paperId=paperId)

def create_topic_has_paper_relationship(tx, paperId, topic_name):
    """Create a ``(Topic)-[:HAS_PAPER]->(Paper)`` relationship.

    The topic is matched on ``topic_name`` and the paper on ``paperId``.
    """
    query = " ".join([
        "MATCH (topic:Topic) WHERE topic.topic_name = $topic_name",
        "MATCH (paper:Paper) WHERE paper.paperId = $paperId",
        "CREATE (topic)-[:HAS_PAPER]->(paper)",
    ])
    tx.run(query, topic_name=topic_name, paperId=paperId)

def create_keyword_in_topic_relationship(tx, entity_name, topic_name):
    """Create a ``(Keyword)-[:IN_TOPIC]->(Topic)`` relationship.

    The keyword is matched on ``entity_name`` and the topic on ``topic_name``.
    """
    query = " ".join([
        "MATCH (topic:Topic) WHERE topic.topic_name = $topic_name",
        "MATCH (kw:Keyword) WHERE kw.entity_name = $entity_name",
        "CREATE (kw)-[:IN_TOPIC]->(topic)",
    ])
    tx.run(query, topic_name=topic_name, entity_name=entity_name)

def create_topic_has_keyword_relationship(tx, entity_name, topic_name):
    """Create a ``(Topic)-[:HAS_KEYWORD]->(Keyword)`` relationship.

    The topic is matched on ``topic_name`` and the keyword on ``entity_name``.
    """
    query = " ".join([
        "MATCH (topic:Topic) WHERE topic.topic_name = $topic_name",
        "MATCH (kw:Keyword) WHERE kw.entity_name = $entity_name",
        "CREATE (topic)-[:HAS_KEYWORD]->(kw)",
    ])
    tx.run(query, topic_name=topic_name, entity_name=entity_name)

### Sentence Relationships

In [24]:
def create_paper_has_sentence_relationship(tx, sentence_id, paperId):
    """Create a ``(Paper)-[:HAS_SENTENCE]->(Sentence)`` relationship.

    The sentence is matched on ``sentence_id`` and the paper on ``paperId``.
    """
    query = " ".join([
        "MATCH (s:Sentence) WHERE s.sentence_id = $sentence_id",
        "MATCH (p:Paper) WHERE p.paperId = $paperId",
        "CREATE (p)-[:HAS_SENTENCE]->(s)",
    ])
    tx.run(query, sentence_id=sentence_id, paperId=paperId)

def create_sentence_in_paper_relationship(tx, sentence_id, paperId):
    """Create a ``(Sentence)-[:IN_PAPER]->(Paper)`` relationship.

    The sentence is matched on ``sentence_id`` and the paper on ``paperId``.
    """
    query = " ".join([
        "MATCH (s:Sentence) WHERE s.sentence_id = $sentence_id",
        "MATCH (p:Paper) WHERE p.paperId = $paperId",
        "CREATE (s)-[:IN_PAPER]->(p)",
    ])
    tx.run(query, sentence_id=sentence_id, paperId=paperId)

def create_semantic_sentence_relationship(tx, sentence_id1, sentence_id2, score):
    """Create a ``SIMILAR`` relationship from the first sentence to the second.

    Both sentences are matched on ``sentence_id``; ``score`` is stored as the
    relationship's ``score`` property.
    """
    query = " ".join([
        "MATCH (s1:Sentence) WHERE s1.sentence_id = $sentence_id1",
        "MATCH (s2:Sentence) WHERE s2.sentence_id = $sentence_id2",
        "CREATE (s1)-[:SIMILAR {score: $score}]->(s2)",
    ])
    params = {"sentence_id1": sentence_id1, "sentence_id2": sentence_id2, "score": score}
    tx.run(query, **params)

def create_keyword_in_sentence_relationship(tx, entity_name, sentence_id):
    """Create a ``(Keyword)-[:IN_SENTENCE]->(Sentence)`` relationship.

    The sentence is matched on ``sentence_id`` and the keyword on
    ``entity_name``.

    Args:
        tx: Transaction object whose ``run`` method executes the query.
        entity_name: Value of the keyword's ``entity_name`` property.
        sentence_id: Value of the sentence's ``sentence_id`` property.
    """
    # The sentence node is bound as `sent`, so the filter must use `sent` too;
    # the previous `s.sentence_id` referred to a variable that was never bound.
    tx.run("MATCH (sent:Sentence) WHERE sent.sentence_id = $sentence_id "
            "MATCH (kw:Keyword) WHERE kw.entity_name = $entity_name "
            "CREATE (kw)-[:IN_SENTENCE]->(sent)",
           sentence_id=sentence_id, entity_name=entity_name)

def create_sentence_has_keyword_relationship(tx, entity_name, sentence_id):
    """Create a ``(Sentence)-[:HAS_KEYWORD]->(Keyword)`` relationship.

    The sentence is matched on ``sentence_id`` and the keyword on
    ``entity_name``.

    Args:
        tx: Transaction object whose ``run`` method executes the query.
        entity_name: Value of the keyword's ``entity_name`` property.
        sentence_id: Value of the sentence's ``sentence_id`` property.
    """
    # The sentence node is bound as `sent`, so the filter must use `sent` too;
    # the previous `s.sentence_id` referred to a variable that was never bound.
    tx.run("MATCH (sent:Sentence) WHERE sent.sentence_id = $sentence_id "
            "MATCH (kw:Keyword) WHERE kw.entity_name = $entity_name "
            "CREATE (sent)-[:HAS_KEYWORD]->(kw)",
           sentence_id=sentence_id, entity_name=entity_name)

In [25]:
def create_authored_relationship(tx, paperId, authorId):
    """Create ``AUTHORED`` relationships in both directions between an author and a paper.

    The author is matched on ``authorId`` and the paper on ``paperId``.
    """
    # NOTE(review): the two CREATE clauses are concatenated without a
    # separating space, and the same relationship type is used in both
    # directions; the query text is kept exactly as it was.
    query = (
        "MATCH (a:Author) WHERE a.authorId = $authorId "
        "MATCH (b:Paper) WHERE b.paperId = $paperId "
        "CREATE (a)-[:AUTHORED]->(b)"
        "CREATE (b)-[:AUTHORED]->(a)"
    )
    params = {"authorId": authorId, "paperId": paperId}
    tx.run(query, **params)

## Populating the graph (from scratch)
Run this code only during local development or if you're recreating a graph from scratch. It is not meant to touch the production graph.

In [26]:
def get_sentence_id(paperId: str, count: int) -> str:
    """Build a sentence identifier of the form ``"<paperId>-<count>"``.

    Args:
        paperId: Identifier of the paper the sentence belongs to.
        count: Number to append after the paper id.

    Returns:
        The combined identifier string.
    """
    # The f-string used to be a bare expression statement, so the function
    # returned None; it must be returned.
    return f"{paperId}-{count}"

In [27]:
# Bookkeeping so that each shared node (author, keyword, topic) and each
# keyword-topic link is written only once per run of this cell.
seen_authors = set()
seen_keywords = set()
seen_topics = set()
seen_key_topic_pairs = {}  # keyword -> set of topics it has already been linked to

with driver.session() as session:
    for item in data:
        paper_id = item['paperId']

        # Create the core paper node
        # session.execute_write(create_paper_node, item['paperId'], item['title'], item['abstract'], item['embeddings'])
        session.execute_write(create_paper_node, paper_id, item['title'], item['abstract'])

        # Topic node (first sighting only) plus both paper<->topic edges.
        topic = item['topic']
        if topic not in seen_topics:
            seen_topics.add(topic)
            session.execute_write(create_topic_node, topic)

        session.execute_write(create_paper_in_topic_relationship, paper_id, topic)
        session.execute_write(create_topic_has_paper_relationship, paper_id, topic)

        # Create the entity nodes
        for ent in item['keywords']:
            if ent not in seen_keywords:
                #print(f"Creating keyword node for {ent}")
                seen_keywords.add(ent)
                session.execute_write(create_keyword_node, ent)

            # Link keyword and topic the first time this exact pair shows up.
            linked_topics = seen_key_topic_pairs.get(ent)
            if linked_topics is None or topic not in linked_topics:
                print(f"Creating keyword-topic relationship for {ent} and {topic}")
                session.execute_write(create_keyword_in_topic_relationship, ent, topic)
                session.execute_write(create_topic_has_keyword_relationship, ent, topic)
                seen_key_topic_pairs.setdefault(ent, set()).add(topic)

            # Paper<->keyword edges are written for every paper.
            session.execute_write(create_paper_has_keyword_relationship, paper_id, ent)
            session.execute_write(create_keyword_in_paper_relationship, paper_id, ent)

        # Create the keyword co-occurence relationships
        # for ent in item['keywords']:
        #     for ent2 in item['keywords']:
        #         if ent != ent2:
        #             session.execute_write(create_keyword_cooccurence_relationship, ent, ent2)

        # Create the author nodes and relationships
        for author in item['authors']:
            author_id = author['authorId']
            if author_id not in seen_authors:
                seen_authors.add(author_id)
                session.execute_write(create_author_node, author_id, author['name'])

            session.execute_write(create_authored_relationship, paper_id, author_id)

driver.close()

Creating keyword-topic relationship for seawater samples and co2_climate_ice_arctic_model_data_sea_atmospheric_global_weather
Creating keyword-topic relationship for global atmospheric and co2_climate_ice_arctic_model_data_sea_atmospheric_global_weather
Creating keyword-topic relationship for dioxide co2 and co2_climate_ice_arctic_model_data_sea_atmospheric_global_weather
Creating keyword-topic relationship for soil temperature and co2_climate_ice_arctic_model_data_sea_atmospheric_global_weather
Creating keyword-topic relationship for production vegetation and co2_climate_ice_arctic_model_data_sea_atmospheric_global_weather
Creating keyword-topic relationship for wavelet coherence and co2_climate_ice_arctic_model_data_sea_atmospheric_global_weather
Creating keyword-topic relationship for cascade refrigeration and co2_climate_ice_arctic_model_data_sea_atmospheric_global_weather
Creating keyword-topic relationship for natural refrigerants and co2_climate_ice_arctic_model_data_sea_atmosph