Example pipeline for text to KG (v1)

# Initialization

## User Defined Variables

In [1]:
SRC_DIR="/mnt/clbp/paul/cross-talk/texts.examples/"
REPORT_DIR="/mnt/clbp/paul/cross-talk/texts.examples/reports"
#!mkdir {REPORT_DIR}

## License Setup

In [2]:
import json
import os

with open("/mnt/clbp/jsl_lic.json") as f:
    license_keys = json.load(f)

# Defining license key-value pairs as local variables
locals().update(license_keys)

# Adding license key-value pairs to environment variables
os.environ.update(license_keys)

In [3]:
# Installing pyspark and spark-nlp
! pip install --upgrade -q pyspark==3.1.2 spark-nlp==$PUBLIC_VERSION

# Installing Spark NLP Healthcare
! pip install --upgrade -q spark-nlp-jsl==$JSL_VERSION  --extra-index-url https://pypi.johnsnowlabs.com/$SECRET

# Installing Spark NLP Display Library for visualization
! pip install -q spark-nlp-display

In [4]:
import json
import os
import functools
import numpy as np
import pandas as pd
from scipy import spatial

import sparknlp
import sparknlp_jsl

from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp_jsl.annotator import *

import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession

params = {"spark.driver.memory":"26G",
          "spark.kryoserializer.buffer.max":"2000M",
          "spark.driver.maxResultSize":"2000M"}

spark = sparknlp_jsl.start(license_keys['SECRET'],params=params)

print("Spark NLP Version :", sparknlp.version())
print("Spark NLP_JSL Version :", sparknlp_jsl.version())

spark

23/09/26 16:48:54 WARN Utils: Your hostname, paul resolves to a loopback address: 127.0.1.1; using 172.18.0.2 instead (on interface eth0)
23/09/26 16:48:54 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Ivy Default Cache set to: /home/myuser/.ivy2/cache
The jars for the packages stored in: /home/myuser/.ivy2/jars
com.johnsnowlabs.nlp#spark-nlp_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-ab19bc20-accb-46e3-9985-9cb8397d612e;1.0
	confs: [default]


:: loading settings :: url = jar:file:/home/myuser/.local/lib/python3.10/site-packages/pyspark/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


	found com.johnsnowlabs.nlp#spark-nlp_2.12;5.0.2 in central
	found com.typesafe#config;1.4.2 in central
	found org.rocksdb#rocksdbjni;6.29.5 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.828 in central
	found com.github.universal-automata#liblevenshtein;3.0.0 in central
	found com.google.protobuf#protobuf-java-util;3.0.0-beta-3 in central
	found com.google.protobuf#protobuf-java;3.0.0-beta-3 in central
	found com.google.code.gson#gson;2.3 in central
	found it.unimi.dsi#fastutil;7.0.12 in central
	found org.projectlombok#lombok;1.16.8 in central
	found com.google.cloud#google-cloud-storage;2.20.1 in central
	found com.google.guava#guava;31.1-jre in central
	found com.google.guava#failureaccess;1.0.1 in central
	found com.google.guava#listenablefuture;9999.0-empty-to-avoid-conflict-with-guava in central
	found com.google.errorprone#error_prone_annotations;2.18.0 in central
	found com.google.j2objc#j2objc-annotations;1.3 in central
	found com.google.http-client#google-http-clie

Spark NLP Version : 5.0.2
Spark NLP_JSL Version : 5.0.2


# Coresolution of Text

## Set up the pipelines

In [5]:
# Import the required modules and classes
from sparknlp.base import DocumentAssembler, Pipeline
from sparknlp.annotator import (
    SentenceDetector,
    Tokenizer,
    SpanBertCorefModel
)
import pyspark.sql.functions as F

# Step 1: Transforms raw texts to `document` annotation
document = DocumentAssembler() \
            .setInputCol("text") \
            .setOutputCol("document")

# Step 2: Sentence Detection
sentenceDetector = SentenceDetector() \
            .setInputCols("document") \
            .setOutputCol("sentences")

# Step 3: Tokenization
token = Tokenizer() \
            .setInputCols("sentences") \
            .setOutputCol("tokens") \
            .setContextChars(["(", ")", "?", "!", ".", ",", ":"])

# Step 4: Coreference Resolution
corefResolution= SpanBertCorefModel().pretrained("spanbert_base_coref")\
            .setInputCols(["sentences", "tokens"]) \
            .setOutputCol("corefs") \
            .setCaseSensitive(False)
            
# Define the pipeline
pipeline = Pipeline(stages=[document, sentenceDetector, token, corefResolution])

# Create the dataframe
data = spark.createDataFrame([["Ana is a Graduate Student at UT Dallas. She loves working in Natural Language Processing at the Institute. Her hobbies include blogging, dancing and singing."]]).toDF("text")

# Fit the dataframe to the pipeline to get the model
model = pipeline.fit(data)

spanbert_base_coref download started this may take some time.
Approximate size to download 540.1 MB
[ | ]spanbert_base_coref download started this may take some time.
Approximate size to download 540.1 MB
Download done! Loading the resource.
[ \ ]

2023-09-26 16:49:31.078665: I external/org_tensorflow/tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


[OK!]


In [6]:
def resolve(text,results):
    pieces = []
    from_pieces_for_eval = []
    to_pieces_for_eval = []
    current = 0
    for ix in results.index:
        row = results.loc[ix,"coref"]
        begin,end = row['begin'],row['end']
        metadata = row['metadata']
        head = metadata['head']
        if head != "ROOT":
            text2_up_to = text[current:begin]
            pieces.append(text2_up_to)
            pieces.append(head)
            
            from_pieces_for_eval.append(text2_up_to)
            from_pieces_for_eval.append("["+text[begin:end+1]+"]")

            to_pieces_for_eval.append(text2_up_to)
            to_pieces_for_eval.append("["+head+"]")

            current = (end+1)
    pieces.append(text[current:])
    from_pieces_for_eval.append(text[current:])
    to_pieces_for_eval.append(text[current:])
    return "".join(pieces),"".join(from_pieces_for_eval),"".join(to_pieces_for_eval)

def resolveText2Text(text):
    data_2 = spark.createDataFrame([[text]]).toDF("text")
    model = pipeline.fit(data_2)

    #results = model.transform(data_2).selectExpr("explode(corefs) AS coref").toPandas()
    #text2,from_text,to_text = resolve(text,results)
    try:
        results = model.transform(data_2).selectExpr("explode(corefs) AS coref").toPandas()
        text2,from_text,to_text = resolve(text,results)
    except:
        print("Failed on the following paragraph:")
        print(text)        
        text2 = text
        from_text,to_text = text,text
    return text2,from_text,to_text

In [7]:
!mkdir $SRC_DIR/coref_resolved/

mkdir: cannot create directory ‘/mnt/clbp/paul/cross-talk/texts.examples//coref_resolved/’: File exists


In [8]:
import glob
import json
path = SRC_DIR
files = glob.glob(path + '/*.json')
changed_sentences = []
unchanged_sentences = []
for file in files:
    print(file)
    with open(file) as f:
        data = json.load(f)
    paragraphs = data['text'].split("\n\n")
    new_paragraphs = []
    for text in paragraphs:
        text2,from_text,to_text = resolveText2Text(text)
        if text2 != text:
            changed_sentences.append([text,from_text,to_text,"?"])
            #print("Orig text:",text)
            #print("From text:",from_text)
            #print("To text  :",to_text)
            #print()
        else:
            unchanged_sentences.append(text)
        new_paragraphs.append(text2)
    new_file = path+"/coref_resolved/"+file.split("/")[-1]
    with open(new_file,"w") as f:
        data['text'] = "\n\n".join(new_paragraphs)
        f.write(json.dumps(data))

    orig_contents = json.loads(open(file).read())
    orig_sentences = orig_contents["text"].split(".")
    new_contents = json.loads(open(new_file).read())
    new_sentences = new_contents["text"].split(".")

/mnt/clbp/paul/cross-talk/texts.examples/physical_activity.json


                                                                                

/mnt/clbp/paul/cross-talk/texts.examples/spinal_cord.json


                                                                                

/mnt/clbp/paul/cross-talk/texts.examples/personal_medical_history.json


                                                                                

In [9]:
# Step 1: Transforms raw texts to `document` annotation
document = DocumentAssembler() \
            .setInputCol("text") \
            .setOutputCol("document")

# Step 2: Sentence Detection
sentenceDetector = SentenceDetector() \
            .setInputCols("document") \
            .setOutputCol("sentences")

# Step 3: Tokenization
token = Tokenizer() \
            .setInputCols("sentences") \
            .setOutputCol("tokens") \
            .setContextChars(["(", ")", "?", "!", ".", ","])


# Step 4: Coreference Resolution
corefResolution= SpanBertCorefModel().pretrained("spanbert_base_coref")\
            .setInputCols(["sentences", "tokens"]) \
            .setOutputCol("corefs") \
            .setCaseSensitive(False)

# Define the pipeline
pipeline = Pipeline(stages=[document, sentenceDetector, token, corefResolution])

# Create the dataframe
data = spark.createDataFrame([["The spinal column is biomechanically stabilized by three subsystems: 1) passive subsystem that includes bone, cartilage, ligaments, intervertebral disc; 2) active subsystem that includes the paraspinal muscles; and 3) the neural control subsystem. These subsystems are often conceptualized separately, but they are functionally interdependent. Motor control and function include muscle recruitment, strength, and endurance. An important feedback component of neuromotor control is proprioception, which refers to afferent information arising from internal peripheral areas that contribute to postural control, joint stability, and several conscious sensations. Movement and control disorders presumably lead to a proprioceptive deficit, because of stress on local muscle spindles and joint receptors in the painful area resulting from stress to a joint caused by an individual’s maladaptive movement. Subsequently, abnormal joint and tissue loading during daily activities and postures may affect local proprioceptors and maintain this vicious cycle. Changes in muscle activity have been linked to spinal pain (muscle-tension or pain-spasm-pain model) or restriction of spinal motion (pain adaptation)."]]).toDF("text")

# Fit the dataframe to the pipeline to get the model
model = pipeline.fit(data)

results = model.transform(data).toPandas()

spanbert_base_coref download started this may take some time.
Approximate size to download 540.1 MB
[OK!]


                                                                                

# Text to triplets

In [10]:
import openai

openai.api_key = open("/mnt/clbp/.openai_api_key.txt").read().strip()

max_tokens = 4097
min_num_words_in_sentences = 4
min_num_words_in_paragraph = min_num_words_in_sentences
min_len_sentence = min_num_words_in_sentences*4

In [11]:
!mkdir $SRC_DIR/coref_resolved/triplets/

mkdir: cannot create directory ‘/mnt/clbp/paul/cross-talk/texts.examples//coref_resolved/triplets/’: File exists


In [32]:
import time
import io
import glob

min_num_words_in_paragraph = 3

def get_triplets(contents,field):
    all_sentences_with_results = []
    all_sentences_without_results = []
    
    output = []
    paragraphs = contents[field].split("\n\n")
    for paragraph in paragraphs:
        output.append({"paragraph": {"text":paragraph}})
        num_words = len(paragraph.split(" "))
        if num_words < min_num_words_in_paragraph:
            continue

        sentences_with_results = []
        sentences_without_results = []
        for sent in paragraph.split("."):
            print(sent[:10])
            if sent.strip() == "":
                continue
            messages=[
                  {"role": "system", "content": "Rewrite a sentence such that each sentence has one subject and one object. Keep the meaning the same and write in active voice. Your answer should be a numbered list."},
                  {"role": "user", "content": sent }
            ]
            finished = False
            while not finished:
                print("Trying openai 1")
                try:
                    response = openai.ChatCompletion.create(
                      model="gpt-3.5-turbo",
                      messages=messages,
                      temperature=0,
                      max_tokens=1024,
                      top_p=1,
                      frequency_penalty=0,
                      presence_penalty=0
                    )
                    finished = True
                except Exception as e:
                    print(e)
            sentences = response['choices'][0]['message']['content'].split("\n")
            if sentences[0].strip() == "None":
                continue
            sentences_with_results += [" ".join(s.split(" ")[1:]) for s in sentences] # Remove the number
            
        output[-1]["paragraph"]["sentences_with_results"] = [{"text":s, "clauses": []} for s in sentences_with_results]
        output[-1]["paragraph"]["sentences_without_results"] = [{"text":s, "clauses": []} for s in sentences_without_results]

        print("Sentences with results:")
        print("\n".join(sentences_with_results))
        for j,sentence in enumerate(sentences_with_results):
            print("Sentence",j+1,"out of",len(sentences_with_results))

            finished = False
            while not finished:
                print("Trying openai 2")
                try:
                    response = openai.ChatCompletion.create(
                        model="gpt-3.5-turbo",
                        messages=[
                            {"role": "system", "content": "Break down this sentence into more straightforward sentences. Your answer should be a numbered list."},
                            {"role": "user", "content": sentence }
                        ],
                        temperature=0,
                        max_tokens=1024,
                        top_p=1,
                        frequency_penalty=0,
                        presence_penalty=0
                    )
                    finished = True
                except Exception as e:
                    print(e)
            time.sleep(1)
            clauses = [" ".join(clause.split(" ")[1:]) for clause in response['choices'][0]['message']['content'].split("\n")]
            paragraph_clauses = " ".join(clauses)
            clauses = []
            text2,from_text,to_text = resolveText2Text(paragraph_clauses)
            for c in text2.strip().split("."):
                c = c.strip()
                if c == "":
                    continue
                clauses.append(f"{c}.")
            output[-1]["paragraph"]["sentences_with_results"][j]["clauses"] = [{"text": c} for c in clauses]
            for i, clause in enumerate(clauses):  
                print("Clause",i+1,"out of",len(clauses))
                finished = False
                while not finished:
                    print("Trying openai 3")
                    try:
                        response = openai.ChatCompletion.create(
                            model="gpt-3.5-turbo",
                            messages=[
                                {"role": "system", "content": "You will be provided a sentence, and your task is split it into a subject, verb, and object. Return your answer as JSON with keys subject, verb, object."},
                                {"role": "user", "content": clause }
                            ],
                            temperature=0,
                            max_tokens=1024,
                            top_p=1,
                            frequency_penalty=0,
                            presence_penalty=0
                        )
                        finished = True
                    except Exception as e:
                        print(e)
                time.sleep(1)
                content = response['choices'][0]['message']['content']
                output[-1]["paragraph"]["sentences_with_results"][j]["clauses"][i]["triplet"] = json.loads(content)
    return output

In [29]:
contents = {"text": "A patient's personal history also includes personal preferences, expectations, habits, such as sleep schedule, clothing choices, and diet. A patient's personal history also includes games, sports, and relationships."}
outputs = get_triplets(contents,"text")

Trying openai 1
Trying openai 1
Sentences with results:
The patient's personal history includes personal preferences.
The patient's personal history includes expectations.
The patient's personal history includes habits, such as sleep schedule.
The patient's personal history includes habits, such as clothing choices.
The patient's personal history includes habits, such as diet.
Games are included in a patient's personal history.
Sports are included in a patient's personal history.
Relationships are included in a patient's personal history.
Sentence 1 out of 8
Trying openai 2


                                                                                

Clause 1 out of 2
Trying openai 3
Clause 2 out of 2
Trying openai 3
Sentence 2 out of 8
Trying openai 2


                                                                                

Clause 1 out of 2
Trying openai 3
Clause 2 out of 2
Trying openai 3
Sentence 3 out of 8
Trying openai 2


                                                                                

Clause 1 out of 3
Trying openai 3
Clause 2 out of 3
Trying openai 3
Clause 3 out of 3
Trying openai 3
Sentence 4 out of 8
Trying openai 2


                                                                                

Clause 1 out of 3
Trying openai 3
Clause 2 out of 3
Trying openai 3
Clause 3 out of 3
Trying openai 3
Sentence 5 out of 8
Trying openai 2


                                                                                

Clause 1 out of 3
Trying openai 3
Clause 2 out of 3
Trying openai 3
Clause 3 out of 3
Trying openai 3
Sentence 6 out of 8
Trying openai 2


                                                                                

Clause 1 out of 3
Trying openai 3
Clause 2 out of 3
Trying openai 3
Clause 3 out of 3
Trying openai 3
Sentence 7 out of 8
Trying openai 2


                                                                                

Clause 1 out of 2
Trying openai 3
Clause 2 out of 2
Trying openai 3
Sentence 8 out of 8
Trying openai 2


                                                                                

Clause 1 out of 4
Trying openai 3
Clause 2 out of 4
Trying openai 3
Clause 3 out of 4
Trying openai 3
Clause 4 out of 4
Trying openai 3


In [30]:
outputs

[{'paragraph': {'text': "A patient's personal history also includes personal preferences, expectations, habits, such as sleep schedule, clothing choices, and diet. A patient's personal history also includes games, sports, and relationships.",
   'sentences_with_results': [{'text': "The patient's personal history includes personal preferences.",
     'clauses': [{'text': 'The patient has a personal history.',
       'triplet': {'subject': 'patient',
        'verb': 'has',
        'object': 'a personal history'}},
      {'text': 'a personal history includes personal preferences.',
       'triplet': {'subject': 'a personal history',
        'verb': 'includes',
        'object': 'personal preferences'}}]},
    {'text': "The patient's personal history includes expectations.",
     'clauses': [{'text': 'The patient has a personal history.',
       'triplet': {'subject': 'patient',
        'verb': 'has',
        'object': 'a personal history'}},
      {'text': 'a personal history includes exp

In [36]:
!ls -l {outfile}

-rw-rw-r-- 1 myuser myuser 9960 Sep 26 21:29 /mnt/clbp/paul/cross-talk/texts.examples//coref_resolved/triplets/personal_medical_history.json


In [33]:
path = SRC_DIR
files = glob.glob(path + '/coref_resolved/*.json')
for file in files[1:]:
    print(file)
    with open(file) as f:
        data = json.load(f)
    output = get_triplets(data,"text")
    outfile = path + '/coref_resolved/triplets/'+file.split("/")[-1]
    open(outfile,"w").write(json.dumps(output))

/mnt/clbp/paul/cross-talk/texts.examples//coref_resolved/spinal_cord.json
The spinal
Trying openai 1
 three sub
Trying openai 1
 Motor con
Trying openai 1
 An import
Trying openai 1
 Movement 
Trying openai 1
 Subsequen
Trying openai 1
 Changes i
Trying openai 1

Sentences with results:
Three subsystems biomechanically stabilize the spinal column: the passive subsystem includes bone, cartilage, ligaments, and intervertebral disc.
The paraspinal muscles form the active subsystem that biomechanically stabilizes the spinal column.
The neural control subsystem also plays a role in biomechanically stabilizing the spinal column.
People often conceptualize three subsystems separately, but the paraspinal muscles are functionally interdependent.
Many individuals tend to conceptualize three subsystems separately, but the paraspinal muscles are functionally interdependent.
It is common for individuals to conceptualize three subsystems separately, but the paraspinal muscles are functionally interd

                                                                                

Clause 1 out of 3
Trying openai 3
Clause 2 out of 3
Trying openai 3
Clause 3 out of 3
Trying openai 3
Sentence 2 out of 18
Trying openai 2


                                                                                

Clause 1 out of 5
Trying openai 3
Clause 2 out of 5
Trying openai 3
Clause 3 out of 5
Trying openai 3
Clause 4 out of 5
Trying openai 3
Clause 5 out of 5
Trying openai 3
Sentence 3 out of 18
Trying openai 2


                                                                                

Clause 1 out of 2
Trying openai 3
Clause 2 out of 2
Trying openai 3
Sentence 4 out of 18
Trying openai 2


                                                                                

Clause 1 out of 2
Trying openai 3
Clause 2 out of 2
Trying openai 3
Sentence 5 out of 18
Trying openai 2


                                                                                

Clause 1 out of 2
Trying openai 3
Clause 2 out of 2
Trying openai 3
Sentence 6 out of 18
Trying openai 2


                                                                                

Clause 1 out of 2
Trying openai 3
Clause 2 out of 2
Trying openai 3
Sentence 7 out of 18
Trying openai 2


                                                                                

Clause 1 out of 2
Trying openai 3
Clause 2 out of 2
Trying openai 3
Sentence 8 out of 18
Trying openai 2
Clause 1 out of 2
Trying openai 3
Clause 2 out of 2
Trying openai 3
Sentence 9 out of 18
Trying openai 2


                                                                                

Clause 1 out of 2
Trying openai 3
Clause 2 out of 2
Trying openai 3
Sentence 10 out of 18
Trying openai 2


                                                                                

Clause 1 out of 2
Trying openai 3
Clause 2 out of 2
Trying openai 3
Sentence 11 out of 18
Trying openai 2


                                                                                

Clause 1 out of 3
Trying openai 3
Clause 2 out of 3
Trying openai 3
Clause 3 out of 3
Trying openai 3
Sentence 12 out of 18
Trying openai 2


                                                                                

Clause 1 out of 3
Trying openai 3
Clause 2 out of 3
Trying openai 3
Clause 3 out of 3
Trying openai 3
Sentence 13 out of 18
Trying openai 2


                                                                                

Clause 1 out of 8
Trying openai 3
Clause 2 out of 8
Trying openai 3
Clause 3 out of 8
Trying openai 3
Clause 4 out of 8
Trying openai 3
Clause 5 out of 8
Trying openai 3
Clause 6 out of 8
Trying openai 3
Clause 7 out of 8
Trying openai 3
Clause 8 out of 8
Trying openai 3
Sentence 14 out of 18
Trying openai 2


                                                                                

Clause 1 out of 4
Trying openai 3
Clause 2 out of 4
Trying openai 3
Clause 3 out of 4
Trying openai 3
Clause 4 out of 4
Trying openai 3
The server is overloaded or not ready yet.
Trying openai 3
Sentence 15 out of 18
Trying openai 2


                                                                                

Clause 1 out of 2
Trying openai 3
Clause 2 out of 2
Trying openai 3
Sentence 16 out of 18
Trying openai 2


                                                                                

Clause 1 out of 2
Trying openai 3
Clause 2 out of 2
Trying openai 3
Sentence 17 out of 18
Trying openai 2


                                                                                

Clause 1 out of 3
Trying openai 3
Clause 2 out of 3
Trying openai 3
Clause 3 out of 3
Trying openai 3
Sentence 18 out of 18
Trying openai 2


                                                                                

Clause 1 out of 3
Trying openai 3
Clause 2 out of 3
Trying openai 3
Clause 3 out of 3
Trying openai 3
/mnt/clbp/paul/cross-talk/texts.examples//coref_resolved/personal_medical_history.json
The person
Trying openai 1
 the perso
Trying openai 1
 The patie
Trying openai 1
 All of th
Trying openai 1

Sentences with results:
A patient's personal history includes their employment status.
A patient's personal history includes their occupation.
A patient's personal history includes their marital status.
A patient's personal history includes their insurance status.
A patient's personal history includes social history, which encompasses alcohol, tobacco, and substance use.
A patient's personal history also includes personal preferences, expectations, and habits such as sleep schedule, clothing choices, and diet.
The medical history of the patient's isolated and chronic illnesses is important.
The isolated and chronic illnesses of the patient have an important medical history.
Each individual lik

                                                                                

Clause 1 out of 4
Trying openai 3
Clause 2 out of 4
Trying openai 3
Clause 3 out of 4
Trying openai 3
Clause 4 out of 4
Trying openai 3
Sentence 2 out of 13
Trying openai 2


                                                                                

Clause 1 out of 5
Trying openai 3
Clause 2 out of 5
Trying openai 3
Clause 3 out of 5
Trying openai 3
Clause 4 out of 5
Trying openai 3
Clause 5 out of 5
Trying openai 3
Sentence 3 out of 13
Trying openai 2


                                                                                

Clause 1 out of 2
Trying openai 3
Clause 2 out of 2
Trying openai 3
Sentence 4 out of 13
Trying openai 2


                                                                                

Clause 1 out of 2
Trying openai 3
Clause 2 out of 2
Trying openai 3
Sentence 5 out of 13
Trying openai 2


                                                                                

Clause 1 out of 4
Trying openai 3
Clause 2 out of 4
Trying openai 3
Clause 3 out of 4
Trying openai 3
Clause 4 out of 4
Trying openai 3
Sentence 6 out of 13
Trying openai 2


                                                                                

Clause 1 out of 2
Trying openai 3
Clause 2 out of 2
Trying openai 3
Sentence 7 out of 13
Trying openai 2


                                                                                

Clause 1 out of 2
Trying openai 3
Clause 2 out of 2
Trying openai 3
Sentence 8 out of 13
Trying openai 2


                                                                                

Clause 1 out of 3
Trying openai 3
Clause 2 out of 3
Trying openai 3
Clause 3 out of 3
Trying openai 3
Sentence 9 out of 13
Trying openai 2


                                                                                

Clause 1 out of 2
Trying openai 3
Clause 2 out of 2
Trying openai 3
Sentence 10 out of 13
Trying openai 2


                                                                                

Clause 1 out of 4
Trying openai 3
Clause 2 out of 4
Trying openai 3
Clause 3 out of 4
Trying openai 3
Clause 4 out of 4
Trying openai 3
Sentence 11 out of 13
Trying openai 2


                                                                                

Clause 1 out of 3
Trying openai 3
Clause 2 out of 3
Trying openai 3
Clause 3 out of 3
Trying openai 3
Sentence 12 out of 13
Trying openai 2


                                                                                

Clause 1 out of 3
Trying openai 3
Clause 2 out of 3
Trying openai 3
Clause 3 out of 3
Trying openai 3
Sentence 13 out of 13
Trying openai 2


                                                                                

Clause 1 out of 2
Trying openai 3
Clause 2 out of 2
Trying openai 3


23/09/27 04:02:50 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 176025 ms exceeds timeout 120000 ms
23/09/27 04:02:50 WARN SparkContext: Killing executors is not supported by current scheduler.


# Triplets to graph

In [37]:
import glob
from py2neo import Graph
graph = Graph("bolt://localhost:7687",password='12345678',name="version1")

tx = graph.begin()
#tx.run("MATCH (n:Paragraph) DETACH DELETE n;")
#tx.run("MATCH (n:ResultSentence) DETACH DELETE n;")
#tx.run("MATCH (n:Subject) DETACH DELETE n;")
#tx.run("MATCH (n:Object) DETACH DELETE n;")
#tx.run("MATCH (n:Entity) DETACH DELETE n;")
tx.run("MATCH (n) DETACH DELETE n;")
graph.commit(tx)

path = '/mnt/clbp/paul/cross-talk/texts.examples/coref_resolved/triplets/'
files = glob.glob(path+'/*.json')
for file in files:
    print(file)
    with open(file) as f:
        data = json.load(f)
    name = file.split("/")[-1]
    domain = name.split(".")[0]
    tx = graph.begin()
    for ix,item in enumerate(data):
        paragraph = item['paragraph']
        text = paragraph['text']
        tx.run("CREATE (paragraph:Paragraph {source: $source, domain: $domain, ix: $ix, text: $text}) RETURN paragraph", source=file.split("/")[-1], domain=domain, ix=ix, text=text)
    graph.commit(tx)
    tx = graph.begin()
    for ix,item in enumerate(data):
        paragraph = item['paragraph']
        if 'sentences_with_results' in paragraph:
            for jx, result_sentence in enumerate(paragraph['sentences_with_results']):
                text = result_sentence['text']
                tx.run("MATCH (paragraph:Paragraph {ix: $ix, domain:$domain}) CREATE (sentence:ResultSentence {ix: $jx, text: $text})-[:FROM]->(paragraph) RETURN *", domain=domain, ix=ix, jx=jx, text=text)
    graph.commit(tx)
    tx = graph.begin()
    for ix,item in enumerate(data):
        paragraph = item['paragraph']
        if 'sentences_with_results' in paragraph:
            for jx, result_sentence in enumerate(paragraph['sentences_with_results']):
                for kx, clause in enumerate(result_sentence['clauses']):
                    text = clause['text']
                    triplet = clause['triplet']
                    subject,verb,object = triplet['subject'].lower(),triplet['verb'].lower(),triplet['object'].lower()
                    print(subject,verb,object,sep="|")
                    tx.run("""MATCH (sentence:ResultSentence {ix:$jx})-[:FROM]->(paragraph:Paragraph {ix: $ix, domain:$domain}) 
                              CREATE (s:Subject {ix: $kx, text: $subject, from: elementId(sentence)})-[v:VERB {ix: $kx, text: $verb, from: elementId(sentence)}]->(o:Object {ix: $kx, text: $object, from: elementId(sentence)}) RETURN *""",
                           domain=domain, ix=ix, jx=jx, kx=kx, subject=subject, verb=verb, object=object)
    graph.commit(tx)

/mnt/clbp/paul/cross-talk/texts.examples/coref_resolved/triplets/physical_activity.json
physical activity|imposes|loads and movements onto the spine
loads and movements|are|not constant, but instead fluctuate
fluctuating loads and movements|can have|an impact on the spine
physical activity|involves|putting stress and strain on the spine
stress and strain|cause|the spine to move in different ways
movements of the spine|are|not constant, but rather change or vary
the spine|is|a part of the body that is subjected to various loads and movements
various loads and movements|are caused by|physical activity
the spine|experiences|various loads and movements
loading in literature|has shown|that it has an impact that depends on the dose
sedentary and strenuous activities|are believed to have|negative effects
literature|has shown|that loading has a dose-dependent influence
sedentary activities|are thought|to be detrimental
strenuous activities|are|also thought to be detrimental
the influence of lo

# Clinical embeddings

## Combining/registering nodes

In [38]:
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import SentenceDetector, BertSentenceEmbeddings
from pyspark.ml import Pipeline
import pyspark.sql.functions as F
documentAssembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")
sentence = SentenceDetector() \
    .setInputCols(["document"]) \
    .setOutputCol("sentence")

embeddings = BertSentenceEmbeddings.pretrained("sent_biobert_clinical_base_cased", "en").setInputCols("sentence").setOutputCol("sentence_embeddings")

pipeline = Pipeline(stages=[documentAssembler,
                            sentence,
                            embeddings])

def get_embedding(text):
    example_df = spark.createDataFrame([[text]]).toDF("text")
    model = pipeline.fit(example_df)
    results = model.transform(example_df).toPandas()
    return results['sentence_embeddings'].loc[0][0]['embeddings']

sent_biobert_clinical_base_cased download started this may take some time.
Approximate size to download 386.6 MB
[ | ]sent_biobert_clinical_base_cased download started this may take some time.
Approximate size to download 386.6 MB
Download done! Loading the resource.
[OK!]


In [39]:
A = get_embedding("data")
B = get_embedding("the data")
import numpy as np
from numpy.linalg import norm
 
# compute cosine similarity
cosine = (np.dot(A,B)/(norm(A)*norm(B))+1)/2
print("Cosine Similarity:", cosine)

Cosine Similarity: 0.9176039052884686


In [40]:
graph.run("MATCH (s) WHERE s:Subject or s:Object SET s:SubjectOrObject return *");

In [41]:
from py2neo import Graph
graph = Graph(password='12345678',name="version1")

In [42]:
for r in graph.run("MATCH (z:SubjectOrObject) RETURN z"):
    #do something with node here
    z = r['z']
    text = z['text']
    if len(text.strip()) > 2:
        embedding = get_embedding(text.strip())
        z['embedding'] = embedding
        z.update()
        tx = graph.begin()
        tx.push(z)
        graph.commit(tx)
    else:
        embedding = get_embedding("THIS IS AN ERROR")
        z['embedding'] = embedding
        z.update()
        tx = graph.begin()
        tx.push(z)
        graph.commit(tx)

In [43]:
#graph.run("CALL db.index.vector.createNodeIndex('text-embeddings', 'SubjectOrObject', 'embedding', 768, 'cosine')")

In [44]:
print(graph.run("match (z:SubjectOrObject)-[e:COSINE_SIM]-(:SubjectOrObject) delete e"))
print(graph.run("match (z:SubjectOrObject)-[e:IS_A]-(:Entity) delete e"))

(No data)
(No data)


In [45]:
sim_threshold = 0.95
cypher = "MATCH (z:SubjectOrObject) RETURN DISTINCT z.text";
for r in graph.run(cypher):
    if len(str(r).strip()) > 2:
        sim_cypher = """
    MATCH (s:SubjectOrObject {text: %s})
    CALL db.index.vector.queryNodes('text-embeddings', 20, s.embedding)
    YIELD node AS similar, score
    WHERE score > %s and elementId(s) < elementId(similar)
    MERGE (s)-[:COSINE_SIM {score: score}]-(similar)
    return *
        """%(str(r),sim_threshold)
        #print(sim_cypher)
        try:
            graph.run(sim_cypher)
        except:
            print(r)
            import pdb; pdb.set_trace()

In [46]:
try:
    graph.run("CALL gds.graph.drop('myGraph') YIELD graphName;")
except:
    pass # graph exists

graph.run(
    """
CALL gds.graph.project(
  'myGraph',
  'SubjectOrObject',
  'COSINE_SIM',
  {
  }
)
    """
)

nodeProjection,relationshipProjection,graphName,nodeCount,relationshipCount,projectMillis
"{SubjectOrObject: {label: 'SubjectOrObject', properties: {}}}","{COSINE_SIM: {aggregation: 'DEFAULT', orientation: 'NATURAL', indexInverse: false, properties: {}, type: 'COSINE_SIM'}}",myGraph,336,551,12


In [47]:
graph.run("""
CALL gds.wcc.stream('myGraph')
YIELD componentId, nodeId
WITH gds.util.asNode(nodeId) AS n, componentId AS componentId
SET n.componentId=componentId
RETURN n
""");

In [48]:
graph.run("""
MATCH (z:SubjectOrObject) with COUNT(z.componentId) as c, z.componentId as componentId
MERGE (b:Entity {componentId: componentId})
return *
""");

In [49]:
graph.run("MATCH (z:Entity), (b:SubjectOrObject) WHERE b.componentId=z.componentId MERGE (z)<-[:IS_A]-(b) return *");

In [50]:
graph.run("""
MATCH (s:Subject)-[v:VERB]->(o:Object)
WITH s,v,o
MATCH (e1:Entity {componentId:s.componentId}), (e2:Entity {componentId: o.componentId})
with e1,v,e2
MERGE (e1)-[:VERB2 {text: v.text, from: elementId(v)}]->(e2) RETURN *""");

In [51]:
for cid in graph.run("""MATCH (z:Entity) return DISTINCT z.componentId as componentId"""):
    cid = int(str(cid))
    texts = []
    for text in graph.run("""MATCH (z:SubjectOrObject {componentId: %s}) return z.text as text"""%cid):
        texts.append(str(text)[1:-1]) # the weird 1:-1 is because of quoting done by neo4j
    text = max(set(texts), key=texts.count)
    graph.run("""MATCH (z:Entity {componentId: %s}) SET z.text = "%s" return *"""%(cid,text))

## Combining and registering vertices

In [52]:
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import SentenceDetector, UniversalSentenceEncoder
from pyspark.ml import Pipeline
import pyspark.sql.functions as F
documentAssembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")
sentence = SentenceDetector() \
    .setInputCols(["document"]) \
    .setOutputCol("sentence")

embeddings = UniversalSentenceEncoder.pretrained("tfhub_use", "en").setInputCols("sentence").setOutputCol("sentence_embeddings")

pipeline = Pipeline(stages=[documentAssembler,
                            sentence,
                            embeddings])

def get_embedding2(text):
    example_df = spark.createDataFrame([[text]]).toDF("text")
    model = pipeline.fit(example_df)
    results = model.transform(example_df).toPandas()
    return results['sentence_embeddings'].loc[0][0]['embeddings']

tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[ | ]tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
Download done! Loading the resource.
[OK!]


In [53]:
graph.run("match (e:NODE_VERB2)-[r]-() delete e,r")
cypher = "MATCH (a)-[r:VERB2]->(b) MERGE (e:NODE_VERB2 {text:r.text, from: elementId(a), to: elementId(b), edge: elementId(r)}) RETURN DISTINCT r.text";
graph.run(cypher)
for r in graph.run("MATCH (z:NODE_VERB2) RETURN z"):
    #do something with node here
    z = r['z']
    text = z['text']
    if len(text.strip()) > 2:
        embedding = get_embedding2(text.strip())
        z['embedding'] = embedding
        z.update()
        tx = graph.begin()
        tx.push(z)
        graph.commit(tx)
    else:
        embedding = get_embedding2("THIS IS AN ERROR")
        z['embedding'] = embedding
        z.update()
        tx = graph.begin()
        tx.push(z)
        graph.commit(tx)

                                                                                

In [54]:
#graph.run("CALL db.index.vector.createNodeIndex('edge-text-embeddings', 'NODE_VERB2', 'embedding', 512, 'cosine')")

In [55]:
print(graph.run("match (z:NODE_VERB2)-[e:COSINE_SIM]-(:NODE_VERB2) delete e"))
print(graph.run("match (z:NODE_VERB2)-[e:IS_A]-(:Entity_VERB2) delete e"))

(No data)
(No data)


In [56]:
sim_threshold = 0.9
cypher = "MATCH (z:NODE_VERB2) RETURN DISTINCT z.text";
for r in graph.run(cypher):
    if len(str(r).strip()) > 2:
        sim_cypher = """
    MATCH (s:NODE_VERB2 {text: %s})
    CALL db.index.vector.queryNodes('edge-text-embeddings', 20, s.embedding)
    YIELD node AS similar, score
    WHERE score > %s and elementId(s) < elementId(similar)
    return elementId(s) as s,elementId(similar) as similar,score
        """%(str(r),sim_threshold)
        for r in graph.run(sim_cypher):
            s = r['s']
            similar = r['similar']
            score = r['score']
            graph.run("MATCH (s:NODE_VERB2),(similar:NODE_VERB2) where elementId(s)='%s' and elementId(similar)='%s'  MERGE (s)-[e:COSINE_SIM {score: %s}]-(similar) return e.score"%(s,similar,score))

In [57]:
try:
    graph.run("CALL gds.graph.drop('myGraph2') YIELD graphName;")
except:
    pass # graph exists

graph.run(
    """
CALL gds.graph.project(
  'myGraph2',
  'NODE_VERB2',
  'COSINE_SIM',
  {
  }
)
    """
)

nodeProjection,relationshipProjection,graphName,nodeCount,relationshipCount,projectMillis
"{NODE_VERB2: {label: 'NODE_VERB2', properties: {}}}","{COSINE_SIM: {aggregation: 'DEFAULT', orientation: 'NATURAL', indexInverse: false, properties: {}, type: 'COSINE_SIM'}}",myGraph2,168,412,7


In [58]:
graph.run("""
CALL gds.wcc.stream('myGraph2')
YIELD componentId, nodeId
WITH gds.util.asNode(nodeId) AS n, componentId AS componentId
SET n.componentId=componentId
RETURN n
""");

In [59]:
graph.run("""
MATCH (z:NODE_VERB2) with COUNT(z.componentId) as c, z.componentId as componentId
MERGE (b:Entity_VERB2 {componentId: componentId})
return *
""");

In [60]:
graph.run("MATCH (z:Entity_VERB2), (b:NODE_VERB2) WHERE b.componentId=z.componentId MERGE (z)<-[:IS_A]-(b) return *");

In [61]:
for cid in graph.run("""MATCH (z:Entity_VERB2) return DISTINCT z.componentId as componentId"""):
    cid = int(str(cid))
    texts = []
    for text in graph.run("""MATCH (z:NODE_VERB2 {componentId: %s}) return z.text as text"""%cid):
        texts.append(str(text)[1:-1]) # the weird 1:-1 is because of quoting done by neo4j
    if len(texts) > 0:
        text = max(set(texts), key=texts.count)
        graph.run("""MATCH (z:Entity_VERB2 {componentId: %s}) SET z.text = "%s" return *"""%(cid,text))

In [62]:
graph.run("""
MATCH (e1:Entity)-[v:VERB2]-(e2:Entity)
MATCH (nv:NODE_VERB2 {text: v.text})-[:IS_A]->(ev:Entity_VERB2)
WITH e1,e2,ev
MERGE (e1)-[:VERB3 {text: ev.text}]-(e2)
RETURN *""");

# Reports

## Coresolution

In [None]:
changed_sentences = pd.DataFrame(changed_sentences,columns=["original","from","to","correct (y,n,?)"])
changed_sentences["comments"] = ""
changed_sentences.to_csv(f"{SRC_DIR}/coref_resolved/debug_changed.csv")

unchanged_sentences = pd.DataFrame(unchanged_sentences,columns=["text"])
unchanged_sentences["correct (y,n,?)"] = "?"
unchanged_sentences["comments"] = ""
unchanged_sentences.to_csv(f"{SRC_DIR}/coref_resolved/debug_unchanged.csv")

## Triplets

In [None]:
import pandas as pd

results = graph.run("""
MATCH (n:SubjectOrObject)
WITH n.from AS source, COLLECT(n) AS nodesa
UNWIND nodes AS n
MATCH (rs:ResultSentence)a
WHERE elementID(rs) in source
MATCH (n:SubjectOrObject)-[r:VERB]->(m:SubjectOrObject)
return rs.text, n.text, r.text, m.text
""")
triplets_df = pd.DataFrame(results)
triplets_df.columns = ["source sentence", "Subject", "Verb", "Object"]

results = graph.run("""
MATCH(n:ResultSentence)-[:FROM]->(m:Paragraph)
RETURN n.text, m.text
""")
simp_df = pd.DataFrame(results)
simp_df.columns = ["source sentence", "source paragraph"]
# simp_df.to_csv("par2sent.csv")

triplet_report = pd.merge(simp_df, triplets_df, on="source sentence")
triplet_report.to_csv(f"{REPORT_DIR}/triplet_report.csv")

## Cosine similarity report

In [None]:
results = graph.run("""
MATCH (n:SubjectOrObject)-[r:COSINE_SIM]->(m:SubjectOrObject)
RETURN n.text, m.text, r.score""")
score_df = pd.DataFrame(results)
score_df.columns = ["node1", "node2", "similarity"]
score_df = score_df.drop_duplicates()
score_df

In [None]:
score_df.to_csv("cosine_similarity_report.csv")