In [26]:
from platform import python_version

# Show the interpreter version of the running kernel; the setup commands below
# call `python3.9` explicitly, so the two should agree.
print(python_version())

# One-time environment setup. The commands are kept commented out; uncomment
# and run them only when preparing a fresh environment.
# %pip install langchain
# %pip install -U langchain-community
# %pip install pypdf
# %pip install spacy
# !python3.9 -m pip install spacy-llm
# !python3.9 -m spacy download en_core_web_sm

3.9.21


In [21]:
import langchain
from langchain.document_loaders import PyPDFLoader
from langchain.docstore.document import Document
import os

# Directory containing your PDF files
directory_path = 'TCs'

# One PyPDFLoader per PDF in the directory.
# - The listing is sorted because os.listdir() returns entries in arbitrary
#   order, which would make `combined` (and every chunk derived from it)
#   differ from run to run.
# - The extension test is case-insensitive so files named *.PDF are not skipped.
pdf_names = sorted(f for f in os.listdir(directory_path) if f.lower().endswith('.pdf'))
loaders = [PyPDFLoader(os.path.join(directory_path, f)) for f in pdf_names]

# Load documents from PDFs (each loader contributes a list of Document objects)
news_docs = []
for loader in loaders:
    news_docs.extend(loader.load())

# Fail with an explicit message instead of an IndexError further down.
if not news_docs:
    raise FileNotFoundError(f"No PDF content could be loaded from {directory_path!r}")

# Re-wrap every loaded document, keeping only its text and a shortened source path
news_articles_data = [
    Document(
        page_content=doc.page_content,
        metadata={
            # Drop the directory prefix; it is taken from directory_path so the
            # two cannot drift apart when the directory is changed.
            "source": doc.metadata['source'].removeprefix(directory_path),
            # Include any other metadata items here
        }
    )
    for doc in news_docs
]

# Text of the first loaded document only (handy for quick experiments)
data = news_articles_data[0].page_content

# Full corpus as one string. Documents are separated by a newline so the last
# word of one document is not fused with the first word of the next.
page_contents = [doc.page_content for doc in news_docs]
combined = "\n".join(page_contents)
type(combined)

str

In [22]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Cut the combined text into overlapping chunks; `split_docs` is what the
# entity/relation pipeline iterates over later in the notebook.
# To chunk only the first loaded document instead, pass `data` rather than `combined`.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=200, chunk_size=2000)
split_docs = text_splitter.split_text(combined)

# Number of chunks produced
print(len(split_docs))

38


### Entities & Relationships

In [28]:
import os
import json
import spacy
from collections import Counter
from pathlib import Path
from wasabi import msg
from spacy_llm.util import assemble

# Sentence segmentation with a plain spaCy pipeline (no LLM involved)
def split_document_sent(text):
    """Split `text` into sentences with spaCy's `en_core_web_sm` pipeline.

    The pipeline is loaded on the first call and cached on the function object,
    so repeated calls do not reload the model every time.

    Args:
        text: The raw text to segment.

    Returns:
        A list holding the whitespace-stripped text of each sentence, in the
        order spaCy reports them.
    """
    nlp = getattr(split_document_sent, "_nlp", None)
    if nlp is None:
        # First call: load once and remember the pipeline for later calls.
        nlp = split_document_sent._nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)
    return [sent.text.strip() for sent in doc.sents]

# Entity / relation extraction through the spacy-llm pipeline
def process_text(nlp, text, verbose=False):
    """Run `nlp` on `text` and hand back the resulting doc.

    Args:
        nlp: Callable pipeline applied to the text.
        text: The text to process.
        verbose: When true, log the text, its entities and its relations
            through `msg` before returning.

    Returns:
        Whatever `nlp(text)` produced.
    """
    parsed = nlp(text)
    if not verbose:
        return parsed

    msg.text(f"Text: {parsed.text}")
    labelled_entities = [(span.text, span.label_) for span in parsed.ents]
    msg.text(f"Entities: {labelled_entities}")
    msg.text("Relations:")
    for rel in parsed._.rel:
        # rel.dep / rel.dest are used as positions into parsed.ents
        head = parsed.ents[rel.dep]
        tail = parsed.ents[rel.dest]
        msg.text(f"  - {head} [{rel.relation}] {tail}")
    return parsed

def run_pipeline(config_path, examples_path=None, verbose=False, texts=None,
                 output_path='processed_data.json'):
    """Extract entities and relations from text chunks and persist them as JSON.

    Args:
        config_path: Path of the spacy-llm config handed to `assemble`.
        examples_path: Optional path passed to the config as `paths.examples`;
            when None no override is applied.
        verbose: Forwarded to `process_text` to log every processed chunk.
        texts: Iterable of strings to process. When None, the notebook-level
            `split_docs` list is used, which was the only behaviour before
            this parameter existed.
        output_path: Where the JSON dump is written.

    Returns:
        The list of processed records, one dict per chunk with the keys
        'text', 'entities' (text, label pairs) and 'relations'
        (head text, relation, tail text triples). Previously nothing was
        returned even though the caller assigns the result.
    """
    if not os.getenv("OPENAI_API_KEY"):
        # exits=1 asks wasabi to terminate with status 1 after printing.
        msg.fail("OPENAI_API_KEY env variable was not found. Set it and try again.", exits=1)

    overrides = {} if examples_path is None else {"paths.examples": str(examples_path)}
    nlp = assemble(config_path, overrides=overrides)

    if texts is None:
        # Backward-compatible default: the chunks produced earlier in the notebook.
        texts = split_docs

    # Collected records plus running tallies per entity label / relation name
    processed_data = []
    entity_counts = Counter()
    relation_counts = Counter()

    for text in texts:
        doc = process_text(nlp, text, verbose)
        entities = [(ent.text, ent.label_) for ent in doc.ents]
        relations = [(doc.ents[r.dep].text, r.relation, doc.ents[r.dest].text) for r in doc._.rel]

        processed_data.append({'text': doc.text, 'entities': entities, 'relations': relations})

        entity_counts.update(label for _, label in entities)
        relation_counts.update(relation for _, relation, _ in relations)

    # Export to JSON
    with open(output_path, 'w') as f:
        json.dump(processed_data, f)

    # Display summary
    msg.text(f"Entity counts: {entity_counts}")
    msg.text(f"Relation counts: {relation_counts}")

    return processed_data

# Pipeline settings
config_path = Path("config.cfg")
examples_path = None  # set to the path of a few-shot examples file; keep None when not using few-shot
verbose = True

# Run the pipeline. examples_path is passed through (instead of a literal None)
# so that changing the setting above actually takes effect.
file = run_pipeline(config_path, examples_path, verbose)



Text: ENGLISH[ENT0:NORP] APPLE INC.[ENT1:ORG] SOFTWARE LICENSE AGREEMENT FOR
macOS Sequoia[ENT2:PERSON] For use on Apple[ENT3:ORG]-branded Systems PLEASE
READ THIS SOFTWARE LICENSE AGREEMENT (“LICENSE”) CAREFULLY BEFORE  USING THE
APPLE SOFTWARE.  BY USING THE APPLE SOFTWARE, YOU ARE AGREEING TO BE  BOUND BY
THE TERMS OF THIS LICENSE.  IF YOU DO NOT AGREE TO THE TERMS OF THIS  LICENSE,
DO NOT INSTALL AND/OR USE THE APPLE SOFTWARE AND, IF PRESENTED WITH  THE OPTION
TO “AGREE” OR “DISAGREE[ENT4:WORK_OF_ART]” TO THE TERMS, CLICK
“DISAGREE[ENT5:WORK_OF_ART]”. IF YOU  ACQUIRED THE APPLE SOFTWARE AS PART OF AN
APPLE HARDWARE PURCHASE AND IF YOU  DO NOT AGREE TO THE TERMS OF THIS LICENSE,
YOU MAY RETURN THE ENTIRE APPLE  HARDWARE/SOFTWARE PACKAGE WITHIN THE RETURN
PERIOD TO THE APPLE STORE OR  AUTHORIZED DISTRIBUTOR WHERE YOU OBTAINED IT FOR A
REFUND, SUBJECT TO APPLE[ENT6:ORG]’S  RETURN POLICY FOUND AT
https://www.apple.com/legal/sales-support/. YOU MUST RETURN  THE ENTIRE
HARDWARE/SOFTWARE 

In [5]:
# from spacy_llm.util import assemble

# nlp = assemble("config.cfg")
# doc = nlp("You look gorgeous!")
# print(doc.cats)

In [59]:
def classify_and_generate_queries(json_data, file_path):
    """Build Cypher statements for the extracted entities and relations.

    Every entity becomes a node whose label is its eFLINT-style type and whose
    `id` is "<name with spaces replaced by underscores>_<type>". Every relation
    becomes a MERGE between the two nodes looked up by that id.

    Args:
        json_data: Iterable of dicts with optional keys 'entities' (sequence of
            [name, label] pairs) and 'relations' (sequence of
            [source name, relation, target name] triples).
        file_path: Text file that receives one query per line (UTF-8).

    Returns:
        The list of query strings: node queries first, relationship queries after.
    """
    nodes = {}
    relationships = []

    # Enhanced mapping function for entity types based on eFLINT elements
    def map_eflint_type(entity_type, entity_name):
        # Links are better classified as FACT than as ORG. Both schemes are
        # looked for in the entity *name*; the label never contains a URL.
        if 'http://' in entity_name or 'https://' in entity_name:
            return 'FACT'

        # Numbers are of no interest; they are tagged here and filtered out
        # when the node queries are built.
        if entity_type in ['CARDINAL', 'ORDINAL']:
            return 'NUMBER'

        if entity_type == 'ORG':
            lowered = entity_name.lower()
            if 'terms' in lowered or 'conditions' in lowered or 'agreement' in lowered:
                return 'DUTY'
            return 'ACTOR'

        if 'section' in entity_name.lower() or 'section' in entity_type.lower():
            return 'SECTION'

        mapping = {
            'PERSON': 'ACTOR',
            'EVENT': 'EVENT',
            'LAW': 'DUTY',
            'WORK_OF_ART': 'ACT',
            'CONDITION': 'CONDITION',
            'DATE': 'DATE',
            # These do not appear in the NER output but are kept for the eFLINT semantics
            'CLAIMANT': 'CLAIMANT',
            'HOLDER': 'HOLDER'
        }
        return mapping.get(entity_type, 'FACT')

    def make_node_id(name, eflint_type):
        # Single place that defines the id format shared by nodes and relationships.
        return f"{name.replace(' ', '_')}_{eflint_type}"

    def cypher_str(value):
        # Escape backslashes first, then single quotes, so the value can sit
        # inside a single-quoted Cypher string literal.
        return value.replace('\\', '\\\\').replace("'", "\\'")

    def cypher_ident(value):
        # A backtick inside a backtick-quoted identifier is written doubled.
        return value.replace('`', '``')

    # entities and relationships
    for item in json_data:
        entities = item.get('entities', [])
        relations = item.get('relations', [])

        # Label of each entity name within this item, used to type relation endpoints.
        label_by_name = {}
        for entity in entities:
            name, label = entity[0], entity[1]
            eflint_type = map_eflint_type(label, name)
            nodes[make_node_id(name, eflint_type)] = {'name': name, 'type': eflint_type}
            label_by_name.setdefault(name, label)

        for relation in relations:
            src_name, rel_name, tgt_name = relation[0], relation[1], relation[2]
            # Each endpoint is typed from its OWN label so the id matches the
            # node created above. An endpoint missing from `entities` falls
            # back to an empty label instead of raising.
            src_id = make_node_id(src_name, map_eflint_type(label_by_name.get(src_name, ''), src_name))
            tgt_id = make_node_id(tgt_name, map_eflint_type(label_by_name.get(tgt_name, ''), tgt_name))
            relationship_type = rel_name.replace(' ', '_').replace('-', '_')
            relationships.append((src_id, relationship_type, tgt_id))

    node_queries = [
        f"MERGE (n:{node['type']} {{name: '{cypher_str(node['name'])}'}}) SET n.id = '{cypher_str(node_id)}'"
        for node_id, node in nodes.items() if node['type'] != 'NUMBER'
    ]

    relationship_queries = [
        f"MATCH (a), (b) WHERE a.id = '{cypher_str(src_id)}' AND b.id = '{cypher_str(tgt_id)}' "
        f"MERGE (a)-[:`{cypher_ident(rel_type)}`]->(b)"
        for src_id, rel_type, tgt_id in relationships
    ]

    queries = node_queries + relationship_queries
    # Save queries to a text file; explicit UTF-8 because entity names may hold
    # characters the platform default encoding cannot represent.
    with open(file_path, 'w', encoding='utf-8') as out:
        for query in queries:
            out.write(query + '\n')

    return queries

In [60]:
# Read back the extraction results stored in processed_data.json
with open('processed_data.json', mode='r') as file:
    json_data = json.loads(file.read())

# Convert them to Cypher statements, which are also saved to cypher_queries.txt
queries = classify_and_generate_queries(json_data, file_path='cypher_queries.txt')

In [61]:
from neo4j import GraphDatabase

def execute_queries(queries, uri, user, password):
    """Run each Cypher query against a Neo4j instance, in order.

    Args:
        queries: Iterable of Cypher query strings.
        uri: Connection URI handed to `GraphDatabase.driver`.
        user: User name for authentication.
        password: Password for authentication.

    Any exception raised while running a query propagates to the caller; the
    driver is closed in every case.
    """
    driver = GraphDatabase.driver(uri, auth=(user, password))
    try:
        with driver.session() as session:
            for query in queries:
                # consume() waits for the query to finish, so a failure is
                # reported for the query that caused it.
                session.run(query).consume()
    finally:
        # Release the connection even when a query raised.
        driver.close()

# Connection settings come from the environment so that no credential is stored
# in the notebook. URI and user fall back to the local defaults used so far;
# the password is asked for interactively when NEO4J_PASSWORD is not set.
uri = os.getenv("NEO4J_URI", "neo4j://localhost:7687")
user = os.getenv("NEO4J_USER", "neo4j")
password = os.getenv("NEO4J_PASSWORD")
if not password:
    from getpass import getpass
    password = getpass("Neo4j password: ")

execute_queries(queries, uri, user, password)
