In [1]:
from langchain.document_loaders import PyPDFLoader
from langchain.docstore.document import Document
import os

# Directory containing your PDF files
directory_path = 'TCs'

# Build one PyPDFLoader per PDF in the directory.
# os.listdir() returns names in arbitrary order, so sort them: otherwise the
# order of news_docs (and therefore which document ends up at index 0 below)
# can change between runs and machines.
pdf_names = sorted(
    name for name in os.listdir(directory_path) if name.lower().endswith('.pdf')
)
loaders = [PyPDFLoader(os.path.join(directory_path, name)) for name in pdf_names]

# Load documents from PDFs; each loader contributes a list of documents.
news_docs = []
for loader in loaders:
    news_docs.extend(loader.load())

# Fail with an explicit message instead of an opaque IndexError further down.
if not news_docs:
    raise FileNotFoundError(
        f"No PDF documents could be loaded from {directory_path!r}"
    )

# Re-wrap every loaded document as a Document that keeps only the text and a
# 'source' entry with the directory prefix stripped.
news_articles_data = [
    Document(
        page_content=doc.page_content,
        metadata={
            # Strip the directory name itself rather than a duplicated literal,
            # so changing directory_path above keeps this in sync.
            "source": doc.metadata['source'].removeprefix(directory_path),
            # Include any other metadata items here
        },
    )
    for doc in news_docs
]

# Only the first loaded document is used downstream.
# NOTE(review): if the loader yields one document per PDF page, this is just
# the first page of the first PDF - confirm that is intended.
data = news_articles_data[0].page_content

In [2]:

from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunk `data` (defined in an earlier cell) with a target size of 500 and an
# overlap of 80 between consecutive chunks, then report how many chunks resulted.
# NOTE(review): sizes are presumably measured in characters - confirm against
# the splitter's length function.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=80)
split_docs = text_splitter.split_text(data)
print(len(split_docs))

9


In [3]:
!python3.9 -m spacy download en_core_web_md


[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[33m
[0mCollecting en-core-web-md==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl (33.5 MB)
    tinycss2 (>=1.1.0<1.2) ; extra == 'css'
             ~~~~~~~~^[0m[33m
[0m[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[33m
[0m[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


Entity & relationship extraction

In [4]:
import os
import json
import spacy
from collections import Counter
from pathlib import Path
from wasabi import msg
from spacy_llm.util import assemble

# Sentence splitting with a traditional spaCy pipeline (no LLM involved)
def split_document_sent(text, model="en_core_web_md"):
    """Split ``text`` into sentences using a spaCy pipeline.

    Args:
        text: The text to segment. ``None``, empty, or whitespace-only input
            yields an empty list without loading any model.
        model: Name of the spaCy pipeline to load. Defaults to
            ``en_core_web_md``, the model this notebook downloads; the
            previously hard-coded ``en_core_web_sm`` is never installed here.

    Returns:
        A list of the stripped sentence strings, in document order, with
        empty strings removed.
    """
    if not text or not text.strip():
        return []

    nlp = spacy.load(model)
    doc = nlp(text)
    stripped = (sent.text.strip() for sent in doc.sents)
    return [sentence for sentence in stripped if sentence]

# spacy-llm relationship extraction
def process_text(nlp, text, verbose=False):
    """Run ``nlp`` on ``text`` and return the result, optionally logging it.

    Args:
        nlp: Callable applied to ``text`` (here, the assembled pipeline).
        text: The text to process.
        verbose: When true, print the text, its entities, and its relations
            through ``msg.text`` before returning.

    Returns:
        Whatever ``nlp(text)`` returned.
    """
    doc = nlp(text)
    if not verbose:
        return doc

    msg.text(f"Text: {doc.text}")
    msg.text(f"Entities: {[(ent.text, ent.label_) for ent in doc.ents]}")
    msg.text("Relations:")
    # Each relation stores positions into doc.ents for its two endpoints.
    for rel in doc._.rel:
        head = doc.ents[rel.dep]
        label = rel.relation
        tail = doc.ents[rel.dest]
        msg.text(f"  - {head} [{label}] {tail}")
    return doc

def run_pipeline(config_path, examples_path=None, verbose=False, texts=None,
                 output_path='processed_data.json'):
    """Extract entities and relations from text chunks and export them as JSON.

    Args:
        config_path: Path of the config file handed to ``assemble``.
        examples_path: Optional examples file; when given it is passed to
            ``assemble`` as the ``paths.examples`` override.
        verbose: Forwarded to ``process_text`` to log each processed chunk.
        texts: Iterable of strings to process. When ``None`` the first four
            entries of the notebook-level ``split_docs`` are used, which is
            what this function always did before the parameter existed.
        output_path: Where the JSON export is written.

    Returns:
        The list that was written to ``output_path``: one dict per chunk with
        the keys ``'text'``, ``'entities'`` (``(text, label)`` pairs) and
        ``'relations'`` (``(head text, relation, tail text)`` triples).
    """
    if not os.getenv("OPENAI_API_KEY"):
        msg.fail("OPENAI_API_KEY env variable was not found. Set it and try again.", exits=1)

    overrides = {} if examples_path is None else {"paths.examples": str(examples_path)}
    nlp = assemble(config_path, overrides=overrides)

    if texts is None:
        # Backward-compatible default: fall back to the chunks produced by the
        # text-splitting cell instead of requiring the caller to pass them.
        texts = split_docs[:4]

    # Initialize counters and storage
    processed_data = []
    entity_counts = Counter()
    relation_counts = Counter()

    for chunk in texts:
        doc = process_text(nlp, chunk, verbose)
        entities = [(ent.text, ent.label_) for ent in doc.ents]
        # Relation endpoints are stored as positions into doc.ents.
        relations = [
            (doc.ents[rel.dep].text, rel.relation, doc.ents[rel.dest].text)
            for rel in doc._.rel
        ]

        # Store processed data
        processed_data.append({'text': doc.text, 'entities': entities, 'relations': relations})

        # Update counters: entities by label, relations by relation name
        entity_counts.update(label for _, label in entities)
        relation_counts.update(relation for _, relation, _ in relations)

    # Export to JSON (explicit encoding so the result does not depend on the
    # platform's default text encoding)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(processed_data, f)

    # Display summary
    msg.text(f"Entity counts: {entity_counts}")
    msg.text(f"Relation counts: {relation_counts}")

    # Hand the results back so the caller's assignment receives real data
    # instead of None.
    return processed_data

# Set your configuration paths and flags
config_path = Path("config.cfg")
examples_path = None  # path to a few-shot examples file, or None if not using few-shot
verbose = True

# Run the pipeline. examples_path is passed through (rather than a literal
# None) so that editing the setting above actually takes effect.
# `file` receives whatever run_pipeline returns.
file = run_pipeline(config_path, examples_path, verbose)




Text: ENGLISH[ENT0:NORP] APPLE INC.[ENT1:ORG] SOFTWARE LICENSE AGREEMENT FOR
macOS Sequoia[ENT2:PERSON] For use on Apple[ENT3:ORG]-branded Systems PLEASE
READ THIS SOFTWARE LICENSE AGREEMENT (“LICENSE”) CAREFULLY BEFORE  USING THE
APPLE SOFTWARE.  BY USING THE APPLE SOFTWARE, YOU ARE AGREEING TO BE  BOUND BY
THE TERMS OF THIS LICENSE.  IF YOU DO NOT AGREE TO THE TERMS OF THIS  LICENSE,
DO NOT INSTALL AND/OR USE THE APPLE SOFTWARE AND, IF PRESENTED WITH  THE OPTION
TO “AGREE” OR “DISAGREE[ENT4:WORK_OF_ART]” TO THE TERMS, CLICK
“DISAGREE[ENT5:WORK_OF_ART]”. IF YOU
Entities: [('ENGLISH', 'NORP'), ('APPLE INC.', 'ORG'), ('macOS Sequoia',
'PERSON'), ('Apple', 'ORG'), ('DISAGREE', 'WORK_OF_ART'), ('DISAGREE',
'WORK_OF_ART')]
Relations:
  - macOS Sequoia [LICENSE_FOR] APPLE INC.
  - Apple [BRANDED_BY] APPLE INC.
Text: THE OPTION TO “AGREE” OR “DISAGREE[ENT0:WORK_OF_ART]” TO THE TERMS, CLICK
“DISAGREE[ENT1:WORK_OF_ART]”. IF YOU  ACQUIRED THE APPLE SOFTWARE AS PART OF AN
APPLE HARDWARE PURCHASE

In [5]:
# from spacy_llm.util import assemble

# nlp = assemble("config.cfg")
# doc = nlp("You look gorgeous!")
# print(doc.cats)