In [1]:
# Enable IPython's autoreload extension in mode 2 so that imported modules
# are re-imported before each cell executes; edits to library code under
# development are then picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from sycamore.data import Document, HierarchicalDocument
from sycamore.llms import OpenAI, OpenAIModels
from sycamore.connectors.file.file_scan import JsonManifestMetadataProvider
from sycamore.transforms.partition import ArynPartitioner
from sycamore.utils.time_trace import timetrace

import sycamore
import pickle
import json
import sys
import pandas as pd
import csv
import os
from pathlib import Path

from sycamore.utils.cache import DiskCache
# Sycamore context used by the later cells to read the pickled documents.
ctx = sycamore.init()

# Directory backing the on-disk LLM response cache. The default is the path
# this notebook was originally written against; set SYCAMORE_LLM_CACHE_DIR to
# run the notebook on another machine without editing this cell.
LLM_CACHE_DIR = os.environ.get(
    "SYCAMORE_LLM_CACHE_DIR",
    "/home/ritam/sycamore/ritam-scripts/llm_cache",
)

# GPT-4o client shared by the entity and relationship extractors below.
llm = OpenAI(OpenAIModels.GPT_4O, cache=DiskCache(LLM_CACHE_DIR))



In [3]:
def unpickle_doc(pdoc: Document) -> list[Document]:
    """Rebuild the object pickled inside ``pdoc`` and wrap it in a list.

    Args:
        pdoc: Document whose ``binary_representation`` holds the pickle bytes.

    Returns:
        A one-element list containing the unpickled object. The list wrapper
        lets the function be passed to ``flat_map``.
    """
    # NOTE(security): pickle.loads can execute arbitrary code embedded in the
    # payload, so only feed this files that come from a trusted source.
    return [pickle.loads(pdoc.binary_representation)]

# Location of the pickled documents, given relative to the working directory
# and resolved to an absolute path before being handed to the reader.
path = Path("ntsb_parsed_1").resolve()

# Read the files as binary documents, then turn each one back into the
# document object it was pickled from.
pickled_docset = ctx.read.binary(
    str(path),
    binary_format="pickle",
)
ds = pickled_docset.flat_map(unpickle_doc)

In [None]:
from sycamore.transforms.extract_document_structure import StructureBySection, StructureByDocument
from sycamore.transforms.extract_graph_entities import EntityExtractor
from sycamore.transforms.extract_graph_relationships import RelationshipExtractor
from pydantic import BaseModel, Field
import boto3


# Entity schema for a report; passed to EntityExtractor below and used as the
# start node of INVOLVED_AIRCRAFT. The Field description is the hint that tells
# the extractor where the identifier appears, so its wording matters.
class Report(BaseModel):
    ID: str = Field(description="This ID can be found under either Accident Number, Incident Number, or Occurrence Number")

# Entity schema for an aircraft; passed to EntityExtractor below and used as
# the end node of INVOLVED_AIRCRAFT. All three properties are plain strings.
class Aircraft(BaseModel):
    registration: str
    make: str
    model: str

# Relationship schema passed to RelationshipExtractor below: an edge that
# starts at a Report entity and ends at an Aircraft entity.
class INVOLVED_AIRCRAFT(BaseModel):
    start: Report
    end: Aircraft

# Prompt text handed to EntityExtractor via its `prompt` argument in the
# pipeline below. The literal is sent as-is, including its leading newline
# and indentation.
ENTITY_DEFAULT_PROMPT = """
    -Instructions-
    You are a information extraction system.

    You will be given a sequence of data in different formats(text, table, Section-header) in order.
    Your job is to extract entities from the text input that match the entity schemas provided. Each entity
    and property extracted should directly reference part of the text input provided.

    """

# Prompt text handed to RelationshipExtractor via its `prompt` argument in the
# pipeline below. The literal is sent as-is, including its leading newline
# and indentation.
RELATIONSHIP_DEFAULT_PROMPT = """
    -Goal-
    You are a helpful information extraction system.

    You will be given a sequence of data in different formats(text, table, Section-header) in order.
    Your job is to extract relationships that map between entities that have already been extracted from this text.

    """


# Stage 1: split each document into sections.
structured_ds = ds.extract_document_structure(StructureBySection)

# Stage 2: extract Report and Aircraft entities with the LLM.
entity_extractor = EntityExtractor(
    llm=llm,
    entities=[Report, Aircraft],
    prompt=ENTITY_DEFAULT_PROMPT,
)
entities_ds = structured_ds.extract_graph_entities([entity_extractor])

# Stage 3: extract INVOLVED_AIRCRAFT edges between the extracted entities.
relationship_extractor = RelationshipExtractor(
    llm=llm,
    relationships=[INVOLVED_AIRCRAFT],
    prompt=RELATIONSHIP_DEFAULT_PROMPT,
)
relationships_ds = entities_ds.extract_graph_relationships([relationship_extractor])

# Stage 4: materialize the extraction output under /tmp/extract_graph.
# NOTE(review): MATERIALIZE_USE_STORED presumably reuses a previous run's
# stored output instead of calling the LLM again; confirm before relying on it.
materialized_ds = relationships_ds.materialize(
    path="/tmp/extract_graph",
    source_mode=sycamore.MATERIALIZE_USE_STORED,
)

# Stage 5: resolve graph entities with no custom resolvers and duplicate
# resolution switched off. A trailing .explode() step was left disabled.
ds_resolved = materialized_ds.resolve_graph_entities(resolvers=[], resolve_duplicates=False)

# Collect the results as `docs`, which the inspection cells below iterate over.
docs = ds_resolved.take_all()

#URI = "bolt://localhost:11001"
#AUTH = ("neo4j", os.environ["NEO4J_PASSWORD"])  # password removed from the notebook; read it from the environment


# URI = 'neo4j+s://adc13803.databases.neo4j.io'
# AUTH = ('neo4j', os.environ['NEO4J_AURA_PASSWORD'])  # password removed from the notebook; rotate the one that was committed here
# DATABASE = "neo4j"
# IMPORT_DIR = "/home/ritam/neo4j/import"
# s3_session = boto3.session.Session()

# ds.write.neo4j(uri=URI,auth=AUTH,database=DATABASE,import_dir=IMPORT_DIR, use_auradb=True, s3_session=s3_session)

2024-09-06 22:33:46,560	INFO worker.py:1740 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m
2024-09-06 22:33:50,237	INFO streaming_executor.py:112 -- Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-09-06_22-33-42_674101_2510893/logs/ray-data
2024-09-06 22:33:50,239	INFO streaming_executor.py:113 -- Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadBinary] -> TaskPoolMapOperator[Map(BinaryScan._to_document)->MapBatches(unpickle_doc)->MapBatches(extract)->MapBatches(extract)->MapBatches(extract)->MapBatches(materialize)->MapBatches(_aggregate_section_nodes)]


- ReadBinary->SplitBlocks(16) 1:   0%|          | 0/1 [00:00<?, ?it/s]

- Map(BinaryScan._to_document)->MapBatches(unpickle_doc)->MapBatches(extract)->MapBatches(extract)->MapBatches…

Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

(Map(BinaryScan._to_document)->MapBatches(unpickle_doc)->MapBatches(extract)->MapBatches(extract)->MapBatches(extract)->MapBatches(materialize)->MapBatches(_aggregate_section_nodes) pid=2511088) Unrecognized extenstion pickle; using application/pickle
I0000 00:00:1725662034.662199 2510893 config.cc:230] gRPC experiments enabled: call_status_override_on_cancellation, event_engine_dns, event_engine_listener, http2_stats_fix, monitoring_experiment, pick_first_new, trace_record_callops, work_serializer_clears_time_cache
2024-09-06 22:33:54,693	INFO dataset.py:2370 -- Tip: Use `take_batch()` instead of `take() / show()` to return records in pandas or numpy batch format.
2024-09-06 22:33:54,700	INFO streaming_executor.py:112 -- Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-09-06_22-33-42_674101_2510893/logs/ray-data
2024-09-06 22:33:54,702	INFO streaming_executor.py:113 -- Execution plan of Dataset: InputDataBuffer[Input] -> AllToAllOperator[Aggregate] -> LimitOperato

- Aggregate 1:   0%|          | 0/16 [00:00<?, ?it/s]

Shuffle Map 2:   0%|          | 0/16 [00:00<?, ?it/s]

Shuffle Reduce 3:   0%|          | 0/16 [00:00<?, ?it/s]

- limit=1 4:   0%|          | 0/16 [00:00<?, ?it/s]

Running 0:   0%|          | 0/16 [00:00<?, ?it/s]

2024-09-06 22:33:54,941	ERROR exceptions.py:73 -- Exception occurred in Ray Data or Ray Core internal code. If you continue to see this error, please open an issue on the Ray project GitHub page with the full stack trace below: https://github.com/ray-project/ray/issues/new/choose
2024-09-06 22:33:54,960	ERROR worker.py:406 -- Unhandled error (suppress with 'RAY_IGNORE_UNHANDLED_ERRORS=1'): [36mray::reduce()[39m (pid=2511088, ip=172.31.9.237)
  File "/home/ritam/.cache/pypoetry/virtualenvs/sycamore-monorepo-2qWDk4xs-py3.11/lib/python3.11/site-packages/ray/data/_internal/planner/exchange/aggregate_task_spec.py", line 70, in reduce
    return BlockAccessor.for_block(mapper_outputs[0]).aggregate_combined_blocks(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/ritam/.cache/pypoetry/virtualenvs/sycamore-monorepo-2qWDk4xs-py3.11/lib/python3.11/site-packages/ray/data/_internal/arrow_block.py", line 658, in aggregate_combined_blocks
    accumulat

RayTaskError(AssertionError): [36mray::reduce()[39m (pid=2511088, ip=172.31.9.237)
  File "/home/ritam/.cache/pypoetry/virtualenvs/sycamore-monorepo-2qWDk4xs-py3.11/lib/python3.11/site-packages/ray/data/_internal/planner/exchange/aggregate_task_spec.py", line 70, in reduce
    return BlockAccessor.for_block(mapper_outputs[0]).aggregate_combined_blocks(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/ritam/.cache/pypoetry/virtualenvs/sycamore-monorepo-2qWDk4xs-py3.11/lib/python3.11/site-packages/ray/data/_internal/arrow_block.py", line 658, in aggregate_combined_blocks
    accumulators[i] = aggs[i].merge(
                      ^^^^^^^^^^^^^^
  File "/home/ritam/sycamore/lib/sycamore/sycamore/transforms/resolve_graph_entities.py", line 151, in merge
    check_if_bad(nodes2)
  File "/home/ritam/sycamore/lib/sycamore/sycamore/transforms/resolve_graph_entities.py", line 220, in check_if_bad
    assert isinstance(rel, dict), f"{json.dumps(node['relationships'], indent=2)}"
AssertionError: {
  "1c69646b-6130-4389-860b-b977c958caea": {
    "END_ID": "57c7d4c5-7a53-4852-8296-25f6560da022",
    "END_LABEL": "Aircraft",
    "START_ID": "ae8e2674-4f79-443e-873d-e665d5f1b1b4",
    "START_LABEL": "Report",
    "TYPE": "INVOLVED_AIRCRAFT",
    "properties": {}
  },
  "26e7c513-73a3-4373-8e17-c38d2a12afbb": null,
  "6890231b-3b2f-430d-97d2-1f47f609c7fe": null,
  "d113493e-58c8-43f8-9c56-c1397ec586ec": {
    "END_ID": "57c7d4c5-7a53-4852-8296-25f6560da022",
    "END_LABEL": "Aircraft",
    "START_ID": "3beff522-9474-4ff3-bc14-2020ed6ad8b6",
    "START_LABEL": "SECTION",
    "TYPE": "CONTAINS",
    "properties": {}
  }
}

           "model": "N/A"
(map pid=2511088)         }


(map pid=2511088)       }
(map pid=2511088)     ]
(map pid=2511088)   }
(map pid=2511088) }
(map pid=2511088) ERROR:root:INPUT ACCUMULATE: {
(map pid=2511088)   "Aircraft": {
(map pid=2511088)     "a55bd63b5d420fb34140005d40f0c5df0b2acb7a92ea93995793da8012880d00": [
(map pid=2511088)       {
(map pid=2511088)         "type": "extracted",
(map pid=2511088)         "doc_id": "57c7d4c5-7a53-4852-8296-25f6560da022",
(map pid=2511088)         "properties": {
(map pid=2511088)           "registration": "N/A",
(map pid=2511088)           "make": "N/A",
(map pid=2511088)           "model": "N/A"
(map pid=2511088)         },
(map pid=2511088)         "label": "Aircraft",
(map pid=2511088)         "relationships": {
(map pid=2511088)           "d113493e-58c8-43f8-9c56-c1397ec586ec": {
(map pid=2511088)             "TYPE": "CONTAINS",
(map pid=2511088)             "properties": {},
(map pid=2511088)             "START_ID": "3beff522-9474-4ff3-bc14-2020ed6ad8b6",
(map pid=2511088)             "ST

In [None]:
# Re-declaration of Report. Once this cell runs it replaces the earlier
# definition, which had an `ID` field, with one that only has `Title`.
class Report(BaseModel):
    Title: str

# Draft entity schema. The fields were originally written without type
# annotations, which is a syntax error; they are typed as `str` here to match
# the other entity schemas in this notebook.
# NOTE(review): `price` and `time_to_develop` may want numeric types, and
# "compisition" looks like a misspelling of "composition" (name kept as
# written so nothing referring to it changes). Confirm before using this model.
class Drug(BaseModel):
    name: str
    chemical_compisition: str
    price: str
    time_to_develop: str


# Re-declaration of Aircraft with the same three string fields as the
# definition in the extraction cell above.
class Aircraft(BaseModel):
    registration: str
    make: str
    model: str

# Re-declaration of the Report -> Aircraft relationship. `start` and `end`
# bind to whichever Report and Aircraft classes are current when this runs.
class INVOLVED_AIRCRAFT(BaseModel):
    start: Report
    end: Aircraft

In [None]:
# Pretty-print the "nodes" property of every resolved document that has one.
# Documents without it are skipped. The per-relationship walk that used to be
# commented out here is the body of the next cell.
for doc in docs:
    properties = doc["properties"]
    if "nodes" in properties:
        print(json.dumps(properties["nodes"], indent=2))

In [None]:
# Sanity check on the resolved graph: within each node, no relationship id and
# no relationship payload may appear twice. Every relationship id is printed
# as it is visited, so a failing assert shows how far the walk got.
for doc in docs:
    for section in doc.children:
        section_properties = section["properties"]
        # Sections without extracted nodes have nothing to check.
        if "nodes" not in section_properties:
            continue
        for label, nodes_by_hash in section_properties["nodes"].items():
            # NOTE(review): assumes each hash maps to a single node dict; the
            # logged accumulator input shows a list per hash. Confirm the
            # post-resolution shape.
            for node_hash, node in nodes_by_hash.items():
                seen_uuids = set()
                seen_rels = set()
                for rel_uuid, rel in node["relationships"].items():
                    # Sort keys so two equal relationships compare equal even
                    # if their dicts were built in a different key order.
                    rel_json = json.dumps(rel, sort_keys=True)
                    assert rel_uuid not in seen_uuids, (
                        f"duplicate relationship id {rel_uuid} on {label} node {node_hash}"
                    )
                    assert rel_json not in seen_rels, (
                        f"duplicate relationship payload on {label} node {node_hash}: {rel_json}"
                    )
                    # Store the raw id: the membership test above uses the raw
                    # id, so storing a JSON-quoted copy would never match.
                    seen_uuids.add(rel_uuid)
                    seen_rels.add(rel_json)
                    print(rel_uuid)

In [None]:
# Dump the relationships of every child whose type is "extracted".
for doc in docs:
    for section in doc.children:
        if section.type != "extracted":
            continue
        relationships = section.data["relationships"]
        print(json.dumps(relationships, indent=2))

In [5]:
# Same dump as the previous cell: print the relationships of every child whose
# type is "extracted". This needs `docs` from the extraction cell, so that
# cell has to run first in a fresh kernel.
# Earlier experiments that walked section["properties"]["nodes"] and the
# children of sections with "EXTRACTED_NODES" were removed as dead code.
for doc in docs:
    extracted_children = [child for child in doc.children if child.type == "extracted"]
    for section in extracted_children:
        print(json.dumps(section.data["relationships"], indent=2))

NameError: name 'docs' is not defined