## NTSB demo boilerplate (imports & utilities)

### Imports

In [2]:
from sycamore.data import Document, Table
from sycamore.functions import HuggingFaceTokenizer
from sycamore.llms import OpenAI, OpenAIModels
from sycamore.transforms.extract_schema import OpenAISchemaExtractor, OpenAIPropertyExtractor
from sycamore.transforms.extract_entity import OpenAIEntityExtractor
#from sycamore.transforms.merge_elements import GreedySectionMerger
from sycamore.transforms.partition import UnstructuredPdfPartitioner, SycamorePartitioner
from sycamore.transforms.embed import SentenceTransformerEmbedder
from sycamore.transforms.summarize_images import SummarizeImages
from sycamore.utils.pdf_utils import show_pages

from sycamore.data import BoundingBox, Document, Element, TableElement
from sycamore.functions.document import split_and_convert_to_image, DrawBoxes
import sycamore
import time
from pathlib import Path
import pickle
from dateutil import parser

from opensearchpy import OpenSearch

from sycamore.transforms.query import OpenSearchQueryExecutor
from sycamore.data import OpenSearchQuery
from sycamore.utils.time_trace import timetrace

import json

import os
import sys 

import PIL.Image
from io import BytesIO
from IPython.display import Image 
from IPython.display import display, HTML

import pprint

### Location standardization

In [5]:
# Lookup table from two-letter postal abbreviation to full name for the
# 50 U.S. states plus the District of Columbia (51 entries).
state_dict = {
    "AK": "Alaska",
    "AL": "Alabama",
    "AR": "Arkansas",
    "AZ": "Arizona",
    "CA": "California",
    "CO": "Colorado",
    "CT": "Connecticut",
    "DC": "District of Columbia",
    "DE": "Delaware",
    "FL": "Florida",
    "GA": "Georgia",
    "HI": "Hawaii",
    "IA": "Iowa",
    "ID": "Idaho",
    "IL": "Illinois",
    "IN": "Indiana",
    "KS": "Kansas",
    "KY": "Kentucky",
    "LA": "Louisiana",
    "MA": "Massachusetts",
    "MD": "Maryland",
    "ME": "Maine",
    "MI": "Michigan",
    "MN": "Minnesota",
    "MO": "Missouri",
    "MS": "Mississippi",
    "MT": "Montana",
    "NC": "North Carolina",
    "ND": "North Dakota",
    "NE": "Nebraska",
    "NH": "New Hampshire",
    "NJ": "New Jersey",
    "NM": "New Mexico",
    "NV": "Nevada",
    "NY": "New York",
    "OH": "Ohio",
    "OK": "Oklahoma",
    "OR": "Oregon",
    "PA": "Pennsylvania",
    "RI": "Rhode Island",
    "SC": "South Carolina",
    "SD": "South Dakota",
    "TN": "Tennessee",
    "TX": "Texas",
    "UT": "Utah",
    "VA": "Virginia",
    "VT": "Vermont",
    "WA": "Washington",
    "WI": "Wisconsin",
    "WV": "West Virginia",
    "WY": "Wyoming",
}

## standardize timestamps
def standardize_date(doc: Document) -> Document:
    """Add a ``day`` entry to the document's ``entity`` properties.

    If the entity has a ``dateAndTime`` key it is first renamed to
    ``dateTime``. The ``dateTime`` string has every occurrence of
    ``"Local"`` removed, is parsed with ``dateutil``'s ``parser.parse``,
    and the date portion of the result is stored under ``entity['day']``.

    Args:
        doc: Document whose ``properties['entity']`` mapping is updated
            in place.

    Returns:
        The same ``doc`` object, whether or not a date could be derived.
    """
    try:
        entity = doc.properties['entity']

        # Normalize the alternate key name onto 'dateTime'.
        if 'dateAndTime' in entity:
            entity['dateTime'] = entity.pop('dateAndTime')

        cleaned = entity['dateTime'].replace("Local", "")
        entity['day'] = parser.parse(cleaned).date()
    except Exception:
        # Best effort: the date was not extracted properly, so leave the
        # document as it is (a completed key rename above is kept).
        pass

    return doc


def standardize_location(doc: Document) -> Document:
    """Rewrite ``entity['location']`` as ``"<city>, <full state name>"``.

    The location is split on its LAST comma, so a value with several
    commas (e.g. ``"Near Anchorage, Cook Inlet, AK"``) is still handled.
    Both parts are stripped of surrounding whitespace before being joined
    again, so an unrecognized state no longer produces a double space
    (``"Toronto,  Ontario"``).

    Args:
        doc: Document whose ``properties['entity']`` mapping is updated
            in place.

    Returns:
        The same ``doc`` object. It is left untouched when there is no
        ``location`` entry, when the value is not a string, or when the
        value contains no comma.

    Raises:
        KeyError: If ``doc.properties`` has no ``'entity'`` entry.
    """
    entity = doc.properties['entity']
    if "location" not in entity:
        return doc

    raw_loc = entity['location']
    if not isinstance(raw_loc, str):
        # Location was not extracted properly; don't do anything.
        return doc

    city, comma, state = raw_loc.rpartition(',')
    if not comma:
        # No "city, state" structure to standardize.
        return doc

    # standardize_state hands back its input unchanged for unknown states,
    # so strip here to keep exactly one space after the comma.
    entity['location'] = f"{city.strip()}, {standardize_state(state).strip()}"
    return doc
    

def standardize_state(state: str) -> str:
    """Expand a two-letter state abbreviation to its full name.

    The lookup ignores surrounding whitespace and letter case.

    Args:
        state: Candidate abbreviation, e.g. ``" tx"``.

    Returns:
        The full name from ``state_dict`` when the cleaned abbreviation is
        known; otherwise the original ``state`` argument, unmodified.
    """
    abbreviation = state.strip().upper()
    return state_dict.get(abbreviation, state)


### Utils

In [1]:
# Path to the TrueType font used for the labels in the visual representation.
font_path = "/usr/share/fonts/truetype/liberation/LiberationSans-Regular.ttf"

## pickling utilities
def wall_time(fn):
    """Call ``fn`` with no arguments and print how long it took.

    Example: ``wall_time(lambda: time.sleep(1))``

    Timing uses ``time.perf_counter_ns``, a monotonic high-resolution
    clock, so the reported duration is not distorted if the system clock
    is adjusted while ``fn`` runs.

    Args:
        fn: Zero-argument callable to run and time.

    Returns:
        Whatever ``fn`` returns, so the call can be timed without losing
        its result.
    """
    start = time.perf_counter_ns()
    result = fn()
    elapsed_ns = time.perf_counter_ns() - start
    print("Elapsed time:", elapsed_ns / 1.0e9)
    return result
    
def pickle_doc(doc: Document) -> bytes:
    """Serialize ``doc`` to bytes with ``pickle.dumps`` (default protocol)."""
    serialized = pickle.dumps(doc)
    return serialized

def pickle_name(doc: Document, extension = None):
    """Build the output filename for a pickled document.

    Args:
        doc: Document whose ``doc_id`` becomes the base of the filename.
        extension: Accepted but not used; the suffix is always ``.pickle``.

    Returns:
        ``"<doc_id>.pickle"`` as a string.
    """
    return f"{doc.doc_id}.pickle"

def unpickle_doc(pdoc: Document) -> list[Document]:
    """Restore a pickled document from ``pdoc.binary_representation``.

    Args:
        pdoc: Document whose ``binary_representation`` holds pickle bytes.

    Returns:
        A single-element list with the restored document after it has been
        passed through ``convert_schema``.
    """
    # SECURITY: pickle.loads can execute arbitrary code; only use this on
    # pickle files from a trusted source.
    restored = pickle.loads(pdoc.binary_representation)
    # NOTE(review): convert_schema is not defined or imported in the visible
    # part of this file -- confirm it is in scope before this runs.
    return [convert_schema(restored)]

def write_out_docset(pickle_root, docset):
    """Write ``docset`` under ``pickle_root`` as pickle files and print the elapsed time.

    Args:
        pickle_root: Destination passed to ``docset.write.files``.
        docset: DocSet to write; documents are serialized with
            ``pickle_doc`` and named with ``pickle_name``.
    """
    def _write():
        return docset.write.files(pickle_root, doc_to_bytes_fn=pickle_doc, filename_fn=pickle_name)

    wall_time(_write)

def read_in_docset(pickle_root, ctx):
    """Read pickle files under ``pickle_root`` back into a DocSet.

    Args:
        pickle_root: Location of the pickle files; converted with ``str()``
            before being handed to ``ctx.read.binary``.
        ctx: Context object providing ``read.binary``.

    Returns:
        The DocSet produced by flat-mapping ``unpickle_doc`` over the
        binary documents.
    """
    raw_docs = ctx.read.binary(str(pickle_root), binary_format="pickle")
    restored_docs = raw_docs.flat_map(unpickle_doc)

    # Count the documents under the timer so the elapsed time is printed.
    def _count():
        return restored_docs.count()

    wall_time(_count)
    return restored_docs


## image generation and saving to disk
def image_page_filename(doc: Document):
    """Build the PNG filename for one page of a source document.

    The base name is the final path component of ``doc.properties["path"]``
    without its last suffix (``Path.stem``). A file name that has no
    extension is kept whole rather than collapsing to an empty string, so
    ``"report"`` yields ``"report_page_3.png"`` instead of ``"_page_3.png"``.

    Args:
        doc: Document with ``"path"`` and ``"page_number"`` properties.

    Returns:
        ``"<base name>_page_<page number>.png"``.
    """
    base_name = Path(doc.properties["path"]).stem
    page_num = doc.properties["page_number"]
    return f"{base_name}_page_{page_num}.png"


def list_files(directory):
    """Return the paths of the regular files directly inside ``directory``.

    Subdirectories are skipped and not descended into. Each returned path
    is ``directory`` joined with the entry name, in ``os.listdir`` order.

    Args:
        directory: Directory to scan.

    Returns:
        A list of file paths.
    """
    files = []
    for entry in os.listdir(directory):
        candidate = os.path.join(directory, entry)
        # Keep files only; anything else (e.g. a directory) is ignored.
        if os.path.isfile(candidate):
            files.append(candidate)
    return files

def enumerate_images_and_tables(m_pages: list[Document]):
    """Print image summaries and render tables for each page.

    For every page the path and page number are printed. Then, per element:
    an ``"Image"`` element has its ``summary`` property printed, and a
    ``"table"`` element has its table rendered as HTML via ``display``.

    Args:
        m_pages: Page documents with ``path`` and ``page_number`` properties.
    """
    for page in m_pages:
        page_props = page.properties
        print("Path: ", page_props['path'], "Page: ", page_props['page_number'])
        for element in page.elements:
            if element.type == "Image":
                print("Image summary: ", element.properties['summary'], "\n")
                print()
            if element.type == "table":
                table_html = element.table.to_html()
                display(HTML(table_html))
                print()

def display_page_and_table_properties(some_pages: list[Document]):
    """Show each page's entity properties followed by its table elements.

    For every page the ``entity`` property is displayed. Each element whose
    type contains ``"table"`` then has its type and JSON-formatted
    properties printed, and its ``text_representation`` rendered as HTML.

    Args:
        some_pages: Page documents that carry an ``entity`` property.
    """
    for page in some_pages:
        print("Page props: ")
        display(page.properties['entity'])
        print()

        # Lazily select the table-like elements, preserving element order.
        table_elements = (el for el in page.elements if "table" in el.type)
        for el in table_elements:
            print("Element Type: ", el.type)
            print("Element Properties: ", json.dumps(el.properties, indent=2))
            display(HTML(el.text_representation))

NameError: name 'Path' is not defined

In [29]:
@timetrace("LLMGen")
def llm_generate_with_retries(llm, prompt_kwargs, llm_kwargs, max_retries=5):
    """Call ``llm.generate`` and parse its content as JSON, retrying on failure.

    Any exception (from the LLM call or from JSON parsing) is printed and
    the call is tried again, up to ``max_retries`` attempts in total.

    Args:
        llm: Object exposing ``generate(prompt_kwargs=..., llm_kwargs=...)``
            whose result has a ``content`` attribute.
        prompt_kwargs: Forwarded to ``llm.generate``.
        llm_kwargs: Forwarded to ``llm.generate``.
        max_retries: Maximum number of attempts.

    Returns:
        The value decoded from the response by ``json.loads``. ``None`` if
        ``max_retries`` is zero or negative, since no attempt is made.

    Raises:
        Exception: The error from the final attempt, once all attempts fail.
    """
    attempts_left = max_retries
    while attempts_left > 0:
        attempts_left -= 1
        try:
            response = llm.generate(prompt_kwargs=prompt_kwargs, llm_kwargs=llm_kwargs)
            return json.loads(response.content)
        except Exception as err:
            print(err)
            # Out of attempts: surface the last failure to the caller.
            if attempts_left == 0:
                raise err
                
@timetrace("ExtractProp")
def extract_table_as_properties(doc: Document) -> Document:
    """For table elements, ask the LLM to extract a JSON-formatted key-value object from the table's CSV string.

    Each element of type ``"table"`` that has a table is converted to CSV,
    appended to the prompt and sent to the LLM through
    ``llm_generate_with_retries``. A non-empty result is merged into that
    element's properties. The properties of the first such table are then
    copied to ``doc.properties["entity"]`` as document-level properties.

    Relies on a module-level ``llm`` object that is not defined in this
    function.

    Args:
        doc: Document whose table elements are processed in place.

    Returns:
        The same ``doc`` object, always. If the document has no elements
        or no usable table, ``doc.properties["entity"]`` is not set.

    Raises:
        Exception: Whatever ``llm_generate_with_retries`` raises after its
            final failed attempt.
    """
    PROMPT = """
    You are given a csv representing either a single column, or multi-column table.
    Instructions:
    1. Parse the table and return a flattened JSON object representing the key-value pairs of properties defined in the table.
    2. Do not return nested objects, keep the dictionary only 1 level deep. The only valid value types are numbers, strings, and lists.
    3. If you find multiple fields defined in a row, feel free to split them into separate properties.
    4. Use camelCase for the key names
    5. For fields where the values are in standard measurement units like miles, nautical miles, knots, celsius
       - include the unit in the key name and only set the numeric value as the value.
       - e.g. "Wind Speed: 9 knots" should become windSpeedInKnots: 9, "Temperature: 3°C" should become temperatureInC: 3
    """
    llm_kwargs = {
        "response_format": {"type": "json_object"}
    }
    sys.stderr.write(doc.properties['path'])
    if not doc.elements:
        # Return the document itself rather than None so the declared
        # return type holds and callers always get a Document back.
        return doc

    # We are going to use the first table's properties as document-level properties.
    top_level_table = None
    for element in doc.elements:
        if element.type != "table" or element.table is None:
            continue
        # Identity check: do not depend on the element's truthiness.
        if top_level_table is None:
            top_level_table = element

        prompt_kwargs = {
            "prompt": PROMPT + "\n" + element.table.to_csv()
        }
        new_props = llm_generate_with_retries(llm, prompt_kwargs, llm_kwargs, max_retries=5)
        # An empty result adds nothing; no placeholder properties are written.
        if new_props:
            element.properties.update(new_props)

    # A document without any usable table has nothing to promote.
    if top_level_table is not None:
        doc.properties["entity"] = top_level_table.properties.copy()
    return doc

In [None]:
# Set the TIMETRACE environment variable for this process.
# NOTE(review): presumably read by sycamore's timetrace utility to decide
# where trace output goes -- confirm.
os.environ["TIMETRACE"] = "/tmp/_ntsb_demo_jp"