In [None]:
import os
from pathlib import Path
from time import time
from typing import Optional

import boto3
from pyarrow import fs
from pyarrow.filesystem import FileSystem
from ray.data import ActorPoolStrategy

import sycamore
from sycamore.connectors.file.file_scan import JsonManifestMetadataProvider
from sycamore.functions import HuggingFaceTokenizer
from sycamore.llms import OpenAI, OpenAIModels
from sycamore.reader import DocSetReader
from sycamore.transforms import COALESCE_WHITESPACE
from sycamore.transforms.embed import SentenceTransformerEmbedder
from sycamore.transforms.merge_elements import GreedyTextElementMerger
from sycamore.transforms.partition import ArynPartitioner

In [None]:
def get_s3_fs():
    """Build a pyarrow ``S3FileSystem`` from the default boto3 session.

    The credentials and region are resolved by ``boto3.session.Session()`` and
    passed to pyarrow explicitly.

    Returns:
        A ``pyarrow.fs.S3FileSystem`` configured with the session's access key,
        secret key, session token and region.

    Raises:
        RuntimeError: If boto3 cannot find any AWS credentials.
    """
    from pyarrow.fs import S3FileSystem

    session = boto3.session.Session()
    credentials = session.get_credentials()
    if credentials is None:
        # get_credentials() yields None when nothing is configured; fail with an
        # actionable message instead of an AttributeError on `.secret_key`.
        raise RuntimeError(
            "No AWS credentials were found by boto3; configure credentials "
            "(environment variables, shared credentials file, or an IAM role) "
            "before creating the S3 filesystem."
        )

    # Take one frozen snapshot so the key, secret and token are guaranteed to
    # belong to the same credential set even if a refresh happens mid-read.
    frozen = credentials.get_frozen_credentials()

    # Deliberately not named `fs`: that would shadow the module-level
    # `from pyarrow import fs` inside this function.
    s3_filesystem = S3FileSystem(
        access_key=frozen.access_key,
        secret_key=frozen.secret_key,
        session_token=frozen.token,
        region=session.region_name,
    )
    return s3_filesystem

class ManifestReader(DocSetReader):
    """``DocSetReader`` whose binary input paths are taken from a metadata provider."""

    def binary(
        self,
        binary_format: str,
        parallelism: Optional[int] = None,
        filesystem: Optional[FileSystem] = None,
        metadata_provider: Optional[JsonManifestMetadataProvider] = None,
        file_range: Optional[list] = None,
        **resource_args
    ):
        """Read binary files listed by ``metadata_provider``, optionally a slice of them.

        Args:
            binary_format: Forwarded unchanged to ``DocSetReader.binary``.
            parallelism: Forwarded unchanged to ``DocSetReader.binary``.
            filesystem: Forwarded unchanged to ``DocSetReader.binary``.
            metadata_provider: Source of the file paths (``get_paths()``); also
                forwarded to ``DocSetReader.binary``. Required in practice even
                though the signature defaults it to ``None``.
            file_range: Optional two-element ``[start, stop]`` sequence used to
                slice the path list as ``paths[start:stop]``. ``None`` reads
                every path.
            **resource_args: Forwarded unchanged to ``DocSetReader.binary``.

        Returns:
            Whatever ``DocSetReader.binary`` returns for the selected paths.

        Raises:
            ValueError: If ``metadata_provider`` is ``None``.
        """
        if metadata_provider is None:
            # The paths come exclusively from the provider, so there is nothing
            # to read without one; say so instead of failing on `.get_paths()`.
            raise ValueError("ManifestReader.binary requires a metadata_provider to supply the file paths")

        paths = metadata_provider.get_paths()
        if file_range is not None:
            paths = paths[file_range[0]:file_range[1]]

        return super().binary(
            paths=paths,
            binary_format=binary_format,
            parallelism=parallelism,
            filesystem=filesystem,
            metadata_provider=metadata_provider,
            **resource_args
        )

In [None]:
# OpenSearch client settings used when writing the ingested documents.
# The host is chosen the same way as for OS_CLIENT_ARGS further down in this
# notebook ("/.dockerenv" present -> "opensearch", otherwise "localhost"), so
# ingestion and evaluation always talk to the same cluster.
# NOTE(review): the credentials are hard-coded and certificate verification is
# disabled; presumably acceptable only for a local demo cluster — confirm.
os_client_args = {
    "hosts": [
        {
            "host": "opensearch" if os.path.exists("/.dockerenv") else "localhost",
            "port": 9200,
        }
    ],
    "http_compress": True,
    "http_auth": ("admin", "admin"),
    "use_ssl": True,
    "verify_certs": False,
    "ssl_assert_hostname": False,
    "ssl_show_warn": False,
    "timeout": 120,
}

# Index-creation payload passed to the OpenSearch writer: k-NN is enabled on
# the index and the "embedding" field is mapped as a 384-dimensional
# knn_vector using HNSW (faiss engine, l2 space).
# NOTE(review): 384 presumably matches the output size of the embedding model
# configured elsewhere in this notebook — confirm if the model changes.
index_settings = {
    "body": {
        "settings": {"index.knn": True, "number_of_shards": 5, "number_of_replicas": 1},
        "mappings": {
            "properties": {
                "embedding": {
                    "dimension": 384,
                    "method": {"engine": "faiss", "space_type": "l2", "name": "hnsw", "parameters": {}},
                    "type": "knn_vector",
                },
            },
        },
    },
}

In [None]:
# Name of the OpenSearch index that the ingestion cell writes to.
index = "financebench-baseline"
# S3 prefix of the FinanceBench PDFs.
# NOTE(review): not referenced by any visible cell — the file paths appear to
# come from the manifest instead; confirm before removing.
s3_path = "s3://aryn-datasets-us-east-1/financebench/pdfs/"
manifest_path = "/Users/Documents/Aryn/manifest.json" # Note: AWS credentials were not working for the S3 manifest path, so this is a local file

# One Hugging Face model name is shared by the tokenizer and the embedder below.
hf_model = "sentence-transformers/all-MiniLM-L6-v2"

# NOTE(review): `openai_llm` is not used by any visible cell — confirm it is still needed.
openai_llm = OpenAI(OpenAIModels.GPT_4O.value)
tokenizer = HuggingFaceTokenizer(hf_model)
embedder = SentenceTransformerEmbedder(model_name=hf_model, batch_size=100)

In [None]:
# Times the construction of the per-batch pipelines below.
# NOTE(review): if DocSet transforms are evaluated lazily, this first timer
# covers only plan construction and the real work happens in the write loop —
# confirm before quoting this number.
start = time()

# [start, stop] slices into the manifest's path list, one per ingestion batch.
# Note: file number 24 hangs when trying to ingest, so it was temporarily removed
file_ranges = [[0, 15], [15, 30], [30, 45], [45, 60], [60, 75]]

# Loop-invariant inputs, built once: every batch reads the same manifest and
# get_s3_fs() would otherwise open a new boto3 session per batch.
metadata_provider = JsonManifestMetadataProvider(manifest_path)
s3_filesystem = get_s3_fs()

ds_list = []
for file_range in file_ranges:
    ctx = sycamore.init()
    reader = ManifestReader(ctx)
    ds = (
        reader.binary(
            binary_format="pdf",
            metadata_provider=metadata_provider,
            filesystem=s3_filesystem,
            file_range=file_range,
        )
        .partition(
            partitioner=ArynPartitioner(extract_table_structure=True, threshold=0.35, use_ocr=False),
            num_gpus=0.1,
            compute=ActorPoolStrategy(size=1),
        )
        .regex_replace(COALESCE_WHITESPACE)
        .merge(merger=GreedyTextElementMerger(tokenizer, 512))
        .spread_properties(["path"])
        .explode()
        .embed(embedder=embedder, num_gpus=0.1)
    )
    ds_list.append(ds)

end = time()
print(f"Took {(end - start) / 60} mins")

# Write each batch to the same OpenSearch index, timing every write separately.
for ds in ds_list:
    start = time()

    ds.write.opensearch(
        os_client_args=os_client_args,
        index_name=index,
        index_settings=index_settings,
    )

    end = time()
    print(f"Took {(end - start) / 60} mins")

In [None]:
import json
import os
from pathlib import Path
from typing import Any
import datasets

from datasets import Dataset

from sycamore.connectors.file.file_writer import JSONEncodeWithUserDict
from sycamore.data import Element
from sycamore.data.document import Document
from sycamore.evaluation import EvaluationDataPoint
from sycamore.evaluation.datasets import EvaluationDataSetReader
from sycamore.evaluation.pipeline import EvaluationPipeline
from sycamore.transforms.query import OpenSearchQueryExecutor

In [None]:
def _hf_to_qa_datapoint(datapoint: dict[str, Any]) -> dict[str, Any]:
    """Turn one Hugging Face dataset row into a serialized ``EvaluationDataPoint``.

    Args:
        datapoint: Row with at least the keys ``"question"``, ``"answer"``,
            ``"doc_link"`` and ``"evidence"``; each evidence item must carry an
            ``"evidence_page_num"``.

    Returns:
        ``{"doc": <serialized EvaluationDataPoint>}``. The data point holds the
        question, the ground-truth answer, a single ground-truth source element
        (document link plus evidence page numbers) and the untouched input row
        under ``"raw"``.

    Raises:
        KeyError: If a required key is missing.
        IndexError: If ``datapoint["evidence"]`` is empty.
    """
    qa_point = EvaluationDataPoint()

    evidence_pages = []
    for evidence_item in datapoint["evidence"]:
        evidence_pages.append(evidence_item["evidence_page_num"])

    qa_point.question = datapoint["question"]
    qa_point.ground_truth_answer = datapoint["answer"]

    # The first evidence page doubles as the element's single "page_number".
    source_properties = {
        "_location": datapoint["doc_link"],
        "page_number": evidence_pages[0],
        "page_numbers": evidence_pages,
    }
    qa_point.ground_truth_source_documents = [Element({"properties": source_properties})]

    qa_point["raw"] = datapoint
    return {"doc": qa_point.serialize()}

def get_subtask_results(document: Document) -> list[Document]:
    """Placeholder that is not implemented yet.

    The body is empty, so a call currently returns ``None`` rather than the
    ``list[Document]`` the annotation advertises.
    NOTE(review): no visible cell calls this function — implement or remove it.
    """
    pass

# Index queried by the evaluation pipeline.
INDEX = "financebench-baseline"

# The presence of /.dockerenv is taken as the sign that this notebook runs
# inside a container, where OpenSearch is reached as "opensearch".
opensearch_host = "opensearch" if os.path.exists("/.dockerenv") else "localhost"
if opensearch_host == "opensearch":
    print("Assuming we are in a sycamore jupyter container, using opensearch for opensearch host")
else:
    print("Assuming we are running outside of a container, using localhost for opensearch host")

# Connection settings for the OpenSearch client used by the query executor.
OS_CLIENT_ARGS = {
    "hosts": [
        {"host": opensearch_host, "port": 9200},
    ],
    "http_compress": True,
    "http_auth": ("admin", "admin"),
    "use_ssl": True,
    "verify_certs": False,
    "ssl_assert_hostname": False,
    "ssl_show_warn": False,
    "timeout": 120,
}

# Query-time options handed to the evaluation pipeline.
OS_CONFIG = {
    "size": 10,
    "neural_search_k": 200,
    "embedding_model_id": "7-KAu5EBeZTJZhOyaQFc",
    "search_pipeline": "hybrid_rag_pipeline",
    "llm": "gpt-4o",
    "context_window": "10",
}

In [None]:
output_path = "/Users/aanyapratapneni/Documents/Aryn/myresultdir/baseline.json" # Note: this is a local path to save results

# Create the results directory before the evaluation starts: a missing folder
# would otherwise only surface at the final write, after the whole run, and
# the results would be lost.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)

# Load the first 150 rows of the FinanceBench train split and convert each row
# into an evaluation data point.
context = sycamore.init()
reader = EvaluationDataSetReader(context)
hf_dataset = datasets.load_dataset("PatronusAI/financebench", split='train[0:150]')
input_docset = reader.huggingface(hf_dataset, doc_extractor=_hf_to_qa_datapoint)

# Experiment metadata saved alongside the per-query results.
data = {
    "experiment_name": "FinanceBench gpt-4o baseline",
    "description": "gpt-4o",
    "created_by": "aanyapratapneni",
    "index": INDEX,
    "os_client_args": OS_CLIENT_ARGS,
    "os_config": OS_CONFIG,
    "qa_path": ["huggingface: PatronusAI/financebench"]
}

pipeline = EvaluationPipeline(
    index=INDEX,
    os_config=OS_CONFIG,
    metrics=[],
    query_executor=OpenSearchQueryExecutor(OS_CLIENT_ARGS),
)

start = time()
# Only the first element of the pipeline's result is kept (the query-level output).
query_level_metrics = pipeline.execute(input_docset)[0]
data["query_level_data"] = query_level_metrics.take_all()
data["evaluation_time"] = f"{time() - start:.2f} seconds"
# Plain write mode: the file is only written, never read back.
with open(output_path, "w") as outfile:
    json.dump(data, outfile, cls=JSONEncodeWithUserDict)