From 4c7e037cb09b53a5ad9d3a9f337f804add3baeec Mon Sep 17 00:00:00 2001 From: John <43506685+Coniferish@users.noreply.github.com> Date: Tue, 26 Nov 2024 13:30:22 -0500 Subject: [PATCH 1/2] add test for chunking strategies --- .gitignore | 1 + .../integration/test_decorators.py | 43 ++++++++++++++++++- 2 files changed, 43 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 6de83ed8..4c6c70d9 100755 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,4 @@ __pycache__/ .idea/ openapi.json openapi_client.json +.env diff --git a/_test_unstructured_client/integration/test_decorators.py b/_test_unstructured_client/integration/test_decorators.py index e1cc73e0..13a8214c 100644 --- a/_test_unstructured_client/integration/test_decorators.py +++ b/_test_unstructured_client/integration/test_decorators.py @@ -5,6 +5,7 @@ import httpx import json +import os import pytest import requests from deepdiff import DeepDiff @@ -19,7 +20,7 @@ from unstructured_client._hooks.custom import form_utils from unstructured_client._hooks.custom import split_pdf_hook -FAKE_KEY = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" +FAKE_KEY = os.getenv("UNSTRUCTURED_API_KEY") or "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" @pytest.mark.parametrize("concurrency_level", [1, 2, 5]) @@ -472,3 +473,43 @@ async def mock_send(_, request: httpx.Request, **kwargs): assert mock_endpoint_called assert res.status_code == 200 + + +@pytest.mark.parametrize( + ("filename", "chunking_strategy", "expected_elements_num"), + [ + ## -- Paid strategy -- + ("_sample_docs/layout-parser-paper.pdf", "by_page", 16), # 16 pages, 133 elements w/o chunking + ("_sample_docs/layout-parser-paper.pdf", shared.ChunkingStrategy.BY_PAGE, 16), + # -- Open source strategy -- + ("_sample_docs/layout-parser-paper.pdf", "by_title", -1), # unsure what the correct number is atm + ("_sample_docs/layout-parser-paper.pdf", shared.ChunkingStrategy.BY_TITLE, -1), + ], +) +def test_chunking( + filename: str, + chunking_strategy: str| shared.ChunkingStrategy, + expected_elements_num: int, +): + + client = UnstructuredClient(api_key_auth=FAKE_KEY) + + with open(filename, "rb") as f: + files = shared.Files( + content=f.read(), + file_name=filename, + ) + + parameters = shared.PartitionParameters( + files=files, + chunking_strategy=chunking_strategy, # type: ignore + ) + + req = operations.PartitionRequest( + partition_parameters=parameters + ) + + resp = client.general.partition(request=req) + assert len(resp.elements) == expected_elements_num + assert all(element.type == "CompositeElement" for element in resp.elements) + From c65e1b18f24af4c86702a39d825dc4ce2eb5f357 Mon Sep 17 00:00:00 2001 From: John <43506685+Coniferish@users.noreply.github.com> Date: Tue, 26 Nov 2024 13:53:40 -0500 Subject: [PATCH 2/2] test if splitting is the issue --- _test_unstructured_client/integration/test_decorators.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/_test_unstructured_client/integration/test_decorators.py b/_test_unstructured_client/integration/test_decorators.py index 13a8214c..732df1a9 100644 --- a/_test_unstructured_client/integration/test_decorators.py +++ b/_test_unstructured_client/integration/test_decorators.py @@ -478,7 +478,7 @@ async def mock_send(_, request: httpx.Request, **kwargs): @pytest.mark.parametrize( ("filename", "chunking_strategy", "expected_elements_num"), [ - ## -- Paid strategy -- + # -- Paid strategy -- ("_sample_docs/layout-parser-paper.pdf", "by_page", 16), # 16 pages, 133 elements w/o chunking ("_sample_docs/layout-parser-paper.pdf", shared.ChunkingStrategy.BY_PAGE, 16), # -- Open source strategy -- @@ -503,6 +503,7 @@ def test_chunking( parameters = shared.PartitionParameters( files=files, chunking_strategy=chunking_strategy, # type: ignore + split_pdf_page=False, # -- Testing splitting as potential issue ) req = operations.PartitionRequest( @@ -511,5 +512,5 @@ def test_chunking( resp = client.general.partition(request=req) assert len(resp.elements) == expected_elements_num - assert all(element.type == "CompositeElement" for element in resp.elements) + assert all(element.get("type") == "CompositeElement" for element in resp.elements)