Unstructured-IO · Coniferish · Oct 24, 2024 · Oct 24, 2024 · Oct 15, 2024 · Oct 15, 2024
diff --git a/.gitignore b/.gitignore
@@ -12,3 +12,4 @@ __pycache__/
 .idea/
 openapi.json
 openapi_client.json
+.env
diff --git a/.speakeasy/gen.lock b/.speakeasy/gen.lock
@@ -1,10 +1,10 @@
 lockVersion: 2.0.0
 id: 8b5fa338-9106-4734-abf0-e30d67044a90
 management:
-  docChecksum: 21f469b38bb72725739ee9d9d0fc8780
-  docVersion: 1.0.51
-  speakeasyVersion: 1.424.0
-  generationVersion: 2.445.1
+  docChecksum: 80b2dc9fb0c56267e34c1679522a1794
+  docVersion: 1.0.52
+  speakeasyVersion: 1.421.0
+  generationVersion: 2.438.15
   releaseVersion: 0.27.0
   configChecksum: 6ece96f34cb076ad455a9c66b68c30b0
   repoURL: https://github.com/Unstructured-IO/unstructured-python-client.git
@@ -13,6 +13,7 @@ management:
   published: true
 features:
   python:
+    acceptHeaders: 3.0.0
     additionalDependencies: 1.0.0
     constsAndDefaults: 1.0.4
     core: 5.6.1
@@ -107,6 +108,7 @@ examples:
       responses:
         "200":
           application/json: [{"type": "Title", "element_id": "6aa0ff22f91bbe7e26e8e25ca8052acd", "text": "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis", "metadata": {"languages": ["eng"], "page_number": 1, "filename": "layout-parser-paper.pdf", "filetype": "application/pdf"}}]
+          text/csv: "<value>"
         "422":
           application/json: {"detail": []}
         5XX:

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,6 +6,7 @@
 
 ### Fixes
 * Use the configured server_url for our split page "dummy" request
+* Handle the `text/csv` output format: when `output_format` is `text/csv`, the CSV payload is returned as a string in `csv_elements` and `elements` is `None`
 
 ## 0.26.0
 

diff --git a/Makefile b/Makefile
@@ -77,6 +77,15 @@ client-generate-local:
 	speakeasy overlay apply -s ./openapi.json -o ./overlay_client.yaml > ./openapi_client.json
 	speakeasy generate sdk -s ./openapi_client.json -o ./ -l python
 
+## client-generate-localhost:	Generate the SDK using the openapi.json from the unstructured-api running at localhost:5000
+.PHONY: client-generate-localhost
+client-generate-localhost:
+	curl -o openapi.json http://localhost:5000/general/openapi.json || { echo "Failed to download openapi.json"; exit 1; }
+	speakeasy overlay validate -o ./overlay_client.yaml
+	speakeasy overlay apply -s ./openapi.json -o ./overlay_client.yaml > ./openapi_client.json
+	python3 -c 'import sys, yaml, json; sys.stdout.write(json.dumps(yaml.safe_load(sys.stdin), indent=2))' < ./openapi_client.json > temp.json && mv temp.json ./openapi_client.json
+	speakeasy generate sdk -s ./openapi_client.json -o ./ -l python
+
 .PHONY: publish
 publish:
 	./scripts/publish.sh

diff --git a/_test_unstructured_client/conftest.py b/_test_unstructured_client/conftest.py
@@ -0,0 +1,19 @@
+from __future__ import annotations
+
+import os
+from pathlib import Path
+from typing import Generator
+import pytest
+
+from unstructured_client.sdk import UnstructuredClient
+
+
+@pytest.fixture(scope="module")
+def client() -> Generator[UnstructuredClient, None, None]:
+    _client = UnstructuredClient(api_key_auth=os.getenv("UNSTRUCTURED_API_KEY"), server='free-api')
+    yield _client
+
+
+@pytest.fixture(scope="module")
+def doc_path() -> Path:
+    return Path(__file__).resolve().parents[1] / "_sample_docs"
diff --git a/_test_unstructured_client/integration/test_decorators.py b/_test_unstructured_client/integration/test_decorators.py
@@ -15,13 +15,41 @@
 from unstructured_client import UnstructuredClient
 from unstructured_client.models import shared, operations
 from unstructured_client.models.errors import HTTPValidationError
+from unstructured_client.models.shared.partition_parameters import OutputFormat
 from unstructured_client.utils.retries import BackoffStrategy, RetryConfig
 from unstructured_client._hooks.custom import form_utils
 from unstructured_client._hooks.custom import split_pdf_hook
 
 FAKE_KEY = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
 
 
+@pytest.mark.parametrize("split_pdf_page", [True, False])
+def test_integration_split_csv_response(split_pdf_page, client, doc_path):
+    filename = "layout-parser-paper.pdf"
+    with open(doc_path / filename, "rb") as f:
+        files = shared.Files(
+            content=f.read(),
+            file_name=filename,
+        )
+    req = operations.PartitionRequest(
+        partition_parameters=shared.PartitionParameters(
+            files=files,
+            output_format=OutputFormat.TEXT_CSV,
+            split_pdf_page=split_pdf_page,
+        )
+    )
+
+    resp = client.general.partition(request=req)
+
+    assert resp.status_code == 200
+    assert resp.content_type == "text/csv; charset=utf-8"
+    assert resp.elements is None
+    assert resp.csv_elements is not None
+    assert resp.csv_elements.startswith(
+        "type,element_id,text,filetype,languages,page_number,filename,parent_id"
+    )
+
+
 @pytest.mark.parametrize("concurrency_level", [1, 2, 5])
 @pytest.mark.parametrize(
     ("filename", "expected_ok", "strategy"),
@@ -40,10 +68,10 @@ def test_integration_split_pdf_has_same_output_as_non_split(
     concurrency_level: int, filename: str, expected_ok: bool, strategy: str
 ):
     """
-    Tests that output that we get from the split-by-page pdf is the same as from non-split.
+    Test that the output we get from the split-by-page pdf is the same as from non-split.
 
     Requires unstructured-api running in bg. See Makefile for how to run it.
-    Doesn't check for raw_response as there's no clear patter for how it changes with the number of pages / concurrency_level.
+    Doesn't check for raw_response as there's no clear pattern for how it changes with the number of pages / concurrency_level.
     """
     try:
         response = requests.get("http://localhost:8000/general/docs")

diff --git a/_test_unstructured_client/integration/test_integration_freemium.py b/_test_unstructured_client/integration/test_integration_freemium.py
@@ -3,27 +3,16 @@
 import asyncio
 import json
 import os
-from pathlib import Path
 
 import pytest
 from deepdiff import DeepDiff
+
 from unstructured_client import UnstructuredClient
 from unstructured_client.models import shared, operations
 from unstructured_client.models.errors import SDKError, ServerError, HTTPValidationError
 from unstructured_client.utils.retries import BackoffStrategy, RetryConfig
 
 
-@pytest.fixture(scope="module")
-def client() -> UnstructuredClient:
-    _client = UnstructuredClient(api_key_auth=os.getenv("UNSTRUCTURED_API_KEY"), server='free-api')
-    yield _client
-
-
-@pytest.fixture(scope="module")
-def doc_path() -> Path:
-    return Path(__file__).resolve().parents[2] / "_sample_docs"
-
-
 @pytest.mark.parametrize("split_pdf", [True, False])
 @pytest.mark.parametrize("strategy", ["fast", "ocr_only", "hi_res"])
 def test_partition_strategies(split_pdf, strategy, client, doc_path):

diff --git a/_test_unstructured_client/unit/test_request_utils.py b/_test_unstructured_client/unit/test_request_utils.py
@@ -1,9 +1,10 @@
-# Get unit tests for request_utils.py module
+from __future__ import annotations
+
 import httpx
+import json
 import pytest
 
-from unstructured_client._hooks.custom.request_utils import create_pdf_chunk_request_params, get_multipart_stream_fields
-from unstructured_client.models import shared
+from unstructured_client._hooks.custom.request_utils import create_pdf_chunk_request_params, create_response, get_multipart_stream_fields
 
 
 # make the above test using @pytest.mark.parametrize
@@ -30,6 +31,7 @@ def test_get_multipart_stream_fields(input_request, expected):
     fields = get_multipart_stream_fields(input_request)
     assert fields == expected
 
+
 def test_multipart_stream_fields_raises_value_error_when_filename_is_not_set():
     with pytest.raises(ValueError):
         get_multipart_stream_fields(httpx.Request(
@@ -40,6 +42,7 @@ def test_multipart_stream_fields_raises_value_error_when_filename_is_not_set():
             headers={"Content-Type": "multipart/form-data; boundary=----WebKitFormBoundary7MA4YWxkTrZu0gW"}),
         )
 
+
 @pytest.mark.parametrize(("input_form_data", "page_number", "expected_form_data"), [
     (
             {"hello": "world"},
@@ -70,3 +73,26 @@ def test_multipart_stream_fields_raises_value_error_when_filename_is_not_set():
 def test_create_pdf_chunk_request_params(input_form_data, page_number, expected_form_data):
     form_data = create_pdf_chunk_request_params(input_form_data, page_number)
     assert form_data == expected_form_data
+
+
+def test_create_response_for_json():
+    elements = [
+        {"type": "Title", "text": "Hello, World!"},
+        {"type": "NarrativeText", "text": "Goodbye!"},
+    ]
+    response = create_response(elements)
+    assert response.status_code == 200
+    assert response.json() == elements
+    assert response.headers["Content-Type"] == "application/json"
+
+
+def test_create_response_for_csv():
+    elements = [
+        b'type,element_id,text,languages,page_number,filename,filetype,parent_id' \
+        b'\nTitle,f73329878fbbb0bb131a83e7b6daacbe,Module One - Introduction to Product' \
+        b' Development and Quality Assurance,[\'eng\'],1,list-item-example-1.pdf,application/pdf,'
+    ]
+    response = create_response(elements)
+    assert response.status_code == 200
+    pytest.raises(json.decoder.JSONDecodeError, response.json)
+    assert response.headers["Content-Type"] == "text/csv; charset=utf-8"
diff --git a/_test_unstructured_client/unit/test_split_pdf_hook.py b/_test_unstructured_client/unit/test_split_pdf_hook.py
@@ -1,17 +1,13 @@
 from __future__ import annotations
 
 import asyncio
-import io
-import logging
 from asyncio import Task
 from collections import Counter
 from functools import partial
-from typing import Coroutine
 
-import httpx
 import pytest
 import requests
-from requests_toolbelt import MultipartDecoder, MultipartEncoder
+from requests_toolbelt import MultipartDecoder
 
 from unstructured_client._hooks.custom import form_utils, pdf_utils, request_utils
 from unstructured_client._hooks.custom.form_utils import (

diff --git a/docs/models/operations/partitionresponse.md b/docs/models/operations/partitionresponse.md
@@ -8,4 +8,5 @@
 | `content_type`                                               | *str*                                                        | :heavy_check_mark:                                           | HTTP response content type for this operation                |
 | `status_code`                                                | *int*                                                        | :heavy_check_mark:                                           | HTTP response status code for this operation                 |
 | `raw_response`                                               | [httpx.Response](https://www.python-httpx.org/api/#response) | :heavy_check_mark:                                           | Raw HTTP response; suitable for custom response parsing      |
+| `csv_elements`                                               | *Optional[str]*                                              | :heavy_minus_sign:                                           | Successful Response as CSV text (for `text/csv` responses)   |
 | `elements`                                                   | List[Dict[str, *Any*]]                                       | :heavy_minus_sign:                                           | Successful Response                                          |
diff --git a/docs/models/shared/strategy.md b/docs/models/shared/strategy.md
@@ -10,4 +10,5 @@ The strategy to use for partitioning PDF/image. Options are fast, hi_res, auto.
 | `FAST`     | fast       |
 | `HI_RES`   | hi_res     |
 | `AUTO`     | auto       |
-| `OCR_ONLY` | ocr_only   |
+| `OCR_ONLY` | ocr_only   |
+| `OD_ONLY`  | od_only    |
diff --git a/src/unstructured_client/_hooks/custom/request_utils.py b/src/unstructured_client/_hooks/custom/request_utils.py
@@ -4,7 +4,7 @@
 import io
 import json
 import logging
-from typing import Tuple, Any, BinaryIO
+from typing import Any, BinaryIO, Tuple
 
 import httpx
 from httpx._multipart import DataField, FileField
@@ -207,7 +207,8 @@ def prepare_request_headers(
     new_headers.pop("Content-Length", None)
     return new_headers
 
-def create_response(elements: list) -> httpx.Response:
+
+def create_response(elements: list[dict[str, Any] | bytes]) -> httpx.Response:
     """
     Creates a modified response object with updated content.
 
@@ -218,8 +219,12 @@ def create_response(elements: list) -> httpx.Response:
     Returns:
         The modified response object with updated content.
     """
-    response = httpx.Response(status_code=200, headers={"Content-Type": "application/json"})
-    content = json.dumps(elements).encode()
+    if isinstance(elements, list) and all(isinstance(element, bytes) for element in elements):
+        response = httpx.Response(status_code=200, headers={"Content-Type": "text/csv; charset=utf-8"})
+        content = b''.join(elements) # type: ignore
+    else:
+        response = httpx.Response(status_code=200, headers={"Content-Type": "application/json"})
+        content = json.dumps(elements).encode()
     content_length = str(len(content))
     response.headers.update({"Content-Length": content_length})
     setattr(response, "_content", content)

diff --git a/src/unstructured_client/_hooks/custom/split_pdf_hook.py b/src/unstructured_client/_hooks/custom/split_pdf_hook.py
@@ -576,10 +576,13 @@ def _await_elements(
                     response_number,
                 )
                 successful_responses.append(res)
-                if self.cache_tmp_data_feature:
-                    elements.append(load_elements_from_response(res))
-                else:
-                    elements.append(res.json())
+                if res.headers["Content-Type"] == "application/json":
+                    if self.cache_tmp_data_feature:
+                        elements.append(load_elements_from_response(res))
+                    else:
+                        elements.append(res.json())
+                else:  # -- Response contains csv data
+                    elements.append(res.content)  # type: ignore
             else:
                 error_message = f"Failed to partition set {response_number}."
 
@@ -591,7 +594,12 @@ def _await_elements(
 
         self.api_successful_responses[operation_id] = successful_responses
         self.api_failed_responses[operation_id] = failed_responses
-        flattened_elements = [element for sublist in elements for element in sublist]
+        flattened_elements = []
+        for sublist in elements:
+            if isinstance(sublist, list):
+                flattened_elements.extend(sublist)
+            else:
+                flattened_elements.append(sublist)
         return flattened_elements
 
     def after_success(
@@ -613,7 +621,6 @@ def after_success(
         """
         # Grab the correct id out of the dummy request
         operation_id = response.request.headers.get("operation_id")
-
         elements = self._await_elements(operation_id)
 
         # if fails are disallowed, return the first failed response
-Original file line number
+Diff line change
@@ Expand Up / @@ -12,3 +12,4 @@ __pycache__/ @@
     .idea/
     openapi.json
     openapi_client.json
+    .env