Skip to content

Jj/klaijan/csv #200

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 17 commits into
base: main
Choose a base branch
from
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -12,3 +12,4 @@ __pycache__/
.idea/
openapi.json
openapi_client.json
.env
10 changes: 6 additions & 4 deletions .speakeasy/gen.lock
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
lockVersion: 2.0.0
id: 8b5fa338-9106-4734-abf0-e30d67044a90
management:
docChecksum: 21f469b38bb72725739ee9d9d0fc8780
docVersion: 1.0.51
speakeasyVersion: 1.424.0
generationVersion: 2.445.1
docChecksum: 80b2dc9fb0c56267e34c1679522a1794
docVersion: 1.0.52
speakeasyVersion: 1.421.0
generationVersion: 2.438.15
releaseVersion: 0.27.0
configChecksum: 6ece96f34cb076ad455a9c66b68c30b0
repoURL: https://github.com/Unstructured-IO/unstructured-python-client.git
@@ -13,6 +13,7 @@ management:
published: true
features:
python:
acceptHeaders: 3.0.0
additionalDependencies: 1.0.0
constsAndDefaults: 1.0.4
core: 5.6.1
@@ -107,6 +108,7 @@ examples:
responses:
"200":
application/json: [{"type": "Title", "element_id": "6aa0ff22f91bbe7e26e8e25ca8052acd", "text": "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis", "metadata": {"languages": ["eng"], "page_number": 1, "filename": "layout-parser-paper.pdf", "filetype": "application/pdf"}}]
text/csv: "<value>"
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TODO: Figure out how to get the example from the openapi spec in here

"422":
application/json: {"detail": []}
5XX:
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -6,6 +6,7 @@

### Fixes
* Use the configured server_url for our split page "dummy" request
* Handle the `text/csv` output format: when `output_format` is set to `text/csv`, the response is returned as CSV text in `csv_elements` instead of as a list of JSON elements

## 0.26.0

9 changes: 9 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -77,6 +77,15 @@ client-generate-local:
speakeasy overlay apply -s ./openapi.json -o ./overlay_client.yaml > ./openapi_client.json
speakeasy generate sdk -s ./openapi_client.json -o ./ -l python

## client-generate-localhost: Generate the SDK using the openapi.json from the unstructured-api running at localhost:5000
# Steps: download the spec, validate and apply the client overlay, re-serialize
# the overlay output as indented JSON (it is parsed with yaml.safe_load, so the
# python3 step needs the `yaml` module to be importable), then generate the
# Python SDK.
# curl runs with --fail so that an HTTP error status aborts the target instead
# of silently saving the server's error body as openapi.json.
.PHONY: client-generate-localhost
client-generate-localhost:
	curl --fail -o openapi.json http://localhost:5000/general/openapi.json || { echo "Failed to download openapi.json"; exit 1; }
	speakeasy overlay validate -o ./overlay_client.yaml
	speakeasy overlay apply -s ./openapi.json -o ./overlay_client.yaml > ./openapi_client.json
	python3 -c 'import sys, yaml, json; sys.stdout.write(json.dumps(yaml.safe_load(sys.stdin), indent=2))' < ./openapi_client.json > temp.json && mv temp.json ./openapi_client.json
	speakeasy generate sdk -s ./openapi_client.json -o ./ -l python

.PHONY: publish
publish:
./scripts/publish.sh
19 changes: 19 additions & 0 deletions _test_unstructured_client/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from __future__ import annotations

import os
from pathlib import Path
from typing import Generator
import pytest

from unstructured_client.sdk import UnstructuredClient


@pytest.fixture(scope="module")
def client() -> Generator[UnstructuredClient, None, None]:
    """Yield one ``UnstructuredClient`` shared by every test in a module.

    The client is built with ``server='free-api'`` and takes its API key from
    the ``UNSTRUCTURED_API_KEY`` environment variable (``None`` when unset).
    """
    api_key = os.getenv("UNSTRUCTURED_API_KEY")
    yield UnstructuredClient(api_key_auth=api_key, server='free-api')


@pytest.fixture(scope="module")
def doc_path() -> Path:
    """Return the ``_sample_docs`` directory that sits next to this file's parent directory."""
    this_file = Path(__file__).resolve()
    # Two levels up from the file itself: its directory, then that directory's parent.
    base_dir = this_file.parent.parent
    return base_dir / "_sample_docs"
32 changes: 30 additions & 2 deletions _test_unstructured_client/integration/test_decorators.py
Original file line number Diff line number Diff line change
@@ -15,13 +15,41 @@
from unstructured_client import UnstructuredClient
from unstructured_client.models import shared, operations
from unstructured_client.models.errors import HTTPValidationError
from unstructured_client.models.shared.partition_parameters import OutputFormat
from unstructured_client.utils.retries import BackoffStrategy, RetryConfig
from unstructured_client._hooks.custom import form_utils
from unstructured_client._hooks.custom import split_pdf_hook

FAKE_KEY = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"


@pytest.mark.parametrize("split_pdf_page", [True, False])
def test_integration_split_csv_response(split_pdf_page, client, doc_path):
    """Check that a ``text/csv`` partition request returns CSV text.

    Runs once with page splitting enabled and once with it disabled; both
    runs must produce the same kind of response.
    """
    filename = "layout-parser-paper.pdf"
    pdf_bytes = (doc_path / filename).read_bytes()

    parameters = shared.PartitionParameters(
        files=shared.Files(content=pdf_bytes, file_name=filename),
        output_format=OutputFormat.TEXT_CSV,
        split_pdf_page=split_pdf_page,
    )
    request = operations.PartitionRequest(partition_parameters=parameters)

    resp = client.general.partition(request=request)

    expected_header = "type,element_id,text,filetype,languages,page_number,filename,parent_id"
    assert resp.status_code == 200
    assert resp.content_type == "text/csv; charset=utf-8"
    # The CSV payload is exposed through csv_elements; the JSON element list stays unset.
    assert resp.elements is None
    assert resp.csv_elements is not None
    assert resp.csv_elements.startswith(expected_header)


@pytest.mark.parametrize("concurrency_level", [1, 2, 5])
@pytest.mark.parametrize(
("filename", "expected_ok", "strategy"),
@@ -40,10 +68,10 @@ def test_integration_split_pdf_has_same_output_as_non_split(
concurrency_level: int, filename: str, expected_ok: bool, strategy: str
):
"""
Tests that output that we get from the split-by-page pdf is the same as from non-split.
Test that the output we get from the split-by-page pdf is the same as from non-split.

Requires unstructured-api running in bg. See Makefile for how to run it.
Doesn't check for raw_response as there's no clear patter for how it changes with the number of pages / concurrency_level.
Doesn't check for raw_response as there's no clear pattern for how it changes with the number of pages / concurrency_level.
"""
try:
response = requests.get("http://localhost:8000/general/docs")
Original file line number Diff line number Diff line change
@@ -3,27 +3,16 @@
import asyncio
import json
import os
from pathlib import Path

import pytest
from deepdiff import DeepDiff

from unstructured_client import UnstructuredClient
from unstructured_client.models import shared, operations
from unstructured_client.models.errors import SDKError, ServerError, HTTPValidationError
from unstructured_client.utils.retries import BackoffStrategy, RetryConfig


@pytest.fixture(scope="module")
def client() -> UnstructuredClient:
_client = UnstructuredClient(api_key_auth=os.getenv("UNSTRUCTURED_API_KEY"), server='free-api')
yield _client


@pytest.fixture(scope="module")
def doc_path() -> Path:
return Path(__file__).resolve().parents[2] / "_sample_docs"


@pytest.mark.parametrize("split_pdf", [True, False])
@pytest.mark.parametrize("strategy", ["fast", "ocr_only", "hi_res"])
def test_partition_strategies(split_pdf, strategy, client, doc_path):
32 changes: 29 additions & 3 deletions _test_unstructured_client/unit/test_request_utils.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
# Unit tests for the request_utils.py module
from __future__ import annotations

import httpx
import json
import pytest

from unstructured_client._hooks.custom.request_utils import create_pdf_chunk_request_params, get_multipart_stream_fields
from unstructured_client.models import shared
from unstructured_client._hooks.custom.request_utils import create_pdf_chunk_request_params, create_response, get_multipart_stream_fields


# make the above test using @pytest.mark.parametrize
@@ -30,6 +31,7 @@ def test_get_multipart_stream_fields(input_request, expected):
fields = get_multipart_stream_fields(input_request)
assert fields == expected


def test_multipart_stream_fields_raises_value_error_when_filename_is_not_set():
with pytest.raises(ValueError):
get_multipart_stream_fields(httpx.Request(
@@ -40,6 +42,7 @@ def test_multipart_stream_fields_raises_value_error_when_filename_is_not_set():
headers={"Content-Type": "multipart/form-data; boundary=----WebKitFormBoundary7MA4YWxkTrZu0gW"}),
)


@pytest.mark.parametrize(("input_form_data", "page_number", "expected_form_data"), [
(
{"hello": "world"},
@@ -70,3 +73,26 @@ def test_multipart_stream_fields_raises_value_error_when_filename_is_not_set():
def test_create_pdf_chunk_request_params(input_form_data, page_number, expected_form_data):
form_data = create_pdf_chunk_request_params(input_form_data, page_number)
assert form_data == expected_form_data


def test_create_response_for_json():
    """A list of element dicts is wrapped in a 200 response with a JSON content type."""
    title = {"type": "Title", "text": "Hello, World!"}
    narrative = {"type": "NarrativeText", "text": "Goodbye!"}
    elements = [title, narrative]

    response = create_response(elements)

    assert response.status_code == 200
    assert response.headers["Content-Type"] == "application/json"
    # The body must round-trip back to the same element list.
    assert response.json() == elements


def test_create_response_for_csv():
    """A list holding CSV bytes is wrapped in a 200 ``text/csv`` response whose body is not JSON."""
    csv_payload = (
        b'type,element_id,text,languages,page_number,filename,filetype,parent_id'
        b'\nTitle,f73329878fbbb0bb131a83e7b6daacbe,Module One - Introduction to Product'
        b' Development and Quality Assurance,[\'eng\'],1,list-item-example-1.pdf,application/pdf,'
    )

    response = create_response([csv_payload])

    assert response.status_code == 200
    assert response.headers["Content-Type"] == "text/csv; charset=utf-8"
    # CSV content must not be parseable as JSON.
    with pytest.raises(json.decoder.JSONDecodeError):
        response.json()
6 changes: 1 addition & 5 deletions _test_unstructured_client/unit/test_split_pdf_hook.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,13 @@
from __future__ import annotations

import asyncio
import io
import logging
from asyncio import Task
from collections import Counter
from functools import partial
from typing import Coroutine

import httpx
import pytest
import requests
from requests_toolbelt import MultipartDecoder, MultipartEncoder
from requests_toolbelt import MultipartDecoder

from unstructured_client._hooks.custom import form_utils, pdf_utils, request_utils
from unstructured_client._hooks.custom.form_utils import (
1 change: 1 addition & 0 deletions docs/models/operations/partitionresponse.md
Original file line number Diff line number Diff line change
@@ -8,4 +8,5 @@
| `content_type` | *str* | :heavy_check_mark: | HTTP response content type for this operation |
| `status_code` | *int* | :heavy_check_mark: | HTTP response status code for this operation |
| `raw_response` | [httpx.Response](https://www.python-httpx.org/api/#response) | :heavy_check_mark: | Raw HTTP response; suitable for custom response parsing |
| `csv_elements` | *Optional[str]* | :heavy_minus_sign: | Successful Response |
| `elements` | List[Dict[str, *Any*]] | :heavy_minus_sign: | Successful Response |
Copy link
Contributor Author

@Coniferish Coniferish Oct 25, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Q: elements isn't displayed as optional, but response.elements is None when csv_elements is returned.

3 changes: 2 additions & 1 deletion docs/models/shared/strategy.md
Original file line number Diff line number Diff line change
@@ -10,4 +10,5 @@ The strategy to use for partitioning PDF/image. Options are fast, hi_res, auto.
| `FAST` | fast |
| `HI_RES` | hi_res |
| `AUTO` | auto |
| `OCR_ONLY` | ocr_only |
| `OCR_ONLY` | ocr_only |
| `OD_ONLY` | od_only |
13 changes: 9 additions & 4 deletions src/unstructured_client/_hooks/custom/request_utils.py
Original file line number Diff line number Diff line change
@@ -4,7 +4,7 @@
import io
import json
import logging
from typing import Tuple, Any, BinaryIO
from typing import Any, BinaryIO, Tuple

import httpx
from httpx._multipart import DataField, FileField
@@ -207,7 +207,8 @@ def prepare_request_headers(
new_headers.pop("Content-Length", None)
return new_headers

def create_response(elements: list) -> httpx.Response:

def create_response(elements: list[dict[str, Any] | bytes]) -> httpx.Response:
"""
Creates a modified response object with updated content.

@@ -218,8 +219,12 @@ def create_response(elements: list) -> httpx.Response:
Returns:
The modified response object with updated content.
"""
response = httpx.Response(status_code=200, headers={"Content-Type": "application/json"})
content = json.dumps(elements).encode()
if isinstance(elements, list) and all(isinstance(element, bytes) for element in elements):
response = httpx.Response(status_code=200, headers={"Content-Type": "text/csv; charset=utf-8"})
content = b''.join(elements) # type: ignore
else:
response = httpx.Response(status_code=200, headers={"Content-Type": "application/json"})
content = json.dumps(elements).encode()
content_length = str(len(content))
response.headers.update({"Content-Length": content_length})
setattr(response, "_content", content)
19 changes: 13 additions & 6 deletions src/unstructured_client/_hooks/custom/split_pdf_hook.py
Original file line number Diff line number Diff line change
@@ -576,10 +576,13 @@ def _await_elements(
response_number,
)
successful_responses.append(res)
if self.cache_tmp_data_feature:
elements.append(load_elements_from_response(res))
else:
elements.append(res.json())
if res.headers["Content-Type"] == "application/json":
if self.cache_tmp_data_feature:
elements.append(load_elements_from_response(res))
else:
elements.append(res.json())
else: # -- Response contains csv data
elements.append(res.content) # type: ignore
else:
error_message = f"Failed to partition set {response_number}."

@@ -591,7 +594,12 @@ def _await_elements(

self.api_successful_responses[operation_id] = successful_responses
self.api_failed_responses[operation_id] = failed_responses
flattened_elements = [element for sublist in elements for element in sublist]
flattened_elements = []
for sublist in elements:
if isinstance(sublist, list):
flattened_elements.extend(sublist)
else:
flattened_elements.append(sublist)
return flattened_elements

def after_success(
@@ -613,7 +621,6 @@ def after_success(
"""
# Grab the correct id out of the dummy request
operation_id = response.request.headers.get("operation_id")

elements = self._await_elements(operation_id)

# if fails are disallowed, return the first failed response
Loading
Oops, something went wrong.
Loading
Oops, something went wrong.