From 7f7a907561c09bf7472058b919e2238e8d96f90c Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 21 Nov 2025 00:26:22 +0000
Subject: [PATCH 1/5] Initial plan


From ea906720937cf2411ab20139fcb5f8ead04b12d9 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 21 Nov 2025 00:31:45 +0000
Subject: [PATCH 2/5] Implement nextLink pagination in get_all_analyzers method

Co-authored-by: yungshinlintw <14239352+yungshinlintw@users.noreply.github.com>
---
 python/content_understanding_client.py | 26 +++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/python/content_understanding_client.py b/python/content_understanding_client.py
index 5417d6c..e0c25ea 100644
--- a/python/content_understanding_client.py
+++ b/python/content_understanding_client.py
@@ -284,21 +284,33 @@ def get_all_analyzers(self) -> Dict[str, Any]:
         Retrieves a list of all available analyzers from the content understanding service.
 
         This method sends a GET request to the service endpoint to fetch the list of analyzers.
+        It automatically follows pagination links (nextLink) to retrieve all pages of results.
         It raises an HTTPError if the request fails.
 
         Returns:
             dict: A dictionary containing the JSON response from the service, which includes
-                the list of available analyzers.
+                the complete list of available analyzers across all pages in the "value" key.
 
         Raises:
            requests.exceptions.HTTPError: If the HTTP request returned an unsuccessful status code.
         """
-        response = requests.get(
-            url=self._get_analyzer_list_url(self._endpoint, self._api_version),
-            headers=self._headers,
-        )
-        self._raise_for_status_with_detail(response)
-        return response.json()
+        all_analyzers = []
+        url = self._get_analyzer_list_url(self._endpoint, self._api_version)
+
+        while url:
+            response = requests.get(url=url, headers=self._headers)
+            self._raise_for_status_with_detail(response)
+            response_json = response.json()
+
+            # Collect analyzers from current page
+            analyzers = response_json.get("value", [])
+            all_analyzers.extend(analyzers)
+
+            # Get the next page URL, if it exists
+            url = response_json.get("nextLink")
+
+        # Return in the same format as the original response
+        return {"value": all_analyzers}
 
     def get_defaults(self) -> Dict[str, Any]:
         """

From ee5946d0ecfa38182875d35d18cc5e5c85b85939 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 21 Nov 2025 00:34:49 +0000
Subject: [PATCH 3/5] Add safeguards for pagination: circular link detection
 and value type validation

Co-authored-by: yungshinlintw <14239352+yungshinlintw@users.noreply.github.com>
---
 python/content_understanding_client.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/python/content_understanding_client.py b/python/content_understanding_client.py
index e0c25ea..751688f 100644
--- a/python/content_understanding_client.py
+++ b/python/content_understanding_client.py
@@ -293,17 +293,32 @@ def get_all_analyzers(self) -> Dict[str, Any]:
 
         Raises:
            requests.exceptions.HTTPError: If the HTTP request returned an unsuccessful status code.
+           RuntimeError: If too many pages are encountered (likely indicating a pagination loop).
         """
         all_analyzers = []
         url = self._get_analyzer_list_url(self._endpoint, self._api_version)
+        visited_urls = set()
+        max_pages = 1000  # Safeguard against infinite loops
+        page_count = 0
 
         while url:
+            # Prevent infinite loops from circular pagination links
+            if url in visited_urls:
+                raise RuntimeError(f"Circular pagination detected: {url} was already visited")
+            if page_count >= max_pages:
+                raise RuntimeError(f"Too many pages ({max_pages}) encountered during pagination")
+
+            visited_urls.add(url)
+            page_count += 1
+
             response = requests.get(url=url, headers=self._headers)
             self._raise_for_status_with_detail(response)
             response_json = response.json()
 
             # Collect analyzers from current page
             analyzers = response_json.get("value", [])
+            if not isinstance(analyzers, list):
+                raise ValueError(f"Expected 'value' to be a list, got {type(analyzers).__name__}")
             all_analyzers.extend(analyzers)
 
             # Get the next page URL, if it exists
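
Note: patches 2 and 3 together implement the standard nextLink paging pattern: accumulate each page's "value" items, follow "nextLink" until it is absent, and guard against malformed or circular responses. A minimal standalone sketch of that pattern, assuming only a fetch_page(url) -> dict callable (a hypothetical stand-in for requests.get plus headers and error handling):

    from typing import Any, Callable, Dict, List

    def collect_paginated_values(
        fetch_page: Callable[[str], Dict[str, Any]],
        first_url: str,
        max_pages: int = 1000,
    ) -> List[Any]:
        """Accumulate 'value' items across pages, following 'nextLink'."""
        items: List[Any] = []
        visited: set = set()
        url = first_url
        pages = 0
        while url:
            if url in visited:  # circular nextLink chain
                raise RuntimeError(f"Circular pagination detected: {url}")
            visited.add(url)
            pages += 1
            if pages > max_pages:  # runaway pagination
                raise RuntimeError(f"Maximum pagination limit ({max_pages} pages) exceeded")
            page = fetch_page(url)
            value = page.get("value", [])
            if not isinstance(value, list):
                raise ValueError(f"Expected 'value' to be a list, got {type(value).__name__}")
            items.extend(value)
            url = page.get("nextLink")
        return items
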
""" all_analyzers = [] url = self._get_analyzer_list_url(self._endpoint, self._api_version) + visited_urls = set() + max_pages = 1000 # Safeguard against infinite loops + page_count = 0 while url: + # Prevent infinite loops from circular pagination links + if url in visited_urls: + raise RuntimeError(f"Circular pagination detected: {url} was already visited") + if page_count >= max_pages: + raise RuntimeError(f"Too many pages ({max_pages}) encountered during pagination") + + visited_urls.add(url) + page_count += 1 + response = requests.get(url=url, headers=self._headers) self._raise_for_status_with_detail(response) response_json = response.json() # Collect analyzers from current page analyzers = response_json.get("value", []) + if not isinstance(analyzers, list): + raise ValueError(f"Expected 'value' to be a list, got {type(analyzers).__name__}") all_analyzers.extend(analyzers) # Get the next page URL, if it exists From 45b8c0e1e68d7d2d0c9469485e4a428eadd32a18 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 21 Nov 2025 00:37:41 +0000 Subject: [PATCH 4/5] Refactor pagination safeguards: add class constant and improve error messages Co-authored-by: yungshinlintw <14239352+yungshinlintw@users.noreply.github.com> --- python/content_understanding_client.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/python/content_understanding_client.py b/python/content_understanding_client.py index 751688f..1da4c9f 100644 --- a/python/content_understanding_client.py +++ b/python/content_understanding_client.py @@ -69,6 +69,9 @@ class AzureContentUnderstandingClient: ".heif", ] # Pro mode and Training for Standard mode only support document data + # Maximum number of pages to retrieve when following pagination links + MAX_PAGINATION_PAGES: int = 1000 + def __init__( self, endpoint: str, @@ -294,19 +297,22 @@ def get_all_analyzers(self) -> Dict[str, Any]: Raises: requests.exceptions.HTTPError: If the HTTP request returned an unsuccessful status code. RuntimeError: If too many pages are encountered (likely indicating a pagination loop). + ValueError: If the API response contains an invalid 'value' field (not a list). """ all_analyzers = [] url = self._get_analyzer_list_url(self._endpoint, self._api_version) visited_urls = set() - max_pages = 1000 # Safeguard against infinite loops page_count = 0 while url: # Prevent infinite loops from circular pagination links if url in visited_urls: raise RuntimeError(f"Circular pagination detected: {url} was already visited") - if page_count >= max_pages: - raise RuntimeError(f"Too many pages ({max_pages}) encountered during pagination") + if page_count >= self.MAX_PAGINATION_PAGES: + raise RuntimeError( + f"Too many pages ({self.MAX_PAGINATION_PAGES}) encountered during pagination. " + f"This likely indicates a pagination loop or misconfiguration." + ) visited_urls.add(url) page_count += 1 @@ -318,7 +324,10 @@ def get_all_analyzers(self) -> Dict[str, Any]: # Collect analyzers from current page analyzers = response_json.get("value", []) if not isinstance(analyzers, list): - raise ValueError(f"Expected 'value' to be a list, got {type(analyzers).__name__}") + raise ValueError( + f"Expected 'value' to be a list, got {type(analyzers).__name__}. 
" + f"Response structure: {json.dumps(response_json, indent=2)[:500]}" + ) all_analyzers.extend(analyzers) # Get the next page URL, if it exists From c29f0d5876b42a6817dcb90edcd49d485fc0bfb5 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 21 Nov 2025 00:40:42 +0000 Subject: [PATCH 5/5] Fix pagination limit enforcement and sanitize error messages Co-authored-by: yungshinlintw <14239352+yungshinlintw@users.noreply.github.com> --- python/content_understanding_client.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/python/content_understanding_client.py b/python/content_understanding_client.py index 1da4c9f..7b7cc68 100644 --- a/python/content_understanding_client.py +++ b/python/content_understanding_client.py @@ -308,15 +308,17 @@ def get_all_analyzers(self) -> Dict[str, Any]: # Prevent infinite loops from circular pagination links if url in visited_urls: raise RuntimeError(f"Circular pagination detected: {url} was already visited") - if page_count >= self.MAX_PAGINATION_PAGES: - raise RuntimeError( - f"Too many pages ({self.MAX_PAGINATION_PAGES}) encountered during pagination. " - f"This likely indicates a pagination loop or misconfiguration." - ) visited_urls.add(url) page_count += 1 + # Check page count after incrementing to properly enforce limit + if page_count > self.MAX_PAGINATION_PAGES: + raise RuntimeError( + f"Maximum pagination limit ({self.MAX_PAGINATION_PAGES} pages) exceeded. " + f"This likely indicates a pagination loop or misconfiguration." + ) + response = requests.get(url=url, headers=self._headers) self._raise_for_status_with_detail(response) response_json = response.json() @@ -324,9 +326,11 @@ def get_all_analyzers(self) -> Dict[str, Any]: # Collect analyzers from current page analyzers = response_json.get("value", []) if not isinstance(analyzers, list): + # Include structure info without potentially sensitive response content + structure_keys = list(response_json.keys()) if isinstance(response_json, dict) else [] raise ValueError( f"Expected 'value' to be a list, got {type(analyzers).__name__}. " - f"Response structure: {json.dumps(response_json, indent=2)[:500]}" + f"Response contains keys: {structure_keys}" ) all_analyzers.extend(analyzers)