From 7f7a907561c09bf7472058b919e2238e8d96f90c Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 21 Nov 2025 00:26:22 +0000
Subject: [PATCH 1/5] Initial plan


From ea906720937cf2411ab20139fcb5f8ead04b12d9 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 21 Nov 2025 00:31:45 +0000
Subject: [PATCH 2/5] Implement nextLink pagination in get_all_analyzers method

Co-authored-by: yungshinlintw <14239352+yungshinlintw@users.noreply.github.com>
---
 python/content_understanding_client.py | 26 +++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/python/content_understanding_client.py b/python/content_understanding_client.py
index 5417d6c..e0c25ea 100644
--- a/python/content_understanding_client.py
+++ b/python/content_understanding_client.py
@@ -284,21 +284,33 @@ def get_all_analyzers(self) -> Dict[str, Any]:
         Retrieves a list of all available analyzers from the content understanding service.
 
         This method sends a GET request to the service endpoint to fetch the list of analyzers.
+        It automatically follows pagination links (nextLink) to retrieve all pages of results.
         It raises an HTTPError if the request fails.
 
         Returns:
             dict: A dictionary containing the JSON response from the service, which includes
-                the list of available analyzers.
+                the complete list of available analyzers across all pages in the "value" key.
 
         Raises:
            requests.exceptions.HTTPError: If the HTTP request returned an unsuccessful status code.
         """
-        response = requests.get(
-            url=self._get_analyzer_list_url(self._endpoint, self._api_version),
-            headers=self._headers,
-        )
-        self._raise_for_status_with_detail(response)
-        return response.json()
+        all_analyzers = []
+        url = self._get_analyzer_list_url(self._endpoint, self._api_version)
+
+        while url:
+            response = requests.get(url=url, headers=self._headers)
+            self._raise_for_status_with_detail(response)
+            response_json = response.json()
+
+            # Collect analyzers from current page
+            analyzers = response_json.get("value", [])
+            all_analyzers.extend(analyzers)
+
+            # Get the next page URL, if it exists
+            url = response_json.get("nextLink")
+
+        # Return in the same format as the original response
+        return {"value": all_analyzers}
 
     def get_defaults(self) -> Dict[str, Any]:
         """

From ee5946d0ecfa38182875d35d18cc5e5c85b85939 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 21 Nov 2025 00:34:49 +0000
Subject: [PATCH 3/5] Add safeguards for pagination: circular link detection
 and value type validation

Co-authored-by: yungshinlintw <14239352+yungshinlintw@users.noreply.github.com>
---
 python/content_understanding_client.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/python/content_understanding_client.py b/python/content_understanding_client.py
index e0c25ea..751688f 100644
--- a/python/content_understanding_client.py
+++ b/python/content_understanding_client.py
@@ -293,17 +293,32 @@ def get_all_analyzers(self) -> Dict[str, Any]:
 
         Raises:
            requests.exceptions.HTTPError: If the HTTP request returned an unsuccessful status code.
+           RuntimeError: If too many pages are encountered (likely indicating a pagination loop).
         """
         all_analyzers = []
         url = self._get_analyzer_list_url(self._endpoint, self._api_version)
+        visited_urls = set()
+        max_pages = 1000  # Safeguard against infinite loops
+        page_count = 0
 
         while url:
+            # Prevent infinite loops from circular pagination links
+            if url in visited_urls:
+                raise RuntimeError(f"Circular pagination detected: {url} was already visited")
+            if page_count >= max_pages:
+                raise RuntimeError(f"Too many pages ({max_pages}) encountered during pagination")
+
+            visited_urls.add(url)
+            page_count += 1
+
             response = requests.get(url=url, headers=self._headers)
             self._raise_for_status_with_detail(response)
             response_json = response.json()
 
             # Collect analyzers from current page
             analyzers = response_json.get("value", [])
+            if not isinstance(analyzers, list):
+                raise ValueError(f"Expected 'value' to be a list, got {type(analyzers).__name__}")
             all_analyzers.extend(analyzers)
 
             # Get the next page URL, if it exists
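
Note: patches 2 and 3 together implement the standard nextLink paging pattern: accumulate each page's "value" items, follow "nextLink" until it is absent, and guard against malformed or circular responses. A minimal standalone sketch of that pattern, assuming only a fetch_page(url) -> dict callable (a hypothetical stand-in for requests.get plus headers and error handling):

    from typing import Any, Callable, Dict, List

    def collect_paginated_values(
        fetch_page: Callable[[str], Dict[str, Any]],
        first_url: str,
        max_pages: int = 1000,
    ) -> List[Any]:
        """Accumulate 'value' items across pages, following 'nextLink'."""
        items: List[Any] = []
        visited: set = set()
        url = first_url
        pages = 0
        while url:
            if url in visited:  # circular nextLink chain
                raise RuntimeError(f"Circular pagination detected: {url}")
            visited.add(url)
            pages += 1
            if pages > max_pages:  # runaway pagination
                raise RuntimeError(f"Maximum pagination limit ({max_pages} pages) exceeded")
            page = fetch_page(url)
            value = page.get("value", [])
            if not isinstance(value, list):
                raise ValueError(f"Expected 'value' to be a list, got {type(value).__name__}")
            items.extend(value)
            url = page.get("nextLink")
        return items
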
""" all_analyzers = [] url = self._get_analyzer_list_url(self._endpoint, self._api_version) + visited_urls = set() + max_pages = 1000 # Safeguard against infinite loops + page_count = 0 while url: + # Prevent infinite loops from circular pagination links + if url in visited_urls: + raise RuntimeError(f"Circular pagination detected: {url} was already visited") + if page_count >= max_pages: + raise RuntimeError(f"Too many pages ({max_pages}) encountered during pagination") + + visited_urls.add(url) + page_count += 1 + response = requests.get(url=url, headers=self._headers) self._raise_for_status_with_detail(response) response_json = response.json() # Collect analyzers from current page analyzers = response_json.get("value", []) + if not isinstance(analyzers, list): + raise ValueError(f"Expected 'value' to be a list, got {type(analyzers).__name__}") all_analyzers.extend(analyzers) # Get the next page URL, if it exists From 45b8c0e1e68d7d2d0c9469485e4a428eadd32a18 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 21 Nov 2025 00:37:41 +0000 Subject: [PATCH 4/5] Refactor pagination safeguards: add class constant and improve error messages Co-authored-by: yungshinlintw <14239352+yungshinlintw@users.noreply.github.com> --- python/content_understanding_client.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/python/content_understanding_client.py b/python/content_understanding_client.py index 751688f..1da4c9f 100644 --- a/python/content_understanding_client.py +++ b/python/content_understanding_client.py @@ -69,6 +69,9 @@ class AzureContentUnderstandingClient: ".heif", ] # Pro mode and Training for Standard mode only support document data + # Maximum number of pages to retrieve when following pagination links + MAX_PAGINATION_PAGES: int = 1000 + def __init__( self, endpoint: str, @@ -294,19 +297,22 @@ def get_all_analyzers(self) -> Dict[str, Any]: Raises: requests.exceptions.HTTPError: If the HTTP request returned an unsuccessful status code. RuntimeError: If too many pages are encountered (likely indicating a pagination loop). + ValueError: If the API response contains an invalid 'value' field (not a list). """ all_analyzers = [] url = self._get_analyzer_list_url(self._endpoint, self._api_version) visited_urls = set() - max_pages = 1000 # Safeguard against infinite loops page_count = 0 while url: # Prevent infinite loops from circular pagination links if url in visited_urls: raise RuntimeError(f"Circular pagination detected: {url} was already visited") - if page_count >= max_pages: - raise RuntimeError(f"Too many pages ({max_pages}) encountered during pagination") + if page_count >= self.MAX_PAGINATION_PAGES: + raise RuntimeError( + f"Too many pages ({self.MAX_PAGINATION_PAGES}) encountered during pagination. " + f"This likely indicates a pagination loop or misconfiguration." + ) visited_urls.add(url) page_count += 1 @@ -318,7 +324,10 @@ def get_all_analyzers(self) -> Dict[str, Any]: # Collect analyzers from current page analyzers = response_json.get("value", []) if not isinstance(analyzers, list): - raise ValueError(f"Expected 'value' to be a list, got {type(analyzers).__name__}") + raise ValueError( + f"Expected 'value' to be a list, got {type(analyzers).__name__}. 
" + f"Response structure: {json.dumps(response_json, indent=2)[:500]}" + ) all_analyzers.extend(analyzers) # Get the next page URL, if it exists From c29f0d5876b42a6817dcb90edcd49d485fc0bfb5 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 21 Nov 2025 00:40:42 +0000 Subject: [PATCH 5/5] Fix pagination limit enforcement and sanitize error messages Co-authored-by: yungshinlintw <14239352+yungshinlintw@users.noreply.github.com> --- python/content_understanding_client.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/python/content_understanding_client.py b/python/content_understanding_client.py index 1da4c9f..7b7cc68 100644 --- a/python/content_understanding_client.py +++ b/python/content_understanding_client.py @@ -308,15 +308,17 @@ def get_all_analyzers(self) -> Dict[str, Any]: # Prevent infinite loops from circular pagination links if url in visited_urls: raise RuntimeError(f"Circular pagination detected: {url} was already visited") - if page_count >= self.MAX_PAGINATION_PAGES: - raise RuntimeError( - f"Too many pages ({self.MAX_PAGINATION_PAGES}) encountered during pagination. " - f"This likely indicates a pagination loop or misconfiguration." - ) visited_urls.add(url) page_count += 1 + # Check page count after incrementing to properly enforce limit + if page_count > self.MAX_PAGINATION_PAGES: + raise RuntimeError( + f"Maximum pagination limit ({self.MAX_PAGINATION_PAGES} pages) exceeded. " + f"This likely indicates a pagination loop or misconfiguration." + ) + response = requests.get(url=url, headers=self._headers) self._raise_for_status_with_detail(response) response_json = response.json() @@ -324,9 +326,11 @@ def get_all_analyzers(self) -> Dict[str, Any]: # Collect analyzers from current page analyzers = response_json.get("value", []) if not isinstance(analyzers, list): + # Include structure info without potentially sensitive response content + structure_keys = list(response_json.keys()) if isinstance(response_json, dict) else [] raise ValueError( f"Expected 'value' to be a list, got {type(analyzers).__name__}. " - f"Response structure: {json.dumps(response_json, indent=2)[:500]}" + f"Response contains keys: {structure_keys}" ) all_analyzers.extend(analyzers)