# Configuring GX Client

In [2]:
import os
from dotenv import load_dotenv
from groundx import GroundX

# Read the variables defined in the .env file into the process environment
load_dotenv()

# The API key is mandatory: stop right away when it is unset or empty
api_key = os.environ.get("GROUNDX_API_KEY")
if not api_key:
    raise ValueError("GROUNDX_API_KEY not found in .env file")

# Build the GroundX client that the rest of the notebook uses
client = GroundX(api_key=api_key)

# Looking Up Documents from a Particular Bucket

In [3]:
# Look up documents for id 20902 (the bucket inspected in this section).
# n=100 presumably caps the number of documents per response — confirm against the SDK.
# As the last expression of the cell, the response object is rendered as the cell output.
client.documents.lookup(
    id=20902,
    n=100
)

DocumentLookupResponse(count=100, documents=[DocumentDetail(bucket_id=20902, document_id='1e933a22-cc4d-490a-adb3-57a5e77ccf24', file_name='TipantaxiEBT.2023.10.30.pltf.pdf', file_size='1.3 MB', file_type='pdf', filter=None, process_id='ba2800ea-f587-48c1-acfb-f703c02e449a', search_data=None, source_url='https://claims-doc-uploader.s3.amazonaws.com/qa-groundx/20902/TipantaxiEBT.2023.10.30.pltf.pdf?X-Amz-Security-Token=IQoJb3JpZ2luX2VjEOv%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaCXVzLWVhc3QtMSJIMEYCIQDWe%2FkNo1%2FxiL8kWX1ZOPydVeZ1iPnZpWgpU7qk7jL0nwIhAPUW0XgTyk5YhWR4LjED9vKYajWpWeo6PZdLP4cynFLfKrQFCFQQAhoMOTAzNzEzMDQ2MjYxIgxpGYAiBcgo9JNzBZAqkQWyZSldYHBNQK5NCRu%2BcRvVO5D9p9GsrCak3nitl3rJWLIwQUdYLczwTL6PM2HKc7J7o0e%2FqDjVSVIcUsbsTHNWq9dCim%2FUtkcgchTJuT21EEt6RnD2c2XPqA%2BTiq0RVQ4JvvsrRTlNpXaocKV5%2BfEreyeoxG0a0lw1XPIRRtIBG0lBD1N1DvpLhItMfl8LMVeKAomngs5T7RsX6MCeMemPs8ROsyHqFJ%2FKuoIqHFc2v9TdeGZjh%2FadPLgQ9e8imlj8kXtRLTFc%2BqF6EMaQlH0%2Be5qmpuURD2KOuOkoK7JrYK0RQUuevraVdxijVhhfbwNtkfh9tcb3AMDbQ%2FBLBR

In [None]:
def lookup_all_documents(doc_id: int, page_size: int = 100):
    """Collect every document returned by ``client.documents.lookup``, following pagination.

    Args:
        doc_id: Value passed as ``id`` to ``client.documents.lookup``. Despite
            the parameter name, this notebook passes a bucket ID here.
        page_size: Value passed as ``n`` on every request.

    Returns:
        list: The ``documents`` entries of all pages, concatenated in the order
        the pages were received.
    """
    all_documents = []
    next_token = None

    while True:
        params = {"id": doc_id, "n": page_size}
        # Only send next_token once a previous page has supplied one
        if next_token:
            params["next_token"] = next_token

        resp = client.documents.lookup(**params)
        # The raw response is deliberately not printed: each document carries a
        # pre-signed source URL, and dumping every page floods the cell output.

        # A page may come back without a document list; treat that as empty
        all_documents.extend(resp.documents or [])

        # A missing/empty token means this was the last page
        next_token = resp.next_token
        if not next_token:
            break

    return all_documents

# Example usage: page through the lookup results for id 20902, requesting 20 per call.
# `all_docs` is reused by the xray download cell further down.
all_docs = lookup_all_documents(doc_id=20902, page_size=20)
print(f"Retrieved {len(all_docs)} documents total")

TypeError: DocumentsClient.lookup() got an unexpected keyword argument 'count'

In [14]:
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

def _doc_field(doc, attr_name, key_name):
    """Read a field from a document given as an object (attribute) or a dict-like (key)."""
    value = getattr(doc, attr_name, None)
    if not value:
        # Only fall back to a key lookup when the document actually offers .get();
        # plain objects do not, and calling it blindly raises AttributeError.
        getter = getattr(doc, "get", None)
        if callable(getter):
            value = getter(key_name)
    return value or None


def fetch_xray_single(doc, timeout=30):
    """Download a single xray JSON safely.

    Args:
        doc: Document as an object exposing ``file_name`` / ``xray_url``
            attributes, or a dict-like with ``fileName`` / ``xrayUrl`` keys.
        timeout: Timeout passed to ``requests.get``.

    Returns:
        dict: ``{"file_name": ..., "xray": <parsed JSON>}`` on success, or
        ``{"file_name": ..., "error": <message>}`` when the document has no
        xray URL or the download/parsing fails. This function does not raise
        for per-document failures.
    """
    file_name = _doc_field(doc, "file_name", "fileName") or "unknown"
    xray_url = _doc_field(doc, "xray_url", "xrayUrl")

    if not xray_url:
        return {"file_name": file_name, "error": "no xray_url"}

    try:
        resp = requests.get(xray_url, timeout=timeout)
        resp.raise_for_status()
        return {"file_name": file_name, "xray": resp.json()}
    except Exception as e:
        # Deliberately broad: one bad document must not abort a bulk download,
        # so every failure is reported as an error record instead.
        return {"file_name": file_name, "error": str(e)}

def fetch_all_xrays(all_docs, max_workers=10):
    """
    Multithreaded download of xrays with progress bar.

    Args:
        all_docs: iterable of dicts/objects with file_name/xray_url
        max_workers: number of threads

    Returns:
        List[dict]: one entry per input document, in completion order (not
        input order). Each item is {"file_name": ..., "xray": ...} on success
        or {"file_name": ..., "error": ...} on failure.
    """
    results = []

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Keep the future -> document mapping so a failure can be attributed
        futures = {executor.submit(fetch_xray_single, doc): doc for doc in all_docs}
        for future in tqdm(as_completed(futures), total=len(futures), desc="Downloading xrays"):
            try:
                results.append(future.result())
            except Exception as e:
                # Last-resort handler for anything fetch_xray_single let through.
                # The name lookup here must not raise itself (plain objects have
                # no .get()), otherwise one bad document aborts the whole loop.
                doc = futures[future]
                file_name = getattr(doc, "file_name", None)
                if not file_name:
                    getter = getattr(doc, "get", None)
                    if callable(getter):
                        file_name = getter("fileName")
                results.append({"file_name": file_name or "unknown", "error": f"unhandled: {e}"})
    return results

# Example usage: download the xray of every document collected in `all_docs`
xray_data = fetch_all_xrays(all_docs, max_workers=10)
# NOTE(review): len(xray_data) counts every result record, including the failed
# ones reported below, so "xrays fetched" here really means "documents processed".
print(f"\n✅ Completed: {len(xray_data)} xrays fetched")
# Records carrying an "error" key are the downloads that did not succeed
errors = [x for x in xray_data if "error" in x]
if errors:
    print(f"⚠️ {len(errors)} errors encountered")


Downloading xrays: 100%|██████████| 526/526 [00:16<00:00, 31.60it/s]


✅ Completed: 526 xrays fetched
⚠️ 38 errors encountered





In [24]:
# Spot check: number of chunks in the xray of the fourth result record.
# Raises KeyError if that record is an error record (those have no 'xray' key).
len(xray_data[3]['xray']['chunks'])

4

In [67]:
import tiktoken
enc = tiktoken.get_encoding("cl100k_base")

# Chunk fields whose content is tokenized for the per-chunk statistics
text_fields = ['json', 'narrative', 'text']


def _max_page_number(chunks):
    """Return the highest bounding-box pageNumber found in chunks, or None if there are no boxes."""
    page_numbers = [
        bb['pageNumber']
        for chunk in chunks
        # boundingBoxes may be absent or None; both mean "no boxes for this chunk"
        for bb in (chunk.get('boundingBoxes') or [])
    ]
    return max(page_numbers) if page_numbers else None


def _chunk_token_stats(chunk, encoding, fields):
    """Count tokens for each listed field present in chunk and flag whether it has a multimodalUrl key."""
    stats = {}
    for field in fields:
        if field in chunk:
            # disallowed_special=() makes special-token strings (e.g. "<|endoftext|>")
            # that occur in document text encode as ordinary text instead of
            # raising ValueError, which is tiktoken's default behaviour.
            # NOTE: str() of a non-string value (e.g. parsed JSON) is its Python
            # repr, so such counts approximate the serialized size.
            stats[field] = len(encoding.encode(str(chunk[field]), disallowed_special=()))
    stats['is_multimodal'] = 'multimodalUrl' in chunk
    return stats


def compute_document_stats(xray_results, encoding, fields=text_fields):
    """Build per-document page and token statistics from downloaded xray records.

    Args:
        xray_results: iterable of result dicts; records without an 'xray' key
            (failed downloads) are skipped.
        encoding: tokenizer exposing ``encode(text, disallowed_special=...)``.
        fields: chunk keys whose content is tokenized.

    Returns:
        list[dict]: one ``{'num_pages': ..., 'chunk_stats': [...]}`` entry per
        document that has at least one bounding box. ``num_pages`` is inferred
        as the largest ``pageNumber`` seen (assumes 1-based page numbers —
        TODO confirm); documents without bounding boxes are omitted because
        their page count cannot be inferred.
    """
    stats = []
    for document in xray_results:
        if 'xray' not in document:
            continue

        chunks = (document['xray'] or {}).get('chunks') or []

        num_pages = _max_page_number(chunks)
        if num_pages is None:
            continue

        stats.append({
            'num_pages': num_pages,
            'chunk_stats': [_chunk_token_stats(chunk, encoding, fields) for chunk in chunks],
        })
    return stats


document_stats = compute_document_stats(xray_data, enc)

        

In [68]:
# Display the collected per-document statistics (the entire list is rendered)
document_stats

[{'num_pages': 2,
  'chunk_stats': [{'json': 1000, 'text': 1002, 'is_multimodal': False}]},
 {'num_pages': 2,
  'chunk_stats': [{'json': 639, 'text': 17, 'is_multimodal': False},
   {'json': 272, 'narrative': 76, 'text': 288, 'is_multimodal': True},
   {'json': 249, 'text': 216, 'is_multimodal': False}]},
 {'num_pages': 2,
  'chunk_stats': [{'json': 241, 'text': 24, 'is_multimodal': False},
   {'json': 272, 'narrative': 83, 'text': 174, 'is_multimodal': True},
   {'json': 145, 'text': 112, 'is_multimodal': False},
   {'json': 273, 'narrative': 85, 'text': 40, 'is_multimodal': True},
   {'json': 67, 'text': 40, 'is_multimodal': False},
   {'json': 777, 'narrative': 62, 'text': 354, 'is_multimodal': True},
   {'json': 687, 'narrative': 54, 'text': 47, 'is_multimodal': True},
   {'json': 67, 'text': 37, 'is_multimodal': False}]},
 {'num_pages': 3,
  'chunk_stats': [{'json': 479, 'text': 458, 'is_multimodal': False},
   {'json': 714, 'narrative': 83, 'text': 88, 'is_multimodal': True},
   