In [80]:
import sys
import os
import boto3

# Make the sibling ``../credentials`` directory (resolved against the current
# working directory) importable so the ``from credentials import ...`` cells work.
credentials_dir = os.path.abspath(os.path.join(os.getcwd(), '../credentials'))
# Guarded so that re-running this cell does not keep appending duplicate entries.
if credentials_dir not in sys.path:
    sys.path.append(credentials_dir)


In [81]:
from credentials import ORACLE_S3_ACCESS_KEY, ORACLE_S3_SECRET_KEY, ORACLE_S3_ENDPOINT, ORACLE_REGION, ORACLE_INGEST_BUCKET

In [82]:
# S3-compatible client pointed at the endpoint named by ORACLE_S3_ENDPOINT.
# The signing region is passed explicitly (ORACLE_REGION is already imported
# from the credentials module) so this cell does not depend on ambient AWS
# configuration such as AWS_DEFAULT_REGION or ~/.aws/config.
# NOTE(review): assumes ORACLE_REGION holds the region identifier expected by
# the endpoint -- confirm against the credentials module.
s3_client = boto3.client(
    "s3",
    region_name=ORACLE_REGION,
    aws_access_key_id=ORACLE_S3_ACCESS_KEY,
    aws_secret_access_key=ORACLE_S3_SECRET_KEY,
    endpoint_url=ORACLE_S3_ENDPOINT
)

In [83]:
# Request a listing of the ingest bucket and keep the raw response dict in `objects`.
# NOTE(review): this is a single list call without the paginator that
# iter_new_objects_by_etag uses further down; presumably fine for a quick look
# at a small bucket -- confirm if the bucket can grow.
objects = s3_client.list_objects_v2(Bucket=ORACLE_INGEST_BUCKET)

In [84]:
# Display the "Contents" entries of the listing response; falls back to an
# empty list when the response has no "Contents" key.
objects.get("Contents",[])

[{'Key': 'April2024.pdf',
  'LastModified': datetime.datetime(2025, 8, 22, 21, 53, 26, tzinfo=tzlocal()),
  'ETag': '"b84b5eb4948f442c0ad455041509946e"',
  'Size': 78863,
  'StorageClass': 'STANDARD'},
 {'Key': 'December2023.pdf',
  'LastModified': datetime.datetime(2025, 8, 22, 21, 53, 26, tzinfo=tzlocal()),
  'ETag': '"c88be1b3ea330c7b619376b6ed69e96e"',
  'Size': 79388,
  'StorageClass': 'STANDARD'},
 {'Key': 'Feb2024.pdf',
  'LastModified': datetime.datetime(2025, 8, 22, 21, 53, 26, tzinfo=tzlocal()),
  'ETag': '"c710b51fe86c1b3d4591be580108a369"',
  'Size': 79336,
  'StorageClass': 'STANDARD'},
 {'Key': 'Jan2024.pdf',
  'LastModified': datetime.datetime(2025, 8, 22, 21, 53, 26, tzinfo=tzlocal()),
  'ETag': '"2b0753a753799559fb4fdadd8702ce59"',
  'Size': 78906,
  'StorageClass': 'STANDARD'},
 {'Key': 'March2024.pdf',
  'LastModified': datetime.datetime(2025, 8, 22, 21, 53, 26, tzinfo=tzlocal()),
  'ETag': '"b52fdc30107cc39ef339c7b80782a4cc"',
  'Size': 79921,
  'StorageClass': 'STA

In [85]:
from credentials import MILVUS_HOST, MILVUS_PORT, COLLECTION_NAME

In [97]:
from pymilvus import MilvusClient, Collection

# Authentication not enabled
# The URI is built with an f-string so it is formatted correctly whether
# MILVUS_PORT is stored as a string or as an int in the credentials module
# (plain `+` concatenation raises TypeError for an int port).
client = MilvusClient(f"http://{MILVUS_HOST}:{MILVUS_PORT}", db_name="default")

In [98]:
# Display the collection names visible through `client`.
client.list_collections()

['docs', 'energy_vector']

In [99]:


# Ensure the target collection exists, then load it so it can be queried.
if client.has_collection(COLLECTION_NAME):
    print(f"Collection '{COLLECTION_NAME}' already exists.")
else:
    # First run: create the collection with 768-dimensional vectors compared
    # by cosine similarity, auto-generated primary keys and dynamic fields on.
    client.create_collection(
        collection_name=COLLECTION_NAME,
        dimension=768,
        metric_type="COSINE",
        auto_id=True,
        enable_dynamic_field=True,
    )
    print(f"Collection '{COLLECTION_NAME}' created.")

client.load_collection(COLLECTION_NAME)

Collection 'energy_vector' already exists.


In [100]:
# Report how many rows the collection statistics list for COLLECTION_NAME.
stats = client.get_collection_stats(collection_name=COLLECTION_NAME)
# The "row_count" entry is coerced to int before printing.
row_count = int(stats["row_count"])
print(f"Rows in '{COLLECTION_NAME}': {row_count}")

Rows in 'energy_vector': 50


In [132]:
from typing import Iterable, Dict, Any, List, Generator
from pymilvus import MilvusClient

def _norm_etag(x: str) -> str:
    return str(x).strip('"').strip("'") if x is not None else x

def _batched(seq: List[str], n: int) -> Iterable[List[str]]:
    for i in range(0, len(seq), n):
        yield seq[i:i+n]

def _query_existing_etags(client: MilvusClient, collection: str, etags: List[str], etag_field="etag", batch_size=500) -> set:
    """Return the subset of ``etags`` that already appear in a Milvus collection.

    The lookup is done with ``<etag_field> in [...]`` filters, issuing one
    ``client.query`` call per group of at most ``batch_size`` values.

    Args:
        client: Milvus client used to run the queries.
        collection: Name of the collection to search.
        etags: Values to look for. ``None``/empty entries and duplicates are
            ignored.
        etag_field: Name of the field holding the ETag in the collection.
        batch_size: Maximum number of values per ``in [...]`` filter. A falsy
            or non-positive value means "send everything in one query".

    Returns:
        A set with the values found, each passed through ``_norm_etag``.
        Empty when ``etags`` has nothing to look up (no query is sent).
    """
    found = set()

    # Drop empties and duplicates (order preserved) so no filter is built with
    # an empty ``in []`` list and no value is sent twice.
    wanted = [e for e in dict.fromkeys(etags or []) if e is not None and e != ""]
    if not wanted:
        return found

    step = batch_size if batch_size and batch_size > 0 else len(wanted)
    for start in range(0, len(wanted), step):
        chunk = wanted[start:start + step]
        # Escape backslashes and double quotes so a value cannot terminate the
        # string literal inside the filter expression.
        in_list = ",".join(
            '"' + str(e).replace("\\", "\\\\").replace('"', '\\"') + '"'
            for e in chunk
        )
        filt = f'{etag_field} in [{in_list}]'
        rows = client.query(
            collection_name=collection,
            filter=filt,
            output_fields=[etag_field],
        )

        for r in rows:
            val = r.get(etag_field)
            if val is not None:
                found.add(_norm_etag(val))
    return found

def iter_new_objects_by_etag(
    s3_client,
    bucket: str,
    prefix: str,
    milvus_client: MilvusClient,
    collection_name: str,
    etag_field: str = "ETag",
    page_batch_check: int = 5,
) -> Generator[Dict[str, Any], None, None]:
    """
    Lazily yield the bucket objects whose ETag is not yet recorded in Milvus.

    The bucket listing is consumed one page at a time. Within a page, objects
    are grouped by their normalised ETag (objects without an ETag are
    skipped), the distinct ETags are looked up in Milvus in groups of
    ``page_batch_check``, and every object belonging to an ETag that was not
    found is yielded. Objects sharing an ETag are yielded together, in the
    order in which each ETag first appeared on the page.
    """
    listing = s3_client.get_paginator("list_objects_v2").paginate(
        Bucket=bucket, Prefix=prefix or ""
    )

    milvus_client.load_collection(collection_name)

    for page in listing:
        # Group this page's entries by normalised ETag, keeping first-seen order.
        grouped: Dict[str, List[Dict[str, Any]]] = {}
        for entry in page.get("Contents", []) or []:
            tag = _norm_etag(entry.get("ETag"))
            if tag:
                grouped.setdefault(tag, []).append(entry)

        # Nothing usable on this page (empty listing or no ETags at all).
        if not grouped:
            continue

        # Collect the ETags Milvus already knows about, a few at a time.
        known = set()
        for chunk in _batched(list(grouped), page_batch_check):
            known.update(
                _query_existing_etags(
                    milvus_client,
                    collection_name,
                    chunk,
                    etag_field=etag_field,
                    batch_size=page_batch_check,
                )
            )

        for tag, entries in grouped.items():
            if tag in known:
                continue
            yield from entries


In [133]:
import io
import os
from typing import Tuple, List, Dict, Any
from pypdf import PdfReader


def _is_pdf_key(key: str) -> bool:
    return key.lower().endswith(".pdf")


def read_pdf_from_s3_bytes(s3_client, bucket: str, key: str, max_pages: int = None) -> Tuple[str, List[str], Dict[str, Any]]:
    """
    Download the object into memory (bytes) and extract its text page by page.

    Args:
        s3_client: Client exposing ``get_object(Bucket=..., Key=...)``.
        bucket: Bucket holding the PDF.
        key: Object key of the PDF.
        max_pages: Maximum number of pages to extract. ``None`` (default)
            reads every page; ``0`` or a negative value reads none.

    Returns:
        ``(full_text, pages_list, meta)`` where ``pages_list`` holds the text
        of each page read (``""`` for pages with no extractable text),
        ``full_text`` is those texts joined by blank lines, and ``meta`` has
        the keys ``n_pages_total``, ``n_pages_read``, ``content_length``,
        ``content_type``, ``key`` and ``bucket``.

    Raises:
        Whatever ``get_object`` or the PDF parser raises; nothing is caught
        here apart from the best-effort decryption attempt.
    """
    resp = s3_client.get_object(Bucket=bucket, Key=key)
    stream = resp["Body"]
    try:
        body = stream.read()  # bytes
    finally:
        # Release the underlying connection as soon as the payload is in memory.
        close = getattr(stream, "close", None)
        if callable(close):
            close()
    reader = PdfReader(io.BytesIO(body))

    try:
        if getattr(reader, "is_encrypted", False):
            # Best effort: try the empty password.
            reader.decrypt("")
    except Exception:
        # Deliberately ignored: if the document is still locked, reading the
        # pages below raises and the caller decides how to handle it.
        pass

    # ``None`` means "no limit"; an explicit 0 now means "read nothing"
    # instead of being silently treated as unlimited.
    limit = None if max_pages is None else max(max_pages, 0)

    pages = []
    for i, page in enumerate(reader.pages):
        if limit is not None and i >= limit:
            break
        pages.append(page.extract_text() or "")

    full_text = "\n\n".join(pages)
    meta = {
        "n_pages_total": len(reader.pages),
        "n_pages_read": len(pages),
        "content_length": resp.get("ContentLength"),
        "content_type": resp.get("ContentType"),
        "key": key,
        "bucket": bucket,
    }
    return full_text, pages, meta


In [134]:
# Iterate over the bucket objects whose ETag is not yet stored in the Milvus
# collection and try to extract the text of every PDF among them.
new_objs_iter = iter_new_objects_by_etag(
    s3_client=s3_client,
    bucket=ORACLE_INGEST_BUCKET,
    prefix="",
    milvus_client=client,
    collection_name=COLLECTION_NAME,
    etag_field="ETag",
    page_batch_check=5
)

for obj in new_objs_iter:
    key  = obj["Key"]
    # Use the shared normaliser so the ETag shown here is cleaned exactly the
    # same way as the one compared against Milvus.
    etag = _norm_etag(obj["ETag"])
    print(key," : ",etag)

    # Only PDF objects are parsed; anything else is just listed above.
    if not _is_pdf_key(key):
        continue

    try:
        # text / pages / meta are not used further in this cell; the call
        # checks that the PDF can be downloaded and parsed.
        text, pages, meta = read_pdf_from_s3_bytes(s3_client, ORACLE_INGEST_BUCKET, key)
    except Exception as e:
        # Keep going with the remaining objects, but report what was skipped and why.
        print(f"[skip] {key} ({e})")

April2024.pdf  :  b84b5eb4948f442c0ad455041509946e
December2023.pdf  :  c88be1b3ea330c7b619376b6ed69e96e
Feb2024.pdf  :  c710b51fe86c1b3d4591be580108a369
Jan2024.pdf  :  2b0753a753799559fb4fdadd8702ce59
March2024.pdf  :  b52fdc30107cc39ef339c7b80782a4cc
Nov2023.pdf  :  8ef5b4b8fe9dd0098932c6f7c4151163
October2023.pdf  :  8cede8e9593081b7543c00344f897074
September2023.pdf  :  953f7786a76663cfbe18c669408db85b


In [141]:
# Fetch the rows matching the "id >= 0" filter (change the filter if the
# primary-key field has a different name) together with their dynamic
# "$meta" fields.
rows = client.query(
    collection_name=COLLECTION_NAME,
    filter="id >= 0",
    output_fields=["id", "$meta"],
)

print("Length : ",len(rows))

# Distinct (filename, ETag) pairs across the returned rows.
print({(row["filename"], row["ETag"]) for row in rows})

Length :  21
{('Nov2023.pdf', '8ef5b4b8fe9dd0098932c6f7c4151163'), ('October2023.pdf', '8cede8e9593081b7543c00344f897074'), ('September2023.pdf', '953f7786a76663cfbe18c669408db85b'), ('April2024.pdf', 'b84b5eb4948f442c0ad455041509946e'), ('December2023.pdf', 'c88be1b3ea330c7b619376b6ed69e96e'), ('May2024.pdf', 'e6aa5b06e157a910bea8064308b08d2c'), ('March2024.pdf', 'b52fdc30107cc39ef339c7b80782a4cc'), ('Jan2024.pdf', '2b0753a753799559fb4fdadd8702ce59'), ('Feb2024.pdf', 'c710b51fe86c1b3d4591be580108a369')}
