# **Unstructured Document Retrieval**

In [1]:
import os

POPPLER_PATH = r"C:\poppler\poppler-25.12.0\Library\bin"
os.environ["PATH"] += os.pathsep + POPPLER_PATH

In [2]:
from langchain_community.document_loaders import DirectoryLoader
from langchain_unstructured import UnstructuredLoader
from unstructured.cleaners.core import clean_extra_whitespace

'''
loader = DirectoryLoader(
    "../data/pdf",
    glob="**/*.pdf",
    loader_cls=UnstructuredLoader,
    loader_kwargs={
        "mode": "elements",
        "strategy": "fast",
        "post_processors": [clean_extra_whitespace],
        "languages": ["en"],
        "poppler_path": POPPLER_PATH,
    },
)

docs = loader.load()
docs[5:10]
'''

  from .autonotebook import tqdm as notebook_tqdm


'\nloader = DirectoryLoader(\n    "../data/pdf",\n    glob="**/*.pdf",\n    loader_cls=UnstructuredLoader,\n    loader_kwargs={\n        "mode": "elements",\n        "strategy": "fast",\n        "post_processors": [clean_extra_whitespace],\n        "languages": ["en"],\n        "poppler_path": POPPLER_PATH,\n    },\n)\n\ndocs = loader.load()\ndocs[5:10]\n'

In [26]:
import json
from pathlib import Path
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter

# --- Paths ---
JSONL_FOLDER = Path("../data/ingested/batches")

# --- Setup LangChain RecursiveCharacterTextSplitter ---
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,      # characters per chunk
    chunk_overlap=200,    # overlap to preserve context
    length_function=len,
    add_start_index=True,
    separators=["\n\n", "\n", ". ", " ", ""]
)

all_chunks = []

# --- Loop through all JSONL files in the folder ---
for jsonl_file in JSONL_FOLDER.glob("*.jsonl"):
    docs = []
    with jsonl_file.open("r", encoding="utf-8") as f:
        for line in f:
            data = json.loads(line)
            text = data.get("text", "")
            metadata = data.get("metadata", {})
            if text.strip():
                docs.append(Document(page_content=text, metadata=metadata))
    
    # Split documents into chunks
    file_chunks = text_splitter.split_documents(docs)
    all_chunks.extend(file_chunks)
    print(f"Processed {jsonl_file.name}: {len(docs)} documents → {len(file_chunks)} chunks")

print(f"\nTotal chunks from all JSONL files: {len(all_chunks)}")

# --- Optional: inspect first chunk ---
if all_chunks:
    print("Example chunk text:", all_chunks[0].page_content[:300])
    print("Example chunk metadata:", all_chunks[0].metadata)

Processed batch_0000.jsonl: 10 documents → 10 chunks
Processed batch_0001.jsonl: 10 documents → 10 chunks
Processed batch_0002.jsonl: 10 documents → 10 chunks
Processed batch_0003.jsonl: 10 documents → 10 chunks
Processed batch_0004.jsonl: 10 documents → 10 chunks
Processed batch_0005.jsonl: 10 documents → 10 chunks
Processed batch_0006.jsonl: 10 documents → 10 chunks
Processed batch_0007.jsonl: 10 documents → 10 chunks
Processed batch_0008.jsonl: 10 documents → 10 chunks
Processed batch_0009.jsonl: 10 documents → 10 chunks
Processed batch_0010.jsonl: 10 documents → 10 chunks
Processed batch_0011.jsonl: 10 documents → 10 chunks
Processed batch_0012.jsonl: 10 documents → 10 chunks
Processed batch_0013.jsonl: 10 documents → 10 chunks
Processed batch_0014.jsonl: 10 documents → 10 chunks
Processed batch_0015.jsonl: 10 documents → 10 chunks
Processed batch_0016.jsonl: 10 documents → 10 chunks
Processed batch_0017.jsonl: 10 documents → 10 chunks
Processed batch_0018.jsonl: 10 documents → 10 

In [None]:
"""
Optimized Embedding Script for Unstructured Chunks
--------------------------------------------------
This script loads ALL chunks at once and embeds them in a single operation,
resulting in 6-12x faster performance compared to incremental add_documents().
"""

import json
import time
from pathlib import Path
from langchain_core.documents import Document
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.vectorstores.utils import filter_complex_metadata

# --- Configuration ---
BATCH_DIR = Path("../data/ingested/chunk_batches")
VECTOR_DB_DIR = Path("../vectordb/chromadb")

VECTOR_DB_DIR.mkdir(parents=True, exist_ok=True)

# --- Start Timer ---
start_time = time.time()

print("="*60)
print("OPTIMIZED EMBEDDING PIPELINE - BULK LOADING")
print("="*60)

# STEP 1: Load ALL chunks into memory at once
print("\n Loading all chunks from JSONL files...")
all_docs = []
file_count = 0

for batch_file in sorted(BATCH_DIR.glob("batch_*.jsonl")):
    print(f"  Reading {batch_file.name}...", end=" ")
    chunk_count = 0
    
    with batch_file.open("r", encoding="utf-8") as f:
        for line in f:
            record = json.loads(line)
            
            # Skip empty chunks
            if not record.get("text", "").strip():
                continue
            
            all_docs.append(
                Document(
                    page_content=record["text"],
                    metadata=record["metadata"]
                )
            )
            chunk_count += 1
    
    print(f"{chunk_count} chunks")
    file_count += 1

load_time = time.time()
print(f"\n Loaded {len(all_docs):,} chunks from {file_count} files")
print(f"   Time: {load_time - start_time:.2f} seconds")

# STEP 2: Filter complex metadata once
print("\n Filtering complex metadata...")
all_docs = filter_complex_metadata(all_docs)
filter_time = time.time()
print(f"   Time: {filter_time - load_time:.2f} seconds")

# STEP 3: Create embeddings in ONE SINGLE operation
print("\nCreating vector database with bulk embedding...")
print(f"   Model: nomic-embed-text")
print(f"   Total chunks: {len(all_docs):,}")
print(f"   This may take several minutes...\n")

embeddings = OllamaEmbeddings(model="nomic-embed-text")

# THE KEY OPTIMIZATION: from_documents() instead of add_documents() loop
vectorstore = Chroma.from_documents(
    documents=all_docs,
    embedding=embeddings,
    persist_directory=str(VECTOR_DB_DIR),
    collection_name="unstructured_chunks"
)

embed_time = time.time()

# --- Results ---
total_time = embed_time - start_time
chunks_per_second = len(all_docs) / total_time

print("\n" + "="*60)
print("EMBEDDING COMPLETE!")
print("="*60)
print(f"Total chunks embedded: {len(all_docs):,}")
print(f"Vector DB location: {VECTOR_DB_DIR}")
print(f"Collection count: {vectorstore._collection.count():,}")
print(f"\nPerformance Metrics:")
print(f"   Total time: {total_time:.2f} seconds ({total_time/60:.2f} minutes)")
print(f"   Loading time: {load_time - start_time:.2f}s")
print(f"   Filtering time: {filter_time - load_time:.2f}s")
print(f"   Embedding time: {embed_time - filter_time:.2f}s ({(embed_time - filter_time)/60:.2f} min)")
print(f"   Throughput: {chunks_per_second:.2f} chunks/second")
print("="*60)

OPTIMIZED EMBEDDING PIPELINE - BULK LOADING

📂 Loading all chunks from JSONL files...
  Reading batch_00000.jsonl... 50 chunks
  Reading batch_00001.jsonl... 50 chunks
  Reading batch_00002.jsonl... 50 chunks
  Reading batch_00003.jsonl... 50 chunks
  Reading batch_00004.jsonl... 50 chunks
  Reading batch_00005.jsonl... 50 chunks
  Reading batch_00006.jsonl... 50 chunks
  Reading batch_00007.jsonl... 50 chunks
  Reading batch_00008.jsonl... 50 chunks
  Reading batch_00009.jsonl... 50 chunks
  Reading batch_00010.jsonl... 50 chunks
  Reading batch_00011.jsonl... 50 chunks
  Reading batch_00012.jsonl... 50 chunks
  Reading batch_00013.jsonl... 50 chunks
  Reading batch_00014.jsonl... 50 chunks
  Reading batch_00015.jsonl... 50 chunks
  Reading batch_00016.jsonl... 50 chunks
  Reading batch_00017.jsonl... 50 chunks
  Reading batch_00018.jsonl... 50 chunks
  Reading batch_00019.jsonl... 50 chunks
  Reading batch_00020.jsonl... 50 chunks
  Reading batch_00021.jsonl... 50 chunks
  Reading ba

In [None]:
"""
FINAL OPTIMIZED PIPELINE - NO CHUNKING NEEDED
Using Ollama nomic-embed-text model
"""

import json
import time
from pathlib import Path
from langchain_core.documents import Document
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.vectorstores.utils import filter_complex_metadata

# Configuration
ELEMENTS_DIR = Path("../data/ingested/batches")
VECTOR_DB_DIR = Path("vectordb/chroma_db")

print("="*60)
print("LOADING ELEMENTS (NO CHUNKING NEEDED)")
print("="*60)

print(f"\nLoading from: {ELEMENTS_DIR}")

elements = []
total_files = len(list(ELEMENTS_DIR.glob("batch_*.jsonl")))
print(f"Found {total_files} batch files")

for i, batch_file in enumerate(sorted(ELEMENTS_DIR.glob("batch_*.jsonl")), 1):
    with batch_file.open("r", encoding="utf-8") as f:
        for line in f:
            data = json.loads(line)
            text = data.get("text", "").strip()
            
            # Filter out very small elements (footers, page numbers)
            if len(text) >= 30:
                elements.append(Document(
                    page_content=text,
                    metadata=data["metadata"]
                ))
    
    if i % 100 == 0:
        print(f"  Processed {i}/{total_files} files ({len(elements):,} elements so far)")

print(f"\nLoaded {len(elements):,} elements")

# Element statistics
avg_size = sum(len(e.page_content) for e in elements) / len(elements)
min_size = min(len(e.page_content) for e in elements)
max_size = max(len(e.page_content) for e in elements)

print(f"\nElement Statistics:")
print(f"  Average size: {avg_size:.0f} characters")
print(f"  Min size: {min_size} characters")
print(f"  Max size: {max_size} characters")
print(f"  Estimated words per element: {avg_size/5:.0f}")

# Filter metadata
print("\nFiltering complex metadata...")
elements = filter_complex_metadata(elements)

# Create embeddings with Ollama
print("\nCreating embeddings with Ollama nomic-embed-text...")
print(f"  Elements to embed: {len(elements):,}")
print(f"  This will take approximately 10-15 minutes...")

embeddings = OllamaEmbeddings(model="nomic-embed-text")

print("\nEmbedding in progress...")
start = time.time()

vectorstore = Chroma.from_documents(
    documents=elements,
    embedding=embeddings,
    persist_directory=str(VECTOR_DB_DIR),
    collection_name="kaggle_1k_docs"
)

elapsed = time.time() - start

# Results
print("\n" + "="*60)
print("EMBEDDING COMPLETE")
print("="*60)
print(f"Total elements embedded: {len(elements):,}")
print(f"Vector DB location: {VECTOR_DB_DIR.absolute()}")
print(f"Collection count: {vectorstore._collection.count():,}")
print(f"\nPerformance:")
print(f"  Total time: {elapsed:.2f} seconds ({elapsed/60:.2f} minutes)")
print(f"  Throughput: {len(elements)/elapsed:.2f} elements/second")
print("="*60)

LOADING ELEMENTS (NO CHUNKING NEEDED)

Loading from: ..\data\ingested\batches
Found 23466 batch files
  Processed 100/23466 files (723 elements so far)
  Processed 200/23466 files (1,449 elements so far)
  Processed 300/23466 files (1,838 elements so far)
  Processed 400/23466 files (1,956 elements so far)
  Processed 500/23466 files (2,179 elements so far)
  Processed 600/23466 files (2,284 elements so far)
  Processed 700/23466 files (2,812 elements so far)
  Processed 800/23466 files (3,469 elements so far)
  Processed 900/23466 files (4,190 elements so far)
  Processed 1000/23466 files (4,938 elements so far)
  Processed 1100/23466 files (5,049 elements so far)
  Processed 1200/23466 files (5,143 elements so far)
  Processed 1300/23466 files (5,217 elements so far)
  Processed 1400/23466 files (5,317 elements so far)
  Processed 1500/23466 files (5,411 elements so far)
  Processed 1600/23466 files (5,500 elements so far)
  Processed 1700/23466 files (5,587 elements so far)
  Proces

  embeddings = OllamaEmbeddings(model="nomic-embed-text")
INFO: Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


KeyboardInterrupt: 

In [None]:
"""
IMPROVED PIPELINE - Smart Filtering + Light Chunking
"""

import json
import time
from pathlib import Path
from langchain_core.documents import Document
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.vectorstores.utils import filter_complex_metadata
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Configuration
ELEMENTS_DIR = Path("../data/ingested/batches")
VECTOR_DB_DIR = Path("vectordb/chroma_db")

MIN_ELEMENT_SIZE = 100  # Skip tiny fragments
MAX_ELEMENT_SIZE = 1500  # Split large elements

print("="*60)
print("SMART FILTERING + LIGHT CHUNKING")
print("="*60)

print(f"\nLoading from: {ELEMENTS_DIR}")

raw_elements = []
total_files = len(list(ELEMENTS_DIR.glob("batch_*.jsonl")))
print(f"Found {total_files} batch files")

for i, batch_file in enumerate(sorted(ELEMENTS_DIR.glob("batch_*.jsonl")), 1):
    with batch_file.open("r", encoding="utf-8") as f:
        for line in f:
            data = json.loads(line)
            text = data.get("text", "").strip()
            
            if text:
                raw_elements.append(Document(
                    page_content=text,
                    metadata=data["metadata"]
                ))
    
    if i % 5000 == 0:
        print(f"  Processed {i}/{total_files} files ({len(raw_elements):,} elements)")

print(f"\nLoaded {len(raw_elements):,} raw elements")

# Filter and prepare
print("\nFiltering elements...")
filtered_elements = []
too_small = 0
too_large = 0

for elem in raw_elements:
    size = len(elem.page_content)
    
    if size < MIN_ELEMENT_SIZE:
        too_small += 1
        continue
    elif size > MAX_ELEMENT_SIZE:
        too_large += 1
        # Will chunk these later
        filtered_elements.append(elem)
    else:
        # Perfect size
        filtered_elements.append(elem)

print(f"  Kept: {len(filtered_elements):,} elements")
print(f"  Filtered out {too_small:,} elements (< {MIN_ELEMENT_SIZE} chars)")
print(f"  Found {too_large:,} large elements (> {MAX_ELEMENT_SIZE} chars, will split)")

# Split large elements
if too_large > 0:
    print(f"\nSplitting {too_large:,} large elements...")
    
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
        separators=["\n\n", "\n", ". ", " ", ""]
    )
    
    final_elements = []
    for elem in filtered_elements:
        if len(elem.page_content) > MAX_ELEMENT_SIZE:
            chunks = splitter.split_documents([elem])
            final_elements.extend(chunks)
        else:
            final_elements.append(elem)
else:
    final_elements = filtered_elements

print(f"\nFinal element count: {len(final_elements):,}")

# Statistics
sizes = [len(e.page_content) for e in final_elements]
avg_size = sum(sizes) / len(sizes)
min_size = min(sizes)
max_size = max(sizes)

print(f"\nFinal Element Statistics:")
print(f"  Average size: {avg_size:.0f} characters (~{avg_size/5:.0f} words)")
print(f"  Min size: {min_size} characters")
print(f"  Max size: {max_size} characters")

# Filter metadata
print("\nFiltering complex metadata...")
final_elements = filter_complex_metadata(final_elements)

# Estimate time
estimated_time = len(final_elements) / 40  # ~40 chunks/sec with Ollama CPU
print(f"\nEstimated embedding time: {estimated_time/60:.1f} minutes")

proceed = input("\nProceed with embedding? (y/n): ")
if proceed.lower() != 'y':
    print("Aborted.")
    exit()

# Create embeddings
print("\nCreating embeddings with Ollama nomic-embed-text...")
embeddings = OllamaEmbeddings(model="nomic-embed-text")

print("\nEmbedding in progress...")
start = time.time()

vectorstore = Chroma.from_documents(
    documents=final_elements,
    embedding=embeddings,
    persist_directory=str(VECTOR_DB_DIR),
    collection_name="kaggle_1k_docs"
)

elapsed = time.time() - start

# Results
print("\n" + "="*60)
print("EMBEDDING COMPLETE")
print("="*60)
print(f"Total elements embedded: {len(final_elements):,}")
print(f"Vector DB location: {VECTOR_DB_DIR.absolute()}")
print(f"Collection count: {vectorstore._collection.count():,}")
print(f"\nPerformance:")
print(f"  Total time: {elapsed:.2f} seconds ({elapsed/60:.2f} minutes)")
print(f"  Throughput: {len(final_elements)/elapsed:.2f} elements/second")
print("="*60)

SMART FILTERING + LIGHT CHUNKING

Loading from: ..\data\ingested\batches
Found 23466 batch files
  Processed 5000/23466 files (49,269 elements)
  Processed 10000/23466 files (98,939 elements)
  Processed 15000/23466 files (148,762 elements)
  Processed 20000/23466 files (465,234 elements)

Loaded 499,850 raw elements

Filtering elements...
  Kept: 65,380 elements
  Filtered out 434,470 elements (< 100 chars)
  Found 475 large elements (> 1500 chars, will split)

Splitting 475 large elements...

Final element count: 66,458

Final Element Statistics:
  Average size: 320 characters (~64 words)
  Min size: 1 characters
  Max size: 1500 characters

Filtering complex metadata...

Estimated embedding time: 27.7 minutes
Aborted.

Creating embeddings with Ollama nomic-embed-text...

Embedding in progress...


In [None]:
"""
ONE-SHOT APPROACH: Load → Extract → Filter → Chunk → Embed
Best approach for your use case
"""

import time
from pathlib import Path
from langchain_community.document_loaders import DirectoryLoader
from langchain_unstructured import UnstructuredLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.vectorstores.utils import filter_complex_metadata
from unstructured.cleaners.core import clean_extra_whitespace

# Configuration
PDF_DIR = Path("../data/pdf")
VECTOR_DB_DIR = Path("vectordb/chromadb_direct")
MIN_ELEMENT_SIZE = 100
MAX_ELEMENT_SIZE = 1500

print("="*60)
print("ONE-SHOT PIPELINE: PDF -> ELEMENTS -> CHUNKS -> EMBEDDINGS")
print("="*60)

start_time = time.time()

# Step 1: Load PDFs and extract elements
print("\nStep 1: Loading PDFs and extracting elements...")
print(f"Source: {PDF_DIR}")

loader = DirectoryLoader(
    str(PDF_DIR),
    glob="**/*.pdf",
    loader_cls=UnstructuredLoader,
    show_progress=True,
    loader_kwargs={
        "mode": "elements",
        "strategy": "fast",
        "languages": ["eng"],
        "post_processors": [clean_extra_whitespace],
    }
)

raw_elements = []
for doc in loader.lazy_load():
    raw_elements.append(doc)

load_time = time.time() - start_time
print(f"Loaded {len(raw_elements):,} raw elements in {load_time:.1f}s")

# Step 2: Filter elements
print("\nStep 2: Filtering elements...")
filtered_elements = []
too_small = 0
large_elements = []

for elem in raw_elements:
    text = elem.page_content.strip()
    size = len(text)
    
    if size < MIN_ELEMENT_SIZE:
        too_small += 1
        continue
    elif size > MAX_ELEMENT_SIZE:
        large_elements.append(elem)
    else:
        filtered_elements.append(elem)

print(f"  Filtered out {too_small:,} small elements (< {MIN_ELEMENT_SIZE} chars)")
print(f"  Kept {len(filtered_elements):,} good elements ({MIN_ELEMENT_SIZE}-{MAX_ELEMENT_SIZE} chars)")
print(f"  Found {len(large_elements):,} large elements (> {MAX_ELEMENT_SIZE} chars)")

# Step 3: Split large elements
if large_elements:
    print("\nStep 3: Splitting large elements...")
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
        separators=["\n\n", "\n", ". ", " ", ""]
    )
    
    for elem in large_elements:
        chunks = splitter.split_documents([elem])
        filtered_elements.extend(chunks)
    
    print(f"  Created {len(filtered_elements):,} total chunks after splitting")
else:
    print("\nStep 3: No large elements to split")

# Step 4: Filter complex metadata
print("\nStep 4: Filtering complex metadata...")
final_elements = filter_complex_metadata(filtered_elements)

# Remove any elements that became too small after metadata filtering
final_elements = [e for e in final_elements if len(e.page_content.strip()) >= MIN_ELEMENT_SIZE]
print(f"  Final element count: {len(final_elements):,}")

# Statistics
sizes = [len(e.page_content) for e in final_elements]
avg_size = sum(sizes) / len(sizes)
min_size = min(sizes)
max_size = max(sizes)

print(f"\nFinal Statistics:")
print(f"  Average: {avg_size:.0f} chars (~{avg_size/5:.0f} words)")
print(f"  Min: {min_size} chars")
print(f"  Max: {max_size} chars")

# Step 5: Embed
estimated_time = len(final_elements) / 40
print(f"\nStep 5: Creating embeddings...")
print(f"  Elements to embed: {len(final_elements):,}")
print(f"  Estimated time: {estimated_time/60:.1f} minutes")

embeddings = OllamaEmbeddings(model="nomic-embed-text")

embed_start = time.time()
vectorstore = Chroma.from_documents(
    documents=final_elements,
    embedding=embeddings,
    persist_directory=str(VECTOR_DB_DIR),
    collection_name="humanitarian_docs"
)
embed_time = time.time() - embed_start

# Results
total_time = time.time() - start_time

print("\n" + "="*60)
print("PIPELINE COMPLETE")
print("="*60)
print(f"Total elements embedded: {len(final_elements):,}")
print(f"Vector DB: {VECTOR_DB_DIR.absolute()}")
print(f"Collection count: {vectorstore._collection.count():,}")
print(f"\nTiming:")
print(f"  Loading & extraction: {load_time:.1f}s")
print(f"  Filtering & chunking: {embed_start - start_time - load_time:.1f}s")
print(f"  Embedding: {embed_time:.1f}s ({embed_time/60:.1f} min)")
print(f"  Total: {total_time:.1f}s ({total_time/60:.1f} min)")
print(f"  Throughput: {len(final_elements)/embed_time:.1f} chunks/sec")
print("="*60)

# Test
print("\nTesting retrieval...")
results = vectorstore.similarity_search(
    "What are the recommendations for data sharing with donors?",
    k=3
)

print("\nTop 3 results:")
for i, doc in enumerate(results, 1):
    print(f"\n{i}. {doc.page_content[:200]}...")
    print(f"   Source: {doc.metadata.get('filename', 'unknown')}")
    print(f"   Page: {doc.metadata.get('page_number', 'unknown')}")

  from .autonotebook import tqdm as notebook_tqdm


ONE-SHOT PIPELINE: PDF -> ELEMENTS -> CHUNKS -> EMBEDDINGS

Step 1: Loading PDFs and extracting elements...
Source: ..\data\pdf


  0%|          | 0/1076 [00:00<?, ?it/s]INFO: pikepdf C++ to Python logger bridge initialized
  8%|▊         | 90/1076 [03:13<26:32,  1.62s/it]  INFO: PDF text extraction failed, skip text extraction...
INFO: PDF text extraction failed, skip text extraction...
 31%|███       | 333/1076 [09:26<07:58,  1.55it/s]INFO: PDF text extraction failed, skip text extraction...
 50%|████▉     | 536/1076 [15:04<20:22,  2.26s/it]

In [2]:
"""
FIXED: Strip heavy metadata before embedding
"""

import json
from pathlib import Path
from langchain_core.documents import Document
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import Chroma

print("Loading elements...")
ELEMENTS_DIR = Path("../data/ingested/batches")

elements = []
for batch_file in sorted(ELEMENTS_DIR.glob("batch_*.jsonl")):
    with batch_file.open("r", encoding="utf-8") as f:
        for line in f:
            data = json.loads(line)
            text = data.get("text", "").strip()
            
            if len(text) >= 100:
                # CRITICAL FIX: Keep only essential metadata
                clean_metadata = {
                    "source": data["metadata"].get("filename", "unknown"),
                    "page": data["metadata"].get("page_number", 1),
                    "category": data["metadata"].get("category", "unknown"),
                }
                
                elements.append(Document(
                    page_content=text,
                    metadata=clean_metadata  # Stripped metadata
                ))

print(f"Loaded: {len(elements):,} elements")

# Now embed
embeddings = OllamaEmbeddings(model="nomic-embed-text")

vectorstore = Chroma.from_documents(
    documents=elements,
    embedding=embeddings,
    persist_directory="vectordb/chromadb_clean"
)

print(f"Done! {vectorstore._collection.count():,} chunks")

Loading elements...


KeyboardInterrupt: 

In [3]:
"""
TEST: First 20 files only - Check timing
"""

import json
import time
from pathlib import Path
from langchain_core.documents import Document
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import Chroma

print("="*60)
print("TEST RUN - First 20 Files Only")
print("="*60)

start_time = time.time()

ELEMENTS_DIR = Path("../data/ingested/batches")
print(f"\nLoading from: {ELEMENTS_DIR}")

elements = []
file_count = 0
MAX_FILES = 20

for batch_file in sorted(ELEMENTS_DIR.glob("batch_*.jsonl")):
    if file_count >= MAX_FILES:
        break
    
    with batch_file.open("r", encoding="utf-8") as f:
        for line in f:
            data = json.loads(line)
            text = data.get("text", "").strip()
            
            if len(text) >= 100:
                clean_metadata = {
                    "source": data["metadata"].get("filename", "unknown"),
                    "page": data["metadata"].get("page_number", 1),
                    "category": data["metadata"].get("category", "unknown"),
                }
                
                elements.append(Document(
                    page_content=text,
                    metadata=clean_metadata
                ))
    
    file_count += 1
    print(f"  File {file_count}/{MAX_FILES}: {len(elements)} elements loaded")

load_time = time.time() - start_time
print(f"\nLoaded {len(elements):,} elements from {file_count} files")
print(f"Load time: {load_time:.2f}s")

# Check stats
sizes = [len(e.page_content) for e in elements]
avg_size = sum(sizes) / len(sizes)
print(f"Average chunk size: {avg_size:.0f} characters")

# Embed
print("\nStarting embedding...")
embeddings = OllamaEmbeddings(model="nomic-embed-text")

embed_start = time.time()
vectorstore = Chroma.from_documents(
    documents=elements,
    embedding=embeddings,
    persist_directory="../vectordb/chromadb_test"
)
embed_time = time.time() - embed_start

total_time = time.time() - start_time

print("\n" + "="*60)
print("TEST COMPLETE")
print("="*60)
print(f"Chunks embedded: {vectorstore._collection.count():,}")
print(f"Load time: {load_time:.2f}s")
print(f"Embed time: {embed_time:.2f}s ({embed_time/60:.2f} min)")
print(f"Total time: {total_time:.2f}s ({total_time/60:.2f} min)")
print(f"Speed: {len(elements)/embed_time:.2f} chunks/sec")
print("="*60)

# Extrapolate
total_files = len(list(ELEMENTS_DIR.glob("batch_*.jsonl")))
estimated_total_chunks = len(elements) * (total_files / file_count)
estimated_total_time = total_time * (total_files / file_count)

print(f"\nExtrapolation for all {total_files} files:")
print(f"  Estimated chunks: {estimated_total_chunks:,.0f}")
print(f"  Estimated time: {estimated_total_time/60:.1f} minutes")

TEST RUN - First 20 Files Only

Loading from: ..\data\ingested\batches
  File 1/20: 5 elements loaded
  File 2/20: 12 elements loaded
  File 3/20: 20 elements loaded
  File 4/20: 26 elements loaded
  File 5/20: 31 elements loaded
  File 6/20: 37 elements loaded
  File 7/20: 40 elements loaded
  File 8/20: 47 elements loaded
  File 9/20: 51 elements loaded
  File 10/20: 56 elements loaded
  File 11/20: 65 elements loaded
  File 12/20: 69 elements loaded
  File 13/20: 73 elements loaded
  File 14/20: 82 elements loaded
  File 15/20: 85 elements loaded
  File 16/20: 89 elements loaded
  File 17/20: 92 elements loaded
  File 18/20: 97 elements loaded
  File 19/20: 103 elements loaded
  File 20/20: 107 elements loaded

Loaded 107 elements from 20 files
Load time: 0.14s
Average chunk size: 325 characters

Starting embedding...

TEST COMPLETE
Chunks embedded: 107
Load time: 0.14s
Embed time: 228.30s (3.80 min)
Total time: 228.44s (3.81 min)
Speed: 0.47 chunks/sec

Extrapolation for all 23466 