In [2]:
import chromadb
from chromadb.config import Settings

# Path to your ChromaDB directory
CHROMA_DB_PATH = "/shared_folders/team_1/austin/chroma_db"

# Connect to the ChromaDB client
client = chromadb.PersistentClient(path=CHROMA_DB_PATH)

# List and print all collection names
collections = client.list_collections()
print(f"\n🔍 Found {len(collections)} collection(s):\n")

for col_name in collections:
    print(f"📁 Collection: {col_name}")

    # Access the collection
    collection = client.get_collection(name=col_name)

    # Query the first few documents (n_results can be changed)
    try:
        results = collection.query(query_texts=["Show me documents"], n_results=5)

        docs = results.get("documents", [[]])[0]
        ids = results.get("ids", [[]])[0]
        metadatas = results.get("metadatas", [[]])[0]

        if docs:
            print("📝 Sample documents:\n")
            for idx, doc in enumerate(docs):
                print(f"  {idx + 1}. 🆔 ID: {ids[idx]}")
                print(f"     📎 Metadata: {metadatas[idx] if idx < len(metadatas) else 'None'}")
                print(f"     📄 Content: {doc[:500]}{'...' if len(doc) > 500 else ''}\n")
        else:
            print("⚠️  No documents found in this collection.")

    except Exception as e:
        print(f"❌ Error querying collection '{col_name}': {e}")

    print("\n" + "-" * 60 + "\n")



🔍 Found 1 collection(s):

📁 Collection: markdown_documents
📝 Sample documents:

  1. 🆔 ID: 20110118195823_001.md_chunk_20
     📎 Metadata: {'source_file': '20110118195823_001.md'}
     📄 Content: CALIFORNIA 94520 ###### LU TELEX: 67-5570 TELEPHONE: (510)798-2940 `TELEFAX: (909) 592-6920 OR (909) 592-3399` TELEFAX: (510) 798-2944 2, ALL DOCUMENTS SHALL BE SENT TO THE FOLLOWING: `-DOCUMENTS WITH QUOTATION` -TO PURCHASING AGENT `J. BIGONY - SAN DIHAS OFFICE` `-ORDER PROGRESS DOCUMENTS` -TO PURCHASING AGENT `J. BIGONY` `- SAN DIMAS` `OFFICE` -SUB-ORDERS -TO PURCHASING AGENT J. BIGONY ["] SAN DIMAS OFFICE -ALL OTHER COMMUNICATION -TO PURCHASING AGENT J. BIGONY - SAN DIMAS OFFICE -DOCUMENTS FO...

  2. 🆔 ID: gei100752.md_chunk_38
     📎 Metadata: {'source_file': 'gei100752.md'}
     📄 Content: a navigation link labeled "Reports," which suggests that clicking on this link will lead to a page displaying links to all configured reports. Here is the transcription of all visible text: ``` Histor

In [5]:
query = input("Enter a query (e.g. 'purchase orders', 'historian reports'): ")
results = collection.query(query_texts=[query], n_results=5)


In [17]:
import chromadb
import json

# Path to your ChromaDB
CHROMA_DB_PATH = "/shared_folders/team_1/austin/chroma_db"

# Connect to the collection
client = chromadb.PersistentClient(path=CHROMA_DB_PATH)
collection = client.get_collection("markdown_documents")

# Get the first 10 entries with all fields
results = collection.get(limit=10, include=["documents", "metadatas", "embeddings"])

# Pretty-print each entry
for i in range(len(results["ids"])):
    embedding = results["embeddings"][i]
    embedding_preview = embedding[:5].tolist() + ["..."] if embedding is not None else None

    entry = {
        "id": results["ids"][i],
        "document": results["documents"][i],
        "metadata": results["metadatas"][i],
        "embedding": embedding_preview
    }

    print(f"📦 Entry {i + 1}")
    print(json.dumps(entry, indent=2))
    print("-" * 60)


📦 Entry 1
{
  "id": "20111219094640186_0007.md_chunk_0",
  "document": "The image appears to be an engineering drawing or blueprint, likely for a construction project. It includes various dimensions, details, and specifications related to the components of a structure. Here is a detailed description along with the transcribed text: ### Bill of Material | No. of Shipping Pieces | Piece Mark | Shape | Length | Remarks | WT. | |------------------------|------------|----------------------|---------|--------------|-----| | 1 | A36 | SCR Door Panels | 7 | | 0 | | | | L5 x",
  "metadata": {
    "source_file": "20111219094640186_0007.md"
  },
  "embedding": [
    -0.02710399590432644,
    0.07315632700920105,
    -0.023834863677620888,
    -0.01162189431488514,
    0.0229035671800375,
    "..."
  ]
}
------------------------------------------------------------
📦 Entry 2
{
  "id": "20110120151608_001_Material_Tests.md_chunk_0",
  "document": "PAR.T HO: PRW#: A-~8009-1 Penetron~ Efg. ARO~OX Ligh

In [7]:
"""
Data exploration script for BASF ChromaDB documents.
This script analyzes the available document data to understand what fields and patterns exist.
"""
import chromadb
import numpy as np
import pandas as pd
import json
import re
import os
from collections import Counter

# Path to your ChromaDB
CHROMA_DB_PATH = "/shared_folders/team_1/austin/chroma_db"

def explore_document_data(limit=100):
    """
    Explore the document data in ChromaDB and return insights
    about what fields and patterns exist.
    """
    print("Connecting to ChromaDB...")
    client = chromadb.PersistentClient(path=CHROMA_DB_PATH)
    collection = client.get_collection("markdown_documents")
    
    print(f"Retrieving sample of {limit} document chunks...")
    results = collection.get(include=["documents", "metadatas", "embeddings", "ids"], limit=limit)
    
    # Basic statistics
    print(f"\n=== Basic Statistics ===")
    print(f"Total chunks retrieved: {len(results['ids'])}")
    
    # Analyze document IDs
    print(f"\n=== Document ID Patterns ===")
    id_patterns = []
    for doc_id in results['ids']:
        # Extract pattern by replacing digits with 'D' and letters with 'L'
        pattern = ''.join('D' if c.isdigit() else 'L' if c.isalpha() else c for c in doc_id)
        id_patterns.append(pattern)
    
    pattern_counts = Counter(id_patterns)
    print(f"Top ID patterns:")
    for pattern, count in pattern_counts.most_common(5):
        print(f"  {pattern}: {count} occurrences")
        print(f"  Example: {next(id for id in results['ids'] if ''.join('D' if c.isdigit() else 'L' if c.isalpha() else c for c in id) == pattern)}")
    
    # Analyze metadata fields
    print(f"\n=== Metadata Analysis ===")
    metadata_fields = set()
    for meta in results['metadatas']:
        metadata_fields.update(meta.keys())
    
    print(f"Available metadata fields: {', '.join(metadata_fields)}")
    
    for field in metadata_fields:
        values = [meta.get(field) for meta in results['metadatas'] if field in meta]
        unique_values = set(values)
        
        print(f"\nField: {field}")
        print(f"  Present in {len(values)}/{len(results['metadatas'])} metadata entries ({len(values)/len(results['metadatas'])*100:.1f}%)")
        print(f"  Unique values: {len(unique_values)}")
        
        if len(unique_values) <= 10:
            # If there are only a few unique values, show them all
            value_counts = Counter(values)
            for val, count in value_counts.most_common():
                print(f"    {val}: {count} occurrences")
        else:
            # Otherwise just show the most common ones
            value_counts = Counter(values)
            print(f"  Most common values:")
            for val, count in value_counts.most_common(5):
                print(f"    {val}: {count} occurrences")
    
    # Analyze source files
    print(f"\n=== Source File Analysis ===")
    source_files = [meta.get('source_file', 'unknown') for meta in results['metadatas']]
    unique_sources = set(source_files)
    
    print(f"Total unique source files: {len(unique_sources)}")
    
    # Find patterns in source file names
    source_extensions = Counter([os.path.splitext(src)[1] for src in source_files if src != 'unknown'])
    print(f"Source file extensions:")
    for ext, count in source_extensions.most_common():
        print(f"  {ext if ext else '(no extension)'}: {count} occurrences")
    
    # Look for date patterns in filenames
    date_pattern = re.compile(r'(\d{8})')  # Looking for YYYYMMDD pattern
    files_with_dates = [f for f in source_files if date_pattern.search(f)]
    print(f"Files with date patterns: {len(files_with_dates)}/{len(source_files)} ({len(files_with_dates)/len(source_files)*100:.1f}%)")
    
    if files_with_dates:
        print(f"Examples of files with dates:")
        for i, f in enumerate(files_with_dates[:5]):
            print(f"  {i+1}. {f}")
    
    # Analyze document content
    print(f"\n=== Document Content Analysis ===")
    doc_lengths = [len(doc) for doc in results['documents']]
    
    print(f"Document length statistics:")
    print(f"  Min: {min(doc_lengths)} characters")
    print(f"  Max: {max(doc_lengths)} characters")
    print(f"  Average: {sum(doc_lengths)/len(doc_lengths):.1f} characters")
    
    # Check for common patterns in content
    content_patterns = {
        "Contains table": sum(1 for doc in results['documents'] if '|' in doc and '-|-' in doc),
        "Contains image reference": sum(1 for doc in results['documents'] if 'image' in doc.lower() or 'figure' in doc.lower()),
        "Contains bullet points": sum(1 for doc in results['documents'] if '* ' in doc or '- ' in doc),
        "Contains code blocks": sum(1 for doc in results['documents'] if '```' in doc),
        "Contains headers": sum(1 for doc in results['documents'] if re.search(r'#+\s', doc))
    }
    
    print(f"Content patterns:")
    for pattern, count in content_patterns.items():
        print(f"  {pattern}: {count}/{len(results['documents'])} ({count/len(results['documents'])*100:.1f}%)")
    
    # Analyze embeddings
    print(f"\n=== Embedding Analysis ===")
    if results['embeddings'] and len(results['embeddings']) > 0:
        embedding_dimensions = len(results['embeddings'][0])
        print(f"Embedding dimensions: {embedding_dimensions}")
        
        # Calculate statistics on a sample embedding
        sample_embedding = results['embeddings'][0]
        print(f"Sample embedding statistics:")
        print(f"  Min value: {min(sample_embedding):.4f}")
        print(f"  Max value: {max(sample_embedding):.4f}")
        print(f"  Mean value: {sum(sample_embedding)/len(sample_embedding):.4f}")
    else:
        print("No embeddings found in the retrieved data.")
    
    # Return a structured summary
    summary = {
        "total_chunks": len(results['ids']),
        "metadata_fields": list(metadata_fields),
        "unique_source_files": len(unique_sources),
        "common_extensions": dict(source_extensions.most_common(3)),
        "has_dates_in_filenames": len(files_with_dates) > 0,
        "embedding_dimensions": len(results['embeddings'][0]) if results['embeddings'] and len(results['embeddings']) > 0 else 0
    }
    
    return summary, results

def cluster_preview(results, n_chunks_per_source=3):
    """
    Create a preview of how documents might cluster by source file
    """
    print("\n=== Document Clustering Preview ===")
    
    # Group by source file
    source_groups = {}
    for i, doc_id in enumerate(results['ids']):
        source_file = results['metadatas'][i].get('source_file', 'unknown')
        
        if source_file not in source_groups:
            source_groups[source_file] = []
        
        source_groups[source_file].append({
            'id': doc_id,
            'text_preview': results['documents'][i][:100] + '...' if len(results['documents'][i]) > 100 else results['documents'][i]
        })
    
    # Print preview of a few source files and their chunks
    print(f"Preview of {min(5, len(source_groups))} source files and their chunks:")
    
    for i, (source, chunks) in enumerate(list(source_groups.items())[:5]):
        print(f"\nSource File {i+1}: {source}")
        print(f"Total chunks: {len(chunks)}")
        
        # Show a few chunks from this source
        for j, chunk in enumerate(chunks[:n_chunks_per_source]):
            print(f"  Chunk {j+1} ({chunk['id']}):")
            print(f"    {chunk['text_preview']}")
    
    # Estimate how many unique documents we might have
    print(f"\nEstimated document count: {len(source_groups)} unique source files")
    
    return source_groups

def analyze_source_filenames(source_files):
    """
    Analyze patterns in source filenames to extract potential metadata
    """
    print("\n=== Source Filename Pattern Analysis ===")
    
    # Try to extract dates (format: YYYYMMDD)
    date_pattern = re.compile(r'(\d{8})')
    dates = {}
    
    for source in source_files:
        match = date_pattern.search(source)
        if match:
            date_str = match.group(1)
            try:
                year = int(date_str[:4])
                month = int(date_str[4:6])
                day = int(date_str[6:8])
                
                if 1990 <= year <= 2024 and 1 <= month <= 12 and 1 <= day <= 31:
                    dates[source] = {
                        'year': year,
                        'month': month, 
                        'day': day,
                        'full_date': f"{year}-{month:02d}-{day:02d}"
                    }
            except ValueError:
                pass
    
    print(f"Found dates in {len(dates)}/{len(source_files)} filenames ({len(dates)/len(source_files)*100:.1f}%)")
    
    if dates:
        years = Counter([d['year'] for d in dates.values()])
        print(f"Years distribution:")
        for year, count in sorted(years.items()):
            print(f"  {year}: {count} files")
    
    # Extract other potential metadata from filenames
    words_in_filenames = []
    for source in source_files:
        # Remove date patterns and file extensions
        clean_name = date_pattern.sub('', source)
        clean_name = os.path.splitext(clean_name)[0]
        
        # Split by non-alphanumeric characters
        words = re.findall(r'[a-zA-Z]{3,}', clean_name)
        words_in_filenames.extend([w.lower() for w in words])
    
    common_words = Counter(words_in_filenames)
    print(f"\nMost common words in filenames:")
    for word, count in common_words.most_common(15):
        print(f"  {word}: {count} occurrences")
    
    return {
        'dates': dates,
        'common_words': dict(common_words.most_common(15))
    }

def save_results(summary, filename='document_data_summary.json'):
    """Save the summary results to a JSON file"""
    with open(filename, 'w') as f:
        json.dump(summary, f, indent=2)
    print(f"\nSummary saved to {filename}")

def main():
    """Main function to run the data exploration"""
    print("Starting document data exploration...")
    
    # Get document data
    summary, results = explore_document_data(limit=200)
    
    # Preview potential clusters
    source_groups = cluster_preview(results)
    
    # Analyze source filenames
    source_files = list(source_groups.keys())
    filename_analysis = analyze_source_filenames(source_files)
    
    # Add the filename analysis to the summary
    summary['filename_analysis'] = filename_analysis
    
    # Save results
    save_results(summary)
    
    print("\nExploration complete!")

if __name__ == "__main__":
    main()

Starting document data exploration...
Connecting to ChromaDB...
Retrieving sample of 200 document chunks...


ValueError: Expected include item to be one of documents, embeddings, metadatas, distances, uris, data, got ids in get.