In [1]:
# %% [markdown]
# # Database Schema Exploration
#
# This notebook connects to the database and inspects the schemas of the tables
# within the `public` schema.

# %%
import sys
import time
from sqlalchemy import create_engine, text, inspect as sql_inspect
import warnings

# --- Configuration ---
# Filesystem location of the VDB pipeline code; edit to match your environment.
VDB_PIPELINE_PATH = "/shared_folders/team_1/mark_vdb/vdb_pipeline" #<-- ADJUST IF NEEDED

# --- Make the pipeline importable ---
# Guarded so that re-running the cell does not add the same entry twice.
if sys.path.count(VDB_PIPELINE_PATH) == 0:
    sys.path.append(VDB_PIPELINE_PATH)

# --- Import database initializer ---
# init_vector_db is unpacked later as (session, engine).
try:
    from init_vector_db import init_vector_db
except ImportError as import_error:
    INIT_DB_AVAILABLE = False
    warnings.warn(f"⚠️ Failed to import init_vector_db. Check VDB_PIPELINE_PATH: {import_error}")

    # Stand-in with the same call shape so later code can still unpack two values.
    def init_vector_db(wipe_database=False):
        return None, None
else:
    INIT_DB_AVAILABLE = True
    print("✅ init_vector_db imported successfully.")

# --- Connect and Inspect Schema ---
# Connects through init_vector_db, lists every table in the `public` schema and
# prints one aligned line (name, type, nullability) per column.
session = None
engine = None

print("\nConnecting to database to inspect schema...")
connect_start_time = time.time()

try:
    if not INIT_DB_AVAILABLE:
        print("   Skipping database connection as init_vector_db was not imported.")
    else:
        session, engine = init_vector_db(wipe_database=False)

        if not engine:
            print("   ❌ Failed to establish database connection via init_vector_db.")
        else:
            print(f"   Connected in {time.time() - connect_start_time:.2f}s")
            inspector = sql_inspect(engine)

            # --- Get List of Tables ---
            print("\nInspecting tables in 'public' schema...")
            table_names = inspector.get_table_names(schema='public')

            if not table_names:
                print("   No tables found in the 'public' schema.")
            else:
                print(f"   Found tables: {', '.join(table_names)}")

                # --- Inspect Columns for Each Table ---
                for table_name in table_names:
                    print(f"\n--- Schema for table: public.{table_name} ---")
                    try:
                        columns = inspector.get_columns(table_name, schema='public')
                        if columns:
                            for column in columns:
                                col_name = column['name']
                                col_type = str(column['type'])  # type object -> printable text
                                nullability = 'NULLABLE' if column['nullable'] else 'NOT NULL'
                                print(f"   • {col_name:<20} {col_type:<25} {nullability}")
                        else:
                            print("      No columns found for this table.")
                    except Exception as col_error:
                        # One unreadable table must not abort inspection of the others.
                        print(f"      ⚠️ Error inspecting columns for {table_name}: {col_error}")

except Exception as e:
    print(f"\n❌ An unexpected error occurred: {e}")
finally:
    # --- Close Connection ---
    if session is not None:
        # Close unconditionally: gating on `is_active` skipped the close exactly
        # when a failed transaction had left the session inactive.
        session.close()
        print("\nDatabase session closed.")
    elif engine is not None:
        # No session to close; release pooled connections so the message is true.
        engine.dispose()
        print("\nDatabase connection closed.")

print("\n✅ Schema exploration finished.")

# %% [markdown]
# ## Next Steps
#
# 1. Review the tables and their columns listed above.
# 2. Identify the tables you want to explore further (e.g., `summary_vectors`).
# 3. Create subsequent cells to query specific tables for row counts, sample data, etc.



✅ init_vector_db imported successfully.

Connecting to database to inspect schema...
CHECKING IF SEARCH INDICES CREATED: True
   Connected in 0.02s

Inspecting tables in 'public' schema...
   Found tables: spatial_ref_sys, vector_db, summary_vectors, mock_items_2, vector_db_2

--- Schema for table: public.spatial_ref_sys ---
   • srid                 INTEGER                   NOT NULL
   • auth_name            VARCHAR(256)              NULLABLE
   • auth_srid            INTEGER                   NULLABLE
   • srtext               VARCHAR(2048)             NULLABLE
   • proj4text            VARCHAR(2048)             NULLABLE

--- Schema for table: public.vector_db ---
   • id                   INTEGER                   NOT NULL
   • chunkid              INTEGER                   NULLABLE
   • description          TEXT                      NULLABLE
   • category             TEXT                      NULLABLE
   • md                   JSONB                     NULLABLE
   • filepath      

In [2]:
# %% [markdown]
# # Explore `vector_db` and `vector_db_2` Tables
#
# This notebook cell connects to the database and inspects the content
# of the `public.vector_db` and `public.vector_db_2` tables by fetching
# row counts and sample data.

# %%
import sys
import time
import pandas as pd
from sqlalchemy import text, inspect as sql_inspect
import warnings

# --- Configuration ---
# Location of the VDB pipeline checkout; change it to match your environment.
VDB_PIPELINE_PATH = "/shared_folders/team_1/mark_vdb/vdb_pipeline" #<-- ADJUST IF NEEDED
# Number of example rows pulled from each explored table.
SAMPLE_SIZE = 3

# --- Make the pipeline importable ---
# Skip the append when the entry is already present (e.g. the cell was re-run).
if sys.path.count(VDB_PIPELINE_PATH) == 0:
    sys.path.append(VDB_PIPELINE_PATH)

# --- Import database initializer ---
try:
    from init_vector_db import init_vector_db
except ImportError as import_error:
    INIT_DB_AVAILABLE = False
    warnings.warn(f"⚠️ Failed to import init_vector_db. Check VDB_PIPELINE_PATH: {import_error}")

    # Stand-in with the same call shape so later code can still unpack two values.
    def init_vector_db(wipe_database=False):
        return None, None
else:
    INIT_DB_AVAILABLE = True
    print("✅ init_vector_db imported successfully.")

# --- Function to Explore a Table ---
def _quote_identifier(name):
    """Return ``name`` as a double-quoted SQL identifier, escaping embedded quotes."""
    return '"' + str(name).replace('"', '""') + '"'


def explore_table(table_name, session, engine, inspector):
    """Print the row count and a small, truncated sample of ``public.<table_name>``.

    Args:
        table_name: Unqualified table name inside the ``public`` schema.
        session: Object exposing ``execute()``; used for the ``COUNT(*)`` query.
        engine: Connectable handed to ``pandas.read_sql`` for the sample query.
        inspector: Object exposing ``has_table()`` and ``get_columns()``.

    Returns:
        None. Results are reported through ``print``/``display``. Any exception
        raised while querying is caught and printed as a warning.
    """
    print(f"\n{'='*10} Exploring Table: public.{table_name} {'='*10}")

    if not inspector.has_table(table_name, schema="public"):
        print(f"   ❌ Table 'public.{table_name}' does not exist.")
        return

    print(f"   ✅ Table 'public.{table_name}' exists.")
    try:
        # has_table() matched this exact name, so quote it to keep the SQL
        # lookup exact too (unquoted identifiers get case-folded).
        qualified_table = f"public.{_quote_identifier(table_name)}"

        # Get row count
        count_result = session.execute(
            text(f"SELECT COUNT(*) FROM {qualified_table};")
        ).scalar_one_or_none()
        row_count = count_result if count_result is not None else 0
        print(f"   📊 Current row count: {row_count}")

        if row_count <= 0:
            print("   ℹ️ Table is empty.")
            return

        # Get column names
        columns_info = inspector.get_columns(table_name, schema="public")
        existing_columns = [col['name'] for col in columns_info] if columns_info else []
        if not existing_columns:
            print("      Could not retrieve column names.")
            return

        print(f"\n   Fetching {SAMPLE_SIZE} sample rows...")
        select_cols_str = ", ".join(_quote_identifier(col) for col in existing_columns)
        # Order by `id` only when the table has such a column; a hard-coded
        # ORDER BY id makes the query fail on tables without one.
        order_clause = 'ORDER BY "id"' if 'id' in existing_columns else ''
        sample_query = text(f"""
            SELECT {select_cols_str}
            FROM {qualified_table}
            {order_clause}
            LIMIT :limit;
        """)
        df_sample = pd.read_sql(sample_query, con=engine, params={'limit': SAMPLE_SIZE})

        if df_sample.empty:
            print("      Could not fetch sample rows.")
            return

        print(f"\n   📄 Sample data ({len(df_sample)} rows):")
        cols_to_display = df_sample.columns.tolist()
        # Replace long columns by a shortened "<name>_preview" column
        # (embedding: 50 chars, markdown/description: 100 chars).
        for col, width in (('embedding', 50), ('markdown', 100), ('description', 100)):
            if col in df_sample.columns:
                preview_col = f'{col}_preview'
                df_sample[preview_col] = df_sample[col].astype(str).str[:width] + '...'
                cols_to_display.remove(col)
                cols_to_display.append(preview_col)

        # `display` is the rich-output helper provided by the notebook kernel.
        display(df_sample[[col for col in cols_to_display if col in df_sample.columns]])

    except Exception as e:
        print(f"   ⚠️ Error exploring table {table_name}: {e}")


# --- Connect and Explore Tables ---
# Connects through init_vector_db and runs explore_table() on the two tables
# of interest, then releases the database resources.
session = None
engine = None

print("\nConnecting to database to explore tables...")
connect_start_time = time.time()

try:
    if not INIT_DB_AVAILABLE:
        print("   Skipping database connection as init_vector_db was not imported.")
    else:
        session, engine = init_vector_db(wipe_database=False)

        if engine:  # Check if connection was successful
            print(f"   Connected in {time.time() - connect_start_time:.2f}s")
            inspector = sql_inspect(engine)

            # Explore the tables of interest
            for table_of_interest in ("vector_db", "vector_db_2"):
                explore_table(table_of_interest, session, engine, inspector)
        else:
            print("   ❌ Failed to establish database connection via init_vector_db.")

except Exception as e:
    print(f"\n❌ An unexpected error occurred: {e}")
finally:
    # --- Close Connection ---
    if session is not None:
        # Close unconditionally: gating on `is_active` skipped the close exactly
        # when a failed transaction had left the session inactive.
        session.close()
        print("\nDatabase session closed.")
    elif engine is not None:
        # No session to close; release pooled connections so the message is true.
        engine.dispose()
        print("\nDatabase connection closed.")

print("\n✅ Table exploration finished.")

# %% [markdown]
# ## Next Steps
#
# 1. Review the row counts and sample data for `vector_db` and `vector_db_2`.
# 2. Compare their structure and content to `summary_vectors` and the output of `test.py`.
# 3. This should clarify which table the `test.py` script is likely querying and what kind of data (e.g., document chunks vs. summaries) it contains.



✅ init_vector_db imported successfully.

Connecting to database to explore tables...
CHECKING IF SEARCH INDICES CREATED: True
   Connected in 0.01s

   ✅ Table 'public.vector_db' exists.
   📊 Current row count: 47822

   Fetching 3 sample rows...

   📄 Sample data (3 rows):


Unnamed: 0,id,chunkid,category,md,filepath,embedding_preview,markdown_preview,description_preview
0,1,0,,{},/shared_folders/team_1/ben/cleaned_data/201110...,"[-0.024612427,-0.05895996,-0.016738892,0.08184...",/shared_folders/team_1/document_batch/UTILS/Li...,the provided image appears to be a technical d...
1,2,1,,{},/shared_folders/team_1/ben/cleaned_data/201110...,"[0.01675415,-0.02835083,0.01361084,0.07159424,...",/shared_folders/team_1/document_batch/UTILS/Li...,the provided image appears to be a technical d...
2,3,2,,{},/shared_folders/team_1/ben/cleaned_data/201109...,"[0.028152466,-0.03466797,-0.04888916,0.0913085...",/shared_folders/team_1/document_batch/UTILS/Li...,the provided image appears to be a technical d...



   ✅ Table 'public.vector_db_2' exists.
   📊 Current row count: 0
   ℹ️ Table is empty.

Database session closed.

✅ Table exploration finished.


In [3]:
# %% [markdown]
# # Test VDB Pipeline Functions
#
# This cell tests the core functions (`init_vector_db`, `search_vdb`, `get_embeddings`)
# described in the pipeline documentation, primarily interacting with the `vector_db` table
# based on the structure suggested by the `test.py` script output.

# %%
import sys
import time
import json
import pandas as pd
from sqlalchemy import text, inspect as sql_inspect
import warnings

# --- Configuration ---
# Adjust this path to point to your VDB pipeline location
VDB_PIPELINE_PATH = "/shared_folders/team_1/mark_vdb/vdb_pipeline" #<-- ADJUST IF NEEDED
# Test query from test.py (or use another relevant query)
TEST_SEARCH_QUERY = "I need help disassembling a dryer model 1600"
# TEST_SEARCH_QUERY = "chemical reactions in catalytic processes"
NUM_SEARCH_RESULTS = 3 # Number of results to fetch in the search test

# --- Add VDB pipeline to Python path ---
if VDB_PIPELINE_PATH not in sys.path:
    sys.path.append(VDB_PIPELINE_PATH)

# --- Import VDB pipeline functions ---
# Each function is imported on its own and its flag is set explicitly in both
# outcomes. Deriving the flags from `locals()` is unreliable in a notebook
# (names left over from earlier cells look "imported"), and a single shared
# except-branch would replace functions that did import with placeholders.
try:
    from init_vector_db import init_vector_db
    INIT_DB_AVAILABLE = True
except ImportError as e:
    INIT_DB_AVAILABLE = False
    warnings.warn(f"⚠️ Failed to import one or more VDB functions: {e}")

    def init_vector_db(wipe_database=False):
        """Placeholder returning an empty (session, engine) pair."""
        return None, None

try:
    from search_vdb import search_vdb
    SEARCH_AVAILABLE = True
except ImportError as e:
    SEARCH_AVAILABLE = False
    warnings.warn(f"⚠️ Failed to import one or more VDB functions: {e}")

    def search_vdb(query, num_results=3):
        """Placeholder returning no search hits."""
        return []

try:
    from search_vdb import get_embeddings
    GET_EMB_AVAILABLE = True
except ImportError as e:
    GET_EMB_AVAILABLE = False
    warnings.warn(f"⚠️ Failed to import one or more VDB functions: {e}")

    def get_embeddings():
        """Placeholder returning no embeddings."""
        return []

# Assuming 'vector' model is not needed directly for these tests
# from vector import vector
# from variables import MODEL, NUM_OF_SEARCH_RESULTS # Import if needed by functions

if INIT_DB_AVAILABLE and SEARCH_AVAILABLE and GET_EMB_AVAILABLE:
    print("✅ VDB pipeline functions imported successfully.")


# --- Test Functions ---
# Runs three smoke tests in order (connect, search, fetch embeddings). Each
# step catches its own exceptions so a failure in one does not hide the others,
# and the database resources are released at the end.
session = None
engine = None

print("\n--- Testing VDB Pipeline Functions ---")

# --- 1. Test init_vector_db() ---
# Step counters read "/3": there are exactly three test steps in this cell.
print("\n[1/3] Testing init_vector_db()...")
connect_start_time = time.time()
try:
    if not INIT_DB_AVAILABLE:
        print("   Skipping test: init_vector_db not imported.")
    else:
        session, engine = init_vector_db(wipe_database=False)
        if engine:
            print(f"   ✅ Connection successful in {time.time() - connect_start_time:.2f}s")
        else:
            print("   ❌ init_vector_db() returned None for engine. Connection failed.")
except Exception as e:
    print(f"   ❌ Error during init_vector_db(): {e}")

# --- 2. Test search_vdb() ---
print(f"\n[2/3] Testing search_vdb(query='{TEST_SEARCH_QUERY}', num_results={NUM_SEARCH_RESULTS})...")
search_results = []
if not engine:
    print("   Skipping test: Database connection failed.")
elif not SEARCH_AVAILABLE:
    print("   Skipping test: search_vdb not imported.")
else:
    search_start_time = time.time()
    try:
        # search_vdb is called without the session created above.
        # NOTE(review): presumably it opens its own connection — confirm in the pipeline.
        search_results = search_vdb(TEST_SEARCH_QUERY, num_results=NUM_SEARCH_RESULTS)

        print(f"   ✅ search_vdb() completed in {time.time() - search_start_time:.2f}s")
        print(f"   Returned {len(search_results)} results.")
        print("\n   --- Top Search Result Sample ---")
        if search_results:
            first_hit = search_results[0]
            # default=str keeps the dump working if a hit carries values that
            # are not JSON-native.
            print(json.dumps(first_hit, indent=4, default=str))
        else:
            print("   search_vdb() returned no results for this query.")

    except Exception as e:
        print(f"   ❌ Error during search_vdb(): {e}")


# --- 3. Test get_embeddings() ---
print("\n[3/3] Testing get_embeddings()...")
all_embeddings = []
if not engine:
    print("   Skipping test: Database connection failed.")
elif not GET_EMB_AVAILABLE:
    print("   Skipping test: get_embeddings not imported.")
else:
    get_emb_start_time = time.time()
    try:
        # Called without arguments, like search_vdb above.
        all_embeddings = get_embeddings()
        print(f"   ✅ get_embeddings() completed in {time.time() - get_emb_start_time:.2f}s")
        print(f"   Returned {len(all_embeddings)} embeddings.")
        if all_embeddings:
            # Report the type and the length/shape of the first embedding.
            first_emb = all_embeddings[0]
            print(f"   Type of first embedding: {type(first_emb)}")
            try:
                if isinstance(first_emb, (list, tuple)):
                    print(f"   Length of first embedding: {len(first_emb)}")
                elif hasattr(first_emb, 'shape'):  # array-like objects
                    print(f"   Shape of first embedding: {first_emb.shape}")
            except TypeError:
                print("   Could not determine length/shape of the first embedding.")
        else:
            print("   get_embeddings() returned no embeddings (table might be empty or function targets wrong table).")

    except Exception as e:
        print(f"   ❌ Error during get_embeddings(): {e}")


# --- Close Session ---
if session is not None:
    # Close unconditionally: gating on `is_active` skipped the close exactly
    # when a failed transaction had left the session inactive.
    session.close()
    print("\nDatabase session closed.")
elif engine is not None:
    # No session to close; release pooled connections so the message is true.
    engine.dispose()
    print("\nDatabase connection closed.")

print("\n✅ VDB Pipeline function tests finished.")

# %% [markdown]
# ## Next Steps
#
# 1. Review the output of the tests above.
# 2. Did `search_vdb` return relevant results for the query? Does the structure match the `test.py` output?
# 3. Did `get_embeddings` return the expected number of embeddings? Does the dimension match the `vector_db` schema (`HALFVEC(768)`)?
# 4. This helps confirm if the pipeline functions are interacting with the intended table (`vector_db`) correctly.



  from .autonotebook import tqdm as notebook_tqdm


✅ VDB pipeline functions imported successfully.

--- Testing VDB Pipeline Functions ---

[1/4] Testing init_vector_db()...
CHECKING IF SEARCH INDICES CREATED: True
   ✅ Connection successful in 0.01s

[2/4] Testing search_vdb(query='I need help disassembling a dryer model 1600', num_results=3)...
CHECKING IF SEARCH INDICES CREATED: True
Time taken: 0.06195807456970215
   ✅ search_vdb() completed in 0.26s
   Returned 3 results.

   --- Top Search Result Sample ---
[
    17786,
    0.01639344262295082,
    0.0,
    0.01639344262295082,
    "/shared_folders/team_1/document_batch/UTILS/Library/AA Temp Backup Scans/Utilities Folders & Files Shelf 32/Air Dryer #3/20110627195344_001.PDF",
    "/shared_folders/team_1/ben/cleaned_data/20110627195344_001.md",
    "`` ` dryer specification & performance data sheet external heat reactivated dryer ppc model no . : t8100clb-s-4x-fii s.o . # : 1029694 cat . # : 1239433 standard operating conditions : fluid : air inlet flow rate : 6000 scfm ( referred

In [4]:
# %% [markdown]
# # Using the `search_vdb()` Function
#
# This cell demonstrates how to use the `search_vdb()` function from the pipeline
# to perform a hybrid search on the `vector_db` table and view the results.

# %%
import sys
import time
import json
import warnings

# --- Configuration ---
# Adjust this path to point to your VDB pipeline location
VDB_PIPELINE_PATH = "/shared_folders/team_1/mark_vdb/vdb_pipeline" #<-- ADJUST IF NEEDED
# Define the search query you want to test
SEARCH_QUERY = "I need help disassembling a dryer model 1600"
# Define how many results you want
NUM_RESULTS = 5

# --- Add VDB pipeline to Python path ---
if VDB_PIPELINE_PATH not in sys.path:
    sys.path.append(VDB_PIPELINE_PATH)

# --- Import VDB pipeline functions ---
# Each function is imported on its own and its flag is set explicitly in both
# outcomes. Deriving the flags from `locals()` is unreliable in a notebook
# (names left over from earlier cells look "imported"), and a single shared
# except-branch would replace functions that did import with placeholders.
try:
    from init_vector_db import init_vector_db
    INIT_DB_AVAILABLE = True
except ImportError as e:
    INIT_DB_AVAILABLE = False
    warnings.warn(f"⚠️ Failed to import one or more VDB functions: {e}")

    def init_vector_db(wipe_database=False):
        """Placeholder returning an empty (session, engine) pair."""
        return None, None

try:
    from search_vdb import search_vdb
    SEARCH_AVAILABLE = True
except ImportError as e:
    SEARCH_AVAILABLE = False
    warnings.warn(f"⚠️ Failed to import one or more VDB functions: {e}")

    def search_vdb(query, num_results=3):
        """Placeholder returning no search hits."""
        return []

if INIT_DB_AVAILABLE and SEARCH_AVAILABLE:
    print("✅ VDB pipeline functions imported successfully.")

# --- Connect, Search, and Display Results ---
# Connects through init_vector_db, runs one search_vdb() query and pretty-prints
# the hits, then releases the database resources.
session = None
engine = None

print("\n--- Testing search_vdb() ---")

try:
    # Both pipeline functions are required for this demo.
    if not (INIT_DB_AVAILABLE and SEARCH_AVAILABLE):
        print("   Skipping test: Required functions (init_vector_db or search_vdb) not imported.")
    else:
        # 1. Connect to the database
        print("Connecting to database...")
        connect_start_time = time.time()
        session, engine = init_vector_db(wipe_database=False)

        if not engine:
            print("   ❌ Failed to establish database connection via init_vector_db.")
        else:
            print(f"   Connected in {time.time() - connect_start_time:.2f}s")

            # 2. Perform the search
            print(f"\nPerforming search for: '{SEARCH_QUERY}' (Top {NUM_RESULTS} results)")
            search_start_time = time.time()
            try:
                search_results = search_vdb(SEARCH_QUERY, num_results=NUM_RESULTS)
                print(f"   Search completed in {time.time() - search_start_time:.2f}s")
                print(f"   Found {len(search_results)} results.")

                # 3. Display the results
                print("\n--- Search Results (Formatted JSON) ---")
                if search_results:
                    # default=str keeps the dump working if a hit carries
                    # values that are not JSON-native.
                    print(json.dumps(search_results, indent=4, default=str))
                else:
                    print("   No results returned.")

            except Exception as search_error:
                print(f"   ❌ Error during search_vdb(): {search_error}")

except Exception as e:
    print(f"\n❌ An unexpected error occurred: {e}")
finally:
    # 4. Close the connection
    if session is not None:
        # Close unconditionally: gating on `is_active` skipped the close exactly
        # when a failed transaction had left the session inactive.
        session.close()
        print("\nDatabase session closed.")
    elif engine is not None:
        # No session to close; release pooled connections so the message is true.
        engine.dispose()
        print("\nDatabase connection closed.")

print("\n✅ Search test finished.")

# %% [markdown]
# ## Review
#
# 1. Check the output above. Does it show the number of results you requested?
# 2. Examine the structure and content of the results. Does it match the format you expect (list of lists/tuples with ID, scores, paths, content)?
# 3. Are the results relevant to the search query?



✅ VDB pipeline functions imported successfully.

--- Testing search_vdb() ---
Connecting to database...
CHECKING IF SEARCH INDICES CREATED: True
   Connected in 0.01s

Performing search for: 'I need help disassembling a dryer model 1600' (Top 5 results)
CHECKING IF SEARCH INDICES CREATED: True
Time taken: 0.06718182563781738
   Search completed in 0.09s
   Found 5 results.

--- Search Results (Formatted JSON) ---
[
    [
        17786,
        0.01639344262295082,
        0.0,
        0.01639344262295082,
        "/shared_folders/team_1/document_batch/UTILS/Library/AA Temp Backup Scans/Utilities Folders & Files Shelf 32/Air Dryer #3/20110627195344_001.PDF",
        "/shared_folders/team_1/ben/cleaned_data/20110627195344_001.md",
        "`` ` dryer specification & performance data sheet external heat reactivated dryer ppc model no . : t8100clb-s-4x-fii s.o . # : 1029694 cat . # : 1239433 standard operating conditions : fluid : air inlet flow rate : 6000 scfm ( referred to 70\u00b0f and

In [6]:
# %% [markdown]
# # Analyze File Type Distribution in `vector_db`
#
# This cell connects to the database, fetches the original document paths
# (stored in the `markdown` column, not `filepath`) from the `public.vector_db`
# table, extracts file extensions, and calculates the distribution of
# different document types.

# %%
import sys
import time
import pandas as pd
import os # For path manipulation (splitext)
from sqlalchemy import text, inspect as sql_inspect
import warnings

# --- Configuration ---
# Location of the VDB pipeline checkout; change it to match your environment.
VDB_PIPELINE_PATH = "/shared_folders/team_1/mark_vdb/vdb_pipeline" #<-- ADJUST IF NEEDED
# Maximum number of file extensions shown in the ranking.
TOP_N_TYPES = 20

# --- Make the pipeline importable ---
# Skip the append when the entry is already present (e.g. the cell was re-run).
if sys.path.count(VDB_PIPELINE_PATH) == 0:
    sys.path.append(VDB_PIPELINE_PATH)

# --- Import database initializer ---
try:
    from init_vector_db import init_vector_db
except ImportError as import_error:
    INIT_DB_AVAILABLE = False
    warnings.warn(f"⚠️ Failed to import init_vector_db. Check VDB_PIPELINE_PATH: {import_error}")

    # Stand-in with the same call shape so later code can still unpack two values.
    def init_vector_db(wipe_database=False):
        return None, None
else:
    INIT_DB_AVAILABLE = True
    print("✅ init_vector_db imported successfully.")

# --- Connect, Fetch, and Analyze ---
# Reads every non-NULL path from `public.vector_db`, derives the lower-cased
# file extension of each path and displays the most frequent extensions.
#
# Column that holds the path being analyzed. The sample rows displayed earlier
# in this notebook show `markdown` carrying the source-document path while
# `filepath` carries the cleaned `.md` path.
# NOTE(review): confirm this column mapping against the pipeline's ingestion code.
PATH_COLUMN = "markdown"

session = None
engine = None
df_paths = pd.DataFrame() # Initialize empty DataFrame

print("\nConnecting to database to fetch file paths from 'vector_db'...")
connect_start_time = time.time()

try:
    if not INIT_DB_AVAILABLE:
        print("   Skipping database connection as init_vector_db was not imported.")
    else:
        session, engine = init_vector_db(wipe_database=False)

        if not engine:
            print("   ❌ Failed to establish database connection via init_vector_db.")
        else:
            print(f"   Connected in {time.time() - connect_start_time:.2f}s")
            inspector = sql_inspect(engine)

            # --- 1) Check Table and Fetch File Paths ---
            print("\n[1/2] Checking 'vector_db' table and fetching file paths...")
            table_name = "vector_db"
            if not inspector.has_table(table_name, schema="public"):
                print(f"   ❌ Table 'public.{table_name}' does not exist.")
            else:
                print(f"   ✅ Table 'public.{table_name}' exists.")
                # Check that the path column exists before querying it
                columns_info = inspector.get_columns(table_name, schema="public")
                existing_columns = [col['name'] for col in columns_info] if columns_info else []

                if PATH_COLUMN not in existing_columns:
                    # Report the column that was actually looked up.
                    print(f"   ❌ Column '{PATH_COLUMN}' not found in table '{table_name}'.")
                else:
                    fetch_start_time = time.time()
                    query_select = text(f"""
                        SELECT "{PATH_COLUMN}"
                        FROM public.{table_name}
                        WHERE "{PATH_COLUMN}" IS NOT NULL;
                    """)
                    # Fetch all paths. Use chunking if memory is a concern for very large tables.
                    df_paths = pd.read_sql(query_select, con=engine)
                    print(f"   Fetched {len(df_paths)} file paths in {time.time() - fetch_start_time:.2f}s")

                    if df_paths.empty:
                        print("   No non-NULL file paths found in the table to analyze.")
                    else:
                        # --- 2) Analyze File Extensions ---
                        print("\n[2/2] Analyzing file extensions...")

                        # os.path.splitext isolates the final extension; values that
                        # are not strings are counted under the empty extension.
                        df_paths['extension'] = df_paths[PATH_COLUMN].apply(
                            lambda x: os.path.splitext(x)[1].lower() if pd.notnull(x) and isinstance(x, str) else ''
                        )
                        extension_counts = df_paths['extension'].value_counts()

                        print(f"\n   --- Top {min(TOP_N_TYPES, len(extension_counts))} File Extensions in '{table_name}' ---")
                        display(extension_counts.head(TOP_N_TYPES))

except Exception as e:
    print(f"\n❌ An unexpected error occurred: {e}")
finally:
    # --- Close Connection ---
    if session is not None:
        # Close unconditionally: gating on `is_active` skipped the close exactly
        # when a failed transaction had left the session inactive.
        session.close()
        print("\nDatabase session closed.")
    elif engine is not None:
        # No session to close; release pooled connections so the message is true.
        engine.dispose()
        print("\nDatabase connection closed.")

print("\n✅ File type distribution analysis finished.")

# %% [markdown]
# ## Review
#
# 1. Examine the list of file extensions and their counts.
# 2. This shows the distribution of original document types represented by the chunks in the `vector_db` table.



✅ init_vector_db imported successfully.

Connecting to database to fetch file paths from 'vector_db'...
CHECKING IF SEARCH INDICES CREATED: True
   Connected in 0.01s

[1/2] Checking 'vector_db' table and fetching file paths...
   ✅ Table 'public.vector_db' exists.
   Fetched 47822 file paths in 0.05s

[2/2] Analyzing file extensions...

   --- Top 6 File Extensions in 'vector_db' ---


extension
.pdf     40089
.doc      3206
.jpg      3187
.docx      913
.gif       264
.png       163
Name: count, dtype: int64


Database session closed.

✅ File type distribution analysis finished.


In [None]:
#!/usr/bin/env python3
"""
inspect_db_schema.py

Connects to the database specified by the VDB pipeline configuration
and prints the schema (tables, columns, types) found in the 'public' schema.
"""

import sys
import time
from sqlalchemy import create_engine, text, inspect as sql_inspect
import warnings

# --- Configuration ---
# Filesystem location of the VDB pipeline code; edit to match your environment.
VDB_PIPELINE_PATH = "/shared_folders/team_1/mark_vdb/vdb_pipeline" #<-- ADJUST IF NEEDED

# --- Make the pipeline importable ---
# Guarded so that repeated execution does not add the same entry twice.
if sys.path.count(VDB_PIPELINE_PATH) == 0:
    sys.path.append(VDB_PIPELINE_PATH)

# --- Import database initializer ---
# init_vector_db is unpacked later as (session, engine).
try:
    from init_vector_db import init_vector_db
except ImportError as import_error:
    INIT_DB_AVAILABLE = False
    warnings.warn(f"⚠️ Failed to import init_vector_db. Check VDB_PIPELINE_PATH: {import_error}")

    # Stand-in with the same call shape so later code can still unpack two values.
    def init_vector_db(wipe_database=False):
        return None, None
else:
    INIT_DB_AVAILABLE = True
    print("✅ init_vector_db imported successfully.")

# --- Main Execution Function ---
def main():
    """Connect to the database and print the schema of every table in ``public``.

    Calls ``init_vector_db(wipe_database=False)`` to obtain ``(session, engine)``,
    asks the SQLAlchemy inspector for the table names in the ``public`` schema,
    and prints one aligned line per column: name, type and nullability.

    Problems are reported on stdout instead of being raised: a table whose
    columns cannot be inspected is reported and skipped, and a column whose
    type cannot be rendered with ``str()`` is shown using ``repr()`` so the
    remaining columns of that table are still listed.

    Returns:
        None. Returns early when the initializer could not be imported or
        when no engine was returned; the session (if any) is always closed.
    """
    session = None
    engine = None

    print("\n--- Database Schema Inspector ---")

    if not INIT_DB_AVAILABLE:
        print("❌ Cannot proceed: init_vector_db failed to import.")
        return

    print("\nConnecting to database to inspect schema...")
    connect_start_time = time.time()

    try:
        session, engine = init_vector_db(wipe_database=False)

        if not engine:  # No engine means the connection could not be set up
            print("   ❌ Failed to establish database connection via init_vector_db.")
            return

        print(f"   Connected in {time.time() - connect_start_time:.2f}s")
        inspector = sql_inspect(engine)

        # --- Get List of Tables ---
        print("\nInspecting tables in 'public' schema...")
        table_names = inspector.get_table_names(schema='public')

        if not table_names:
            print("   No tables found in the 'public' schema.")
        else:
            print(f"   Found tables: {', '.join(table_names)}")

            # --- Inspect Columns for Each Table ---
            for table_name in table_names:
                print(f"\n--- Schema for table: public.{table_name} ---")
                try:
                    columns = inspector.get_columns(table_name, schema='public')
                    if not columns:
                        print("      No columns found for this table.")
                        continue

                    for column in columns:
                        col_name = column['name']
                        try:
                            col_type = str(column['type'])
                        except Exception:
                            # NOTE(review): str() on a reflected type may raise for types
                            # the dialect does not recognise — confirm against the installed
                            # SQLAlchemy. Falling back to repr() keeps the listing complete.
                            col_type = repr(column['type'])
                        nullability = 'NULLABLE' if column['nullable'] else 'NOT NULL'
                        # Fixed-width fields keep the columns visually aligned
                        print(f"   • {col_name:<20} {col_type:<25} {nullability}")
                except Exception as col_error:
                    # One broken table must not stop inspection of the others
                    print(f"      ⚠️ Error inspecting columns for {table_name}: {col_error}")

    except Exception as e:
        print(f"\n❌ An unexpected error occurred: {e}")
    finally:
        # --- Close Connection ---
        # Close the session whenever one was created, regardless of
        # `is_active`: an inactive session still needs to be released.
        if session is not None:
            session.close()
            print("\nDatabase session closed.")
        elif engine:
            # Nothing is closed on this path, so do not claim otherwise.
            print("\nNo database session to close.")

    print("\n✅ Schema inspection finished.")


# --- Script Entry Point ---
if __name__ == "__main__":
    main()


✅ init_vector_db imported successfully.

--- Database Schema Inspector ---

Connecting to database to inspect schema...
CHECKING IF SEARCH INDICES CREATED: True
   Connected in 0.01s

Inspecting tables in 'public' schema...
   Found tables: spatial_ref_sys, vector_db, summary_vectors, mock_items_2, vector_db_2

--- Schema for table: public.spatial_ref_sys ---
   • srid                 INTEGER                   NOT NULL
   • auth_name            VARCHAR(256)              NULLABLE
   • auth_srid            INTEGER                   NULLABLE
   • srtext               VARCHAR(2048)             NULLABLE
   • proj4text            VARCHAR(2048)             NULLABLE

--- Schema for table: public.vector_db ---
   • id                   INTEGER                   NOT NULL
   • chunkid              INTEGER                   NULLABLE
   • description          TEXT                      NULLABLE
   • category             TEXT                      NULLABLE
   • md                   JSONB              

In [3]:
#!/usr/bin/env python3
"""
show_full_vector_db_row.py

Connects to the database and fetches a single sample row from the
`vector_db` table, displaying the full value (or a preview) for each column.
"""

import sys
import time
import pandas as pd
from sqlalchemy import text, inspect as sql_inspect
import warnings
import json # For potentially pretty-printing JSONB

# --- Configuration ---
# Filesystem location of the VDB pipeline package; edit this for your environment.
VDB_PIPELINE_PATH = "/shared_folders/team_1/mark_vdb/vdb_pipeline"
# Table whose row is displayed.
TABLE_NAME = "vector_db"
# Zero-based position of the row to fetch (0 = first row; e.g. 10 for the eleventh).
ROW_OFFSET = 0

# --- Add VDB pipeline to Python path ---
# Guarded so that re-running the cell never appends the same entry twice.
if sys.path.count(VDB_PIPELINE_PATH) == 0:
    sys.path.append(VDB_PIPELINE_PATH)

# --- Import database initializer ---
try:
    from init_vector_db import init_vector_db
except ImportError as import_error:
    INIT_DB_AVAILABLE = False
    warnings.warn(f"⚠️ Failed to import init_vector_db. Check VDB_PIPELINE_PATH: {import_error}")

    def init_vector_db(wipe_database=False):
        """Stand-in used when the real initializer cannot be imported; returns (None, None)."""
        return None, None
else:
    INIT_DB_AVAILABLE = True
    print("✅ init_vector_db imported successfully.")

# --- Main Execution ---
def main():
    """Fetch one row from ``public.<TABLE_NAME>`` and print each column's value.

    Obtains ``(session, engine)`` from ``init_vector_db(wipe_database=False)``,
    verifies through the SQLAlchemy inspector that the table exists and has
    columns, then selects the single row at ``ROW_OFFSET``.

    Rows are ordered by ``id`` when the table has such a column; otherwise the
    first reflected column is used, so the function also works for tables
    without an ``id`` column.

    Display rules per column:
        * ``embedding``: first and last 100 characters of ``str(value)`` when
          longer than 200 characters.
        * ``markdown`` / ``description`` (string values): first 500 characters.
        * ``md`` (non-null): pretty-printed with ``json.dumps``.
        * anything else: printed as-is together with its Python type name.

    Returns:
        None. All results and errors are reported on stdout; the session
        (if any) is always closed.
    """
    session = None
    engine = None
    print(f"\n--- Fetching Full Sample Row from '{TABLE_NAME}' ---")

    if not INIT_DB_AVAILABLE:
        print("❌ Cannot proceed: init_vector_db failed to import.")
        return

    try:
        # 1. Connect
        print("Connecting to database...")
        connect_start_time = time.time()
        session, engine = init_vector_db(wipe_database=False)

        if not engine:
            print("   ❌ Failed to establish database connection.")
            return
        print(f"   Connected in {time.time() - connect_start_time:.2f}s")
        inspector = sql_inspect(engine)

        # 2. Check Table and Get Columns
        print(f"\nChecking table 'public.{TABLE_NAME}'...")
        if not inspector.has_table(TABLE_NAME, schema="public"):
            print(f"   ❌ Table 'public.{TABLE_NAME}' does not exist.")
            return

        columns_info = inspector.get_columns(TABLE_NAME, schema="public")
        existing_columns = [col['name'] for col in columns_info] if columns_info else []

        if not existing_columns:
            print(f"   ❌ Could not retrieve columns for table '{TABLE_NAME}'.")
            return
        print("   ✅ Table and columns found.")

        # 3. Fetch Sample Row
        print(f"\nFetching sample row (offset {ROW_OFFSET})...")
        fetch_start_time = time.time()
        sample_row = None
        try:
            # Identifiers come from the inspector / configuration and are quoted;
            # the offset is passed as a bound parameter.
            select_cols_str = ", ".join(f'"{col}"' for col in existing_columns)
            # OFFSET needs an ordering; prefer "id" but do not assume the
            # configured table has one.
            order_col = 'id' if 'id' in existing_columns else existing_columns[0]
            query = text(f"""
                SELECT {select_cols_str}
                FROM public."{TABLE_NAME}"
                ORDER BY "{order_col}"
                LIMIT 1 OFFSET :offset;
            """)
            result = session.execute(query, {'offset': ROW_OFFSET}).fetchone()
            if result:
                sample_row = result._mapping  # Mapping view: access values by column name
            print(f"   Query completed in {time.time() - fetch_start_time:.2f}s")

        except Exception as query_error:
            print(f"   ❌ Error executing query: {query_error}")

        # 4. Display Row Content
        print("\n--- Full Sample Row Data ---")
        if sample_row:
            for col_name in existing_columns:
                value = sample_row.get(col_name)
                print(f"\n-- Column: {col_name} --")
                # Long fields are shortened so the output stays readable
                if col_name == 'embedding':
                    value_str = str(value)
                    preview = value_str[:100] + "..." + value_str[-100:] if len(value_str) > 200 else value_str
                    print(f"  Type: {type(value).__name__}, Preview: {preview}")
                elif col_name in ['markdown', 'description'] and isinstance(value, str):
                    preview = value[:500] + "..." if len(value) > 500 else value
                    print(f"  Type: {type(value).__name__}, Preview (first 500 chars):")
                    print(preview)
                elif col_name == 'md' and value is not None:
                    try:
                        print(f"  Type: {type(value).__name__}, Content:")
                        print(json.dumps(value, indent=2))
                    except TypeError:
                        print(f"  Value (non-JSON serializable): {value}")
                else:
                    print(f"  Type: {type(value).__name__}, Value: {value}")
        else:
            print(f"   No row found at offset {ROW_OFFSET} or query failed.")

    except Exception as e:
        print(f"\n❌ An unexpected error occurred: {e}")
    finally:
        # 5. Close connection
        # Close the session whenever one was created, regardless of
        # `is_active`: an inactive session still needs to be released.
        if session is not None:
            session.close()
            print("\nDatabase session closed.")
        elif engine:
            # Nothing is closed on this path, so do not claim otherwise.
            print("\nNo database session to close.")

    print("\n✅ Sample row display finished.")


if __name__ == "__main__":
    main()


✅ init_vector_db imported successfully.

--- Fetching Full Sample Row from 'vector_db' ---
Connecting to database...
CHECKING IF SEARCH INDICES CREATED: True
   Connected in 0.01s

Checking table 'public.vector_db'...
   ✅ Table and columns found.

Fetching sample row (offset 0)...
   Query completed in 0.00s

--- Full Sample Row Data ---

-- Column: id --
  Type: int, Value: 1

-- Column: chunkid --
  Type: int, Value: 0

-- Column: description --
  Type: str, Preview (first 500 chars):
the provided image appears to be a technical drawing , likely a schematic diagram for an electrical or mechanical system . here is a detailed analysis : # # # layout and structure : - the drawing is divided into two main sections , each containing multiple components arranged in a grid-like pattern . - each component is connected by lines that represent electrical or mechanical connections . - there are labels and annotations next to each component , indicating their names or functions . # # # c...

--

In [4]:
#!/usr/bin/env python3
"""
find_multi_chunk_doc.py

Connects to the database, identifies an original document path associated
with multiple chunks in the `vector_db` table, and displays the details
(ID, Chunk ID, Content Preview) for each of those chunks.
"""

import sys
import time
import pandas as pd
from sqlalchemy import text, inspect as sql_inspect
import warnings
import json

# --- Configuration ---
# Filesystem location of the VDB pipeline package; edit this for your environment.
VDB_PIPELINE_PATH = "/shared_folders/team_1/mark_vdb/vdb_pipeline"
# Table that stores the chunks.
TABLE_NAME = "vector_db"
# Column treated as the original document path (chosen from earlier analysis).
ORIGINAL_DOC_PATH_COLUMN = "markdown"
# Upper bound on the number of chunks printed for the selected document.
MAX_CHUNKS_TO_DISPLAY = 10

# --- Add VDB pipeline to Python path ---
# Guarded so that re-running the cell never appends the same entry twice.
if sys.path.count(VDB_PIPELINE_PATH) == 0:
    sys.path.append(VDB_PIPELINE_PATH)

# --- Import database initializer ---
try:
    from init_vector_db import init_vector_db
except ImportError as import_error:
    INIT_DB_AVAILABLE = False
    warnings.warn(f"⚠️ Failed to import init_vector_db. Check VDB_PIPELINE_PATH: {import_error}")

    def init_vector_db(wipe_database=False):
        """Stand-in used when the real initializer cannot be imported; returns (None, None)."""
        return None, None
else:
    INIT_DB_AVAILABLE = True
    print("✅ init_vector_db imported successfully.")

# --- Main Execution ---
def main():
    """Find one document path shared by several rows and print those rows.

    Obtains ``(session, engine)`` from ``init_vector_db(wipe_database=False)``
    and checks that ``public.<TABLE_NAME>`` exists with the columns ``id``,
    ``chunkid``, ``description`` and ``ORIGINAL_DOC_PATH_COLUMN``. It then:

    1. selects one non-null value of ``ORIGINAL_DOC_PATH_COLUMN`` that occurs
       in more than one row;
    2. fetches up to ``MAX_CHUNKS_TO_DISPLAY`` rows with that value, ordered by
       ``chunkid``;
    3. prints the id, chunk id and a preview of ``description`` for each row.

    The preview shows at most 300 characters and ends with ``...`` only when
    the text was actually cut; non-string content is shown as ``N/A``.

    Returns:
        None. All results and errors are reported on stdout; the session
        (if any) is always closed.
    """
    preview_len = 300  # Maximum number of description characters shown per chunk
    session = None
    engine = None
    print(f"\n--- Finding Document with Multiple Chunks in '{TABLE_NAME}' ---")

    if not INIT_DB_AVAILABLE:
        print("❌ Cannot proceed: init_vector_db failed to import.")
        return

    try:
        # 1. Connect
        print("Connecting to database...")
        connect_start_time = time.time()
        session, engine = init_vector_db(wipe_database=False)

        if not engine:
            print("   ❌ Failed to establish database connection.")
            return
        print(f"   Connected in {time.time() - connect_start_time:.2f}s")
        inspector = sql_inspect(engine)

        # 2. Check Table and Columns
        print(f"\nChecking table 'public.{TABLE_NAME}'...")
        if not inspector.has_table(TABLE_NAME, schema="public"):
            print(f"   ❌ Table 'public.{TABLE_NAME}' does not exist.")
            return

        columns_info = inspector.get_columns(TABLE_NAME, schema="public")
        existing_columns = [col['name'] for col in columns_info] if columns_info else []
        required_cols = ['id', 'chunkid', 'description', ORIGINAL_DOC_PATH_COLUMN]

        missing = [col for col in required_cols if col not in existing_columns]
        if missing:
            print(f"   ❌ Required columns missing from table: {', '.join(missing)}")
            return
        print("   ✅ Table and required columns exist.")

        # 3. Find a Document Path with Multiple Chunks
        print(f"\nFinding a document path ({ORIGINAL_DOC_PATH_COLUMN}) with more than one chunk...")
        find_path_start_time = time.time()
        target_path = None
        try:
            # Group rows by the path column and keep one group with count > 1
            query_find = text(f"""
                SELECT {ORIGINAL_DOC_PATH_COLUMN}
                FROM public.{TABLE_NAME}
                WHERE {ORIGINAL_DOC_PATH_COLUMN} IS NOT NULL
                GROUP BY {ORIGINAL_DOC_PATH_COLUMN}
                HAVING COUNT(*) > 1
                LIMIT 1;
            """)
            result = session.execute(query_find).scalar_one_or_none()
            if result:
                target_path = result
                print(f"   Found example path: {target_path}")
            else:
                print("   Could not find any document paths with multiple chunks.")

            print(f"   Path search completed in {time.time() - find_path_start_time:.2f}s")

        except Exception as find_error:
            print(f"   ❌ Error finding multi-chunk document: {find_error}")

        # 4. Fetch and Display Chunks for the Found Path
        if target_path:
            print(f"\nFetching up to {MAX_CHUNKS_TO_DISPLAY} chunks for path: {target_path}")
            fetch_chunks_start_time = time.time()
            chunks = []
            try:
                # The path and the limit are bound parameters, not interpolated
                query_fetch = text(f"""
                    SELECT id, chunkid, description
                    FROM public.{TABLE_NAME}
                    WHERE {ORIGINAL_DOC_PATH_COLUMN} = :target_path
                    ORDER BY chunkid ASC -- Order by chunk sequence
                    LIMIT :limit;
                """)
                chunks = session.execute(query_fetch, {
                    'target_path': target_path,
                    'limit': MAX_CHUNKS_TO_DISPLAY
                }).fetchall()
                print(f"   Query completed in {time.time() - fetch_chunks_start_time:.2f}s")
                print(f"   Fetched {len(chunks)} chunks (display limited to {MAX_CHUNKS_TO_DISPLAY}).")

            except Exception as fetch_error:
                print(f"   ❌ Error fetching chunks: {fetch_error}")

            # Display the fetched chunks
            print("\n--- Chunks for the Selected Document ---")
            if chunks:
                for i, row in enumerate(chunks):
                    row_dict = row._mapping  # Mapping view: access values by column name
                    doc_id = row_dict.get('id')
                    chunk_id = row_dict.get('chunkid')
                    content = row_dict.get('description')  # description holds the chunk content

                    # Append an ellipsis only when text was actually cut off
                    if not isinstance(content, str):
                        preview = 'N/A'
                    elif len(content) > preview_len:
                        preview = content[:preview_len] + "..."
                    else:
                        preview = content

                    print(f"\n--- Chunk {i+1} (ID: {doc_id}, ChunkID: {chunk_id}) ---")
                    print("  Content Preview:")
                    print(f"    {preview}")
                    print("-" * 20)
            else:
                print("   No chunks found for the selected path or query failed.")
        else:
            print("\nSkipping chunk display as no multi-chunk document path was found.")

    except Exception as e:
        print(f"\n❌ An unexpected error occurred: {e}")
    finally:
        # 5. Close connection
        # Close the session whenever one was created, regardless of
        # `is_active`: an inactive session still needs to be released.
        if session is not None:
            session.close()
            print("\nDatabase session closed.")
        elif engine:
            # Nothing is closed on this path, so do not claim otherwise.
            print("\nNo database session to close.")

    print("\n✅ Script finished.")


if __name__ == "__main__":
    main()


✅ init_vector_db imported successfully.

--- Finding Document with Multiple Chunks in 'vector_db' ---
Connecting to database...
CHECKING IF SEARCH INDICES CREATED: True
   Connected in 0.01s

Checking table 'public.vector_db'...
   ✅ Table and required columns exist.

Finding a document path (markdown) with more than one chunk...
   Found example path: /shared_folders/team_1/document_batch/UTILS/UtInfo/001 Staff Folders/_Past Employees/McClay, Charles/DEMIN/Demin 3 Project/1998 D2 Project Files/1998 Demineralization Water Expansion/Vol VI Book 12/Book 12 Tab 6/20110404125325_001.PDF
   Path search completed in 0.03s

Fetching up to 10 chunks for path: /shared_folders/team_1/document_batch/UTILS/UtInfo/001 Staff Folders/_Past Employees/McClay, Charles/DEMIN/Demin 3 Project/1998 D2 Project Files/1998 Demineralization Water Expansion/Vol VI Book 12/Book 12 Tab 6/20110404125325_001.PDF
   Query completed in 0.01s
   Fetched 4 chunks (display limited to 10).

--- Chunks for the Selected Doc

In [5]:
#!/usr/bin/env python3
"""
count_unique_docs.py

Connects to the database and counts the number of unique original document paths
stored in the specified column (`markdown` by default) of the `vector_db` table.
"""

import sys
import time
import warnings
from sqlalchemy import text, inspect as sql_inspect

# --- Configuration ---
# Filesystem location of the VDB pipeline package; edit this for your environment.
VDB_PIPELINE_PATH = "/shared_folders/team_1/mark_vdb/vdb_pipeline"
# Table that stores the chunks.
TABLE_NAME = "vector_db"
# Column treated as the original document path (chosen from earlier analysis).
UNIQUE_DOC_COLUMN = "markdown"

# --- Add VDB pipeline to Python path ---
# Guarded so that re-running the cell never appends the same entry twice.
if sys.path.count(VDB_PIPELINE_PATH) == 0:
    sys.path.append(VDB_PIPELINE_PATH)

# --- Import database initializer ---
try:
    from init_vector_db import init_vector_db
except ImportError as import_error:
    INIT_DB_AVAILABLE = False
    warnings.warn(f"⚠️ Failed to import init_vector_db. Check VDB_PIPELINE_PATH: {import_error}")

    def init_vector_db(wipe_database=False):
        """Stand-in used when the real initializer cannot be imported; returns (None, None)."""
        return None, None
else:
    INIT_DB_AVAILABLE = True
    print("✅ init_vector_db imported successfully.")

# --- Main Execution ---
def main():
    """Count the distinct non-null values of ``UNIQUE_DOC_COLUMN`` in ``public.<TABLE_NAME>``.

    Obtains ``(session, engine)`` from ``init_vector_db(wipe_database=False)``,
    verifies through the SQLAlchemy inspector that the table and the column
    exist, runs a ``COUNT(DISTINCT ...)`` query and prints the result.

    Returns:
        None. The count and any errors are reported on stdout; the session
        (if any) is always closed.
    """
    session = None
    engine = None
    print(f"\n--- Counting Unique Documents in '{TABLE_NAME}' (Column: '{UNIQUE_DOC_COLUMN}') ---")

    if not INIT_DB_AVAILABLE:
        print("❌ Cannot proceed: init_vector_db failed to import.")
        return

    try:
        # 1. Connect
        print("Connecting to database...")
        connect_start_time = time.time()
        session, engine = init_vector_db(wipe_database=False)

        if not engine:
            print("   ❌ Failed to establish database connection.")
            return
        print(f"   Connected in {time.time() - connect_start_time:.2f}s")
        inspector = sql_inspect(engine)

        # 2. Check Table and Column
        print(f"\nChecking table 'public.{TABLE_NAME}' and column '{UNIQUE_DOC_COLUMN}'...")
        if not inspector.has_table(TABLE_NAME, schema="public"):
            print(f"   ❌ Table 'public.{TABLE_NAME}' does not exist.")
            return

        columns_info = inspector.get_columns(TABLE_NAME, schema="public")
        existing_columns = [col['name'] for col in columns_info] if columns_info else []

        if UNIQUE_DOC_COLUMN not in existing_columns:
            print(f"   ❌ Column '{UNIQUE_DOC_COLUMN}' not found in table '{TABLE_NAME}'.")
            return
        print("   ✅ Table and specified column exist.")

        # 3. Count Unique Document Paths
        print(f"\nCounting distinct values in '{UNIQUE_DOC_COLUMN}' column...")
        count_start_time = time.time()
        try:
            # Identifiers are double-quoted so names with special characters
            # are still valid SQL; both were validated via the inspector above.
            query = text(f"""
                SELECT COUNT(DISTINCT "{UNIQUE_DOC_COLUMN}")
                FROM public."{TABLE_NAME}"
                WHERE "{UNIQUE_DOC_COLUMN}" IS NOT NULL;
            """)
            result = session.execute(query).scalar_one_or_none()
            unique_count = result if result is not None else 0
            print(f"   Query completed in {time.time() - count_start_time:.2f}s")
            print(f"\n   📊 Found {unique_count} unique document paths in '{UNIQUE_DOC_COLUMN}'.")

        except Exception as query_error:
            print(f"   ❌ Error executing count query: {query_error}")

    except Exception as e:
        print(f"\n❌ An unexpected error occurred: {e}")
    finally:
        # 4. Close connection
        # Close the session whenever one was created, regardless of
        # `is_active`: an inactive session still needs to be released.
        if session is not None:
            session.close()
            print("\nDatabase session closed.")
        elif engine:
            # Nothing is closed on this path, so do not claim otherwise.
            print("\nNo database session to close.")

    print("\n✅ Unique document count finished.")


if __name__ == "__main__":
    main()


✅ init_vector_db imported successfully.

--- Counting Unique Documents in 'vector_db' (Column: 'markdown') ---
Connecting to database...
CHECKING IF SEARCH INDICES CREATED: True
   Connected in 0.01s

Checking table 'public.vector_db' and column 'markdown'...
   ✅ Table and specified column exist.

Counting distinct values in 'markdown' column...
   Query completed in 0.24s

   📊 Found 13755 unique document paths in 'markdown'.

Database session closed.

✅ Unique document count finished.
