In [15]:
#all-mini-lm embeddings

In [None]:
import os
import json
import shutil # Import the shutil library for directory operations

from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document

# --- CONSTANTS ---
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"

# ... (The process_and_save_chunks and create_vectordb_from_chunks functions remain unchanged) ...
def process_and_save_chunks(pdf_folder: str, chunks_path: str) -> bool:
    """
    Step 1: Load PDFs, clean up metadata, split them into chunks, and save them to a JSON file.
    Returns True on success, False on failure.
    """
    if not os.path.isdir(pdf_folder):
        print(f"Error: The folder '{pdf_folder}' does not exist.")
        return False

    print(f"--- Step 1: Processing PDFs from '{pdf_folder}' ---")

    # Load documents
    loader = DirectoryLoader(
        pdf_folder,
        glob="**/*.pdf",
        loader_cls=PyPDFLoader,
        show_progress=True,
        use_multithreading=True
    )
    documents = loader.load()
    if not documents:
        print("No PDF documents found. Exiting.")
        return False
    print(f"Loaded {len(documents)} document(s).")

    # Clean up the 'source' metadata to be the chapter name (filename without extension)
    print("Cleaning up document metadata...")
    for doc in documents:
        source_path = doc.metadata.get('source', '')
        filename = os.path.basename(source_path)
        chapter_name, _ = os.path.splitext(filename)
        doc.metadata['source'] = chapter_name
    print(f"Metadata cleaned. Example source: '{documents[0].metadata['source']}'")

    # Split documents into chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=150)
    splits = text_splitter.split_documents(documents)
    print(f"Split documents into {len(splits)} chunks.")

    # Save chunks to a JSON file for later use
    print(f"Saving processed chunks to '{chunks_path}'...")
    with open(chunks_path, 'w', encoding='utf-8') as f:
        json_data = [
            {'page_content': doc.page_content, 'metadata': doc.metadata}
            for doc in splits
        ]
        json.dump(json_data, f, indent=2)

    print("--- Step 1 Complete: Chunks saved successfully. ---")
    return True


def create_vectordb_from_chunks(db_path: str, chunks_path: str):
    """
    Step 2: Load the processed chunks from the JSON file and create the
    persistent Chroma vector database.
    """
    if not os.path.exists(chunks_path):
        print(f"Error: Chunks file not found at '{chunks_path}'. Please run the processing step first.")
        return
        
    # Clean up old database directory if it exists to prevent errors
    if os.path.exists(db_path):
        print(f"Found and removing existing DB directory: '{db_path}'")
        shutil.rmtree(db_path)

    # <<< FIX >>> Create the new, empty directory for the database.
    # ChromaDB expects the directory to exist before it can write to it.
    print(f"Creating new empty directory for DB: '{db_path}'")
    os.makedirs(db_path)

    print(f"\n--- Step 2: Creating Vector DB from '{chunks_path}' ---")

    # Load the processed chunks from the JSON file
    with open(chunks_path, 'r', encoding='utf-8') as f:
        json_data = json.load(f)

    # Re-create Document objects from the loaded data
    documents_from_json = [
        Document(page_content=item['page_content'], metadata=item['metadata'])
        for item in json_data
    ]
    print(f"Loaded {len(documents_from_json)} chunks from file.")

    # Initialize the Sentence Transformer embedding model
    print(f"Initializing embedding model: '{EMBEDDING_MODEL_NAME}'...")
    embeddings = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL_NAME)
    print("Embedding model initialized.")

    # Create and persist the Chroma vector store
    print(f"Creating and persisting vector store at '{db_path}'...")
    vectorstore = Chroma.from_documents(
        documents=documents_from_json,
        embedding=embeddings,
        persist_directory=db_path
    )
    print("--- Step 2 Complete: Vector DB Creation Complete! ---")
    print(f"Vector store saved to: {db_path}")


# ==================================================================
# === SCRIPT EXECUTION =============================================
# ==================================================================

if __name__ == "__main__":
    # --- CONFIGURE YOUR PATHS HERE ---
    pdf_source_folder = "/Users/jimharrington/Desktop/ANEET/Quiz/Dataprep/Processed_papers"
    chunks_output_path = "/Users/jimharrington/Desktop/ANEET/solved_question_papers.json"
    db_output_path = "/Users/jimharrington/Desktop/ANEET/chroma_vector_db_solved_question_papers_normic"

    # --- MAIN EXECUTION LOGIC ---
    if os.path.exists(chunks_output_path):
        print(f"Found existing chunks file: '{chunks_output_path}'")
        print("Skipping PDF processing. Proceeding directly to embedding.")
        create_vectordb_from_chunks(db_path=db_output_path, chunks_path=chunks_output_path)
    else:
        print(f"No chunks file found. Starting full process from scratch.")
        if process_and_save_chunks(pdf_folder=pdf_source_folder, chunks_path=chunks_output_path):
            create_vectordb_from_chunks(db_path=db_output_path, chunks_path=chunks_output_path)

No chunks file found. Starting full process from scratch.
--- Step 1: Processing PDFs from '/Users/jimharrington/Desktop/ANEET/Physics/Physics' ---


100%|██████████| 28/28 [00:29<00:00,  1.04s/it]


Loaded 624 document(s).
Cleaning up document metadata...
Metadata cleaned. Example source: 'Physics_11th_NCRT_BOOK_Unit1Chapter_7'
Split documents into 1399 chunks.
Saving processed chunks to './processed_physics_chunks.json'...
--- Step 1 Complete: Chunks saved successfully. ---
Found and removing existing DB directory: './chroma_vector_db_physics'
Creating new empty directory for DB: './chroma_vector_db_physics'

--- Step 2: Creating Vector DB from './processed_physics_chunks.json' ---
Loaded 1399 chunks from file.
Initializing embedding model: 'all-MiniLM-L6-v2'...


Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


Embedding model initialized.
Creating and persisting vector store at './chroma_vector_db_physics'...


  return forward_call(*args, **kwargs)


--- Step 2 Complete: Vector DB Creation Complete! ---
Vector store saved to: ./chroma_vector_db_physics


In [7]:
db_path_to_query = "./chroma_vector_db_physics"
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# Load the persisted database
print(f"Loading database from {db_path_to_query}...")
db = Chroma(persist_directory=db_path_to_query, embedding_function=embedding_function)

# Perform a similarity search
print("Performing a test search...")
# Replace "cell biology" with a term relevant to your documents
results = db.similarity_search("what is the value of gravity?", k=2)

# Print the results and inspect the metadata
print("\n--- Search Results ---")
for doc in results:
    print(f"Source Chapter: {doc.metadata.get('source')}") # Should be "kebo1ps" etc.
    print(f"Page: {doc.metadata.get('page')}")
    print(f"Content: {doc.page_content[:200]}...") # Print snippet of the content
    print("-" * 20)

Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


Loading database from ./chroma_vector_db_physics...
Performing a test search...

--- Search Results ---
Source Chapter: Physics_11th_NCRT_BOOK_Unit1Chapter_7
Page: 6
Content: GRAVITATION 133
hence
E
3
E
G m M rR= (7.10)
If the mass m is situated on the surface of
earth, then  r = RE and the gravitational force on
it is, from Eq. (7.10)
2
E
E
M mF G R= (7.11)
The accelerati...
--------------------
Source Chapter: Physics_11th_NCRT_BOOK_Unit1Chapter_7
Page: 4
Content: GRAVITATION 131
cases, a  simple law results when you do that :
(1) The force of attraction between a hollow
spherical shell of uniform density and a
point mass situated outside is just as if
the enti...
--------------------


In [14]:
##nomic-embed-text

In [2]:
import os
import json
import shutil

# <<< FIX 1 >>> Import OllamaEmbeddings instead of SentenceTransformerEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document

# --- CONSTANTS ---
# <<< FIX 2 >>> Define the name of the model as it's known in Ollama
OLLAMA_EMBEDDING_MODEL = "nomic-embed-text" 

def process_and_save_chunks(pdf_folder: str, chunks_path: str) -> bool:
    """
    Step 1: Load PDFs, clean up metadata, split them into chunks, and save them to a JSON file.
    Returns True on success, False on failure.
    """
    if not os.path.isdir(pdf_folder):
        print(f"Error: The folder '{pdf_folder}' does not exist.")
        return False

    print(f"--- Step 1: Processing PDFs from '{pdf_folder}' ---")

    # Load documents
    loader = DirectoryLoader(
        pdf_folder,
        glob="**/*.pdf",
        loader_cls=PyPDFLoader,
        show_progress=True,
        use_multithreading=True
    )
    documents = loader.load()
    if not documents:
        print("No PDF documents found. Exiting.")
        return False
    print(f"Loaded {len(documents)} document(s).")

    # Clean up the 'source' metadata to be the chapter name (filename without extension)
    print("Cleaning up document metadata...")
    for doc in documents:
        source_path = doc.metadata.get('source', '')
        filename = os.path.basename(source_path)
        chapter_name, _ = os.path.splitext(filename)
        doc.metadata['source'] = chapter_name
    print(f"Metadata cleaned. Example source: '{documents[0].metadata['source']}'")

    # Split documents into chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
    splits = text_splitter.split_documents(documents)
    print(f"Split documents into {len(splits)} chunks.")

    # Save chunks to a JSON file for later use
    print(f"Saving processed chunks to '{chunks_path}'...")
    with open(chunks_path, 'w', encoding='utf-8') as f:
        json_data = [
            {'page_content': doc.page_content, 'metadata': doc.metadata}
            for doc in splits
        ]
        json.dump(json_data, f, indent=2)

    print("--- Step 1 Complete: Chunks saved successfully. ---")
    return True


def create_vectordb_from_chunks(db_path: str, chunks_path: str):
    """
    Step 2: Load the processed chunks from the JSON file and create the
    persistent Chroma vector database using Ollama.
    """
    if not os.path.exists(chunks_path):
        print(f"Error: Chunks file not found at '{chunks_path}'. Please run the processing step first.")
        return
        
    # Clean up old database directory if it exists
    if os.path.exists(db_path):
        print(f"Found and removing existing DB directory: '{db_path}'")
        shutil.rmtree(db_path)

    # Create the new, empty directory for the database.
    print(f"Creating new empty directory for DB: '{db_path}'")
    os.makedirs(db_path)

    print(f"\n--- Step 2: Creating Vector DB from '{chunks_path}' ---")

    # Load the processed chunks from the JSON file
    with open(chunks_path, 'r', encoding='utf-8') as f:
        json_data = json.load(f)

    # Re-create Document objects from the loaded data
    documents_from_json = [
        Document(page_content=item['page_content'], metadata=item['metadata'])
        for item in json_data
    ]
    print(f"Loaded {len(documents_from_json)} chunks from file.")

    # <<< FIX 3 >>> Initialize the Ollama embedding model
    # Make sure your Ollama application is running and has the specified model.
    print(f"Initializing Ollama embedding model: '{OLLAMA_EMBEDDING_MODEL}'...")
    embeddings = OllamaEmbeddings(model=OLLAMA_EMBEDDING_MODEL)
    print("Ollama embedding model initialized.")

    # Create and persist the Chroma vector store
    print(f"Creating and persisting vector store at '{db_path}'...")
    vectorstore = Chroma.from_documents(
        documents=documents_from_json,
        embedding=embeddings,
        persist_directory=db_path
    )
    print("--- Step 2 Complete: Vector DB Creation Complete! ---")
    print(f"Vector store saved to: {db_path}")


# ==================================================================
# === SCRIPT EXECUTION =============================================
# ==================================================================

if __name__ == "__main__":
    # --- CONFIGURE YOUR PATHS HERE ---
    pdf_source_folder = "/Users/jimharrington/Desktop/ANEET/NCERT Books Raw Data/MentorGuide"
    chunks_output_path = "/Users/jimharrington/Desktop/ANEET/Processed Data/mentor_data.json"
    db_output_path = "/Users/jimharrington/Desktop/ANEET/VectorDB/nomicLM-Embed-VectorDB/chroma_vector_db_mentor_nomic"

    # --- MAIN EXECUTION LOGIC ---
    if os.path.exists(chunks_output_path):
        print(f"Found existing chunks file: '{chunks_output_path}'")
        print("Skipping PDF processing. Proceeding directly to embedding.")
        create_vectordb_from_chunks(db_path=db_output_path, chunks_path=chunks_output_path)
    else:
        print(f"No chunks file found. Starting full process from scratch.")
        if process_and_save_chunks(pdf_folder=pdf_source_folder, chunks_path=chunks_output_path):
            create_vectordb_from_chunks(db_path=db_output_path, chunks_path=chunks_output_path)

No chunks file found. Starting full process from scratch.
--- Step 1: Processing PDFs from '/Users/jimharrington/Desktop/ANEET/NCERT Books Raw Data/MentorGuide' ---


100%|██████████| 2/2 [00:00<00:00,  2.49it/s]
  embeddings = OllamaEmbeddings(model=OLLAMA_EMBEDDING_MODEL)


Loaded 24 document(s).
Cleaning up document metadata...
Metadata cleaned. Example source: 'ANEETA_NEET_Assistance_Description'
Split documents into 48 chunks.
Saving processed chunks to '/Users/jimharrington/Desktop/ANEET/Processed Data/mentor_data.json'...
--- Step 1 Complete: Chunks saved successfully. ---
Creating new empty directory for DB: '/Users/jimharrington/Desktop/ANEET/VectorDB/nomicLM-Embed-VectorDB/chroma_vector_db_mentor_nomic'

--- Step 2: Creating Vector DB from '/Users/jimharrington/Desktop/ANEET/Processed Data/mentor_data.json' ---
Loaded 48 chunks from file.
Initializing Ollama embedding model: 'nomic-embed-text'...
Ollama embedding model initialized.
Creating and persisting vector store at '/Users/jimharrington/Desktop/ANEET/VectorDB/nomicLM-Embed-VectorDB/chroma_vector_db_mentor_nomic'...
--- Step 2 Complete: Vector DB Creation Complete! ---
Vector store saved to: /Users/jimharrington/Desktop/ANEET/VectorDB/nomicLM-Embed-VectorDB/chroma_vector_db_mentor_nomic


In [2]:
import os
from langchain_community.vectorstores import Chroma
# <<< FIX 1 >>> Import the OllamaEmbeddings class
from langchain_community.embeddings import OllamaEmbeddings

# --- CONFIGURE YOUR QUERY HERE ---

# <<< FIX 2 >>> Point to the database you created with Ollama
db_path_to_query = "/Users/jimharrington/Desktop/ANEET/chroma_vector_db_mentor_nomic" 

# <<< FIX 3 >>> Use the exact same model name as in the creation script
# This is the name Ollama uses, not the Hugging Face identifier.
OLLAMA_MODEL_NAME = "nomic-embed-text" 

# Your query
query_text = "top medical colleges?"
# Number of results to return
k_results = 3

# --- SCRIPT EXECUTION ---

# Check if the database directory exists
if not os.path.exists(db_path_to_query):
    print(f"Error: Database not found at '{db_path_to_query}'")
    print("Please make sure you have run the creation script first and the path is correct.")
else:
    # <<< FIX 4 >>> Initialize the Ollama embedding function, NOT SentenceTransformerEmbeddings
    # This ensures your query is converted to a vector in the same way as the documents were.
    print(f"Initializing Ollama embeddings with model: '{OLLAMA_MODEL_NAME}'...")
    embedding_function = OllamaEmbeddings(model=OLLAMA_MODEL_NAME)
    print("Embedding function initialized.")

    # Load the persisted database
    print(f"Loading database from: '{db_path_to_query}'...")
    db = Chroma(
        persist_directory=db_path_to_query, 
        embedding_function=embedding_function
    )
    print("Database loaded successfully.")

    # Perform a similarity search
    print(f"\nPerforming a similarity search for: '{query_text}'")
    results = db.similarity_search(query_text, k=k_results)

    # Print the results and inspect the metadata
    print("\n--- Search Results ---")
    if not results:
        print("No results found.")
    else:
        for i, doc in enumerate(results):
            print(f"--- Result {i+1} ---")
            print(f"Source Chapter: {doc.metadata.get('source', 'N/A')}")
            print(f"Page Number: {doc.metadata.get('page', 'N/A')}")
            print(f"Content: {doc.page_content[:350]}...") # Print a slightly longer snippet
            print("-" * 25)

Initializing Ollama embeddings with model: 'nomic-embed-text'...
Embedding function initialized.
Loading database from: '/Users/jimharrington/Desktop/ANEET/chroma_vector_db_mentor_nomic'...
Database loaded successfully.

Performing a similarity search for: 'top medical colleges?'

--- Search Results ---
--- Result 1 ---
Source Chapter: mentor_knowledge
Page Number: 18
Content: which
 
you
 
can
 
follow
 
to
 
enhance
 
your
 
readiness
 
for
 
the
 
exam.
  ●  Familiarize  with  the  syllabus  ●  Invest  In  Good  Quality  Study  Materials  ●  Create  a  Certain  Strategy  and  Follow   ●  Finish  the  Most  Important  Topics  First  ●  Create  and  Stick  To  a  Timetable  ●  Make  your  Own  Notes  ●  Take  Breaks  Re...
-------------------------
--- Result 2 ---
Source Chapter: mentor_knowledge
Page Number: 18
Content: Rank  College  City  State  
1  All  India  Institute  of  Medical  Sciences,  Delhi  New  Delhi  Delhi  
2  
Post  Graduate  Institute  of  Medical  Education  and 

  db = Chroma(
