# Program to create a Chroma Vector Store

## Imports

In [1]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

## My documents

In [2]:
# all_documents = []
# file_paths = [
#     "data/iss-const.pdf",
#     "data/read-n-heed.pdf",
#     "data/AG2526v2.pdf",
#     "data/parent-resource.pdf",
# ]

In [3]:
# Collects the documents loaded from every PDF listed below.
all_documents = []

# PDFs to index. Each entry provides:
#   "pdf"         - path of the PDF file to load
#   "source"      - human-readable title written into each document's metadata
#   "school_year" - school year written into each document's metadata
pdf_documents = [
    {
        "pdf": "data/iss-const.pdf",
        "source": "The Indian Springs Constitution",
        "school_year": "2024-2025",
    },
    {
        "pdf": "data/read-n-heed.pdf",
        # Title spelled to match the file name (was "The Read N Head").
        "source": "The Read N Heed",
        "school_year": "2024-2025",
    },
    {
        "pdf": "data/AG2526v2.pdf",
        "source": "Academics Guide",
        "school_year": "2025-2026",
    },
    {
        "pdf": "data/parent-resource.pdf",
        "source": "Parent Guide",
        "school_year": "2024-2025",
    },
]

## Get all of the pages from all of the documents

In [4]:
# Load each configured PDF and tag every loaded document with its
# source title, file path, and school year before collecting it.
for pdf_entry in pdf_documents:
    pages = PyMuPDFLoader(pdf_entry["pdf"]).load()

    for page in pages:
        page.metadata.update(
            {
                "source": pdf_entry["source"],
                "file_path": pdf_entry["pdf"],
                "school_year": pdf_entry["school_year"],
            }
        )
        all_documents.append(page)

KeyboardInterrupt: 

In [None]:
# Rebuild the list from scratch so this cell is safe to re-run: without the
# reset, documents appended by an earlier (possibly interrupted) run would be
# kept and every page loaded here would be added on top of them.
all_documents = []

for pdf_info in pdf_documents:
    loader = PyMuPDFLoader(pdf_info["pdf"])
    documents = loader.load()

    # Tag each loaded document with the title, school year, and path of the
    # PDF it came from, then collect it.
    for doc in documents:
        doc.metadata["source"] = pdf_info["source"]
        doc.metadata["school_year"] = pdf_info["school_year"]
        doc.metadata["file_path"] = pdf_info["pdf"]
        all_documents.append(doc)

In [None]:
# print(all_documents[4])

In [None]:
# Report how many documents were collected in all_documents (labelled as pages).
print("Number of pages:", len(all_documents))

## Split the pages into `chunks`

This is where we need to decide exactly how big each `chunk` should be and how much overlap there should be between consecutive `chunks`.

In [None]:
# Chunking parameters. Earlier experiments used chunk_size/chunk_overlap of
# 200/50 and 1024/256 before settling on the values below.
CHUNK_SIZE = 2048
CHUNK_OVERLAP = 256
CHUNK_SEPARATORS = ["\n\n", "\n", " "]

text_splitter = RecursiveCharacterTextSplitter(
    separators=CHUNK_SEPARATORS,
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)

# Split every loaded document into chunks (with metadata).
chunked_documents = text_splitter.split_documents(all_documents)

In [None]:
# Report how many chunks the splitter produced.
print("Number of chunks: ", len(chunked_documents))

# Embed and build vector store all at once

In [None]:
from langchain_community.vectorstores import Chroma
# from langchain_huggingface import HuggingFaceEmbeddings

---

### This section is only useful for finding the available models

In [None]:
from ollama import Client

In [None]:
# Address of the Ollama server this client connects to.
OLLAMA_HOST = "http://iss-cs-hp-01.indiansprings.org:11434"

# Client used by the following cells to query that server.
client = Client(host=OLLAMA_HOST)

### DEBUG CHECKER

Can I list the models?

In [None]:
# Ask the server for its model list; the next cell reads the 'models' entry.
ollama_models = client.list()

In [None]:
# Print one line per listed model: its name followed by its family.
for entry in ollama_models['models']:
    model_name = entry['model']
    model_family = entry['details']['family']
    print("-", model_name, "from", model_family)

---

## Embedding chunks and metadata

In [None]:
# Peek at the first chunk: show its type, then the chunk itself.
first_chunk = chunked_documents[0]
print(type(first_chunk))
print(first_chunk)

In [None]:
from langchain_ollama import OllamaEmbeddings

## Let's make a list of the chunks as strings

In [None]:
# Collect the text of every chunk, in order, as plain strings.
# Iterating the chunks directly replaces the index-based append loop.
chunk_strings = [chunk.page_content for chunk in chunked_documents]

## Number of string chunks

In [None]:
# Show how many chunk strings were collected.
print(len(chunk_strings))

In [None]:
# Alternative kept for reference (direct call through the ollama client):
# vect = client.embed(model="nomic-embed-text", input=chunk_strings)

# Embedding model name and the server address it is requested from.
# NOTE(review): this address differs from the host given to `client`
# earlier in the notebook - confirm both point at the intended server.
EMBEDDING_MODEL = "nomic-embed-text"
EMBEDDING_BASE_URL = 'http://10.100.200.57:11434'

ollama_embeddings = OllamaEmbeddings(
    base_url=EMBEDDING_BASE_URL,
    model=EMBEDDING_MODEL,
)

In [None]:
# Build the Chroma store from all chunks using the Ollama embeddings,
# with 'vectorstore' as its persist directory.
# NOTE(review): re-running this cell against an existing 'vectorstore'
# directory presumably adds the same chunks again - confirm, and clear the
# directory first if a fresh build is intended.
VECTORSTORE_DIR = 'vectorstore'

chroma_store = Chroma.from_documents(
    persist_directory=VECTORSTORE_DIR,
    embedding=ollama_embeddings,
    documents=chunked_documents,
)

In [None]:
# Try the store with a sample question and print the six closest chunks.
# query = "Tell me about the mayor"
query = "Tell me about Critical Reading and Analytical Writing"

results = chroma_store.similarity_search(query, k=6)
for match in results:
    # Blank line and separator first, then the chunk text on its own line.
    print("\n--------------------", match.page_content, sep="\n")