# Program to create a Chroma Vector Store

## Imports

In [1]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

## My documents

In [2]:
# all_documents = []
# file_paths = [
#     "data/iss-const.pdf",
#     "data/read-n-heed.pdf",
#     "data/AG2526v2.pdf",
#     "data/parent-resource.pdf",
# ]

In [3]:
# Accumulates every loaded page from all of the PDFs listed below.
all_documents = []

# One entry per PDF to ingest. Keys:
#   "pdf"         - path to the PDF file
#   "source"      - human-readable title stored in each page's metadata
#   "school_year" - school year stored in each page's metadata
pdf_documents = [
    {
        "pdf": "data/iss-const.pdf",
        "source": "The Indian Springs Constitution",
        "school_year": "2024-2025",
    },
    {
        "pdf": "data/read-n-heed.pdf",
        # Was "The Read N Head": a typo that did not match the file name
        # and would have been stored in every chunk's metadata.
        "source": "The Read N Heed",
        "school_year": "2024-2025",
    },
    {
        "pdf": "data/AG2526v2.pdf",
        "source": "Academics Guide",
        "school_year": "2025-2026",
    },
    {
        "pdf": "data/parent-resource.pdf",
        "source": "Parent Guide",
        "school_year": "2024-2025",
    },
]

## Get all of the pages from all of the documents

In [None]:
# NOTE: legacy loader, superseded by the `pdf_documents` loop in the next cell.
# It is disabled because `file_paths` is only defined in a commented-out cell,
# so running the notebook top to bottom raised NameError here. If `file_paths`
# were restored, every page would also be appended to `all_documents` a
# second time by the loop that follows.
#
# for file_path in file_paths:
#     loader = PyMuPDFLoader(file_path)
#     documents = loader.load()
#
#     # Add metadata to each document
#     for doc in documents:
#         doc.metadata["source"] = file_path
#         all_documents.append(doc)

In [4]:
# Load each PDF described in `pdf_documents` and tag its pages with our own
# descriptive metadata before collecting them in `all_documents`.
for pdf_info in pdf_documents:
    pages = PyMuPDFLoader(pdf_info["pdf"]).load()

    for doc in pages:
        # Set (or overwrite) the metadata keys we want on every page.
        doc.metadata.update(
            {
                "source": pdf_info["source"],
                "school_year": pdf_info["school_year"],
                "file_path": pdf_info["pdf"],
            }
        )

    all_documents.extend(pages)

In [7]:
# print(all_documents[4])

In [8]:
# Report how many pages were loaded across all PDFs.
print(f"Number of pages: {len(all_documents)}")

Number of pages: 138


## Split the pages into `chunks`

This is where we need to decide exactly how big a `chunk` should be and how much overlap there should be between `chunks`.

In [9]:
# Chunking configuration. An alternative, smaller setting
# (chunk_size=200, chunk_overlap=50) is kept here for reference only.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " "],
    chunk_size=1024,
    chunk_overlap=256,
)

# Split the loaded pages into chunks (with metadata).
chunked_documents = text_splitter.split_documents(all_documents)

In [10]:
# Report how many chunks the splitter produced.
print(f"Number of chunks:  {len(chunked_documents)}")

Number of chunks:  376


# Embed and build vector store all at once

In [11]:
from langchain_community.vectorstores import Chroma
# from langchain_huggingface import HuggingFaceEmbeddings

---

### This section is only useful for finding which models are available

In [12]:
from ollama import Client

In [13]:
# Ollama client used below to list the models the server offers.
# NOTE(review): the embeddings cell further down uses a different base_url
# (an IP address) - confirm both point at the same Ollama server.
client = Client(host="http://iss-cs-hp-01.indiansprings.org:11434")

### DEBUG CHECKER

Can I list the models?

In [14]:
# Ask the Ollama server for its model list (printed in the next cell).
ollama_models = client.list()

In [15]:
# Print one line per model: its name and the model family it belongs to.
for model in ollama_models["models"]:
    family = model["details"]["family"]
    print(f"- {model['model']} from {family}")

- nomic-embed-text:latest from nomic-bert
- llama3.3:latest from llama
- qwen2.5:14b from qwen2
- deepseek-r1:14b from qwen2
- qwen2.5:latest from qwen2
- qwen2.5-coder:latest from qwen2
- llama3.2:latest from llama
- deepseek-r1:latest from qwen2


---

## Embedding chunks and metadata

In [16]:
# Inspect the first chunk: show its type, then its contents.
first_chunk = chunked_documents[0]
print(type(first_chunk))
print(first_chunk)

<class 'langchain_core.documents.base.Document'>
page_content='The Indian Springs School Constitution
Preamble
Under the freedom granted the faculty and students of Indian Springs School by the Board of Governors of the
Alabama Educational Foundation, the governing body of Indian Springs School, there is herewith created the official Indian
Springs School Constitution.
The purpose of the Indian Springs School Constitution and Government is to promote living and learning in accordance
with humane and democratic values and principles, thereby contributing to the intellectual, physical, emotional, spiritual, and
ethical development of all members of the Indian Springs school-community.
In establishing this Constitution, the members of the Indian Springs school-community affirm that students and teachers
do not have to accept themselves or their surroundings as they are but that it is within their power to improve both through
participation in constructive change. Such participation enhanc

In [17]:
from langchain_ollama import OllamaEmbeddings

## Let's make a list of the chunks as strings

In [18]:
# Plain-text content of every chunk, in the same order as `chunked_documents`.
chunk_strings = [chunk.page_content for chunk in chunked_documents]

## Number of string chunks

In [19]:
# Sanity check: one string per chunk, so this should match the chunk count.
print(len(chunk_strings))

376


In [20]:
# Embedding function handed to the vector store. The alternative of calling
# the Ollama client directly is kept for reference:
#   vect = client.embed(model="nomic-embed-text", input=chunk_strings)
# NOTE(review): this base_url is an IP address, while the model-listing client
# above uses a hostname - confirm both refer to the same Ollama server.
ollama_embeddings = OllamaEmbeddings(
    base_url="http://10.100.200.57:11434",
    model="nomic-embed-text",
)

In [21]:
# Embed every chunk and build the Chroma store, persisted to ./vectorstore.
# NOTE(review): re-running this cell against an existing "vectorstore"
# directory may add the same chunks again - confirm, or clear it first.
chroma_store = Chroma.from_documents(
    persist_directory="vectorstore",
    embedding=ollama_embeddings,
    documents=chunked_documents,
)

In [30]:
# Smoke test: fetch the six chunks most similar to a sample question.
# Alternative query kept for reference: "Tell me about the mayor"
query = "Tell me about Critical Reading and Analytical Writing"

results = chroma_store.similarity_search(query, k=6)
for result in results:
    # A separator line, then the chunk text, each on its own line.
    print("\n--------------------", result.page_content, sep="\n")


--------------------
4.2 Course Descriptions
25
process. The majority of readings will be non-ﬁction. Topics will often relate to ethics, politics,
social issues, and/or language(s). The course likewise serves as an introduction to norms of com-
munication and conduct in American post-secondary and professional settings. Students will thus
learn and demonstrate dispositions conducive to success in those arenas.
Critical Reading & Analytical Writing
Grifﬁn
Year - 1 Credit
This yearlong course emphasizes critical reading and writing skills through the study of canon-
ical and contemporary texts from around the globe. We will read novels, short stories, drama,
creative nonﬁction, and poetry written during the Renaissance to the present day, with particular
emphasis on works produced during the twentieth and twenty-ﬁrst centuries. We will study these
texts as cultural records, which illuminate and offer commentary on the contexts from which they

--------------------
ing, and writing, and