# Program to create a Chroma Vector Store

## Imports

In [1]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

## My documents

In [2]:
all_documents = []
file_paths = [
    "data/iss-const.pdf",
    "data/read-n-heed.pdf",
    "data/AG2526v2.pdf",
    "data/parent-resource.pdf",
]

## Get all of the pages from all of the documents

In [3]:
for file_path in file_paths:
    loader = PyMuPDFLoader(file_path)
    documents = loader.load()

    # Add metadata to each document
    for doc in documents:
        doc.metadata["source"] = file_path
        all_documents.append(doc)

In [4]:
print("Number of pages:", len(all_documents))

Number of pages: 138


## Split the pages into `chunks`

This is where we need to decide exactly how big a `chunk` should be and how much overlap there should be between `chunks`

In [5]:
text_splitter = RecursiveCharacterTextSplitter(
    # chunk_size=200,
    # chunk_overlap=50
    chunk_size=1024,
    chunk_overlap=256,
    separators=["\n\n", "\n", " "]
)
# Split document into chunks with metadata
chunked_documents = text_splitter.split_documents(all_documents)

In [8]:
print("Number of chunks: ", len(chunked_documents))

Number of chunks:  376


# Embed and build vector store all at once

In [11]:
from langchain_community.vectorstores import Chroma
# from langchain_huggingface import HuggingFaceEmbeddings

In [12]:
from ollama import Client

In [13]:
client = Client(
    host="http://iss-cs-hp-01.indiansprings.org:11434"
)

### DEBUG CHECKER

Can I list the models

In [18]:
ollama_models = client.list()

In [19]:
for model in ollama_models['models']:
    print("-", model['model'], "from", model['details']['family'])

- nomic-embed-text:latest from nomic-bert
- llama3.3:latest from llama
- qwen2.5:14b from qwen2
- deepseek-r1:14b from qwen2
- qwen2.5:latest from qwen2
- qwen2.5-coder:latest from qwen2
- llama3.2:latest from llama
- deepseek-r1:latest from qwen2


In [28]:
print(chunked_documents[0])

page_content='The Indian Springs School Constitution
Preamble
Under the freedom granted the faculty and students of Indian Springs School by the Board of Governors of the
Alabama Educational Foundation, the governing body of Indian Springs School, there is herewith created the official Indian
Springs School Constitution.
The purpose of the Indian Springs School Constitution and Government is to promote living and learning in accordance
with humane and democratic values and principles, thereby contributing to the intellectual, physical, emotional, spiritual, and
ethical development of all members of the Indian Springs school-community.
In establishing this Constitution, the members of the Indian Springs school-community affirm that students and teachers
do not have to accept themselves or their surroundings as they are but that it is within their power to improve both through
participation in constructive change. Such participation enhances the worth and dignity of humankind, develops r

## Lets make a list of the chunks as String

In [29]:
chunk_strings = []
for i in range(len(chunked_documents)):
    chunk_strings.append(chunked_documents[i].page_content)

## Number of string chunks

In [30]:
print(len(chunk_strings))

376


In [31]:
vect = client.embed(model="nomic-embed-text", input=chunk_strings)

In [33]:
chroma_store = Chroma.from_documents(
    documents=chunked_documents,
    embedding=vect.embeddings,
    persist_directory='vectorstore',
)

AttributeError: 'list' object has no attribute 'embed_documents'