In [1]:
!pip install gradio langchain langchain-core langchain-openai langchain-community langchain_chroma langchain-huggingface pypdf chromadb sentence-transformers python-dotenv


Collecting langchain-openai
  Downloading langchain_openai-1.0.2-py3-none-any.whl.metadata (1.8 kB)
Collecting langchain-community
  Downloading langchain_community-0.4.1-py3-none-any.whl.metadata (3.0 kB)
Collecting langchain_chroma
  Downloading langchain_chroma-1.0.0-py3-none-any.whl.metadata (1.9 kB)
Collecting langchain-huggingface
  Downloading langchain_huggingface-1.0.1-py3-none-any.whl.metadata (2.1 kB)
Collecting pypdf
  Downloading pypdf-6.1.3-py3-none-any.whl.metadata (7.1 kB)
Collecting chromadb
  Downloading chromadb-1.3.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of langchain-openai to determine which version is compatible with other requirements. This could take a while.
Collecting langchain-openai
  Downloading langchain_openai-1.0.1-py3-none-any.whl.metadata (1.8 kB)
  Downloading langchain_openai-1.0.0-py3-none-any.whl.metadata (1.8 kB)
  Downloading langchain_openai-0.3.35-py3-none-any.whl.me

In [2]:
import os
import pathlib
from dotenv import load_dotenv

from langchain_community.document_loaders import PyPDFLoader
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma

# Source document to index (path is relative to the working directory).
PDF_PATH: pathlib.Path = pathlib.Path("IUO_Prospectus-4.pdf")
# Directory in which the Chroma vector store is persisted.
CHROMA_DIR: pathlib.Path = pathlib.Path("chroma_db")
# Name of the Chroma collection that holds the prospectus chunks.
COLLECTION_NAME: str = "iuo_prospectus_collection"

def index_data():
    """
    Load, split, embed, and store the prospectus PDF in a Chroma vector database.

    Reads the PDF at ``PDF_PATH``, splits its pages into overlapping chunks,
    embeds them with the ``BAAI/bge-large-en-v1.5`` model and writes them to the
    ``COLLECTION_NAME`` collection persisted under ``CHROMA_DIR``.

    Each chunk is stored under a deterministic ID built from the PDF file name
    and the chunk's position, so running this function again for the same PDF
    overwrites the existing entries instead of appending a second copy of every
    chunk. If the PDF or the splitter settings change so that fewer chunks are
    produced, delete ``CHROMA_DIR`` first to drop the stale trailing entries.

    Returns:
        None. Progress and failures are reported with ``print``. When the PDF
        is missing, cannot be loaded, or yields no chunks, the function returns
        early without creating or modifying the vector database.
    """
    print(f"Starting data indexing process for {PDF_PATH}")

    # Report a missing file directly instead of relying on a loader-specific message.
    if not PDF_PATH.is_file():
        print(f"Error loading PDF: {PDF_PATH} was not found. Ensure the document is available.")
        return

    try:
        documents = PyPDFLoader(str(PDF_PATH)).load()
    except Exception as e:
        # Best-effort behaviour kept from the original: report and stop, do not raise.
        print(f"Error loading PDF: {e}. Ensure the document is available.")
        return
    print(f"Loaded {len(documents)} pages from PDF.")

    # Separators are tried in order, from paragraph breaks down to single characters.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
    )

    texts = text_splitter.split_documents(documents)
    print(f"Document was split into {len(texts)} chunks for embedding.")

    # Stop before loading the large embedding model if there is nothing to embed
    # (e.g. a scanned PDF without a text layer).
    if not texts:
        print("No text chunks were produced from the PDF; nothing to index.")
        return

    print("Initializing embedding model...")
    embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-large-en-v1.5")

    # exist_ok=True already makes this a no-op when the directory is present.
    CHROMA_DIR.mkdir(parents=True, exist_ok=True)

    # Stable, unique IDs: a re-run writes to the same IDs rather than adding
    # randomly-named duplicates of every chunk to the persisted collection.
    chunk_ids = [
        f"{PDF_PATH.stem}-chunk-{position:06d}" for position in range(len(texts))
    ]

    print("Creating Chroma vector database...")
    Chroma.from_documents(
        documents=texts,
        embedding=embeddings,
        ids=chunk_ids,
        persist_directory=str(CHROMA_DIR),
        collection_name=COLLECTION_NAME
    )

    print("✅ Indexing process completed successfully!")
    print(f"Chroma DB is now stored in: {CHROMA_DIR.resolve()}")

# Entry point: runs the indexing pipeline when this cell/script is executed directly.
if __name__ == "__main__":
    # Load variables from a local .env file (python-dotenv) into the environment
    # before the indexing code runs.
    load_dotenv()
    index_data()

Starting data indexing process for IUO_Prospectus-4.pdf
Loaded 901 pages from PDF.
Document was split into 2760 chunks for embedding.
Initializing embedding model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/779 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

Creating Chroma vector database...
✅ Indexing process completed successfully!
Chroma DB is now stored in: /content/chroma_db
