In [None]:
! pip install langchain langchainhub langchain_classic langchain_community langchain_ollama
! pip install pdfminer.six chromadb unstructured pypandoc

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PDFMinerLoader, UnstructuredEPubLoader, TextLoader
from langchain_community.vectorstores import Chroma
from langchain_ollama import OllamaEmbeddings, ChatOllama

In [None]:
# --- Corpus -----------------------------------------------------------------
# Book file names; each one is appended to ROOT to form the path to load.
ROOT = "./data/"
BOOKS = ["Algorithms.pdf", "pg84-images-3.epub"]

# --- Vector store -----------------------------------------------------------
# Directory handed to Chroma as its on-disk persistence location.
PERSIST_DIR = "./indexes"

# --- Embedding / chunking ---------------------------------------------------
# Embedding model requested from Ollama for every chunk.
embedder = OllamaEmbeddings(model="mxbai-embed-large")

# Chunks of at most 500 units (characters with the default length function),
# with 100 units of overlap between neighbouring chunks.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100,
)


# Index every book into its own Chroma collection (named after the file)
# under PERSIST_DIR. After the loop, `vectordb` refers to the collection of
# the LAST entry in BOOKS.
for book in BOOKS:
    path = ROOT + book
    # Match the extension case-insensitively so "Book.PDF" is accepted too.
    book_lower = book.lower()

    if book_lower.endswith(".pdf"):
        loader = PDFMinerLoader(path, mode="single")
    elif book_lower.endswith(".epub"):
        loader = UnstructuredEPubLoader(path, mode="single")
    elif book_lower.endswith(".txt"):
        # TextLoader has no `mode` parameter (passing one raises TypeError);
        # it already returns the whole file as a single document.
        loader = TextLoader(path)
    else:
        raise ValueError("Extension not supported")

    doc = loader.load()
    splits = splitter.split_documents(doc)

    vectordb = Chroma.from_documents(
        documents=splits,
        embedding=embedder,
        # Deterministic ids make re-running this cell overwrite the existing
        # chunks instead of appending duplicates to the persisted collection.
        # NOTE(review): if a re-run produces fewer chunks (e.g. after changing
        # chunk_size), stale higher-numbered chunks remain — delete the
        # collection first in that case.
        ids=[f"{book}-{i}" for i in range(len(splits))],
        persist_directory=PERSIST_DIR,
        collection_name=book,
    )

    print(f"Indexed {book} -> {len(splits)} chunks")

In [None]:
# Chat model used for answering; temperature 0 is requested for the least
# randomised output.
llm = ChatOllama(model="phi3:mini", temperature=0)

# Open the persisted collection explicitly instead of relying on whatever the
# indexing loop happened to leave in `vectordb`. The default matches the old
# behaviour (the last book in BOOKS); change `active_book` to query another.
active_book = BOOKS[-1]
vectordb = Chroma(
    collection_name=active_book,
    embedding_function=embedder,
    persist_directory=PERSIST_DIR,
)

# Return the 5 most similar chunks for each query.
retriever = vectordb.as_retriever(search_kwargs={"k": 5})