# LangChain multi-doc retriever with ChromaDB

***New Points***
- Multiple Files
- ChromaDB
- Source info

## Setting up LangChain


In [1]:
import chromadb
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.embeddings import OpenAIEmbeddings
from tqdm import tqdm

## Load and process multiple documents

In [2]:
# Load the PDF files from ./source/ into `documents`
# (PyPDFDirectoryLoader is a PDF directory loader, not a plain-text loader).

loader = PyPDFDirectoryLoader('./source/')
documents = loader.load()

In [3]:
# Split the loaded documents into overlapping chunks.
# chunk_size / chunk_overlap are presumably measured in characters —
# TODO confirm against the splitter's length function.

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
texts = text_splitter.split_documents(documents)

In [4]:
# Number of chunks produced by the splitter
len(texts)

8214

In [5]:
# Inspect one chunk (index 3) to sanity-check its page_content and metadata
texts[3]

Document(page_content='Kubernetes is a portable, extensible, open source platform for managing containerized\nworkloads and services, that facilitates both declarative configuration and automation. It has a\nlarge, rapidly growing ecosystem. Kubernetes services, support, and tools are widely available.\nThe name Kubernetes originates from Greek, meaning helmsman or pilot. K8s as an\nabbreviation results from counting the eight letters between the "K" and the "s". Google open-\nsourced the Kubernetes project in 2014. Kubernetes combines over 15 years of Google\'s\nexperience  running production workloads at scale with best-of-breed ideas and practices from\nthe community.\nGoing back in time\nLet\'s take a look at why Kubernetes is so useful by going back in time.\nDeployment evolution\nTraditional deployment era:  Early on, organizations ran applications on physical servers.\nThere was no way to define resource boundaries for applications in a physical server, and this', metadata={'sou

In [6]:
# Embed the chunks and store them in a Chroma vector store.
# Passing persist_directory makes Chroma keep the embeddings on disk.
persist_directory = './db'

# Local embeddings: HuggingFaceEmbeddings constructed with its default arguments.
embeddings = HuggingFaceEmbeddings()

vectordb = Chroma.from_documents(
    documents=texts,
    embedding=embeddings,
    persist_directory=persist_directory,
)

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Persist the db to disk.
# NOTE(review): newer Chroma releases may persist automatically, which would
# make this explicit call redundant — confirm for the installed version.

vectordb.persist()
# Drop the in-memory reference; presumably the store is reloaded from
# persist_directory afterwards — TODO confirm against the following cells.
vectordb = None