In [12]:
!pip -q install langchain langchain_community langchain-groq tiktoken chromadb huggingface_hub transformers sentence-transformers einops

In [13]:
!pip show langchain

Name: langchain
Version: 0.1.20
Summary: Building applications with LLMs through composability
Home-page: https://github.com/langchain-ai/langchain
Author: 
Author-email: 
License: MIT
Location: /usr/local/lib/python3.10/dist-packages
Requires: aiohttp, async-timeout, dataclasses-json, langchain-community, langchain-core, langchain-text-splitters, langsmith, numpy, pydantic, PyYAML, requests, SQLAlchemy, tenacity
Required-by: 


# LangChain multi-doc retriever with ChromaDB

***New Points***
- Multiple Texts
- Embedding
- ChromaDB
- QA Chain
- Source info

## Setting up LangChain


In [14]:
from langchain.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import HuggingFaceHub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.document_loaders import DirectoryLoader
from langchain_groq import ChatGroq
from glob import glob
import os
import tiktoken


## Models to use

In [15]:
# Hugging Face model id of the embedding model used to index the documents
embedding_name = "nomic-ai/nomic-embed-text-v1.5"

In [16]:
# Build the embedding model from `embedding_name`.
# NOTE(review): trust_remote_code=True allows code shipped with the model
# repository to run locally — confirm the model source is trusted.
model_kwargs = dict(trust_remote_code=True)
embedding = HuggingFaceEmbeddings(
    model_name=embedding_name,
    model_kwargs=model_kwargs,
)



## Load and process multiple documents

In [17]:
# Collect every .txt file in the current working directory into `documents`
root = os.getcwd()

# Sort the file names so the load order is deterministic
paths = sorted(glob('*.txt'))

# Fail fast: an empty corpus would otherwise only show up further down the
# notebook, far away from the actual cause.
if not paths:
    raise FileNotFoundError(f"No .txt files found in {root}")

# Load each file and append its Document objects to one flat list
documents = []
for path in paths:
    documents.extend(TextLoader(path).load())

In [18]:
# Tokenizer used to measure text length in tokens instead of characters
enc = tiktoken.get_encoding("cl100k_base")


def length_function(text: str) -> int:
    """Return the number of cl100k_base tokens in ``text``.

    ``disallowed_special=()`` makes special-token strings such as
    ``<|endoftext|>`` count as ordinary text. By default ``encode`` raises
    ``ValueError`` when the input contains one, which would abort the
    splitting step on a document that merely mentions such a string.
    Token counts for all other text are unchanged.
    """
    return len(enc.encode(text, disallowed_special=()))

In [19]:
# Split the documents into overlapping chunks.
# Sizes are measured with length_function rather than in raw characters.
text_splitter = RecursiveCharacterTextSplitter(
    length_function=length_function,
    chunk_size=1000,
    chunk_overlap=200,
)
texts = text_splitter.split_documents(documents)

In [20]:
# Number of chunks produced by the splitter
len(texts)

130

In [21]:
# One line per chunk: the metadata shows which source file each chunk came from
for chunk in texts:
    print(chunk.metadata)

{'source': 'A00-Abbreviations.txt'}
{'source': 'A01-Description.txt'}
{'source': 'A02-Catholic Identity, Vision Mission and Core Values.txt'}
{'source': 'A03-The University Seal.txt'}
{'source': 'A04-History.txt'}
{'source': 'A04-History.txt'}
{'source': 'A05-University Saints.txt'}
{'source': 'A05-University Saints.txt'}
{'source': 'A06-USC Organizational Structure.txt'}
{'source': 'A07-Admission.txt'}
{'source': 'A08-Admission Steps for Freshmen.txt'}
{'source': 'A09-Admission Steps for International Students.txt'}
{'source': 'A10-Admission Steps for Transferee.txt'}
{'source': 'A11-Admission Steps for Graduate Students.txt'}
{'source': 'A12-Admission Steps for Juris Doctor.txt'}
{'source': 'A13-College Entrance Examination.txt'}
{'source': 'A14-Post-Admission Enrollment Requirements.txt'}
{'source': 'A14-Post-Admission Enrollment Requirements.txt'}
{'source': 'A14-Post-Admission Enrollment Requirements.txt'}
{'source': 'A15-Post-Admission Enrollment Steps.txt'}
{'source': 'A16-Enrol

## Create the DB

In [22]:
# Embed and store the texts.
# Supplying a persist_directory stores the embeddings on disk.
persist_directory = 'db_1000_200-tiktoken'

# Chroma.from_documents adds to whatever is already stored in
# persist_directory, so re-running this cell would index every chunk a second
# time. Reuse the on-disk store when it exists and only embed when it does not.
# Delete the directory to force a rebuild (e.g. after changing the documents
# or the chunking settings).
if os.path.isdir(persist_directory) and os.listdir(persist_directory):
    vectordb = Chroma(persist_directory=persist_directory,
                      embedding_function=embedding)
else:
    vectordb = Chroma.from_documents(documents=texts,
                                     embedding=embedding,
                                     persist_directory=persist_directory)

In [23]:
# Persist the DB to disk
# NOTE(review): newer chromadb releases appear to persist automatically and LangChain marks this call as deprecated — confirm whether it is still needed
vectordb.persist()