In [1]:
# Mount Google Drive inside the Colab runtime at /content/drive so the source
# documents and the vector database folders used below are reachable as
# ordinary filesystem paths. force_remount=True requests a fresh mount even
# when the drive is already mounted.
from google.colab import drive
drive.mount("/content/drive",force_remount = True)

Mounted at /content/drive


In [None]:
# Install the loader, tokenizer, embedding and vector-store packages into the
# kernel's environment (%pip targets the running kernel).
# NOTE(review): only chromadb is version-pinned; the other packages float, so a
# fresh run may pull incompatible releases - consider pinning them as well.
%pip install "pdf2image" "pytesseract" "tiktoken" "langchain" "sentence-transformers" "unstructured" 
%pip install chromadb==0.3.29

In [3]:
import os
from chromadb.config import Settings

In [5]:
import re
import os
import glob
from typing import List
from dotenv import load_dotenv
from multiprocessing import Pool
from tqdm import tqdm
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document
from langchain.document_loaders import (
    PDFMinerLoader,
)
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import TokenTextSplitter

In [6]:
# --- Ingestion configuration -------------------------------------------------

# Chunking parameters handed to the token-based text splitter:
# maximum chunk length and the overlap shared by neighbouring chunks.
chunk_size = 400
chunk_overlap = 40

# Folder containing the crawled .txt / .pdf source documents.
source_directory = "/content/drive/MyDrive/daiict_webCrawl self data with chromadb private gpt/web crawl_DAIICT/docs/daiict.ac.in"

# Folder in which the vector database is stored. This is a relative path, so
# it is resolved against the current working directory at the time it is used.
persist_directory = 'AllMini_Chroma_Tik_400'

# Client settings shared by every Chroma instance created for ingestion.
CHROMA_SETTINGS = Settings(
    chroma_db_impl='duckdb+parquet',
    persist_directory=persist_directory,
    anonymized_telemetry=False,
)

In [7]:
# load single document
def load_single_document(file_path: str) -> List[Document]:
  """
  Load one PDF or text file and normalise the whitespace of its content.

  Args:
    file_path: Path of the file to load. Paths ending in ".pdf" (any letter
      case) are read with PDFMinerLoader; everything else is read with
      UnstructuredFileLoader.

  Returns:
    The documents produced by the loader, each with literal "\\n" sequences
    and every run of whitespace collapsed into a single space. The list is
    returned unchanged (empty) when the loader yields nothing.
  """
  # Choose the loader from the file extension; compare case-insensitively so
  # that e.g. "REPORT.PDF" is not handed to the plain-text loader.
  if file_path.lower().endswith(".pdf"):
    loader = PDFMinerLoader(file_path)
  else:
    loader = UnstructuredFileLoader(file_path)

  documents = loader.load()

  # Clean every returned document, not only the first one; iterating also
  # avoids an IndexError when the loader returns an empty list.
  for document in documents:
    # Drop escaped line breaks (a backslash followed by "n") left in the text.
    text = document.page_content.replace('\\n', ' ')
    # Collapse real line breaks, tabs and repeated spaces into one space.
    document.page_content = re.sub(r"\s+", " ", text)
  return documents

In [8]:
def load_documents(source_dir: str) -> List[Document]:
    """
    Load every .txt and .pdf file found directly inside ``source_dir``.

    Files whose name ends in "xls.txt" (spreadsheets exported as text) are
    skipped. Sub-directories are not searched.

    Args:
        source_dir: Directory containing the source documents.

    Returns:
        The documents returned by ``load_single_document`` for each accepted
        file, text files first and PDF files second, each group in sorted
        path order.
    """
    # glob returns paths in filesystem order; sorting makes the ingestion
    # order reproducible between runs. The patterns only match .txt and .pdf
    # files, so no further filtering by archive extension is needed.
    txt_files = sorted(glob.glob(os.path.join(source_dir, '*.txt')))
    pdf_files = sorted(glob.glob(os.path.join(source_dir, '*.pdf')))

    results = []
    for file in txt_files + pdf_files:
        # Skip xls files that were saved in txt format.
        if file.endswith('xls.txt'):
            continue
        results.extend(load_single_document(file))

    return results

In [9]:
def process_documents() -> List[Document]:
    """
    Load the source documents and split them into token-based chunks.

    Reads the module-level ``source_directory``, ``chunk_size`` and
    ``chunk_overlap`` settings.

    Returns:
        The chunks produced by ``TokenTextSplitter.split_documents``.

    Raises:
        SystemExit: With exit code 0 when no documents could be loaded.
    """
    print(f"Loading documents from {source_directory}")
    documents = load_documents(source_directory)
    if not documents:
        print("No new documents to load")
        # Raise SystemExit explicitly instead of calling exit(0): inside an
        # IPython/Jupyter kernel `exit` only asks the kernel to shut down and
        # does not raise, so the code below would still run on an empty list.
        # In a plain Python script this is equivalent to exit(0).
        raise SystemExit(0)
    print(f"Loaded {len(documents)} new documents from {source_directory}")
    # Split by token count so that chunk_size / chunk_overlap are in tokens.
    text_splitter = TokenTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    texts = text_splitter.split_documents(documents)
    print(f"Split into {len(texts)} chunks of text (max. {chunk_size} tokens each)")
    return texts

In [10]:
def does_vectorstore_exist(persist_directory: str) -> bool:
    """
    Report whether ``persist_directory`` already holds a usable vectorstore.

    The store counts as present only when all of the following are found:
    an ``index`` sub-folder, both ``chroma-collections.parquet`` and
    ``chroma-embeddings.parquet``, and more than three ``.bin`` / ``.pkl``
    files inside ``index``.
    """
    # Without the index folder there is nothing to reuse.
    if not os.path.exists(os.path.join(persist_directory, 'index')):
        return False

    # Both parquet files must be present next to the index folder.
    required_files = ('chroma-collections.parquet', 'chroma-embeddings.parquet')
    for file_name in required_files:
        if not os.path.exists(os.path.join(persist_directory, file_name)):
            return False

    # Count the binary and pickle files that make up the index.
    index_files = [
        path
        for pattern in ('index/*.bin', 'index/*.pkl')
        for path in glob.glob(os.path.join(persist_directory, pattern))
    ]
    return len(index_files) > 3

In [13]:
def main():
    """
    Create the vector database, or append new documents to an existing one.

    Uses the module-level ``persist_directory`` and ``CHROMA_SETTINGS``.
    When ``does_vectorstore_exist`` reports an existing store, the source
    documents are re-processed and only chunks whose ``source`` metadata is
    not yet present in the store are added. Otherwise a new store is built
    from all chunks. The store is persisted in both cases.
    """
    # load embeddings model
    embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

    if does_vectorstore_exist(persist_directory):
        # Update and store locally vectorstore
        print(f"Appending to existing vectorstore at {persist_directory}")
        db = Chroma(persist_directory=persist_directory, embedding_function=embeddings, client_settings=CHROMA_SETTINGS)
        collection = db.get()
        # Sources already stored; chunks coming from these files are skipped
        # so that re-running the ingestion does not duplicate them.
        ingested_sources = {
            metadata['source']
            for metadata in collection['metadatas']
            if metadata and 'source' in metadata
        }
        # persist_directory is a string and cannot be called; the chunks come
        # from process_documents() and are filtered here instead.
        texts = [
            chunk
            for chunk in process_documents()
            if chunk.metadata.get('source') not in ingested_sources
        ]
        if texts:
            print(f"Creating embeddings. May take some minutes...")
            db.add_documents(texts)
        else:
            # Nothing new to embed; avoid calling add_documents with no input.
            print("No new documents to load")
    else:
        # Create and store locally vectorstore
        print("Creating new vectorstore")
        texts = process_documents()
        print(f"Creating embeddings. May take some minutes...")
        db = Chroma.from_documents(texts, embeddings, persist_directory=persist_directory, client_settings=CHROMA_SETTINGS)
    # Write the store to disk, then drop the local reference to it.
    db.persist()
    db = None

    print(f"Ingestion complete! You can now run query the vectorDB for context retrieval")

In [12]:
# Change the working directory to the Drive folder that holds the vector
# databases. The persist_directory values in this notebook are relative paths,
# so they are resolved against the directory selected here.
%cd /content/drive/MyDrive/final VectorDBs/Daiict

/content/drive/MyDrive/final VectorDBs/Daiict


In [None]:
# Run the ingestion. Only needed when creating the vector store or adding
# documents to it; skip this cell when you only want to query an existing
# store with the retrieval cells below.
main()

# Context retrieval on the vector database

In [None]:
# Load the sentence-embedding model used to embed queries. This is the same
# model name that main() uses when it builds the vector store.
embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

In [None]:
# Define the vector database folder to query and the matching Chroma settings.
# These assignments overwrite the persist_directory / CHROMA_SETTINGS values
# that were defined earlier for ingestion.
# NOTE(review): this folder name ("AllMpnet_..._500") differs from the one the
# ingestion section writes ('AllMini_Chroma_Tik_400'), while the embeddings
# loaded for retrieval are all-MiniLM-L6-v2. Confirm that the store in this
# folder was built with the same embedding model before querying it.
persist_directory = "AllMpnet_Chroma_Tik_500"
CHROMA_SETTINGS = Settings(
        chroma_db_impl='duckdb+parquet',
        persist_directory=persist_directory,
        anonymized_telemetry=False
)

In [None]:
# Open the vector database stored in persist_directory, using the embeddings
# model and client settings defined in the cells above.
db = Chroma(persist_directory=persist_directory, embedding_function=embeddings, client_settings=CHROMA_SETTINGS)



In [None]:
# Example question to send to the vector database.
query = "Name professors in cse department"

# Wrap the vector database in a retriever that uses the "mmr" search type and
# asks for k=2 results, then display the documents retrieved for the query
# (the last expression is rendered as the cell output).
retriever = db.as_retriever(search_type="mmr",search_kwargs={"k": 2})
retriever.get_relevant_documents(query)

In [None]:
# Retrieve context together with a score for each hit. Note that this reuses
# (and overwrites) the `query` variable from the previous cell.
# NOTE(review): confirm whether the returned score is a distance (lower is
# closer) or a similarity before interpreting it.
query = "Who are the Alumni Association Members?"
docs = db.similarity_search_with_score(query)
# Show only the first returned entry; raises IndexError if nothing is returned.
docs[0]