In [4]:
!pip install langchain==0.0.274
!pip install gpt4all==1.0.8
!pip install chromadb==0.4.7
!pip install llama-cpp-python
!pip install urllib3==2.0.4
!pip install PyMuPDF==1.23.1
!pip install python-dotenv==1.0.0
!pip install unstructured==0.10.8
!pip install extract-msg==0.45.0
!pip install tabulate==0.9.0
!pip install pandoc==2.3
!pip install pypandoc==1.11
!pip install tqdm==4.66.1
!pip install sentence_transformers
!pip install jq




In [5]:
# Standard Library Imports
import os
import glob
import time
from multiprocessing import Pool

# Third-Party Library Imports
from typing import List
from dotenv import load_dotenv
from tqdm import tqdm

# Langchain Imports
from langchain.document_loaders import (
    CSVLoader,
    EverNoteLoader,
    PyMuPDFLoader,
    TextLoader,
    JSONLoader
)
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document
from langchain.chains import RetrievalQA
from langchain.llms import GPT4All, LlamaCpp
from transformers import AutoModel, AutoTokenizer,  BertModel, BertTokenizer

# ChromaDB Imports
from chromadb.config import Settings
import chromadb

# Argument Parsing
import argparse

import torch  # Import PyTorch to check GPU availability

In [6]:
persist_directory = "/content/db/"
#persist_directory = "/content/drive/MyDrive/Colab/db/"
model_type = "GPT4All"
model_path = "/content/drive/MyDrive/Colab/Models/ggml-gpt4all-j-v1.3-groovy.bin"
#model_type = "LLAMA"
#model_path = "/content/drive/MyDrive/Colab/Models/llama_2_7b_chat_ggmlv3_q4_0.bin"
#source_directory = "/content/drive/MyDrive/Colab/SOR/"
source_directory = "./sor/"
embeddings_model_name = "all-MiniLM-L6-v2"
model_n_ctx = 1000
model_n_batch = 8
target_source_chunks = 4
chunk_size = 500
chunk_overlap = 50

In [7]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
# List files in the specified source directory
archivos = os.listdir(source_directory)

# Count the number of files in the directory
cantidad_de_archivos = len(archivos)

# Print the number of files in the directory
print(f"The folder '{source_directory}' contains {cantidad_de_archivos} files.")


The folder './sor/' contains 3 files.


In [9]:
# Check if GPU is available and set the device accordingly
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

Using device: cuda


In [10]:
# Create embeddings using Hugging Face model
# The 'embeddings_model_name' specifies the pre-trained model to use for embeddings.

# Load the model and tokenizer from transformers, specifying the device
embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [11]:
# Map file extensions to document loaders and their arguments
LOADER_MAPPING = {
    ".csv": (CSVLoader, {}),  # Use the CSVLoader for .csv files with no additional arguments.
    ".pdf": (PyMuPDFLoader, {}),  # Use the PyMuPDFLoader for .pdf files with no additional arguments.
    ".txt": (TextLoader, {"encoding": "utf8"}) # Use the TextLoader for .txt files with the specified UTF-8 encoding.
    # Add more mappings for other file extensions and loaders as needed
}


In [12]:
def load_single_document(file_path: str) -> List[Document]:
    # Extract the file extension from the given file path.
    ext = "." + file_path.rsplit(".", 1)[-1].lower()

    # Check if the file extension is in the LOADER_MAPPING dictionary.
    if ext in LOADER_MAPPING:
        # Get the loader class and loader arguments for the specified extension.
        loader_class, loader_args = LOADER_MAPPING[ext]

        # Create an instance of the loader class with the specified file path and arguments.
        loader = loader_class(file_path, **loader_args)

        # Load the document using the loader and return it.
        return loader.load()

    # If the file extension is not supported, raise a ValueError.
    raise ValueError(f"Unsupported file extension '{ext}'")

In [13]:
def load_documents(source_dir: str, ignored_files: List[str] = []) -> List[Document]:
    # Find all files in the source directory with extensions specified in LOADER_MAPPING.
    all_files = []
    for ext in LOADER_MAPPING:
        all_files.extend(
            glob.glob(os.path.join(source_dir, f"**/*{ext.lower()}"), recursive=True)
        )
        all_files.extend(
            glob.glob(os.path.join(source_dir, f"**/*{ext.upper()}"), recursive=True)
        )

    # Filter out files that are in the ignored_files list.
    filtered_files = [file_path for file_path in all_files if file_path not in ignored_files]

    # Use a multiprocessing Pool to load documents in parallel.
    with Pool(processes=os.cpu_count()) as pool:
        results = []
        # Create a progress bar for loading documents.
        with tqdm(total=len(filtered_files), desc='Loading new documents', ncols=80) as pbar:
            for i, docs in enumerate(pool.imap_unordered(load_single_document, filtered_files)):
                results.extend(docs)
                pbar.update()

    return results



In [14]:
def process_documents(ignored_files: List[str] = []) -> List[Document]:
    # Print a message indicating that documents are being loaded from the specified source directory.
    print(f"Loading documents from {source_directory}")

    # Load documents from the source directory, excluding any ignored files.
    documents = load_documents(source_directory, ignored_files)

    # Check if there are no documents to process and exit if that's the case.
    if not documents:
        print("No new documents to load")
        exit(0)

    # Print the number of loaded documents and the source directory.
    print(f"Loaded {len(documents)} new documents from {source_directory}")

    # Create a text splitter with the specified chunk size and overlap.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

    # Split the loaded documents into chunks of text using the text splitter.
    texts = text_splitter.split_documents(documents)

    # Print the number of text chunks created and the maximum chunk size.
    print(f"Split into {len(texts)} chunks of text (max. {chunk_size} tokens each)")

    # Return the resulting text chunks.
    return texts

In [15]:
def does_vectorstore_exist(persist_directory: str, embeddings: HuggingFaceEmbeddings) -> bool:
    # Create a Chroma vector store instance with the specified persist directory and embeddings.
    db = Chroma(persist_directory=persist_directory, embedding_function=embeddings)

    # Get the 'documents' data from the vector store. If it's empty, return False; otherwise, return True.
    if not db.get()['documents']:
        return False
    return True

In [16]:
# Check if the vector store already exists in the specified directory with the given embeddings.
if does_vectorstore_exist(persist_directory, embeddings):
    # If the vector store exists, append to it.
    print(f"Appending to existing vector store at {persist_directory}")

    # Create a Chroma vector store instance with the specified directory and embeddings.
    db = Chroma(persist_directory=persist_directory, embedding_function=embeddings)

    # Get the existing collection from the vector store.
    collection = db.get()

    # Extract source file paths from the collection's metadata.
    source_file_paths = [metadata['source'] for metadata in collection['metadatas']]

    # Process the documents based on the extracted source file paths.
    texts = process_documents(source_file_paths)

    # Inform the user about the embeddings creation process.
    print(f"Creating embeddings. May take some minutes...")

       # Check if 'texts' is not empty before adding documents to the vector store
    if texts:
        # Add the processed documents to the vector store.
        db.add_documents(texts)
    else:
        print("No documents to add. Skipping insertion.")
else:
    # If the vector store does not exist, create a new one.
    print("Creating a new vector store")

    # Process documents without specifying ignored files (default behavior).
    texts = process_documents()

    # Inform the user about the embeddings creation process.
    print(f"Creating embeddings. May take some minutes...")

    # Create a new Chroma vector store with the processed documents and embeddings.
    db = Chroma.from_documents(texts, embeddings, persist_directory=persist_directory)

# Persist the vector store.
db.persist()

# Clear the db variable to free up resources.
db = None

# Inform the user that the ingestion process is complete.
print(f"Ingestion complete! You can now run privateGPT.py to query your documents")


Appending to existing vector store at /content/db/
Loading documents from ./sor/


Loading new documents: 0it [00:00, ?it/s]

No new documents to load
Loaded 0 new documents from ./sor/
Split into 0 chunks of text (max. 500 tokens each)
Creating embeddings. May take some minutes...
No documents to add. Skipping insertion.
Ingestion complete! You can now run privateGPT.py to query your documents





In [17]:
# Create settings for Chroma database configuration
settings = Settings(
    persist_directory=persist_directory,  # Directory for persisting database data
    anonymized_telemetry=False  # Disable anonymized telemetry
)

# Create Hugging Face embeddings model
embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)

# Create a persistent Chroma database client
chroma_client = chromadb.PersistentClient(
    settings=settings,  # Database settings
    path=persist_directory  # Path to the database directory
)

# Create a Chroma vector store instance
db = Chroma(
    persist_directory=persist_directory,  # Directory for persisting vector store data
    embedding_function=embeddings,  # Embeddings function
    client_settings=settings,  # Database settings
    client=chroma_client  # Chroma client
)

# Create a retriever for document retrieval
retriever = db.as_retriever(
    search_kwargs={"k": target_source_chunks}  # Search settings (e.g., number of search results)
)


In [18]:
# Initialize an empty list to store callback handlers (you can add handlers here if needed)
callbacks = []

# Prepare the Language Model (LLM) based on the specified model_type
match model_type:
    case "LLAMA":
        # Create an instance of LlamaCpp
        llm = LlamaCpp(
            model_path=model_path,  # Path to the LlamaCpp model
            max_tokens=model_n_ctx,  # Maximum number of tokens in generated text
            n_batch=model_n_batch,  # Batch size for text generation
            callbacks=callbacks,    # List of callback handlers
            verbose=False           # Set to True for verbose output
        )
    case "GPT4All":
        # Create an instance of GPT4All
        llm = GPT4All(
            model=model_path,       # Path to the GPT4All model
            max_tokens=model_n_ctx, # Maximum number of tokens in generated text
            backend='gptj',        # Specify the backend (e.g., 'gptj')
            n_batch=model_n_batch,  # Batch size for text generation
            callbacks=callbacks,    # List of callback handlers
            verbose=False           # Set to True for verbose output
        )
    case _default:
        # Raise an exception if the model_type is not supported
        raise Exception(f"Model type {model_type} is not supported. Please choose one of the following: LlamaCpp, GPT4All")


Found model file at  /content/drive/MyDrive/Colab/Models/ggml-gpt4all-j-v1.3-groovy.bin


In [19]:
# Set 'hide_source' to True or False as needed
hide_source = True  # Set to True to hide source documents, False to show them

# Set 'mute_stream' to True or False as needed
mute_stream = True  # Set to True to mute stream output, False to allow it

# Create a RetrievalQA instance for question-answering
qa = RetrievalQA.from_chain_type(
    llm=llm,                       # The language model (LLM) instance
    chain_type="stuff",            # Chain type (specific to the application)
    retriever=retriever,           # Document retriever instance
    return_source_documents=not hide_source  # Whether to return source documents in responses
)


In [20]:
while True:
    # Prompt the user to enter a query
    query = input("\nEnter a query: ")

    # Check if the user wants to exit the loop
    if query == "exit":
        break

    # Check if the query is empty and continue to the next iteration if it is
    if query.strip() == "":
        continue

    # Get the answer from the question-answering system
    start = time.time()  # Record the start time for performance measurement
    res = qa(query)  # Query the question-answering system using the user's query
    answer, docs = res['result'], [] if hide_source else res['source_documents']  # Extract answer and source documents
    end = time.time()  # Record the end time for performance measurement

    # Print the result
    print("\n\n> Question:")  # Print the user's query
    print(query)
    print(f"\n> Answer (took {round(end - start, 2)} s.):")  # Print the answer and query response time
    print(answer)

    # Print the relevant sources used for the answer, if not hiding sources
    if not hide_source:
        for document in docs:
            print("\n> " + document.metadata["source"] + ":")  # Print the source document's metadata
            print(document.page_content)  # Print the content of the source document



Enter a query: Qué ID tiene el distribuidor Zapata?


> Question:
Qué ID tiene el distribuidor Zapata?

> Answer (took 172.81 s.):
 The given context does not provide information about the specific ID of the distributor named "ZapaTa".

Enter a query: Dame los distritos de Ford


> Question:
Dame los distritos de Ford

> Answer (took 170.72 s.):
 The following are the list of Distrito M's for Ford in Mexico, according to context provided: 
- M1 (1956) - sedan and wagon models
- M2 (1960) - station wagons only
- M3 (1963) - sedans and coupes
- M4 (1967) - fullsize cars with a V8 engine
- M5 (1970) - midsize car, available as a coupe or sedan 
- M6 (1972) - mid-sized luxury car that was also sold in the United States under the name Lincoln Continental.


KeyboardInterrupt: Interrupted by user