RAG (Retrieval Augmented Generation)

In [9]:
from langchain.document_loaders.pdf import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings 
from langchain.schema import Document 
from langchain_core.prompts import ChatPromptTemplate
from langchain.vectorstores.chroma import Chroma 
from dotenv import load_dotenv 
from langchain.chat_models import ChatOpenAI
import os
import shutil

Loading the Documents

In [3]:
DATA_PATH = './data/'

def load_documents():
  document_loader = PyPDFDirectoryLoader(DATA_PATH)
  return document_loader.load()

documents = load_documents()
print(documents[0])

page_content='Fakultät für\nElektrotechnik\nSeminararbeit\nto obtain the degree\nAbschlussarbeit in WMED-A\nThe Harvard Negotiation Concept and its\nRelevance to Mediation: A Case Study\ncreated by Markus Gisi\nFirst Supervisor Susanne Ihle\nTopic assigned on March 20, 2024\nSubmitted on June 6, 2024\nNon-Disclosure Agreement Y esMarkus Gisi\nIm Hinterfeld 3\n86456 Gablingen\nP +49 8230701015\nmarkus.gisi2@hs-augsburg.de\nEnrolment Number:\n2174789\nTechnische Hochschule\nAugsburg\nAn der Hochschule 1\nD-86161 Augsburg\nP +49 821 5586-0\nF +49 821 5586-3222\nwww.tha.de\ninfo@tha.de' metadata={'source': 'data\\some.pdf', 'page': 0}


Chunking

In [4]:
def split_text(documents: list[Document]):
  text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=300, # Size of each chunk in characters
    chunk_overlap=100, # Overlap between consecutive chunks
    length_function=len, # Function to compute the length of the text
    add_start_index=True, # Flag to add start index to each chunk
  )

  # Split documents into smaller chunks using text splitter
  chunks = text_splitter.split_documents(documents)
  print(f"Split {len(documents)} documents into {len(chunks)} chunks.")

  # Print example of page content and metadata for a chunk
  document = chunks[0]
  print(document.page_content)
  print(document.metadata)

  return chunks # Return the list of split text chunks

Storing Chunks in VectorDB(ChromaDB)

In [5]:
db = None
CHROMA_PATH = "./chroma"
def save_to_chroma(chunks: list[Document]):
  # If a Chroma database already exists, close it and delete the directory
  if os.path.exists(CHROMA_PATH):
    # Close the database connection
    db.close()  # Replace 'db.close()' with the actual method to close the database connection
    # Delete the directory
    shutil.rmtree(CHROMA_PATH)

  # Create a new Chroma database from the documents using OpenAI embeddings
  db = Chroma.from_documents(
    chunks,
    OpenAIEmbeddings(),
    persist_directory=CHROMA_PATH
  )

  # Persist the database to disk
  db.persist()
  print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")

In [6]:
def generate_data_store():
  documents = load_documents() # Load documents from a source
  chunks = split_text(documents) # Split documents into manageable chunks
  save_to_chroma(chunks) # Save the processed data to a data store

# Load environment variables from a .env file
load_dotenv()
# Generate the data store
generate_data_store()

Split 16 documents into 152 chunks.
Fakultät für
Elektrotechnik
Seminararbeit
to obtain the degree
Abschlussarbeit in WMED-A
The Harvard Negotiation Concept and its
Relevance to Mediation: A Case Study
created by Markus Gisi
First Supervisor Susanne Ihle
Topic assigned on March 20, 2024
Submitted on June 6, 2024
{'source': 'data\\some.pdf', 'page': 0, 'start_index': 0}


  warn_deprecated(


Saved 152 chunks to ./chroma.


  warn_deprecated(


Prompt

In [8]:
query_text='What is this document about?'

PROMPT_TEMPLATE = """
Answer the question based only on the following context:
{context}
 - -
Answer the question based on the above context: {question}
"""

Hybrid search in VectorDB with query_text and generation of the answer

In [10]:
def query_rag(query_text):
  # YOU MUST - Use same embedding function as before
  embedding_function = OpenAIEmbeddings()

  # Prepare the database
  db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)
  
  # Retrieving the context from the DB using similarity search
  results = db.similarity_search_with_relevance_scores(query_text, k=3)

  # Check if there are any matching results or if the relevance score is too low
  if len(results) == 0 or results[0][1] < 0.7:
    print(f"Unable to find matching results.")

  # Combine context from matching documents
  context_text = "\n\n - -\n\n".join([doc.page_content for doc, _score in results])
 
  # Create prompt template using context and query text
  prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
  prompt = prompt_template.format(context=context_text, question=query_text)
  
  # Initialize OpenAI chat model
  model = ChatOpenAI()

  # Generate response text based on the prompt
  response_text = model.predict(prompt)
 
   # Get sources of the matching documents
  sources = [doc.metadata.get("source", None) for doc, _score in results]
 
  # Format and return response including generated text and sources
  formatted_response = f"Response: {response_text}\nSources: {sources}"
  return formatted_response, response_text

# Let's call our function we have defined
formatted_response, response_text = query_rag(query_text)
# and finally, inspect our final response!
print(response_text)

Unable to find matching results.


  warn_deprecated(
  warn_deprecated(


The document is likely about mediation and negotiation processes, specifically in the context of NDA negotiations and organizational pressures.
