# Simple Chatbot with RAG Pipeline (LangChain)

### Step 1: Document ingestion

1.  Redact private info
2.  Text preprocessing : chunking

describe process flow for the section

In [1]:
#import libraries


In [2]:
def load_pdf(file_path):
    '''function to ingest pdf documents '''
    from langchain_community.document_loaders import PyPDFLoader #library required
    
    if isinstance(file_path,str):
        #initialize file path
        file_path = file_path

        #lnitialize pdf loader
        pdf_loader = PyPDFLoader(file_path)

         #load documents
        document = pdf_loader.load()
        print(f"Document loaded successfully.\n Document has {len(document)} pages from the pdf")

        return document
    else:
        raise TypeError("The file_path must be a string")

In [3]:
#load document
hr_manual = load_pdf('hr_manual.pdf')

#preview first 500 characters
#print(hr_manual[20].page_content[:500])

#data type
#print(type(hr_manual))

Document loaded successfully.
 Document has 185 pages from the pdf


In [4]:
#check data type for each element in list
#for item in hr_manual[:10]:
    #print(type(item))


In [5]:
"""
def remove_pdf_metadata(input_pdf, output_pdf):
    from PyPDF2 import PdfReader, PdfWriter
    ''' handle privacy concerns'''

    # Read the PDF file
    reader = PdfReader(input_pdf)
    writer = PdfWriter()

    # Add all pages to the writer
    for page in reader.pages:
        writer.add_page(page)
    
    # Explicitly set metadata to an empty dictionary 
    writer.add_metadata({})  # This removes metadata 

    # Write the modified PDF to a new file
    with open(output_pdf, 'wb') as output_file:
        writer.write(output_file)

    print(f"Metadata removed. Cleaned PDF saved to '{output_pdf}'.")

remove sensitive info
remove_pdf_metadata("hr_manual.pdf", "hr_manual_cleaned.pdf") 

"""

'\ndef remove_pdf_metadata(input_pdf, output_pdf):\n    from PyPDF2 import PdfReader, PdfWriter\n    \'\'\' handle privacy concerns\'\'\'\n\n    # Read the PDF file\n    reader = PdfReader(input_pdf)\n    writer = PdfWriter()\n\n    # Add all pages to the writer\n    for page in reader.pages:\n        writer.add_page(page)\n    \n    # Explicitly set metadata to an empty dictionary \n    writer.add_metadata({})  # This removes metadata \n\n    # Write the modified PDF to a new file\n    with open(output_pdf, \'wb\') as output_file:\n        writer.write(output_file)\n\n    print(f"Metadata removed. Cleaned PDF saved to \'{output_pdf}\'.")\n\nremove sensitive info\nremove_pdf_metadata("hr_manual.pdf", "hr_manual_cleaned.pdf") \n\n'

In [6]:
#load document
#hr_manual = load_pdf('hr_manual_cleaned.pdf')

#preview first 500 characters
#print(hr_manual[50].page_content[:500])

#data type
#print(type(hr_manual))

In [7]:
#verify removal of sensitive info
#print(hr_manual[25].metadata)

In [8]:
def clean_text(text):
    import re
    
    # Remove unwanted characters like /n and unicode escape sequences
    text = re.sub(r'\\n|\\u[0-9a-fA-F]+', ' ', text)  # Replaces newline and unicode escape characters with a space
    
    # Remove extra spaces and line breaks
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with one space
    text = text.strip()  # Remove leading/trailing spaces
    
    # Optionally, remove unwanted page numbers or numeric values if they exist
    text = re.sub(r'\d+', '', text)  # Remove numbers if they're irrelevant
    
    return text

In [9]:
def extract_and_clean_pdf(file_path):
    import fitz
    # Open the PDF
    document = fitz.open(file_path)

    cleaned_text = []

    # Iterate through each page
    for page_num in range(document.page_count):
        page = document.load_page(page_num)
        
        # Extract raw text from the page
        raw_text = page.get_text("text")
        
        # Clean the text
        cleaned_page_text = clean_text(raw_text)
        
        # Append the cleaned text for each page
        cleaned_text.append(cleaned_page_text)
    
    return cleaned_text

In [10]:
def save_cleaned_text_to_file(cleaned_text, output_file_path):
    with open(output_file_path, 'w') as f:
        for page_content in cleaned_text:
            f.write(page_content + "\n\n")  # Separate pages with extra line breaks

In [11]:
def load_cleaned_text(file_path):
    with open(file_path, 'r') as f:
        # Read the entire content of the file
        cleaned_text = f.read()
    
    return cleaned_text

In [12]:
def chunk_by_paragraphs(text, chunk_size=512):
    paragraphs = text.split('\n\n')  # Assuming paragraphs are separated by two newlines
    chunks = []
    current_chunk = ""
    
    for paragraph in paragraphs:
        if len(current_chunk) + len(paragraph) <= chunk_size:
            current_chunk += "\n\n" + paragraph
        else:
            chunks.append(current_chunk.strip())  # Save the current chunk
            current_chunk = paragraph  # Start a new chunk with the current paragraph
    
    if current_chunk:
        chunks.append(current_chunk.strip())  # Append the last chunk
    
    return chunks

In [13]:
"""
def chunking_doc(document):
    from langchain_text_splitters import RecursiveCharacterTextSplitter

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=64)
    documents = text_splitter.split_documents(document)
    print(f"Document chunked successfully.\n Document has {len(documents)} pages from the pdf")

    return documents

"""

'\ndef chunking_doc(document):\n    from langchain_text_splitters import RecursiveCharacterTextSplitter\n\n    text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=64)\n    documents = text_splitter.split_documents(document)\n    print(f"Document chunked successfully.\n Document has {len(documents)} pages from the pdf")\n\n    return documents\n\n'

### Step 2: Create Embeddings and FAISS INDEX

1.  Embedding model: "sentence-transformers/all-MiniLM-L6-v2"
2.  Cuda enabled

describe process flow for section:

In [14]:
def initialize_embedding(model_name):
    "initialize embedding model"
    from sentence_transformers import SentenceTransformer #load library

    if isinstance(model_name,str):
        model_name = model_name

        #initialize embedding model using Sentence Transformer
        model_embedding = SentenceTransformer(model_name)

        #save model for reuse
        model_embedding.save("./embedding_model") #saves config + model weights

        return model_embedding
    else:
        raise TypeError("The model_name must be a string.")


In [15]:
def load_embedding_model(file_path,device_name):

    '''Function to load embedding model '''  
    from langchain_community.embeddings import HuggingFaceEmbeddings

    if isinstance(file_path,str) and isinstance(device_name,str):
        emb_model_id = file_path
        model_kwargs = {'device': device_name}
        encode_kwargs = {"normalize_embeddings": False}

        #initialize embedding model
        embedding_model= HuggingFaceEmbeddings(model_name=emb_model_id,
                                               model_kwargs= model_kwargs,
                                               encode_kwargs=encode_kwargs
                                               )
    
        return embedding_model
    else:
        raise TypeError("The file path and device_name must be a string.")

In [16]:
def initialize_faiss(document,embedding_model):
    ''' Create and save FAISS index'''
    import faiss
    from langchain.schema import Document
    from langchain_community.vectorstores import FAISS

    # Wrap the text chunks into Document objects with page_content
    documents = [Document(page_content=doc) for doc in document]

    #create FAISS index
    faiss_index = FAISS.from_documents(documents,embedding_model)

    #save FAISS index
    faiss_index.save_local("./faiss_index")

    return faiss_index  



In [17]:
def load_faiss_index(file_path,embedding_model):
    '''load saved faiss index'''
    import faiss
    from langchain_community.vectorstores import FAISS
    from langchain_community.embeddings import HuggingFaceEmbeddings
    
    if isinstance(file_path,str):
        faiss_index = FAISS.load_local(file_path,
                                       embedding_model,
                                       allow_dangerous_deserialization=True ) # Enable safe loading
        return faiss_index
    
    else:
        raise TypeError('The file path must be string.')

### Step 3: Integrate RAG pipeline

1. create pretrained model pipeline
2. setup RAG pipeline
3. model_id : "google/flan-t5-small"

In [None]:
def rag_pipeline(model_name,faiss_index):
    '''Set up pipeline with RAG for question and answers '''
    from langchain.chains import create_retrieval_chain
    from langchain.chains.combine_documents import create_stuff_documents_chain
    from langchain_core.prompts import ChatPromptTemplate
    from langchain_community.vectorstores import FAISS
    from langchain_core.vectorstores import VectorStoreRetriever
    #from langchain_community.llms import HuggingFacePipeline
    from langchain_huggingface import HuggingFacePipeline
    from transformers import T5Tokenizer, T5ForConditionalGeneration, pipeline
    import torch
    import sentencepiece



    #initialize llm and tokenizer
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    llm_model = T5ForConditionalGeneration.from_pretrained(model_name,
                                                           device_map = "auto",
                                                           torch_dtype = torch.bfloat16
                                                           )
    
    
    do_sample = True if temperature > 0 else False

    #initialize llm pipeline
    llm_pipeline = pipeline("text2text-generation",
                            model=llm_model,
                            tokenizer=tokenizer,
                            max_new_tokens=256,
                            truncation= True,
                            device_map='auto',
                            temperature = 0.4,
                            top_p = 0.9,
                            do_sample = do_sample
                            )

    #Wrap in LangChain LLM wrapper
    llm = HuggingFacePipeline(pipeline=llm_pipeline)

    
    #initialize retriever
    retriever = faiss_index.as_retriever(search_kwargs={"k":10}) #retrieves top 5 relevant chunks

    #define Prompt
    system_prompt = ("You are a HR assistant. " 
                     "Use the given context to answer the question, "
                     "combining information where necessary."
                     "If you don't know the answer, say you don't know. "
                     "Provide a synthesized response in 2-3 sentences maximum. "
                     "Context: {context}")

    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", system_prompt),
            ("human", "{input}")
            ]
            )

    #create document combination chain
    question_answer_chain = create_stuff_documents_chain(llm,#hugginfacepipeline wrapped model
                                                         prompt
                                                         )  

    #create retrieval chain
    rag_chain = create_retrieval_chain(retriever,question_answer_chain)  

    return rag_chain
  

In [19]:
def ask_question(rag_chain, query):
    """Function to test the RAG pipeline with a sample question."""
    response = rag_chain.invoke({"input": query})

    print("Raw Response:", response)  # Debugging step to see the response structure

    return response.get("answer", "I am sorry, I am not sure how to respond.")

#### simulation test

In [20]:
#step 1 Document ingestion and processing

#load document
#hr_manual = load_pdf('hr_manual.pdf')

#preprocess text
#hr_manual_cleaned = extract_and_clean_pdf('hr_manual.pdf')

#save_cleaned_text_to_file(hr_manual_cleaned, 'cleaned_hr_manual.txt')

#load processed document
hr_manual = load_cleaned_text('cleaned_hr_manual.txt')

# Chunk the document
chunked_hr_manual = chunk_by_paragraphs(hr_manual)






In [21]:
#Step 2 Create EMbeddings and FAISS index

model_emb_id="sentence-transformers/all-MiniLM-L6-v2"
emb_file_path = "./embedding_model"
device_name = 'cuda'
faiss_file_path = "./faiss_index"

#intialize embedding model
#emb_model= initialize_embedding(model_emb_id)

#load saved model with Huggin face embeddings
embedding_model = load_embedding_model(emb_file_path,device_name)

#create faiss index
#faiss_index = initialize_faiss(chunked_hr_manual,embedding_model)

#load faiss index
faiss_index = load_faiss_index(faiss_file_path,embedding_model)



  embedding_model= HuggingFaceEmbeddings(model_name=emb_model_id,


In [22]:
#Step 3 Inference

model_id_llm = "google/flan-t5-small"

#initialize RAG inference pipeline
rag_retrieval = rag_pipeline(model_id_llm, faiss_index)

question = "What is the company's policy on study leave?"
answer = ask_question(rag_retrieval, question)
print("AI Response:", answer)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Device set to use cuda:0


Raw Response: {'input': "What is the company's policy on study leave?", 'context': [Document(id='01895847-9d00-46a5-9f26-2a2118e3e6cd', metadata={}, page_content='-  - a) application for study leave by the employee and approved by Manager/Director, b) letter of acceptance from the academic institution indicating the intended course of study, c) indication from the Manager/Director whether a replacement will be needed. This information is to be submitted to the Human Resource Manager, through the Branch Manager or Director. .. USelection for study leave Selection for study leave with full pay will be based on the following considerations: a) Employees must be permanently employed and would have been in the employ of the Agency for a minimum period of two () years at the time of making the application b) The course must be relevant to the business of the Agency c) Employee’s performance must be satisfactory (% and over in the annual appraisal ) d) The number of employees in the Agency cu