In [21]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
import os

### Define the data directory

In [22]:
# Directory (relative path) whose files are read and chunked in the preprocessing step below.
data_dir = "./text_data"

# Preprocessing for RAG

### 1. Chunking: Structuring Data for LLMs
Breaking large text blocks into smaller, meaningful segments.

The chunking logic uses a newline separator ("\n") so that splits fall on line boundaries, which helps keep chunks within the defined size while maintaining semantic coherence.



In [23]:
# Build the splitter once: its configuration is identical for every file, so
# re-creating it inside the loop is wasted work.
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=128, chunk_overlap=32, separator="\n"
)

# Sort the directory listing: os.listdir() returns entries in arbitrary order,
# which would make chunk order (and therefore file_texts[0]) vary between runs.
files = sorted(os.listdir(data_dir))

# Accumulates one Document per chunk, across all files.
file_texts = []

for file in files:
    file_path = os.path.join(data_dir, file)

    # Skip sub-directories (e.g. ".ipynb_checkpoints"); open() would raise on them.
    if not os.path.isfile(file_path):
        continue

    # Read the whole file as UTF-8 text.
    with open(file_path, "r", encoding="utf-8") as f:
        file_text = f.read()

    # Strip only the final extension so titles containing dots are preserved
    # ("v1.2 notes.txt" -> "v1.2 notes", not "v1").
    doc_title = os.path.splitext(file)[0]

    # Split the text into chunks.
    texts = text_splitter.split_text(file_text)

    # Wrap each chunk in a Document carrying its source title and position.
    for i, chunked_text in enumerate(texts):
        file_texts.append(
            Document(
                page_content=chunked_text,
                metadata={"doc_title": doc_title, "chunk_num": i},
            )
        )

Display the first chunk

In [24]:
from pprint import pprint

# Pretty-print the text content of the first chunk in the list.
first_chunk = file_texts[0]
pprint(first_chunk.page_content)

('Are you interested in joining our team at Big Star Collectibles? Take a look '
 'at our job listings below and apply to what suits you best.\n'
 'If we don’t currently have a job available that you’re hoping to fill, you '
 'can still apply. Just choose “Keep me in mind” as the job you’re applying '
 'for.  \n'
 'We can’t wait to hear from you!\n'
 'Associate E-commerce Web Developer')


### 2. Metadata: Enhancing RAG Applications
Additional data stored alongside embeddings to enhance search and retrieval.


Display the metadata of the first chunk

In [25]:
# Show the metadata dict stored alongside the first chunk.
pprint(file_texts[0].metadata)

{'chunk_num': 0, 'doc_title': 'Careers'}


### 3. Embeddings: Representing Data in Vector Form
Representing data in vector form to capture its semantic meaning.


Embed the chunks with HuggingFaceEmbeddings and store the resulting vectors in FAISS.


In [26]:
from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings

In [27]:
# Embedding model handed to the vector store to vectorize the chunks.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2") # embed your data

Store the embedded data into a vector database

In [28]:
# Build the FAISS vector store from the chunked Documents, using the
# embedding model configured above.
vector_store = FAISS.from_documents(documents=file_texts, embedding=embeddings)

# Querying
Add querying functionality using the `as_retriever` method and include an example query.


Prepare the vector store as a retriever

In [29]:
# Expose the vector store through a retriever; no arguments are passed, so the library defaults apply.
retriever = vector_store.as_retriever()

Query the vector store

In [30]:
# Example query; as the cell's last expression, the returned Document chunks are displayed.
retriever.invoke("What year was Big Star Collectibles Started?")

[Document(metadata={'doc_title': 'Our Story', 'chunk_num': 0}, page_content='Our story began at the International Arts Conference in 2013. Our founder, Saura Chen, a trained photographer, captured a series of candid images and portraits of the keynote speaker and presenters at the event, and provided print copies of the photographs to attendees at the end of the day. When she overheard a group of attendees attempting to secure autographs from the presenters and negotiating photo trades, the seeds for Big Star Collectibles were planted.'),
 Document(metadata={'doc_title': 'Our Story', 'chunk_num': 1}, page_content='Launched officially in 2014, Saura was determined to create high-quality trading cards that were desirable and valuable for the collecting community. Besides monthly releases for the casual collector, Big Star Collectibles also releases limited editions and one-of-a-kind items. \nBig Star Collectibles has grown over the years to include memorabilia, contests, events, appraisa

# Adding the LLM 
Include the LLM integration with OpenAI, a prompt template, and a query chain.

Load environment variables

In [31]:
from dotenv import load_dotenv

# Load environment variables via python-dotenv so that OPENAI_API_KEY can be
# read with os.getenv in a later cell. Presumably sourced from a local .env file - confirm its location.
load_dotenv()

True

Instantiate OpenAI LLM

In [32]:
from langchain_openai import OpenAI

# Read the key from the environment rather than hardcoding it in the notebook.
api_key = os.environ.get("OPENAI_API_KEY")

# Instantiate the OpenAI LLM used as the generation step of the chain.
llm = OpenAI(api_key=api_key)

Create prompt template.

Include source citation in the prompt template. The LLM will explicitly cite the sources from the vector store metadata.


In [33]:
from langchain.prompts import ChatPromptTemplate

# Prompt text with {question} and {context} placeholders. It is assembled from
# adjacent string literals purely for readability; the resulting string is a
# single template.
template = (
    "You are a helpful assistant. Use the following pieces of retrieved context to answer the question. "
    "If you don't know the answer, just say that you don't know. "
    "Use three sentences maximum and keep the answer concise.\n"
    "Cite your sources.\n"
    "Question: {question} \n"
    "Context: {context} \n"
    "Answer:"
)

prompt = ChatPromptTemplate.from_template(template)

Create the query chain

In [34]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# First stage: map the incoming query string to the two prompt variables.
# "context" is produced by the retriever, and "question" is the query passed
# through unchanged.
retrieval_inputs = {"context": retriever, "question": RunnablePassthrough()}

# Pipeline: prompt inputs -> filled prompt -> LLM -> plain string output.
chain = retrieval_inputs | prompt | llm | StrOutputParser()

Query the chain

In [35]:
# Run the full chain on an example question that also asks the model to cite its source.
response = chain.invoke("When did Big Star Collectibles Launch? Cite where you found this information.")

Display response

In [36]:
# Bare last expression: the notebook displays the chain's string output.
response

' Big Star Collectibles launched officially in 2014. This information can be found in the context under the document "Our Story" and the chunk number 1. '