# DATA INGESTION & CHUNKING

In [4]:
from langchain_community.document_loaders import DirectoryLoader,TextLoader, PyPDFLoader
from langchain_community.document_loaders.powerpoint import UnstructuredPowerPointLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [5]:
# Directory containing the source documents.
# Written as a raw string: in a normal literal "\G", "\R" and "\d" are invalid
# escape sequences (SyntaxWarning on Python 3.12+), and a folder name starting
# with e.g. "t" or "n" would silently become a tab/newline character.
directory_path = r"D:\GenAi\GenAi-Projects\RAG with multiple_docs using AstraDB\data"

# One DirectoryLoader per supported file type; each one picks up the files under
# directory_path that match its glob pattern and parses them with loader_cls.
txt_loader = DirectoryLoader(
    path=directory_path,
    glob="**/*.txt",
    loader_cls=TextLoader
)

pdf_loader = DirectoryLoader(
    path=directory_path,
    glob="**/*.pdf",
    loader_cls=PyPDFLoader
)

pptx_loader = DirectoryLoader(
    path=directory_path,
    glob="**/*.pptx",
    loader_cls=UnstructuredPowerPointLoader
)

# Load documents from each loader
txt_docs = txt_loader.load()
pdf_docs = pdf_loader.load()
pptx_docs = pptx_loader.load()

# Combine all documents into a single list (text files first, then PDFs, then slides)
all_docs = txt_docs + pdf_docs + pptx_docs

# Splitter used by the next cell; chunk_size / chunk_overlap are measured with the
# splitter's length function (characters unless configured otherwise).
splitter = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=70)

In [6]:
# Split every loaded document into chunks with the splitter configured above.
docs = splitter.split_documents(all_docs)
# Bare last expression so the notebook displays the number of chunks produced.
len(docs)

439

In [7]:
# Spot-check the text of a single chunk (index 11 is presumably an arbitrary sample).
print(docs[11].page_content)

Advanced RAG Techniques

While standard RAG provides a robust framework for knowledge-augmented generation, several advanced techniques have been developed to further enhance its performance:

1. Context-Aware Retrieval: This involves using query expansion, query rewriting, or user history to improve retrieval accuracy. It ensures that the most contextually relevant documents are retrieved.


In [8]:
import os
from dotenv import load_dotenv
# Load variables from a .env file into the process environment so the
# os.getenv(...) lookups in the following cells can find the credentials.
load_dotenv()
from langchain_core.documents import Document
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_groq import ChatGroq
from langchain_huggingface.embeddings import HuggingFaceEmbeddings

In [9]:
# Astra DB connection settings: the keyspace is fixed, while the endpoint and the
# application token come from the environment (None when a variable is not set).
ASTRA_DB_KEYSPACE = 'default_keyspace'
ASTRA_DB_API_ENDPOINT = os.environ.get('ASTRA_DB_API_ENDPOINT')
ASTRA_DB_APPLICATION_TOKEN = os.environ.get('ASTRA_DB_APPLICATION_TOKEN')

In [10]:
# Groq API key read from the environment (None when GROQ_API_KEY is not set).
groq_api_key = os.environ.get('GROQ_API_KEY')

In [11]:
# Embedding model handed to the vector store below.
embedding_model = HuggingFaceEmbeddings(
    model_name='sentence-transformers/all-MiniLM-L6-v2',
)

# Groq-hosted chat model used for answer generation; temperature is pinned to 0.
llm = ChatGroq(
    model_name="meta-llama/llama-4-scout-17b-16e-instruct",
    temperature=0,
    api_key=groq_api_key,
)

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
from langchain_astradb import AstraDBVectorStore
from langchain.indexes import VectorstoreIndexCreator   

# EMBEDDING AND STORING

In [13]:
# Vector store handle for the 'multidoc_vector' collection.
db = AstraDBVectorStore(
    # Where to connect and how to authenticate.
    api_endpoint=ASTRA_DB_API_ENDPOINT,
    token=ASTRA_DB_APPLICATION_TOKEN,
    namespace=ASTRA_DB_KEYSPACE,
    # What to store into and which model produces the embeddings.
    collection_name='multidoc_vector',
    embedding=embedding_model,
)

In [14]:
# Write every chunk to the vector store and keep the value returned by add_documents.
# NOTE(review): no explicit ids are passed, so re-running this cell (e.g. after an
# interrupted run) presumably inserts the same chunks again — confirm, and consider
# passing deterministic ids if duplicates matter.
inserted_ides = db.add_documents(docs)

KeyboardInterrupt: 

In [None]:
# Report how many entries the insert call returned.
print(f"\nInserted {len(inserted_ides)} documents.")


Inserted 439 documents.


In [None]:
# Retriever view over the vector store; k=5 is passed through as a search argument
# (presumably the number of chunks returned per query — confirm).
retriver = db.as_retriever(search_kwargs={'k': 5})

In [None]:
from langchain_core.prompts import ChatPromptTemplate

# PROMPT DESIGNING

In [None]:
# Prompt for the RAG chain. It exposes two template variables:
#   {context}  - filled with the retrieved documents
#   {question} - the user's query
# Wording fixes: "Your a an" -> "You are an", the duplicated "only" is removed, and
# the fallback phrase the model is asked to emit is spelled correctly and quoted.
prompt = ChatPromptTemplate.from_template(
    '''
    You are an AI researcher who is an expert in RAG systems.
    Answer any question asked by the user.
    Construct answers in the form of bullet points.
    Craft your response only from the provided context.
    If you cannot find any related information in the context, simply say "No context provided."
    Do not hallucinate.
    
    <context>
    {context}
    </context>
    
    QUESTION:{question}
    '''
)

In [None]:
from langchain.chains.combine_documents import create_stuff_documents_chain
# Combine the LLM and the prompt into a chain; it is invoked later with the
# retrieved documents under "context" and the user's query under "question".
document_chain = create_stuff_documents_chain(llm, prompt)

# GENERATION

In [None]:
# Ask the user for a question. The original input(str()) was just input("") and
# displayed no prompt text at all.
user_prompt = input("Ask a question about the loaded documents: ").strip()
if not user_prompt:
    # Fail fast rather than spending a retrieval + LLM call on an empty query.
    raise ValueError("No question entered; please provide a non-empty question.")

# Get relevant chunks from the vector store; they are stuffed into the prompt as {context}.
relevant_info = retriver.invoke(user_prompt)
# Feed the retrieved documents and the user's question to the model.
response = document_chain.invoke({"context": relevant_info, "question": user_prompt})
print(response)


Here is the answer to your question in bullet points:

* Yunfan Gaoa is a researcher affiliated with the Shanghai Research Institute for Intelligent Autonomous Systems, Tongji University. 
* He is one of the authors of the survey paper "Retrieval-Augmented Generation for Large Language Models: A Survey".
