In [37]:
import os
from dotenv import load_dotenv
load_dotenv()

True

In [38]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_openai import OpenAIEmbeddings
from langchain.schema import Document

#vectorstores
from langchain_community.vectorstores import Chroma

#utility
import numpy as np
from typing import List

In [39]:
# RAG Architecture Overview
print("""
RAG (Retrieval-Augmented Generation) Architecture:

1. Document Loading: Load documents from various sources
2. Document Splitting: Break documents into smaller chunks
3. Embedding Generation: Convert chunks into vector representations
4. Vector Storage: Store embeddings in ChromaDB
5. Query Processing: Convert user query to embedding
6. Similarity Search: Find relevant chunks from vector store
7. Context Augmentation: Combine retrieved chunks with query
8. Response Generation: LLM generates answer using context

Benefits of RAG:
- Reduces hallucinations
- Provides up-to-date information
- Allows citing sources
- Works with domain-specific knowledge
""")


RAG (Retrieval-Augmented Generation) Architecture:

1. Document Loading: Load documents from various sources
2. Document Splitting: Break documents into smaller chunks
3. Embedding Generation: Convert chunks into vector representations
4. Vector Storage: Store embeddings in ChromaDB
5. Query Processing: Convert user query to embedding
6. Similarity Search: Find relevant chunks from vector store
7. Context Augmentation: Combine retrieved chunks with query
8. Response Generation: LLM generates answer using context

Benefits of RAG:
- Reduces hallucinations
- Provides up-to-date information
- Allows citing sources
- Works with domain-specific knowledge



# sample data

In [40]:
sample_docs = [
    """
    Machine Learning Fundamentals
    
    Machine learning is a subset of artificial intelligence that enables systems to learn 
    and improve from experience without being explicitly programmed. There are three main 
    types of machine learning: supervised learning, unsupervised learning, and reinforcement 
    learning. Supervised learning uses labeled data to train models, while unsupervised 
    learning finds patterns in unlabeled data. Reinforcement learning learns through 
    interaction with an environment using rewards and penalties.
    """,
    
    """
    Deep Learning and Neural Networks
    
    Deep learning is a subset of machine learning based on artificial neural networks. 
    These networks are inspired by the human brain and consist of layers of interconnected 
    nodes. Deep learning has revolutionized fields like computer vision, natural language 
    processing, and speech recognition. Convolutional Neural Networks (CNNs) are particularly 
    effective for image processing, while Recurrent Neural Networks (RNNs) and Transformers 
    excel at sequential data processing.
    """,
    
    """
    Natural Language Processing (NLP)
    
    NLP is a field of AI that focuses on the interaction between computers and human language. 
    Key tasks in NLP include text classification, named entity recognition, sentiment analysis, 
    machine translation, and question answering. Modern NLP heavily relies on transformer 
    architectures like BERT, GPT, and T5. These models use attention mechanisms to understand 
    context and relationships between words in text.
    """
]
sample_docs

['\n    Machine Learning Fundamentals\n\n    Machine learning is a subset of artificial intelligence that enables systems to learn \n    and improve from experience without being explicitly programmed. There are three main \n    types of machine learning: supervised learning, unsupervised learning, and reinforcement \n    learning. Supervised learning uses labeled data to train models, while unsupervised \n    learning finds patterns in unlabeled data. Reinforcement learning learns through \n    interaction with an environment using rewards and penalties.\n    ',
 '\n    Deep Learning and Neural Networks\n\n    Deep learning is a subset of machine learning based on artificial neural networks. \n    These networks are inspired by the human brain and consist of layers of interconnected \n    nodes. Deep learning has revolutionized fields like computer vision, natural language \n    processing, and speech recognition. Convolutional Neural Networks (CNNs) are particularly \n    effective f

In [41]:
#save sample document to file(method - 1)
import tempfile
temp_dir = tempfile.mkdtemp()

for i,doc in enumerate(sample_docs):
    with open(f"{temp_dir}doc_{i}.txt","w")as f:
        f.write(doc)
print(f"sample document created : {temp_dir}")

sample document created : /var/folders/5r/svqhb71956990v7rdb94sdmh0000gn/T/tmplibfu7v5


In [42]:
#save sample document file(method - 2)
import tempfile
temp_dir = tempfile.mkdtemp()

for i,doc in enumerate(sample_docs):
    with open(f"doc_{i}.txt","w")as f:
        f.write(doc)

In [43]:
temp_dir

'/var/folders/5r/svqhb71956990v7rdb94sdmh0000gn/T/tmppi_d0f4j'

In [44]:
#document loading
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import TextLoader
#load doc from directory
load_doc = DirectoryLoader(
    "/Users/dhrutamacm2/Desktop/rag_learn/Data_ingestion_parsing/data",
    glob="*.txt",
    loader_cls=TextLoader,
    loader_kwargs={'encoding' : 'utf-8'}
)
documents = load_doc.load()
print(len(documents))

3


In [45]:
documents

[Document(metadata={'source': '/Users/dhrutamacm2/Desktop/rag_learn/Data_ingestion_parsing/data/doc_2.txt'}, page_content='\n    Natural Language Processing (NLP)\n\n    NLP is a field of AI that focuses on the interaction between computers and human language. \n    Key tasks in NLP include text classification, named entity recognition, sentiment analysis, \n    machine translation, and question answering. Modern NLP heavily relies on transformer \n    architectures like BERT, GPT, and T5. These models use attention mechanisms to understand \n    context and relationships between words in text.\n    '),
 Document(metadata={'source': '/Users/dhrutamacm2/Desktop/rag_learn/Data_ingestion_parsing/data/doc_0.txt'}, page_content='\n    Machine Learning Fundamentals\n\n    Machine learning is a subset of artificial intelligence that enables systems to learn \n    and improve from experience without being explicitly programmed. There are three main \n    types of machine learning: supervised l

# text splitting

In [46]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 500, #maximum size of each chunk
    chunk_overlap = 50,
    length_function = len,
    separators= ["\n\n","\n",". "," ",""]
)
chunks = text_splitter.split_documents(documents)

In [47]:
print(len(chunks))

7


In [48]:
print(chunks[0].page_content)

Natural Language Processing (NLP)

    NLP is a field of AI that focuses on the interaction between computers and human language. 
    Key tasks in NLP include text classification, named entity recognition, sentiment analysis, 
    machine translation, and question answering. Modern NLP heavily relies on transformer 
    architectures like BERT, GPT, and T5. These models use attention mechanisms to understand 
    context and relationships between words in text.


# embedding model

In [49]:
import os
os.environ['OPENAI_API_KEY'] = os.getenv("OPENAI_API_KEY")

In [50]:
from langchain_openai import OpenAIEmbeddings
embeddings = OpenAIEmbeddings()
embeddings

OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x117b2e490>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x117b2ee90>, model='text-embedding-ada-002', dimensions=None, deployment='text-embedding-ada-002', openai_api_version=None, openai_api_base=None, openai_api_type=None, openai_proxy=None, embedding_ctx_length=8191, openai_api_key=SecretStr('**********'), openai_organization=None, allowed_special=None, disallowed_special=None, chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None, http_async_client=None, check_embedding_ctx_length=True)

# Initialize chromaDB and store it in Vector Represenation

In [51]:
#create cromadb directory
dire = "./chromadb"

#initialize chromadb with Open AI Embedding
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=OpenAIEmbeddings(),
    persist_directory=dire,
    collection_name="rag_db"
)
print(f"persisted to {dire}")

persisted to ./chromadb


# Text similarity search

In [52]:
query = "what is machine learning"

similar_doc = vectorstore.similarity_search(query,k = 2)
result = vectorstore.similarity_search_with_score(query,k =2)
for doc,score in result:
    print(score)

0.2332560122013092
0.233473539352417


# Initialize LLMs

In [53]:
#method 1 of initialize
from langchain_openai import ChatOpenAI
llm = ChatOpenAI(
    model="gpt-4o",
    temperature=0.2,
    max_completion_tokens=100
)
response = llm.invoke("what is LLMs")
print(response)

content='LLMs stands for "Large Language Models." These are a type of artificial intelligence model designed to understand and generate human-like text based on the input they receive. They are typically built using deep learning techniques, particularly neural networks, and are trained on vast amounts of text data to learn patterns, grammar, context, and even some level of reasoning.\n\nSome key characteristics of LLMs include:\n\n1. **Scale**: They have a large number of parameters, often in the billions, which allows them' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 100, 'prompt_tokens': 12, 'total_tokens': 112, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-2024-08-06', 'system_fingerprint': 'fp_80956533cb', 'id': 'chatcmpl-C5bDzIFAAgf9Q3BIqZELwQ6lrPeAY', '

# RAG chain

In [54]:
from langchain.chains import create_retrieval_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain

In [55]:
#convert vector store to retriver
retriver = vectorstore.as_retriever(
    search_kwargs={"k":3}
)
retriver

VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x117b2f390>, search_kwargs={'k': 3})

In [56]:
#create prompt template
system_prompt = """You are an assistant for question-answering tasks. 
Use the following pieces of retrieved context to answer the question. 
If you don't know the answer, just say that you don't know. 
Use three sentences maximum and keep the answer concise.


context: {context}"""
prompt = ChatPromptTemplate.from_messages([
    ("system" , system_prompt),
    ("human" , "{input}")
])

In [57]:
#create document chain
from langchain.chains.combine_documents import create_stuff_documents_chain
document_loader = create_stuff_documents_chain(llm,prompt)
document_loader

RunnableBinding(bound=RunnableBinding(bound=RunnableAssign(mapper={
  context: RunnableLambda(format_docs)
}), kwargs={}, config={'run_name': 'format_inputs'}, config_factories=[])
| ChatPromptTemplate(input_variables=['context', 'input'], input_types={}, partial_variables={}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, template="You are an assistant for question-answering tasks. \nUse the following pieces of retrieved context to answer the question. \nIf you don't know the answer, just say that you don't know. \nUse three sentences maximum and keep the answer concise.\n\n\ncontext: {context}"), additional_kwargs={}), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['input'], input_types={}, partial_variables={}, template='{input}'), additional_kwargs={})])
| ChatOpenAI(client=<openai.resources.chat.completions.completions.Completions object at 0x117b2efd0>, async_client=<openai.resource

	•	Retriever = librarian (finds the right books).
	•	StuffDocumentsChain = writer (uses the books + question to write an essay).
	•	RetrievalChain = project manager (connects librarian → writer → final answer).

# final rag chain 

In [58]:
final_rag = create_retrieval_chain(retriver,document_loader)
final_rag

RunnableBinding(bound=RunnableAssign(mapper={
  context: RunnableBinding(bound=RunnableLambda(lambda x: x['input'])
           | VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x117b2f390>, search_kwargs={'k': 3}), kwargs={}, config={'run_name': 'retrieve_documents'}, config_factories=[])
})
| RunnableAssign(mapper={
    answer: RunnableBinding(bound=RunnableBinding(bound=RunnableAssign(mapper={
              context: RunnableLambda(format_docs)
            }), kwargs={}, config={'run_name': 'format_inputs'}, config_factories=[])
            | ChatPromptTemplate(input_variables=['context', 'input'], input_types={}, partial_variables={}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, template="You are an assistant for question-answering tasks. \nUse the following pieces of retrieved context to answer the question. \nIf you don't

In [59]:
out = final_rag.invoke({"input" : "do u have any idea about india"})

In [60]:
print(out['answer'])

Yes, India is a country in South Asia, known for its diverse culture, languages, and history. It is the world's second-most populous country and has a rapidly growing economy. India is also recognized for its contributions to technology, science, and arts.


In [61]:
# Function to query the modern RAG system
def query_rag_modern(question):
    print(f"Question: {question}")
    print("-" * 50)
    
    # Using create_retrieval_chain approach
    result = final_rag.invoke({"input": question})
    
    print(f"Answer: {result['answer']}")
    print("\nRetrieved Context:")
    for i, doc in enumerate(result['context']):
        print(f"\n--- Source {i+1} ---")
        print(doc.page_content[:200] + "...")
    
    return result

# Test queries
test_questions = [
    "What are the three types of machine learning?",
    "What is deep learning and how does it relate to neural networks?",
    "What are CNNs best used for?"
]

for question in test_questions:
    result = query_rag_modern(question)
    print("\n" + "="*80 + "\n")

Question: What are the three types of machine learning?
--------------------------------------------------
Answer: The three types of machine learning are supervised learning, unsupervised learning, and reinforcement learning.

Retrieved Context:

--- Source 1 ---
Machine learning is a subset of artificial intelligence that enables systems to learn 
    and improve from experience without being explicitly programmed. There are three main 
    types of machine l...

--- Source 2 ---
Machine learning is a subset of artificial intelligence that enables systems to learn 
    and improve from experience without being explicitly programmed. There are three main 
    types of machine l...

--- Source 3 ---
Machine learning is a subset of artificial intelligence that enables systems to learn 
    and improve from experience without being explicitly programmed. There are three main 
    types of machine l...


Question: What is deep learning and how does it relate to neural networks?
----------

# Adding new docs in existing vector store


In [62]:
vectorstore

<langchain_community.vectorstores.chroma.Chroma at 0x117b2f390>

In [63]:
new_document = """
Reinforcement Learning in Detail

Reinforcement learning (RL) is a type of machine learning where an agent learns to make 
decisions by interacting with an environment. The agent receives rewards or penalties 
based on its actions and learns to maximize cumulative reward over time. Key concepts 
in RL include: states, actions, rewards, policies, and value functions. Popular RL 
algorithms include Q-learning, Deep Q-Networks (DQN), Policy Gradient methods, and 
Actor-Critic methods. RL has been successfully applied to game playing (like AlphaGo), 
robotics, and autonomous systems.
"""

In [64]:
new_doc = Document(
    page_content=new_document,
    metadata={"source" : "manual" , "topic" : "reinforcement learning"}
)


In [65]:
print(new_doc.page_content)


Reinforcement Learning in Detail

Reinforcement learning (RL) is a type of machine learning where an agent learns to make 
decisions by interacting with an environment. The agent receives rewards or penalties 
based on its actions and learns to maximize cumulative reward over time. Key concepts 
in RL include: states, actions, rewards, policies, and value functions. Popular RL 
algorithms include Q-learning, Deep Q-Networks (DQN), Policy Gradient methods, and 
Actor-Critic methods. RL has been successfully applied to game playing (like AlphaGo), 
robotics, and autonomous systems.



In [66]:
#split this new_doc
new_chunks = text_splitter.split_documents([new_doc])

In [67]:
print(len(new_chunks))

3


In [68]:
# add new_chunks to vectorstore
vectorstore.add_documents(new_chunks)

['c4940a41-f702-4eba-ae97-87f569a7c248',
 'b42e3702-9c41-4cff-9a16-9eba8f50c3d8',
 '842ebb44-e637-47af-851f-5bf8c136271b']

In [69]:
print(vectorstore._collection.count())

31


In [75]:
#new question
final = final_rag.invoke({"input" : "what are prerequisite for start reinforcement learning"})

In [79]:
print(final)

{'input': 'what are prerequisite for start reinforcement learning', 'context': [Document(metadata={'source': 'manual', 'topic': 'reinforcement learning'}, page_content='Reinforcement Learning in Detail'), Document(metadata={'topic': 'reinforcement learning', 'source': 'manual'}, page_content='Reinforcement learning (RL) is a type of machine learning where an agent learns to make \ndecisions by interacting with an environment. The agent receives rewards or penalties \nbased on its actions and learns to maximize cumulative reward over time. Key concepts \nin RL include: states, actions, rewards, policies, and value functions. Popular RL \nalgorithms include Q-learning, Deep Q-Networks (DQN), Policy Gradient methods, and'), Document(metadata={'source': '/Users/dhrutamacm2/Desktop/rag_learn/Data_ingestion_parsing/data/doc_0.txt'}, page_content='Machine Learning Fundamentals')], 'answer': 'To start with reinforcement learning, you should have a good understanding of machine learning fundame