# Basic RAG pipeline
Vector DB - Chroma DB

Embeddings Model - Sentence Transformers - all-mpnet-base-v2

Document loaders to fetch external data

LLM integration to generate the final response from the user query together with the retrieved chunks

In [None]:
# Install the packages used by this notebook (run once per runtime)
%pip install langchain langchain-community chromadb langchain-huggingface protobuf langchain-google-genai BeautifulSoup4

In [None]:
# Load the web pages that serve as the external knowledge source.
import os

# Identify this client to the sites being fetched. Kept ahead of the loader
# import, as in the original ordering (presumably the loader looks at
# USER_AGENT when it is imported — confirm).
os.environ['USER_AGENT'] = 'myagent'

# Import from langchain_community (installed above): the old
# `langchain.document_loaders` path is deprecated and no longer available in
# newer LangChain releases.
from langchain_community.document_loaders import WebBaseLoader

# Pages to ingest.
URL = [
    "https://education.nationalgeographic.org/resource/global-warming/",
    "https://en.wikipedia.org/wiki/Climate_change",
    "https://www.nrdc.org/stories/global-warming-101"
]

# Build one loader over all pages ...
data = WebBaseLoader(URL)
# ... and fetch them; `content` is what the chunking step consumes.
content = data.load()

In [None]:
# Chunking: break the loaded pages into smaller, overlapping pieces so each
# piece can be embedded and retrieved on its own.
# NOTE(review): newer LangChain releases expose this class from the
# `langchain_text_splitters` package — confirm against the installed version.
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=60,   # amount shared between neighbouring chunks
    chunk_size=1000,    # upper bound on the size of a single chunk
)
chunks = text_splitter.split_documents(content)

In [None]:
# Sanity check: number of chunks produced by the splitter (shown as cell output).
len(chunks)

In [None]:
# Embedding model: wraps the Sentence Transformers model named below.
# (Presumably the weights are downloaded on first use — confirm.)
from langchain_huggingface import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
)

In [None]:
# Define the vector DB: embed every chunk and index it in Chroma.
# Import from langchain_community (installed above): the old
# `langchain.vectorstores` path is deprecated and no longer available in
# newer LangChain releases.
from langchain_community.vectorstores import Chroma

# Keyword arguments make it explicit which object is the corpus and which is
# the embedding function.
vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings)

In [None]:
# Step 1: Retrieval
query = "What are the different causes of global warming?"

# Similarity search over the vector store, returning the 4 best matches.
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 4},
)

# Use `invoke`, the standard retriever entry point; `get_relevant_documents`
# is deprecated and has been removed from recent langchain-core releases.
docs_retrieved = retriever.invoke(query)
print(docs_retrieved)

In [None]:
# Create the Gemini chat model that will generate the final answer.
from langchain_google_genai import ChatGoogleGenerativeAI
from google.colab import userdata

# The API key is read from the Colab secret named 'GOOGLE_API_KEY'
# (this cell therefore expects to run inside Google Colab).
GEMINI_API_KEY = userdata.get('GOOGLE_API_KEY')

llm = ChatGoogleGenerativeAI(
    api_key=GEMINI_API_KEY,
    model='gemini-2.0-flash',
)

In [None]:
# Augment: wrap the user's question in instructions for the model.
query = 'What are the causes of global warming?'

# Assembled line by line; the result starts with a blank line and ends with a
# trailing newline, exactly like a triple-quoted block would.
system_prompt = (
    "\n"
    "You are an AI assistant that responds to a given user query. "
    "Please keep your answers relevant to the context you have.\n"
    "\n"
    f"User Query: {query}\n"
)

In [None]:
# Generation: retrieve supporting chunks and let the LLM answer with them.
from langchain.chains import RetrievalQA

# Chain that pulls documents from `retriever` and hands them to `llm`
# using the "stuff" chain type.
qa = RetrievalQA.from_chain_type(llm=llm, retriever=retriever, chain_type="stuff")

# Run the chain through `invoke` with its "query" input key; calling the
# chain object directly is deprecated in LangChain.
# NOTE(review): the whole instruction text in `system_prompt` (not only the
# user's question) is what reaches the retriever as the search string —
# confirm this is intended.
response = qa.invoke({"query": system_prompt})
print(response['result'])