In [None]:
!pip install -q \
  "llama-index>=0.11.0" \
  llama-index-llms-gemini \
  llama-index-embeddings-huggingface \
  llama-index-readers-web \
  sentence-transformers



In [None]:
# Read the Gemini API key from Colab's secret store (not from os.environ).
from google.colab import userdata
import google.generativeai as genai

GEMINI_SECRET_NAME = "GOOGLE_API_KEY"  # name of the Colab secret holding the key

try:
    GEMINI_API_KEY = userdata.get(GEMINI_SECRET_NAME)
except (userdata.SecretNotFoundError, userdata.NotebookAccessError) as err:
    # userdata.get() raises when the secret is missing or this notebook has not
    # been granted access to it; degrade to the warning below instead of a
    # traceback so the remaining (non-Gemini) cells can still be run.
    print(f"Could not read Colab secret {GEMINI_SECRET_NAME!r}: {err!r}")
    GEMINI_API_KEY = None

if not GEMINI_API_KEY:
    # Name the secret that was actually looked up so the user knows what to add.
    print(
        f"Warning: Colab secret {GEMINI_SECRET_NAME!r} not found. "
        "Gemini model will not run."
    )

In [None]:
from llama_index.core import Settings
from llama_index.llms.gemini import Gemini
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Gemini model used for answer generation.
# Alternative: gemini_model = "gemini-2.5-flash"
gemini_model = "gemini-2.5-flash-lite"

# 1) Gemini for LLM
# The temperature is passed to the constructor so it is part of the LLM's
# initial configuration.
# NOTE(review): assigning `Settings.llm.temperature` after construction may not
# reach the underlying Gemini client, which is created in __init__ -- confirm.
Settings.llm = Gemini(
    model=gemini_model,
    api_key=GEMINI_API_KEY,
    temperature=0.1,
)

# 2) HuggingFace for embeddings (local / free)
Settings.embed_model = HuggingFaceEmbedding(
    model_name="BAAI/bge-small-en-v1.5",  # good small RAG embedding model
)


SyntaxError: invalid syntax. Perhaps you forgot a comma? (ipython-input-2628725308.py, line 9)

In [None]:
from llama_index.readers.web import SimpleWebPageReader

# Web pages to ingest as source documents.
urls = ["https://developers.llamaindex.ai/python/framework/understanding/rag/"]

# Fetch each URL; html_to_text=True requests plain text instead of raw HTML.
reader = SimpleWebPageReader(html_to_text=True)
documents = reader.load_data(urls=urls)

# Cell output: number of loaded documents and the first document's metadata.
(len(documents), documents[0].metadata)

In [None]:
from llama_index.core import VectorStoreIndex

# Index the loaded documents; the LLM and embedding model come from the
# global Settings configured earlier in the notebook.
index = VectorStoreIndex.from_documents(documents=documents)

# Baseline query engine: retrieve the 5 most similar chunks per question.
basic_query_engine = index.as_query_engine(similarity_top_k=5)

In [None]:
# Ask the baseline query engine (no reranking step) a question about the
# ingested page and print the answer it returns.
question = "Explain the high-level RAG pipeline described on this page."
response = basic_query_engine.query(question)
print(response)

In [None]:

## Standalone cross-encoder demo (not part of the RAG pipeline)

from sentence_transformers import CrossEncoder

model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L2-v2')

# Score the same question against a relevant and an irrelevant passage;
# each (question, passage) pair is scored on its own.
scores = model.predict(
    [
        ("How many people live in Berlin?", passage)
        for passage in (
            "Berlin had a population of 3,520,031 registered inhabitants in an area of 891.82 square kilometers.",
            "Berlin is well known for its museums.",
        )
    ]
)
print(scores)
# Example output noted by the author: [ 8.510401 -4.860082]

In [None]:
from llama_index.core.postprocessor import SentenceTransformerRerank

# Cross-encoder reranker used as a node postprocessor: it rescores the
# retrieved chunks and keeps only the 3 best ones.
reranker = SentenceTransformerRerank(
    top_n=3,
    model="cross-encoder/ms-marco-MiniLM-L-2-v2",  # small, lightweight model
)


In [None]:
# Two-stage retrieval: fetch a wider candidate set (10 chunks), then let the
# reranker cut it down to its top_n before the answer is generated.
rerank_query_engine = index.as_query_engine(
    node_postprocessors=[reranker],
    similarity_top_k=10,
)

question = "How does LlamaIndex split RAG into stages like retrieval and postprocessing?"
response = rerank_query_engine.query(question)
print(response)
