In [6]:
import os
from llama_index.core import Settings
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SentenceTransformerRerank

### Local LLM Setup

In [2]:
# --- Step 1: answer generator (local LLM served by Ollama) ---
# The model tag must be one already pulled locally, e.g. via 'ollama run <model_name>'.
# The request timeout is raised to 120 s because local inference can be slow.
local_llm = Ollama(model="llama3.1:8b", base_url="http://localhost:11434", request_timeout=120.0)

# --- Step 2: knowledge indexer / retriever (local Hugging Face embedding model) ---
# bge-small-en-v1.5 is a compact English embedding model suited to running on a local machine.
local_embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

# --- Step 3: register both components on the global LlamaIndex Settings object ---
# Targets are assigned left to right, so the LLM is registered before the embedding model.
Settings.llm, Settings.embed_model = local_llm, local_embed_model

# Read the values back from Settings to confirm what is actually registered.
print(f"LLM set to: {Settings.llm.model} (via Ollama)")
print(f"Embedding Model set to: {Settings.embed_model.model_name} (Hugging Face)")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

LLM set to: llama3.1:8b (via Ollama)
Embedding Model set to: BAAI/bge-small-en-v1.5 (Hugging Face)


In [None]:
# Build the local, open-source reranker (a small MS MARCO MiniLM cross-encoder).
# top_n is the number of chunks that remain after reranking and are handed to the LLM.
local_reranker = SentenceTransformerRerank(
    top_n=5,
    model="cross-encoder/ms-marco-MiniLM-L-6-v2",
)
print("Open-source Reranker configured.")

config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

Open-source Reranker configured.


In [8]:
# # In the query engine creation (Phase 2, Step 4), ensure you use the new object.
# # NOTE(review): `retriever` is not defined in any cell shown here; build it before
# # re-enabling this cell, otherwise it fails with a NameError.
# # NOTE(review): `from_args` is used because the plain RetrieverQueryEngine constructor
# # appears to take a `response_synthesizer` rather than an `llm` keyword -- confirm
# # against the installed llama-index version.
# query_engine = RetrieverQueryEngine.from_args(
#     retriever=retriever,
#     node_postprocessors=[local_reranker], # <-- Use the local reranker object
#     llm=local_llm # <-- LLM is already set by Settings.llm, but good to be explicit
# )

In [9]:
# # NOTE(review): this cell depends on `query_engine`, which is only created in the
# # (currently commented-out) cell above; enable that cell first.
# query_str = "What is the motto of France and when was the Eiffel Tower finished?"

# response = query_engine.query(query_str)

# # Display the results
# print("\n--- Final RAG Answer ---")
# print(response.response)

# print("\n--- Source Chunks Used ---")
# # The source nodes are the actual text chunks retrieved and passed to the LLM
# # NOTE(review): `node.score` may be None for some retrievers, which would break the
# # `:.4f` format below -- presumably the reranker always sets it; verify.
# for node in response.source_nodes:
#     print(f"* Score: {node.score:.4f}")
#     print(f"  Text: {node.text[:80]}...\n")