# Naive RAG Chatbot - Phase 1 Bootcamp Project

Bootcamp: NSK AI RAG Bootcamp 2025

Objective: Build a Retrieval-Augmented Generation (RAG) chatbot that answers questions about an uploaded PDF document.

---

## 1. Setup & Imports

In [1]:

import os
from dotenv import load_dotenv

# Load environment variables from a local .env file into this process.
# NOTE(review): the setup note asks for GROQ_API_KEY in .env, but no visible
# cell reads it (the chain below runs a local distilgpt2 model) — confirm
# whether the key is still required.
load_dotenv()


True

In [2]:
import sys
# Run pip through the interpreter backing this kernel (sys.executable) so the
# package is installed into the same environment the notebook imports from.
!{sys.executable} -m pip install pypdf




In [5]:
!pip install sentence-transformers




## 2. Document Ingestion

In [13]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Source document: point this at any PDF placed under uploaded_files/.
pdf_path = "uploaded_files/kenya-market-update.pdf"

# Read the PDF; the loaded items are reported as pages below.
loader = PyPDFLoader(pdf_path)
documents = loader.load()

# Cut the pages into chunks of up to 1000 characters, with 200 characters of
# overlap between neighbouring chunks.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
docs = splitter.split_documents(documents)

page_count, chunk_count = len(documents), len(docs)
print(f"Loaded {page_count} pages, split into {chunk_count} chunks")


Loaded 16 pages, split into 54 chunks


In [14]:
# Sanity check: report the page count, then preview the first page's text.
page_total = len(documents)
print(f"Pages loaded: {page_total}")
first_page_text = documents[0].page_content
print(first_page_text[:500])  # preview capped at 500 characters


Pages loaded: 16
knightfrank.com/research
Kenya
Market Update
H2 2024
30th Edition
Knight Frank’s ultimate guide to real estate market performance and 
opportunities in Kenya.


In [15]:
# Sanity check on the splitter output: chunk count and the first chunk's text.
chunk_total = len(docs)
print(f"Chunks created: {chunk_total}")
first_chunk_text = docs[0].page_content
print(first_chunk_text[:500])  # preview capped at 500 characters


Chunks created: 54
knightfrank.com/research
Kenya
Market Update
H2 2024
30th Edition
Knight Frank’s ultimate guide to real estate market performance and 
opportunities in Kenya.


## 3. Indexing & Vector Store

In [6]:
from sentence_transformers import SentenceTransformer

# One-off warm-up: instantiating the model downloads it into the local
# Hugging Face cache (on the author's machine:
# C:\Users\user\.cache\huggingface\hub\). The indexing cell later loads
# sentence-transformers/all-MiniLM-L6-v2 with local_files_only=True, so the
# files must already be cached by the time that cell runs.
SentenceTransformer("all-MiniLM-L6-v2")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Error while downloading from https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/model.safetensors: HTTPSConnectionPool(host='cas-bridge.xethub.hf.co', port=443): Read timed out.
Trying to resume download...


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Error while downloading from https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/model.safetensors: HTTPSConnectionPool(host='cas-bridge.xethub.hf.co', port=443): Read timed out.
Trying to resume download...


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Error while downloading from https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/model.safetensors: HTTPSConnectionPool(host='cas-bridge.xethub.hf.co', port=443): Read timed out.
Trying to resume download...


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Error while downloading from https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/model.safetensors: HTTPSConnectionPool(host='cas-bridge.xethub.hf.co', port=443): Read timed out.
Trying to resume download...


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Error while downloading from https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/model.safetensors: HTTPSConnectionPool(host='cas-bridge.xethub.hf.co', port=443): Read timed out.
Trying to resume download...


model.safetensors:  12%|#1        | 10.5M/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False, 'architecture': 'BertModel'})
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [17]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

# Load your PDF
pdf_path = "uploaded_files/kenya-market-update.pdf"
loader = PyPDFLoader(pdf_path)
documents = loader.load()

# Split into chunks (up to 1000 characters each, 200 characters of overlap)
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
docs = splitter.split_documents(documents)

# Filter out short/empty chunks (e.g. cover/title page): keep only chunks with
# more than 50 non-whitespace-trimmed characters.
docs = [doc for doc in docs if len(doc.page_content.strip()) > 50]

print(f"Final chunks after filtering: {len(docs)}")

# Use a local embedding model (cached); local_files_only=True means the model
# is never downloaded here and must already be in the Hugging Face cache.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={"local_files_only": True}
)

# Deterministic ids, one per chunk. Without explicit ids every run of this cell
# adds a fresh copy of all chunks to the persisted collection, and the
# retriever's top-k results fill up with duplicates. With stable ids a re-run
# overwrites the existing entries instead.
# NOTE: copies stored by earlier runs (under auto-generated ids) are not
# removed by this; delete the chroma_store directory once to start clean.
chunk_ids = [f"{pdf_path}::chunk-{i}" for i in range(len(docs))]

# Persist to Chroma DB
vectorstore = Chroma.from_documents(
    docs,
    embeddings,
    ids=chunk_ids,
    persist_directory="chroma_store"
)

# Retrieve the 3 most similar chunks for each question.
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})


Final chunks after filtering: 54


## 4. RAG Chain (Retriever + LLM + Prompt)

## Pre-download the LLM (outside the pipeline call) so that everything is cached

In [19]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "distilgpt2"

# One-off warm-up: fetch the tokenizer and model weights so they are cached
# locally. The chain-building cells reload both with local_files_only=True and
# therefore depend on this download having completed.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:  27%|##6       | 94.4M/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [20]:
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
# Import the pipeline wrapper from langchain_huggingface; the
# langchain_community.llms.HuggingFacePipeline class is deprecated and warns
# when instantiated.
from langchain_huggingface import HuggingFacePipeline
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer

model_name = "distilgpt2"

# Load from cache only (the model was downloaded by the warm-up cell above)
tokenizer = AutoTokenizer.from_pretrained(model_name, local_files_only=True)
model = AutoModelForCausalLM.from_pretrained(model_name, local_files_only=True)

# Text-generation pipeline; each answer is capped at 200 newly generated tokens.
llm_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=200
)

# Wrap the transformers pipeline so LangChain can use it as an LLM.
llm = HuggingFacePipeline(pipeline=llm_pipeline)

# Custom prompt: {context} receives the retrieved chunks, {question} the user query.
prompt_template = """
Use the context to answer the question. Be concise and factual.

Context:
{context}

Question: {question}
Answer:
"""

prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"]
)

# Retrieval-QA Chain: the "stuff" chain type places all retrieved chunks into
# a single prompt.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs={"prompt": prompt}
)


Device set to use cpu
  llm = HuggingFacePipeline(pipeline=llm_pipeline)


In [21]:
!pip install -U langchain-huggingface




In [22]:
from langchain_huggingface import HuggingFacePipeline


In [23]:
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain_huggingface import HuggingFacePipeline
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer

# --- Prompt: {context} receives the retrieved chunks, {question} the query ---
prompt_template = """
Use the context to answer the question. Be concise and factual.

Context:
{context}

Question: {question}
Answer:
"""
prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])

# --- Local LLM: distilgpt2, read from the local cache only -------------------
model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name, local_files_only=True)
model = AutoModelForCausalLM.from_pretrained(model_name, local_files_only=True)

llm_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=200,  # cap on newly generated tokens per answer
    device=-1,  # ensures CPU
)
llm = HuggingFacePipeline(pipeline=llm_pipeline)

# --- Chain: retriever + prompt + LLM, all retrieved chunks in one prompt -----
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs={"prompt": prompt},
)


Device set to use cpu


## 5. Interactive Q&A (Console Loop) - For CLI Testing

In [24]:
# Optional console-based interaction (can be skipped if using widgets below).
# Keeps prompting until the user types 'exit'.
while True:
    # Strip surrounding whitespace so inputs such as "exit " still end the loop.
    query = input("Ask a question (or type 'exit'): ").strip()
    if query.lower() == "exit":
        break
    if not query:
        continue  # nothing typed; don't send an empty question to the chain
    # RetrievalQA takes the question under "query" and returns the answer text
    # under "result"; invoke() replaces the deprecated run().
    answer = qa_chain.invoke({"query": query})["result"]
    print(f"\nAnswer: {answer}\n")


Ask a question (or type 'exit'):  exit


## 6. Interactive Q&A (Notebook Widgets)

In [25]:
import ipywidgets as widgets
from IPython.display import display

question_box = widgets.Text(
    placeholder="Type your question here...",
    description="Question:",
    layout=widgets.Layout(width="80%"),
    # Only update `value` when the user presses Enter or leaves the field.
    # With the default (continuous updates) the observer below fires on every
    # keystroke, which runs the chain on a one-character question and clears
    # the box while the user is still typing.
    continuous_update=False,
)

output_box = widgets.Output()

def on_submit(change):
    """Answer the submitted question and append the Q/A pair to output_box.

    Args:
        change: the traitlets change dict for the text box's `value`;
            change["new"] holds the text that was just submitted.

    Blank submissions are ignored, which also covers the change event raised
    when the box is cleared at the end of this handler.
    """
    query = change["new"]
    if not query.strip():
        return
    # Run the chain inside the output context so that anything produced while
    # answering (including an error) shows up in the output area.
    with output_box:
        # RetrievalQA takes the question under "query" and returns the answer
        # under "result"; invoke() replaces the deprecated run().
        answer = qa_chain.invoke({"query": query})["result"]
        print(f"\nQ: {query}\nA: {answer}\n")
    question_box.value = ""  # clear after submit

question_box.observe(on_submit, names="value")

display(question_box, output_box)


Text(value='', description='Question:', layout=Layout(width='80%'), placeholder='Type your question here...')

Output()