In [1]:
from google.colab import drive
import os
import yaml

drive.mount("/content/drive")

REPO_URL = "https://github.com/AmjadKudsi/Meta_llama-chatbot.git"
REPO_DIR = "/content/rag-chatbot"

if not os.path.exists(REPO_DIR):
    !git clone {REPO_URL} {REPO_DIR}
else:
    %cd {REPO_DIR}
    !git pull

%cd {REPO_DIR}

# install dependencies from repo
!pip -q install -r requirements.txt

# Load the application config that ships with the cloned repo.
CONFIG_PATH = os.path.join(REPO_DIR, "configs", "app.yaml")

with open(CONFIG_PATH, "r") as config_file:
    cfg = yaml.safe_load(config_file)

# Path settings are indexed directly (no defaults), so each key must be present.
paths = cfg["paths"]
DOCS_DIR, INDEX_DIR = paths["docs_dir"], paths["index_dir"]
EVAL_RUNS_DIR, TRACES_DIR = paths["eval_runs_dir"], paths["traces_dir"]

# RAG settings are optional; each falls back to the default given here.
rag_cfg = cfg.get("rag", {})
TOP_K = rag_cfg.get("top_k", 6)
CHUNK_SIZE = rag_cfg.get("chunk_size", 900)
CHUNK_OVERLAP = rag_cfg.get("chunk_overlap", 150)

# Echo the settings the following cells rely on.
for label, value in (
    ("DOCS_DIR:", DOCS_DIR),
    ("INDEX_DIR:", INDEX_DIR),
    ("TOP_K:", TOP_K),
):
    print(label, value)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/rag-chatbot
remote: Enumerating objects: 5, done.[K
remote: Counting objects: 100% (5/5), done.[K
remote: Compressing objects: 100% (2/2), done.[K
remote: Total 3 (delta 1), reused 3 (delta 1), pack-reused 0 (from 0)[K
Unpacking objects: 100% (3/3), 326 bytes | 65.00 KiB/s, done.
From https://github.com/AmjadKudsi/Meta_llama-chatbot
   7a6dec9..bdeb0f2  main       -> origin/main
Updating 7a6dec9..bdeb0f2
Fast-forward
 requirements.txt | 1 [32m+[m
 1 file changed, 1 insertion(+)
/content/rag-chatbot
DOCS_DIR: /content/drive/MyDrive/rag-chatbot/raw_docs
INDEX_DIR: /content/drive/MyDrive/rag-chatbot/artifacts/indexes
TOP_K: 6


In [2]:
# sanity check
import glob, os
# Count the PDFs directly inside DOCS_DIR and show one file name as a spot check.
pdf_pattern = f"{DOCS_DIR}/*.pdf"
pdfs = glob.glob(pdf_pattern)

example_name = os.path.basename(pdfs[0]) if pdfs else "None"
print("PDFs found:", len(pdfs))
print("Example:", example_name)

PDFs found: 10
Example: tax01.pdf


> **SimpleDirectoryReader** is the simplest way to load data from local files into LlamaIndex.



In [3]:
from llama_index.core import SimpleDirectoryReader
import os

def file_metadata(file_path: str) -> dict:
    """Build the custom metadata dictionary for one source file.

    Args:
        file_path: Path of the file being loaded.

    Returns:
        A dict with three keys: ``source_file`` (the base name of
        ``file_path``), ``source_path`` (``file_path`` unchanged) and
        ``domain`` (a fixed label for this corpus).
    """
    metadata = dict(
        source_file=os.path.basename(file_path),
        source_path=file_path,
        domain="IRS individual tax documents",
    )
    return metadata

# Read the files in DOCS_DIR; file_metadata supplies the custom metadata keys.
reader = SimpleDirectoryReader(input_dir=DOCS_DIR, file_metadata=file_metadata)
documents = reader.load_data()

print("Documents loaded:", len(documents))

# Peek at the first document only when something was actually loaded.
if documents:
    first_doc = documents[0]
    print("Sample metadata:", first_doc.metadata)
    print("Sample text preview:", first_doc.text[:300])


Documents loaded: 469
Sample metadata: {'page_label': '1', 'file_name': 'tax01.pdf', 'source_file': 'tax01.pdf', 'source_path': '/content/drive/MyDrive/rag-chatbot/raw_docs/tax01.pdf', 'domain': 'IRS individual tax documents'}
Sample text preview: Userid: CPM Schema: tipx Leadpct: 100% Pt. size: 8  Draft  Ok to Print
AH XSL/XML Fileid: … ication-17/2024/b/xml/cycle02/source (Init. & Date) _______
Page 1 of 143  6:16 - 23-Jan-2025
The type and rule above prints on all proofs including departmental reproduction proofs. MUST be removed before pr


> **SimpleDirectoryReader** will automatically attach a metadata dictionary to each Document object.<br>
> The PDFs were split into per-page Document objects (e.g., a 143-page PDF becomes about 143 items). This is helpful for citations because every piece is already tied to a page.

The preview also includes lines like “Draft Ok to Print” and internal print proof headers. This is noise and will hurt retrieval and answers. A quick cleaning step is required to remove boilerplate lines that appear on every page.

In [4]:
import re
from llama_index.core import Document

# Regex fragments for the print-proof boilerplate seen in the extracted page text.
# They are OR-ed together and compiled case-insensitively below, so one spelling
# of each marker is enough.
NOISE_PATTERNS = [
    r"Draft\s+Ok to Print",
    # Previously r"User[id|ID]:.*": the square brackets formed a character class
    # matching a single character out of i, d, |, I, D, so "Userid:" never matched.
    r"Userid:.*",
    r"AH\s+XSL/XML\s+Fileid:.*",
    r"The type and rule above prints on all proofs.*",
    r"Page\s+\d+\s+of\s+\d+.*",
]

# Single combined pattern; used with .search(), so a hit anywhere in a line counts.
noise_re = re.compile("|".join(NOISE_PATTERNS), re.IGNORECASE)

cleaned_documents = []
dropped = 0

for d in documents:
    # Keep only the lines in which no noise pattern is found.
    kept_lines = [ln for ln in d.text.splitlines() if not noise_re.search(ln)]

    # Re-join the lines, collapse runs of spaces/tabs to one space, trim both ends.
    text = re.sub(r"[ \t]+", " ", "\n".join(kept_lines)).strip()

    if len(text) >= 200:
        cleaned_documents.append(Document(text=text, metadata=d.metadata))
    else:
        # Fewer than 200 characters left after cleaning: drop the page.
        dropped += 1

# Summary of the cleaning pass, plus a look at the first surviving document.
first_cleaned = cleaned_documents[0] if cleaned_documents else None

print("Original documents:", len(documents))
print("Cleaned documents:", len(cleaned_documents))
print("Dropped as too small:", dropped)
print("Sample cleaned metadata:", first_cleaned.metadata if first_cleaned is not None else "None")
print("Sample cleaned preview:", first_cleaned.text[:300] if first_cleaned is not None else "None")

Original documents: 469
Cleaned documents: 469
Dropped as too small: 0
Sample cleaned metadata: {'page_label': '1', 'file_name': 'tax01.pdf', 'source_file': 'tax01.pdf', 'source_path': '/content/drive/MyDrive/rag-chatbot/raw_docs/tax01.pdf', 'domain': 'IRS individual tax documents'}
Sample cleaned preview: TAX GUIDE
2024
Get forms and other information faster and easier at:
• IRS.gov (English) 
• IRS.gov/Spanish (Español) 
• IRS.gov/Chinese (中文) 
• IRS.gov/Korean (한국어) 
• IRS.gov/Russian (Pусский) 
• IRS.gov/Vietnamese (Tiếng Việt) 
Publication 17 (2024) Catalog Number 10311G
Jan 22, 2025 Department o


In [5]:
from llama_index.core.node_parser import SentenceSplitter

# Chunk size and overlap both come from app.yaml (see the config cell).
splitter = SentenceSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)

# TODO: evaluate using gold.jsonl

# Split every cleaned document into nodes.
nodes = splitter.get_nodes_from_documents(cleaned_documents)

first_node = nodes[0] if nodes else None

print("Nodes created:", len(nodes))
print("Sample node metadata:", first_node.metadata if first_node is not None else "None")
print("Sample node text preview:", first_node.get_content()[:300] if first_node is not None else "None")

Nodes created: 1136
Sample node metadata: {'page_label': '1', 'file_name': 'tax01.pdf', 'source_file': 'tax01.pdf', 'source_path': '/content/drive/MyDrive/rag-chatbot/raw_docs/tax01.pdf', 'domain': 'IRS individual tax documents'}
Sample node text preview: TAX GUIDE
2024
Get forms and other information faster and easier at:
• IRS.gov (English) 
• IRS.gov/Spanish (Español) 
• IRS.gov/Chinese (中文) 
• IRS.gov/Korean (한국어) 
• IRS.gov/Russian (Pусский) 
• IRS.gov/Vietnamese (Tiếng Việt) 
Publication 17 (2024) Catalog Number 10311G
Jan 22, 2025 Department o


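> **Create the vector index**<br>
> The vector dimension depends on the embedding model. LlamaIndex will handle it when building the index.<br>
> A FAISS index would need that dimension up front, so instead of creating an empty index with a placeholder dimension we build the vector store through LlamaIndex (the code below uses a plain `VectorStoreIndex`, not an explicit FAISS store).<br>
> The simplest approach is to let LlamaIndex manage embeddings and vectors.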

RAG retrieval works by turning text into vectors, then doing similarity search. LlamaIndex needs an embedding model to create those vectors.<br><br>
The default embedding model of LlamaIndex is OpenAI’s text-embedding-ada-002.
Instead, we set a local embedding model before building the index (to avoid paying for API usage).

In [6]:
from llama_index.core import Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Small and fast, good starter choice
embed_model_name = "sentence-transformers/all-MiniLM-L6-v2"

# Register the Hugging Face embedding model on LlamaIndex's global Settings.
local_embedder = HuggingFaceEmbedding(model_name=embed_model_name)
Settings.embed_model = local_embedder

print("Embedding model set to:", embed_model_name)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Embedding model set to: sentence-transformers/all-MiniLM-L6-v2


In [7]:
from llama_index.core import VectorStoreIndex
import os

# Build the vector index from the chunked nodes.
index = VectorStoreIndex(nodes)

# Make sure the target directory exists, then persist the index's storage context there.
os.makedirs(INDEX_DIR, exist_ok=True)
storage_context = index.storage_context
storage_context.persist(persist_dir=INDEX_DIR)

print("Saved index to:", INDEX_DIR)


Saved index to: /content/drive/MyDrive/rag-chatbot/artifacts/indexes
