In [1]:
from google.colab import drive
import os
import yaml

drive.mount("/content/drive")

REPO_URL = "https://github.com/AmjadKudsi/Meta_llama-chatbot.git"
REPO_DIR = "/content/rag-chatbot"

if not os.path.exists(REPO_DIR):
    !git clone {REPO_URL} {REPO_DIR}
else:
    %cd {REPO_DIR}
    !git pull

%cd {REPO_DIR}

# install dependencies from repo
!pip -q install -r requirements.txt

# load config
CONFIG_PATH = os.path.join(REPO_DIR, "configs", "app.yaml")

with open(CONFIG_PATH, "r") as f:
    cfg = yaml.safe_load(f)

paths = cfg["paths"]
DOCS_DIR = paths["docs_dir"]
INDEX_DIR = paths["index_dir"]
EVAL_RUNS_DIR = paths["eval_runs_dir"]
TRACES_DIR = paths["traces_dir"]

rag_cfg = cfg.get("rag", {})
TOP_K = rag_cfg.get("top_k", 6)
CHUNK_SIZE = rag_cfg.get("chunk_size", 900)
CHUNK_OVERLAP = rag_cfg.get("chunk_overlap", 150)

print("DOCS_DIR:", DOCS_DIR)
print("INDEX_DIR:", INDEX_DIR)
print("TOP_K:", TOP_K)

Mounted at /content/drive
Cloning into '/content/rag-chatbot'...
remote: Enumerating objects: 47, done.[K
remote: Counting objects: 100% (47/47), done.[K
remote: Compressing objects: 100% (40/40), done.[K
remote: Total 47 (delta 19), reused 22 (delta 2), pack-reused 0 (from 0)[K
Receiving objects: 100% (47/47), 10.03 KiB | 3.34 MiB/s, done.
Resolving deltas: 100% (19/19), done.
/content/rag-chatbot
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.8/23.8 MB[0m [31m83.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m466.5/466.5 kB[0m [31m32.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m88.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.2/309.2 kB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m149.5/149.5 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00

In [2]:
# sanity check
import glob, os
# List the PDFs directly inside DOCS_DIR and report how many were found.
pdfs = glob.glob(f"{DOCS_DIR}/*.pdf")
print("PDFs found:", len(pdfs))

# Show the file name of the first match, or the literal "None" when there are no matches.
if pdfs:
    example_name = os.path.basename(pdfs[0])
else:
    example_name = "None"
print("Example:", example_name)

PDFs found: 10
Example: tax01.pdf


> **SimpleDirectoryReader** is the simplest way to load data from local files into LlamaIndex.



In [3]:
from llama_index.core import SimpleDirectoryReader
import os

def file_metadata(file_path: str) -> dict:
    """Build the extra metadata attached to a document loaded from ``file_path``.

    Args:
        file_path: Path of the file being loaded.

    Returns:
        A dict with three keys: ``source_file`` (the final path component),
        ``source_path`` (the path exactly as given) and ``domain`` (a fixed label).
    """
    metadata = {}
    # os.path.split(...)[1] is the tail component, i.e. the base file name.
    metadata["source_file"] = os.path.split(file_path)[1]
    metadata["source_path"] = file_path
    metadata["domain"] = "IRS individual tax documents"
    return metadata

# Read everything under DOCS_DIR; file_metadata supplies extra metadata per file.
reader = SimpleDirectoryReader(input_dir=DOCS_DIR, file_metadata=file_metadata)
documents = reader.load_data()

print("Documents loaded:", len(documents))

# Peek at the first document (if any) to confirm metadata and text look sane.
if documents:
    first_doc = documents[0]
    print("Sample metadata:", first_doc.metadata)
    print("Sample text preview:", first_doc.text[:300])


Documents loaded: 469
Sample metadata: {'page_label': '1', 'file_name': 'tax01.pdf', 'source_file': 'tax01.pdf', 'source_path': '/content/drive/MyDrive/rag-chatbot/raw_docs/tax01.pdf', 'domain': 'IRS individual tax documents'}
Sample text preview: Userid: CPM Schema: tipx Leadpct: 100% Pt. size: 8  Draft  Ok to Print
AH XSL/XML Fileid: … ication-17/2024/b/xml/cycle02/source (Init. & Date) _______
Page 1 of 143  6:16 - 23-Jan-2025
The type and rule above prints on all proofs including departmental reproduction proofs. MUST be removed before pr


SimpleDirectoryReader will automatically attach a metadata dictionary to each Document object.

The PDFs were split into per-page Document objects (e.g., a 143-page PDF becomes about 143 items). This is helpful for citations because every piece is already tied to a page.

The preview also includes lines like “Draft Ok to Print” and internal print-proof headers. This is noise and will hurt both retrieval and answers. A quick cleaning step is required to remove the boilerplate lines that appear on every page.

In [4]:
import re
from llama_index.core import Document

# Regex fragments for print-proof boilerplate. A line matching any fragment
# (anywhere in the line) is treated as noise by the cleaning loop.
NOISE_PATTERNS = [
    r"Draft\s+Ok to Print",
    # Was r"User[id|ID]:.*": the square brackets form a character class that
    # matches a single character (i, d, |, I or D), so "Userid:" never matched.
    # The literal "Userid:" plus re.IGNORECASE covers Userid / UserID / USERID.
    r"Userid:.*",
    r"AH\s+XSL/XML\s+Fileid:.*",
    r"The type and rule above prints on all proofs.*",
    r"Page\s+\d+\s+of\s+\d+.*",
]

# One combined, case-insensitive alternation of all the fragments above.
noise_re = re.compile("|".join(NOISE_PATTERNS), re.IGNORECASE)

cleaned_documents = []
dropped = 0

for page_doc in documents:
    # Keep only the lines that match none of the noise patterns.
    body = "\n".join(
        line for line in page_doc.text.splitlines() if not noise_re.search(line)
    )
    # Collapse runs of spaces/tabs to one space, then trim the page.
    body = re.sub(r"[ \t]+", " ", body).strip()

    # Pages with fewer than 200 characters left are counted and skipped.
    if len(body) >= 200:
        cleaned_documents.append(Document(text=body, metadata=page_doc.metadata))
    else:
        dropped += 1

# Pick the sample values up front; both fall back to the literal "None".
if cleaned_documents:
    sample_doc = cleaned_documents[0]
    sample_metadata = sample_doc.metadata
    sample_preview = sample_doc.text[:300]
else:
    sample_metadata = "None"
    sample_preview = "None"

print("Original documents:", len(documents))
print("Cleaned documents:", len(cleaned_documents))
print("Dropped as too small:", dropped)
print("Sample cleaned metadata:", sample_metadata)
print("Sample cleaned preview:", sample_preview)

Original documents: 469
Cleaned documents: 469
Dropped as too small: 0
Sample cleaned metadata: {'page_label': '1', 'file_name': 'tax01.pdf', 'source_file': 'tax01.pdf', 'source_path': '/content/drive/MyDrive/rag-chatbot/raw_docs/tax01.pdf', 'domain': 'IRS individual tax documents'}
Sample cleaned preview: TAX GUIDE
2024
Get forms and other information faster and easier at:
• IRS.gov (English) 
• IRS.gov/Spanish (Español) 
• IRS.gov/Chinese (中文) 
• IRS.gov/Korean (한국어) 
• IRS.gov/Russian (Pусский) 
• IRS.gov/Vietnamese (Tiếng Việt) 
Publication 17 (2024) Catalog Number 10311G
Jan 22, 2025 Department o


In [5]:
from llama_index.core.node_parser import SentenceSplitter

# Both chunking parameters come from app.yaml.
splitter = SentenceSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)

# Original author's note: evaluate using gold.jsonl

# Split the cleaned pages into nodes.
nodes = splitter.get_nodes_from_documents(cleaned_documents)

# Sample values for the report; both fall back to the literal "None".
if nodes:
    first_node = nodes[0]
    node_metadata = first_node.metadata
    node_preview = first_node.get_content()[:300]
else:
    node_metadata = "None"
    node_preview = "None"

print("Nodes created:", len(nodes))
print("Sample node metadata:", node_metadata)
print("Sample node text preview:", node_preview)

Nodes created: 1136
Sample node metadata: {'page_label': '1', 'file_name': 'tax01.pdf', 'source_file': 'tax01.pdf', 'source_path': '/content/drive/MyDrive/rag-chatbot/raw_docs/tax01.pdf', 'domain': 'IRS individual tax documents'}
Sample node text preview: TAX GUIDE
2024
Get forms and other information faster and easier at:
• IRS.gov (English) 
• IRS.gov/Spanish (Español) 
• IRS.gov/Chinese (中文) 
• IRS.gov/Korean (한국어) 
• IRS.gov/Russian (Pусский) 
• IRS.gov/Vietnamese (Tiếng Việt) 
Publication 17 (2024) Catalog Number 10311G
Jan 22, 2025 Department o
