## Data preparations file
This notebook covers the text cleaning of the dataset, the segmentation, chunking and embedding. Followed by creating the vector storage with ChromaDB.

# Installations

In [1]:
# !pip install -q langchain langchain-community langchain-text-splitters chromadb sentence-transformers tiktoken

# Imports

In [None]:
from pathlib import Path
import math
import re, unicodedata, json
from typing import List, Dict
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import filter_complex_metadata

# Inspecting dataset
inspecting to decide on chroma or pinecone for vector storage.

In [3]:
RAW = "./dataset.txt" 
chunk_size, overlap = 900, 120
dim, bpf = 384, 4  # MiniLM dims

text = Path(RAW).read_text(encoding="utf-8")
size_mb = Path(RAW).stat().st_size / (1024**2)
est_chunks = (len(text) + (chunk_size - overlap) - 1) // (chunk_size - overlap)
emb_mem_mb = est_chunks * dim * bpf / (1024**2)

print(f"File size: {size_mb:.2f} MB")
print(f"Estimated chunks: {est_chunks:,}")
print(f"Embedding RAM (384-dim) ≈ {emb_mem_mb:.1f} MB")


File size: 0.23 MB
Estimated chunks: 296
Embedding RAM (384-dim) ≈ 0.4 MB


# Config

In [4]:
RAW_PATH = "./dataset.txt"          
DB_DIR   = "./chroma_growguide"
CHUNK_SIZE = 900
CHUNK_OVERLAP = 120
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # 384-dim, lightweight

## Loading and cleaning

In [5]:
raw = Path(RAW_PATH).read_text(encoding="utf-8")

def clean_text(s: str) -> str:
    s = unicodedata.normalize("NFKC", s)
    s = re.sub(r"[ \t]+\n", "\n", s)         # trim trailing spaces
    s = re.sub(r"\n{3,}", "\n\n", s)         # collapse blank lines
    s = s.strip()
    return s

text = clean_text(raw)
print("Loaded & cleaned text. Characters:", len(text))

Loaded & cleaned text. Characters: 230251


## Segmenting into docs
First in 3 main sections, then later per section

In [None]:
# SEGMENT INTO DOCS (Fruits / Veg planner / Veg list) 
# Robust patterns for the three main sections
pat_fruits   = r"^===\s*Fruits\s*===\s*$"
pat_planner  = r"^===\s*Vegetables planner per zone\s*===\s*$"
pat_veglist  = r"^===\s*The list of vegetables and their information\s*===\s*$"

def _find_header(stext: str, pat: str):
    m = re.search(pat, stext, flags=re.M|re.I)
    return None if not m else (m.start(), m.end())

h_fruits  = _find_header(text, pat_fruits)
h_plan    = _find_header(text, pat_planner)
h_veglist = _find_header(text, pat_veglist)

assert all(h is not None for h in [h_fruits, h_plan, h_veglist]), \
    f"Missing one of the headers. Found: fruits={h_fruits}, planner={h_plan}, veg_list={h_veglist}"

# Slice sections by header positions
boundaries = sorted([("fruits",*h_fruits), ("planner",*h_plan), ("veglist",*h_veglist)], key=lambda x: x[1])
sections: Dict[str, str] = {}
for i,(name, start, end) in enumerate(boundaries):
    sec_start = end
    sec_end = boundaries[i+1][1] if i+1 < len(boundaries) else len(text)
    sections[name] = text[sec_start:sec_end].strip()

# --- Fruits: split by '== Subheader ==' ---
fr = sections["fruits"]
fr_sub_pat = r"^==\s*(.+?)\s*==\s*$"
fr_matches = list(re.finditer(fr_sub_pat, fr, flags=re.M))
fruit_docs = []
if not fr_matches:
    fruit_docs.append(("overview", fr.strip()))
else:
    # any text before first subheader = overview
    preface = fr[:fr_matches[0].start()].strip()
    if preface:
        fruit_docs.append(("overview", preface))
    for i,m in enumerate(fr_matches):
        topic = m.group(1).strip()
        start = m.end()
        end = fr_matches[i+1].start() if i+1 < len(fr_matches) else len(fr)
        body = fr[start:end].strip()
        if body:
            fruit_docs.append((topic, body))

# --- Planner: clean intro lines; then split by '== Zones ... ==' and by month lines ---
pl = sections["planner"]

# Droping explanatory lines immediately after the planner header so they don't leak into Zone docs
# keep removing leading lines until we hit a '== ... ==' zone header.
def strip_planner_intro(s: str) -> str:
    lines = s.splitlines()
    i = 0
    while i < len(lines) and not re.match(r"^==\s*Zones?\b", lines[i].strip(), flags=re.I):
        i += 1
    return "\n".join(lines[i:]).strip()

pl = strip_planner_intro(pl)

zone_pat   = r"^==\s*Zones?\s*([0-9abAB ,+and-]+)\s*:?==\s*$"
month_pat  = r"^(January|February|March|April|May|June|July|August|September|October|November|December)\s*:\s*$"

zones = list(re.finditer(zone_pat, pl, flags=re.M))
planner_docs = []
if zones:
    for zi, z in enumerate(zones):
        zone_label = z.group(1).strip()              # e.g., "3 and 4", "5 and 6", "7, 8 and 9+"
        zstart = z.end()
        zend   = zones[zi+1].start() if zi+1 < len(zones) else len(pl)
        zbody  = pl[zstart:zend].strip()

        # Split by months inside the zone
        months = list(re.finditer(month_pat, zbody, flags=re.M))
        if not months:
            planner_docs.append({"zones": zone_label, "month": None, "content": zbody})
        else:
            preface = zbody[:months[0].start()].strip()
            if preface:
                planner_docs.append({"zones": zone_label, "month": None, "content": preface})
            for mi, m in enumerate(months):
                mlabel = m.group(1)
                mstart = m.end()
                mend   = months[mi+1].start() if mi+1 < len(months) else len(zbody)
                mtext  = zbody[mstart:mend].strip()
                if mtext:
                    planner_docs.append({"zones": zone_label, "month": mlabel, "content": mtext})
else:
    planner_docs.append({"zones": None, "month": None, "content": pl})

# --- Vegetable list: split each '== Name ==' and keep the whole entry; also tag common fields if present ---
vl = sections["veglist"]
veg_pat = r"^==\s*([A-Za-z][A-Za-z0-9 .,’'/-]+?)\s*==\s*$"
veg_matches = list(re.finditer(veg_pat, vl, flags=re.M))
veg_docs = []
for i,m in enumerate(veg_matches):
    name  = m.group(1).strip().rstrip(".")
    start = m.end()
    end   = veg_matches[i+1].start() if i+1 < len(veg_matches) else len(vl)
    body  = vl[start:end].strip()
    if body:
        # Light topic extraction for filtering
        vocab = ["yield","planting","conditions","soil","care","companions","harvest","storage",
                 "fertilizer","spacing","watering","pests","diseases","maturity","sun","shade"]
        topics = sorted({t for t in vocab if re.search(rf"\b{t}\b", body, flags=re.I)})
        veg_docs.append({"name": name, "content": body, "topics": topics})

# --- Assemble LangChain Documents ---
docs: List[Document] = []

# Fruits
for topic, body in fruit_docs:
    docs.append(Document(page_content=body, metadata={"category":"fruits","topic":topic.lower()}))

# Planner
for e in planner_docs:
    meta = {"category":"vegetable_planner"}
    if e["zones"]: meta["zones"] = e["zones"]
    if e["month"]: meta["month"] = e["month"]
    docs.append(Document(page_content=e["content"], metadata=meta))

# Veg list
for v in veg_docs:
    docs.append(Document(page_content=v["content"],
                         metadata={"category":"vegetable_list","crop":v["name"].lower(),"topics":v["topics"]}))

print("Docs by category:", {
    "fruits": sum(d.metadata.get("category")=="fruits" for d in docs),
    "planner": sum(d.metadata.get("category")=="vegetable_planner" for d in docs),
    "veg_list": sum(d.metadata.get("category")=="vegetable_list" for d in docs),
})
print("Total docs:", len(docs))

# Peeking at a couple
for i in range(min(3, len(docs))):
    print(f"\nDOC {i+1} META: {docs[i].metadata}")
    print(docs[i].page_content[:220].replace("\n"," ") + " ...")


Docs by category: {'fruits': 3, 'planner': 15, 'veg_list': 66}
Total docs: 84

DOC 1 META: {'category': 'fruits', 'topic': 'overview'}
One of the most essential tips for growing fruits successfully is selecting varieties suited to your specific climate. Growing the wrong fruit for your region leads to poor yields, disease issues, and plant stress. Under ...

DOC 2 META: {'category': 'fruits', 'topic': 'usda zone and suitable fruits'}
Zone 3 Apples, currants, gooseberries, hardy plums Zone 4 Apples, cherries, raspberries, cold-hardy grapes Zone 5 Pears, apricots, plums, blueberries Zone 6 Peaches, nectarines, grapes, blackberries Zone 7 Figs, persimmo ...

DOC 3 META: {'category': 'fruits', 'topic': 'match fruit varieties to local conditions'}
Beyond hardiness, consider other local climate factors such as:  Chill hours: Some fruits like apples and peaches require a specific number of chill hours (hours below 45°F) to produce fruit.  Humidity: High humidity are ...


## Chunk for retrieval

In [7]:
splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    separators=["\n## ","\n### ","\n\n","\n",". "," "],
)

chunks: List[Document] = []
for d in docs:
    parts = splitter.split_text(d.page_content)
    for p in parts:
        chunks.append(Document(page_content=p, metadata=dict(d.metadata)))

print(f"Final chunk count: {len(chunks)}")
for i in range(min(3, len(chunks))):
    print(f"\n--- CHUNK {i+1} META --- {chunks[i].metadata}")
    print(chunks[i].page_content[:250].replace('\n',' ') + " ...")


Final chunk count: 350

--- CHUNK 1 META --- {'category': 'fruits', 'topic': 'overview'}
One of the most essential tips for growing fruits successfully is selecting varieties suited to your specific climate. Growing the wrong fruit for your region leads to poor yields, disease issues, and plant stress. Understanding your local environmen ...

--- CHUNK 2 META --- {'category': 'fruits', 'topic': 'usda zone and suitable fruits'}
Zone 3 Apples, currants, gooseberries, hardy plums Zone 4 Apples, cherries, raspberries, cold-hardy grapes Zone 5 Pears, apricots, plums, blueberries Zone 6 Peaches, nectarines, grapes, blackberries Zone 7 Figs, persimmons, pomegranates, late peaches ...

--- CHUNK 3 META --- {'category': 'fruits', 'topic': 'match fruit varieties to local conditions'}
Beyond hardiness, consider other local climate factors such as:  Chill hours: Some fruits like apples and peaches require a specific number of chill hours (hours below 45°F) to produce fruit.  Humidity: High humidit

In [8]:
# checks on chunks
# 1) Length stats
lengths = [len(c.page_content) for c in chunks]
print("Avg len:", sum(lengths)//len(lengths), "| min:", min(lengths), "| max:", max(lengths))

# 2) Counts by category
from collections import Counter
print("By category:", Counter(c.metadata.get("category") for c in chunks))

# 3) Sample a few planner chunks to confirm zone/month tagging
planner_samples = [c for c in chunks if c.metadata.get("category")=="vegetable_planner"][:3]
for i,c in enumerate(planner_samples,1):
    print(f"\nPlanner sample {i} META:", c.metadata)
    print(c.page_content[:200].replace("\n"," "), "...")


Avg len: 659 | min: 43 | max: 898
By category: Counter({'vegetable_list': 299, 'vegetable_planner': 26, 'fruits': 25})

Planner sample 1 META: {'category': 'vegetable_planner', 'zones': '5 and 6', 'month': 'March'}
Artichoke: Spring of 2nd season Asparagus: Spring of 3rd garden season Beets: 45–60 days (Plant successions at recommended intervals for species.) Onion/Leeks/Shallots*:80–170 days (Plant successions  ...

Planner sample 2 META: {'category': 'vegetable_planner', 'zones': '5 and 6', 'month': 'April'}
Beets: 45–60 days (Plant successions at recommended intervals for species.) Broccoli: 70–100 days Brussels Sprouts: 100–110 days Cabbage: 50–60 days (Plant successions at recommended intervals for spe ...

Planner sample 3 META: {'category': 'vegetable_planner', 'zones': '5 and 6', 'month': 'April'}
Lettuce, Leaf: 40–50 days (Plant successions at recommended intervals for species.) Melons, Summer: 70–100 days Melons, Winter: 110 days Okra: 55–65 days Parsnips: 100–130 days (Plant

## Embedding and persist to Chroma
Uncomment the debugging cells if the embedding cell does not work directly.

In [None]:
# !pip install -q sentence-transformers==2.7.0

In [None]:
# import sys, pkgutil, site
# print("Python:", sys.version)
# print("Executable:", sys.executable)

Python: 3.13.5 | packaged by Anaconda, Inc. | (main, Jun 12 2025, 16:37:03) [MSC v.1929 64 bit (AMD64)]
Executable: c:\Users\Acer\anaconda3\python.exe


In [None]:
# import sys
# # make sure pip is the one for this kernel
# !{sys.executable} -m pip install -q --upgrade pip
# !{sys.executable} -m pip install -q "sentence-transformers==2.7.0" "transformers>=4.41" "torch>=2.1"


In [None]:
# import sentence_transformers, transformers, torch
# print("sentence-transformers:", sentence_transformers.__version__)
# print("transformers:", transformers.__version__)
# print("torch:", torch.__version__)

# from sentence_transformers import SentenceTransformer
# _ = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")  # downloads model if needed
# print("Model load OK")

sentence-transformers: 2.7.0
transformers: 4.56.1
torch: 2.8.0+cpu
Model load OK


In [None]:
# Force CPU to avoid CUDA issues
embeddings = HuggingFaceEmbeddings(
    model_name=EMBED_MODEL,              # "sentence-transformers/all-MiniLM-L6-v2"
    model_kwargs={"device": "cpu"}       # force CPU to avoid CUDA errors
)

# Sanitize metadata so all values are str/int/float/bool/None
cleaned_chunks = filter_complex_metadata(chunks)

vectordb = Chroma.from_documents(
    documents=cleaned_chunks,
    embedding=embeddings,
    persist_directory=DB_DIR
)
vectordb.persist()
print(f"Chroma DB persisted to: {DB_DIR}")

Chroma DB persisted to: ./chroma_growguide


  vectordb.persist()


## Testing 
Simple retriever to check chroma. Decent result but still some noise. Will make a better retriever as a tool for the agent.

In [19]:
retriever = vectordb.as_retriever(search_type="mmr", search_kwargs={"k": 4})

test_queries = [
    "When should I plant peas if I live in zone 5?",
    "Companion plants for tomatoes",
    "Best soil pH and watering for fruit trees",
    "How to store onions after harvest?",
]

for q in test_queries:
    hits = retriever.get_relevant_documents(q)
    print(f"\nQ: {q}")
    for h in hits:
        print("  ->", h.metadata)



Q: When should I plant peas if I live in zone 5?
  -> {'crop': 'garden, english, or snap peas', 'category': 'vegetable_list'}
  -> {'category': 'vegetable_list', 'crop': 'sugar or snow peas'}
  -> {'category': 'vegetable_list', 'crop': 'rhubarb'}
  -> {'category': 'fruits', 'topic': 'overview'}

Q: Companion plants for tomatoes
  -> {'crop': 'cooking, saucing, and paste tomatoes', 'category': 'vegetable_list'}
  -> {'category': 'vegetable_list', 'crop': 'cherry, grape, or miniature tomatoes'}
  -> {'crop': 'romaine lettuce or cos', 'category': 'vegetable_list'}
  -> {'category': 'vegetable_list', 'crop': 'eggplant or aubergine'}

Q: Best soil pH and watering for fruit trees
  -> {'topic': 'match fruit varieties to local conditions', 'category': 'fruits'}
  -> {'category': 'fruits', 'topic': 'match fruit varieties to local conditions'}
  -> {'topic': 'match fruit varieties to local conditions', 'category': 'fruits'}
  -> {'topic': 'match fruit varieties to local conditions', 'category'