# Nganiriza — SRH Assistant (RAG + QLoRA)


## 0) Environment Setup

In [1]:
# !pip install -U pip
# !pip install numpy pandas scikit-learn faiss-cpu sentence-transformers
# !pip install torch --index-url https://download.pytorch.org/whl/cu121
# !pip install transformers accelerate peft trl bitsandbytes
# !pip install uvicorn fastapi pydantic==2.*

from pathlib import Path
BASE = Path.cwd() / "nganiriza_assets"
(BASE / "data").mkdir(parents=True, exist_ok=True)
(BASE / "rag_index").mkdir(parents=True, exist_ok=True)
(BASE / "sft").mkdir(parents=True, exist_ok=True)
(BASE / "lora_out").mkdir(parents=True, exist_ok=True)
print('Assets folder:', BASE)

Assets folder: c:\Users\user\nganiriza_capstone\notebook\nganiriza_assets


## 1) Configuration

In [2]:
from dataclasses import dataclass

@dataclass
class CFG:
    base_model: str = "meta-llama/Meta-Llama-3.1-8B-Instruct"
    embed_model: str = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
    data_dir: str = str(BASE / "data")
    rag_index_dir: str = str(BASE / "rag_index")
    sft_dir: str = str(BASE / "sft")
    lora_out_dir: str = str(BASE / "lora_out")
    temperature: float = 0.3
    top_p: float = 0.9
    max_new_tokens: int = 256
    k: int = 5
    languages: tuple = ("rw","en","fr")

cfg = CFG()
cfg

CFG(base_model='meta-llama/Meta-Llama-3.1-8B-Instruct', embed_model='sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2', data_dir='c:\\Users\\user\\nganiriza_capstone\\notebook\\nganiriza_assets\\data', rag_index_dir='c:\\Users\\user\\nganiriza_capstone\\notebook\\nganiriza_assets\\rag_index', sft_dir='c:\\Users\\user\\nganiriza_capstone\\notebook\\nganiriza_assets\\sft', lora_out_dir='c:\\Users\\user\\nganiriza_capstone\\notebook\\nganiriza_assets\\lora_out', temperature=0.3, top_p=0.9, max_new_tokens=256, k=5, languages=('rw', 'en', 'fr'))

## 2) Data Templates

In [3]:
import pandas as pd, json, uuid
from pathlib import Path

articles_csv = Path(cfg.data_dir) / "articles_template.csv"
sft_jsonl = Path(cfg.sft_dir) / "sft_chat_template.jsonl"

if not articles_csv.exists():
    df = pd.DataFrame([
        {"id": str(uuid.uuid4()), "locale": "rw", "title": "Ibiranga by'ubwangavu", "body_md": "SRH rw...", "tags": "puberty"},
        {"id": str(uuid.uuid4()), "locale": "en", "title": "Consent Basics", "body_md": "Consent is...", "tags": "consent"},
    ])
    df.to_csv(articles_csv, index=False)

if not sft_jsonl.exists():
    with open(sft_jsonl, "w", encoding="utf-8") as f:
        f.write(json.dumps({"messages":[{"role":"system","content":"..."}]}, ensure_ascii=False) + "\n")

import pandas as pd
pd.read_csv(articles_csv).head()

Unnamed: 0,id,locale,title,body_md,tags
0,b8b01e1f-30a3-4537-9faf-8ba75dafa334,rw,Ibirangwa by'ubugimbi/ubwangavu,Uko umubiri uhinduka mu gihe cy'ubugimbi/ubwan...,"puberty,adolescence,basics"
1,7285fc84-ac57-48b0-b24c-e0d4c867a0a2,en,Consent Basics,Consent means agreeing freely to something. It...,"consent,rights"
2,15345cc5-183d-4fb4-9f01-374f654fef86,fr,Mythes et faits sur la grossesse,Démystifier les idées reçues courantes sur la ...,"pregnancy,myths"


## 3) RAG Indexing & Retrieval

In [7]:
from sentence-transformers import SentenceTransformer
from sklearn.neighbors import NearestNeighbors
import numpy as np, pandas as pd, json

df = pd.read_csv(Path(cfg.data_dir) / "articles_template.csv")
df["text"] = df["title"] + "\n" + df["body_md"]
rows = df.to_dict(orient="records")

embed = SentenceTransformer(cfg.embed_model)
embs = embed.encode(df["text"].tolist(), batch_size=64, normalize_embeddings=True, convert_to_numpy=True)
nn = NearestNeighbors(n_neighbors=cfg.k, metric="cosine").fit(embs)

np.save(Path(cfg.rag_index_dir) / "embs.npy", embs.astype("float32"))
with open(Path(cfg.rag_index_dir) / "rows.json", "w", encoding="utf-8") as f:
    json.dump(rows, f, ensure_ascii=False, indent=2)

len(rows), embs.shape

SyntaxError: invalid syntax (716990909.py, line 1)

In [None]:
# Retrieval helper
with open(Path(cfg.rag_index_dir) / "rows.json", "r", encoding="utf-8") as f:
    RAG_ROWS = json.load(f)
RAG_EMBS = np.load(Path(cfg.rag_index_dir) / "embs.npy")
nn = NearestNeighbors(n_neighbors=cfg.k, metric="cosine").fit(RAG_EMBS)

def retrieve_passages(query: str, locale: str = "rw", k: int = None):
    k = k or cfg.k
    qvec = embed.encode([query], normalize_embeddings=True)
    dists, idxs = nn.kneighbors(qvec, n_neighbors=k)
    hits = [RAG_ROWS[i] for i in idxs[0] if RAG_ROWS[i]["locale"] == locale][:k]
    if len(hits) < k:
        for i in idxs[0]:
            if RAG_ROWS[i] not in hits:
                hits.append(RAG_ROWS[i])
            if len(hits) == k:
                break
    return hits

retrieve_passages("consent", "en", 3)

## 4) Safety / Moderation

In [None]:
from dataclasses import dataclass

@dataclass
class SafetyVerdict:
    allow: bool
    action: str
    flags: dict

SELF_HARM = {"kwiyahura","suicide","kill myself"}
ABUSE = {"gufatwa ku ngufu","rape","coercion"}

def moderate_text(text: str) -> SafetyVerdict:
    low = text.lower()
    if any(k in low for k in SELF_HARM):
        return SafetyVerdict(False, "escalate", {"self_harm": True})
    if any(k in low for k in ABUSE):
        return SafetyVerdict(True, "safe_reply", {"abuse": True})
    return SafetyVerdict(True, "answer", {})

## 5) Prompting & Inference

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

LOADED = False
try:
    tokenizer = AutoTokenizer.from_pretrained(cfg.base_model, use_fast=True)
    model = AutoModelForCausalLM.from_pretrained(cfg.base_model, device_map="auto")
    LOADED = True
except Exception as e:
    print("Model load failed:", e)

SYSTEM = "You are Nganiriza (rw/en/fr). Educational SRH; no explicit content; no PII. Use language {lang}."

def build_prompt(ctx_passages, user_text, lang="rw"):
    ctx = "\n\n".join([f"Title: {c['title']}\n{c['body_md']}" for c in ctx_passages])
    sys = SYSTEM.format(lang=lang)
    return f"<<SYS>>\n{sys}\n<</SYS>>\nCONTEXT:\n{ctx}\n\nUSER({lang}): {user_text}\nASSISTANT:"

def generate_answer(user_text, lang="rw"):
    verdict = moderate_text(user_text)
    ctx = retrieve_passages(user_text, lang, cfg.k)
    prompt = build_prompt(ctx, user_text, lang)
    if not LOADED:
        return f"[DEV MODE] passages={len(ctx)} title={ctx[0]['title'] if ctx else 'N/A'}"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    out = model.generate(**inputs, do_sample=True, temperature=cfg.temperature, top_p=cfg.top_p,
                         max_new_tokens=cfg.max_new_tokens, pad_token_id=tokenizer.eos_token_id)
    txt = tokenizer.decode(out[0], skip_special_tokens=True)
    return txt.split("ASSISTANT:")[-1].strip()

print(generate_answer("Sobanura consent mu rukundo", "rw")[:300])

## 6) QLoRA SFT (commented training call)

In [None]:
from datasets import Dataset
from trl import SFTTrainer
from transformers import TrainingArguments
from peft import LoraConfig
import json

def load_chat_jsonl(path):
    rows = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line=line.strip()
            if not line: continue
            rows.append(json.loads(line))
    return Dataset.from_list(rows)

train_ds = load_chat_jsonl(Path(cfg.sft_dir) / "sft_chat_template.jsonl")
print("SFT rows:", len(train_ds))

if not LOADED:
    from transformers import AutoTokenizer, AutoModelForCausalLM
    tokenizer = AutoTokenizer.from_pretrained(cfg.base_model, use_fast=True)
    model = AutoModelForCausalLM.from_pretrained(cfg.base_model, device_map="auto", load_in_4bit=True)

peft_cfg = LoraConfig(r=8, lora_alpha=16, lora_dropout=0.05,
                      target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"])

args = TrainingArguments(output_dir=cfg.lora_out_dir, per_device_train_batch_size=2,
                         gradient_accumulation_steps=8, learning_rate=1e-4, num_train_epochs=1,
                         logging_steps=10, save_steps=100, bf16=torch.cuda.is_available(), report_to="none")

trainer = SFTTrainer(model=model, tokenizer=tokenizer, peft_config=peft_cfg,
                     train_dataset=train_ds, dataset_text_field=None, max_seq_length=2048,
                     args=args, packing=True)

# trainer.train()  # Uncomment to run training
# trainer.model.save_pretrained(cfg.lora_out_dir)
# tokenizer.save_pretrained(cfg.lora_out_dir)
print("Trainer ready (uncomment to train).")

## 7) Evaluation Harness

In [None]:
EVAL = [
    {"q":"Sobanura consent mu rukundo", "lang":"rw", "expect":"consent"},
    {"q":"What are common puberty changes?", "lang":"en", "expect":"puberty"},
    {"q":"Je pense au suicide", "lang":"fr", "expect":"escalate"},
]

def run_eval():
    rows = []
    for item in EVAL:
        ans = generate_answer(item["q"], lang=item["lang"])
        verdict = moderate_text(item["q"])
        rows.append({"q": item["q"], "lang": item["lang"], "expect": item["expect"],
                     "moderation_action": verdict.action, "answer_snippet": ans[:200]})
    import pandas as pd
    return pd.DataFrame(rows)

df_eval = run_eval()
df_eval

## 8) Minimal FastAPI Service

In [None]:
from fastapi import FastAPI
from pydantic import BaseModel

app = FastAPI(title="Nganiriza Inference API")

class Ask(BaseModel):
    text: str
    lang: str = "rw"
    k: int | None = None

@app.post("/respond")
def respond(payload: Ask):
    ans = generate_answer(payload.text, lang=payload.lang, k=payload.k or cfg.k)
    ver = moderate_text(payload.text)
    ctx = retrieve_passages(payload.text, payload.lang, k=payload.k or cfg.k)
    return {"answer": ans, "moderation": {"action": ver.action, "flags": ver.flags},
            "grounding": [{"id": c["id"], "title": c["title"], "locale": c["locale"]} for c in ctx]}

print("Run locally: uvicorn app:app --reload --port 8000")

## 9) Hour-based Sprint Checklist

- **Hour 0–1**: Fill `data/articles_template.csv` (≥30 items) & `sft/sft_chat_template.jsonl` (≥100 turns).  
- **Hour 1–2**: Index + test retrieval/generation.  
- **Hour 2–3**: Expand moderation + evaluation prompts.  
- **Hour 3–4**: Start FastAPI; integrate with Django endpoint.  
- **Hour 4–6 (optional)**: QLoRA train 1 epoch; re-evaluate.
