In [85]:
!pip install -q sentence-transformers transformers accelerate bitsandbytes torch --upgrade

In [86]:
import os, json, textwrap, math
from pathlib import Path
import torch
import torch.nn.functional as F
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Device:", device)

Device: cuda


In [87]:
# --- Input locations ---
DOCS_PATH = "/content/docs.jsonl"  # corpus: JSONL with id/title/text fields
QUESTIONS_PATH = "/content/questions.json"  # JSON list of questions

# --- Output locations ---
OUTPUT_DIR = "/content/submissions"
# Create the output directory up front (no error if it already exists).
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
OUTPUT_PATH = os.path.join(OUTPUT_DIR, "rag_answers.json")

In [88]:
def _clean_field(value):
    """Return ``value`` as a stripped string; a missing or JSON-null field becomes ""."""
    # dict.get's default only covers *missing* keys; an explicit JSON null still
    # arrives as None, and calling .strip() on it would raise AttributeError.
    return "" if value is None else str(value).strip()


# Read the corpus: one JSON object per non-blank line.
docs = []
with open(DOCS_PATH, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue  # tolerate blank lines in the JSONL file
        obj = json.loads(line)
        title = _clean_field(obj.get("title"))
        text = _clean_field(obj.get("text"))
        # Prefix the text with its title (when there is one) so the title is embedded too.
        combined = f"{title}: {text}" if title else text
        docs.append({"id": obj.get("id"), "title": title, "text": text, "chunk": combined})

# Build chunked_texts list (one chunk per doc)
chunked_texts = [d["chunk"] for d in docs]
print(f"Loaded {len(chunked_texts)} chunks (one per document). Example:\n", chunked_texts[:1])


Loaded 27 chunks (one per document). Example:
 ['NebulaDB: The default port for NebulaDB is 7342. NebulaDB provides role-based access control with three roles: reader, writer, admin.']


In [89]:
# Read the question file in one go and parse its JSON root as-is.
questions = json.loads(Path(QUESTIONS_PATH).read_text(encoding="utf-8"))

# Entries are expected to look like {"id": "...", "question": "...", "answers": [...]} or similar.
print(f"Loaded {len(questions)} questions. Example:\n", questions[:1])


Loaded 25 questions. Example:
 [{'id': 'q0', 'question': 'State a key feature of NebulaDB.', 'answers': ['nebuladb', 'lightweight', 'document', 'store', 'built', 'edge']}]


In [90]:
embed_model_name = "sentence-transformers/all-MiniLM-L6-v2"
print("Loading embeddings model:", embed_model_name)
embed_model = SentenceTransformer(embed_model_name, device=device)

# Fail early with a clear message instead of an opaque tensor error further down.
if not chunked_texts:
    raise ValueError("No document chunks to embed; check the documents file.")

# Embed every chunk; encode() does its own batching, so no manual slicing is needed.
batch_size = 64
doc_embeddings = embed_model.encode(
    chunked_texts,
    batch_size=batch_size,
    convert_to_tensor=True,
    show_progress_bar=True,
)  # (num_chunks, dim)

# Unit-normalise the rows so a plain dot product with a normalised query is cosine similarity.
doc_embeddings = F.normalize(doc_embeddings, p=2, dim=1)
assert doc_embeddings.shape[0] == len(chunked_texts), "expected one embedding row per chunk"
print("Document embeddings shape:", doc_embeddings.shape)

Loading embeddings model: sentence-transformers/all-MiniLM-L6-v2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Document embeddings shape: torch.Size([27, 384])


In [91]:
# Load the answer-generation LLM - adjust the model name if needed.
# Default: TinyLlama (1.1B). Replace with another small model if not available.
llm_model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
print("Loading LLM:", llm_model_name)

tokenizer = AutoTokenizer.from_pretrained(llm_model_name, use_fast=True)

# Half precision only when a GPU is present; fall back to float32 on CPU
# (NOTE(review): fp16 on CPU is presumed slow / partially unsupported - confirm if CPU runs matter).
llm_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# device_map="auto" lets accelerate place the weights on the available device(s).
model = AutoModelForCausalLM.from_pretrained(
    llm_model_name,
    device_map="auto",
    torch_dtype=llm_dtype,
    low_cpu_mem_usage=True
)

# Greedy decoding (do_sample=False). `temperature` is a sampling-only setting,
# so it is intentionally not passed here.
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=60,
    do_sample=False,
)


print("LLM pipeline ready.")

Loading LLM: TinyLlama/TinyLlama-1.1B-Chat-v1.0


Device set to use cuda:0


LLM pipeline ready.


In [104]:
top_k = 15  # how many nearest chunks to retrieve per question
similarity_threshold = 0.47  # minimum cosine similarity for a chunk to count as relevant
NOT_FOUND = "Not found in context"  # answer emitted when nothing usable is retrieved/generated

# The template is dedented BEFORE the values are substituted, so a multi-line
# context or question cannot change the common indentation that dedent removes.
PROMPT_TEMPLATE = textwrap.dedent("""
    Answer the following question using ONLY the provided context.
    Be concise and factual (one sentence).
    If the context does not contain the answer, reply exactly: Not found in context.

    Context:
    {context}

    Question:
    {question}

    Answer:
    """)


def _question_text(q):
    """Return the question string of an entry (dict with question/q/text key, or any other object)."""
    if isinstance(q, dict):
        return q.get("question") or q.get("q") or q.get("text") or str(q)
    return str(q)


def _retrieve_best_chunk(question_text):
    """Return the most similar chunk whose score reaches ``similarity_threshold``, else None."""
    # 1) Encode and unit-normalise the question (document rows are normalised the same way).
    query_emb = embed_model.encode(question_text, convert_to_tensor=True)
    query_emb = F.normalize(query_emb, p=2, dim=0)

    # 2) Cosine similarity against every chunk.
    scores = torch.matmul(doc_embeddings, query_emb)

    # torch.topk raises when k exceeds the number of chunks, so clamp it.
    k = min(top_k, scores.shape[0])
    if k == 0:
        return None
    top_vals, top_idx = torch.topk(scores, k=k)

    # 3) Candidates come back in descending score order; take the first one above the threshold.
    for idx, score in zip(top_idx, top_vals):
        if score >= similarity_threshold:
            return chunked_texts[int(idx)]
    return None


def _extract_answer(generated):
    """Clean the model continuation (prompt excluded) into the final answer string."""
    text = generated.strip()
    # Drop an echoed leading label.
    if text.startswith("Answer:"):
        text = text[len("Answer:"):]
    # If the model keeps going with a new Question/Context/Answer section,
    # keep only what precedes it rather than a later, made-up answer.
    for marker in ("\nQuestion:", "\nContext:", "\nAnswer:"):
        cut = text.find(marker)
        if cut != -1:
            text = text[:cut]
    text = text.strip()
    # Answers shorter than two characters are treated as unusable.
    return text if len(text) >= 2 else NOT_FOUND


rag_answers = {}

for i, q in enumerate(questions):
    question_text = _question_text(q)
    best_chunk = _retrieve_best_chunk(question_text)

    if not best_chunk:
        # Nothing relevant retrieved: skip the LLM entirely.
        answer = NOT_FOUND
    else:
        prompt = PROMPT_TEMPLATE.format(context=best_chunk, question=question_text)
        # return_full_text=False yields only the continuation, so the prompt
        # never has to be split off by searching for "Answer:".
        out = generator(prompt, return_full_text=False)
        answer = _extract_answer(out[0]["generated_text"])

    # Key by the question's own id when it has one, otherwise by its position.
    rag_answers_key = q.get("id") if isinstance(q, dict) and q.get("id") else f"q{i}"
    rag_answers[rag_answers_key] = answer

    print(f"[{i+1}/{len(questions)}] {rag_answers_key} -> {answer}")




[1/25] q0 -> NebulaDB is a distributed NoSQL database that provides a scalable, fault-tolerant, and high-performance solution for storing and querying large volumes of data.
[2/25] q1 -> NebulaDB is a distributed NoSQL database that provides a scalable, fault-tolerant, and high-performance solution for storing and querying large volumes of data.
[3/25] q2 -> NebulaDB is a distributed NoSQL database that provides a scalable, fault-tolerant, and high-performance solution for storing and querying large volumes of data.
[4/25] q3 -> The default port for NebulaDB is 7342.
[5/25] q4 -> Admin
[6/25] q5 -> MercuryRL does not include any RL algorithms.
[7/25] q6 -> MercuryRL does not include any RL algorithms.
[8/25] q7 -> MercuryRL does not include any RL algorithms.
[9/25] q8 -> MercuryRL does not include any RL algorithms.
[10/25] q9 -> MercuryRL does not include any RL algorithms.
[11/25] q10 -> AuroraCalc uses FFT-based multiplication to factor polynomials over the integers and rationals.


In [None]:
# Optional, Colab only: mount Google Drive at /content/drive.
# NOTE(review): this cell has no execution count, and no other visible cell reads or
# writes under /content/drive - confirm whether it is still needed.
from google.colab import drive
drive.mount('/content/drive')

In [105]:
# Serialise the answers (4-space indent, non-ASCII characters kept verbatim) and write them out.
serialized_answers = json.dumps(rag_answers, indent=4, ensure_ascii=False)
Path(OUTPUT_PATH).write_text(serialized_answers, encoding="utf-8")
print("Saved rag_answers to", OUTPUT_PATH)

Saved rag_answers to /content/submissions/rag_answers.json


In [106]:
def simple_normalize(s):
    """Lower-case ``s``, drop every character that is neither alphanumeric nor whitespace, and trim.

    Punctuation is removed without inserting a space, so "role-based" becomes "rolebased".
    """
    return "".join(ch for ch in s.lower() if ch.isalnum() or ch.isspace()).strip()


def token_f1(pred, golds):
    """Best token-level F1 between ``pred`` and any string in ``golds``.

    Args:
        pred: predicted answer string.
        golds: list of acceptable strings (from questions.json 'answers' list).

    Returns:
        ``(best_f1, best_gold)``. ``best_gold`` is the gold string that achieved
        ``best_f1``; it is ``None`` when no gold scores above 0 (including an
        empty ``golds`` list).
    """
    from collections import Counter

    # Count tokens as a multiset so repeated words are matched one-for-one.
    # A plain set intersection divided by list lengths would score identical
    # strings with repeated tokens (e.g. "a a" vs "a a") below 1.0.
    pred_counts = Counter(simple_normalize(pred).split())
    n_pred = sum(pred_counts.values())

    best_f1 = 0.0
    best_gold = None
    for g in golds:
        gold_counts = Counter(simple_normalize(g).split())
        n_gold = sum(gold_counts.values())
        if n_pred == 0 and n_gold == 0:
            f1 = 1.0  # both empty after normalisation: treated as a perfect match
        elif n_pred == 0 or n_gold == 0:
            f1 = 0.0
        else:
            overlap = sum((pred_counts & gold_counts).values())
            if overlap == 0:
                f1 = 0.0
            else:
                prec = overlap / n_pred
                rec = overlap / n_gold
                f1 = 2 * (prec * rec) / (prec + rec)
        # Strict '>' keeps the first gold on ties and leaves best_gold None at 0.
        if f1 > best_f1:
            best_f1 = f1
            best_gold = g
    return best_f1, best_gold

# Run the evaluation for every question that carries a non-empty 'answers' list.
# NOTE(review): the sample question printed earlier in this notebook has single keywords
# in 'answers'; each entry is scored here as a separate acceptable answer (best one wins).
# Confirm that this matches the intended gold format.
eval_results = {}
total_exact = 0
total_f1 = 0.0
n_eval = 0

for i, q in enumerate(questions):
    if not isinstance(q, dict):
        continue  # a bare (non-dict) question has no gold answers to score against
    # Same key rule as when the answers were stored: own id if present, else position.
    qid = q.get("id") or f"q{i}"
    golds = q.get("answers") or []
    if not golds:
        continue
    pred = rag_answers.get(qid, "")
    # exact match (case-insensitive normalized)
    normalized_pred = simple_normalize(pred)
    if any(normalized_pred == simple_normalize(g) for g in golds):
        total_exact += 1
    f1, best_gold = token_f1(pred, golds)
    total_f1 += f1
    n_eval += 1
    eval_results[qid] = {"pred": pred, "best_gold": best_gold, "f1": f1}

if n_eval > 0:
    print(f"Exact match: {total_exact}/{n_eval} = {total_exact/n_eval:.2%}")
    print(f"Avg token-F1: {total_f1/n_eval:.3f}")
else:
    print("No gold answers present for evaluation.")

Exact match: 1/25 = 4.00%
Avg token-F1: 0.138
