In [1]:
!pip install -q sentence-transformers transformers accelerate bitsandbytes torch --upgrade

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.1/40.1 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.6/486.6 kB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.6/11.6 MB[0m [31m44.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import os, json, textwrap, math
from pathlib import Path
import torch
import torch.nn.functional as F
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

Device: cuda


In [None]:
DOCS_PATH = "/content/docs.jsonl"        
QUESTIONS_PATH = "/content/questions.json"  
OUTPUT_DIR = "/content/submissions"
OUTPUT_PATH = os.path.join(OUTPUT_DIR, "rag_answers.json")
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [None]:
docs = []
with open(DOCS_PATH, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        obj = json.loads(line)
        title = obj.get("title", "").strip()
        text = obj.get("text", "").strip()
        combined = f"{title}: {text}" if title else text
        docs.append({"id": obj.get("id"), "title": title, "text": text, "chunk": combined})


chunked_texts = [d["chunk"] for d in docs]
print(f"Loaded {len(chunked_texts)} chunks (one per document). Example:\n", chunked_texts[:1])


Loaded 27 chunks (one per document). Example:
 ['NebulaDB: The default port for NebulaDB is 7342. NebulaDB provides role-based access control with three roles: reader, writer, admin.']


In [5]:
with open(QUESTIONS_PATH, "r", encoding="utf-8") as f:
    questions = json.load(f)

# questions expected to be list of {"id": "...", "question": "...", "answers": [...] } or similar
print(f"Loaded {len(questions)} questions. Example:\n", questions[:1])


Loaded 25 questions. Example:
 [{'id': 'q0', 'question': 'State a key feature of NebulaDB.', 'answers': ['nebuladb', 'lightweight', 'document', 'store', 'built', 'edge']}]


In [None]:
embed_model_name = "sentence-transformers/all-MiniLM-L6-v2"
print("Loading embeddings model:", embed_model_name)
embed_model = SentenceTransformer(embed_model_name, device=device)


batch_size = 50
emb_batches = []
for i in range(0, len(chunked_texts), batch_size):
    batch = chunked_texts[i:i+batch_size]
    emb = embed_model.encode(batch, convert_to_tensor=True, show_progress_bar=True)
    emb_batches.append(emb)
doc_embeddings = torch.cat(emb_batches, dim=0) 
doc_embeddings = F.normalize(doc_embeddings, p=2, dim=1)  
print("Document embeddings shape:", doc_embeddings.shape)

Loading embeddings model: sentence-transformers/all-MiniLM-L6-v2


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Document embeddings shape: torch.Size([27, 384])


In [None]:
llm_model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
print("Loading LLM:", llm_model_name)

tokenizer = AutoTokenizer.from_pretrained(llm_model_name, use_fast=True)


model = AutoModelForCausalLM.from_pretrained(
    llm_model_name,
    device_map="auto",
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True
)

generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=60,
    do_sample=False,
    temperature=0.8
)


print("LLM pipeline ready.")

Loading LLM: TinyLlama/TinyLlama-1.1B-Chat-v1.0


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


LLM pipeline ready.


In [None]:
top_k = 5  
similarity_threshold = 0.47

rag_answers = {}

for i, q in enumerate(questions):
    if isinstance(q, dict):
        question_text = q.get("question") or q.get("q") or q.get("text") or str(q)
    else:
        question_text = str(q)

    query_emb = embed_model.encode(question_text, convert_to_tensor=True)
    query_emb = F.normalize(query_emb, p=2, dim=0)

    #  Cosine similarity
    scores = torch.matmul(doc_embeddings, query_emb)
    top_vals, top_idx = torch.topk(scores, k=top_k)

    best_chunk = None
    for idx, score in zip(top_idx, top_vals):
        if score >= similarity_threshold:
            best_chunk = chunked_texts[idx]
            break

    if not best_chunk:
        best_chunk = "Not found in context"

    context = best_chunk if best_chunk != "Not found in context" else ""

    #  Prompt for LLM
    prompt = textwrap.dedent(f"""
    Answer the following question using ONLY the provided context.
    Be concise and factual (one sentence).
    If the context does not contain the answer, reply exactly: Not found in context.

    Context:
    {context}

    Question:
    {question_text}

    Answer:
    """)

    #  Generate answer
    if best_chunk == "Not found in context":
        answer = best_chunk
    else:
        out = generator(prompt)
        raw = out[0]["generated_text"]
        answer = raw.split("Answer:")[-1].strip()
        if not answer or len(answer) < 2:
            answer = "Not found in context"

    rag_answers_key = q.get("id") if isinstance(q, dict) and q.get("id") else f"q{i}"
    rag_answers[rag_answers_key] = answer

    print(f"[{i+1}/{len(questions)}] {rag_answers_key} -> {answer}")




[1/25] q0 -> NebulaDB is a distributed NoSQL database that provides a scalable, fault-tolerant, and high-performance solution for storing and querying large volumes of data.
[2/25] q1 -> NebulaDB is a distributed NoSQL database that provides a scalable, fault-tolerant, and high-performance solution for storing and querying large volumes of data.
[3/25] q2 -> NebulaDB is a distributed NoSQL database that provides a scalable, fault-tolerant, and high-performance solution for storing and querying large volumes of data.
[4/25] q3 -> The default port for NebulaDB is 7342.
[5/25] q4 -> Admin
[6/25] q5 -> MercuryRL does not include any RL algorithms.
[7/25] q6 -> MercuryRL does not include any RL algorithms.
[8/25] q7 -> MercuryRL does not include any RL algorithms.
[9/25] q8 -> MercuryRL does not include any RL algorithms.
[10/25] q9 -> MercuryRL does not include any RL algorithms.
[11/25] q10 -> AuroraCalc uses FFT-based multiplication to factor polynomials over the integers and rationals.


In [10]:
with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    json.dump(rag_answers, f, indent=4, ensure_ascii=False)
print("Saved rag_answers to", OUTPUT_PATH)

Saved rag_answers to /content/submissions/rag_answers.json
