In [None]:
%%bash
# Cell 1: install dependencies.
# No credentials are set here on purpose:
#   * `%%bash` runs in a child process, so an `export` never reaches the notebook kernel;
#   * a token pasted into a notebook is leaked to everyone who can read the file.
# Provide the token through the HF_TOKEN environment variable of the process that
# launches Jupyter (or run `huggingface-cli login` once), and revoke any token that
# was previously committed in this cell.
pip install torch transformers tqdm jsonschema sentence-transformers faiss-cpu huggingface_hub


In [None]:
# Cell 2: Imports & Configuration
import os, json, logging
from pathlib import Path

# Authenticate to the Hugging Face Hub without embedding a secret in the notebook.
# huggingface_hub reads HF_TOKEN (HUGGING_FACE_HUB_TOKEN is its deprecated alias);
# the HUGGINGFACE_HUB_TOKEN name this notebook used before is still accepted as an
# input and kept in sync so anything that relied on it keeps working.
import warnings

HF_TOKEN_ENV_VARS = ("HF_TOKEN", "HUGGING_FACE_HUB_TOKEN", "HUGGINGFACE_HUB_TOKEN")


def resolve_hf_token(env=os.environ):
    """Return the first non-empty Hub token found in ``env``, or ``None``.

    Args:
        env: mapping of environment-variable names to values (defaults to
            ``os.environ``). Names are tried in ``HF_TOKEN_ENV_VARS`` order.
    """
    for name in HF_TOKEN_ENV_VARS:
        value = env.get(name, "").strip()
        if value:
            return value
    return None


_hf_token = resolve_hf_token()
if _hf_token is not None:
    os.environ["HF_TOKEN"] = _hf_token
    os.environ["HUGGINGFACE_HUB_TOKEN"] = _hf_token
else:
    # Do not abort: a token cached by `huggingface-cli login` may still be available.
    warnings.warn(
        "No Hugging Face token found in the environment (set HF_TOKEN); "
        "downloading gated models will fail unless you are logged in via the CLI."
    )

import faiss
import jsonschema
import torch
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# ─── Config ───────────────────────────────────────────────────────────────
# Hub identifier of the causal LM used for generation.
MODEL_NAME = "meta-llama/Meta-Llama-3-70B-Instruct"

# Input: one seed seating puzzle per line.
CORPUS_FILE = Path("seating_corpus.txt")
# Declared final JSON path.
# NOTE(review): main() writes to a different, hard-coded location — confirm which is intended.
OUTPUT_FILE = Path("dataset_seating.json")

# Generation volume: BATCHES batches of BATCH_SIZE questions each.
# NOTE(review): generate_batch asks for BATCH_SIZE puzzles within max_new_tokens=4096;
# at ~100 tokens per puzzle that budget looks too small — confirm.
BATCH_SIZE = 100
TOTAL = 4_000
BATCHES = TOTAL // BATCH_SIZE

# Attempts allowed per batch before giving up.
RETRIES = 3
# Number of seed examples retrieved (RAG) per batch.
TOP_K = 5
# GPU index.
DEVICE = 0

# JSON‐schema to enforce each batch
SCHEMA = {
    "type": "array",
    # A batch must contain exactly BATCH_SIZE items.
    "minItems": BATCH_SIZE,
    "maxItems": BATCH_SIZE,
    "items": {
        "type": "object",
        "properties": {
            "topic": {"const": "Seating Arrangements"},
            "question": {"type": "string"},
            # Exactly four string choices.
            "choices": {
                "type": "array",
                "minItems": 4,
                "maxItems": 4,
                "items": {"type": "string"},
            },
            # A single letter A-D.
            "answer": {"type": "string", "pattern": "^[A-D]$"},
            "explanation": {"type": "string"},
        },
        # Every key is mandatory and no other keys are accepted.
        "required": ["topic", "question", "choices", "answer", "explanation"],
        "additionalProperties": False,
    },
}

# Prompt templates. These are f-strings so that (a) the doubled braces render as the
# single `{` / `}` of a real JSON object — as a plain string the model was shown
# `{{ ... }}` — and (b) the requested item count always matches BATCH_SIZE, which is
# also what SCHEMA enforces.
SYSTEM_BASE = f"""You are a question-generation engine. Output **only** a JSON array of
{BATCH_SIZE} distinct, high-quality “Seating Arrangements” puzzles in this schema:

[
  {{
    "topic": "Seating Arrangements",
    "question": "...",
    "choices": ["A) ...","B) ...","C) ...","D) ..."],
    "answer": "<A|B|C|D>",
    "explanation": "<≤100‐word rationale>"
  }},
  … {BATCH_SIZE} items total …
]

Constraints:
- Exactly {BATCH_SIZE} objects, no extra keys.
- Include both linear and circular seating scenarios.
- Exclude pure permutation/combination formula questions.
- Vary people, seat‐counts, and positional constraints.
- Keep each block under ~100 tokens.
- **Output nothing but the JSON array**.
"""

USER_INSTR = f"Generate {BATCH_SIZE} such questions now."

# Timestamped INFO-level output on the root logger; `logger` is that root logger.
logging.basicConfig(
    format="%(asctime)s %(levelname)s:%(message)s",
    level=logging.INFO,
)
logger = logging.getLogger()

# ─── Build FAISS index ───────────────────────────────────────────────────────
def build_index(corpus_path: Path):
    """Embed the seed corpus and index it for nearest-neighbour retrieval.

    Args:
        corpus_path: UTF-8 text file with one seed puzzle per line; blank
            lines are ignored and surrounding whitespace is stripped.

    Returns:
        ``(index, lines, embedder)``: the FAISS ``IndexFlatL2`` holding one
        vector per line, the stripped lines in the order they were added, and
        the ``SentenceTransformer`` that produced the vectors (so queries can
        be encoded with the same model).

    Raises:
        ValueError: if the file contains no non-blank lines.
    """
    # Explicit encoding: the platform default is not guaranteed to be UTF-8.
    raw_text = corpus_path.read_text(encoding="utf-8")
    lines = [line.strip() for line in raw_text.splitlines() if line.strip()]
    if not lines:
        # Fail early with a clear message instead of an IndexError on embs.shape[1].
        raise ValueError(f"{corpus_path} contains no seed puzzles")

    embedder = SentenceTransformer("all-MiniLM-L6-v2")
    embs = embedder.encode(lines, convert_to_numpy=True, show_progress_bar=True)
    idx = faiss.IndexFlatL2(embs.shape[1])
    idx.add(embs)
    return idx, lines, embedder

# ─── Init Llama + pipeline ─────────────────────────────────────────────────
logger.info(f"Loading model {MODEL_NAME}…")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# device_map="auto" hands placement of the fp16 weights to accelerate.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,
    device_map="auto",
)
# No `device=` argument: the model is already dispatched by accelerate, and the
# pipeline cannot move such a model to a single device (depending on the
# transformers version, asking it to either warns or raises).
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    return_full_text=False,
)

# ─── Generate one batch ─────────────────────────────────────────────────────
def generate_batch(i, idx, corpus, embd):
    """Generate one schema-valid batch of puzzles, retrying on bad output.

    Args:
        i: zero-based batch number (used only in log and error messages).
        idx: FAISS index searched for example puzzles.
        corpus: seed puzzle strings; ``corpus[j]`` is the text for index id ``j``.
        embd: embedder used to encode the retrieval query.

    Returns:
        A list of ``BATCH_SIZE`` dicts that validates against ``SCHEMA``.

    Raises:
        RuntimeError: if none of the ``RETRIES`` attempts yields a valid batch.
    """
    # Retrieve the seed puzzles nearest to a fixed query to use as in-prompt examples.
    q_emb = embd.encode(["Seating Arrangements puzzle"], convert_to_numpy=True)
    _, ids = idx.search(q_emb, TOP_K)
    # FAISS pads the result with -1 when the index holds fewer than TOP_K vectors;
    # unfiltered, corpus[-1] would silently repeat the last seed line.
    exs = [corpus[j] for j in ids[0] if j >= 0]
    rag = "\n\nExamples:\n" + "\n".join(f"- {e}" for e in exs) + "\n\n"
    # NOTE(review): the prompt is sent as raw text with a "USER:" marker rather than
    # through the tokenizer's chat template — confirm this is intended for an Instruct model.
    prompt = SYSTEM_BASE + rag + "USER:\n" + USER_INSTR

    for attempt in range(1, RETRIES + 1):
        logger.info(f"Batch {i+1}/{BATCHES} attempt {attempt}")
        # NOTE(review): 4096 new tokens may be too few for BATCH_SIZE puzzles of ~100
        # tokens each; truncated output fails JSON parsing and burns a retry — confirm.
        out = generator(prompt, max_new_tokens=4096,
                        temperature=0.8, top_p=0.95,
                        do_sample=True,
                        pad_token_id=tokenizer.eos_token_id)[0]["generated_text"]
        try:
            # Keep only the outermost [...] span in case the model adds text around it.
            js = out[out.index("["):out.rindex("]") + 1]
            arr = json.loads(js)
            jsonschema.validate(arr, SCHEMA)
        except Exception as e:
            # Deliberately broad: any parse/validation problem just costs one attempt.
            logger.warning(f"  failed: {e}")
            continue
        if len(arr) == BATCH_SIZE:
            return arr
        # Defensive: SCHEMA already pins the length, but never retry silently.
        logger.warning(f"  failed: expected {BATCH_SIZE} items, got {len(arr)}")
    raise RuntimeError(f"Batch {i+1} failed after {RETRIES} retries")

# ─── Main Loop ────────────────────────────────────────────────────────────────
def main(out_path=None):
    """Generate every batch, de-duplicate the questions, and save them as JSON.

    Args:
        out_path: destination file (str or Path). Defaults to
            ``/jupyter-tutorial/DATASET/seating_arrangements_full.json``.

    Raises:
        RuntimeError: propagated from ``generate_batch`` when a batch exhausts
            its retries.
        ValueError: propagated from ``build_index`` when the corpus is empty.
    """
    # Build index and generate all questions
    idx, corpus, embd = build_index(CORPUS_FILE)
    all_qs = []
    for i in tqdm(range(BATCHES), desc="Generating"):
        all_qs.extend(generate_batch(i, idx, corpus, embd))

    # Deduplicate by question text: first-seen order is kept, and when a question
    # repeats the later entry replaces the earlier one.
    deduped = list({q["question"]: q for q in all_qs}.values())
    if len(deduped) < TOTAL:
        logger.warning(f"Only {len(deduped)} unique after dedupe")

    # Resolve the destination and make sure its directory exists.
    if out_path is None:
        full_out = Path("/jupyter-tutorial/DATASET") / "seating_arrangements_full.json"
    else:
        full_out = Path(out_path)
    # parents=True so a missing parent directory cannot abort the save after all
    # of the generation work has already been done.
    full_out.parent.mkdir(parents=True, exist_ok=True)

    # Save only the full dataset
    with open(full_out, "w", encoding="utf-8") as f:
        json.dump(deduped, f, indent=2)

    print(f"\n✅ Full dataset saved to: {full_out.resolve()}")


# Execute in Jupyter (a notebook cell runs with __name__ == "__main__")
if __name__ == "__main__":
    main()
