# configure path and parameters

In [2]:
from pathlib import Path

In [4]:
BASE_DIR = Path.cwd()
DATA_DIR = BASE_DIR / "data"
PDF_DIR = DATA_DIR / "pdfs"
RAW_TEXT_DIR = DATA_DIR / "raw_text"
CHUNKS_DIR = DATA_DIR / "chunks"
OUT_DIR = BASE_DIR / "outputs"

for p in [DATA_DIR, PDF_DIR, RAW_TEXT_DIR, CHUNKS_DIR, OUT_DIR]:
    p.mkdir(parents=True, exist_ok=True)

# Add your API key here


PDF_URLS = [
    ("GPT-3", "https://arxiv.org/pdf/2005.14165.pdf"),
    ("GPT-4", "https://arxiv.org/pdf/2303.08774.pdf"),
    ("PaLM", "https://arxiv.org/pdf/2204.02311.pdf"),
    ("PaLM2", "https://arxiv.org/pdf/2305.10403.pdf"),
    ("Gemini 1.0", "https://arxiv.org/pdf/2312.11805.pdf"),
    ("Gemini 1.5 (2024)", " https://arxiv.org/pdf/2403.05530.pdf"),
    ("Gemma (2024)", " https://arxiv.org/pdf/2403.08295.pdf"),
    ("Gemma 2 (2024)", " https://arxiv.org/pdf/2408.00118.pdf"),
    ("Gemma 3", " https://arxiv.org/pdf/2503.19786.pdf"),
    ("CodeGemma (2024)", " https://arxiv.org/pdf/2406.11409.pdf"),
    ("RecurrentGemma (2024)", " https://arxiv.org/pdf/2404.07839.pdf"),
    ("LLaMA (2023)", " https://arxiv.org/pdf/2302.13971.pdf"),
    ("Llama 2 (2023)", " https://arxiv.org/pdf/2307.09288.pdf"),
    ("Llama 3 (2024)", " https://arxiv.org/pdf/2407.21783.pdf"),
    # Mistral
    ("Mistral 7B (2023)", " https://arxiv.org/pdf/2310.06825.pdf"),
    ("Mixtral of Experts 8x7B (2024)", " https://arxiv.org/pdf/2401.04088.pdf"),
    # NVIDIA
    ("Nemotron-4 340B Technical Report (2024)", " https://arxiv.org/pdf/2406.11704.pdf"),
    ("NVLM 1.0 (2024)", " https://arxiv.org/pdf/2409.11402.pdf"),
    # Alibaba / Qwen series
    ("Qwen2 Technical Report (2024)", " https://arxiv.org/pdf/2407.10671.pdf"),
    ("Qwen2-VL (2024)", " https://arxiv.org/pdf/2409.12191.pdf"),
    ("Qwen2-Audio (2024)", " https://arxiv.org/pdf/2407.10759.pdf"),
    ("Qwen2.5 Technical Report (2024)", " https://arxiv.org/pdf/2412.15115.pdf"),
    ("Qwen2.5-VL Technical Report (2025)", " https://arxiv.org/pdf/2502.13923.pdf"),
    ("Qwen2.5-Omni Technical Report (2025)", " https://arxiv.org/pdf/2503.20215.pdf"),
    ("Qwen3 Technical Report (2025)", " https://arxiv.org/pdf/2505.09388.pdf"),
    # DeepSeek series
    ("DeepSeek-V2 (2024)", " https://arxiv.org/pdf/2405.04434.pdf"),
    ("DeepSeek-V3 Technical Report (2024)", " https://arxiv.org/pdf/2412.19437.pdf"),
    ("DeepSeek-R1 (2025)", " https://arxiv.org/pdf/2501.12948.pdf"),
    ("DeepSeek-Coder (2024)", " https://arxiv.org/pdf/2401.14196.pdf"),
    # ZhipuAI
    ("GLM-130B (2022)", " https://arxiv.org/pdf/2210.02414.pdf"),
    # Shanghai AI Lab
    ("InternLM2 Technical Report (2024)", " https://arxiv.org/pdf/2403.17297.pdf"),
    ("InternVL 2.5 (2024)", " https://arxiv.org/pdf/2412.05271.pdf"),
    # Microsoft
    ("Phi-3 Technical Report (2024)", " https://arxiv.org/pdf/2404.14219.pdf"),
    ("Phi-3 Safety Post-Training (2024)", " https://arxiv.org/pdf/2407.13833.pdf"),
    # AI21
    ("Jamba: Hybrid Transformer–Mamba (2024)", " https://arxiv.org/pdf/2403.19887.pdf"),
    # Huawei
    ("PanGu-Σ (2023)", " https://arxiv.org/pdf/2303.10845.pdf"),
    # 01.AI
    ("Yi: Open Foundation Models (2024)", " https://arxiv.org/pdf/2403.04652.pdf")
]

MAX_TOKENS_PER_CHUNK = 700
MIN_TOKENS_PER_CHUNK = 300
OVERLAP_TOKENS = 80
N_TRIPLETS_PER_CHUNK = 2
TEMPERATURE = 0.3

SYSTEM_PROMPT = (
    "You are a meticulous data constructor. Given a technical passage, "
    "produce instruction-tuning triplets that are useful for training. "
    "Prefer concrete, unambiguous, domain-grounded questions."
)


# Utilities

In [8]:
try:
    import tiktoken
    _ENC = tiktoken.get_encoding("cl100k_base")
except Exception:
    _ENC = None

In [10]:
import requests
import re
from pathlib import Path
from PyPDF2 import PdfReader

# If you use pdfminer.six:
from pdfminer.high_level import extract_text as pdfminer_extract_text

from tenacity import retry, stop_after_attempt, wait_exponential_jitter

@retry(stop=stop_after_attempt(3), wait=wait_exponential_jitter(initial=1, max=6))
def download_file(url: str, dest: Path) -> Path:
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    with open(dest, "wb") as f:
        f.write(r.content)
    return dest

def extract_pdf_text(pdf_path: Path) -> str:
    try:
        text = pdfminer_extract_text(str(pdf_path))
        if text.strip():
            return text
    except Exception:
        pass
    try:
        reader = PdfReader(str(pdf_path))
        pages = [p.extract_text() or "" for p in reader.pages]
        return "\n".join(pages)
    except Exception as e:
        print(f"Failed to extract {pdf_path.name}: {e}")
        return ""

def basic_clean(text: str) -> str:
    t = text.replace("\r\n", "\n").replace("\r", "\n")
    t = re.sub(r"(?<=\w)-\n(?=\w)", "", t)     # de-hyphenate at line breaks
    t = re.sub(r"\n{3,}", "\n\n", t)           # collapse long newline runs
    t = re.sub(r"[ \t]+", " ", t)              # collapse spaces/tabs
    t = "\n".join([line.strip() for line in t.splitlines()])
    return t.strip()

def encode_tokens(s: str):
    # Expects `_ENC` (tiktoken encoding) to be defined outside this snippet
    if _ENC is None:
        return s.split()
    # OPTION B: allow all special tokens
    return _ENC.encode(s, allowed_special='all')

def token_len(s: str):
    return len(encode_tokens(s))

def split_into_token_chunks(text: str,
                            max_tokens=700,
                            overlap_tokens=80,
                            min_tokens=300):
    if not text.strip():
        return []

    if _ENC is None:
        # word-based fallback
        words = text.split()
        chunks, i = [], 0
        while i < len(words):
            j = min(i + max_tokens, len(words))
            chunk = " ".join(words[i:j])
            chunks.append(chunk)
            if j == len(words):
                break
            i = max(0, j - overlap_tokens)
        if chunks and token_len(chunks[-1]) < min_tokens and len(chunks) > 1:
            chunks[-2] += "\n\n" + chunks[-1]
            chunks.pop()
        return chunks

    # OPTION B here as well
    toks = _ENC.encode(text, allowed_special='all')
    chunks, i = [], 0
    while i < len(toks):
        j = min(i + max_tokens, len(toks))
        chunks.append(_ENC.decode(toks[i:j]))
        if j == len(toks):
            break
        i = max(0, j - overlap_tokens)  # defensive bound
    if chunks and token_len(chunks[-1]) < min_tokens and len(chunks) > 1:
        chunks[-2] += "\n\n" + chunks[-1]
        chunks.pop()
    return chunks


# Download and load pdfs

In [12]:
from tqdm import tqdm

In [13]:
for name, url in tqdm(PDF_URLS):
    filename = name.replace(" ", "_").replace("/", "_") + ".pdf"
    dest = PDF_DIR / filename
    if dest.exists():
        print(f"[SKIP] {filename}")
        continue
    try:
        r = requests.get(url, timeout=60)
        r.raise_for_status()
        with open(dest, "wb") as f:
            f.write(r.content)
        print(f"[OK] {filename}")
    except Exception as e:
        print(f"[FAIL] {filename}: {e}")

print("All PDFs stored in:", PDF_DIR)

  3%|▎         | 1/37 [00:12<07:14, 12.08s/it]

[OK] GPT-3.pdf


  5%|▌         | 2/37 [00:13<03:32,  6.07s/it]

[OK] GPT-4.pdf


  8%|▊         | 3/37 [00:15<02:10,  3.82s/it]

[OK] PaLM.pdf


 11%|█         | 4/37 [00:16<01:35,  2.89s/it]

[OK] PaLM2.pdf


 14%|█▎        | 5/37 [00:19<01:28,  2.76s/it]

[OK] Gemini_1.0.pdf


 16%|█▌        | 6/37 [00:21<01:20,  2.60s/it]

[OK] Gemini_1.5_(2024).pdf


 19%|█▉        | 7/37 [00:23<01:10,  2.33s/it]

[OK] Gemma_(2024).pdf


 22%|██▏       | 8/37 [00:24<00:54,  1.87s/it]

[OK] Gemma_2_(2024).pdf


 24%|██▍       | 9/37 [00:24<00:43,  1.56s/it]

[OK] Gemma_3.pdf


 27%|██▋       | 10/37 [00:26<00:39,  1.45s/it]

[OK] CodeGemma_(2024).pdf


 30%|██▉       | 11/37 [00:27<00:40,  1.55s/it]

[OK] RecurrentGemma_(2024).pdf


 32%|███▏      | 12/37 [00:29<00:36,  1.47s/it]

[OK] LLaMA_(2023).pdf


 35%|███▌      | 13/37 [00:38<01:33,  3.91s/it]

[OK] Llama_2_(2023).pdf


 38%|███▊      | 14/37 [00:44<01:42,  4.45s/it]

[OK] Llama_3_(2024).pdf


 41%|████      | 15/37 [00:46<01:23,  3.78s/it]

[OK] Mistral_7B_(2023).pdf


 43%|████▎     | 16/37 [00:47<01:00,  2.88s/it]

[OK] Mixtral_of_Experts_8x7B_(2024).pdf


 46%|████▌     | 17/37 [00:48<00:48,  2.42s/it]

[OK] Nemotron-4_340B_Technical_Report_(2024).pdf


 49%|████▊     | 18/37 [00:54<01:02,  3.30s/it]

[OK] NVLM_1.0_(2024).pdf


 51%|█████▏    | 19/37 [00:56<00:54,  3.03s/it]

[OK] Qwen2_Technical_Report_(2024).pdf


 54%|█████▍    | 20/37 [01:12<01:57,  6.91s/it]

[OK] Qwen2-VL_(2024).pdf


 57%|█████▋    | 21/37 [01:14<01:25,  5.34s/it]

[OK] Qwen2-Audio_(2024).pdf


 59%|█████▉    | 22/37 [01:15<01:01,  4.07s/it]

[OK] Qwen2.5_Technical_Report_(2024).pdf


 62%|██████▏   | 23/37 [01:16<00:44,  3.19s/it]

[OK] Qwen2.5-VL_Technical_Report_(2025).pdf


 65%|██████▍   | 24/37 [01:18<00:37,  2.88s/it]

[OK] Qwen2.5-Omni_Technical_Report_(2025).pdf


 68%|██████▊   | 25/37 [01:20<00:30,  2.56s/it]

[OK] Qwen3_Technical_Report_(2025).pdf


 70%|███████   | 26/37 [01:21<00:22,  2.09s/it]

[OK] DeepSeek-V2_(2024).pdf


 73%|███████▎  | 27/37 [01:23<00:21,  2.17s/it]

[OK] DeepSeek-V3_Technical_Report_(2024).pdf


 76%|███████▌  | 28/37 [01:24<00:16,  1.80s/it]

[OK] DeepSeek-R1_(2025).pdf


 78%|███████▊  | 29/37 [01:25<00:12,  1.54s/it]

[OK] DeepSeek-Coder_(2024).pdf


 81%|████████  | 30/37 [01:28<00:12,  1.81s/it]

[OK] GLM-130B_(2022).pdf


 84%|████████▍ | 31/37 [01:29<00:10,  1.81s/it]

[OK] InternLM2_Technical_Report_(2024).pdf


 86%|████████▋ | 32/37 [01:31<00:08,  1.80s/it]

[OK] InternVL_2.5_(2024).pdf


 89%|████████▉ | 33/37 [01:34<00:08,  2.01s/it]

[OK] Phi-3_Technical_Report_(2024).pdf


 92%|█████████▏| 34/37 [01:34<00:04,  1.60s/it]

[OK] Phi-3_Safety_Post-Training_(2024).pdf


 95%|█████████▍| 35/37 [01:35<00:02,  1.48s/it]

[OK] Jamba:_Hybrid_Transformer–Mamba_(2024).pdf


 97%|█████████▋| 36/37 [01:38<00:01,  1.77s/it]

[OK] PanGu-Σ_(2023).pdf


100%|██████████| 37/37 [01:52<00:00,  3.03s/it]

[OK] Yi:_Open_Foundation_Models_(2024).pdf
All PDFs stored in: e:\Hermon\Homework2\data\pdfs





# Extract and clean text


In [14]:
pdfs = list(PDF_DIR.glob("*.pdf"))
print(f"Found {len(pdfs)} PDFs")

for pdf in tqdm(pdfs):
    out = RAW_TEXT_DIR / (pdf.stem + ".txt")
    if out.exists(): continue
    raw = extract_pdf_text(pdf)
    clean = basic_clean(raw)
    out.write_text(clean, encoding="utf-8")
    print("Extracted:", pdf.name)


Found 35 PDFs


  3%|▎         | 1/35 [00:00<00:25,  1.31it/s]

Extracted: CodeGemma_(2024).pdf


  6%|▌         | 2/35 [00:02<00:44,  1.36s/it]Cannot set gray non-stroke color because /'H1' is an invalid float value
Cannot set gray non-stroke color because /'H1' is an invalid float value
Cannot set gray non-stroke color because /'H1' is an invalid float value
Cannot set gray non-stroke color because /'H1' is an invalid float value
Cannot set gray non-stroke color because /'H1' is an invalid float value
Cannot set gray non-stroke color because /'H1' is an invalid float value
Cannot set gray non-stroke color because /'H1' is an invalid float value


Extracted: DeepSeek-Coder_(2024).pdf


  9%|▊         | 3/35 [00:03<00:39,  1.25s/it]

Extracted: DeepSeek-R1_(2025).pdf


Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set gray non-stroke color because /'P2' is an invalid float value
Cannot set gray non-stroke color because /'P3' is an invalid float value
Cannot set gray non-stroke color because /'P4' is an invalid float value
Cannot set gray non-stroke color because /'P5' is an invalid float value
Cannot set gray non-stroke color because /'P6' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set gray non-stroke color because /'P2' is an invalid float value
Cannot set gray non-stroke color because /'P3' is an invalid float value
Cannot set gray non-stroke color because /'P4' is an invalid float value
Cannot set gray non-stroke color because /'P5' is an invalid float value
Cannot set gray non-stroke color because /'P6' is a

Extracted: DeepSeek-V2_(2024).pdf


Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set gray non-stroke color because /'P2' is an invalid float value
Cannot set gray non-stroke color because /'P3' is an invalid float value
Cannot set gray non-stroke color because /'P4' is an invalid float value
Cannot set gray non-stroke color because /'P5' is an invalid float value
Cannot set gray non-stroke color because /'P6' is an invalid float value
 14%|█▍        | 5/35 [00:13<01:50,  3.67s/it]

Extracted: DeepSeek-V3_Technical_Report_(2024).pdf


 17%|█▋        | 6/35 [00:20<02:16,  4.72s/it]

Extracted: Gemini_1.0.pdf


 20%|██        | 7/35 [00:32<03:17,  7.06s/it]

Extracted: Gemini_1.5_(2024).pdf


 23%|██▎       | 8/35 [00:33<02:20,  5.20s/it]

Extracted: Gemma_(2024).pdf


 26%|██▌       | 9/35 [00:34<01:44,  4.03s/it]

Extracted: Gemma_2_(2024).pdf


 29%|██▊       | 10/35 [00:36<01:21,  3.26s/it]

Extracted: Gemma_3.pdf


 31%|███▏      | 11/35 [00:47<02:18,  5.75s/it]

Extracted: GLM-130B_(2022).pdf


 34%|███▍      | 12/35 [00:51<01:59,  5.19s/it]

Extracted: GPT-3.pdf


 37%|███▋      | 13/35 [00:56<01:49,  4.97s/it]

Extracted: GPT-4.pdf


Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
 40%|████      | 14/35 [01:00<01:43,  4.92s/it]

Extracted: InternLM2_Technical_Report_(2024).pdf


 43%|████▎     | 15/35 [01:05<01:35,  4.78s/it]

Extracted: InternVL_2.5_(2024).pdf


 46%|████▌     | 16/35 [01:07<01:16,  4.03s/it]

Extracted: LLaMA_(2023).pdf


 49%|████▊     | 17/35 [01:30<02:53,  9.62s/it]

Extracted: Llama_2_(2023).pdf


 51%|█████▏    | 18/35 [01:44<03:07, 11.03s/it]

Extracted: Llama_3_(2024).pdf


 54%|█████▍    | 19/35 [01:45<02:06,  7.88s/it]

Extracted: Mistral_7B_(2023).pdf


 57%|█████▋    | 20/35 [01:46<01:27,  5.85s/it]

Extracted: Mixtral_of_Experts_8x7B_(2024).pdf


 60%|██████    | 21/35 [01:47<01:04,  4.60s/it]

Extracted: Nemotron-4_340B_Technical_Report_(2024).pdf


 63%|██████▎   | 22/35 [01:53<01:02,  4.83s/it]

Extracted: NVLM_1.0_(2024).pdf


 66%|██████▌   | 23/35 [01:59<01:04,  5.36s/it]

Extracted: PaLM.pdf


Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set gray non-stroke color because /'P2' is an invalid float value
Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set gray non-stroke color because /'P1' is an invalid float value
 69%|██████▊   | 24/35 [02:05<00:59,  5.45s/it]

Extracted: PaLM2.pdf


Cannot set gray non-stroke color because /'P14' is an invalid float value
 71%|███████▏  | 25/35 [02:10<00:53,  5.40s/it]

Extracted: PanGu-Σ_(2023).pdf


 74%|███████▍  | 26/35 [02:11<00:36,  4.03s/it]

Extracted: Phi-3_Safety_Post-Training_(2024).pdf


 77%|███████▋  | 27/35 [02:13<00:26,  3.29s/it]

Extracted: Phi-3_Technical_Report_(2024).pdf


 80%|████████  | 28/35 [02:14<00:18,  2.66s/it]

Extracted: Qwen2-Audio_(2024).pdf


 83%|████████▎ | 29/35 [02:17<00:16,  2.70s/it]

Extracted: Qwen2-VL_(2024).pdf


 86%|████████▌ | 30/35 [02:18<00:11,  2.29s/it]

Extracted: Qwen2.5-Omni_Technical_Report_(2025).pdf


 89%|████████▊ | 31/35 [02:20<00:08,  2.08s/it]

Extracted: Qwen2.5-VL_Technical_Report_(2025).pdf


 91%|█████████▏| 32/35 [02:21<00:05,  1.99s/it]

Extracted: Qwen2.5_Technical_Report_(2024).pdf


 94%|█████████▍| 33/35 [02:23<00:03,  1.85s/it]

Extracted: Qwen2_Technical_Report_(2024).pdf


 97%|█████████▋| 34/35 [02:26<00:02,  2.17s/it]

Extracted: Qwen3_Technical_Report_(2025).pdf


100%|██████████| 35/35 [02:26<00:00,  4.19s/it]

Extracted: RecurrentGemma_(2024).pdf





# Chunk text into segments

In [16]:
!pip install jsonlines



In [17]:
import jsonlines

In [18]:
text_files = list(RAW_TEXT_DIR.glob("*.txt"))
print(f"Found {len(text_files)} text files")

for txt in tqdm(text_files):
    text = txt.read_text(encoding="utf-8")
    chunks = split_into_token_chunks(text, MAX_TOKENS_PER_CHUNK, OVERLAP_TOKENS, MIN_TOKENS_PER_CHUNK)
    out = CHUNKS_DIR / (txt.stem + ".chunks.jsonl")
    with jsonlines.open(out, "w") as w:
        for i, ch in enumerate(chunks):
            w.write({"source": txt.name, "chunk_id": i, "tokens": token_len(ch), "text": ch})
    print(f"Chunked {txt.name}: {len(chunks)} chunks")


Found 35 text files


 14%|█▍        | 5/35 [00:00<00:00, 38.97it/s]

Chunked CodeGemma_(2024).txt: 16 chunks
Chunked DeepSeek-Coder_(2024).txt: 32 chunks
Chunked DeepSeek-R1_(2025).txt: 27 chunks
Chunked DeepSeek-V2_(2024).txt: 63 chunks
Chunked DeepSeek-V3_Technical_Report_(2024).txt: 70 chunks
Chunked Gemini_1.0.txt: 94 chunks


 34%|███▍      | 12/35 [00:00<00:00, 24.03it/s]

Chunked Gemini_1.5_(2024).txt: 182 chunks
Chunked Gemma_(2024).txt: 25 chunks
Chunked Gemma_2_(2024).txt: 32 chunks
Chunked Gemma_3.txt: 38 chunks
Chunked GLM-130B_(2022).txt: 82 chunks
Chunked GPT-3.txt: 106 chunks


 43%|████▎     | 15/35 [00:00<00:00, 20.48it/s]

Chunked GPT-4.txt: 121 chunks
Chunked InternLM2_Technical_Report_(2024).txt: 72 chunks
Chunked InternVL_2.5_(2024).txt: 102 chunks
Chunked LLaMA_(2023).txt: 43 chunks


 51%|█████▏    | 18/35 [00:00<00:00, 18.65it/s]

Chunked Llama_2_(2023).txt: 115 chunks
Chunked Llama_3_(2024).txt: 151 chunks
Chunked Mistral_7B_(2023).txt: 11 chunks
Chunked Mixtral_of_Experts_8x7B_(2024).txt: 15 chunks
Chunked Nemotron-4_340B_Technical_Report_(2024).txt: 38 chunks
Chunked NVLM_1.0_(2024).txt: 60 chunks


 74%|███████▍  | 26/35 [00:01<00:00, 23.31it/s]

Chunked PaLM.txt: 133 chunks
Chunked PaLM2.txt: 122 chunks
Chunked PanGu-Σ_(2023).txt: 61 chunks
Chunked Phi-3_Safety_Post-Training_(2024).txt: 18 chunks
Chunked Phi-3_Technical_Report_(2024).txt: 30 chunks
Chunked Qwen2-Audio_(2024).txt: 17 chunks
Chunked Qwen2-VL_(2024).txt: 54 chunks
Chunked Qwen2.5-Omni_Technical_Report_(2025).txt: 36 chunks


100%|██████████| 35/35 [00:01<00:00, 26.04it/s]

Chunked Qwen2.5-VL_Technical_Report_(2025).txt: 42 chunks
Chunked Qwen2.5_Technical_Report_(2024).txt: 50 chunks
Chunked Qwen2_Technical_Report_(2024).txt: 43 chunks
Chunked Qwen3_Technical_Report_(2025).txt: 70 chunks
Chunked RecurrentGemma_(2024).txt: 8 chunks





# Triplet generation via vLLM

In [20]:
# ============================================
# Llama-3-70B triplet generation via vLLM (OpenAI-compatible)
# ============================================
import os, json, math, random, time, collections, datetime
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed

import jsonlines
from tenacity import retry, stop_after_attempt, wait_exponential_jitter

try:
    from openai import OpenAI
except Exception:
    !pip -q install openai==1.51.2
    from openai import OpenAI

# -----------------------
# CONFIG
# -----------------------
# Local vLLM server (started in step 1)
BASE_URL   = "http://127.0.0.1:8000/v1"
API_KEY    = "EMPTY"  # vLLM ignores, but the client expects a string
MODEL_NAME = "TheBloke/Llama-3-70B-Instruct-GPTQ"   # same as you passed to vLLM; or --served-model-name

# Target accounting (you already produced 73 from InternVL)
TARGET_TOTAL = 2000
ALREADY_HAVE = 73
REMAINING    = TARGET_TOTAL - ALREADY_HAVE  # 1927

# Papers to skip this run
SKIP_SUBSTRINGS = ["InternVL"]

# Concurrency
MAX_WORKERS = 8
RNG_SEED    = 13
random.seed(RNG_SEED)

# Triplet policy (per chunk)
N_TRIPLETS_PER_CHUNK = 1           # predictable volume & cost

# Paths (adjust if needed)
CHUNKS_DIR = Path("data/chunks")   # folder with *.chunks.jsonl created earlier
OUT_DIR    = Path("outputs")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Output file for this run
ts = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
triplets_path = OUT_DIR / f"triplets_run_llama70b_{ts}.jsonl"

# Prompts (same structure as your GPT-5 pipeline)
SYSTEM_PROMPT = (
    "You are a meticulous data constructor. Given a technical passage (CHUNK), "
    "produce instruction-tuning triplets that are useful for training. "
    "Prefer concrete, unambiguous, domain-grounded questions."
)
GEN_PROMPT = (
    "You will receive a technical passage (CHUNK). Produce {k} high-quality instruction-tuning triplets.\n"
    'Each triplet is a JSON object with fields: "question", "input", "response".\n'
    "Return a JSON array of triplets only."
)

# OpenAI-compatible client pointed at vLLM
client = OpenAI(base_url=BASE_URL, api_key=API_KEY)

# -----------------------
# DISCOVER ELIGIBLE PAPERS
# -----------------------
def skip_file(p: Path) -> bool:
    name = p.name.lower()
    return any(s.lower() in name for s in SKIP_SUBSTRINGS)

chunk_files = sorted(CHUNKS_DIR.glob("*.chunks.jsonl"))
eligible_files = [p for p in chunk_files if not skip_file(p)]
assert eligible_files, "No eligible papers after applying SKIP_SUBSTRINGS."

num_papers = len(eligible_files)   # should be 36 for you
base = REMAINING // num_papers
extra = REMAINING - base * num_papers  # first 'extra' papers get +1
per_paper_targets = {p: base for p in eligible_files}
for p in eligible_files[:extra]:
    per_paper_targets[p] += 1

print(f"Eligible papers: {num_papers}")
print(f"Per-paper targets: base={base}, extra={extra}  → 53/54 mix")
print(f"k (triplets per chunk) = {N_TRIPLETS_PER_CHUNK}")
print(f"Output file → {triplets_path}")

# -----------------------
# READ CHUNKS & BUILD WORK
# -----------------------
def read_chunks(file_path: Path):
    rows = []
    with jsonlines.open(file_path, "r") as r:
        for obj in r:
            rows.append(obj)
    return rows

work = []  # list[(paper_path, chunk_obj)]
for cf in eligible_files:
    need_triplets = per_paper_targets[cf]                 # 53 or 54
    need_chunks   = math.ceil(need_triplets / N_TRIPLETS_PER_CHUNK)
    rows = read_chunks(cf)
    chosen = rows if len(rows) <= need_chunks else random.sample(rows, need_chunks)
    for obj in chosen:
        work.append((cf, obj))

# cap by theoretical max chunk need
max_chunks_needed = math.ceil(REMAINING / N_TRIPLETS_PER_CHUNK)
if len(work) > max_chunks_needed:
    work = work[:max_chunks_needed]

print(f"Planned chunks to process: {len(work)}  (cap: {max_chunks_needed})")

# -----------------------
# MODEL CALL (with retry)
# -----------------------
@retry(stop=stop_after_attempt(4), wait=wait_exponential_jitter(initial=1, max=8))
def call_model(chunk_text: str, k: int):
    user_prompt = f"CHUNK:\n\n{chunk_text}\n\n---\nPlease output exactly a JSON array of {k} triplets."
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user",   "content": GEN_PROMPT.format(k=k)},
        {"role": "user",   "content": user_prompt},
    ]
    # vLLM supports Chat Completions compat
    resp = client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        temperature=0.2,          # adjust if you need more variety
        max_tokens=800,           # guardrail for long outputs
    )
    text = resp.choices[0].message.content
    try:
        data = json.loads(text)
        return data if isinstance(data, list) else []
    except Exception:
        # Be permissive: if malformed, return empty instead of failing the batch
        return []

def make_records(meta_chunk_obj, triplets_list):
    out = []
    for t in triplets_list:
        rec = {
            "question": (t.get("question") or "").strip(),
            "input":     t.get("input", ""),
            "response": (t.get("response") or "").strip(),
            "meta": meta_chunk_obj,  # carries source, chunk_id, tokens, etc.
        }
        if rec["question"] and rec["response"]:
            out.append(rec)
    return out

# -----------------------
# PARALLEL EXECUTION (single writer)
# -----------------------
written = 0
per_source_written = collections.Counter()

with ThreadPoolExecutor(max_workers=MAX_WORKERS) as ex, jsonlines.open(triplets_path, "a") as w:
    futures = []
    for (src_path, chunk_obj) in work:
        fut = ex.submit(call_model, chunk_obj["text"], N_TRIPLETS_PER_CHUNK)
        fut.meta_chunk = chunk_obj
        futures.append(fut)

    for fut in as_completed(futures):
        triplets = fut.result()
        recs = make_records(fut.meta_chunk, triplets)
        for rec in recs:
            if written >= REMAINING:
                break
            w.write(rec)  # single writer (main thread) → no file corruption
            written += 1
            per_source_written[rec["meta"]["source"]] += 1
        if written >= REMAINING:
            break

print(f"[DONE] Wrote {written} triplets → {triplets_path}")
print("Per-paper tally (top 10):")
for k, v in per_source_written.most_common(10):
    print(f"  {k}: {v}")
print("Total papers written:", len(per_source_written))


Eligible papers: 34
Per-paper targets: base=56, extra=23  → 53/54 mix
k (triplets per chunk) = 1
Output file → outputs\triplets_run_llama70b_20251028_000630.jsonl
Planned chunks to process: 1428  (cap: 1927)


RetryError: RetryError[<Future at 0x2301a0b5360 state=finished raised APIConnectionError>]