<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [1]:
import argparse, time, re, sys, torch
from transformers import AutoTokenizer, AutoModelForCausalLM
# [ADD] Py3.8/3.9 typing support
from typing import Optional
# [ADD] Read prompt from a file path
from pathlib import Path

PRESETS = {
    "gpt2": "openai-community/gpt2",
    "qwen3-0.6b": "Qwen/Qwen3-0.6B",
    "nemotron-1.5b": "nvidia/Nemotron-Research-Reasoning-Qwen-1.5B",
}

DTYPE_MAP = {
    "auto": "auto",
    "float32": torch.float32,
    "float16": torch.float16,
    "bfloat16": torch.bfloat16,
}

# [ADD] Public API: загрузка модели и один запуск генерации из Python
def load_model(model: str = "gpt2", device: Optional[str] = None, dtype: str = "auto", local_files_only: bool = False):
    """[API] Загрузить токенайзер и модель. Возвращает (tokenizer, model, device).
    Пример:
        tok, mdl, dev = load_model("gpt2", device="cpu")
    """
    model_id = PRESETS.get(model.lower(), model)
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    if device == "cuda" and not torch.cuda.is_available():
        device = "cpu"
    torch_dtype = DTYPE_MAP[dtype]

    tokenizer = AutoTokenizer.from_pretrained(model_id, local_files_only=local_files_only)
    if tokenizer.pad_token_id is None and tokenizer.eos_token_id is not None:
        tokenizer.pad_token_id = tokenizer.eos_token_id

    model_obj = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch_dtype,
        low_cpu_mem_usage=True,
        local_files_only=local_files_only,
    ).eval().to(device)

    return tokenizer, model_obj, device

# [ADD] [API] Один запуск генерации поверх уже загруженных токенайзера/модели
def generate_once(
    tokenizer,
    model,
    *,
    prompt: Optional[str] = None,
    prompt_file: Optional[str] = None,
    system: Optional[str] = None,
    thinking: bool = False,
    strip_think: bool = False,
    max_new_tokens: int = 64,
    do_sample: bool = False,
    temperature: float = 0.0,
    top_p: float = 1.0,
    device: Optional[str] = None,
    warmup: bool = True,
):
    """[API] Сгенерировать текст. Возвращает (text, stats_dict).
    Пример:
        text, stats = generate_once(tok, mdl, prompt="Hello", max_new_tokens=16)
    """
    if prompt is None and prompt_file:
        try:
            prompt = Path(prompt_file).read_text(encoding="utf-8").strip()
        except Exception as e:
            raise RuntimeError(f"Failed to read prompt_file '{prompt_file}': {e}")
    if prompt is None:
        prompt = ""

    # Подготовка входа
    inputs = build_inputs(tokenizer, prompt, system=system, enable_thinking=thinking, device=device or ("cuda" if torch.cuda.is_available() else "cpu"))

    gen_kwargs = dict(
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        temperature=temperature if do_sample else None,
        top_p=top_p if do_sample else None,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        use_cache=True,
    )
    gen_kwargs = {k: v for k, v in gen_kwargs.items() if v is not None}

    # Прогрев (минимальный)
    if warmup:
        with torch.inference_mode():
            _ = model.generate(**{k: v.clone() for k, v in inputs.items()}, max_new_tokens=1)

    if (device or (torch.cuda.is_available() and model.device.type == "cuda")) and model.device.type == "cuda":
        torch.cuda.synchronize()
    t0 = time.perf_counter()
    with torch.inference_mode():
        out = model.generate(**inputs, **gen_kwargs)
    if model.device.type == "cuda":
        torch.cuda.synchronize()
    dt = time.perf_counter() - t0

    new_tokens = int(out.shape[-1] - inputs["input_ids"].shape[-1])
    text = tokenizer.decode(out[0], skip_special_tokens=True)
    if strip_think:
        text = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip()

    stats = {
        "device": str(model.device),
        "elapsed_s": dt,
        "new_tokens": new_tokens,
        "tokens_per_s": (new_tokens / dt) if dt > 0 else float("inf"),
    }
    return text, stats

# [ADD] [API] Удобная обёртка: загрузка + генерация за один вызов
def generate_text(
    *,
    model: str = "gpt2",
    device: Optional[str] = None,
    dtype: str = "auto",
    local_files_only: bool = False,
    prompt: Optional[str] = None,
    prompt_file: Optional[str] = None,
    system: Optional[str] = None,
    thinking: bool = False,
    strip_think: bool = False,
    max_new_tokens: int = 64,
    do_sample: bool = False,
    temperature: float = 0.0,
    top_p: float = 1.0,
):
    tok, mdl, dev = load_model(model=model, device=device, dtype=dtype, local_files_only=local_files_only)
    text, stats = generate_once(
        tok,
        mdl,
        prompt=prompt,
        prompt_file=prompt_file,
        system=system,
        thinking=thinking,
        strip_think=strip_think,
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        temperature=temperature,
        top_p=top_p,
        device=dev,
    )
    return text, stats

# [ADD] Helpers to make the script robust in notebooks / Jupyter where argv contains ipykernel args to make the script robust in notebooks / Jupyter where argv contains ipykernel args
def _in_notebook() -> bool:
    try:
        from IPython import get_ipython  # type: ignore
        return get_ipython() is not None
    except Exception:
        return False

# [ADD] Safe prompt reader that won't block in notebooks when stdin is not provided
# [MOD] Py3.8/3.9 compatible typing (PEP 604 not available); use Optional[str]
def _read_prompt_or_default(arg_prompt: Optional[str]) -> str:
    if arg_prompt is not None:
        return arg_prompt
    # Try to read from stdin only if data is available; otherwise fall back to empty prompt
    try:
        if sys.stdin and not sys.stdin.isatty():
            data = sys.stdin.read()
            if data:
                return data
    except Exception:
        pass
    # Fallback: empty prompt (safe for generation) with a short notice printed by caller
    return ""

def build_inputs(tokenizer, prompt, system=None, enable_thinking=None, device="cpu"):
    """Return input_ids tensor on the target device, using chat template if present."""
    use_chat = getattr(tokenizer, "chat_template", None) not in (None, "", False)
    if use_chat:
        messages = []
        if system:
            messages.append({"role": "system", "content": system})
        messages.append({"role": "user", "content": prompt})
        # enable_thinking is used by Qwen3; harmless for tokenizers that ignore extra vars
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=enable_thinking if enable_thinking is not None else False,
        )
        model_inputs = tokenizer([text], return_tensors="pt")
    else:
        model_inputs = tokenizer([prompt], return_tensors="pt")
        # [ADD] Ensure non-empty input for decoder-only models when prompt is empty
        if model_inputs["input_ids"].shape[1] == 0:
            fallback_id = tokenizer.eos_token_id
            if fallback_id is None:
                fallback_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0
            model_inputs = {
                "input_ids": torch.tensor([[fallback_id]], dtype=torch.long),
                "attention_mask": torch.tensor([[1]], dtype=torch.long),
            }

    return {k: v.to(device) for k, v in model_inputs.items()}

def main():
    p = argparse.ArgumentParser(description="Simple local HF generation harness")
    # [MOD] Make --model optional with a sensible default so the script runs inside notebooks
    p.add_argument(
        "--model",
        default=(PRESETS.get("gpt2") or "openai-community/gpt2"),
        help="preset key (gpt2|qwen3-0.6b|nemotron-1.5b) or any HF repo id; default=gpt2",
    )
    p.add_argument("--device", choices=["cpu", "cuda"], default="cuda" if torch.cuda.is_available() else "cpu")
    p.add_argument("--dtype", choices=list(DTYPE_MAP.keys()), default="auto",
                   help="torch dtype for model weights")
    p.add_argument("--max-new-tokens", type=int, default=64)
    p.add_argument("--temperature", type=float, default=0.0, help="0.0 => greedy")
    p.add_argument("--top-p", type=float, default=1.0)
    p.add_argument("--do-sample", action="store_true", help="enable sampling (else greedy)")
    p.add_argument("--thinking", action="store_true",
                   help="For Qwen3: enable thinking mode (adds <think>...</think> content)")
    p.add_argument("--strip-think", action="store_true",
                   help="Strip <think>...</think> block from decoded output (if present)")
    p.add_argument("--system", default=None, help="Optional system prompt for chat models")
    p.add_argument("--prompt", default=None, help="Prompt; if omitted, read from stdin; in notebooks defaults to empty string")
    # [ADD] Read prompt from file
    p.add_argument("--prompt-file", default=None, help="Path to a text file with the prompt (UTF-8)")
    p.add_argument("--print-tokens", action="store_true", help="Also print token counts and toks/sec")
    # [MOD] In notebooks, ignore unrelated ipykernel args by parsing only known flags
    args = p.parse_args([]) if _in_notebook() else p.parse_args()

    model_id = PRESETS.get(args.model.lower(), args.model)
    device = args.device
    if device == "cuda" and not torch.cuda.is_available():
        print("CUDA not available, falling back to CPU", file=sys.stderr)
        device = "cpu"

    torch_dtype = DTYPE_MAP[args.dtype]

    # Tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    # Ensure pad_token_id exists for generation
    if tokenizer.pad_token_id is None and tokenizer.eos_token_id is not None:
        tokenizer.pad_token_id = tokenizer.eos_token_id

    # Model
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch_dtype,
        low_cpu_mem_usage=True,
    )
    model.eval().to(device)

    # [MOD] Prompt resolution priority: --prompt-file > --prompt > stdin > PROMPT.TXT > empty
    prompt = None
    if args.prompt_file:
        try:
            prompt = Path(args.prompt_file).read_text(encoding="utf-8").strip()
            print(f"[info] Loaded prompt from file: {args.prompt_file}")
        except Exception as e:
            print(f"[warn] Failed to read --prompt-file '{args.prompt_file}': {e}", file=sys.stderr)
    if prompt is None and args.prompt is not None:
        prompt = args.prompt
    if prompt is None:
        # try stdin (non-blocking path)
        data = _read_prompt_or_default(None)
        if data:
            prompt = data
    if prompt is None and Path("PROMPT.TXT").exists():
        try:
            prompt = Path("PROMPT.TXT").read_text(encoding="utf-8").strip()
            print("[info] Using PROMPT.TXT from current directory.")
        except Exception as e:
            print(f"[warn] Failed to read PROMPT.TXT: {e}", file=sys.stderr)
    if prompt is None:
        prompt = ""
        if _in_notebook():
            print("[info] No --prompt/--prompt-file/stdin and PROMPT.TXT not found; using empty prompt.")

    inputs = build_inputs(
        tokenizer,
        prompt,
        system=args.system,
        enable_thinking=args.thinking,
        device=device
    )

    gen_kwargs = dict(
        max_new_tokens=args.max_new_tokens,
        do_sample=args.do_sample,
        temperature=args.temperature if args.do_sample else None,
        top_p=args.top_p if args.do_sample else None,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        use_cache=True,
    )
    # Remove None entries (generate() complains)
    gen_kwargs = {k: v for k, v in gen_kwargs.items() if v is not None}

    # Warmup (tiny, optional)
    with torch.inference_mode():
        _ = model.generate(**{k: v.clone() for k, v in inputs.items()}, max_new_tokens=1)

    # Timed run
    if device == "cuda":
        torch.cuda.synchronize()
    t0 = time.perf_counter()
    with torch.inference_mode():
        out = model.generate(**inputs, **gen_kwargs)
    if device == "cuda":
        torch.cuda.synchronize()
    dt = time.perf_counter() - t0

    # Separate new tokens from the continuation
    new_tokens = out.shape[-1] - inputs["input_ids"].shape[-1]
    text = tokenizer.decode(out[0], skip_special_tokens=True)

    if args.strip_think:
        text = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip()

    print(text)

    if args.print_tokens:
        toks_per_s = new_tokens / dt if dt > 0 else float("inf")
        print("\n--- stats ---")
        print(f"device: {device}")
        if device == "cuda":
            try:
                print(f"gpu: {torch.cuda.get_device_name()}")
            except Exception:
                pass
        print(f"elapsed_s: {dt:.3f}")
        print(f"new_tokens: {new_tokens}")
        print(f"tokens_per_s: {toks_per_s:.2f}")

if __name__ == "__main__":
    main()


  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


[info] Using PROMPT.TXT from current directory.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


For overdetermined reasons, I’ve lately found the world an increasingly terrifying and depressing place. It’s gotten harder and harder to concentrate on research, or even popular science writing. Every so often, though, something breaks through that wakes my inner child, reminds me of why I fell in love with research thirty years ago, and helps me forget about the triumphantly strutting factions working to destroy everything I value.

I've been a researcher for twenty years, and I've never been a scientist. I've never been a scientist. I've never been a scientist. I've never been a scientist. I've never been a scientist. I've never been a scientist. I've never been a scientist. I've never


In [3]:
import argparse, time, re, sys, torch
from transformers import AutoTokenizer, AutoModelForCausalLM
# [ADD] Py3.8/3.9 typing support
from typing import Optional
# [ADD] Read prompt from a file path
from pathlib import Path

PRESETS = {
    "gpt2": "openai-community/gpt2",
    "qwen3": "Qwen/Qwen3-0.6B",  # [ADD] alias for convenience
    "qwen3-0.6b": "Qwen/Qwen3-0.6B",
    "nemotron-1.5b": "nvidia/Nemotron-Research-Reasoning-Qwen-1.5B",
}

DTYPE_MAP = {
    "auto": "auto",
    "float32": torch.float32,
    "float16": torch.float16,
    "bfloat16": torch.bfloat16,
}

# [ADD] Public API: загрузка модели и один запуск генерации из Python
def load_model(model: str = "gpt2", device: Optional[str] = None, dtype: str = "auto", local_files_only: bool = False, trust_remote_code: bool = False):  # [MOD] added trust_remote_code
    """[API] Загрузить токенайзер и модель. Возвращает (tokenizer, model, device).
    Пример:
        tok, mdl, dev = load_model("gpt2", device="cpu")
    """
    model_id = PRESETS.get(model.lower(), model)
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    if device == "cuda" and not torch.cuda.is_available():
        device = "cpu"
    torch_dtype = DTYPE_MAP[dtype]

    tokenizer = AutoTokenizer.from_pretrained(model_id, local_files_only=local_files_only, trust_remote_code=trust_remote_code)  # [MOD] pass trust_remote_code
    if tokenizer.pad_token_id is None and tokenizer.eos_token_id is not None:
        tokenizer.pad_token_id = tokenizer.eos_token_id

    model_obj = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch_dtype,
        low_cpu_mem_usage=True,
        local_files_only=local_files_only,
        trust_remote_code=trust_remote_code,  # [MOD]
    ).eval().to(device)

    return tokenizer, model_obj, device

# [ADD] [API] Один запуск генерации поверх уже загруженных токенайзера/модели
def generate_once(
    tokenizer,
    model,
    *,
    prompt: Optional[str] = None,
    prompt_file: Optional[str] = None,
    system: Optional[str] = None,
    thinking: bool = False,
    strip_think: bool = False,
    max_new_tokens: int = 64,
    do_sample: bool = False,
    temperature: float = 0.0,
    top_p: float = 1.0,
    device: Optional[str] = None,
    warmup: bool = True,
):
    """[API] Сгенерировать текст. Возвращает (text, stats_dict).
    Пример:
        text, stats = generate_once(tok, mdl, prompt="Hello", max_new_tokens=16)
    """
    if prompt is None and prompt_file:
        try:
            prompt = Path(prompt_file).read_text(encoding="utf-8").strip()
        except Exception as e:
            raise RuntimeError(f"Failed to read prompt_file '{prompt_file}': {e}")
    if prompt is None:
        prompt = ""

    # Подготовка входа
    inputs = build_inputs(tokenizer, prompt, system=system, enable_thinking=thinking, device=device or ("cuda" if torch.cuda.is_available() else "cpu"))

    gen_kwargs = dict(
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        temperature=temperature if do_sample else None,
        top_p=top_p if do_sample else None,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        use_cache=True,
    )
    gen_kwargs = {k: v for k, v in gen_kwargs.items() if v is not None}

    # Прогрев (минимальный)
    if warmup:
        with torch.inference_mode():
            _ = model.generate(**{k: v.clone() for k, v in inputs.items()}, max_new_tokens=1)

    if (device or (torch.cuda.is_available() and model.device.type == "cuda")) and model.device.type == "cuda":
        torch.cuda.synchronize()
    t0 = time.perf_counter()
    with torch.inference_mode():
        out = model.generate(**inputs, **gen_kwargs)
    if model.device.type == "cuda":
        torch.cuda.synchronize()
    dt = time.perf_counter() - t0

    new_tokens = int(out.shape[-1] - inputs["input_ids"].shape[-1])
    text = tokenizer.decode(out[0], skip_special_tokens=True)
    if strip_think:
        text = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip()

    stats = {
        "device": str(model.device),
        "elapsed_s": dt,
        "new_tokens": new_tokens,
        "tokens_per_s": (new_tokens / dt) if dt > 0 else float("inf"),
    }
    return text, stats

# [ADD] [API] Удобная обёртка: загрузка + генерация за один вызов
def generate_text(
    *,
    model: str = "gpt2",
    device: Optional[str] = None,
    dtype: str = "auto",
    local_files_only: bool = False,
    prompt: Optional[str] = None,
    prompt_file: Optional[str] = None,
    system: Optional[str] = None,
    thinking: bool = False,
    strip_think: bool = False,
    max_new_tokens: int = 64,
    do_sample: bool = False,
    temperature: float = 0.0,
    top_p: float = 1.0,
):
    tok, mdl, dev = load_model(model=model, device=device, dtype=dtype, local_files_only=local_files_only)
    text, stats = generate_once(
        tok,
        mdl,
        prompt=prompt,
        prompt_file=prompt_file,
        system=system,
        thinking=thinking,
        strip_think=strip_think,
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        temperature=temperature,
        top_p=top_p,
        device=dev,
    )
    return text, stats

# [ADD] Helpers to make the script robust in notebooks / Jupyter where argv contains ipykernel args to make the script robust in notebooks / Jupyter where argv contains ipykernel args
def _in_notebook() -> bool:
    try:
        from IPython import get_ipython  # type: ignore
        return get_ipython() is not None
    except Exception:
        return False

# [ADD] Safe prompt reader that won't block in notebooks when stdin is not provided
# [MOD] Py3.8/3.9 compatible typing (PEP 604 not available); use Optional[str]
def _read_prompt_or_default(arg_prompt: Optional[str]) -> str:
    if arg_prompt is not None:
        return arg_prompt
    # Try to read from stdin only if data is available; otherwise fall back to empty prompt
    try:
        if sys.stdin and not sys.stdin.isatty():
            data = sys.stdin.read()
            if data:
                return data
    except Exception:
        pass
    # Fallback: empty prompt (safe for generation) with a short notice printed by caller
    return ""

def build_inputs(tokenizer, prompt, system=None, enable_thinking=None, device="cpu"):
    """Return input_ids tensor on the target device, using chat template if present."""
    use_chat = getattr(tokenizer, "chat_template", None) not in (None, "", False)
    if use_chat:
        messages = []
        if system:
            messages.append({"role": "system", "content": system})
        messages.append({"role": "user", "content": prompt})
        # enable_thinking is used by Qwen3; harmless for tokenizers that ignore extra vars
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=enable_thinking if enable_thinking is not None else False,
        )
        model_inputs = tokenizer([text], return_tensors="pt")
    else:
        model_inputs = tokenizer([prompt], return_tensors="pt")
        # [ADD] Ensure non-empty input for decoder-only models when prompt is empty
        if model_inputs["input_ids"].shape[1] == 0:
            fallback_id = tokenizer.eos_token_id
            if fallback_id is None:
                fallback_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0
            model_inputs = {
                "input_ids": torch.tensor([[fallback_id]], dtype=torch.long),
                "attention_mask": torch.tensor([[1]], dtype=torch.long),
            }

    return {k: v.to(device) for k, v in model_inputs.items()}

def main():
    p = argparse.ArgumentParser(description="Simple local HF generation harness")
    # [MOD] Make --model optional with a sensible default so the script runs inside notebooks
    p.add_argument(
        "--model",
        default=(PRESETS.get("gpt2") or "openai-community/gpt2"),
        help="preset key (gpt2|qwen3-0.6b|nemotron-1.5b) or any HF repo id; default=gpt2",
    )
    p.add_argument("--device", choices=["cpu", "cuda"], default="cuda" if torch.cuda.is_available() else "cpu")
    p.add_argument("--dtype", choices=list(DTYPE_MAP.keys()), default="auto",
                   help="torch dtype for model weights")
    p.add_argument("--max-new-tokens", type=int, default=64)
    p.add_argument("--temperature", type=float, default=0.0, help="0.0 => greedy")
    p.add_argument("--top-p", type=float, default=1.0)
    p.add_argument("--do-sample", action="store_true", help="enable sampling (else greedy)")
    p.add_argument("--thinking", action="store_true",
                   help="For Qwen3: enable thinking mode (adds <think>...</think> content)")
    p.add_argument("--strip-think", action="store_true",
                   help="Strip <think>...</think> block from decoded output (if present)")
    p.add_argument("--system", default=None, help="Optional system prompt for chat models")
    p.add_argument("--prompt", default=None, help="Prompt; if omitted, read from stdin; in notebooks defaults to empty string")
    # [ADD] Read prompt from file
    p.add_argument("--prompt-file", default=None, help="Path to a text file with the prompt (UTF-8)")
    p.add_argument("--print-tokens", action="store_true", help="Also print token counts and toks/sec")
    # [ADD] allow remote code (needed by some repos)
    p.add_argument("--trust-remote-code", action="store_true", help="Allow custom modeling code from repo (use only for trusted repos)")
    # [MOD] In notebooks, ignore unrelated ipykernel args by parsing only known flags
    args = p.parse_args([]) if _in_notebook() else p.parse_args()

    model_id = PRESETS.get(args.model.lower(), args.model)
    device = args.device
    if device == "cuda" and not torch.cuda.is_available():
        print("CUDA not available, falling back to CPU", file=sys.stderr)
        device = "cpu"

    torch_dtype = DTYPE_MAP[args.dtype]

    # Tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=args.trust_remote_code)  # [MOD]
    # Ensure pad_token_id exists for generation
    if tokenizer.pad_token_id is None and tokenizer.eos_token_id is not None:
        tokenizer.pad_token_id = tokenizer.eos_token_id

    # Model
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch_dtype,
        low_cpu_mem_usage=True,
        trust_remote_code=args.trust_remote_code,  # [MOD]
    )
    model.eval().to(device)

    # [MOD] Prompt resolution priority: --prompt-file > --prompt > stdin > PROMPT.TXT > empty
    prompt = None
    if args.prompt_file:
        try:
            prompt = Path(args.prompt_file).read_text(encoding="utf-8").strip()
            print(f"[info] Loaded prompt from file: {args.prompt_file}")
        except Exception as e:
            print(f"[warn] Failed to read --prompt-file '{args.prompt_file}': {e}", file=sys.stderr)
    if prompt is None and args.prompt is not None:
        prompt = args.prompt
    if prompt is None:
        # try stdin (non-blocking path)
        data = _read_prompt_or_default(None)
        if data:
            prompt = data
    if prompt is None and Path("PROMPT.TXT").exists():
        try:
            prompt = Path("PROMPT.TXT").read_text(encoding="utf-8").strip()
            print("[info] Using PROMPT.TXT from current directory.")
        except Exception as e:
            print(f"[warn] Failed to read PROMPT.TXT: {e}", file=sys.stderr)
    if prompt is None:
        prompt = ""
        if _in_notebook():
            print("[info] No --prompt/--prompt-file/stdin and PROMPT.TXT not found; using empty prompt.")

    inputs = build_inputs(
        tokenizer,
        prompt,
        system=args.system,
        enable_thinking=args.thinking,
        device=device
    )

    gen_kwargs = dict(
        max_new_tokens=args.max_new_tokens,
        do_sample=args.do_sample,
        temperature=args.temperature if args.do_sample else None,
        top_p=args.top_p if args.do_sample else None,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        use_cache=True,
    )
    # Remove None entries (generate() complains)
    gen_kwargs = {k: v for k, v in gen_kwargs.items() if v is not None}

    # Warmup (tiny, optional)
    with torch.inference_mode():
        _ = model.generate(**{k: v.clone() for k, v in inputs.items()}, max_new_tokens=1)

    # Timed run
    if device == "cuda":
        torch.cuda.synchronize()
    t0 = time.perf_counter()
    with torch.inference_mode():
        out = model.generate(**inputs, **gen_kwargs)
    if device == "cuda":
        torch.cuda.synchronize()
    dt = time.perf_counter() - t0

    # Separate new tokens from the continuation
    new_tokens = out.shape[-1] - inputs["input_ids"].shape[-1]
    text = tokenizer.decode(out[0], skip_special_tokens=True)

    if args.strip_think:
        text = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip()

    print(text)

    if args.print_tokens:
        toks_per_s = new_tokens / dt if dt > 0 else float("inf")
        print("\n--- stats ---")
        print(f"device: {device}")
        if device == "cuda":
            try:
                print(f"gpu: {torch.cuda.get_device_name()}")
            except Exception:
                pass
        print(f"elapsed_s: {dt:.3f}")
        print(f"new_tokens: {new_tokens}")
        print(f"tokens_per_s: {toks_per_s:.2f}")

if __name__ == "__main__":
    main()


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[info] Using PROMPT.TXT from current directory.
For overdetermined reasons, I’ve lately found the world an increasingly terrifying and depressing place. It’s gotten harder and harder to concentrate on research, or even popular science writing. Every so often, though, something breaks through that wakes my inner child, reminds me of why I fell in love with research thirty years ago, and helps me forget about the triumphantly strutting factions working to destroy everything I value.

I've been a researcher for twenty years, and I've never been a scientist. I've never been a scientist. I've never been a scientist. I've never been a scientist. I've never been a scientist. I've never been a scientist. I've never been a scientist. I've never


In [4]:
import argparse, time, re, sys, torch
from transformers import AutoTokenizer, AutoModelForCausalLM
# [ADD] Py3.8/3.9 typing support
from typing import Optional
# [ADD] Read prompt from a file path
from pathlib import Path

PRESETS = {
    "gpt2": "openai-community/gpt2",
    "qwen3": "Qwen/Qwen3-0.6B",  # [ADD] alias for convenience
    "qwen3-0.6b": "Qwen/Qwen3-0.6B",
    "nemotron-1.5b": "nvidia/Nemotron-Research-Reasoning-Qwen-1.5B",
}

DTYPE_MAP = {
    "auto": "auto",
    "float32": torch.float32,
    "float16": torch.float16,
    "bfloat16": torch.bfloat16,
}

# [ADD] Public API: загрузка модели и один запуск генерации из Python
def load_model(model: str = "qwen3", device: Optional[str] = None, dtype: str = "auto", local_files_only: bool = False, trust_remote_code: Optional[bool] = None):  # [MOD] default model qwen3; infer trust by model if None  # [MOD] added trust_remote_code
    """[API] Загрузить токенайзер и модель. Возвращает (tokenizer, model, device).
    Пример:
        tok, mdl, dev = load_model("gpt2", device="cpu")
    """
    model_id = PRESETS.get(model.lower(), model)
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    if device == "cuda" and not torch.cuda.is_available():
        device = "cpu"
    torch_dtype = DTYPE_MAP[dtype]

    # [ADD] If trust_remote_code is not specified, auto-enable for Qwen models
    if trust_remote_code is None:
        try:
            if (model.lower().startswith("qwen") or "Qwen/" in model_id):
                trust_remote_code = True
        except Exception:
            trust_remote_code = False

    tokenizer = AutoTokenizer.from_pretrained(model_id, local_files_only=local_files_only, trust_remote_code=trust_remote_code)  # [MOD] pass trust_remote_code
    if tokenizer.pad_token_id is None and tokenizer.eos_token_id is not None:
        tokenizer.pad_token_id = tokenizer.eos_token_id

    model_obj = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch_dtype,
        low_cpu_mem_usage=True,
        local_files_only=local_files_only,
        trust_remote_code=trust_remote_code,  # [MOD]
    ).eval().to(device)

    return tokenizer, model_obj, device

# [ADD] [API] Один запуск генерации поверх уже загруженных токенайзера/модели
def generate_once(
    tokenizer,
    model,
    *,
    prompt: Optional[str] = None,
    prompt_file: Optional[str] = None,
    system: Optional[str] = None,
    thinking: bool = False,
    strip_think: bool = False,
    max_new_tokens: int = 64,
    do_sample: bool = False,
    temperature: float = 0.0,
    top_p: float = 1.0,
    device: Optional[str] = None,
    warmup: bool = True,
):
    """[API] Сгенерировать текст. Возвращает (text, stats_dict).
    Пример:
        text, stats = generate_once(tok, mdl, prompt="Hello", max_new_tokens=16)
    """
    if prompt is None and prompt_file:
        try:
            prompt = Path(prompt_file).read_text(encoding="utf-8").strip()
        except Exception as e:
            raise RuntimeError(f"Failed to read prompt_file '{prompt_file}': {e}")
    if prompt is None:
        prompt = ""

    # Подготовка входа
    inputs = build_inputs(tokenizer, prompt, system=system, enable_thinking=thinking, device=device or ("cuda" if torch.cuda.is_available() else "cpu"))

    gen_kwargs = dict(
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        temperature=temperature if do_sample else None,
        top_p=top_p if do_sample else None,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        use_cache=True,
    )
    gen_kwargs = {k: v for k, v in gen_kwargs.items() if v is not None}

    # Прогрев (минимальный)
    if warmup:
        with torch.inference_mode():
            _ = model.generate(**{k: v.clone() for k, v in inputs.items()}, max_new_tokens=1)

    if (device or (torch.cuda.is_available() and model.device.type == "cuda")) and model.device.type == "cuda":
        torch.cuda.synchronize()
    t0 = time.perf_counter()
    with torch.inference_mode():
        out = model.generate(**inputs, **gen_kwargs)
    if model.device.type == "cuda":
        torch.cuda.synchronize()
    dt = time.perf_counter() - t0

    new_tokens = int(out.shape[-1] - inputs["input_ids"].shape[-1])
    text = tokenizer.decode(out[0], skip_special_tokens=True)
    if strip_think:
        text = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip()

    stats = {
        "device": str(model.device),
        "elapsed_s": dt,
        "new_tokens": new_tokens,
        "tokens_per_s": (new_tokens / dt) if dt > 0 else float("inf"),
    }
    return text, stats

# [ADD] [API] Удобная обёртка: загрузка + генерация за один вызов
def generate_text(
    *,
    model: str = "qwen3",
    device: Optional[str] = None,
    dtype: str = "auto",
    local_files_only: bool = False,
    prompt: Optional[str] = None,
    prompt_file: Optional[str] = None,
    system: Optional[str] = None,
    thinking: bool = False,
    strip_think: bool = False,
    max_new_tokens: int = 64,
    do_sample: bool = False,
    temperature: float = 0.0,
    top_p: float = 1.0,
    trust_remote_code: Optional[bool] = None,
):
    tok, mdl, dev = load_model(model=model, device=device, dtype=dtype, local_files_only=local_files_only, trust_remote_code=trust_remote_code)
    text, stats = generate_once(
        tok,
        mdl,
        prompt=prompt,
        prompt_file=prompt_file,
        system=system,
        thinking=thinking,
        strip_think=strip_think,
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        temperature=temperature,
        top_p=top_p,
        device=dev,
    )
    return text, stats

# [ADD] Helpers to make the script robust in notebooks / Jupyter where argv contains ipykernel args to make the script robust in notebooks / Jupyter where argv contains ipykernel args
def _in_notebook() -> bool:
    try:
        from IPython import get_ipython  # type: ignore
        return get_ipython() is not None
    except Exception:
        return False

# [ADD] Safe prompt reader that won't block in notebooks when stdin is not provided
# [MOD] Py3.8/3.9 compatible typing (PEP 604 not available); use Optional[str]
def _read_prompt_or_default(arg_prompt: Optional[str]) -> str:
    if arg_prompt is not None:
        return arg_prompt
    # Try to read from stdin only if data is available; otherwise fall back to empty prompt
    try:
        if sys.stdin and not sys.stdin.isatty():
            data = sys.stdin.read()
            if data:
                return data
    except Exception:
        pass
    # Fallback: empty prompt (safe for generation) with a short notice printed by caller
    return ""

def build_inputs(tokenizer, prompt, system=None, enable_thinking=None, device="cpu"):
    """Return input_ids tensor on the target device, using chat template if present."""
    use_chat = getattr(tokenizer, "chat_template", None) not in (None, "", False)
    if use_chat:
        messages = []
        if system:
            messages.append({"role": "system", "content": system})
        messages.append({"role": "user", "content": prompt})
        # enable_thinking is used by Qwen3; harmless for tokenizers that ignore extra vars
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=enable_thinking if enable_thinking is not None else False,
        )
        model_inputs = tokenizer([text], return_tensors="pt")
    else:
        model_inputs = tokenizer([prompt], return_tensors="pt")
        # [ADD] Ensure non-empty input for decoder-only models when prompt is empty
        if model_inputs["input_ids"].shape[1] == 0:
            fallback_id = tokenizer.eos_token_id
            if fallback_id is None:
                fallback_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0
            model_inputs = {
                "input_ids": torch.tensor([[fallback_id]], dtype=torch.long),
                "attention_mask": torch.tensor([[1]], dtype=torch.long),
            }

    return {k: v.to(device) for k, v in model_inputs.items()}

def main():
    p = argparse.ArgumentParser(description="Simple local HF generation harness")
    # [MOD] Make --model optional with a sensible default so the script runs inside notebooks
    p.add_argument(
        "--model",
        default=(PRESETS.get("qwen3") or "Qwen/Qwen3-0.6B"),
        help="preset key (gpt2|qwen3-0.6b|nemotron-1.5b) or any HF repo id; default=qwen3",
    )
    p.add_argument("--device", choices=["cpu", "cuda"], default="cuda" if torch.cuda.is_available() else "cpu")
    p.add_argument("--dtype", choices=list(DTYPE_MAP.keys()), default="auto",
                   help="torch dtype for model weights")
    p.add_argument("--max-new-tokens", type=int, default=64)
    p.add_argument("--temperature", type=float, default=0.0, help="0.0 => greedy")
    p.add_argument("--top-p", type=float, default=1.0)
    p.add_argument("--do-sample", action="store_true", help="enable sampling (else greedy)")
    p.add_argument("--thinking", action="store_true",
                   help="For Qwen3: enable thinking mode (adds <think>...</think> content)")
    p.add_argument("--strip-think", action="store_true",
                   help="Strip <think>...</think> block from decoded output (if present)")
    p.add_argument("--system", default=None, help="Optional system prompt for chat models")
    p.add_argument("--prompt", default=None, help="Prompt; if omitted, read from stdin; in notebooks defaults to empty string")
    # [ADD] Read prompt from file
    p.add_argument("--prompt-file", default=None, help="Path to a text file with the prompt (UTF-8)")
    p.add_argument("--print-tokens", action="store_true", help="Also print token counts and toks/sec")
    # [ADD] allow remote code (needed by some repos)
    p.add_argument("--trust-remote-code", action="store_true", help="Allow custom modeling code from repo (use only for trusted repos)")
    # [MOD] In notebooks, ignore unrelated ipykernel args by parsing only known flags
    args = p.parse_args([]) if _in_notebook() else p.parse_args()

    model_id = PRESETS.get(args.model.lower(), args.model)
    device = args.device
    if device == "cuda" and not torch.cuda.is_available():
        print("CUDA not available, falling back to CPU", file=sys.stderr)
        device = "cpu"

    torch_dtype = DTYPE_MAP[args.dtype]

    # [ADD] Auto-enable trust for Qwen models if flag not set
    trust_flag = args.trust_remote_code or ("Qwen/" in model_id or args.model.lower().startswith("qwen"))
    if trust_flag and not args.trust_remote_code:
        print("[info] Auto-enabling trust_remote_code for Qwen model.", file=sys.stderr)

    # Tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=trust_flag)  # [MOD] use inferred flag
    # Ensure pad_token_id exists for generation
    if tokenizer.pad_token_id is None and tokenizer.eos_token_id is not None:
        tokenizer.pad_token_id = tokenizer.eos_token_id

    # Model
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch_dtype,
        low_cpu_mem_usage=True,
        trust_remote_code=trust_flag,  # [MOD] use inferred flag
    )
    model.eval().to(device)

    # [MOD] Prompt resolution priority: --prompt-file > --prompt > stdin > PROMPT.TXT > empty
    prompt = None
    if args.prompt_file:
        try:
            prompt = Path(args.prompt_file).read_text(encoding="utf-8").strip()
            print(f"[info] Loaded prompt from file: {args.prompt_file}")
        except Exception as e:
            print(f"[warn] Failed to read --prompt-file '{args.prompt_file}': {e}", file=sys.stderr)
    if prompt is None and args.prompt is not None:
        prompt = args.prompt
    if prompt is None:
        # try stdin (non-blocking path)
        data = _read_prompt_or_default(None)
        if data:
            prompt = data
    if prompt is None and Path("PROMPT.TXT").exists():
        try:
            prompt = Path("PROMPT.TXT").read_text(encoding="utf-8").strip()
            print("[info] Using PROMPT.TXT from current directory.")
        except Exception as e:
            print(f"[warn] Failed to read PROMPT.TXT: {e}", file=sys.stderr)
    if prompt is None:
        prompt = ""
        if _in_notebook():
            print("[info] No --prompt/--prompt-file/stdin and PROMPT.TXT not found; using empty prompt.")

    inputs = build_inputs(
        tokenizer,
        prompt,
        system=args.system,
        enable_thinking=args.thinking,
        device=device
    )

    gen_kwargs = dict(
        max_new_tokens=args.max_new_tokens,
        do_sample=args.do_sample,
        temperature=args.temperature if args.do_sample else None,
        top_p=args.top_p if args.do_sample else None,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        use_cache=True,
    )
    # Remove None entries (generate() complains)
    gen_kwargs = {k: v for k, v in gen_kwargs.items() if v is not None}

    # Warmup (tiny, optional)
    with torch.inference_mode():
        _ = model.generate(**{k: v.clone() for k, v in inputs.items()}, max_new_tokens=1)

    # Timed run
    if device == "cuda":
        torch.cuda.synchronize()
    t0 = time.perf_counter()
    with torch.inference_mode():
        out = model.generate(**inputs, **gen_kwargs)
    if device == "cuda":
        torch.cuda.synchronize()
    dt = time.perf_counter() - t0

    # Separate new tokens from the continuation
    new_tokens = out.shape[-1] - inputs["input_ids"].shape[-1]
    text = tokenizer.decode(out[0], skip_special_tokens=True)

    if args.strip_think:
        text = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip()

    print(text)

    if args.print_tokens:
        toks_per_s = new_tokens / dt if dt > 0 else float("inf")
        print("\n--- stats ---")
        print(f"device: {device}")
        if device == "cuda":
            try:
                print(f"gpu: {torch.cuda.get_device_name()}")
            except Exception:
                pass
        print(f"elapsed_s: {dt:.3f}")
        print(f"new_tokens: {new_tokens}")
        print(f"tokens_per_s: {toks_per_s:.2f}")

if __name__ == "__main__":
    main()


[info] Auto-enabling trust_remote_code for Qwen model.


generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


[info] Using PROMPT.TXT from current directory.


AttributeError: module 'jinja2' has no attribute 'pass_eval_context'

In [5]:
import argparse, time, re, sys, torch
from transformers import AutoTokenizer, AutoModelForCausalLM
# [ADD] Py3.8/3.9 typing support
from typing import Optional
# [ADD] Read prompt from a file path
from pathlib import Path

PRESETS = {
    "gpt2": "openai-community/gpt2",
    "qwen3": "Qwen/Qwen3-0.6B",  # [ADD] alias for convenience
    "qwen3-0.6b": "Qwen/Qwen3-0.6B",
    "nemotron-1.5b": "nvidia/Nemotron-Research-Reasoning-Qwen-1.5B",
}

DTYPE_MAP = {
    "auto": "auto",
    "float32": torch.float32,
    "float16": torch.float16,
    "bfloat16": torch.bfloat16,
}

# [ADD] Public API: загрузка модели и один запуск генерации из Python
def load_model(model: str = "qwen3", device: Optional[str] = None, dtype: str = "auto", local_files_only: bool = False, trust_remote_code: Optional[bool] = None):  # [MOD] default model qwen3; infer trust by model if None  # [MOD] added trust_remote_code
    """[API] Загрузить токенайзер и модель. Возвращает (tokenizer, model, device).
    Пример:
        tok, mdl, dev = load_model("gpt2", device="cpu")
    """
    model_id = PRESETS.get(model.lower(), model)
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    if device == "cuda" and not torch.cuda.is_available():
        device = "cpu"
    torch_dtype = DTYPE_MAP[dtype]

    # [ADD] If trust_remote_code is not specified, auto-enable for Qwen models
    if trust_remote_code is None:
        try:
            if (model.lower().startswith("qwen") or "Qwen/" in model_id):
                trust_remote_code = True
        except Exception:
            trust_remote_code = False

    tokenizer = AutoTokenizer.from_pretrained(model_id, local_files_only=local_files_only, trust_remote_code=trust_remote_code)  # [MOD] pass trust_remote_code
    if tokenizer.pad_token_id is None and tokenizer.eos_token_id is not None:
        tokenizer.pad_token_id = tokenizer.eos_token_id

    model_obj = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch_dtype,
        low_cpu_mem_usage=True,
        local_files_only=local_files_only,
        trust_remote_code=trust_remote_code,  # [MOD]
    ).eval().to(device)

    return tokenizer, model_obj, device

# [ADD] [API] Один запуск генерации поверх уже загруженных токенайзера/модели
def generate_once(
    tokenizer,
    model,
    *,
    prompt: Optional[str] = None,
    prompt_file: Optional[str] = None,
    system: Optional[str] = None,
    thinking: bool = False,
    strip_think: bool = False,
    max_new_tokens: int = 64,
    do_sample: bool = False,
    temperature: float = 0.0,
    top_p: float = 1.0,
    device: Optional[str] = None,
    warmup: bool = True,
):
    """[API] Сгенерировать текст. Возвращает (text, stats_dict).
    Пример:
        text, stats = generate_once(tok, mdl, prompt="Hello", max_new_tokens=16)
    """
    if prompt is None and prompt_file:
        try:
            prompt = Path(prompt_file).read_text(encoding="utf-8").strip()
        except Exception as e:
            raise RuntimeError(f"Failed to read prompt_file '{prompt_file}': {e}")
    if prompt is None:
        prompt = ""

    # Подготовка входа
    inputs = build_inputs(tokenizer, prompt, system=system, enable_thinking=thinking, device=device or ("cuda" if torch.cuda.is_available() else "cpu"))

    gen_kwargs = dict(
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        temperature=temperature if do_sample else None,
        top_p=top_p if do_sample else None,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        use_cache=True,
    )
    gen_kwargs = {k: v for k, v in gen_kwargs.items() if v is not None}

    # Прогрев (минимальный)
    if warmup:
        with torch.inference_mode():
            _ = model.generate(**{k: v.clone() for k, v in inputs.items()}, max_new_tokens=1)

    if (device or (torch.cuda.is_available() and model.device.type == "cuda")) and model.device.type == "cuda":
        torch.cuda.synchronize()
    t0 = time.perf_counter()
    with torch.inference_mode():
        out = model.generate(**inputs, **gen_kwargs)
    if model.device.type == "cuda":
        torch.cuda.synchronize()
    dt = time.perf_counter() - t0

    new_tokens = int(out.shape[-1] - inputs["input_ids"].shape[-1])
    text = tokenizer.decode(out[0], skip_special_tokens=True)
    if strip_think:
        text = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip()

    stats = {
        "device": str(model.device),
        "elapsed_s": dt,
        "new_tokens": new_tokens,
        "tokens_per_s": (new_tokens / dt) if dt > 0 else float("inf"),
    }
    return text, stats

# [ADD] [API] Удобная обёртка: загрузка + генерация за один вызов
def generate_text(
    *,
    model: str = "qwen3",
    device: Optional[str] = None,
    dtype: str = "auto",
    local_files_only: bool = False,
    prompt: Optional[str] = None,
    prompt_file: Optional[str] = None,
    system: Optional[str] = None,
    thinking: bool = False,
    strip_think: bool = False,
    max_new_tokens: int = 64,
    do_sample: bool = False,
    temperature: float = 0.0,
    top_p: float = 1.0,
    trust_remote_code: Optional[bool] = None,
):
    tok, mdl, dev = load_model(model=model, device=device, dtype=dtype, local_files_only=local_files_only, trust_remote_code=trust_remote_code)
    text, stats = generate_once(
        tok,
        mdl,
        prompt=prompt,
        prompt_file=prompt_file,
        system=system,
        thinking=thinking,
        strip_think=strip_think,
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        temperature=temperature,
        top_p=top_p,
        device=dev,
    )
    return text, stats

# [ADD] Helpers to make the script robust in notebooks / Jupyter where argv contains ipykernel args to make the script robust in notebooks / Jupyter where argv contains ipykernel args
def _in_notebook() -> bool:
    try:
        from IPython import get_ipython  # type: ignore
        return get_ipython() is not None
    except Exception:
        return False

# [ADD] Safe prompt reader that won't block in notebooks when stdin is not provided
# [MOD] Py3.8/3.9 compatible typing (PEP 604 not available); use Optional[str]
def _read_prompt_or_default(arg_prompt: Optional[str]) -> str:
    if arg_prompt is not None:
        return arg_prompt
    # Try to read from stdin only if data is available; otherwise fall back to empty prompt
    try:
        if sys.stdin and not sys.stdin.isatty():
            data = sys.stdin.read()
            if data:
                return data
    except Exception:
        pass
    # Fallback: empty prompt (safe for generation) with a short notice printed by caller
    return ""

def build_inputs(tokenizer, prompt, system=None, enable_thinking=None, device="cpu"):
    """Return input_ids tensor on the target device, using chat template if present.
    Надёжно обрабатывает отсутствие jinja2 (требуется для chat_template):
    при ошибке откатывается к обычной подаче prompt без шаблона и печатает предупреждение.
    """
    model_inputs = None
    use_chat = getattr(tokenizer, "chat_template", None) not in (None, "", False)

    if use_chat:
        try:
            messages = []
            if system:
                messages.append({"role": "system", "content": system})
            messages.append({"role": "user", "content": prompt})
            # enable_thinking is used by Qwen3; harmless for tokenizers that ignore extra vars
            text = tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=True,
                enable_thinking=enable_thinking if enable_thinking is not None else False,
            )
            model_inputs = tokenizer([text], return_tensors="pt")
        except (ImportError, AttributeError) as e:
            # [WARN] jinja2 старой версии или отсутствует → откатываемся на простой prompt
            print("[warn] Chat template requires jinja2>=3.1; falling back to plain prompt. Error:", e, file=sys.stderr)
            use_chat = False
        except Exception as e:
            # Любая другая ошибка рендера — также откат на простой prompt
            print("[warn] Failed to render chat template; falling back to plain prompt. Error:", e, file=sys.stderr)
            use_chat = False

    if not use_chat or model_inputs is None:
        model_inputs = tokenizer([prompt], return_tensors="pt")
        # [ADD] Ensure non-empty input for decoder-only models when prompt is empty
        if model_inputs["input_ids"].shape[1] == 0:
            fallback_id = tokenizer.eos_token_id
            if fallback_id is None:
                fallback_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0
            model_inputs = {
                "input_ids": torch.tensor([[fallback_id]], dtype=torch.long),
                "attention_mask": torch.tensor([[1]], dtype=torch.long),
            }

    return {k: v.to(device) for k, v in model_inputs.items()}

def main():
    p = argparse.ArgumentParser(description="Simple local HF generation harness")
    # [MOD] Make --model optional with a sensible default so the script runs inside notebooks
    p.add_argument(
        "--model",
        default=(PRESETS.get("qwen3") or "Qwen/Qwen3-0.6B"),
        help="preset key (gpt2|qwen3-0.6b|nemotron-1.5b) or any HF repo id; default=qwen3",
    )
    p.add_argument("--device", choices=["cpu", "cuda"], default="cuda" if torch.cuda.is_available() else "cpu")
    p.add_argument("--dtype", choices=list(DTYPE_MAP.keys()), default="auto",
                   help="torch dtype for model weights")
    p.add_argument("--max-new-tokens", type=int, default=64)
    p.add_argument("--temperature", type=float, default=0.0, help="0.0 => greedy")
    p.add_argument("--top-p", type=float, default=1.0)
    p.add_argument("--do-sample", action="store_true", help="enable sampling (else greedy)")
    p.add_argument("--thinking", action="store_true",
                   help="For Qwen3: enable thinking mode (adds <think>...</think> content)")
    p.add_argument("--strip-think", action="store_true",
                   help="Strip <think>...</think> block from decoded output (if present)")
    p.add_argument("--system", default=None, help="Optional system prompt for chat models")
    p.add_argument("--prompt", default=None, help="Prompt; if omitted, read from stdin; in notebooks defaults to empty string")
    # [ADD] Read prompt from file
    p.add_argument("--prompt-file", default=None, help="Path to a text file with the prompt (UTF-8)")
    p.add_argument("--print-tokens", action="store_true", help="Also print token counts and toks/sec")
    # [ADD] allow remote code (needed by some repos)
    p.add_argument("--trust-remote-code", action="store_true", help="Allow custom modeling code from repo (use only for trusted repos)")
    # [MOD] In notebooks, ignore unrelated ipykernel args by parsing only known flags
    args = p.parse_args([]) if _in_notebook() else p.parse_args()

    model_id = PRESETS.get(args.model.lower(), args.model)
    device = args.device
    if device == "cuda" and not torch.cuda.is_available():
        print("CUDA not available, falling back to CPU", file=sys.stderr)
        device = "cpu"

    torch_dtype = DTYPE_MAP[args.dtype]

    # [ADD] Auto-enable trust for Qwen models if flag not set
    trust_flag = args.trust_remote_code or ("Qwen/" in model_id or args.model.lower().startswith("qwen"))
    if trust_flag and not args.trust_remote_code:
        print("[info] Auto-enabling trust_remote_code for Qwen model.", file=sys.stderr)

    # Tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=trust_flag)  # [MOD] use inferred flag
    # Ensure pad_token_id exists for generation
    if tokenizer.pad_token_id is None and tokenizer.eos_token_id is not None:
        tokenizer.pad_token_id = tokenizer.eos_token_id

    # Model
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch_dtype,
        low_cpu_mem_usage=True,
        trust_remote_code=trust_flag,  # [MOD] use inferred flag
    )
    model.eval().to(device)

    # [MOD] Prompt resolution priority: --prompt-file > --prompt > stdin > PROMPT.TXT > empty
    prompt = None
    if args.prompt_file:
        try:
            prompt = Path(args.prompt_file).read_text(encoding="utf-8").strip()
            print(f"[info] Loaded prompt from file: {args.prompt_file}")
        except Exception as e:
            print(f"[warn] Failed to read --prompt-file '{args.prompt_file}': {e}", file=sys.stderr)
    if prompt is None and args.prompt is not None:
        prompt = args.prompt
    if prompt is None:
        # try stdin (non-blocking path)
        data = _read_prompt_or_default(None)
        if data:
            prompt = data
    if prompt is None and Path("PROMPT.TXT").exists():
        try:
            prompt = Path("PROMPT.TXT").read_text(encoding="utf-8").strip()
            print("[info] Using PROMPT.TXT from current directory.")
        except Exception as e:
            print(f"[warn] Failed to read PROMPT.TXT: {e}", file=sys.stderr)
    if prompt is None:
        prompt = ""
        if _in_notebook():
            print("[info] No --prompt/--prompt-file/stdin and PROMPT.TXT not found; using empty prompt.")

    inputs = build_inputs(
        tokenizer,
        prompt,
        system=args.system,
        enable_thinking=args.thinking,
        device=device
    )

    gen_kwargs = dict(
        max_new_tokens=args.max_new_tokens,
        do_sample=args.do_sample,
        temperature=args.temperature if args.do_sample else None,
        top_p=args.top_p if args.do_sample else None,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        use_cache=True,
    )
    # Remove None entries (generate() complains)
    gen_kwargs = {k: v for k, v in gen_kwargs.items() if v is not None}

    # Warmup (tiny, optional)
    with torch.inference_mode():
        _ = model.generate(**{k: v.clone() for k, v in inputs.items()}, max_new_tokens=1)

    # Timed run
    if device == "cuda":
        torch.cuda.synchronize()
    t0 = time.perf_counter()
    with torch.inference_mode():
        out = model.generate(**inputs, **gen_kwargs)
    if device == "cuda":
        torch.cuda.synchronize()
    dt = time.perf_counter() - t0

    # Separate new tokens from the continuation
    new_tokens = out.shape[-1] - inputs["input_ids"].shape[-1]
    text = tokenizer.decode(out[0], skip_special_tokens=True)

    if args.strip_think:
        text = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip()

    print(text)

    if args.print_tokens:
        toks_per_s = new_tokens / dt if dt > 0 else float("inf")
        print("\n--- stats ---")
        print(f"device: {device}")
        if device == "cuda":
            try:
                print(f"gpu: {torch.cuda.get_device_name()}")
            except Exception:
                pass
        print(f"elapsed_s: {dt:.3f}")
        print(f"new_tokens: {new_tokens}")
        print(f"tokens_per_s: {toks_per_s:.2f}")

if __name__ == "__main__":
    main()


[info] Auto-enabling trust_remote_code for Qwen model.


[info] Using PROMPT.TXT from current directory.


[warn] Chat template requires jinja2>=3.1; falling back to plain prompt. Error: module 'jinja2' has no attribute 'pass_eval_context'
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


For overdetermined reasons, I’ve lately found the world an increasingly terrifying and depressing place. It’s gotten harder and harder to concentrate on research, or even popular science writing. Every so often, though, something breaks through that wakes my inner child, reminds me of why I fell in love with research thirty years ago, and helps me forget about the triumphantly strutting factions working to destroy everything I value. That something is a story. And I’ve been writing stories about the world, but I’m not sure if I’m doing it right. I’m not sure if I’m writing about the world or about myself. And I’m not sure if I’m writing about the world in a way that’s meaningful or just a
