<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [1]:
from typing import List, Callable, Dict, Tuple, Optional
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig, pipeline
import pandas as pd
from pathlib import Path


pd.set_option("display.max_rows", None)  #убрали ограничение на число строк при отображении датафрейма
try:
    from IPython.display import display
    HAVE_DISPLAY = True
except Exception:
    HAVE_DISPLAY = False
    
    
import time as tm
t1_itogo = tm.time() # длЯ расчета времени исполнения всей тетрадки



USE_FILE_PROMPTS = True
PROMPT_FILE = "prompt.txt"
PROMPTS = (
    [Path(PROMPT_FILE).read_text(encoding="utf-8").strip()] if USE_FILE_PROMPTS else [
        "Write me a code for sorting an array in Python",
    ]
)

# Суммаризация
#SUMM_MODEL   = "sshleifer/distilbart-cnn-12-6"
#CUSTOM_WORDS = "Сводка:"

#_summarizer = pipeline(
#    "summarization",
#    model=SUMM_MODEL,
#    tokenizer=SUMM_MODEL,
#    device=0 if DEVICE=="cuda" else -1,
#)


MODS = [
    ("original", lambda p: p),
    ("typo first e", lambda p: p.replace("e", "3", 1)),
    ("add salutation", lambda p: "Dear user, " + p),
    ("префикс ======", lambda p: "=" * 10 + p),
    ("префикс вопрос", lambda p: "I have a question. " + p),
    ("суффикс 10 лет?", lambda p: p + " in the next decade?"),
  #  ("summary", lambda p: CUSTOM_WORDS + _summarizer(p, max_length=60, min_length=10, do_sample=False)[0]['summary_text'])
    #summary для коротких промптов не тимеет смысла, пока закомментим строчку, длдя Summary нукжно много что добавлять
]

MODEL_NAME = "gpt2"
STEPS = 100

# --------------------------------------------------
# 1.  Loading
# --------------------------------------------------

def setup_model(model_name: str = "gpt2", device: Optional[str] = None, dtype: Optional[torch.dtype] = None):
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    tok = AutoTokenizer.from_pretrained(model_name, padding_side="left")
    if tok.pad_token_id is None:
        tok.pad_token = tok.eos_token
    mk = {"torch_dtype": dtype} if dtype is not None else {}
    mdl = AutoModelForCausalLM.from_pretrained(model_name, **mk).to(device)
    if device == "cuda":
        torch.backends.cuda.matmul.allow_tf32 = True
    return mdl, tok, device

# --------------------------------------------------
# 2.  Batch preparation
# --------------------------------------------------

def prepare_batch(
    prompts: List[str],
    mods: List[Tuple[str, Callable[[str], str]]],
    tokenizer,
    device: str,
):
    texts, meta = [], []  # meta: (prompt_idx, mod_name)
    for i, p in enumerate(prompts):
        for m_name, m_fn in mods:
            meta.append((i, m_name))
            texts.append(m_fn(p))
    enc = tokenizer(texts, padding=True, return_tensors="pt")
    enc = {k: v.to(device) for k, v in enc.items()}
    lens = enc["attention_mask"].sum(dim=1)
    return enc, lens, meta

# --------------------------------------------------
# 3.  Generation wrapper
# --------------------------------------------------

def run_generation(model, enc, steps: int, pad_id: int):
    cfg = GenerationConfig(max_new_tokens=steps, do_sample=False, num_beams=1, pad_token_id=pad_id, use_cache=True)
    model.eval()
    with torch.inference_mode():
        out = model.generate(**enc, generation_config=cfg)
    return out

# --------------------------------------------------
# 4.  Post‑processing
# --------------------------------------------------

def decode_results(
    out, lens, meta, tokenizer, steps: int, decode_tokens: bool
) -> Dict[int, Dict[str, List]]:
    res: Dict[int, Dict[str, List]] = {}
    for idx, (p_idx, m_name) in enumerate(meta):
        start = lens[idx].item()
        ids = out[idx][start : start + steps].tolist()
        tokens = [tokenizer.decode([i], skip_special_tokens=True) for i in ids] if decode_tokens else ids
        res.setdefault(p_idx, {})[m_name] = tokens
    return res

# --------------------------------------------------
# 5.  Pretty printing
# --------------------------------------------------

def show_tables(res, prompts, trunc: int, return_dfs: bool, print_text: bool = True): 
    #если print_text: bool = True, то делаем печать помимо DataFrame
    dfs = {}
    for i, p in enumerate(prompts):
        df = pd.DataFrame(res[i])
        df.index = [f"step {j+1}" for j in range(len(df))]
        hdr = f"=== PROMPT {i+1}: {p[:trunc]}{'…' if len(p) > trunc else ''} ==="
        print("\n" + hdr)

        # Таблица top‑1 токенов
        if HAVE_DISPLAY:
            display(df)
        else:
            print(df.to_string())

        # Краткий текст‑продолжение из n токенов
        if print_text:
            print("\n--- concatenated top‑1 continuation (n = steps) ---")
            for col in df.columns:
                joined = ''.join(res[i][col])  # токены могут содержать пробелы
                print(f"[{col}] {joined}")
                print()

        if return_dfs:
            dfs[i] = df
    return dfs if return_dfs else None





  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


In [2]:
# --------------------------------------------------
# 6.  Public API
# --------------------------------------------------

def greedy_top1(
    prompts: List[str],
    mods: List[Tuple[str, Callable[[str], str]]],
    model, tokenizer,
    steps: int = 40,
    device: Optional[str] = None,
    decode_tokens: bool = True,
    trunc_print: int = 120,
    return_dataframes: bool = True,
):
    device = device or model.device
    enc, lens, meta = prepare_batch(prompts, mods, tokenizer, device)
    out = run_generation(model, enc, steps, tokenizer.pad_token_id)
    res = decode_results(out, lens, meta, tokenizer, steps, decode_tokens)
    dfs = show_tables(res, prompts, trunc_print, return_dataframes)
    return (res, dfs) if return_dataframes else res

In [3]:
# --------------------------------------------------
# 7.  Script example
# --------------------------------------------------

mdl, tok, dev = setup_model(MODEL_NAME)
_ = greedy_top1(PROMPTS, MODS, mdl, tok, steps=STEPS, device=dev)



=== PROMPT 1: For overdetermined reasons, I’ve lately found the world an increasingly terrifying and depressing place. It’s gotten har… ===


Unnamed: 0,original,typo first e,add salutation,префикс ======,префикс вопрос,суффикс 10 лет?
step 1,destroy,I,value,I,\n,\n
step 2,everything,value,.,value,\n,\n
step 3,I,.,\n,.,I,I
step 4,value,\n,\n,\n,have,'m
step 5,.,\n,I,\n,a,not
step 6,\n,I,'ve,I,question,sure
step 7,\n,'ve,been,'ve,.,what
step 8,I,been,a,been,For,to
step 9,'ve,a,researcher,a,overd,make
step 10,been,researcher,for,researcher,etermined,of



--- concatenated top‑1 continuation (n = steps) ---
[original]  destroy everything I value.

I've been a researcher for twenty years, and I've never been a scientist. I've never been a scientist. I've never been a scientist. I've never been a scientist. I've never been a scientist. I've never been a scientist. I've never been a scientist. I've never been a scientist. I've never been a scientist. I've never been a scientist. I've never been a scientist. I've never been a scientist

[typo first e]  I value.

I've been a researcher for over twenty years, and I've never been a scientist. I've never been a scientist. I've never been a scientist. I've never been a scientist. I've never been a scientist. I've never been a scientist. I've never been a scientist. I've never been a scientist. I've never been a scientist. I've never been a scientist. I've never been a scientist. I've never been a scientist.

[add salutation]  value.

I've been a researcher for over twenty years, and I've never b

In [4]:
t2_itogo = tm.time()
print('вся тетрадка заняла', round(t2_itogo - t1_itogo)//60,'минут', round(t2_itogo - t1_itogo)%60,'секунд')

вся тетрадка заняла 0 минут 13 секунд
