<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [1]:
import torch, torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForCausalLM
import pandas as pd
from IPython.display import display

"""
Базовый скрипт: сравнение распределений P(next_token) без суммаризации.
* Строки таблиц = исходные prompt-ы.
* Столбцы       = русские названия модификаций.
* Метрики       = cosine similarity и KL‑divergence.
* Параметр STEPS задаёт, сколько токенов вперёд анализируем.
"""

# ---------- PARAMETERS ----------
# Model identifier handed to both from_pretrained() calls below.
MODEL_NAME = "gpt2"
# Number of next-token steps analysed for every prompt.
STEPS      = 2
# Run on the GPU when torch reports one, otherwise on the CPU.
DEVICE     = "cuda" if torch.cuda.is_available() else "cpu"

# ---------- 1. Model and tokenizer ----------
# Both objects are created once when the cell runs; the model is moved to
# DEVICE and switched to eval mode before any prompt is processed.
_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
_model     = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(DEVICE).eval()

# ---------- 2. Вспомогательные функции ----------

def _next_token_probs(ids):
    """Return the softmax distribution for the token that follows `ids`.

    The module-level `_model` is run without gradient tracking; the logits at
    batch index 0, last sequence position are converted to probabilities and
    moved to the CPU.
    """
    with torch.no_grad():
        output = _model(ids)
        last_logits = output.logits[0, -1]
    probabilities = torch.softmax(last_logits, dim=-1)
    return probabilities.cpu()

def _kl(p, q):
    return F.kl_div(p.log(), q, reduction="batchmean").item()

def _cos(p, q):
    return F.cosine_similarity(p, q, dim=0).item()

def _multi_step(prompt: str, n: int):
    """Collect the next-token distributions for `n` greedy decoding steps.

    The prompt is tokenized once; at every step the distribution returned by
    `_next_token_probs` is stored and its arg-max token is appended to the
    context before the next step.

    Returns:
        A list with `n` probability tensors, one per step, in step order.
    """
    context = _tokenizer.encode(prompt, return_tensors="pt").to(DEVICE)
    collected = []
    for _step in range(n):
        step_probs = _next_token_probs(context)
        collected.append(step_probs)
        # Greedy continuation: shape the winning token id as (1, 1) so it can
        # be concatenated along the sequence dimension.
        greedy_id = step_probs.argmax().reshape(1, 1).to(DEVICE)
        context = torch.cat((context, greedy_id), dim=1)
    return collected

# ---------- 3. Source prompts ----------
PROMPTS = [
    "Why the stock market is expected to",
    "The future of artificial intelligence depends on",
]

# ---------- 4. Modifications ----------
def _with_equals_prefix(p):
    """Put ten '=' characters in front of the prompt."""
    return "=" * 10 + p


def _with_question_prefix(p):
    """Put an English 'I have a question.' sentence in front of the prompt."""
    return "I have a question. " + p


def _with_decade_suffix(p):
    """Append ' in the next decade?' to the prompt."""
    return p + " in the next decade?"


# Keys are the column labels of the result tables; values map a prompt to its
# modified version.
MODS = {
    "префикс ======": _with_equals_prefix,
    "префикс вопрос": _with_question_prefix,
    "суффикс 10 лет?": _with_decade_suffix,
}

# ---------- 5. Computation ----------
# One record per (prompt, modification, step): the base prompt's distributions
# are computed once per prompt and compared with each modified prompt's.
records = []
for prompt in PROMPTS:
    base = _multi_step(prompt, STEPS)
    for mod_name, mod_fn in MODS.items():
        mod = _multi_step(mod_fn(prompt), STEPS)
        for step in range(STEPS):
            base_dist = base[step]
            mod_dist = mod[step]
            row = {
                "prompt": prompt,
                "mod": mod_name,
                "step": step + 1,  # steps are reported 1-based
                "cos": _cos(base_dist, mod_dist),
                "kl": _kl(base_dist, mod_dist),
            }
            records.append(row)

_df = pd.DataFrame(records)
pd.options.display.precision = 4

# ---------- 6. Output ----------
# For every step show two prompt × modification tables: cosine first, then KL.
for s in range(1, STEPS + 1):
    df = _df.loc[_df["step"] == s]
    for title, column in (("Cosine", "cos"), ("KL", "kl")):
        print(f"\n=== Шаг {s} — {title} ===")
        display(df.pivot(index="prompt", columns="mod", values=column))

# ---------- 7. Line-count limit (must not raise inside Jupyter) ----------
import inspect, sys
try:
    # Count the source lines of the module this code is running in and warn
    # once the count exceeds 175.
    src_len = len(inspect.getsource(sys.modules[__name__]).splitlines())
    if src_len > 175:
        print("⚠️ Файл приближается к лимиту, рассмотрите рефакторинг.")
except (TypeError, OSError):
    # In Jupyter, __main__ may be a built-in module, so getsource is unavailable;
    # the check is then skipped silently.
    pass


  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (



=== Шаг 1 — Cosine ===


mod,префикс ======,префикс вопрос,суффикс 10 лет?
prompt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
The future of artificial intelligence depends on,0.9745,0.9395,0.0024
Why the stock market is expected to,0.9017,0.9308,0.0005



=== Шаг 1 — KL ===


mod,префикс ======,префикс вопрос,суффикс 10 лет?
prompt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
The future of artificial intelligence depends on,9.6809e-07,2.2507e-06,0.0001
Why the stock market is expected to,1.9205e-06,1.7422e-06,0.0002



=== Шаг 2 — Cosine ===


mod,префикс ======,префикс вопрос,суффикс 10 лет?
prompt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
The future of artificial intelligence depends on,0.9616,0.0008,0.0006
Why the stock market is expected to,0.9372,0.8397,0.034



=== Шаг 2 — KL ===


mod,префикс ======,префикс вопрос,суффикс 10 лет?
prompt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
The future of artificial intelligence depends on,5.1809e-07,0.000184,0.0002
Why the stock market is expected to,2.7418e-06,5.6288e-06,0.0001


In [7]:
import torch, torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import pandas as pd
from IPython.display import display
from pathlib import Path

"""
Скрипт сравнения распределений P(next_token) с поддержкой:
----------------------------------------------------------------
1. **Внутренних промптов** (список в коде) **или**
2. **Промптов из текстового файла** в той же директории.

Флаг `USE_FILE_PROMPTS` выбирает источник.
Если `True`, файл `PROMPT_FILE` читается целиком и используется как **один** длинный prompt.

Модификация «summary + слова» печатает краткое summary исходного prompt‑а.
Добавлена **безопасная токенизация**: если длиннее лимита модели (1024 для GPT‑2),
текст автоматически усечётся, а в консоль выведется предупреждение.
"""

# ---------- PARAMETERS ----------
# Model identifier handed to both from_pretrained() calls below.
MODEL_NAME   = "gpt2"
# Number of next-token steps analysed for every prompt.
STEPS        = 2
# Run on the GPU when torch reports one, otherwise on the CPU.
DEVICE       = "cuda" if torch.cuda.is_available() else "cpu"

# Summarization
# SUMM_MODEL is used as both model and tokenizer of the summarization pipeline.
SUMM_MODEL   = "sshleifer/distilbart-cnn-12-6"
# Default text that summary_mod() places before the generated summary.
CUSTOM_WORDS = "Сводка:"

# Prompt source selection
USE_FILE_PROMPTS = False             # ← set to True to read the prompt from a file
PROMPT_FILE      = "prompt.txt"      # UTF-8 text file; relative path, so resolved against the working directory

# ---------- 1. Models ----------
# All models are created once when the cell runs; the causal LM is moved to
# DEVICE and switched to eval mode before any prompt is processed.
_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
_model     = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(DEVICE).eval()

# The pipeline receives device index 0 when DEVICE is "cuda" and -1 otherwise.
_summarizer = pipeline(
    "summarization",
    model=SUMM_MODEL,
    tokenizer=SUMM_MODEL,
    device=0 if DEVICE=="cuda" else -1,
)

# NOTE(review): `n_positions` is the GPT-2 config field name; other values of
# MODEL_NAME may not expose it — confirm before switching models.
CTX_LIMIT = _model.config.n_positions  # 1024 for GPT-2

# ---------- 2. Вспомогательные функции ----------

def _next_token_probs(ids):
    """Return the softmax distribution for the token that follows `ids`.

    The module-level `_model` is run without gradient tracking; the logits at
    batch index 0, last sequence position are converted to probabilities and
    moved to the CPU.
    """
    with torch.no_grad():
        output = _model(ids)
        last_logits = output.logits[0, -1]
    probabilities = torch.softmax(last_logits, dim=-1)
    return probabilities.cpu()


def _kl(p, q):
    return F.kl_div(p.log(), q, reduction="batchmean").item()


def _cos(p, q):
    return F.cosine_similarity(p, q, dim=0).item()


def _multi_step(prompt: str, n: int):
    """Collect the next-token distributions for `n` greedy decoding steps.

    The prompt is tokenized and, if necessary, truncated so that the prompt
    plus the tokens appended during the `n` steps still fits into the model
    context (`CTX_LIMIT`). A warning is printed when truncation happens. As
    before, truncation is done by the tokenizer with its default settings.

    Args:
        prompt: text to continue.
        n: number of steps; step i is fed the prompt plus i greedy tokens.

    Returns:
        A list with `n` probability tensors, one per step, in step order.

    Raises:
        ValueError: if `n` steps cannot fit into the context at all, or if the
            prompt tokenizes to nothing.
    """
    # The longest input fed to the model is len(prompt) + (n - 1) tokens, so
    # the prompt itself may use at most CTX_LIMIT - (n - 1) positions.
    max_prompt_len = CTX_LIMIT - max(n - 1, 0)
    if max_prompt_len < 1:
        raise ValueError(
            f"n={n} шагов не помещаются в контекст модели ({CTX_LIMIT} токенов)."
        )

    token_ids = _tokenizer.encode(prompt)
    if len(token_ids) > max_prompt_len:
        print(f"⚠️ Промпт содержит {len(token_ids)} токенов и будет усечён до {max_prompt_len}.")
    ids = _tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=max_prompt_len
        )["input_ids"].to(DEVICE)
    if ids.shape[1] == 0:
        # Fail early with a clear message instead of an indexing error deep
        # inside the model.
        raise ValueError("Промпт пуст: нет ни одного токена для анализа.")

    dists = []
    for _ in range(n):
        probs = _next_token_probs(ids)
        dists.append(probs)
        # Greedy continuation: the arg-max token, shaped (1, 1), is appended
        # along the sequence dimension.
        next_id = probs.argmax().view(1, 1).to(ids.device)
        ids = torch.cat([ids, next_id], dim=1)
    return dists

# ---------- 3. Prompt source ----------
# With USE_FILE_PROMPTS the whole file becomes ONE prompt; otherwise the
# built-in list is used.
if USE_FILE_PROMPTS:
    path = Path(PROMPT_FILE)
    if not path.is_file():
        raise FileNotFoundError(f"Файл {PROMPT_FILE} не найден")
    file_prompt = path.read_text(encoding="utf-8").strip()
    if not file_prompt:
        # An empty prompt cannot be tokenized into anything the model can
        # score, so stop here with an explicit message.
        raise ValueError(f"Файл {PROMPT_FILE} пуст")
    PROMPTS = [file_prompt]
else:
    PROMPTS = [
        "Why the stock market is expected to",
        "The future of artificial intelligence depends on",
    ]
    

# ---------- 4. Модификации ----------
def add_typo_first_e(p: str) -> str:
    """Swap the first lowercase 'e' in `p` for '3'; return `p` unchanged if there is none."""
    head, found, tail = p.partition("e")
    replacement = "3" if found else ""
    return head + replacement + tail

def add_salutation(p: str) -> str:
    """Return `p` preceded by a 'Dear GPT,' line."""
    greeting_line = "Dear GPT,\n"
    return "".join((greeting_line, p))


def _with_equals_prefix(p):
    """Put ten '=' characters in front of the prompt."""
    return "=" * 10 + p


def _with_english_question_prefix(p):
    """Put an English 'I have a question.' sentence in front of the prompt."""
    return "I have a question. " + p


def _with_russian_question_prefix(p):
    """Put the Russian equivalent of 'I have a question.' in front of the prompt."""
    return "У меня есть вопрос. " + p


def _with_decade_suffix(p):
    """Append ' in the next decade?' to the prompt."""
    return p + " in the next decade?"


# Keys are the column labels of the result tables; values map a prompt to its
# modified version.
MODS = {
    "префикс ======": _with_equals_prefix,
    "префикс question(англ)": _with_english_question_prefix,
    "префикс вопрос": _with_russian_question_prefix,
    "суффикс 10 лет?": _with_decade_suffix,
}

# --- Модификация с суммаризацией и печатью ---

def summary_mod(custom: str = CUSTOM_WORDS):
    """Build a prompt modifier that prepends `custom` and a generated summary.

    Args:
        custom: text placed before the summary (defaults to CUSTOM_WORDS).

    Returns:
        A function `p -> str` that summarizes `p` with the module-level
        `_summarizer`, prints the summary, and returns
        "<custom> <summary>", a blank line, then the original prompt.
    """
    def _fn(p: str):
        # truncation=True asks the pipeline to cut inputs that exceed the
        # summarizer's own maximum input length, so a long file prompt is
        # shortened for summarization rather than passed through unbounded.
        result = _summarizer(
            p,
            max_length=60,
            min_length=15,
            do_sample=False,
            truncation=True,
        )
        summary = result[0]["summary_text"]
        print(f"\n[SUMMARY]\n{summary}\n")
        return f"{custom} {summary}\n\n{p}"
    return _fn

# Register the summary-based modification alongside the simple ones.
MODS["summary + слова (print)"] = summary_mod()

# ---------- 5. Computation ----------
# One record per (prompt, modification, step): the base prompt's distributions
# are computed once per prompt and compared with each modified prompt's.
records = []
for prompt in PROMPTS:
    base = _multi_step(prompt, STEPS)
    # Table label: the prompt itself, cut to 80 characters plus an ellipsis
    # when it is longer than that.
    label = prompt if len(prompt) <= 80 else prompt[:80] + "…"
    for mod_name, mod_fn in MODS.items():
        mod = _multi_step(mod_fn(prompt), STEPS)
        for step in range(STEPS):
            base_dist = base[step]
            mod_dist = mod[step]
            row = {
                "prompt": label,
                "mod": mod_name,
                "step": step + 1,  # steps are reported 1-based
                "cos": _cos(base_dist, mod_dist),
                "kl": _kl(base_dist, mod_dist),
            }
            records.append(row)

_df = pd.DataFrame(records)
pd.options.display.precision = 4

# ---------- 6. Output ----------
# For every step show two prompt × modification tables: cosine first, then KL.
for s in range(1, STEPS + 1):
    df = _df.loc[_df["step"] == s]
    for title, column in (("Cosine", "cos"), ("KL", "kl")):
        print(f"\n=== Шаг {s} — {title} ===")
        display(df.pivot(index="prompt", columns="mod", values=column))

# ---------- 7. Line-count limit (must not raise inside Jupyter) ----------
import inspect, sys
try:
    # Warn once the source of the module this code runs in exceeds 175 lines.
    if len(inspect.getsource(sys.modules[__name__]).splitlines()) > 175:
        print("⚠️ Файл приближается к лимиту, рассмотрите рефакторинг.")
except (TypeError, OSError):
    # Source lookup failed with one of the two handled exception types;
    # the check is skipped silently.
    pass


Your max_length is set to 60, but your input_length is only 9. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=4)



[SUMMARY]
 Why the stock market is expected to perform well in the first half of 2014 .



Your max_length is set to 60, but your input_length is only 9. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=4)



[SUMMARY]
 The future of artificial intelligence depends on artificial intelligence, experts say . Artificial intelligence is a key to the future of AI technology .


=== Шаг 1 — Cosine ===


mod,summary + слова (print),префикс ======,префикс question(англ),префикс вопрос,суффикс 10 лет?
prompt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
The future of artificial intelligence depends on,0.035,0.9745,0.9395,0.9138,0.0024
Why the stock market is expected to,0.0376,0.9017,0.9308,0.8906,0.0005



=== Шаг 1 — KL ===


mod,summary + слова (print),префикс ======,префикс question(англ),префикс вопрос,суффикс 10 лет?
prompt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
The future of artificial intelligence depends on,9.2285e-05,9.6809e-07,2.2507e-06,2.8169e-06,0.0001
Why the stock market is expected to,0.00010411,1.9205e-06,1.7422e-06,2.3389e-06,0.0002



=== Шаг 2 — Cosine ===


mod,summary + слова (print),префикс ======,префикс question(англ),префикс вопрос,суффикс 10 лет?
prompt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
The future of artificial intelligence depends on,0.017,0.9616,0.0008,0.8433,0.0006
Why the stock market is expected to,0.0054,0.9372,0.8397,0.9152,0.034



=== Шаг 2 — KL ===


mod,summary + слова (print),префикс ======,префикс question(англ),префикс вопрос,суффикс 10 лет?
prompt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
The future of artificial intelligence depends on,0.0001,5.1809e-07,0.000184,3.0122e-06,0.0002
Why the stock market is expected to,0.0001,2.7418e-06,5.6288e-06,3.8064e-06,0.0001
