In [None]:
# 将 llm_toy/src 加入 sys.path（稳健多候选）
import sys as _sys
from pathlib import Path as _Path
def _add_src_path():
    """Find the project's ``src`` directory and make it importable.

    Probes ``<base>/llm_toy/src`` and ``<base>/src`` for a directory that
    contains both ``model.py`` and ``utils.py``, where ``<base>`` is the
    current working directory followed by up to three of its ancestors
    (nearest first). The first match is resolved and appended to
    ``sys.path``; if nothing matches, a warning is printed instead.

    Re-running the cell is safe: a directory that is already on
    ``sys.path`` is not appended a second time.
    """
    cwd = _Path.cwd()
    # cwd.parent is the same directory as cwd.parents[0], so walking the cwd
    # plus its first three ancestors visits every location exactly once.
    bases = [cwd] + list(cwd.parents)[:3]
    for base in bases:
        for cand in (base / 'llm_toy' / 'src', base / 'src'):
            if (cand / 'model.py').exists() and (cand / 'utils.py').exists():
                resolved = cand.resolve()
                # Keep sys.path free of duplicates when the cell is re-executed.
                if str(resolved) not in _sys.path:
                    _sys.path.append(str(resolved))
                print('已添加src路径:', resolved)
                return
    print('警告：未找到 llm_toy/src，请手动添加路径或调整工作目录。')
_add_src_path()


# 07 RAG入门：Retrieval-Augmented Generation（检索增强生成）

本Notebook演示最小RAG流程：使用 TF-IDF 检索文档，并把结果拼接到Prompt中交给 GPT-2 生成回答。

In [None]:
import numpy as np
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from model import SimpleGPTModel

# Pick the compute device once: CUDA when PyTorch can see a GPU, else the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device  # last expression, so the notebook displays the chosen device


## 构建检索语料（Corpus）

混合 tiny_corpus.txt 与少量额外句子，形成一个小型知识库（Doc集合）。

In [None]:
# Resolve tiny_corpus.txt by probing several likely locations relative to the
# working directory, then build the retrieval corpus from it plus a few
# hand-written sentences.
from pathlib import Path as _P2
_cwd = _P2.cwd()
_cands = [
    _cwd/'llm_toy'/'data'/'tiny_corpus.txt',
    _cwd/'data'/'tiny_corpus.txt',
]
# _cwd.parent is _cwd.parents[0], so this loop also covers the parent directory.
for base in list(_cwd.parents)[:3]:
    _cands.append(base/'llm_toy'/'data'/'tiny_corpus.txt')
tiny_path = None
for _p in _cands:
    # is_file() rather than exists(): a directory with that name cannot be read.
    if _p.is_file():
        tiny_path = _p
        break

lines = []
if tiny_path is not None:
    # splitlines() copes with both \n and \r\n line endings; surrounding
    # whitespace is trimmed and blank lines are dropped so they never become
    # empty documents. When no file is found the corpus silently falls back
    # to the built-in sentences below.
    _raw_text = tiny_path.read_text(encoding='utf-8')
    lines = [ln.strip() for ln in _raw_text.splitlines() if ln.strip()]
extra = [
    'Transformer 使用 Self-Attention 捕捉序列中远距离依赖。',
    'Fine-tuning 可以让预训练模型适配特定任务或风格。',
    'Tokenization 将文本切分为 tokens，是NLP管线的基础环节。',
    'Top-k 与 Top-p 采样影响生成的多样性与保守性。',
]
# File lines come first, then the extra sentences; empty entries are filtered out.
corpus = [s for s in (lines + extra) if s and isinstance(s, str)]
len(corpus), corpus[:3]


## 构建TF-IDF检索器

拟合语料得到 doc-term 矩阵，查询时计算 cosine 相似度并返回Top-k文档。

In [None]:
# Fit a TF-IDF model on the corpus, keeping at most 5000 vocabulary terms.
vectorizer = TfidfVectorizer(max_features=5000)
# X is the document-term matrix produced by fitting on `corpus`.
X = vectorizer.fit_transform(corpus)
# NOTE(review): the vectorizer is used with its default word-level tokenisation;
# unsegmented Chinese text may collapse into a few long tokens, which would hurt
# recall — consider character n-grams. TODO confirm against the actual corpus.
X.shape  # displayed as the cell output


In [None]:
def retrieve(query: str, k: int = 3):
    """Return the ``k`` corpus entries most similar to ``query``.

    The query is transformed with the fitted ``vectorizer`` and compared
    against every row of the document-term matrix ``X`` using cosine
    similarity.

    Args:
        query: Free-text question to search for.
        k: Maximum number of hits to return. Values below 1 yield an empty
            list; values larger than the corpus return every document.

    Returns:
        A list of ``(index, score, text)`` tuples ordered by descending
        score, where ``index`` is the position of ``text`` in ``corpus``.
        Documents with equal scores keep their corpus order.
    """
    if k <= 0:
        # A negative k would otherwise slice from the end of the ranking and
        # return an arbitrary number of hits.
        return []
    q = vectorizer.transform([query])
    sims = cosine_similarity(q, X)[0]
    # Stable sort so that tied scores are reported in corpus order.
    idx = np.argsort(-sims, kind='stable')[:k]
    return [(int(i), float(sims[i]), corpus[int(i)]) for i in idx]

# Quick check of the retriever: the cell output lists the top-3 (index, score, text) hits.
retrieve('什么是 Transformer 和 Self-Attention？', k=3)


## 组装Prompt并用生成模型作答

把检索到的文档作为“已知信息”拼到Prompt中，提示模型基于此作答。

In [None]:
# Load the generator once and keep handles to its tokenizer and model.
# NOTE(review): SimpleGPTModel comes from the project's model.py; presumably it
# wraps a Hugging Face GPT-2 model/tokenizer pair — confirm there.
simple = SimpleGPTModel(model_name='gpt2')
tok = simple.tokenizer
gen_model = simple.model.to(device)  # move the model to the selected device

def build_prompt(query: str, ctx_list):
    """Assemble the RAG prompt from retrieved context and the user question.

    Args:
        query: The user's question, inserted verbatim.
        ctx_list: Iterable of 3-tuples; only the third element (the
            document text) is used.

    Returns:
        One string made of the instruction line, the context entries
        numbered from 1 (one per line), the question, and a trailing
        answer cue for the model to continue from.
    """
    numbered = []
    for rank, (_idx, _score, text) in enumerate(ctx_list, start=1):
        numbered.append(f'{rank}. {text}')
    pieces = (
        '请基于以下已知信息简洁回答用户问题。\n已知信息：\n',
        '\n'.join(numbered),
        '\n问题：',
        query,
        '\n答案：',
    )
    return ''.join(pieces)

def generate_answer(prompt: str, max_new_tokens=80, temperature=0.7):
    """Generate a continuation of ``prompt`` with the loaded language model.

    Args:
        prompt: Full prompt text (instruction, context and question).
        max_new_tokens: Upper bound on the number of tokens to add; the total
            sequence length is additionally capped at 1024 tokens.
        temperature: Sampling temperature passed to the model. ``None`` or a
            value of 0 or below switches to greedy decoding instead of
            sampling.

    Returns:
        The first output sequence decoded to text with special tokens
        removed.
        NOTE(review): for a decoder-only model such as GPT-2 the output
        sequence presumably starts with the prompt tokens, so the returned
        text would include the prompt — confirm against the model wrapper.
    """
    input_ids = tok.encode(prompt, return_tensors='pt').to(device)
    # Sampling only makes sense with a positive temperature.
    sampling = temperature is not None and temperature > 0
    gen_kwargs = dict(
        max_length=min(1024, input_ids.shape[1] + max_new_tokens),
        do_sample=sampling,
        pad_token_id=tok.eos_token_id,
    )
    if sampling:
        # Honour the caller's temperature instead of a hard-coded constant.
        gen_kwargs['temperature'] = temperature
    with torch.no_grad():
        out = gen_model.generate(input_ids, **gen_kwargs)
    return tok.decode(out[0], skip_special_tokens=True)

# End-to-end demo: retrieve context, build the prompt, and generate an answer.
query = '如何简单解释 Transformer 的核心思想？'
ctx = retrieve(query, k=3)
prompt = build_prompt(query, ctx)
print('--- Prompt ---')
print(prompt)
# Use the \n escape for the blank line; a raw line break inside a
# single-quoted literal is a syntax error.
print('\n--- Answer ---')
print(generate_answer(prompt))
