<a href="https://colab.research.google.com/github/201524495/201524495/blob/main/logitlens.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Logit Lens
실습 내용:
- **Logit Lens**로 **intermediate hidden representation을 vocabulary logit으로 투영**해 레이어별 예측을 분석



![](https://res.cloudinary.com/lesswrong-2-0/image/upload/f_auto%2Cq_auto/v1/mirroredImages/AcKRB8wDpdaN6v6ru/ccfmt4rt3aegjjfi7lo8)  
![](https://res.cloudinary.com/lesswrong-2-0/image/upload/f_auto%2Cq_auto/v1/mirroredImages/AcKRB8wDpdaN6v6ru/u4idlaozp3dnnom3qitn)  
출처: *Interpreting GPT: the Logit Lens* (LessWrong, 2020).

In [None]:
# !pip install -q transformers torch
import os, torch, numpy as np
import matplotlib.pyplot as plt
from transformers import AutoModelForCausalLM, AutoTokenizer
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"


## 1) HF 모델 load


In [None]:
# Checkpoint to inspect; the HF_MODEL_LENS environment variable overrides the
# "gpt2" default.
MODEL_NAME = os.getenv("HF_MODEL_LENS", "gpt2")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Reuse the EOS token for padding when the tokenizer defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Load with hidden-state output enabled, then move to `device` and switch to
# eval mode.
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, output_hidden_states=True)
model = model.to(device).eval()

### ToDo: implement the unembedding layer (logit predictor)
W_U = None  # [V, d]


## 2) Logit Lens utility


In [None]:
def run_logit_lens(prompt: str):
    """Tokenize `prompt`, run the model once, and return the lens inputs.

    Returns a 3-tuple ``(toks, logits_per_layer, last_ix)``: the tokenizer
    output moved to `device`, the per-layer logits (still the exercise
    placeholder ``None`` until the ToDo below is filled in), and the index of
    the last position of ``input_ids``.
    """
    toks = tokenizer(prompt, return_tensors="pt").to(device)
    with torch.no_grad():  # inference only, no gradients needed
        out = model(**toks)
        hiddens = out.hidden_states

    ### ToDo: implement the layer-wise logit lens (list of logits per layer)
    logits_per_layer = None
    seq_len = toks["input_ids"].shape[1]
    last_ix = int(seq_len - 1)
    return toks, logits_per_layer, last_ix

def topk_tokens(vec, k=5):
    """Return the decoded tokens and scores of the `k` largest entries of `vec`.

    The result is a pair of parallel lists ``(tokens, values)`` in the order
    produced by ``torch.topk``; each index is decoded on its own with the
    global `tokenizer`.
    """
    best = torch.topk(vec, k)
    tokens = [tokenizer.decode([int(ix)]) for ix in best.indices]
    values = [float(score) for score in best.values]
    return tokens, values


## 3) 입력 prompt에 대한 top-3 word 추출


In [None]:
# Prompts whose last-position prediction is printed for every layer.
prompts = [
    "Paris is capital of",
    "Alan Turing was a",
    "The most famous sports in USA is",
]
for prompt in prompts:
    toks, logits_per_layer, last_ix = run_logit_lens(prompt)
    print(f"Prompt: {prompt}")
    for L, logits in enumerate(logits_per_layer):
        # Logits at batch entry 0, last position of the prompt.
        last_position_logits = logits[0, last_ix]
        tops, vals = topk_tokens(last_position_logits, k=3)
        print(f"  Layer {L:02d}: {tops}")



## 4) 타겟 토큰 확률의 레이어별 변화


In [None]:
# Track, layer by layer, the probability assigned to one target token.
target_token = " football"
prompt = "The most famous sports in USA is"
toks, logits_per_layer, last_ix = run_logit_lens(prompt)
# Only the first id of the encoded target string is tracked.
target_id = tokenizer.encode(target_token)[0]
probs = []
for logits in logits_per_layer:
    ### ToDo: implement logit -> probability
    p = None
    probs.append(float(p))

# One point per entry of logits_per_layer.
plt.figure()
plt.plot(range(len(probs)), probs, marker='o')
plt.xlabel("Layer")
plt.ylabel(f"P({target_token.strip()})")
plt.title("Layer-wise probability")
plt.grid(True)
plt.show()


## 5) 타겟 token rank 추적


In [None]:
def token_rank(logits, token_id):
    """Return the 1-based rank of `token_id` within `logits`.

    The rank is one plus the number of entries strictly greater than the
    token's own score, so tied entries do not push the rank down. As called
    in this notebook, `logits` is a single 1-D score vector.
    """
    target_score = logits[token_id]
    strictly_higher = (logits > target_score).sum()
    return int(strictly_higher) + 1

# Follow the rank of one "gold" token at the last prompt position across layers.
prompt = "The most famous sports in USA is"
gold = " football"
toks = tokenizer(prompt, return_tensors="pt").to(device)
# Only the first id of the encoded gold string is tracked.
gold_id = tokenizer.encode(gold)[0]

with torch.no_grad():
    out = model(**toks)
    hiddens = out.hidden_states

# Project every hidden state onto the vocabulary with the unembedding matrix.
logits_per_layer = [(h @ W_U.T).cpu() for h in hiddens]
last_ix = int(toks["input_ids"].shape[1] - 1)
ranks = []
for logits in logits_per_layer:
    vec = logits[0, last_ix]
    ranks.append(token_rank(vec, gold_id))

plt.figure()
plt.plot(range(len(ranks)), ranks, marker='o')
plt.xlabel("Layer")
plt.ylabel(f"rank({gold.strip()}) ↓ better")
plt.title("Gold token rank by layer")
# Flip the y-axis so that rank 1 (best) is drawn at the top.
plt.gca().invert_yaxis()
plt.grid(True)
plt.show()


## 6) Reference


- LessWrong — *Interpreting GPT: the Logit Lens* (2020): https://www.lesswrong.com/posts/AcKRB8wDpdaN6v6ru/interpreting-gpt-the-logit-lens  
- TransformerLens: https://github.com/TransformerLensOrg/TransformerLens  
- Tuned Lens: https://github.com/AlignmentResearch/tuned-lens


### 정답

In [None]:
# Checkpoint to inspect; the HF_MODEL_LENS environment variable overrides the
# "gpt2" default.
MODEL_NAME = os.getenv("HF_MODEL_LENS", "gpt2")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Reuse the EOS token for padding when the tokenizer defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Load with hidden-state output enabled, then move to `device` and switch to
# eval mode.
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, output_hidden_states=True)
model = model.to(device).eval()
# Unembedding matrix: the LM head weight, detached from the autograd graph.
W_U = model.lm_head.weight.detach()  # [V, d]


In [None]:
def run_logit_lens(prompt: str):
    """Project each hidden state of `prompt` onto the vocabulary (logit lens).

    Args:
        prompt: Text to tokenize and feed through the model.

    Returns:
        A 3-tuple ``(toks, logits_per_layer, last_ix)``:
        - ``toks``: the tokenizer output, moved to `device`.
        - ``logits_per_layer``: one CPU tensor per entry of
          ``out.hidden_states``, obtained by multiplying that hidden state
          with ``W_U.T``.
        - ``last_ix``: index of the last position of ``input_ids``.
    """
    toks = tokenizer(prompt, return_tensors="pt").to(device)
    with torch.no_grad():  # inference only, no gradients needed
        out = model(**toks)
        hiddens = out.hidden_states

    # Logit lens: reuse the unembedding matrix W_U ([V, d]) on every hidden
    # state, the same projection the rank-tracking cell applies.
    # NOTE(review): no extra final layer norm is applied to the intermediate
    # states here -- confirm this is the intended variant of the lens.
    logits_per_layer = [torch.matmul(h, W_U.T).cpu() for h in hiddens]
    last_ix = int(toks["input_ids"].shape[1] - 1)
    return toks, logits_per_layer, last_ix

def topk_tokens(vec, k=5):
    """Decode the `k` highest-scoring entries of `vec`.

    Returns two parallel lists, ``(tokens, values)``, ordered as
    ``torch.topk`` yields them; every index is decoded individually with the
    global `tokenizer`, and every score is converted to a Python float.
    """
    top = torch.topk(vec, k)
    decoded = [tokenizer.decode([int(token_ix)]) for token_ix in top.indices]
    scores = [float(value) for value in top.values]
    return decoded, scores


In [None]:
# Track, layer by layer, the probability assigned to one target token.
target_token = " football"
prompt = "The most famous sports in USA is"
toks, logits_per_layer, last_ix = run_logit_lens(prompt)
# Only the first id of the encoded target string is tracked.
target_id = tokenizer.encode(target_token)[0]
probs = []
for logits in logits_per_layer:
    # Softmax over the vocabulary axis at the last prompt position turns the
    # logits into a distribution; read off the target token's probability.
    p = torch.softmax(logits[0, last_ix], dim=-1)[target_id]
    probs.append(float(p))

# One point per entry of logits_per_layer.
plt.figure()
plt.plot(range(len(probs)), probs, marker='o')
plt.xlabel("Layer")
plt.ylabel(f"P({target_token.strip()})")
plt.title("Layer-wise probability")
plt.grid(True)
plt.show()
