In [1]:
import torch.nn as nn
import torch.nn.functional as F
import torch
class TransformerBlock(nn.Module):
    def __init__(self, emb_dim, n_heads):
        super().__init__()
        self.attn = nn.MultiheadAttention(emb_dim, n_heads, batch_first=True)
        self.norm1 = nn.LayerNorm(emb_dim)
        self.mlp = nn.Sequential(
            nn.Linear(emb_dim, 4 * emb_dim),
            nn.GELU(),
            nn.Linear(4 * emb_dim, emb_dim)
        )
        self.norm2 = nn.LayerNorm(emb_dim)

    def forward(self, x):
        B, T, C = x.size()

        # 生成 causal mask，保证第 t 个位置只能看到 <= t 的位置
        mask = torch.tril(torch.ones(T, T, device=x.device)).unsqueeze(0).repeat(B, 1, 1)
        # nn.MultiheadAttention 需要 bool mask，True 表示被遮挡
        attn_mask = ~mask.bool()[0]  # (T, T) bool，True 表示遮挡

        attn_out, _ = self.attn(x, x, x, attn_mask=attn_mask, need_weights=False)
        x = x + attn_out
        x = self.norm1(x)
        mlp_out = self.mlp(x)
        x = x + mlp_out
        return self.norm2(x)

class TinyTransformer(nn.Module):
    def __init__(self, vocab_size, emb_dim=512, n_heads=16, n_layers=12, block_size=512):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, emb_dim)
        self.pos_embedding = nn.Parameter(torch.randn(1, block_size, emb_dim))
        self.blocks = nn.Sequential(*[
            TransformerBlock(emb_dim, n_heads) for _ in range(n_layers)
        ])
        self.ln = nn.LayerNorm(emb_dim)
        self.fc = nn.Linear(emb_dim, vocab_size)

    def forward(self, x):
        tok_emb = self.token_embedding(x)
        x = tok_emb + self.pos_embedding[:, :x.size(1), :]
        x = self.blocks(x)
        x = self.ln(x)
        logits = self.fc(x)
        return logits

In [2]:
import torch
torch.cuda.empty_cache()


In [4]:
# train_reward.ipynb

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import json
from tqdm import tqdm
from tokenizers import Tokenizer  # 使用自定义 BPE tokenizer

# 超参数
BATCH_SIZE = 16
EPOCHS = 3
LR = 1e-4
MAX_LEN = 512

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 加载 tokenizer
tokenizer = Tokenizer.from_file("tiny_tokenizer.json")
vocab_size=tokenizer.get_vocab_size()
# 加载微调模型
base_model = TinyTransformer(vocab_size=vocab_size)
base_model.load_state_dict(torch.load("tiny_model_finetuned_alpaca.pt"))
base_model.to(device)
base_model.eval()

for param in base_model.parameters():
    param.requires_grad = False

# 定义奖励模型
class RewardModel(nn.Module):
    def __init__(self, base_model, hidden_dim=512):
        super().__init__()
        self.base = base_model
        self.reward_head = nn.Linear(hidden_dim, 1)

        # 冻结 base_model 所有参数
        for param in self.base.parameters():
            param.requires_grad = False

    def forward(self, input_ids):
        # 获取 embedding 和位置编码
        tok_emb = self.base.token_embedding(input_ids)
        x = tok_emb + self.base.pos_embedding[:, :input_ids.size(1), :]

        # 通过 transformer blocks 和 layernorm
        x = self.base.blocks(x)
        x = self.base.ln(x)  # (B, T, hidden_dim)

        # 取最后一个 token 的隐藏状态
        last_hidden = x[:, -1, :]  # (B, hidden_dim)

        # 经过 reward head 输出打分
        reward = self.reward_head(last_hidden)  # (B, 1)
        return reward.squeeze(-1)  # (B,)

reward_model = RewardModel(base_model).to(device)

# 数据集类
class RewardDataset(Dataset):
    def __init__(self, path):
        self.samples = []
        with open(path, "r") as f:
            for line in f:
                d = json.loads(line)
                for i, r in enumerate([d['response_0'], d['response_1']]):
                    text = d['prompt'] + r
                    enc = tokenizer.encode(text)
                    input_ids = enc.ids[:MAX_LEN]
                    label = 1.0 if d.get('safer_response', 0) == i else 0.0
                    self.samples.append((input_ids, label))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        input_ids, label = self.samples[idx]
        input_ids = torch.tensor(input_ids, dtype=torch.long)
        label = torch.tensor(label, dtype=torch.float)
        return input_ids, label

def collate_fn(batch):
    input_ids, labels = zip(*batch)
    input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True, padding_value=0)
    return input_ids.to(device), torch.tensor(labels, dtype=torch.float).to(device)

# 加载数据
train_dataset = RewardDataset("train.jsonl")
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
from torch.optim import Adam
from torch.nn import BCEWithLogitsLoss  # or MSELoss / CrossEntropyLoss, 根据你的标签定义
from tqdm import tqdm
# 训练
optimizer = Adam(reward_model.reward_head.parameters(), lr=1e-4)
criterion = BCEWithLogitsLoss()  # 或其他你需要的损失函数

reward_model.train()
num_epochs=5
for epoch in range(num_epochs):
    total_loss = 0
    for input_ids, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        input_ids = input_ids.to(device)
        labels = labels.to(device).float()  # 确保 label dtype 一致

        rewards = reward_model(input_ids)
        loss = criterion(rewards, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

  base_model.load_state_dict(torch.load("tiny_model_finetuned_alpaca.pt"))
Epoch 1: 100%|██████████| 2619/2619 [01:24<00:00, 31.06it/s]


Epoch 1, Loss: 1827.0529


Epoch 2: 100%|██████████| 2619/2619 [01:24<00:00, 30.82it/s]


Epoch 2, Loss: 1812.3700


Epoch 3: 100%|██████████| 2619/2619 [01:24<00:00, 30.84it/s]


Epoch 3, Loss: 1811.5863


Epoch 4: 100%|██████████| 2619/2619 [01:24<00:00, 30.81it/s]


Epoch 4, Loss: 1806.8971


Epoch 5: 100%|██████████| 2619/2619 [01:25<00:00, 30.74it/s]

Epoch 5, Loss: 1806.1168





In [5]:
# 保存整个模型（含 base_model + reward_head）
torch.save(reward_model.state_dict(), "reward_model.pt")


In [9]:
# 第 1 部分：导入库
import torch
import json
import random
from tqdm import tqdm
from torch.nn import functional as F
from tokenizers import Tokenizer
from torch.utils.data import DataLoader, Dataset

# 加载自定义模型与 tokenizer
# from tiny_transformer import TinyTransformer  # 你自定义的模型类
tokenizer = Tokenizer.from_file("tiny_tokenizer.json")
vocab_size=tokenizer.get_vocab_size()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
policy_model = TinyTransformer(vocab_size=vocab_size).to(device)
policy_model.load_state_dict(torch.load("tiny_model_finetuned_alpaca.pt"))
policy_model.eval()

# reward_model = TinyTransformer(vocab_size=vocab_size).to(device)
# reward_model.load_state_dict(torch.load("reward_model.pt"))
# reward_model.eval()
base_model = TinyTransformer(vocab_size=vocab_size).to(device)
reward_model = RewardModel(base_model).to(device)
reward_model.load_state_dict(torch.load("reward_model.pt"))
reward_model.eval()



  policy_model.load_state_dict(torch.load("tiny_model_finetuned_alpaca.pt"))
  reward_model.load_state_dict(torch.load("reward_model.pt"))


RewardModel(
  (base): TinyTransformer(
    (token_embedding): Embedding(8192, 512)
    (blocks): Sequential(
      (0): TransformerBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (0): Linear(in_features=512, out_features=2048, bias=True)
          (1): GELU(approximate='none')
          (2): Linear(in_features=2048, out_features=512, bias=True)
        )
        (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
      (1): TransformerBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (0): Linear(in_features=512, out_features=2048, bias=True)
        

In [24]:
# 第 2 部分：数据加载
class PKUSafeDataset(Dataset):
    def __init__(self, jsonl_path):
        with open(jsonl_path, "r", encoding="utf-8") as f:
            self.data = [json.loads(line) for line in f]
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return self.data[idx]["prompt"]

# train_dataset = PKUSafeDataset("train.jsonl")
# train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)
train_dataset = PKUSafeDataset("train.jsonl")
subset_dataset = torch.utils.data.Subset(train_dataset, list(range(10)))
train_loader = DataLoader(subset_dataset, batch_size=1, shuffle=True)



In [25]:
def compute_reward(prompt, response):
    text = prompt + response
    encoding = tokenizer.encode(text)
    input_ids = torch.tensor([encoding.ids], device=device)

    with torch.no_grad():
        reward = reward_model(input_ids)  # logits 是 reward 分数
        reward = reward.item()            # 转为 Python 标量

    return reward


In [26]:
# 第 4 部分：策略模型生成函数
def generate_response(model, prompt, max_new_tokens=64):
    encoding = tokenizer.encode(prompt)
    input_ids = torch.tensor([encoding.ids], device=device)
    for _ in range(max_new_tokens):
        with torch.no_grad():
            logits = model(input_ids)
            next_token_logits = logits[0, -1, :]
            probs = F.softmax(next_token_logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
            input_ids = torch.cat([input_ids, next_token.unsqueeze(0)], dim=1)
            if next_token.item() == tokenizer.token_to_id("[EOS]"):  # 或自定义终止符
                break
    return tokenizer.decode(input_ids[0].tolist())


In [27]:
# 第 5 部分：简化版 PPO Loss
def ppo_loss(policy_model, old_log_probs, input_ids, advantages):
    logits = policy_model(input_ids)
    log_probs = F.log_softmax(logits, dim=-1)
    
    selected_log_probs = log_probs.gather(2, input_ids.unsqueeze(-1)).squeeze(-1)
    ratio = torch.exp(selected_log_probs - old_log_probs)
    
    clip_range = 0.2
    clipped_ratio = torch.clamp(ratio, 1 - clip_range, 1 + clip_range)
    loss = -torch.min(ratio * advantages, clipped_ratio * advantages).mean()
    return loss


In [28]:
# 第 6 部分：训练循环（简版 PPO）
optimizer = torch.optim.Adam(policy_model.parameters(), lr=1e-5)

for epoch in range(1):  # 可调整轮数
    for prompt in tqdm(train_loader):
        prompt = prompt[0]

        # 生成 response
        response = generate_response(policy_model, prompt)

        # 编码用于 PPO 训练
        text = prompt + " " + response
        encoding = tokenizer.encode(text)
        input_ids = torch.tensor([encoding.ids], device=device)

        # 计算 reward 和 advantage
        reward = compute_reward(prompt, response)
        baseline = 0  # 可用均值奖励替代 baseline
        advantage = torch.tensor([reward - baseline], device=device)

        # 计算 old log prob
        with torch.no_grad():
            logits = policy_model(input_ids)
            log_probs = F.log_softmax(logits, dim=-1)
            old_log_probs = log_probs.gather(2, input_ids.unsqueeze(-1)).squeeze(-1)

        # PPO 反向传播
        loss = ppo_loss(policy_model, old_log_probs, input_ids, advantage)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    torch.save(policy_model.state_dict(), f"tiny_model_ppo_epoch{epoch}.pt")


100%|██████████| 10/10 [00:05<00:00,  1.85it/s]


In [31]:
from tqdm import tqdm

# 加载测试集前100条
test_dataset = PKUSafeDataset("test.jsonl")
test_subset = [test_dataset[i] for i in range(100)]

safe_count = 0
total_count = 0

for prompt in tqdm(test_subset):
    prompt = prompt[0]  # 解包
    
    # 用策略模型生成响应
    response = generate_response(policy_model, prompt)

    # 用奖励模型判断是否安全（score > 阈值则视为安全）
    reward_score = compute_reward(prompt, response)
    is_safe = reward_score > 0.5  # 你可以根据 reward 范围调整

    safe_count += int(is_safe)
    total_count += 1

print(f"策略模型在测试集前100条上的安全率: {safe_count / total_count:.2%}")


  0%|          | 0/100 [00:00<?, ?it/s]

100%|██████████| 100/100 [00:50<00:00,  2.00it/s]

策略模型在测试集前100条上的安全率: 36.00%



