In [1]:
import torch.nn as nn
import torch.nn.functional as F
import torch
class TransformerBlock(nn.Module):
    def __init__(self, emb_dim, n_heads):
        super().__init__()
        self.attn = nn.MultiheadAttention(emb_dim, n_heads, batch_first=True)
        self.norm1 = nn.LayerNorm(emb_dim)
        self.mlp = nn.Sequential(
            nn.Linear(emb_dim, 4 * emb_dim),
            nn.GELU(),
            nn.Linear(4 * emb_dim, emb_dim)
        )
        self.norm2 = nn.LayerNorm(emb_dim)

    def forward(self, x):
        B, T, C = x.size()

        # 生成 causal mask，保证第 t 个位置只能看到 <= t 的位置
        mask = torch.tril(torch.ones(T, T, device=x.device)).unsqueeze(0).repeat(B, 1, 1)
        # nn.MultiheadAttention 需要 bool mask，True 表示被遮挡
        attn_mask = ~mask.bool()[0]  # (T, T) bool，True 表示遮挡

        attn_out, _ = self.attn(x, x, x, attn_mask=attn_mask, need_weights=False)
        x = x + attn_out
        x = self.norm1(x)
        mlp_out = self.mlp(x)
        x = x + mlp_out
        return self.norm2(x)

class TinyTransformer(nn.Module):
    def __init__(self, vocab_size, emb_dim=512, n_heads=16, n_layers=12, block_size=1024):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, emb_dim)
        self.pos_embedding = nn.Parameter(torch.randn(1, block_size, emb_dim))
        self.blocks = nn.Sequential(*[
            TransformerBlock(emb_dim, n_heads) for _ in range(n_layers)
        ])
        self.ln = nn.LayerNorm(emb_dim)
        self.fc = nn.Linear(emb_dim, vocab_size)

    def forward(self, x):
        tok_emb = self.token_embedding(x)
        x = tok_emb + self.pos_embedding[:, :x.size(1), :]
        x = self.blocks(x)
        x = self.ln(x)
        logits = self.fc(x)
        return logits

In [2]:
import torch
torch.cuda.empty_cache()


In [3]:
# train_reward.ipynb

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import json
from tqdm import tqdm
from tokenizers import Tokenizer  # 使用自定义 BPE tokenizer
import matplotlib.pyplot as plt
import os
# 超参数
BATCH_SIZE = 16
EPOCHS = 3
LR = 1e-4
MAX_LEN = 512

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 加载 tokenizer
tokenizer = Tokenizer.from_file("./data/tiny_tokenizer.json")
vocab_size=tokenizer.get_vocab_size()
# 加载微调模型
base_model = TinyTransformer(vocab_size=vocab_size)
base_model.load_state_dict(torch.load("tiny_model_finetuned_alpaca.pt"))
base_model.to(device)
base_model.eval()

for param in base_model.parameters():
    param.requires_grad = False

# 定义奖励模型
class RewardModel(nn.Module):
    def __init__(self, base_model, hidden_dim=512):
        super().__init__()
        self.base = base_model
        self.reward_head = nn.Linear(hidden_dim, 1)

        # 冻结 base_model 所有参数
        for param in self.base.parameters():
            param.requires_grad = False

    def forward(self, input_ids):
        # 获取 embedding 和位置编码
        tok_emb = self.base.token_embedding(input_ids)
        x = tok_emb + self.base.pos_embedding[:, :input_ids.size(1), :]

        # 通过 transformer blocks 和 layernorm
        x = self.base.blocks(x)
        x = self.base.ln(x)  # (B, T, hidden_dim)

        # 取最后一个 token 的隐藏状态
        last_hidden = x[:, -1, :]  # (B, hidden_dim)

        # 经过 reward head 输出打分
        reward = self.reward_head(last_hidden)  # (B, 1)
        return reward.squeeze(-1)  # (B,)

reward_model = RewardModel(base_model).to(device)

# 数据集类
class RewardDataset(Dataset):
    def __init__(self, path):
        self.samples = []
        with open(path, "r") as f:
            for line in f:
                d = json.loads(line)
                for i, r in enumerate([d['response_0'], d['response_1']]):
                    text = d['prompt'] + r
                    enc = tokenizer.encode(text)
                    input_ids = enc.ids[:MAX_LEN]
                    label = 1.0 if d.get('safer_response', 0) == i else 0.0
                    self.samples.append((input_ids, label))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        input_ids, label = self.samples[idx]
        input_ids = torch.tensor(input_ids, dtype=torch.long)
        label = torch.tensor(label, dtype=torch.float)
        return input_ids, label

def collate_fn(batch):
    input_ids, labels = zip(*batch)
    input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True, padding_value=0)
    return input_ids.to(device), torch.tensor(labels, dtype=torch.float).to(device)

# 加载数据
train_dataset = RewardDataset("./data/train.jsonl")
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
from torch.optim import Adam
from torch.nn import BCEWithLogitsLoss  # or MSELoss / CrossEntropyLoss, 根据你的标签定义
from tqdm import tqdm
# 训练
optimizer = Adam(reward_model.reward_head.parameters(), lr=1e-4)
criterion = BCEWithLogitsLoss()  # 或其他你需要的损失函数
# 保存训练过程中的损失和奖励
loss_list = []
reward_mean_list = []

reward_model.train()
num_epochs = 20

for epoch in range(num_epochs):
    total_loss = 0
    total_reward = 0
    count = 0

    for input_ids, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        input_ids = input_ids.to(device)
        labels = labels.to(device).float()

        rewards = reward_model(input_ids)
        loss = criterion(rewards, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        total_reward += rewards.detach().mean().item()
        count += 1

    avg_loss = total_loss / count
    avg_reward = total_reward / count
    loss_list.append(avg_loss)
    reward_mean_list.append(avg_reward)

    print(f"Epoch {epoch+1}, Loss: {avg_loss:.4f}, Avg Reward: {avg_reward:.4f}")

#绘图并保存
os.makedirs("rlhf_pic", exist_ok=True)

plt.figure(figsize=(10, 5))
plt.plot(loss_list, label="Loss")
plt.plot(reward_mean_list, label="Avg Reward")
plt.xlabel("Epoch")
plt.ylabel("Value")
plt.title("Reward Model Training")
plt.legend()
plt.grid(True)
plt.savefig("rlhf_pic/loss_reward.png")
plt.close()

Epoch 1: 100%|██████████| 2619/2619 [01:04<00:00, 40.40it/s]


Epoch 1, Loss: 0.6967, Avg Reward: -0.0031


Epoch 2: 100%|██████████| 2619/2619 [01:04<00:00, 40.38it/s]


Epoch 2, Loss: 0.6927, Avg Reward: -0.0028


Epoch 3: 100%|██████████| 2619/2619 [01:04<00:00, 40.43it/s]


Epoch 3, Loss: 0.6928, Avg Reward: -0.0009


Epoch 4: 100%|██████████| 2619/2619 [01:04<00:00, 40.66it/s]


Epoch 4, Loss: 0.6902, Avg Reward: -0.0033


Epoch 5: 100%|██████████| 2619/2619 [01:04<00:00, 40.56it/s]


Epoch 5, Loss: 0.6906, Avg Reward: 0.0015


Epoch 6: 100%|██████████| 2619/2619 [01:04<00:00, 40.57it/s]


Epoch 6, Loss: 0.6896, Avg Reward: 0.0003


Epoch 7: 100%|██████████| 2619/2619 [01:04<00:00, 40.71it/s]


Epoch 7, Loss: 0.6891, Avg Reward: 0.0026


Epoch 8: 100%|██████████| 2619/2619 [01:04<00:00, 40.51it/s]


Epoch 8, Loss: 0.6883, Avg Reward: -0.0001


Epoch 9: 100%|██████████| 2619/2619 [01:04<00:00, 40.42it/s]


Epoch 9, Loss: 0.6887, Avg Reward: 0.0020


Epoch 10: 100%|██████████| 2619/2619 [01:04<00:00, 40.47it/s]


Epoch 10, Loss: 0.6890, Avg Reward: 0.0013


Epoch 11: 100%|██████████| 2619/2619 [01:04<00:00, 40.68it/s]


Epoch 11, Loss: 0.6884, Avg Reward: 0.0024


Epoch 12: 100%|██████████| 2619/2619 [01:04<00:00, 40.60it/s]


Epoch 12, Loss: 0.6887, Avg Reward: 0.0018


Epoch 13: 100%|██████████| 2619/2619 [01:04<00:00, 40.59it/s]


Epoch 13, Loss: 0.6880, Avg Reward: 0.0010


Epoch 14: 100%|██████████| 2619/2619 [01:04<00:00, 40.54it/s]


Epoch 14, Loss: 0.6883, Avg Reward: 0.0010


Epoch 15: 100%|██████████| 2619/2619 [01:04<00:00, 40.57it/s]


Epoch 15, Loss: 0.6881, Avg Reward: 0.0021


Epoch 16: 100%|██████████| 2619/2619 [01:04<00:00, 40.34it/s]


Epoch 16, Loss: 0.6878, Avg Reward: 0.0018


Epoch 17: 100%|██████████| 2619/2619 [01:04<00:00, 40.56it/s]


Epoch 17, Loss: 0.6880, Avg Reward: 0.0014


Epoch 18: 100%|██████████| 2619/2619 [01:04<00:00, 40.44it/s]


Epoch 18, Loss: 0.6884, Avg Reward: 0.0018


Epoch 19: 100%|██████████| 2619/2619 [01:04<00:00, 40.33it/s]


Epoch 19, Loss: 0.6880, Avg Reward: 0.0027


Epoch 20: 100%|██████████| 2619/2619 [01:04<00:00, 40.50it/s]


Epoch 20, Loss: 0.6875, Avg Reward: 0.0038


In [4]:
# 保存整个模型（含 base_model + reward_head）
torch.save(reward_model.state_dict(), "reward_model.pt")


In [5]:
# 第 1 部分：导入库
import torch
import json
import random
from tqdm import tqdm
from torch.nn import functional as F
from tokenizers import Tokenizer
from torch.utils.data import DataLoader, Dataset

# 加载自定义模型与 tokenizer
# from tiny_transformer import TinyTransformer  # 你自定义的模型类
tokenizer = Tokenizer.from_file("./data/tiny_tokenizer.json")
vocab_size=tokenizer.get_vocab_size()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
policy_model = TinyTransformer(vocab_size=vocab_size).to(device)
policy_model.load_state_dict(torch.load("tiny_model_finetuned_alpaca.pt"))
policy_model.eval()

# reward_model = TinyTransformer(vocab_size=vocab_size).to(device)
# reward_model.load_state_dict(torch.load("reward_model.pt"))
# reward_model.eval()
base_model = TinyTransformer(vocab_size=vocab_size).to(device)
reward_model = RewardModel(base_model).to(device)
reward_model.load_state_dict(torch.load("reward_model.pt"))
reward_model.eval()



RewardModel(
  (base): TinyTransformer(
    (token_embedding): Embedding(8192, 512)
    (blocks): Sequential(
      (0): TransformerBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (0): Linear(in_features=512, out_features=2048, bias=True)
          (1): GELU(approximate='none')
          (2): Linear(in_features=2048, out_features=512, bias=True)
        )
        (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
      (1): TransformerBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (0): Linear(in_features=512, out_features=2048, bias=True)
        

In [6]:
# 第 2 部分：数据加载
class PKUSafeDataset(Dataset):
    def __init__(self, jsonl_path):
        with open(jsonl_path, "r", encoding="utf-8") as f:
            self.data = [json.loads(line) for line in f]
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return self.data[idx]["prompt"]

# train_dataset = PKUSafeDataset("train.jsonl")
# train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)
train_dataset = PKUSafeDataset("./data/train.jsonl")
subset_dataset = torch.utils.data.Subset(train_dataset, list(range(10)))
train_loader = DataLoader(subset_dataset, batch_size=1, shuffle=True)



In [7]:
def compute_reward(prompt, response,reward_model):
    text = prompt + response
    encoding = tokenizer.encode(text)
    input_ids = torch.tensor([encoding.ids], device=device)

    with torch.no_grad():
        reward = reward_model(input_ids)  # logits 是 reward 分数
        reward = reward.item()            # 转为 Python 标量

    return reward


In [8]:
# 第 4 部分：策略模型生成函数
def generate_response(model, prompt, max_new_tokens=64):
    encoding = tokenizer.encode(prompt)
    input_ids = torch.tensor([encoding.ids], device=device)
    for _ in range(max_new_tokens):
        with torch.no_grad():
            logits = model(input_ids)
            next_token_logits = logits[0, -1, :]
            probs = F.softmax(next_token_logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
            input_ids = torch.cat([input_ids, next_token.unsqueeze(0)], dim=1)
            if next_token.item() == tokenizer.token_to_id("[EOS]"):  # 或自定义终止符
                break
    return tokenizer.decode(input_ids[0].tolist())


In [9]:
# 第 5 部分：简化版 PPO Loss
# def ppo_loss(policy_model, old_log_probs, input_ids, advantages):
#     logits = policy_model(input_ids)
#     log_probs = F.log_softmax(logits, dim=-1)
    
#     selected_log_probs = log_probs.gather(2, input_ids.unsqueeze(-1)).squeeze(-1)
#     ratio = torch.exp(selected_log_probs - old_log_probs)
    
#     clip_range = 0.2
#     clipped_ratio = torch.clamp(ratio, 1 - clip_range, 1 + clip_range)
#     loss = -torch.min(ratio * advantages, clipped_ratio * advantages).mean()
#     return loss

def ppo_loss(policy_model, old_log_probs, input_ids, advantage, clip_eps=0.2):
    logits = policy_model(input_ids)  # (B, T, vocab)
    log_probs = F.log_softmax(logits, dim=-1)
    log_probs_action = log_probs.gather(2, input_ids.unsqueeze(-1)).squeeze(-1)  # (B, T)

    # 仅计算有效 token 区域（非 PAD）
    attention_mask = (input_ids != 0).float()
    valid_len = attention_mask.sum(dim=1)  # 每个样本的 token 数量

    # 求平均 log_probs（策略分布）
    old_log_probs_mean = (old_log_probs * attention_mask).sum(dim=1) / valid_len
    log_probs_mean = (log_probs_action * attention_mask).sum(dim=1) / valid_len

    # 计算比值 r_t
    ratio = torch.exp(log_probs_mean - old_log_probs_mean)

    # PPO clipped loss
    unclipped = ratio * advantage
    clipped = torch.clamp(ratio, 1 - clip_eps, 1 + clip_eps) * advantage
    loss = -torch.mean(torch.min(unclipped, clipped))  # PPO 是最大化目标，所以取负

    return loss



In [10]:
from tqdm import tqdm

def evaluate_safety_rate(policy_model, reward_model, test_path="./data/test.jsonl", sample_size=100, threshold=0.5):
    """
    评估策略模型在测试集上的安全率。

    参数:
        policy_model: 用于生成响应的策略模型
        reward_model: 用于评估响应安全性的奖励模型
        test_path: 测试集路径（默认 "test.jsonl"）
        sample_size: 使用前多少条样本进行评估（默认 100）
        threshold: 判断安全的奖励分数阈值（默认 0.5）

    返回:
        安全率（float，范围0~1）
    """
    # 加载测试数据
    test_dataset = PKUSafeDataset(test_path)
    test_subset = [test_dataset[i] for i in range(min(sample_size, len(test_dataset)))]

    safe_count = 0
    total_count = 0

    for prompt in tqdm(test_subset, desc="Evaluating"):
        prompt = prompt[0]  # 解包

        # 策略模型生成回答
        response = generate_response(policy_model, prompt)

        # 奖励模型判断是否安全
        reward_score = compute_reward(prompt, response, reward_model=reward_model)
        is_safe = reward_score > threshold

        safe_count += int(is_safe)
        total_count += 1

    safety_rate = safe_count / total_count if total_count > 0 else 0.0
    print(f"策略模型在测试集前{total_count}条上的安全率: {safety_rate:.2%}")
    return safety_rate


In [11]:
# # 第 6 部分：训练循环（简版 PPO）
# optimizer = torch.optim.Adam(policy_model.parameters(), lr=1e-5)

# for epoch in range(1):  # 可调整轮数
#     for prompt in tqdm(train_loader):
#         prompt = prompt[0]

#         # 生成 response
#         response = generate_response(policy_model, prompt)

#         # 编码用于 PPO 训练
#         text = prompt + " " + response
#         encoding = tokenizer.encode(text)
#         input_ids = torch.tensor([encoding.ids], device=device)

#         # 计算 reward 和 advantage
#         reward = compute_reward(prompt, response)
#         baseline = 0  # 可用均值奖励替代 baseline
#         advantage = torch.tensor([reward - baseline], device=device)

#         # 计算 old log prob
#         with torch.no_grad():
#             logits = policy_model(input_ids)
#             log_probs = F.log_softmax(logits, dim=-1)
#             old_log_probs = log_probs.gather(2, input_ids.unsqueeze(-1)).squeeze(-1)

#         # PPO 反向传播
#         loss = ppo_loss(policy_model, old_log_probs, input_ids, advantage)
#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()

#     torch.save(policy_model.state_dict(), f"tiny_model_ppo_epoch{epoch}.pt")
#     evaluate_safety_rate(policy_model, reward_model, sample_size=100, threshold=0.5)

import torch
import torch.nn as nn
import torch.nn.functional as F

# --- 1. LoRA线性层 ---
class LoRALinear(nn.Module):
    def __init__(self, orig_linear: nn.Linear, r=8, lora_alpha=32):
        super().__init__()
        self.in_features = orig_linear.in_features
        self.out_features = orig_linear.out_features
        self.bias = orig_linear.bias is not None

        # 原权重直接引用（保证设备一致）
        self.weight = orig_linear.weight
        self.bias = orig_linear.bias

        self.r = r
        self.lora_alpha = lora_alpha
        self.scaling = self.lora_alpha / self.r

        # LoRA低秩矩阵，初始为CPU，后面to(device)
        self.lora_A = nn.Parameter(torch.randn(r, self.in_features) * 0.01)
        self.lora_B = nn.Parameter(torch.randn(self.out_features, r) * 0.01)

        # 冻结原权重和偏置
        self.weight.requires_grad = False
        if self.bias is not None:
            self.bias.requires_grad = False

    def forward(self, x):
        result = F.linear(x, self.weight, self.bias)
        lora_update = (x @ self.lora_A.t()) @ self.lora_B.t() * self.scaling
        return result + lora_update

    def to(self, *args, **kwargs):
        # 重载to方法，保证LoRA参数也能移动设备
        self.lora_A = nn.Parameter(self.lora_A.to(*args, **kwargs))
        self.lora_B = nn.Parameter(self.lora_B.to(*args, **kwargs))
        self.weight = self.weight.to(*args, **kwargs)
        if self.bias is not None:
            self.bias = self.bias.to(*args, **kwargs)
        return super().to(*args, **kwargs)


# --- 2. 替换函数 ---
def replace_linear_with_lora(model, target_modules, r=8, lora_alpha=32):
    """
    替换model中指定模块名的 nn.Linear 为 LoRALinear
    target_modules: list of str，模块全名如 blocks.0.attn.out_proj
    """
    for name, module in model.named_modules():
        if name in target_modules:
            parent = model
            name_parts = name.split('.')
            for part in name_parts[:-1]:
                parent = getattr(parent, part)
            orig_linear = getattr(parent, name_parts[-1])
            if isinstance(orig_linear, nn.Linear):
                lora_linear = LoRALinear(orig_linear, r=r, lora_alpha=lora_alpha)
                setattr(parent, name_parts[-1], lora_linear)
                print(f"[LoRA] Replaced {name} with LoRALinear.")
            else:
                print(f"[Warning] Module {name} is not nn.Linear, skipped.")
    return model


# --- 3. 示例调用 ---


# 假设你的policy_model已经加载完毕
# 你给的层名示例，按实际模型层数调整
target_modules = [f"blocks.{i}.attn.out_proj" for i in range(12)] + [f"blocks.{i}.mlp.0" for i in range(12)]

policy_model = replace_linear_with_lora(policy_model, target_modules, r=8, lora_alpha=32)
policy_model = policy_model.to(device)

# --- 4. 设置优化器，冻结原模型参数，只训练LoRA参数 ---
for name, param in policy_model.named_parameters():
    if "lora_" in name:
        param.requires_grad = True
    else:
        param.requires_grad = False

optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, policy_model.parameters()), lr=1e-5)

# --- 5. 训练循环示例（简化） ---
policy_model.train()
for epoch in range(20):
    for prompt in train_loader:
        prompt = prompt[0]

        response = generate_response(policy_model, prompt)

        text = prompt + " " + response
        encoding = tokenizer.encode(text)
        input_ids = torch.tensor([encoding.ids], device=device)

        reward = compute_reward(prompt, response,reward_model=reward_model)
        baseline = 0
        advantage = torch.tensor([reward - baseline], device=device)

        with torch.no_grad():
            logits = policy_model(input_ids)
            log_probs = F.log_softmax(logits, dim=-1)
            old_log_probs = log_probs.gather(2, input_ids.unsqueeze(-1)).squeeze(-1)

        loss = ppo_loss(policy_model, old_log_probs, input_ids, advantage)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    torch.save(policy_model.state_dict(), f"tiny_model_ppo_epoch{epoch}.pt")
    evaluate_safety_rate(policy_model, reward_model, sample_size=100, threshold=0.5)


[LoRA] Replaced blocks.0.attn.out_proj with LoRALinear.
[LoRA] Replaced blocks.0.mlp.0 with LoRALinear.
[LoRA] Replaced blocks.1.attn.out_proj with LoRALinear.
[LoRA] Replaced blocks.1.mlp.0 with LoRALinear.
[LoRA] Replaced blocks.2.attn.out_proj with LoRALinear.
[LoRA] Replaced blocks.2.mlp.0 with LoRALinear.
[LoRA] Replaced blocks.3.attn.out_proj with LoRALinear.
[LoRA] Replaced blocks.3.mlp.0 with LoRALinear.
[LoRA] Replaced blocks.4.attn.out_proj with LoRALinear.
[LoRA] Replaced blocks.4.mlp.0 with LoRALinear.
[LoRA] Replaced blocks.5.attn.out_proj with LoRALinear.
[LoRA] Replaced blocks.5.mlp.0 with LoRALinear.
[LoRA] Replaced blocks.6.attn.out_proj with LoRALinear.
[LoRA] Replaced blocks.6.mlp.0 with LoRALinear.
[LoRA] Replaced blocks.7.attn.out_proj with LoRALinear.
[LoRA] Replaced blocks.7.mlp.0 with LoRALinear.
[LoRA] Replaced blocks.8.attn.out_proj with LoRALinear.
[LoRA] Replaced blocks.8.mlp.0 with LoRALinear.
[LoRA] Replaced blocks.9.attn.out_proj with LoRALinear.
[LoRA] R

  return torch._native_multi_head_attention(
Evaluating: 100%|██████████| 100/100 [00:39<00:00,  2.52it/s]


策略模型在测试集前100条上的安全率: 48.00%


Evaluating: 100%|██████████| 100/100 [00:39<00:00,  2.52it/s]


策略模型在测试集前100条上的安全率: 35.00%


Evaluating: 100%|██████████| 100/100 [00:40<00:00,  2.50it/s]


策略模型在测试集前100条上的安全率: 52.00%


Evaluating: 100%|██████████| 100/100 [00:39<00:00,  2.50it/s]


策略模型在测试集前100条上的安全率: 43.00%


Evaluating: 100%|██████████| 100/100 [00:39<00:00,  2.51it/s]


策略模型在测试集前100条上的安全率: 40.00%


Evaluating: 100%|██████████| 100/100 [00:39<00:00,  2.52it/s]


策略模型在测试集前100条上的安全率: 51.00%


Evaluating: 100%|██████████| 100/100 [00:39<00:00,  2.52it/s]


策略模型在测试集前100条上的安全率: 48.00%


Evaluating: 100%|██████████| 100/100 [00:39<00:00,  2.52it/s]


策略模型在测试集前100条上的安全率: 40.00%


Evaluating: 100%|██████████| 100/100 [00:39<00:00,  2.52it/s]


策略模型在测试集前100条上的安全率: 45.00%


Evaluating: 100%|██████████| 100/100 [00:39<00:00,  2.52it/s]


策略模型在测试集前100条上的安全率: 44.00%


Evaluating: 100%|██████████| 100/100 [00:39<00:00,  2.51it/s]


策略模型在测试集前100条上的安全率: 43.00%


Evaluating: 100%|██████████| 100/100 [00:39<00:00,  2.52it/s]


策略模型在测试集前100条上的安全率: 39.00%


Evaluating: 100%|██████████| 100/100 [00:39<00:00,  2.52it/s]


策略模型在测试集前100条上的安全率: 39.00%


Evaluating: 100%|██████████| 100/100 [00:39<00:00,  2.52it/s]


策略模型在测试集前100条上的安全率: 45.00%


Evaluating: 100%|██████████| 100/100 [00:39<00:00,  2.52it/s]


策略模型在测试集前100条上的安全率: 45.00%


Evaluating: 100%|██████████| 100/100 [00:39<00:00,  2.52it/s]


策略模型在测试集前100条上的安全率: 45.00%


Evaluating: 100%|██████████| 100/100 [00:39<00:00,  2.51it/s]


策略模型在测试集前100条上的安全率: 48.00%


Evaluating: 100%|██████████| 100/100 [00:39<00:00,  2.52it/s]


策略模型在测试集前100条上的安全率: 54.00%


Evaluating: 100%|██████████| 100/100 [00:39<00:00,  2.52it/s]


策略模型在测试集前100条上的安全率: 50.00%


Evaluating: 100%|██████████| 100/100 [00:39<00:00,  2.51it/s]

策略模型在测试集前100条上的安全率: 42.00%



