In [2]:
#section 0
# 查看 GPU
!nvidia-smi

# 安装依赖 - 修正版
!pip -q install "transformers>=4.41" accelerate datasets scikit-learn openpyxl

# ✅ 修正：正确安装 RDKit
!pip -q install rdkit

# ✅ 修正：正确安装 Chemprop (注意版本)
!pip -q install chemprop

# 安装 PyTorch (如果需要)
!pip -q install torch torchvision torchaudio

!pip install -q rdkit transformers accelerate datasets scikit-learn openpyxl torch bitsandbytes peft trl sentencepiece protobuf

print("✅ 所有依赖安装完成")

Thu Oct 23 08:45:30 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-80GB          Off |   00000000:00:05.0 Off |                    0 |
| N/A   30C    P0             50W /  400W |       5MiB /  81920MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [1]:
#section 1
from huggingface_hub import login
login()  # 按提示粘贴你的HF Token（Settings -> Access Tokens）

# 可选：挂载 Drive 便于读取数据/保存结果
from google.colab import drive
drive.mount('/content/drive')

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
#section 1 (在挂载 Drive 之前或之后)
from google.colab import files
import shutil

# 上传文件
print("📤 请选择您的数据文件...")
uploaded = files.upload()

# 获取上传的文件名
filename = list(uploaded.keys())[0]
print(f"✅ 文件已上传: {filename}")

# 移动到工作目录（可选）
if filename != "smiles-data.xlsx":
    shutil.move(filename, "smiles-data.xlsx")
    print("✅ 文件已重命名为: smiles-data.xlsx")

# 验证文件
import pandas as pd
df = pd.read_excel("smiles-data.xlsx")
print(f"\n📊 数据预览:")
print(f"   行数: {len(df)}")
print(f"   列名: {df.columns.tolist()}")
print(f"\n前3行:")
print(df.head(3))

📤 请选择您的数据文件...


Saving smiles-data.xlsx to smiles-data (1).xlsx
✅ 文件已上传: smiles-data (1).xlsx
✅ 文件已重命名为: smiles-data.xlsx

📊 数据预览:
   行数: 1200
   列名: ['number', 1, 2, 3, 'Structure', 'Score']

前3行:
   number   1   2   3                                          Structure  \
0       1  A1  C1  B1  CCCCCCCCCCCCNC(=O)C(CCCCCOC(=O)CCCCCCCCCC)NCCN...   
1       2  A1  C1  B2  CCCCCCCCCCCCNC(=O)C(CCCCCOC(=O)CCC(C)CCCCC)NCC...   
2       3  A1  C1  B3  CCCCCCCCCCCCNC(=O)C(CCCCCOC(=O)CCCCCCCCC)NCCN(C)C   

      Score  
0  4.056689  
1  3.381791  
2  2.374907  


In [5]:
# 强制删除旧文件，重新创建
!rm -f /content/train_primary_then_agents_verify_test.py

# 然后重新运行 Section 2

In [3]:
!pip install -U bitsandbytes



In [10]:
%%writefile train_deepseek_simple.py
# -*- coding: utf-8 -*-
"""
DeepSeek + MolFormer - 终极简化版
"""
import os, json, argparse, warnings
import numpy as np
import pandas as pd
import torch
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
warnings.filterwarnings("ignore")

from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification, AutoModelForCausalLM,
    Trainer, TrainingArguments, DataCollatorWithPadding, BitsAndBytesConfig
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType
import re

# ========== 辅助函数 ==========
def seed_everything(seed=42):
    import random
    random.seed(seed); np.random.seed(seed); torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed); os.environ["PYTHONHASHSEED"]=str(seed)
    torch.backends.cudnn.deterministic=True; torch.backends.cudnn.benchmark=False

def ensure_label_1_8(x): return max(1, min(8, int(round(float(x)))))

def within_k(y_true, y_pred, k=1):
    return float(np.mean(np.abs(np.array(y_true)-np.array(y_pred)) <= k))

def quadratic_weighted_kappa(y_true, y_pred):
    yt = np.array(y_true); yp = np.array(y_pred); m = 8
    O = np.zeros((m,m))
    for a,b in zip(yt,yp): O[a-1, b-1]+=1
    W = np.array([[(i-j)**2/49 for j in range(m)] for i in range(m)])
    act, pred = O.sum(1), O.sum(0)
    E = np.outer(act, pred) / act.sum()
    return 1.0 - (np.sum(W*O)/np.sum(W*E) if np.sum(W*E)>0 else 1.0)

def evaluate_metrics(y_true, y_pred):
    return {"MAE": float(mean_absolute_error(y_true, y_pred)),
            "within±1": within_k(y_true, y_pred, 1),
            "QWK": quadratic_weighted_kappa(y_true, y_pred)}

# Scaffold split
try:
    from rdkit import Chem
    from rdkit.Chem.Scaffolds import MurckoScaffold
    def get_scaffold(s):
        try: return Chem.MolToSmiles(MurckoScaffold.GetScaffoldForMol(Chem.MolFromSmiles(s)))
        except: return None
except:
    def get_scaffold(s): return hash(s) % 10000

def scaffold_split(df, smiles_col, test_size=120, val_frac=0.2, seed=42):
    df = df.copy(); df["scaffold"] = df[smiles_col].apply(get_scaffold)
    df = df[df["scaffold"].notna()].reset_index(drop=True)
    groups = df.groupby("scaffold").size().sort_values(ascending=False)

    # ✅ 修复：更智能地选择 scaffold
    test_scaffolds, cnt = [], 0
    for scaf, c in groups.items():
        # 如果加上这个 scaffold 不会超出太多（容忍 20% 误差）
        if cnt + c <= test_size * 1.2:
            test_scaffolds.append(scaf)
            cnt += c
            if cnt >= test_size:  # ✅ 达到目标就停止
                break
        # 如果当前不足，且这个 scaffold 能填补一半以上的缺口
        elif cnt < test_size and test_size - cnt > c * 0.5:
            test_scaffolds.append(scaf)
            cnt += c
            break

    test_df = df[df["scaffold"].isin(test_scaffolds)]
    remain_df = df[~df["scaffold"].isin(test_scaffolds)]
    train_df, val_df = train_test_split(remain_df, test_size=val_frac, random_state=seed)
    print(f"Split: Train={len(train_df)} Val={len(val_df)} Test={len(test_df)} (target={test_size})")
    return train_df.reset_index(drop=True), val_df.reset_index(drop=True), test_df.reset_index(drop=True)

# ========== 🔥 使用原生 Transformers Trainer (不用 trl) ==========
def train_llm_classifier(model_name, train_df, val_df, smiles_col, label_col, out_dir,
                         epochs=5, lr=2e-4, bs=2, seed=42):
    os.makedirs(out_dir, exist_ok=True)
    seed_everything(seed)

    print(f"\n🔧 Loading DeepSeek: {model_name}")

    bnb_config = BitsAndBytesConfig(
        load_in_8bit=True, bnb_8bit_compute_dtype=torch.float16,
        bnb_8bit_use_double_quant=True,
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    if not tokenizer.pad_token: tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        model_name, quantization_config=bnb_config, device_map="auto",
        trust_remote_code=True, torch_dtype=torch.float16,
    )

    model = prepare_model_for_kbit_training(model)
    lora_config = LoraConfig(
        r=16, lora_alpha=32, target_modules=["q_proj","k_proj","v_proj","o_proj"],
        lora_dropout=0.05, bias="none", task_type=TaskType.CAUSAL_LM
    )
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

    # 准备数据
    def format_sample(row):
        s = str(row[smiles_col]); score = ensure_label_1_8(row[label_col])
        text = f"Predict score (1-8) for molecule.\nSMILES: {s}\nScore: {score}"
        return tokenizer(text, truncation=True, max_length=256, padding=False)

    train_data = [format_sample(row) for _, row in train_df.iterrows()]
    val_data = [format_sample(row) for _, row in val_df.iterrows()]

    # 🔥 使用原生 Trainer (避免 trl 兼容性问题)
    from torch.utils.data import Dataset
    class SimpleDataset(Dataset):
        def __init__(self, data): self.data = data
        def __len__(self): return len(self.data)
        def __getitem__(self, i):
            item = {k: torch.tensor(v) for k,v in self.data[i].items()}
            item["labels"] = item["input_ids"].clone()
            return item

    train_ds = SimpleDataset(train_data)
    val_ds = SimpleDataset(val_data)

    def collate_fn(batch):
        return {k: torch.nn.utils.rnn.pad_sequence(
            [b[k] for b in batch], batch_first=True, padding_value=tokenizer.pad_token_id
        ) for k in batch[0].keys()}

    args = TrainingArguments(
        output_dir=out_dir, num_train_epochs=epochs,
        per_device_train_batch_size=bs, per_device_eval_batch_size=bs,
        gradient_accumulation_steps=8, learning_rate=lr, fp16=True,
        logging_steps=20, eval_strategy="epoch", save_strategy="epoch",
        load_best_model_at_end=True, warmup_ratio=0.1, report_to="none",
        save_total_limit=2, seed=seed,
    )

    from transformers import default_data_collator

    trainer = Trainer(
        model=model, args=args, train_dataset=train_ds, eval_dataset=val_ds,
        data_collator=collate_fn, tokenizer=tokenizer,
    )

    print("🚀 Training DeepSeek (15-20h)...")
    trainer.train()
    trainer.save_model(out_dir); tokenizer.save_pretrained(out_dir)
    return model, tokenizer

@torch.no_grad()
def predict_llm(model, tokenizer, df, smiles_col, label_col):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.eval(); predictions = []

    for idx, row in df.iterrows():
        prompt = f"Predict score (1-8) for molecule.\nSMILES: {row[smiles_col]}\nScore:"
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=256)
        inputs = {k:v.to(device) for k,v in inputs.items()}

        try:
            outputs = model.generate(**inputs, max_new_tokens=3, temperature=0.1, do_sample=False)
            response = tokenizer.decode(outputs[0], skip_special_tokens=True)
            match = re.search(r'Score:\s*(\d)', response)
            score = int(match.group(1)) if match else 4
            score = max(1, min(8, score))
        except: score = 4

        predictions.append(score)
        if (idx+1)%30==0: print(f"   {idx+1}/{len(df)}")

    return np.array(predictions), None

# ========== MolFormer ==========
class SmilesDataset(torch.utils.data.Dataset):
    def __init__(self, df, tok, text_col, label_col):
        self.df=df; self.tok=tok; self.text_col=text_col
        self.labels = [ensure_label_1_8(x)-1 for x in df[label_col]]
    def __len__(self): return len(self.df)
    def __getitem__(self, i):
        enc = self.tok(str(self.df.iloc[i][self.text_col]), truncation=True, max_length=256)
        return {k:torch.tensor(v) for k,v in enc.items()} | {"labels": torch.tensor(self.labels[i])}

# 修复后的 train_molformer 函数
def train_molformer(model_name, train_df, val_df, smiles_col, label_col, out_dir,
                    epochs=12, lr=1e-5, bs=8, seed=42):
    os.makedirs(out_dir, exist_ok=True)
    seed_everything(seed)
    print(f"\n🔧 Training {model_name}")

    tok = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=8, trust_remote_code=True
    )

    train_ds = SmilesDataset(train_df, tok, smiles_col, label_col)
    val_ds = SmilesDataset(val_df, tok, smiles_col, label_col)

    # ✅ 使用 default_data_collator 而不是 DataCollatorWithPadding
    from transformers import default_data_collator

    args = TrainingArguments(
        output_dir=out_dir,
        seed=seed,
        learning_rate=lr,
        num_train_epochs=epochs,
        per_device_train_batch_size=bs,
        per_device_eval_batch_size=bs,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        report_to="none",
        save_total_limit=2,
        logging_steps=50,
        warmup_ratio=0.1,
        weight_decay=0.1,
        dataloader_num_workers=0,  # ✅ 避免多进程问题
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        data_collator=default_data_collator,  # ✅ 修复关键
        tokenizer=tok,
    )

    print(f"   Training for {epochs} epochs...")
    trainer.train()
    trainer.save_model(out_dir)
    tok.save_pretrained(out_dir)
    print(f"   ✅ Saved to {out_dir}")
    return model, tok

@torch.no_grad()
def predict_molformer(model, tok, df, smiles_col, label_col):
    ds = SmilesDataset(df, tok, smiles_col, label_col)
    loader = torch.utils.data.DataLoader(ds, batch_size=64, collate_fn=DataCollatorWithPadding(tok))
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device).eval(); preds = []
    for batch in loader:
        _ = batch.pop("labels")
        logits = model(**{k:v.to(device) for k,v in batch.items()}).logits
        preds.extend((logits.argmax(-1)+1).cpu().tolist())
    return np.array(preds), None

# ========== Main ==========
def main():
    ap = argparse.ArgumentParser()
    ap.add_argument("--data_path", default="/content/smiles-data.xlsx")
    ap.add_argument("--smiles_col", default="Structure")
    ap.add_argument("--label_col", default="Score")
    ap.add_argument("--out_dir", default="./outputs_deepseek")
    ap.add_argument("--seed", type=int, default=42)
    ap.add_argument("--epochs_llm", type=int, default=5)
    args = ap.parse_args()

    seed_everything(args.seed)
    os.makedirs(args.out_dir, exist_ok=True)

    # 加载数据
    df = pd.read_excel(args.data_path)
    df = df.dropna(subset=[args.smiles_col, args.label_col])
    df[args.label_col] = df[args.label_col].map(ensure_label_1_8)

    train_df, val_df, test_df = scaffold_split(df, args.smiles_col, 120, 0.2, args.seed)

    # 训练 DeepSeek
    print("\n[Stage 1] DeepSeek PRIMARY")
    m_pri, t_pri = train_llm_classifier(
        "deepseek-ai/deepseek-llm-7b-base", train_df, val_df,
        args.smiles_col, args.label_col,
        os.path.join(args.out_dir, "primary"),
        epochs=args.epochs_llm, seed=args.seed
    )

    print("\n✅ Training complete!")

if __name__ == "__main__":
    main()

Overwriting train_deepseek_simple.py


In [11]:
!python train_deepseek_simple.py \
    --data_path /content/smiles-data.xlsx \
    --epochs_llm 5

2025-10-23 08:49:33.754879: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1761209373.776623    8196 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1761209373.783248    8196 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1761209373.799766    8196 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1761209373.799793    8196 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1761209373.799797    8196 computation_placer.cc:177] computation placer alr

In [12]:
# 检查当前有哪些训练脚本
!ls -lh *.py

# 检查已保存的模型
!ls -lh ./outputs_deepseek/

-rw-r--r-- 1 root root 12K Oct 23 08:49 train_deepseek_simple.py
total 4.0K
drwxr-xr-x 4 root root 4.0K Oct 23 09:14 primary


In [14]:
!pip uninstall -U bitsandbytes


Usage:   
  pip3 uninstall [options] <package> ...
  pip3 uninstall [options] -r <requirements file> ...

no such option: -U


In [15]:
!pip install -U bitsandbytes



In [9]:
#section - 合并 LoRA 权重
print("🔧 开始合并 LoRA 权重...")

from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import torch

# 清理显存
import gc
torch.cuda.empty_cache()
gc.collect()

try:
    # 加载 base model (FP16)
    print("加载 base model...")
    base_model = AutoModelForCausalLM.from_pretrained(
        "deepseek-ai/deepseek-llm-7b-base",
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True,
        low_cpu_mem_usage=True,
    )

    # 加载 LoRA adapter
    print("加载 LoRA adapter...")
    lora_model = PeftModel.from_pretrained(base_model, "./outputs_deepseek/primary")

    # 合并
    print("合并权重（这可能需要几分钟）...")
    merged_model = lora_model.merge_and_unload()

    # 保存
    print("保存合并后的模型...")
    import os
    os.makedirs("./outputs_deepseek/primary_merged", exist_ok=True)
    merged_model.save_pretrained("./outputs_deepseek/primary_merged")

    # 保存 tokenizer
    tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/deepseek-llm-7b-base", trust_remote_code=True)
    tokenizer.save_pretrained("./outputs_deepseek/primary_merged")

    print("✅ 合并完成！")

    # 测试合并后的模型
    print("\n测试合并后的模型...")
    test_prompt = "Predict score (1-8) for molecule.\nSMILES: CCO\nScore:"
    inputs = tokenizer(test_prompt, return_tensors="pt").to("cuda")

    with torch.no_grad():
        outputs = merged_model.generate(**inputs, max_new_tokens=3)
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        print(f"测试输出: {response}")

    # 备份到 Drive
    print("\n备份到 Google Drive...")
    from google.colab import drive
    drive.mount('/content/drive', force_remount=True)
    !mkdir -p /content/drive/MyDrive/deepseek_backup
    !cp -r ./outputs_deepseek/primary_merged /content/drive/MyDrive/deepseek_backup/

    print("✅ 全部完成！")

except Exception as e:
    print(f"❌ 错误: {e}")
    print("可能是内存不足，尝试重启 runtime 后再执行")

🔧 开始合并 LoRA 权重...
加载 base model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

加载 LoRA adapter...
合并权重（这可能需要几分钟）...
保存合并后的模型...


Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.


✅ 合并完成！

测试合并后的模型...
测试输出: Predict score (1-8) for molecule.
SMILES: CCO
Score: 1

备份到 Google Drive...
Mounted at /content/drive
1

✅ 全部完成！


In [17]:
# Step 1: 挂载 Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Step 2: 复制 DeepSeek 模型到 Drive
!mkdir -p /content/drive/MyDrive/deepseek_backup
!cp -r /content/outputs_deepseek/primary /content/drive/MyDrive/deepseek_backup/

# 验证保存成功
!ls -lh /content/drive/MyDrive/deepseek_backup/primary/

print("✅ DeepSeek PRIMARY 已保存到 Google Drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
total 68M
-rw------- 1 root root  895 Oct 23 09:18 adapter_config.json
-rw------- 1 root root  61M Oct 23 09:18 adapter_model.safetensors
drwx------ 2 root root 4.0K Oct 23 09:18 checkpoint-204
drwx------ 2 root root 4.0K Oct 23 09:18 checkpoint-255
-rw------- 1 root root 5.1K Oct 23 09:18 README.md
-rw------- 1 root root  482 Oct 23 09:18 special_tokens_map.json
-rw------- 1 root root 3.1K Oct 23 09:18 tokenizer_config.json
-rw------- 1 root root 7.2M Oct 23 09:18 tokenizer.json
-rw------- 1 root root 5.8K Oct 23 09:18 training_args.bin
✅ DeepSeek PRIMARY 已保存到 Google Drive


In [4]:
# Step 1: 重新挂载 Drive
from google.colab import drive
drive.mount('/content/drive')

# Step 2: 从 Drive 复制回来
!mkdir -p /content/outputs_deepseek
!cp -r /content/drive/MyDrive/deepseek_backup/primary /content/outputs_deepseek/

# 验证
!ls -lh /content/outputs_deepseek/primary/

print("✅ DeepSeek PRIMARY 已从 Drive 恢复")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
total 68M
-rw-r--r-- 1 root root  895 Oct 23 09:20 adapter_config.json
-rw-r--r-- 1 root root  61M Oct 23 09:20 adapter_model.safetensors
drwxr-xr-x 2 root root 4.0K Oct 23 09:09 checkpoint-204
drwxr-xr-x 2 root root 4.0K Oct 23 09:14 checkpoint-255
-rw-r--r-- 1 root root 5.1K Oct 23 09:20 README.md
-rw-r--r-- 1 root root  482 Oct 23 09:20 special_tokens_map.json
-rw-r--r-- 1 root root 3.1K Oct 23 09:20 tokenizer_config.json
-rw-r--r-- 1 root root 7.2M Oct 23 09:20 tokenizer.json
-rw-r--r-- 1 root root 5.8K Oct 23 09:20 training_args.bin
✅ DeepSeek PRIMARY 已从 Drive 恢复


In [13]:
#section 3 - 完整修复版
import os
import numpy as np
import pandas as pd
import torch
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification, AutoModelForCausalLM,
    Trainer, TrainingArguments, DataCollatorWithPadding, BitsAndBytesConfig
)
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

# ========== 辅助函数（保持不变）==========
def seed_everything(seed=42):
    import random
    random.seed(seed); np.random.seed(seed); torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed); os.environ["PYTHONHASHSEED"]=str(seed)
    torch.backends.cudnn.deterministic=True; torch.backends.cudnn.benchmark=False

def ensure_label_1_8(x):
    try: return max(1, min(8, int(round(float(x)))))
    except: return 4

def within_k(y_true, y_pred, k=1):
    return float(np.mean(np.abs(np.array(y_true)-np.array(y_pred)) <= k))

def quadratic_weighted_kappa(y_true, y_pred, min_rating=1, max_rating=8):
    yt = np.array([ensure_label_1_8(x) for x in y_true], dtype=int)
    yp = np.array([ensure_label_1_8(x) for x in y_pred], dtype=int)
    m = max_rating - min_rating + 1
    O = np.zeros((m,m), float)
    for a,b in zip(yt,yp): O[a-min_rating, b-min_rating]+=1
    W = np.zeros((m,m), float)
    for i in range(m):
        for j in range(m):
            W[i,j] = ((i-j)**2) / ((m-1)**2)
    act = np.sum(O, axis=1); pred = np.sum(O, axis=0)
    E = np.outer(act, pred) / np.sum(act)
    num = np.sum(W*O); den = np.sum(W*E)
    return 1.0 - (num/den if den>0 else 1.0)

def evaluate_metrics(y_true, y_pred):
    return {
        "MAE": float(mean_absolute_error(y_true, y_pred)),
        "within±1": within_k(y_true, y_pred, 1),
        "QWK": quadratic_weighted_kappa(y_true, y_pred, 1, 8)
    }

try:
    from rdkit import Chem
    from rdkit.Chem.Scaffolds import MurckoScaffold
    def get_scaffold(s):
        try: return Chem.MolToSmiles(MurckoScaffold.GetScaffoldForMol(Chem.MolFromSmiles(s)))
        except: return None
except:
    def get_scaffold(s): return hash(s) % 10000

def scaffold_split(df, smiles_col, test_size=120, val_frac=0.2, seed=42):
    df = df.copy()
    df["scaffold"] = df[smiles_col].apply(get_scaffold)
    df = df[df["scaffold"].notna()].reset_index(drop=True)
    groups = df.groupby("scaffold").size().sort_values(ascending=False)
    test_scaffolds, cnt = [], 0
    for scaf, c in groups.items():
        if cnt + c <= test_size * 1.2:
            test_scaffolds.append(scaf)
            cnt += c
            if cnt >= test_size:
                break
        elif cnt < test_size and test_size - cnt > c * 0.5:
            test_scaffolds.append(scaf)
            cnt += c
            break
    test_df = df[df["scaffold"].isin(test_scaffolds)]
    remain_df = df[~df["scaffold"].isin(test_scaffolds)]
    train_df, val_df = train_test_split(remain_df, test_size=val_frac, random_state=seed)
    print(f"📊 Split: Train={len(train_df)} Val={len(val_df)} Test={len(test_df)}")
    return train_df.reset_index(drop=True), val_df.reset_index(drop=True), test_df.reset_index(drop=True)

# ========== MolFormer（保持不变）==========
class SmilesDataset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer, text_col, label_col, max_length=256):
        self.df = df.reset_index(drop=True)
        self.tok = tokenizer
        self.text_col = text_col
        self.label_col = label_col
        self.max_length = max_length
        labels = [ensure_label_1_8(x) for x in self.df[self.label_col].tolist()]
        self.labels = [int(v - 1) for v in labels]
    def __len__(self): return len(self.df)
    def __getitem__(self, idx):
        s = str(self.df.iloc[idx][self.text_col])
        enc = self.tok(s, truncation=True, max_length=self.max_length, padding=False, return_tensors="pt")
        item = {k: v.squeeze(0) for k, v in enc.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

def train_molformer(model_name, train_df, val_df, smiles_col, label_col, out_dir,
                    epochs=12, lr=1e-5, bs=8, wd=0.1, seed=42):
    os.makedirs(out_dir, exist_ok=True)
    seed_everything(seed)
    print(f"\n🔧 Training {model_name}")

    tok = AutoTokenizer.from_pretrained(model_name, use_fast=True, trust_remote_code=True)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=8,
        torch_dtype=torch.float32,  # ✅ MolFormer 用 FP32
        trust_remote_code=True
    )

    train_ds = SmilesDataset(train_df, tok, smiles_col, label_col)
    val_ds = SmilesDataset(val_df, tok, smiles_col, label_col)
    collator = DataCollatorWithPadding(tokenizer=tok, padding="longest")

    args = TrainingArguments(
        output_dir=out_dir, seed=seed, learning_rate=lr, num_train_epochs=epochs,
        per_device_train_batch_size=bs, per_device_eval_batch_size=bs, weight_decay=wd,
        eval_strategy="epoch", save_strategy="epoch", load_best_model_at_end=True,
        metric_for_best_model="eval_loss", greater_is_better=False, warmup_ratio=0.1,
        fp16=False,  # ✅ MolFormer 关闭 FP16
        report_to="none", logging_steps=50, save_total_limit=2,
    )

    def _metrics(eval_pred):
        logits, labels = eval_pred
        preds = np.argmax(logits, axis=-1) + 1
        labels = labels + 1
        return {
            "mae": mean_absolute_error(labels, preds),
            "within1": within_k(labels, preds, 1),
            "qwk": quadratic_weighted_kappa(labels, preds, 1, 8)
        }

    trainer = Trainer(
        model=model, args=args, train_dataset=train_ds, eval_dataset=val_ds,
        data_collator=collator, tokenizer=tok, compute_metrics=_metrics
    )

    print(f"   Training for {epochs} epochs...")
    trainer.train()
    trainer.save_model(out_dir)
    tok.save_pretrained(out_dir)
    print(f"   ✅ Saved to {out_dir}")
    return model, tok

@torch.no_grad()
def predict_molformer(model, tok, df, smiles_col, label_col, batch_size=64):
    ds = SmilesDataset(df, tok, smiles_col, label_col)
    collator = DataCollatorWithPadding(tokenizer=tok, padding="longest")
    loader = torch.utils.data.DataLoader(ds, batch_size=batch_size, shuffle=False, collate_fn=collator)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device).eval()
    all_preds = []
    for batch in loader:
        batch = {k: v.to(device) for k, v in batch.items() if k != "labels"}
        logits = model(**batch).logits
        preds = torch.argmax(logits, dim=-1).cpu().numpy() + 1
        all_preds.extend(preds.tolist())
    return np.array(all_preds, dtype=int)

# DeepSeek 预测
@torch.no_grad()
def predict_deepseek(model, tokenizer, df, smiles_col):
    import re
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.eval()
    predictions = []

    print(f"\n🔮 Predicting with DeepSeek on {len(df)} samples...")
    for idx, row in df.iterrows():
        prompt = f"Predict score (1-8) for molecule.\nSMILES: {row[smiles_col]}\nScore:"
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=256)
        inputs = {k:v.to(device) for k,v in inputs.items()}

        try:
            outputs = model.generate(**inputs, max_new_tokens=5, temperature=0.1, do_sample=False)
            response = tokenizer.decode(outputs[0], skip_special_tokens=True)
            match = re.search(r'Score:\s*(\d)', response)
            score = int(match.group(1)) if match else 4
            score = max(1, min(8, score))
        except:
            score = 4

        predictions.append(score)
        if (idx+1)%30==0:
            print(f"   Progress: {idx+1}/{len(df)}")

    return np.array(predictions)

# ========== 主流程 ==========
print("="*70)
print("🚀 MolFormer Agents 训练（修复版）")
print("="*70)

seed = 42
seed_everything(seed)

df = pd.read_excel("/content/smiles-data.xlsx")
df = df.dropna(subset=["Structure", "Score"])
df["Score"] = df["Score"].map(ensure_label_1_8)

train_df, val_df, test_df = scaffold_split(df, "Structure", 120, 0.2, seed)
y_val = val_df["Score"].tolist()
y_test = test_df["Score"].tolist()

# ========== 加载 DeepSeek（修复）==========
print("\n" + "="*70)
print("[Step 1] Loading DeepSeek PRIMARY (merged)")
print("="*70)

tokenizer_pri = AutoTokenizer.from_pretrained(
    "./outputs_deepseek/primary_merged",
    trust_remote_code=True
)
if not tokenizer_pri.pad_token:
    tokenizer_pri.pad_token = tokenizer_pri.eos_token

bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    bnb_8bit_compute_dtype=torch.float16,
)

model_pri = AutoModelForCausalLM.from_pretrained(
    "./outputs_deepseek/primary_merged",  # ✅ 直接加载
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.float16,
)

model_pri.eval()
print("✅ DeepSeek merged model loaded")

# 评估 DeepSeek
print("\n[Step 2] Evaluating DeepSeek on test set...")
pri_test_pred = predict_deepseek(model_pri, tokenizer_pri, test_df, "Structure")
pri_test_m = evaluate_metrics(y_test, pri_test_pred)
print(f"\nPRIMARY: MAE={pri_test_m['MAE']:.3f} within±1={pri_test_m['within±1']:.3f} QWK={pri_test_m['QWK']:.3f}")

# 4. 训练 MolFormer Agent 1
print("\n" + "="*70)
print("[Step 3] Training MolFormer Agent 1")
print("="*70)

m_a1, t_a1 = train_molformer(
    "ibm/MoLFormer-XL-both-10pct", train_df, val_df,
    "Structure", "Score", "./outputs_deepseek/agent1",
    epochs=12, lr=1e-5, bs=8, seed=42
)

a1_test = predict_molformer(m_a1, t_a1, test_df, "Structure", "Score")
a1_test_m = evaluate_metrics(y_test, a1_test)
print(f"\nAGENT 1: MAE={a1_test_m['MAE']:.3f} within±1={a1_test_m['within±1']:.3f} QWK={a1_test_m['QWK']:.3f}")

# 5. 训练 MolFormer Agent 2
print("\n" + "="*70)
print("[Step 4] Training MolFormer Agent 2")
print("="*70)

m_a2, t_a2 = train_molformer(
    "ibm/MoLFormer-XL-both-10pct", train_df, val_df,
    "Structure", "Score", "./outputs_deepseek/agent2",
    epochs=12, lr=1e-5, bs=8, seed=142
)

a2_test = predict_molformer(m_a2, t_a2, test_df, "Structure", "Score")
a2_test_m = evaluate_metrics(y_test, a2_test)
print(f"\nAGENT 2: MAE={a2_test_m['MAE']:.3f} within±1={a2_test_m['within±1']:.3f} QWK={a2_test_m['QWK']:.3f}")

# 6. Verify 机制
print("\n" + "="*70)
print("[Step 5] Verify Mechanism")
print("="*70)

agent_avg = np.round((a1_test + a2_test) / 2).astype(int)
agent_avg_m = evaluate_metrics(y_test, agent_avg)
print(f"Agent Average: MAE={agent_avg_m['MAE']:.3f} QWK={agent_avg_m['QWK']:.3f}")

# Verify logic
finals = []
for i in range(len(pri_test_pred)):
    pp = pri_test_pred[i]
    aa = agent_avg[i]
    mad = abs(a1_test[i] - a2_test[i]) / 2
    gap = abs(pp - aa)
    accept = (mad <= 1.8) and (gap <= 2)
    finals.append(pp if accept else aa)

verify_pred = np.array(finals)
verify_m = evaluate_metrics(y_test, verify_pred)

accept_rate = sum([finals[i] == pri_test_pred[i] for i in range(len(finals))]) / len(finals)

print(f"\n[VERIFY] MAE={verify_m['MAE']:.3f} within±1={verify_m['within±1']:.3f} QWK={verify_m['QWK']:.3f}")
print(f"Primary acceptance rate: {accept_rate:.1%}")

# 7. 保存结果
results_df = pd.DataFrame({
    "Structure": test_df["Structure"].values,
    "true": y_test,
    "primary_deepseek": pri_test_pred,
    "agent1_molformer": a1_test,
    "agent2_molformer": a2_test,
    "agent_avg": agent_avg,
    "verify_final": verify_pred
})
results_df.to_csv("./outputs_deepseek/final_predictions.csv", index=False)

# 8. 最终总结
print("\n" + "="*70)
print("📊 FINAL RESULTS")
print("="*70)
print(f"PRIMARY (DeepSeek):  MAE={pri_test_m['MAE']:.3f} within±1={pri_test_m['within±1']:.3f} QWK={pri_test_m['QWK']:.3f}")
print(f"Agent 1 (MolFormer): MAE={a1_test_m['MAE']:.3f} within±1={a1_test_m['within±1']:.3f} QWK={a1_test_m['QWK']:.3f}")
print(f"Agent 2 (MolFormer): MAE={a2_test_m['MAE']:.3f} within±1={a2_test_m['within±1']:.3f} QWK={a2_test_m['QWK']:.3f}")
print(f"Agent Average:       MAE={agent_avg_m['MAE']:.3f} within±1={agent_avg_m['within±1']:.3f} QWK={agent_avg_m['QWK']:.3f}")
print(f"VERIFY (Final):      MAE={verify_m['MAE']:.3f} within±1={verify_m['within±1']:.3f} QWK={verify_m['QWK']:.3f}")

print("\n🎯 Key Insight:")
if pri_test_m['MAE'] < 2.5:
    print("   ✅ DeepSeek (general LLM) WORKS on molecular prediction!")
    print("   ✅ Demonstrates method generalizability")
else:
    print("   ⚠️  DeepSeek struggles, but Verify mechanism recovers performance")
    print("   ✅ Multi-agent framework is robust")

print(f"\n✅ All done! Results saved to ./outputs_deepseek/final_predictions.csv")

🚀 MolFormer Agents 训练（修复版）
📊 Split: Train=816 Val=204 Test=180

[Step 1] Loading DeepSeek PRIMARY (merged)


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.


✅ DeepSeek merged model loaded

[Step 2] Evaluating DeepSeek on test set...

🔮 Predicting with DeepSeek on 180 samples...


Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for

   Progress: 30/180


Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for

   Progress: 60/180


Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for

   Progress: 90/180


Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for

   Progress: 120/180


Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for

   Progress: 150/180


Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for

   Progress: 180/180

PRIMARY: MAE=3.600 within±1=0.378 QWK=0.009

[Step 3] Training MolFormer Agent 1

🔧 Training ibm/MoLFormer-XL-both-10pct


Some weights of MolformerForSequenceClassification were not initialized from the model checkpoint at ibm/MoLFormer-XL-both-10pct and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.dense2.bias', 'classifier.dense2.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


   Training for 12 epochs...


Epoch,Training Loss,Validation Loss,Mae,Within1,Qwk
1,1.9962,1.962508,3.637255,0.289216,0.07353
2,1.8947,1.859394,2.083333,0.553922,0.475719
3,1.805,1.777795,1.720588,0.607843,0.53168
4,1.6599,1.689973,1.352941,0.656863,0.713369
5,1.5974,1.713111,1.377451,0.671569,0.672958
6,1.5245,1.713194,1.509804,0.642157,0.651923
7,1.5063,1.624775,1.352941,0.676471,0.695338
8,1.561,1.639664,1.303922,0.681373,0.695012
9,1.4893,1.630354,1.308824,0.676471,0.700451
10,1.4625,1.607348,1.289216,0.681373,0.722291


   ✅ Saved to ./outputs_deepseek/agent1

AGENT 1: MAE=2.467 within±1=0.506 QWK=0.294

[Step 4] Training MolFormer Agent 2

🔧 Training ibm/MoLFormer-XL-both-10pct


Some weights of MolformerForSequenceClassification were not initialized from the model checkpoint at ibm/MoLFormer-XL-both-10pct and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.dense2.bias', 'classifier.dense2.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


   Training for 12 epochs...


Epoch,Training Loss,Validation Loss,Mae,Within1,Qwk
1,1.9993,1.957516,3.52451,0.323529,0.089995
2,1.8774,1.862925,2.313725,0.5,0.413582
3,1.8063,1.773563,1.617647,0.612745,0.611322
4,1.7105,1.8062,1.848039,0.553922,0.519934
5,1.6682,1.688096,1.455882,0.647059,0.649591
6,1.5404,1.654921,1.362745,0.671569,0.686772
7,1.4915,1.65422,1.348039,0.671569,0.693902
8,1.485,1.639932,1.367647,0.656863,0.690603
9,1.4653,1.670623,1.303922,0.656863,0.709393
10,1.516,1.622977,1.27451,0.661765,0.721331


   ✅ Saved to ./outputs_deepseek/agent2

AGENT 2: MAE=2.306 within±1=0.517 QWK=0.385

[Step 5] Verify Mechanism
Agent Average: MAE=2.350 QWK=0.350

[VERIFY] MAE=2.350 within±1=0.483 QWK=0.355
Primary acceptance rate: 20.0%

📊 FINAL RESULTS
PRIMARY (DeepSeek):  MAE=3.600 within±1=0.378 QWK=0.009
Agent 1 (MolFormer): MAE=2.467 within±1=0.506 QWK=0.294
Agent 2 (MolFormer): MAE=2.306 within±1=0.517 QWK=0.385
Agent Average:       MAE=2.350 within±1=0.483 QWK=0.350
VERIFY (Final):      MAE=2.350 within±1=0.483 QWK=0.355

🎯 Key Insight:
   ⚠️  DeepSeek struggles, but Verify mechanism recovers performance
   ✅ Multi-agent framework is robust

✅ All done! Results saved to ./outputs_deepseek/final_predictions.csv


In [14]:
#section - 保存所有结果和模型
import os
import pickle
import shutil
from google.colab import drive

print("="*70)
print("💾 保存所有结果和模型")
print("="*70)

# 1. 挂载 Drive
drive.mount('/content/drive', force_remount=True)

# 2. 创建备份目录
backup_dir = "/content/drive/MyDrive/deepseek_complete_backup"
os.makedirs(backup_dir, exist_ok=True)
print(f"✅ 备份目录: {backup_dir}")

# 3. 保存训练好的模型
print("\n[1/6] 保存模型...")

# DeepSeek (merged)
if os.path.exists("./outputs_deepseek/primary_merged"):
    print("  - DeepSeek PRIMARY (merged)")
    !cp -r ./outputs_deepseek/primary_merged {backup_dir}/
else:
    print("  ⚠️ DeepSeek merged 不存在")

# MolFormer Agent 1
if os.path.exists("./outputs_deepseek/agent1"):
    print("  - MolFormer Agent 1")
    !cp -r ./outputs_deepseek/agent1 {backup_dir}/
else:
    print("  ⚠️ Agent 1 不存在")

# MolFormer Agent 2
if os.path.exists("./outputs_deepseek/agent2"):
    print("  - MolFormer Agent 2")
    !cp -r ./outputs_deepseek/agent2 {backup_dir}/
else:
    print("  ⚠️ Agent 2 不存在")

# 4. 保存数据分割信息
print("\n[2/6] 保存数据分割...")
split_info = {
    'train_indices': train_df.index.tolist(),
    'val_indices': val_df.index.tolist(),
    'test_indices': test_df.index.tolist(),
    'train_size': len(train_df),
    'val_size': len(val_df),
    'test_size': len(test_df),
    'seed': 42
}

with open(f'{backup_dir}/split_info.pkl', 'wb') as f:
    pickle.dump(split_info, f)
print(f"  ✅ 保存了 {len(train_df)} train, {len(val_df)} val, {len(test_df)} test")

# 5. 保存预测结果
print("\n[3/6] 保存预测结果...")
if os.path.exists("./outputs_deepseek/final_predictions.csv"):
    !cp ./outputs_deepseek/final_predictions.csv {backup_dir}/
    print("  ✅ final_predictions.csv")

# 6. 保存评估指标
print("\n[4/6] 保存评估指标...")
results_summary = {
    'primary': {
        'model': 'deepseek-llm-7b-base',
        'MAE': pri_test_m['MAE'],
        'within_1': pri_test_m['within±1'],
        'QWK': pri_test_m['QWK']
    },
    'agent1': {
        'model': 'MoLFormer-XL',
        'MAE': a1_test_m['MAE'],
        'within_1': a1_test_m['within±1'],
        'QWK': a1_test_m['QWK']
    },
    'agent2': {
        'model': 'MoLFormer-XL',
        'MAE': a2_test_m['MAE'],
        'within_1': a2_test_m['within±1'],
        'QWK': a2_test_m['QWK']
    },
    'agent_average': {
        'MAE': agent_avg_m['MAE'],
        'within_1': float(np.mean(np.abs(agent_avg - y_test) <= 1)),
        'QWK': agent_avg_m['QWK']
    },
    'verify': {
        'MAE': verify_m['MAE'],
        'within_1': verify_m['within±1'],
        'QWK': verify_m['QWK'],
        'primary_acceptance_rate': accept_rate
    }
}

import json
with open(f'{backup_dir}/results_summary.json', 'w') as f:
    json.dump(results_summary, f, indent=2)
print("  ✅ results_summary.json")

# 7. 保存训练配置
print("\n[5/6] 保存训练配置...")
config = {
    'data_path': '/content/smiles-data.xlsx',
    'smiles_col': 'Structure',
    'label_col': 'Score',
    'total_samples': len(df),
    'seed': 42,
    'deepseek': {
        'model': 'deepseek-ai/deepseek-llm-7b-base',
        'epochs': 5,
        'lr': 2e-4,
        'batch_size': 2,
    },
    'molformer': {
        'model': 'ibm/MoLFormer-XL-both-10pct',
        'epochs': 12,
        'lr': 1e-5,
        'batch_size': 8,
    },
    'verify_mechanism': {
        'mad_threshold': 1.8,
        'gap_threshold': 2
    }
}

with open(f'{backup_dir}/training_config.json', 'w') as f:
    json.dump(config, f, indent=2)
print("  ✅ training_config.json")

# 8. 保存原始数据的副本
print("\n[6/6] 保存原始数据...")
if os.path.exists('/content/smiles-data.xlsx'):
    !cp /content/smiles-data.xlsx {backup_dir}/
    print("  ✅ smiles-data.xlsx")

# 9. 创建加载脚本
print("\n[Bonus] 创建重启加载脚本...")
reload_script = """# 重启后运行此脚本
from google.colab import drive
import os

# 1. 挂载 Drive
drive.mount('/content/drive')

# 2. 恢复所有文件
backup_dir = "/content/drive/MyDrive/deepseek_complete_backup"
os.makedirs('./outputs_deepseek', exist_ok=True)

# 恢复模型
!cp -r {backup_dir}/primary_merged ./outputs_deepseek/
!cp -r {backup_dir}/agent1 ./outputs_deepseek/
!cp -r {backup_dir}/agent2 ./outputs_deepseek/

# 恢复数据和结果
!cp {backup_dir}/split_info.pkl ./outputs_deepseek/
!cp {backup_dir}/final_predictions.csv ./outputs_deepseek/
!cp {backup_dir}/results_summary.json ./outputs_deepseek/
!cp {backup_dir}/training_config.json ./outputs_deepseek/
!cp {backup_dir}/smiles-data.xlsx ./

print("✅ 所有文件已恢复！")
"""

with open(f'{backup_dir}/RELOAD_SCRIPT.py', 'w') as f:
    f.write(reload_script)
print("  ✅ RELOAD_SCRIPT.py")

# 10. 验证保存
print("\n" + "="*70)
print("📋 保存内容验证:")
print("="*70)
!ls -lh {backup_dir}/

print("\n✅ 全部保存完成！")
print(f"\n📁 备份位置: {backup_dir}")
print("\n💡 重启后恢复步骤:")
print("   1. 运行 !python /content/drive/MyDrive/deepseek_complete_backup/RELOAD_SCRIPT.py")
print("   2. 或者手动复制文件")

💾 保存所有结果和模型
Mounted at /content/drive
✅ 备份目录: /content/drive/MyDrive/deepseek_complete_backup

[1/6] 保存模型...
  - DeepSeek PRIMARY (merged)
  - MolFormer Agent 1
  - MolFormer Agent 2

[2/6] 保存数据分割...
  ✅ 保存了 816 train, 204 val, 180 test

[3/6] 保存预测结果...
  ✅ final_predictions.csv

[4/6] 保存评估指标...
  ✅ results_summary.json

[5/6] 保存训练配置...
  ✅ training_config.json

[6/6] 保存原始数据...
  ✅ smiles-data.xlsx

[Bonus] 创建重启加载脚本...
  ✅ RELOAD_SCRIPT.py

📋 保存内容验证:
total 95K
drwx------ 4 root root 4.0K Oct 23 09:57 agent1
drwx------ 4 root root 4.0K Oct 23 09:59 agent2
-rw------- 1 root root  13K Oct 23 09:59 final_predictions.csv
drwx------ 2 root root 4.0K Oct 23 09:57 primary_merged
-rw------- 1 root root  747 Oct 23 09:59 RELOAD_SCRIPT.py
-rw------- 1 root root  683 Oct 23 09:59 results_summary.json
-rw------- 1 root root  64K Oct 23 09:59 smiles-data.xlsx
-rw------- 1 root root 3.1K Oct 23 09:59 split_info.pkl
-rw------- 1 root root  462 Oct 23 09:59 training_config.json

✅ 全部保存完成！

📁 备份位置: /con

In [15]:
# 额外保存：用于 Active Learning 的预测概率
print("💾 保存 Active Learning 所需数据...")

# 保存所有训练样本的 ID（用于排除已训练的）
trained_indices = set(train_df.index.tolist())

active_learning_data = {
    'trained_indices': list(trained_indices),
    'remaining_pool_size': len(df) - len(train_df) - len(val_df) - len(test_df),
    'current_train_size': len(train_df),
    'target_new_samples': 240,
    'models_trained': True,
    'stage': 'ready_for_active_learning'
}

import pickle
with open(f'{backup_dir}/active_learning_state.pkl', 'wb') as f:
    pickle.dump(active_learning_data, f)

print("✅ Active Learning 状态已保存")
print(f"   当前训练集大小: {len(train_df)}")
print(f"   计划增加样本: 240")
print(f"   剩余可用样本: {active_learning_data['remaining_pool_size']}")

💾 保存 Active Learning 所需数据...
✅ Active Learning 状态已保存
   当前训练集大小: 816
   计划增加样本: 240
   剩余可用样本: 0


In [8]:
#section - DeepSeek 加载诊断
import torch
from peft import PeftModel
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

print("="*70)
print("🔍 DeepSeek 加载诊断")
print("="*70)

# 1. 检查文件
print("\n[1] 检查保存的文件:")
!ls -lh ./outputs_deepseek/primary/

# 2. 加载模型
print("\n[2] 加载模型...")
tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/deepseek-llm-7b-base", trust_remote_code=True)
if not tokenizer.pad_token:
    tokenizer.pad_token = tokenizer.eos_token

bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    bnb_8bit_compute_dtype=torch.float16,
)

base_model = AutoModelForCausalLM.from_pretrained(
    "deepseek-ai/deepseek-llm-7b-base",
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.float16,
)

lora_model = PeftModel.from_pretrained(base_model, "./outputs_deepseek/primary")
lora_model.eval()

# 3. 检查 LoRA 模块
print("\n[3] LoRA 模块:")
lora_count = 0
for name, module in lora_model.named_modules():
    if "lora" in name.lower():
        lora_count += 1
        if lora_count <= 3:  # 只打印前3个
            print(f"  ✓ {name}")
print(f"  总共 {lora_count} 个 LoRA 模块")

if lora_count == 0:
    print("  ❌ 没有找到 LoRA 模块！加载失败！")
else:
    print("  ✅ LoRA 模块已加载")

# 4. 对比预测
print("\n[4] 对比预测 (3个测试样本):")
test_prompts = [
    "Predict score (1-8) for molecule.\nSMILES: CCO\nScore:",
    "Predict score (1-8) for molecule.\nSMILES: CCCC\nScore:",
    "Predict score (1-8) for molecule.\nSMILES: c1ccccc1\nScore:",
]

for i, prompt in enumerate(test_prompts):
    inputs = tokenizer(prompt, return_tensors="pt")
    inputs = {k: v.to("cuda") for k, v in inputs.items()}

    # Base model
    with torch.no_grad():
        base_out = base_model.generate(**inputs, max_new_tokens=3, do_sample=False)
        base_response = tokenizer.decode(base_out[0], skip_special_tokens=True)
        base_score = base_response.split("Score:")[-1].strip()[:1] if "Score:" in base_response else "?"

    # LoRA model
    with torch.no_grad():
        lora_out = lora_model.generate(**inputs, max_new_tokens=3, do_sample=False)
        lora_response = tokenizer.decode(lora_out[0], skip_special_tokens=True)
        lora_score = lora_response.split("Score:")[-1].strip()[:1] if "Score:" in lora_response else "?"

    print(f"\n  Test {i+1}:")
    print(f"    Base:  {base_score}")
    print(f"    LoRA:  {lora_score}")

    if base_score == lora_score:
        print(f"    ⚠️  两个模型预测相同！LoRA 可能没生效")
    else:
        print(f"    ✅  预测不同，LoRA 已生效")

print("\n" + "="*70)

🔍 DeepSeek 加载诊断

[1] 检查保存的文件:
total 68M
-rw-r--r-- 1 root root  895 Oct 23 09:20 adapter_config.json
-rw-r--r-- 1 root root  61M Oct 23 09:20 adapter_model.safetensors
drwxr-xr-x 2 root root 4.0K Oct 23 09:09 checkpoint-204
drwxr-xr-x 2 root root 4.0K Oct 23 09:14 checkpoint-255
-rw-r--r-- 1 root root 5.1K Oct 23 09:20 README.md
-rw-r--r-- 1 root root  482 Oct 23 09:20 special_tokens_map.json
-rw-r--r-- 1 root root 3.1K Oct 23 09:20 tokenizer_config.json
-rw-r--r-- 1 root root 7.2M Oct 23 09:20 tokenizer.json
-rw-r--r-- 1 root root 5.8K Oct 23 09:20 training_args.bin

[2] 加载模型...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.



[3] LoRA 模块:
  ✓ base_model.model.model.layers.0.self_attn.q_proj.lora_dropout
  ✓ base_model.model.model.layers.0.self_attn.q_proj.lora_dropout.default
  ✓ base_model.model.model.layers.0.self_attn.q_proj.lora_A
  总共 1080 个 LoRA 模块
  ✅ LoRA 模块已加载

[4] 对比预测 (3个测试样本):


Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.



  Test 1:
    Base:  1
    LoRA:  1
    ⚠️  两个模型预测相同！LoRA 可能没生效


Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.



  Test 2:
    Base:  1
    LoRA:  1
    ⚠️  两个模型预测相同！LoRA 可能没生效


Setting `pad_token_id` to `eos_token_id`:100001 for open-end generation.



  Test 3:
    Base:  1
    LoRA:  1
    ⚠️  两个模型预测相同！LoRA 可能没生效



In [None]:
#section 4
# ==== 配置 ====
REASONING_MODEL = "meta-llama/Llama-3.1-8b-instruct"  # 可换成你自己的HF指令模型
OUT_DIR         = "/content/outputs_full"
DATA_PATH       = "/content/smiles-data.xlsx"
SMILES_COL      = "Structure"
LABEL_COL       = "Score"

# ==== 加载 LLAMA（HF，无需OpenAI） ====
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
import torch, re, os, json, numpy as np, pandas as pd

tok_llm = AutoTokenizer.from_pretrained(REASONING_MODEL, use_fast=True)
mdl_llm = AutoModelForCausalLM.from_pretrained(
    REASONING_MODEL,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto" if torch.cuda.is_available() else None
)
dev_llm = next(mdl_llm.parameters()).device
_SCORE_RE = re.compile(r"\b([1-8])\b")

def llm_reason_and_score(smiles, max_new_tokens=128, temperature=0.0):
    prompt = (
      "You evaluate mRNA transfection efficiency on a 1–8 scale (higher is better).\n"
      "Given a molecule SMILES, provide a brief reasoning (1–2 sentences), then output the final line:\n"
      "Final Score: <single integer 1..8>\n"
      f"SMILES: {smiles}\nReasoning:\n"
    )
    inputs = tok_llm(prompt, return_tensors="pt").to(dev_llm)
    gen_cfg = GenerationConfig(max_new_tokens=max_new_tokens, do_sample=(temperature>0),
                               temperature=(temperature if temperature>0 else None),
                               top_p=1.0, eos_token_id=tok_llm.eos_token_id)
    with torch.no_grad():
        out = mdl_llm.generate(**inputs, generation_config=gen_cfg)
    text = tok_llm.decode(out[0], skip_special_tokens=True)
    m = re.search(r"Final\s*Score\s*:\s*([1-8])", text, re.I) or _SCORE_RE.search(text)
    score = int(m.group(1)) if m else 4
    return score, text

# ==== 读最佳 agent 组合 ====
with open(os.path.join(OUT_DIR, "selection_and_metrics.json"), "r", encoding="utf-8") as f:
    sel = json.load(f)
best_agents = tuple(sel["best_agent_combo_on_val"]["agents"])
print("Best agents (from val):", best_agents)

# ==== 载入两个HF类 agent推理器 ====
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding
def load_hf_agent(agent_dir):
    tok = AutoTokenizer.from_pretrained(agent_dir, use_fast=True)
    mdl = AutoModelForSequenceClassification.from_pretrained(agent_dir)
    dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    mdl.to(dev).eval()
    collator = DataCollatorWithPadding(tokenizer=tok)
    def predict(smiles_list, batch_size=64):
        class _DS(torch.utils.data.Dataset):
            def __init__(self, xs): self.xs=xs
            def __len__(self): return len(self.xs)
            def __getitem__(self, i):
                enc = tok(self.xs[i], truncation=True, max_length=256, return_tensors="pt")
                return {k:v.squeeze(0) for k,v in enc.items()}
        ds = _DS(smiles_list)
        loader = torch.utils.data.DataLoader(ds, batch_size=batch_size, shuffle=False, collate_fn=collator)
        preds=[]
        with torch.no_grad():
            for batch in loader:
                batch = {k:v.to(dev) for k,v in batch.items()}
                logits = mdl(**batch).logits
                yhat = torch.argmax(logits, dim=-1).detach().cpu().numpy()+1
                preds += yhat.tolist()
        return np.array(preds, dtype=int)
    return predict

pred_cb = load_hf_agent(os.path.join(OUT_DIR, "agent_chemberta"))
pred_mf = load_hf_agent(os.path.join(OUT_DIR, "agent_molformer"))

# ==== Chemprop 预测器（若安装） ====
try:
    import chemprop
    HAVE_CP = True
except:
    HAVE_CP = False

def predict_chemprop_dir(model_dir, smiles_list):
    if not HAVE_CP:
        return np.array([4]*len(smiles_list), dtype=int)
    tmp_csv = os.path.join(model_dir, "_tmp_pred_llm.csv")
    pd.DataFrame({"smiles": smiles_list}).to_csv(tmp_csv, index=False)
    args = chemprop.args.PredictArgs().parse_args([
        '--test_path', tmp_csv,
        '--checkpoint_dir', model_dir,
        '--preds_path', os.path.join(model_dir, "_preds_llm.csv"),
        '--num_workers', '0'
    ])
    chemprop.train.make_predictions(args)
    arr = pd.read_csv(args.preds_path).values.squeeze()
    return np.clip(np.round(arr), 1, 8).astype(int)

pred_cp = lambda xs: predict_chemprop_dir(os.path.join(OUT_DIR, "agent_chemprop"), xs)

def predict_by_name(name, smiles_list):
    if name == "ChemBERTa": return pred_cb(smiles_list)
    if name == "MolFormer": return pred_mf(smiles_list)
    if name == "Chemprop":  return pred_cp(smiles_list)
    raise ValueError(f"Unknown agent {name}")

def verify_fuse(llm_score, agent_scores, mad_hi=1.8, gap_hi=2):
    a = np.array(agent_scores, float)
    a_avg = float(np.mean(a))
    a_mad = float(np.median(np.abs(a - np.median(a))))
    gap   = abs(llm_score - int(round(a_avg)))
    accept = (a_mad <= mad_hi) and (gap <= gap_hi)
    final = llm_score if accept else int(round(a_avg))
    return int(np.clip(final,1,8)), {"agent_avg":a_avg,"agent_mad":a_mad,"gap":gap,"accept_llm":accept}


In [None]:
#section 4.5
# 读取训练脚本生成的 Test120（保证与主流程一致）
test_primary_csv = os.path.join(OUT_DIR, "test120_primary_predictions.csv")
assert os.path.exists(test_primary_csv), "请先运行训练脚本，生成测试划分！"
df_test = pd.read_csv(test_primary_csv).rename(columns={"Structure":SMILES_COL, "true":LABEL_COL})
smiles_list = df_test[SMILES_COL].tolist()
true_list   = df_test[LABEL_COL].tolist()

# 跑 reasoning + verify
rows=[]; final_preds=[]
for smi, yt in zip(smiles_list, true_list):
    s_llm, reasoning = llm_reason_and_score(smi)
    agent_scores = [int(predict_by_name(name, [smi])[0]) for name in best_agents]
    s_final, diag = verify_fuse(s_llm, agent_scores)
    final_preds.append(s_final)
    rows.append({
        "smiles": smi, "true": yt, "llm_score": s_llm,
        **{f"agent_{name}": agent_scores[i] for i,name in enumerate(best_agents)},
        "agent_avg": diag["agent_avg"], "agent_mad": diag["agent_mad"], "gap": diag["gap"],
        "accept_llm": diag["accept_llm"], "final_score": s_final,
        "reasoning": reasoning[:800]
    })

# 评估与保存
import numpy as np
from sklearn.metrics import mean_absolute_error
def within1(y,p): return float(np.mean(np.abs(np.array(y)-np.array(p))<=1))
def qwk(y,p,lo=1,hi=8):
    y=np.array(y,int); p=np.array(p,int); m=hi-lo+1
    O=np.zeros((m,m))
    for a,b in zip(y,p): O[a-lo,b-lo]+=1
    W=np.zeros((m,m))
    for i in range(m):
        for j in range(m): W[i,j]=((i-j)**2)/((m-1)**2)
    act=O.sum(1); pred=O.sum(0); E=np.outer(act,pred)/act.sum()
    num=(W*O).sum(); den=(W*E).sum(); return 1.0-(num/den if den>0 else 1.0)

metrics = {
    "MAE": float(mean_absolute_error(true_list, final_preds)),
    "within±1": within1(true_list, final_preds),
    "QWK": qwk(true_list, final_preds)
}
print("Reasoning+Verify (Test120):", metrics)

out_csv = os.path.join(OUT_DIR, "test120_reasoning_verify_llama.csv")
out_json= os.path.join(OUT_DIR, "test120_reasoning_verify_llama_metrics.json")
pd.DataFrame(rows).to_csv(out_csv, index=False)
import json
with open(out_json, "w", encoding="utf-8") as f:
    json.dump(metrics, f, ensure_ascii=False, indent=2)
print("Saved:", out_csv)

In [None]:
# =======================
# section 4.6 — Active Learning Loop (整合到 Section 4 后续)
# 依赖：已完成 Section 2/3 的训练脚本 & Section 4/4.5 的加载
# 使用同一 OUT_DIR / BEST_AGENTS / pred_* 预测器
# =======================
import os, json, numpy as np, pandas as pd
from copy import deepcopy
from sklearn.metrics import mean_absolute_error
import torch

# ---- 修复：继承Section 4的best_agents ----
BEST_AGENTS = best_agents  # 从Section 4继承
print("🔄 Active Learning使用的最佳Agent组合:", BEST_AGENTS)

# ---- 基础工具 ----
def ensure_label_1_8(x):
    try:
        v = int(round(float(x)))
    except:
        v = 4
    return max(1, min(8, v))

def within_k(y_true, y_pred, k=1):
    yt = np.asarray([ensure_label_1_8(x) for x in y_true], dtype=int)
    yp = np.asarray([ensure_label_1_8(x) for x in y_pred], dtype=int)
    return float(np.mean(np.abs(yt-yp) <= k))

def quadratic_weighted_kappa(y_true, y_pred, lo=1, hi=8):
    y = np.asarray([ensure_label_1_8(x) for x in y_true], dtype=int)
    p = np.asarray([ensure_label_1_8(x) for x in y_pred], dtype=int)
    m = hi - lo + 1
    O = np.zeros((m,m), float)
    for a,b in zip(y,p): O[a-lo,b-lo]+=1
    W = np.zeros((m,m), float)
    for i in range(m):
        for j in range(m): W[i,j] = ((i-j)**2)/((m-1)**2)
    act = O.sum(1); prd = O.sum(0)
    E = np.outer(act, prd) / (act.sum() if act.sum()>0 else 1)
    num = (W*O).sum(); den = (W*E).sum() if (W*E).sum()>0 else 1.0
    return 1.0 - num/den

def evaluate_model_df(df_eval, preds, label_col=LABEL_COL):
    y_true = df_eval[label_col].tolist()
    return {
        "MAE": float(mean_absolute_error(y_true, preds)),
        "within±1": within_k(y_true, preds, 1),
        "QWK": quadratic_weighted_kappa(y_true, preds, 1, 8)
    }

# ---- 载入主模型（用于增量训练） ----
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding

PRIMARY_DIR = os.path.join(OUT_DIR, "primary")
assert os.path.isdir(PRIMARY_DIR), "未找到主模型目录，请先运行训练脚本（section 3）生成 OUT_DIR/primary。"

primary_tok = AutoTokenizer.from_pretrained(PRIMARY_DIR, use_fast=True)
primary_model = AutoModelForSequenceClassification.from_pretrained(PRIMARY_DIR)

# 概率与预测（用于熵）
def predict_hf_probs(model, tokenizer, smiles_list, max_len=256, batch_size=64):
    class _DS(torch.utils.data.Dataset):
        def __init__(self, xs): self.xs=xs
        def __len__(self): return len(self.xs)
        def __getitem__(self, i):
            enc = tokenizer(self.xs[i], truncation=True, max_length=max_len, return_tensors="pt")
            return {k:v.squeeze(0) for k,v in enc.items()}
    collator = DataCollatorWithPadding(tokenizer=tokenizer)
    ds = _DS(smiles_list)
    loader = torch.utils.data.DataLoader(ds, batch_size=batch_size, shuffle=False, collate_fn=collator)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device); model.eval()
    probs_all=[]; preds=[]
    with torch.no_grad():
        for batch in loader:
            batch = {k:v.to(device) for k,v in batch.items()}
            logits = model(**batch).logits
            probs  = torch.softmax(logits, dim=-1).detach().cpu().numpy()
            yhat   = np.argmax(probs, axis=-1)+1
            probs_all.append(probs); preds += yhat.tolist()
    return np.vstack(probs_all), np.array(preds, int)

# ---- 不确定性度量：主模型熵 + Agent分歧 + 多样性惩罚 ----
from math import log

def entropy_row(probs_row, eps=1e-9):
    p = np.clip(probs_row, eps, 1.0)
    p = p / p.sum()
    return -np.sum(p * np.log(p))

def agent_disagreement(agent_preds_int):
    arr = np.array(agent_preds_int, dtype=float)
    std = float(np.std(arr))
    q75, q25 = np.percentile(arr, [75,25])
    iqr = float(q75 - q25)
    return 0.7*std + 0.3*iqr

# RDKit 多样性（若不可用则退化为0惩罚）
try:
    from rdkit import Chem
    from rdkit.Chem import AllChem, DataStructs
    HAVE_RDKIT_AL = True
except Exception:
    HAVE_RDKIT_AL = False

_fp_cache = {}
def morgan_fp(smi, radius=2, nbits=2048):
    if not HAVE_RDKIT_AL: return None
    if smi in _fp_cache: return _fp_cache[smi]
    m = Chem.MolFromSmiles(smi)
    if m is None:
        _fp_cache[smi] = None; return None
    fp = AllChem.GetMorganFingerprintAsBitVect(m, radius, nBits=nbits)
    _fp_cache[smi] = fp
    return fp

def diversity_penalty(smi, labeled_smiles_set, radius=2, nbits=2048):
    if not HAVE_RDKIT_AL or len(labeled_smiles_set)==0:
        return 0.0
    fp = morgan_fp(smi, radius, nbits)
    if fp is None: return 0.5
    sims=[]
    for ls in labeled_smiles_set:
        fp2 = morgan_fp(ls, radius, nbits)
        if fp2 is None: continue
        sims.append(DataStructs.TanimotoSimilarity(fp, fp2))
    if len(sims)==0: return 0.0
    return float(max(sims))  # 与已标注集中最像的相似度（0~1），越大惩罚越大

def select_uncertain_samples(primary_model, primary_tok, agent_names, pool_df, labeled_df,
                             budget=50, w_entropy=0.6, w_disagree=0.3, w_div=0.1):
    smiles = pool_df[SMILES_COL].astype(str).tolist()

    # 主模型概率与熵
    pri_probs, _ = predict_hf_probs(primary_model, primary_tok, smiles)
    pri_entropy = np.apply_along_axis(entropy_row, 1, pri_probs)

    # Agent 分歧（使用你在 Section 4 已加载好的预测器）
    agent_all_preds = []
    for name in agent_names:
        if name == "ChemBERTa":
            _, preds = predict_hf_probs(
                AutoModelForSequenceClassification.from_pretrained(os.path.join(OUT_DIR, "agent_chemberta")).eval().to(next(primary_model.parameters()).device),
                AutoTokenizer.from_pretrained(os.path.join(OUT_DIR, "agent_chemberta"), use_fast=True),
                smiles
            )
        elif name == "MolFormer":
            _, preds = predict_hf_probs(
                AutoModelForSequenceClassification.from_pretrained(os.path.join(OUT_DIR, "agent_molformer")).eval().to(next(primary_model.parameters()).device),
                AutoTokenizer.from_pretrained(os.path.join(OUT_DIR, "agent_molformer"), use_fast=True),
                smiles
            )
        elif name == "Chemprop":
            try:
                import chemprop
                tmp = os.path.join(OUT_DIR, "agent_chemprop", "_al_tmp.csv")
                pd.DataFrame({"smiles": smiles}).to_csv(tmp, index=False)
                args = chemprop.args.PredictArgs().parse_args([
                    '--test_path', tmp,
                    '--checkpoint_dir', os.path.join(OUT_DIR, "agent_chemprop"),
                    '--preds_path', os.path.join(OUT_DIR, "agent_chemprop", "_al_preds.csv"),
                    '--num_workers', '0'
                ])
                chemprop.train.make_predictions(args)
                arr = pd.read_csv(args.preds_path).values.squeeze()
                preds = np.clip(np.round(arr), 1, 8).astype(int)
            except Exception:
                preds = np.full(len(smiles), 4, dtype=int)
        else:
            continue
        agent_all_preds.append(preds)
    if len(agent_all_preds)==0:
        disagree = np.zeros(len(smiles))
    else:
        agent_all_preds = np.vstack(agent_all_preds)
        disagree = np.apply_along_axis(agent_disagreement, 0, agent_all_preds)

    # 多样性惩罚
    labeled_smiles_set = set(labeled_df[SMILES_COL].astype(str).tolist()) if labeled_df is not None else set()
    div_pen = np.array([diversity_penalty(s, labeled_smiles_set) for s in smiles])

    # 组合不确定性
    score = w_entropy*pri_entropy + w_disagree*disagree + w_div*div_pen
    idx_sorted = np.argsort(-score)[:budget]
    return pool_df.iloc[idx_sorted].copy(), score[idx_sorted]

# ---- 获取标签：真实实验 / 模拟（占位） ----
def get_experimental_labels(df_selected, mode="simulate"):
    """
    mode='real'：请将实验结果写入 df_selected[LABEL_COL] 后 return df_selected
    mode='simulate'：用验证集挑出的 BEST_AGENTS 的均值作为占位标签（便于跑通流程）
    """
    if mode == "real":
        # TODO: 实验返回后把真实标签写入 df_selected[LABEL_COL]
        raise NotImplementedError("请把实验标签写入 df_selected['Score'] 后返回。")
    else:
        preds_list = []
        smiles = df_selected[SMILES_COL].astype(str).tolist()
        for name in BEST_AGENTS:
            if name == "ChemBERTa":
                _, p = predict_hf_probs(
                    AutoModelForSequenceClassification.from_pretrained(os.path.join(OUT_DIR, "agent_chemberta")).eval().to(next(primary_model.parameters()).device),
                    AutoTokenizer.from_pretrained(os.path.join(OUT_DIR, "agent_chemberta"), use_fast=True),
                    smiles
                )
            elif name == "MolFormer":
                _, p = predict_hf_probs(
                    AutoModelForSequenceClassification.from_pretrained(os.path.join(OUT_DIR, "agent_molformer")).eval().to(next(primary_model.parameters()).device),
                    AutoTokenizer.from_pretrained(os.path.join(OUT_DIR, "agent_molformer"), use_fast=True),
                    smiles
                )
            elif name == "Chemprop":
                try:
                    import chemprop
                    tmp = os.path.join(OUT_DIR, "agent_chemprop", "_al_tmp2.csv")
                    pd.DataFrame({"smiles": smiles}).to_csv(tmp, index=False)
                    args = chemprop.args.PredictArgs().parse_args([
                        '--test_path', tmp,
                        '--checkpoint_dir', os.path.join(OUT_DIR, "agent_chemprop"),
                        '--preds_path', os.path.join(OUT_DIR, "agent_chemprop", "_al_preds2.csv"),
                        '--num_workers', '0'
                    ])
                    chemprop.train.make_predictions(args)
                    arr = pd.read_csv(args.preds_path).values.squeeze()
                    p = np.clip(np.round(arr), 1, 8).astype(int)
                except Exception:
                    p = np.full(len(smiles), 4, dtype=int)
            preds_list.append(p)
        pseudo = np.clip(np.round(np.mean(np.vstack(preds_list), axis=0)), 1, 8).astype(int)
        out = df_selected.copy()
        out[LABEL_COL] = pseudo
        return out

# ---- 主模型增量训练 ----
class _SmilesDS(torch.utils.data.Dataset):
    def __init__(self, df, tok, text_col, label_col, max_length=256):
        self.df = df.reset_index(drop=True); self.tok=tok
        self.text_col=text_col; self.label_col=label_col; self.max_length=max_length
        labels = [ensure_label_1_8(x) for x in self.df[self.label_col].tolist()]
        self.labels = [int(v-1) for v in labels]
    def __len__(self): return len(self.df)
    def __getitem__(self, idx):
        s = str(self.df.iloc[idx][self.text_col])
        enc = self.tok(s, truncation=True, max_length=self.max_length, return_tensors="pt")
        item = {k:v.squeeze(0) for k,v in enc.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

def incremental_training_primary(primary_dir, primary_model, primary_tok, add_df, val_df,
                                 smiles_col=SMILES_COL, label_col=LABEL_COL,
                                 epochs=1, lr=2e-5, bs=16, wd=0.01, seed=42, save_suffix="al"):
    os.makedirs(primary_dir, exist_ok=True)
    train_ds = _SmilesDS(add_df, primary_tok, smiles_col, label_col)
    val_ds   = _SmilesDS(val_df,   primary_tok, smiles_col, label_col)
    collator = DataCollatorWithPadding(tokenizer=primary_tok)
    args = TrainingArguments(
        output_dir=primary_dir, seed=seed,
        learning_rate=lr, num_train_epochs=epochs,
        per_device_train_batch_size=bs, per_device_eval_batch_size=bs,
        weight_decay=wd, eval="epoch", save_strategy="no",
        warmup_ratio=0.1, bf16=torch.cuda.is_available(), report_to="none", logging_steps=50
    )
    def _metrics(eval_pred):
        logits, labels = eval_pred
        preds = np.argmax(logits, axis=-1) + 1
        labels = labels + 1
        return {
            "mae": mean_absolute_error(labels, preds),
            "within1": within_k(labels, preds, 1),
            "qwk": quadratic_weighted_kappa(labels, preds, 1, 8)
        }
    from transformers import default_data_collator

    trainer = Trainer(model=primary_model, args=args, train_dataset=train_ds, eval_dataset=val_ds,
                      data_collator=collator, tokenizer=primary_tok, compute_metrics=_metrics)
    trainer.train()
    primary_model.save_pretrained(os.path.join(primary_dir, f"{save_suffix}_checkpoint"))
    primary_tok.save_pretrained(os.path.join(primary_dir, f"{save_suffix}_checkpoint"))
    return primary_model

# ---- 准备 Pool / Labeled / Val / Test ----
# 全量数据（与你的 DATA_PATH 一致）
if DATA_PATH.lower().endswith(".xlsx"):
    df_all = pd.read_excel(DATA_PATH)
else:
    df_all = pd.read_csv(DATA_PATH)
df_all = df_all.dropna(subset=[SMILES_COL, LABEL_COL]).reset_index(drop=True)
df_all[LABEL_COL] = df_all[LABEL_COL].map(ensure_label_1_8)

# Test120（沿用训练脚本生成的划分）
test_csv = os.path.join(OUT_DIR, "test120_primary_predictions.csv")
assert os.path.exists(test_csv), "请先运行训练脚本（Section 3），生成 Test120 文件。"
df_test = pd.read_csv(test_csv).rename(columns={"Structure":SMILES_COL, "true":LABEL_COL})
df_test[LABEL_COL] = df_test[LABEL_COL].map(ensure_label_1_8)

# Pool = All - Test120； 初始化已标注（演示用）从 Pool 随机抽取 N_INIT 条
test_smiles_set = set(df_test[SMILES_COL].astype(str).tolist())
df_pool = df_all[~df_all[SMILES_COL].astype(str).isin(test_smiles_set)].copy().reset_index(drop=True)

N_INIT = 100  # 这里仅演示。实战中请用你真实的 train+val 作为 labeled_df 初始集
rng = np.random.default_rng(42)
init_idx = rng.choice(len(df_pool), size=min(N_INIT, len(df_pool)), replace=False)
labeled_df = df_pool.iloc[init_idx].copy().reset_index(drop=True)
pool_df    = df_pool.drop(index=init_idx).reset_index(drop=True)

# 用 Test120 的前40条作为 AL 的验证集（仅演示；实战建议固定你训练时的 val 集）
df_val_for_al = df_test.iloc[:40].copy().reset_index(drop=True)

print(f"[AL init] Pool={len(pool_df)}  Labeled={len(labeled_df)}  Val_for_AL={len(df_val_for_al)}  Test120={len(df_test)}")

# ---- 主循环：Active Learning ----
def active_learning_cycle(primary_model, primary_tok, pool_df, labeled_df, val_df, test_df,
                          initial_budget=50, cycles=3, mode="simulate"):
    model = primary_model
    history=[]
    for c in range(cycles):
        print(f"\n🔄 Active Learning Cycle {c+1}/{cycles}")
        # 1) 选样
        sel_df, _ = select_uncertain_samples(model, primary_tok, BEST_AGENTS, pool_df, labeled_df,
                                             budget=initial_budget, w_entropy=0.6, w_disagree=0.3, w_div=0.1)
        # 2) 标签（真实/模拟）
        new_labeled = get_experimental_labels(sel_df, mode=mode)
        # 3) 合并已标注
        labeled_df = pd.concat([labeled_df, new_labeled], axis=0).drop_duplicates(subset=[SMILES_COL]).reset_index(drop=True)
        # 4) 从池中移除
        used = set(new_labeled[SMILES_COL].astype(str).tolist())
        pool_df = pool_df[~pool_df[SMILES_COL].astype(str).isin(used)].reset_index(drop=True)
        # 5) 增量训练主模型
        model = incremental_training_primary(
            PRIMARY_DIR, model, primary_tok, new_labeled, val_df,
            epochs=1, lr=2e-5, bs=16, wd=0.01, seed=42, save_suffix=f"al_c{c+1}"
        )
        # 6) Test120 评估
        probs_test, preds_test = predict_hf_probs(model, primary_tok, test_df[SMILES_COL].astype(str).tolist())
        met = evaluate_model_df(test_df, preds_test, label_col=LABEL_COL)
        history.append({"cycle": c+1, "test_metrics": met, "pool_remaining": len(pool_df), "labeled_total": len(labeled_df)})
        print(f"[Cycle {c+1}] Test120: MAE={met['MAE']:.3f} ±1={met['within±1']:.3f} QWK={met['QWK']:.3f} | Pool={len(pool_df)} Labeled={len(labeled_df)}")
        if len(pool_df)==0:
            print("Pool is empty. Stop."); break
    return model, labeled_df, pool_df, history

# === 运行一个示例（可根据需要调整 cycles / initial_budget / mode） ===
improved_model, labeled_df_final, pool_df_final, al_history = active_learning_cycle(
    primary_model, primary_tok, pool_df, labeled_df, df_val_for_al, df_test,
    initial_budget=50, cycles=3, mode="simulate"   # 实验回填时改为 mode="real"
)

# 保存历史
al_hist_path = os.path.join(OUT_DIR, "active_learning_history.csv")
pd.DataFrame(al_history).to_csv(al_hist_path, index=False)
print("✅ Active Learning finished. History saved to:", al_hist_path)


In [None]:
#section 5
import os, json, pandas as pd, numpy as np

OUT_DIR = "/content/outputs_full"
sel_path = os.path.join(OUT_DIR, "selection_and_metrics.json")
assert os.path.exists(sel_path), "请先完成主训练脚本运行。"

with open(sel_path, "r", encoding="utf-8") as f:
    summary = json.load(f)

print("== 主模型 Test ==")
print(summary["primary_test"])

print("\n== 各 Agent 单独 Test ==")
for k,v in summary["agents_test"].items():
    print(f"{k:12s} -> MAE={v['MAE']:.3f}  ±1={v['within±1']:.3f}  QWK={v['QWK']:.3f}")

print("\n== 验证集选择的最佳组合 ==")
print(summary["best_agent_combo_on_val"])

print("\n== Test（主模型/各Agent/最佳组合/全部组合） ==")
for k,v in summary["test_reports"].items():
    print(f"{k:25s} -> MAE={v['MAE']:.3f}  ±1={v['within±1']:.3f}  QWK={v['QWK']:.3f}")

print("\n== Verify 后的最终指标（主模型+门卫） ==")
print(summary["verify_test_metrics"])

# 逐样本对照（看看哪些被回退）
df_primary = pd.read_csv(os.path.join(OUT_DIR, "test120_primary_predictions.csv"))
df_agents  = pd.read_csv(os.path.join(OUT_DIR, "test120_agent_predictions.csv"))
df_verify  = pd.read_csv(os.path.join(OUT_DIR, "test120_verify_predictions.csv"))

df = df_primary.merge(df_agents, on=["Structure","true"], how="left") \
               .merge(df_verify[["smiles","final_score","agent_mad","gap","accept_primary"]],
                      left_on="Structure", right_on="smiles", how="left") \
               .drop(columns=["smiles"])
df.rename(columns={"final_score":"verify_final"}, inplace=True)

accept_rate = float(np.mean(df["accept_primary"]))
print(f"\nVerify 接受主模型比例: {accept_rate:.2%}")

print("\n被回退的前10条样本：")
df[df["accept_primary"]==False].head(10)