In [1]:
import os

import json
import random

import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np

from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import DataLoader, TensorDataset, Dataset

from sklearn.metrics import accuracy_score, roc_auc_score, average_precision_score
from sklearn.metrics import precision_score, recall_score, f1_score, matthews_corrcoef
from xgboost import XGBClassifier

### 设置随机种子

In [2]:
# 设置random seed保证可重复性
seed = 42
random.seed(seed)
os.environ["PYTHONHASHSEED"] = str(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
# For numpy
np.random.seed(seed)
# For deterministic behavior
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

### 提取和存embedding
##### 此处没有对序列做truncation，请确保输入序列长度 ≤ model_max_length (model_max_length = tokenizer.model_max_length)
#### 提取embedding时不涉及到padding，batch内所有序列长度相同

In [3]:
class JSONLDataset(Dataset):
    def __init__(self, file_path):
        super().__init__()
        self.file_path = file_path
        self.data = []

        # 读取 JSONL 文件
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if line:  # 跳过空行
                    item = json.loads(line)
                    self.data.append(item)

        print(f"[INFO] Loaded {len(self.data)} samples from {file_path}")
        
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

In [4]:
def load_dataset_jsonl(dataset_name):
    train_dataset = JSONLDataset(f"./data/{dataset_name}_train.jsonl")
    eval_dataset = JSONLDataset(f"./data/{dataset_name}_eval.jsonl")
    test_dataset = JSONLDataset(f"./data/{dataset_name}_test.jsonl")
        
    return train_dataset, eval_dataset, test_dataset

In [5]:
def collate_fn(batch, tokenizer):
    # labels of cage is saved in a string of list
    labels = [int(item["label"]) for item in batch]
    labels = torch.tensor(labels)
    sequences = [item["seq"] for item in batch] 
    encoding = tokenizer(sequences, padding=True, return_tensors="pt")

    return encoding, labels

In [6]:
# -------------------------------
# 提取 embedding（mean pooling last_hidden_state）
# -------------------------------
def calc_embeddings(hf_inputs, model, device):
    hf_inputs = {k: v.to(device) for k, v in hf_inputs.items()}
    mask = hf_inputs.get(
        "attention_mask", 
        torch.ones_like(hf_inputs['input_ids'])
    ).to(device)
    with torch.no_grad():
        outputs = model(**hf_inputs, output_hidden_states=True)
        last_hidden = outputs.hidden_states[-1]  # [B, T, H]
        mask = mask.unsqueeze(-1)  # [B, T, 1]
        pooled = (last_hidden * mask).sum(1) / mask.sum(1) #只对有效token做mean pooling [B, H]
        pooled = pooled.float()  # [B, H]
    return pooled

def extract_embeddings(model, tokenizer, dataloader, device):
    all_embeddings = []
    all_labels = []
    for batch in tqdm(dataloader, desc="Embedding"):
        inputs, labels = batch
        pooled = calc_embeddings(inputs, model, device)
        all_embeddings.append(pooled.detach().cpu())
        all_labels.append(labels)
    return torch.cat(all_embeddings), torch.cat(all_labels)
        

In [7]:
# -------------------------------
# 存embedding
# -------------------------------

def _process_embeddings_and_save(
    model,
    tokenizer,
    dataset,
    data_save_path,
    device,
): 
    data_loader = DataLoader(
        dataset, batch_size=1, shuffle=False,
        collate_fn=lambda x: collate_fn(x, tokenizer),
        pin_memory=True,
    )
    
    embedding_data = extract_embeddings(model, tokenizer, data_loader, device)
    X, y = embedding_data
    torch.save({
        "embeddings": X.detach().cpu(),
        "labels": y.detach().cpu(),
    }, data_save_path)

def extract_embeddings_on_dataset(model, tokenizer, dataset_name, device, embedding_dir, model_name):
    train_dataset, eval_dataset, test_dataset = load_dataset_jsonl(dataset_name)
    
    #columns = test_dataset.column_names
    #print(columns)
    
    if not os.path.exists(embedding_dir):
        os.makedirs(f"{embedding_dir}")
        
    data_train_path = f"{embedding_dir}/{dataset_name}_train.pt" if os.path.exists(embedding_dir) else f"./{dataset_name}_train.pt"
    _process_embeddings_and_save(model, tokenizer, train_dataset, data_train_path, device)
    
    data_eval_path = f"{embedding_dir}/{dataset_name}_eval.pt" if os.path.exists(embedding_dir) else f"./{dataset_name}_eval.pt"
    _process_embeddings_and_save(model, tokenizer, eval_dataset, data_eval_path, device)
    
    data_test_path = f"{embedding_dir}/{dataset_name}_test.pt" if os.path.exists(embedding_dir) else f"./{dataset_name}_test.pt"
    _process_embeddings_and_save(model, tokenizer, test_dataset, data_test_path, device)

In [8]:
# -------------------------------
# embedding 提取主流程
# -------------------------------
model_name = "Genos-1.2B"
model_path = "/data/model/Genos-1.2B"
datasets = ["Human_classify_8k"]

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(f"[INFO] Loading model from {model_path}")
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModel.from_pretrained(
    model_path,
    # 如果环境中不支持，可以注释掉下面这行
    attn_implementation="flash_attention_2",
    device_map="auto",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True
)
model.eval()

embedding_dir = f"./embedding/{model_name}/"

for dataset_name in datasets:
    extract_embeddings_on_dataset(
        model,
        tokenizer,
        dataset_name,
        device,
        embedding_dir,
        model_name
    )

[INFO] Loading model from /data/model/Genos-1.2B
[INFO] Loaded 150 samples from ./data/Human_classify_8k_train.jsonl
[INFO] Loaded 150 samples from ./data/Human_classify_8k_eval.jsonl
[INFO] Loaded 150 samples from ./data/Human_classify_8k_test.jsonl


Embedding: 100%|██████████| 150/150 [00:12<00:00, 12.19it/s]
Embedding: 100%|██████████| 150/150 [00:11<00:00, 12.61it/s]
Embedding: 100%|██████████| 150/150 [00:11<00:00, 12.60it/s]


### 加载embedding，并训练XGboost分类器

In [9]:
def train_xgboost_classifier(X_train, y_train, X_test, random_state=42):
    """训练XGBoost分类器"""
    X_train_np = X_train.cpu().numpy() if torch.is_tensor(X_train) else X_train
    y_train_np = y_train.cpu().numpy() if torch.is_tensor(y_train) else y_train
    X_test_np = X_test.cpu().numpy() if torch.is_tensor(X_test) else X_test
    
    print("Training XGBoost classifier...")
    print("XGboost parameters: n_estimators=100, learning_rate=0.1, max_depth=6, random_state={}".format(random_state))
    xgb = XGBClassifier(
        n_estimators=100,
        random_state=random_state,
        # use_label_encoder=False, 
        eval_metric='mlogloss',
        learning_rate=0.1,
        max_depth=6        
    )
    xgb.fit(X_train_np, y_train_np)
    print("XGBoost training completed")
    
    probs = xgb.predict_proba(X_test_np)
    preds = xgb.predict(X_test_np)
    return preds, probs

In [10]:
def evaluate_model_on_dataset(dataset_name, device, embedding_dir, method='xgboost'):
    
    data_train_path = f"{embedding_dir}/{dataset_name}_train.pt"
    data_eval_path = f"{embedding_dir}/{dataset_name}_eval.pt"
    data_test_path = f"{embedding_dir}/{dataset_name}_test.pt"

    print(f"[INFO] Loading train data from {data_train_path}")
    train_data = torch.load(data_train_path)
    X_train = train_data["embeddings"]
    y_train = train_data["labels"]

    print(f"[INFO] Loading validation data from {data_eval_path}")
    eval_data = torch.load(data_eval_path)
    X_val = eval_data["embeddings"]
    y_val = eval_data["labels"]

    print(f"[INFO] Loading test data from {data_test_path}")
    test_data = torch.load(data_test_path)
    X_test = test_data["embeddings"]
    y_test = test_data["labels"]

    print(f"\n[INFO] Label distribution:")
    train_counts = np.bincount(y_train.numpy())
    val_counts = np.bincount(y_val.numpy())
    test_counts = np.bincount(y_test.numpy())
    
    print(f"Train set: {train_counts} (total: {len(y_train)})")
    print(f"Validation set: {val_counts} (total: {len(y_val)})")
    print(f"Test set: {test_counts} (total: {len(y_test)})")
    
    # 因为Xgboost不使用验证集，因此仅使用原始训练集和测试集
    print(f"\n[INFO] Since XGboost does not use validation set, using train and test sets without validation set.")
    print(f"Train set size: {len(X_train)}")
    print(f"Test set size: {len(X_test)}")

    # 使用训练集训练，在处理后的测试集上评估
    y_pred, y_probs = train_xgboost_classifier(
            X_train, y_train, X_test,
            random_state=42
        )
    y_true = y_test.numpy()


    # 计算评估指标
    acc = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='macro', zero_division=0)
    recall = recall_score(y_true, y_pred, average='macro', zero_division=0)
    f1 = f1_score(y_true, y_pred, average='macro', zero_division=0)

    try:
        mcc = matthews_corrcoef(y_true, y_pred)
    except:
        mcc = 0.0

    try:
        if y_probs.shape[1] == 2:
            auc = roc_auc_score(y_true, y_probs[:, 1])
            auprc = average_precision_score(y_true, y_probs[:, 1])
        else:
            auc = roc_auc_score(y_true, y_probs, multi_class='ovr', average='macro')
            auprc = average_precision_score(y_true, y_probs, average='macro')
    except Exception as e:
        print(f"Error calculating AUC/AUPRC: {e}")
        auc = 0.0
        auprc = 0.0

    # 打印每个类别的正确率
    print(f"\n[INFO] Per-class accuracy:")
    for class_idx in range(4):
        class_mask = (y_true == class_idx)
        if np.sum(class_mask) > 0:
            class_correct = np.sum((y_pred[class_mask] == class_idx))
            class_accuracy = class_correct / np.sum(class_mask)
            print(f"  Class {class_idx}: {class_accuracy:.4f} ({class_correct}/{np.sum(class_mask)})")
        else:
            print(f"  Class {class_idx}: No samples in test set")

    # 打印错误预测分布
    print(f"\n[INFO] Error prediction distribution:")
    for true_class in range(4):
        true_class_mask = (y_true == true_class)
        true_class_indices = np.where(true_class_mask)[0]

        if len(true_class_indices) > 0:
            preds_for_true_class = y_pred[true_class_mask]
            wrong_pred_mask = (preds_for_true_class != true_class)
            wrong_preds = preds_for_true_class[wrong_pred_mask]

            if len(wrong_preds) > 0:
                error_counts = np.bincount(wrong_preds, minlength=4)
                total_errors = len(wrong_preds)
                print(f"  For true class {true_class} (errors: {total_errors}/{len(true_class_indices)}):")
                for pred_class in range(4):
                    if pred_class != true_class and error_counts[pred_class] > 0:
                        percentage = (error_counts[pred_class] / total_errors) * 100
                        print(f"    → Predicted as class {pred_class}: {error_counts[pred_class]} ({percentage:.1f}%)")
            else:
                print(f"  For true class {true_class}: No prediction errors")
        else:
            print(f"  For true class {true_class}: No samples in test set")

    return acc, auc, auprc, f1, mcc, precision, recall

In [11]:
# -------------------------------
# XGboost 分类主流程
# -------------------------------
datasets = ["Human_classify_8k"]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


for dataset in datasets:
    acc, auc, auprc, f1, mcc, precision, recall = evaluate_model_on_dataset(dataset, 
                                                device, embedding_dir, method='xgboost')
print(f"\nCompleted {dataset} task with XGboost: Acc={acc:.4f}, AUC={auc:.4f}, F1={f1:.4f}")

[INFO] Loading train data from ./embedding/Genos-1.2B//Human_classify_8k_train.pt
[INFO] Loading validation data from ./embedding/Genos-1.2B//Human_classify_8k_eval.pt
[INFO] Loading test data from ./embedding/Genos-1.2B//Human_classify_8k_test.pt

[INFO] Label distribution:
Train set: [50 50 50] (total: 150)
Validation set: [50 50 50] (total: 150)
Test set: [50 50 50] (total: 150)

[INFO] Since XGboost does not use validation set, using train and test sets without validation set.
Train set size: 150
Test set size: 150
Training XGBoost classifier...
XGboost parameters: n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42
XGBoost training completed

[INFO] Per-class accuracy:
  Class 0: 0.1800 (9/50)
  Class 1: 0.2800 (14/50)
  Class 2: 0.4200 (21/50)
  Class 3: No samples in test set

[INFO] Error prediction distribution:
  For true class 0 (errors: 41/50):
    → Predicted as class 1: 20 (48.8%)
    → Predicted as class 2: 21 (51.2%)
  For true class 1 (errors: 36/50):
   