In [1]:
import model
import torch
import torch.nn as nn
import torch.optim as optim

In [2]:
# Load the precomputed TF and DNA embeddings from .pt files.
# map_location='cpu' keeps loading working on machines without CUDA even if the
# tensors were saved from a GPU; batches are moved to the training device later,
# inside the training loop.
tf_data = torch.load('../embedding/training_set_tf_embedding_v3.pt', map_location='cpu')
dna_data = torch.load('../embedding/training_set_DNA_embedding_v3.pt', map_location='cpu')

In [3]:
# Sanity check: tf_data should be a 2-D tensor, one row of TF embedding features per sample.
tf_data.shape

torch.Size([34993, 960])

In [4]:
# Sanity check: dna_data is a sequence holding one embedding tensor per sample;
# show the number of samples and the shape of the first entry.
len(dna_data), dna_data[0].shape

(34993, torch.Size([768]))

In [5]:
# Tensor.to() returns a (possibly new) tensor and never moves the original in
# place, so the result must be rebound for the move to take effect.
tf_data = tf_data.to('cpu')
tf_data

tensor([[ 0.0016, -0.0211, -0.0074,  ...,  0.0042,  0.0009, -0.0138],
        [ 0.0027,  0.0032, -0.0059,  ..., -0.0022, -0.0154, -0.0133],
        [ 0.0018, -0.0055, -0.0051,  ..., -0.0117, -0.0155, -0.0179],
        ...,
        [-0.0026, -0.0026,  0.0052,  ..., -0.0017, -0.0170, -0.0017],
        [-0.0010, -0.0038,  0.0036,  ..., -0.0051, -0.0160, -0.0200],
        [-0.0010, -0.0038,  0.0036,  ..., -0.0051, -0.0160, -0.0200]])

In [6]:
# Get the dimensions of the first tensor to determine shape
feature_dim = dna_data[0].size(0)
num_samples = len(dna_data)

# Stack the per-sample embeddings into a single (num_samples, feature_dim)
# float32 CPU tensor in one call instead of copying row by row in a Python loop.
# detach() guards against embeddings that were saved with autograd history.
dna_tensor = torch.stack(
    [sample.detach().to(device='cpu', dtype=torch.float32) for sample in dna_data]
)

# Fail fast if the stacked result does not have the layout the later cells rely on.
assert dna_tensor.shape == (num_samples, feature_dim), (
    f"unexpected DNA tensor shape: {tuple(dna_tensor.shape)}"
)

# Verify shape
print(f"DNA tensor shape: {dna_tensor.shape}")

DNA tensor shape: torch.Size([34993, 768])


In [8]:
import pandas as pd

# Read the training table once and take both columns from the same frame, so
# labels and group ids are guaranteed to be row-aligned with each other.
train_df = pd.read_csv('../dataset/train_set_v3.csv')

# Binary target used for training and evaluation.
labels = train_df['label']
# TF name per row; used as the grouping key for the cross-validation split.
protein_id = train_df['TF name']

In [9]:
from sklearn.model_selection import KFold, StratifiedKFold, StratifiedGroupKFold
from sklearn.metrics import roc_auc_score
import numpy as np
import torch
from torch.optim.lr_scheduler import CosineAnnealingLR

# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Cross-validation settings: stratified on the label and grouped by TF name so
# that all rows of one TF land in the same fold.
num_folds = 5
group_kf = StratifiedGroupKFold(n_splits=num_folds, shuffle=True, random_state=42)

# Per-fold metrics, plus the pooled labels/probabilities for an overall ROC AUC.
fold_results = []
fold_auc_scores = []
all_true_labels = []
all_pred_probs = []

# Learning rate warmup parameters
# NOTE(review): warmup_steps is not consumed by the cosine schedule used in this
# cell, so no warmup is actually applied; total_steps is recomputed per fold.
warmup_steps = 100  # Number of iterations for warmup
total_steps = 0  # Will be calculated based on epochs and batches

# The split indices are applied positionally to all three inputs, so they must
# have the same number of rows; fail fast instead of silently mis-pairing them.
if not (dna_tensor.shape[0] == tf_data.shape[0] == len(labels)):
    raise ValueError(
        "Row count mismatch: "
        f"dna_tensor={dna_tensor.shape[0]}, tf_data={tf_data.shape[0]}, labels={len(labels)}"
    )

# Convert the labels Series to a float tensor. Going through to_numpy() avoids
# torch iterating the Series element by element via label-based indexing, which
# is slow and depends on the index being the default 0..n-1 range.
labels_tensor = torch.tensor(labels.to_numpy(), dtype=torch.float32)

# Cross-validation loop: for each fold, train a fresh classifier on the training
# split and evaluate accuracy and ROC AUC on the held-out split.
# NOTE: `classifier` is rebound on every iteration, so after the loop only the
# model from the last fold remains in scope.
for fold, (train_idx, val_idx) in enumerate(group_kf.split(dna_tensor, labels, groups=protein_id)):
    print(f"\n==== Fold {fold+1}/{num_folds} ====")

    # Seed torch per fold so weight initialisation and batch shuffling are
    # repeatable across runs (the splitter itself is already seeded with 42).
    # Some GPU kernels may still be non-deterministic.
    fold_seed = 42 + fold
    torch.manual_seed(fold_seed)
    shuffle_generator = torch.Generator().manual_seed(fold_seed)

    # Split every input into training and validation parts using the fold indices
    dna_train, dna_val = dna_tensor[train_idx], dna_tensor[val_idx]
    protein_train, protein_val = tf_data[train_idx], tf_data[val_idx]
    labels_train, labels_val = labels_tensor[train_idx], labels_tensor[val_idx]

    # Wrap the splits in datasets / data loaders
    train_dataset = torch.utils.data.TensorDataset(dna_train, protein_train, labels_train)
    val_dataset = torch.utils.data.TensorDataset(dna_val, protein_val, labels_val)

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=32, shuffle=True, generator=shuffle_generator
    )
    val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=32, shuffle=False)

    # Initialise a fresh model, loss and optimizer for this fold.
    # BCELoss expects the model to emit probabilities in [0, 1].
    classifier = model.DNAProteinClassifier().to(device)
    criterion = nn.BCELoss()
    optimizer = optim.Adam(classifier.parameters(), lr=1e-5)

    # Calculate total steps for the scheduler
    num_epochs = 30
    total_steps = len(train_loader) * num_epochs

    # Create cosine annealing scheduler; it is stepped once per batch, so T_max
    # is the total number of optimizer steps over all epochs.
    scheduler = CosineAnnealingLR(optimizer, T_max=total_steps)

    # Train the model
    for epoch in range(num_epochs):
        classifier.train()
        total_loss = 0
        # Learning rate at the start of the epoch (the value that gets logged below)
        current_lr = optimizer.param_groups[0]['lr']

        for dna_batch, protein_batch, label_batch in train_loader:
            dna_batch, protein_batch, label_batch = dna_batch.to(device), protein_batch.to(device), label_batch.to(device)

            optimizer.zero_grad()
            outputs = classifier(dna_batch, protein_batch)
            loss = criterion(outputs, label_batch)
            loss.backward()
            optimizer.step()
            scheduler.step()  # Update learning rate

            total_loss += loss.item()

        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(train_loader):.4f}, LR: {current_lr:.6f}")

    # Evaluate the model on the held-out fold
    classifier.eval()
    correct, total = 0, 0
    true_labels = []
    pred_probs = []

    with torch.no_grad():
        for dna_batch, protein_batch, label_batch in val_loader:
            dna_batch, protein_batch, label_batch = dna_batch.to(device), protein_batch.to(device), label_batch.to(device)
            outputs = classifier(dna_batch, protein_batch)
            # Threshold the predicted probability at 0.5 for the accuracy metric
            predictions = (outputs > 0.5).float()
            correct += (predictions == label_batch).sum().item()
            total += label_batch.size(0)

            # Collect true labels and prediction probabilities for ROC AUC calculation
            true_labels.extend(label_batch.cpu().numpy())
            pred_probs.extend(outputs.cpu().numpy())

    # Calculate metrics
    accuracy = correct / total
    auc_score = roc_auc_score(true_labels, pred_probs)

    print(f"Validation Accuracy (Fold {fold+1}): {accuracy * 100:.2f}%")
    print(f"ROC AUC Score (Fold {fold+1}): {auc_score:.4f}")

    fold_results.append(accuracy)
    fold_auc_scores.append(auc_score)

    # Store for overall ROC AUC
    all_true_labels.extend(true_labels)
    all_pred_probs.extend(pred_probs)

# Report the mean accuracy and mean AUC across folds
print(f"\n=== Cross Validation Mean Accuracy: {np.mean(fold_results) * 100:.2f}% ===")
print(f"=== Cross Validation Mean ROC AUC: {np.mean(fold_auc_scores):.4f} ===")

# Calculate overall ROC AUC from all folds combined
overall_auc = roc_auc_score(all_true_labels, all_pred_probs)
print(f"=== Overall ROC AUC (all folds combined): {overall_auc:.4f} ===")


==== Fold 1/5 ====
Epoch 1/30, Loss: 0.5400, LR: 0.000010
Epoch 2/30, Loss: 0.4284, LR: 0.000010
Epoch 3/30, Loss: 0.3796, LR: 0.000010
Epoch 4/30, Loss: 0.3430, LR: 0.000010
Epoch 5/30, Loss: 0.3197, LR: 0.000010
Epoch 6/30, Loss: 0.2934, LR: 0.000009
Epoch 7/30, Loss: 0.2731, LR: 0.000009
Epoch 8/30, Loss: 0.2499, LR: 0.000009
Epoch 9/30, Loss: 0.2369, LR: 0.000008
Epoch 10/30, Loss: 0.2164, LR: 0.000008
Epoch 11/30, Loss: 0.1997, LR: 0.000008
Epoch 12/30, Loss: 0.1873, LR: 0.000007
Epoch 13/30, Loss: 0.1735, LR: 0.000007
Epoch 14/30, Loss: 0.1616, LR: 0.000006
Epoch 15/30, Loss: 0.1488, LR: 0.000006
Epoch 16/30, Loss: 0.1416, LR: 0.000005
Epoch 17/30, Loss: 0.1292, LR: 0.000004
Epoch 18/30, Loss: 0.1216, LR: 0.000004
Epoch 19/30, Loss: 0.1142, LR: 0.000003
Epoch 20/30, Loss: 0.1036, LR: 0.000003
Epoch 21/30, Loss: 0.1025, LR: 0.000003
Epoch 22/30, Loss: 0.0959, LR: 0.000002
Epoch 23/30, Loss: 0.0904, LR: 0.000002
Epoch 24/30, Loss: 0.0895, LR: 0.000001
Epoch 25/30, Loss: 0.0839, LR

In [None]:
# data V3: new training set with pos:neg = 1:2; negative samples are generated by shuffling across different species

# Model R1
# === Cross Validation Mean Accuracy: 91.98% ===
# === Cross Validation Mean ROC AUC: 0.9623 ===
# === Overall ROC AUC (all folds combined): 0.9599 ===

# Model R2 
# === Cross Validation Mean Accuracy: 76.46% ===
# === Cross Validation Mean ROC AUC: 0.7996 ===
# === Overall ROC AUC (all folds combined): 0.7982 ===


In [10]:
from pathlib import Path

# Save the model to a local file.
# NOTE: `classifier` is whatever the cross-validation cell left bound last, i.e.
# the model trained on the final fold's training split only, not a model fit on
# the full training set.
model_path = '../models_v3/dna_protein_classifier_v3r2.pth'
# torch.save does not create missing directories, so make sure the target exists.
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
torch.save(classifier.state_dict(), model_path)
print(f"Model saved to {model_path}")

# To save the entire model (not just state_dict).
# This pickles the module object itself, so loading it later requires the same
# `model` module to be importable; only load such files from trusted sources.
full_model_path = '../models_v3/dna_protein_classifier_full_v3r2.pt'
Path(full_model_path).parent.mkdir(parents=True, exist_ok=True)
torch.save(classifier, full_model_path)
print(f"Full model saved to {full_model_path}")

Model saved to ../models_v3/dna_protein_classifier_v3r2.pth
Full model saved to ../models_v3/dna_protein_classifier_full_v3r2.pt
