In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.optim import Adam
from torch.utils.data import DataLoader, TensorDataset
from torch.cuda.amp import autocast, GradScaler
from sklearn.model_selection import train_test_split
from torch.utils.tensorboard import SummaryWriter
from sklearn.preprocessing import StandardScaler
from concurrent.futures import ThreadPoolExecutor
from joblib import dump, load

In [4]:
# Load the genotype_fitness_data.tsv file
file_path = 'C:/Users/Thomascrx/Desktop/ml_code/sequence_points_file.csv'  # Replace with your file path
genotype_fitness_data = pd.read_csv(file_path)
print(genotype_fitness_data.head())

                                            sequence  delta_log10Ka
0  NITNLCPFGEVFNATRFASVYCWNRKRISNCVADYSVLYNSASFST...          -2.05
1  NITNLCPFGEVFFATRFASVYAWNRKRISNCVADYSVLYNSASFST...          -0.42
2  NITNLCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSASFST...          -4.76
3  NITNLCPFGEVFNATRFVSVYAWNRKRISNCVADYSVLYNSASFST...          -0.61
4  NITNLCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSASFST...          -1.15


In [10]:
def cut_to_provec(sequence, num):
    def split_overlapping(s, window_size):
        return [s[i:i + window_size] for i in range(len(s) - window_size + 1)]
    return [split_overlapping(i, window_size=num) for i in sequence]

sequence = genotype_fitness_data["sequence"]
sequence = cut_to_provec(sequence, 3)
provec = pd.read_csv("C:/Users/Thomascrx/Desktop/protVec_100d_3grams.csv", sep="\t")

columns = list(provec.columns)
columns.pop(0)
columns_to_merge = columns
provec['Merged'] = provec[columns_to_merge].apply(lambda row: row.tolist(), axis=1)
provec.drop(columns=columns_to_merge, inplace=True)
provec_dict = provec.set_index('words')['Merged'].to_dict()

def provec_encode_aa(sequence):
    return np.array([provec_dict[aa] for aa in sequence])

X = np.array([provec_encode_aa(aa) for aa in sequence])
y = genotype_fitness_data['delta_log10Ka'].values
print(f"X shape: {X.shape}")

X shape: (136204, 199, 100)


In [14]:
# Standardize X
scaler = StandardScaler()
X = scaler.fit_transform(X.reshape(-1, X.shape[-1])).reshape(X.shape)

array([[-0.64091236, -0.73472378,  0.38318452, ..., -0.23630753,
         0.85832559, -1.05736203],
       [ 1.29960596, -1.84373196, -0.02712866, ..., -1.21963456,
        -2.42604606, -1.81729888],
       [ 2.72284051,  1.74630444, -0.36018767, ..., -0.05598329,
        -1.71761786, -2.15516764],
       ...,
       [ 0.90332009,  1.12095939,  0.27667378, ..., -2.50335607,
         2.02559375, -0.71210782],
       [ 0.03582682,  0.00756962,  0.38211474, ..., -1.07164549,
         1.6623494 ,  0.39921289],
       [ 1.00999226,  0.60483684, -2.00419012, ..., -0.56204786,
         0.67772929,  1.52278413]])

In [18]:
# Standardize y
y_mean = y.mean()
y_std = y.std()
y = (y - y_mean) / y_std

array([ 1.04974056, -1.14949301,  0.95346075,  0.6798234 , -1.14949301,
        0.15281811, -1.14949301,  0.88758509,  1.21696339, -1.14949301,
        0.79637264, -1.14949301,  0.18828962,  1.26256962,  0.95346075,
        0.10721189, -1.14949301, -0.32858094,  0.25416528,  1.23723283,
       -0.01440472,  0.37071453,  1.16122245,  0.54807208,  1.22709811,
       -1.14949301, -1.14949301,  1.24736754,  0.75076641,  0.39605132,
        1.20176132,  0.92812396,  0.78623792,  0.02106679, -0.07014566,
        0.51766792,  1.24230019, -1.14949301,  0.96866283,  1.29297377,
        0.2643    ,  1.24230019, -1.14949301, -1.14949301,  1.21189604,
        0.15281811,  0.28456943,  0.49233113,  1.26256962,  0.62408245,
       -1.14949301, -1.14949301,  0.81157472, -1.14949301,  1.26763698,
        0.34537774, -0.63262245,  1.26256962,  0.01599944,  0.77610321,
        0.76596849, -1.14949301, -1.14949301,  1.04467321, -1.14949301,
        0.29977151,  1.15108773,  0.52273528, -0.55661207,  0.05

In [20]:
# # 保存变量
# dump([X,y], 'variables.joblib')

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
print(f"X_train_tensor shape: {X_train_tensor.shape}")

y_train_tensor = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).view(-1, 1)
print(f"y_train_tensor shape after view: {y_train_tensor.shape}")

train_data = TensorDataset(X_train_tensor, y_train_tensor)
test_data = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
test_loader = DataLoader(test_data, batch_size=32, shuffle=False)

X_train_tensor shape: torch.Size([108963, 199, 100])
y_train_tensor shape after view: torch.Size([108963, 1])


In [8]:
class Config:
    seq_len = 199          # 序列长度
    vocab_size = 100       # 新的编码维度
    d_model = 512         # 嵌入维度
    nhead = 8             # 注意力头数
    num_layers = 6       # 编码器层数
    dim_feedforward = 1024 # 前馈网络维度
    dropout = 0.1         # Dropout 比率
    batch_size = 32       # Batch Size（根据 GPU 显存调整）
    lr = 0.005           # 学习率
    weight_decay = 0.01   # 权重衰减
    epochs = 100          # 训练轮数
    grad_clip = 5.0       # 梯度裁剪阈值
    log_dir = "runs/exp1" # TensorBoard 日志目录

class ProteinTransformer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.embedding = nn.Linear(config.vocab_size, config.d_model)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=config.d_model,
            nhead=config.nhead,
            dim_feedforward=config.dim_feedforward,
            dropout=config.dropout,
            batch_first=True  # Set batch_first to True
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=config.num_layers)
        self.fc = nn.Linear(config.seq_len * config.d_model, 1)

    def forward(self, x):
        x = self.embedding(x)          # (batch, 199, 100) → (batch, 199, 256)
        x = self.encoder(x)            # (batch, 199, 256)
        x = x.reshape(x.size(0), -1)   # (batch, 199 * 256=50944)
        return self.fc(x)              # (batch, 1)

config = Config()
model = ProteinTransformer(config)
model = model.to("cuda")

if torch.cuda.device_count() > 1:
    model = nn.DataParallel(model)
    print(f"Using {torch.cuda.device_count()} GPUs")

optimizer = Adam(model.parameters(), lr=config.lr, weight_decay=config.weight_decay)
criterion = nn.MSELoss()
scaler = GradScaler()

writer = SummaryWriter(log_dir=config.log_dir)

dataset = TensorDataset(torch.randn(136204, 199, 100), torch.randn(136204, 1))
dataloader = DataLoader(dataset, batch_size=config.batch_size, shuffle=True, num_workers=4, pin_memory=True)

for epoch in range(config.epochs):
    model.train()
    total_loss = 0.0
    
    for batch_idx, (batch_X, batch_y) in enumerate(dataloader):
        batch_X, batch_y = batch_X.to("cuda"), batch_y.to("cuda")
        
        optimizer.zero_grad()
        
        with autocast():
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
        
        scaler.scale(loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_clip)
        scaler.step(optimizer)
        scaler.update()
        
        current_step = epoch * len(dataloader) + batch_idx
        if batch_idx % 10 == 0:
            writer.add_scalar("Loss/train (batch)", loss.item(), current_step)
        
        total_loss += loss.item()
    
    avg_loss = total_loss / len(dataloader)
    writer.add_scalar("Loss/train (epoch)", avg_loss, epoch)
    print(f"Epoch {epoch}, Avg Loss: {avg_loss:.4f}")

writer.close()

Epoch 0, Avg Loss: 1.1648
Epoch 1, Avg Loss: 0.9974
Epoch 2, Avg Loss: 0.9971
Epoch 3, Avg Loss: 0.9971
Epoch 4, Avg Loss: 0.9971
Epoch 5, Avg Loss: 0.9970
Epoch 6, Avg Loss: 0.9971


Exception ignored in: <function _releaseLock at 0x7fce4177de40>
Traceback (most recent call last):
  File "/root/miniconda3/lib/python3.12/logging/__init__.py", line 243, in _releaseLock
    def _releaseLock():
    
KeyboardInterrupt: 


RuntimeError: DataLoader worker (pid(s) 5506, 5508) exited unexpectedly