In [None]:
import pandas as pd
import numpy as np
import torch
from torch import nn
import torch.optim as optim
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error

In [None]:
# Load the host summary table from a CSV file (path is relative to the
# working directory) and preview its first rows.
DATA_PATH = "RABV_N_gene_hosts.csv"
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,Host,He_mean,He_SE,Hew_mean,Hew_SE,n,p_mean,p_SE,dN_mean,dN_SE,...,dNdS_mean,dNdS_SE,Positive_sites,D_R,TMRCA,G_Range,H_Range,Pop_size,Pop_Density,Adult_size
0,Bat (Molossidae),0.901,0.034,0.924,0.034,15,0.114,0.007,0.012,0.001,...,0.023,0.019,2,3.1/2.8,314 (261-373),13001391.28,65.0,183209.09,500500.0,22.4
1,Bat (Phyllostomidae),0.657,0.011,0.673,0.01,46,0.033,0.003,0.004,0.0,...,0.026,0.005,1,1.0/1.2,219 (181-259),17803912.83,6.5,464.92,31.5,33.16
2,Bat (Vespertilionidae) 1,0.822,0.031,0.837,0.025,17,0.062,0.005,0.008,0.0,...,0.029,0.004,1,2.9/1.0,270 (217-323),7981291.04,9.0,17680.07,55.0,15.32
3,Bat (Vespertilionidae) 2,0.85,0.025,0.855,0.036,32,0.1,0.006,0.014,0.0,...,0.034,0.001,5,2.1/0.9,218 (190-247),4859262.54,10.5,86176.36,100.0,15.44
4,Coyote 1,0.425,0.1,0.425,0.1,18,0.011,0.002,0.001,0.0,...,0.023,0.043,0,0.0/0.9,149 (113-186),17099094.3,18.88,47.0,0.25,11989.1


In [None]:
# Columns kept for modelling; every other column of the table is dropped.
cols_to_use = [
    "He_mean", "He_SE", "Hew_mean", "Hew_SE",
    "Pop_size", "Pop_Density", "Adult_size",
    "p_mean", "dN_mean", "dS_mean", "dNdS_mean", "Positive_sites",
]
# A row is discarded in the first pass when any of these is missing.
required_cols = ["He_mean", "p_mean", "dN_mean", "dS_mean", "dNdS_mean", "Positive_sites"]

# Dash placeholders (the second entry looks like a mis-encoded dash) mark
# missing values in the raw table.
df = df.replace(["–", "вЂ–"], np.nan)
df = df.dropna(subset=required_cols)

# Coerce the kept columns to numbers; anything unparsable becomes NaN and the
# affected rows are removed.
df = df[cols_to_use].apply(pd.to_numeric, errors='coerce').dropna()

Определяем предикторы (X) и целевые переменные (y)

X - признаки хозяев

y - показатели вирусной эволюции

In [None]:
# Predictors (X): host characteristics.
X_cols = [
    "He_mean", "He_SE",
    "Pop_size", "Pop_Density", "Adult_size",
]
# Targets (y): measures of viral evolution.
y_cols = [
    "p_mean", "dN_mean", "dS_mean",
    "dNdS_mean", "Positive_sites",
]

# NumPy arrays taken from the same frame, so rows of X and y stay aligned.
X, y = (df[cols].values for cols in (X_cols, y_cols))

Масштабируем данные

In [None]:
# Standardise predictors and targets with separate scalers; scaler_y is kept
# so that predictions can be mapped back to the original units.
# NOTE(review): both scalers are fitted on the complete data set, before any
# train/test split, so rows that are later held out contribute to the
# scaling statistics.
scaler_X = StandardScaler().fit(X)
scaler_y = StandardScaler().fit(y)
X_scaled = scaler_X.transform(X)
y_scaled = scaler_y.transform(y)

Делим данные на тренировочные и тестовые

In [None]:
# Hold out a quarter of the rows (scikit-learn's default share when neither
# test_size nor train_size is given) as a test set; the fixed random_state
# keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y_scaled,
    test_size=0.25,
    random_state=42,
)

Преобразуем numpy массивы в тензоры PyTorch

In [None]:
# Convert the four split arrays to float32 PyTorch tensors in one pass.
X_train, y_train, X_test, y_test = (
    torch.tensor(split, dtype=torch.float32)
    for split in (X_train, y_train, X_test, y_test)
)

Модель

In [None]:
class MultiRegressor(nn.Module):
    """Small feed-forward network that predicts several regression targets at once.

    Architecture: Linear -> ReLU -> Dropout -> Linear -> ReLU -> Linear.

    Args:
        input_dim: Number of input features.
        output_dim: Number of regression targets.
        hidden_dims: Widths of the two hidden layers. Defaults to ``(16, 8)``.
        dropout: Dropout probability applied after the first hidden layer.
            Defaults to ``0.3``.

    Raises:
        ValueError: If ``hidden_dims`` does not contain exactly two entries,
            or if ``dropout`` is rejected by ``nn.Dropout``.
    """

    def __init__(self, input_dim, output_dim, hidden_dims=(16, 8), dropout=0.3):
        super().__init__()
        first_width, second_width = hidden_dims
        # Layer order (and therefore the state-dict keys hidden.0 / hidden.3)
        # is the same for every choice of widths.
        self.hidden = nn.Sequential(
            nn.Linear(input_dim, first_width),    # first hidden layer
            nn.ReLU(),                            # non-linearity
            nn.Dropout(dropout),                  # regularisation
            nn.Linear(first_width, second_width), # second hidden layer
            nn.ReLU(),
        )
        self.output = nn.Linear(second_width, output_dim)  # output layer

    def forward(self, x):
        """Map inputs of shape ``(..., input_dim)`` to outputs of shape ``(..., output_dim)``."""
        return self.output(self.hidden(x))

Настройки

Функция `train_model` обучает простую нейронную сеть (регрессию с несколькими целевыми переменными) на тренировочных данных и возвращает обученную модель и предсказания для валидационного набора.

In [None]:
def train_model(X_train, y_train, X_val, y_val, epochs=300,
                lr=0.001, weight_decay=1e-4, seed=None):
    """Train a MultiRegressor with full-batch Adam and predict on a validation set.

    Args:
        X_train: Training features, 2-D array-like or tensor (rows are samples).
        y_train: Training targets, 2-D array-like or tensor (rows are samples).
        X_val: Validation features with the same number of columns as X_train.
        y_val: Validation targets. Not used by the training itself; kept so
            existing calls keep working.
        epochs: Number of full-batch optimisation steps.
        lr: Adam learning rate.
        weight_decay: Adam weight decay (L2 penalty).
        seed: If given, passed to ``torch.manual_seed`` before the model is
            created, which makes weight initialisation and dropout masks
            repeatable. Note that this resets PyTorch's global RNG.

    Returns:
        Tuple ``(model, y_pred_val)``: the trained model in eval mode and its
        predictions for ``X_val`` as a NumPy array.
    """
    if seed is not None:
        torch.manual_seed(seed)

    # as_tensor accepts NumPy arrays and tensors alike; calling torch.tensor on
    # an existing tensor would emit a copy-construct warning.
    X_train = torch.as_tensor(X_train, dtype=torch.float32)
    y_train = torch.as_tensor(y_train, dtype=torch.float32)
    X_val = torch.as_tensor(X_val, dtype=torch.float32)

    model = MultiRegressor(X_train.shape[1], y_train.shape[1])
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

    model.train()  # dropout active during optimisation
    for _ in range(epochs):
        optimizer.zero_grad()               # clear gradients of the previous step
        y_pred = model(X_train)             # forward pass on the whole training set
        loss = criterion(y_pred, y_train)   # mean squared error
        loss.backward()                     # back-propagate
        optimizer.step()                    # update the weights

    model.eval()  # dropout off for prediction
    with torch.no_grad():
        y_pred_val = model(X_val).numpy()
    return model, y_pred_val

Кросс-валидация (5 фолдов)

In [None]:
# 5-fold cross-validation on the unscaled arrays X and y.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
r2_scores, mae_scores = [], []

for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    # Fit the scalers on the training rows of this fold only, so validation
    # rows do not leak into the scaling statistics.
    fold_scaler_X = StandardScaler().fit(X[train_idx])
    fold_scaler_y = StandardScaler().fit(y[train_idx])

    # Fold-specific names: the hold-out split stored in X_train / y_train is
    # left untouched by this loop.
    X_fold_train = fold_scaler_X.transform(X[train_idx])
    y_fold_train = fold_scaler_y.transform(y[train_idx])
    X_val = fold_scaler_X.transform(X[val_idx])
    y_val = fold_scaler_y.transform(y[val_idx])

    # Seed per fold so weight initialisation and dropout are repeatable.
    torch.manual_seed(42 + fold)
    model, y_pred = train_model(X_fold_train, y_fold_train, X_val, y_val, epochs=500)

    # Predictions go back to the original units; the true values are simply
    # the unscaled validation rows.
    y_pred_real = fold_scaler_y.inverse_transform(y_pred)
    y_true_real = y[val_idx]

    # NOTE(review): MAE is averaged over all targets in their original units,
    # so targets with larger magnitudes dominate the figure.
    r2 = r2_score(y_true_real, y_pred_real, multioutput='uniform_average')
    mae = mean_absolute_error(y_true_real, y_pred_real)
    r2_scores.append(r2)
    mae_scores.append(mae)

print(f"Средний r2: {np.mean(r2_scores):.3f} ± {np.std(r2_scores):.3f}")
print(f"Средний MAE: {np.mean(mae_scores):.3f}")

Средний r2: -1.321 ± 0.859
Средний MAE: 0.321
