### XGBoost and DNN

In [None]:
import pandas as pd

# Load the dataset into a DataFrame from a CSV file in the working directory.
# NOTE(review): later cells select every column starting with "GO:" as a label
# and exclude a column named "sequence" from the features, so this file is
# presumably expected to follow that layout — confirm against the data.
df = pd.read_csv("final_dataset_frequency_4.csv")

In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
from sklearn.multioutput import MultiOutputClassifier

# Identify the columns.
# Every column whose name starts with "GO:" is a label; every remaining column
# except "sequence" is used as a feature. The label list is built once and the
# exclusion set gives O(1) membership tests; the previous nested comprehension
# rebuilt the GO-term list for each column, which was quadratic in the number
# of columns. Column order in both lists is unchanged.
go_terms = [col for col in df.columns if col.startswith("GO:")]
excluded_columns = {"sequence", *go_terms}
features = [col for col in df.columns if col not in excluded_columns]

# Separate features (X) and targets (y)
X = df[features]  # All columns except "sequence" and the GO terms
y = df[go_terms]  # One column per GO term

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Base estimator: one XGBoost classifier, cloned per label by the wrapper below
xgb = XGBClassifier(eval_metric="logloss", random_state=42)

# MultiOutputClassifier fits an independent copy of the estimator for each
# target column, which turns the binary classifier into a multi-label one
multi_output_clf = MultiOutputClassifier(xgb)

# Train the model
multi_output_clf.fit(X_train, y_train)

# Predict a 0/1 decision for every GO term of every test row
y_pred = multi_output_clf.predict(X_test)

# Evaluate the model; zero_division=0 reports 0 instead of warning when a
# label has no predicted (or no true) positives in the test split
print("\nClassification Report (per GO term):")
print(classification_report(y_test, y_pred, target_names=go_terms, zero_division=0))



Classification Report (per GO term):
              precision    recall  f1-score   support

  GO:0005737       1.00      0.99      0.99       667
  GO:0005524       1.00      0.99      1.00       625
  GO:0005829       0.99      0.99      0.99       607
  GO:0046872       1.00      0.99      1.00       393
  GO:0000287       1.00      0.99      0.99       276
  GO:0005886       1.00      1.00      1.00       228
  GO:0016310       0.98      0.97      0.98       189
  GO:0008270       1.00      0.98      0.99       166
  GO:0071555       0.98      0.97      0.98       125
  GO:0009252       1.00      0.97      0.99       116
  GO:0008360       1.00      0.98      0.99       116
  GO:0051539       1.00      1.00      1.00       120
  GO:0003677       1.00      0.96      0.98       104
  GO:0016887       1.00      0.96      0.98       108
  GO:0051301       0.98      0.99      0.98        95
  GO:0000049       0.99      0.98      0.98        83
  GO:0030170       1.00      0.99      0.99

In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Load the data
# NOTE(review): the first cell of this notebook reads
# "final_dataset_frequency_4.csv" while this cell reads "final_dataset.csv" —
# confirm that the two models are meant to use different files.
df = pd.read_csv("final_dataset.csv")

# Identify features and targets.
# "GO:"-prefixed columns are the labels; everything else except "sequence" is
# a feature. The exclusion set avoids rebuilding the GO-term list per column.
go_terms = [col for col in df.columns if col.startswith("GO:")]
excluded_columns = {"sequence", *go_terms}
features = [col for col in df.columns if col not in excluded_columns]

X = df[features].values  # Feature matrix (unscaled)
y = df[go_terms].values  # Target matrix

# Split the data into train (70%), validation (15%) and test (15%) sets.
# The split happens BEFORE scaling so that the scaler never sees validation or
# test rows; fitting it on the full dataset leaks their mean/variance into
# training and makes the held-out metrics optimistic.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Scale the features: statistics come from the training split only and are
# then applied unchanged to the validation and test splits. `X` itself is left
# unscaled.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# Convert data to PyTorch tensors and move to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def _to_device_tensor(array):
    """Return `array` as a float32 tensor placed on `device`."""
    return torch.tensor(array, dtype=torch.float32).to(device)


X_train_tensor = _to_device_tensor(X_train)
X_val_tensor = _to_device_tensor(X_val)
X_test_tensor = _to_device_tensor(X_test)
# Targets are float32 as well because BCELoss expects float targets
y_train_tensor = _to_device_tensor(y_train)
y_val_tensor = _to_device_tensor(y_val)
y_test_tensor = _to_device_tensor(y_test)

# Create DataLoader for batching
batch_size = 32
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
# Only the training data is shuffled; evaluation order stays deterministic
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Define the DNN model
class DNN(nn.Module):
    """Fully connected multi-label classifier with two hidden layers.

    Layer sizes are ``input_dim -> 128 -> 64 -> output_dim``. Each hidden
    layer is followed by ReLU and dropout (p=0.3); the output layer is passed
    through a sigmoid, so every output is an independent value in [0, 1].

    Args:
        input_dim: Number of input features per sample.
        output_dim: Number of labels predicted per sample.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        # Attribute names determine the state_dict keys of saved checkpoints
        self.layer1 = nn.Linear(input_dim, 128)
        self.layer2 = nn.Linear(128, 64)
        self.layer3 = nn.Linear(64, output_dim)
        # Parameter-free modules; the single dropout module is applied twice
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map a ``(batch, input_dim)`` tensor to ``(batch, output_dim)`` scores."""
        hidden = self.dropout(self.relu(self.layer1(x)))
        hidden = self.dropout(self.relu(self.layer2(hidden)))
        return self.sigmoid(self.layer3(hidden))

# Build the network: its input width is the number of feature columns and its
# output width is the number of label columns
input_dim, output_dim = X_train.shape[1], y_train.shape[1]
model = DNN(input_dim, output_dim)
model.to(device)  # nn.Module.to moves the parameters in place

# Loss and optimizer: binary cross-entropy treats every label as an
# independent yes/no problem, which is what multi-label classification needs
criterion = nn.BCELoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

# Training loop with validation.
# Each epoch runs one optimisation pass over the training loader, then scores
# the model on the validation loader; the weights with the lowest validation
# loss seen so far are written to "best_model.pth".
num_epochs = 20
best_val_loss = float('inf')

for epoch in range(num_epochs):
    # Training phase (dropout active)
    model.train()
    train_loss = 0.0
    train_samples = 0
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        # The criterion returns the batch mean, so weight it by the batch size;
        # a plain mean of batch means over-weights the smaller final batch.
        train_loss += loss.item() * batch_X.size(0)
        train_samples += batch_X.size(0)

    # Validation phase (dropout disabled, no gradients tracked)
    model.eval()
    val_loss = 0.0
    val_samples = 0
    with torch.no_grad():
        for batch_X, batch_y in val_loader:
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            val_loss += loss.item() * batch_X.size(0)
            val_samples += batch_X.size(0)

    # Per-sample average over the whole split
    train_loss /= train_samples
    val_loss /= val_samples
    print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

    # Save the best model
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), "best_model.pth")

# Load the best model.
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written during a GPU run can still be loaded when this cell is
# re-run on a CPU-only machine (and vice versa).
model.load_state_dict(torch.load("best_model.pth", map_location=device))

# Evaluate the model on the test set
model.eval()  # disable dropout for deterministic predictions
pred_batches = []
true_batches = []

with torch.no_grad():
    for batch_X, batch_y in test_loader:
        outputs = model(batch_X)
        # Threshold the sigmoid outputs at 0.5 to get a 0/1 decision per label
        pred_batches.append((outputs > 0.5).int().cpu().numpy())
        true_batches.append(batch_y.cpu().numpy())

# Stack the per-batch arrays into (n_test_samples, n_labels) matrices
y_pred = np.vstack(pred_batches)
y_true = np.vstack(true_batches)

# Generate classification report; zero_division=0 reports 0 instead of warning
# for labels without predicted (or true) positives
print("\nClassification Report:")
print(classification_report(y_true, y_pred, target_names=go_terms, zero_division=0))


Epoch [1/20], Train Loss: 0.1017, Val Loss: 0.0337
Epoch [2/20], Train Loss: 0.0341, Val Loss: 0.0271
Epoch [3/20], Train Loss: 0.0280, Val Loss: 0.0214
Epoch [4/20], Train Loss: 0.0231, Val Loss: 0.0167
Epoch [5/20], Train Loss: 0.0193, Val Loss: 0.0132
Epoch [6/20], Train Loss: 0.0161, Val Loss: 0.0104
Epoch [7/20], Train Loss: 0.0140, Val Loss: 0.0085
Epoch [8/20], Train Loss: 0.0122, Val Loss: 0.0070
Epoch [9/20], Train Loss: 0.0108, Val Loss: 0.0060
Epoch [10/20], Train Loss: 0.0098, Val Loss: 0.0052
Epoch [11/20], Train Loss: 0.0090, Val Loss: 0.0046
Epoch [12/20], Train Loss: 0.0082, Val Loss: 0.0042
Epoch [13/20], Train Loss: 0.0077, Val Loss: 0.0038
Epoch [14/20], Train Loss: 0.0072, Val Loss: 0.0035
Epoch [15/20], Train Loss: 0.0069, Val Loss: 0.0032
Epoch [16/20], Train Loss: 0.0066, Val Loss: 0.0030
Epoch [17/20], Train Loss: 0.0062, Val Loss: 0.0028
Epoch [18/20], Train Loss: 0.0059, Val Loss: 0.0026
Epoch [19/20], Train Loss: 0.0056, Val Loss: 0.0025
Epoch [20/20], Train 