In [236]:
#!pip install xgboost

# Importing models and training loop
%run "./models/one_hot_LSTM.ipynb"
%run "./models/one_hot_CNN.ipynb"
%run "./models/one_hot_XGBoost.ipynb"
%run "./models/ProtT5_CNN.ipynb"
%run "./models/ProtT5_XGBoost.ipynb"
%run "./models/training_routine.ipynb"
%run "./models/utils.ipynb"

one_hot_LSTM is loaded
one_hot_CNN is loaded
one_hot_XGBoost is loaded
ProtT5_CNN is loaded
ProtT5_XGBoost is loaded
Training routine is loaded
utils are loaded


In [237]:
import pandas as pd

# Importing data
ACE2_train = pd.read_csv("./data/ACE2_train_data.csv")
ACE2_test = pd.read_csv("./data/ACE2_test_data.csv")
LY16_train = pd.read_csv("./data/LY16_train_data.csv")
LY16_test = pd.read_csv("./data/LY16_test_data.csv")
LY555_train = pd.read_csv("./data/LY555_test_data.csv")
LY555_test = pd.read_csv("./data/LY555_train_data.csv")
REGN33_train = pd.read_csv("./data/REGN33_train_data.csv")
REGN33_test = pd.read_csv("./data/REGN33_test_data.csv")
REGN87_train = pd.read_csv("./data/REGN87_train_data.csv")
REGN87_test = pd.read_csv("./data/REGN87_test_data.csv")

In [238]:
train_antibodies = [LY16_train, LY555_train, REGN33_train, REGN87_train]
test_antibodies = [LY16_test, LY555_test, REGN33_test, REGN87_test]

In [239]:
# General settings
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [242]:
import xgboost as xgb

input_dim = 20  # number of unique characters in our sequences
output_dim = 1

antibodies = ["LY16", "LY555", "REGN33", "REGN87"]
models = {
    "CNN": ConvNet(),
    "XGBoost": xgb.XGBClassifier(objective="binary:logistic", random_state=42)
}
results = {model: {antibody: [] for antibody in antibodies} for model in models}

for i, (train_antibody, test_antibody) in enumerate(zip(train_antibodies, test_antibodies)):
    for _ in range(5):
        train_loader = prepare_data(train_antibody, sample_size=1000)
        test_loader = prepare_data(test_antibody, sample_size=1000)
        for model_name, model in models.items():
            if model_name == "XGBoost":
                # Flatten the sequences
                X_train = torch.vstack([x for x, y in train_loader]).reshape(-1, 24*20).numpy()
                y_train = torch.hstack([y for x, y in train_loader]).numpy()
                X_test = torch.vstack([x for x, y in test_loader]).reshape(-1, 24*20).numpy()
                y_test = torch.hstack([y for x, y in test_loader]).numpy()

                # Train the XGBoost model
                model.fit(X_train, y_train)

                # Predict on test set
                y_pred = model.predict_proba(X_test)[:, 1]

                # Calculate ROC/AUC score
                score = roc_auc_score(y_test, y_pred)
            else:
                # Handle other models (CNN) similarly
                criterion = nn.BCEWithLogitsLoss()
                learning_rate = 1e-3
                optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
                model = model.to(device)
                score = train_and_evaluate(model, train_loader, test_loader, criterion, optimizer)
            results[model_name][antibodies[i]].append(score)

# Print the results
for model, scores in results.items():
    print(f"{model} model:")
    for antibody, score in scores.items():
        print(f"{antibody}: {score}")

Epoch 1/10, Training Loss: 0.6286
Epoch 2/10, Training Loss: 0.4896
Epoch 3/10, Training Loss: 0.4137
Epoch 4/10, Training Loss: 0.3632
Epoch 5/10, Training Loss: 0.3205
Epoch 6/10, Training Loss: 0.2923
Epoch 7/10, Training Loss: 0.2599
Epoch 8/10, Training Loss: 0.2107
Epoch 9/10, Training Loss: 0.1978
Epoch 10/10, Training Loss: 0.1852
ROC/AUC Score: 0.8754
Epoch 1/10, Training Loss: 0.4582
Epoch 2/10, Training Loss: 0.3375
Epoch 3/10, Training Loss: 0.2846
Epoch 4/10, Training Loss: 0.2509
Epoch 5/10, Training Loss: 0.2178
Epoch 6/10, Training Loss: 0.1933
Epoch 7/10, Training Loss: 0.1624
Epoch 8/10, Training Loss: 0.1365
Epoch 9/10, Training Loss: 0.1391
Epoch 10/10, Training Loss: 0.1366
ROC/AUC Score: 0.8747
Epoch 1/10, Training Loss: 0.4450
Epoch 2/10, Training Loss: 0.3090
Epoch 3/10, Training Loss: 0.2431
Epoch 4/10, Training Loss: 0.2036
Epoch 5/10, Training Loss: 0.1738
Epoch 6/10, Training Loss: 0.1466
Epoch 7/10, Training Loss: 0.1313
Epoch 8/10, Training Loss: 0.1046
Ep

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# Create boxplot
fig, ax = plt.subplots()
model_names = list(results.keys())
data = [results[model_name][antibody] for model_name in model_names for antibody in antibodies]
ax.boxplot(data, labels=[f"{model_name}\n{antibody}" for model_name in model_names for antibody in antibodies])

plt.title('ROC/AUC Score Comparison')
plt.ylabel('ROC/AUC Score')
plt.show()

In [245]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.29.1-py3-none-any.whl (7.1 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
Collecting huggingface-hub<1.0,>=0.14.1
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m486.6 kB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m:01[0m
Collecting fsspec
  Downloading fsspec-2023.5.0-py3-none-any.whl (160 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m160.1/1

In [None]:
import xgboost as xgb
import torch
from transformers import T5Tokenizer, T5EncoderModel
import re
from tqdm import tqdm

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

transformer_link = "Rostlab/prot_t5_xl_half_uniref50-enc"
model_t5 = T5EncoderModel.from_pretrained(transformer_link)
model_t5.full() if device=='cpu' else model_t5.half() 
model_t5 = model_t5.to(device)
model_t5 = model_t5.eval()
tokenizer = T5Tokenizer.from_pretrained(transformer_link, do_lower_case=False )

def preprocess_sequences(sequences):
    sequence_examples = [" ".join(list(re.sub(r"[UZOB]", "X", sequence))) for sequence in sequences]
    embeddings = batch_process_sequences(model_t5, tokenizer, sequence_examples, batch_size=20)
    return embeddings

def prepare_data_with_embedding(antibody, sample_size):
    # Select a subset of data
    subset = antibody.sample(sample_size)
    sequences = subset["junction_aa"]
    labels = subset["binds"]
    
    # Preprocess the sequences into embeddings
    embeddings = preprocess_sequences(sequences)
    
    # Create a TensorDataset from the embeddings and labels
    dataset = torch.utils.data.TensorDataset(embeddings, torch.Tensor(labels.values))
    
    # Create a DataLoader
    loader = torch.utils.data.DataLoader(dataset, batch_size=64, shuffle=True)
    return loader

input_dim = 1024  # number of features in ProtT5 embeddings
output_dim = 1

antibodies = ["LY16", "LY555", "REGN33", "REGN87"]
models = {
    "CNN": ConvNet(input_dim, output_dim),
    "XGBoost": xgb.XGBClassifier(objective="binary:logistic", random_state=42)
}
results = {model: {antibody: [] for antibody in antibodies} for model in models}

for i, (train_antibody, test_antibody) in enumerate(zip(train_antibodies, test_antibodies)):
    for _ in range(5):
        train_loader = prepare_data_with_embedding(train_antibody, sample_size=1000)
        test_loader = prepare_data_with_embedding(test_antibody, sample_size=1000)
        for model_name, model in models.items():
            if model_name == "XGBoost":
                # Flatten the embeddings
                X_train = torch.vstack([x for x, y in train_loader]).reshape(-1, input_dim).numpy()
                y_train = torch.hstack([y for x, y in train_loader]).numpy()
                X_test = torch.vstack([x for x, y in test_loader]).reshape(-1, input_dim).numpy()
                y_test = torch.hstack([y for x, y in test_loader]).numpy()

                # Train the XGBoost model
                model.fit(X_train, y_train)

                # Predict on test set
                y_pred = model.predict_proba(X_test)[:, 1]

                # Calculate ROC/AUC score
                score = roc_auc_score(y_test, y_pred)
            else:
                criterion = nn.BCEWithLogitsLoss()
                learning_rate = 1e-3
                optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
                model = model.to(device)
                score = train_and_evaluate(model, train_loader, test_loader, criterion, optimizer)
            results[model_name][antibodies[i]].append(score)

# Print the results
for model, scores in results.items():
    print(f"{model} model:")
    for antibody, score in scores.items():
        print(f"{antibody}: {score}")

import matplotlib.pyplot as plt

# Create boxplot
fig, ax = plt.subplots()
model_names = list(results.keys())
data = [results[model_name][antibody] for model_name in model_names for antibody in antibodies]
ax.boxplot(data, labels=[f"{model_name}\n{antibody}" for model_name in model_names for antibody in antibodies])

plt.title('ROC/AUC Score Comparison')
plt.ylabel('ROC/AUC Score')
plt.show()

Downloading (…)lve/main/config.json:   0%|          | 0.00/656 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/2.42G [00:00<?, ?B/s]