In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.metrics import accuracy_score, recall_score,f1_score,roc_curve, auc,precision_score ,classification_report,confusion_matrix
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import re
import torchtext
from torchtext.data.utils import get_tokenizer
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings("ignore")
import time
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup

In [None]:
# Load the training table. The path is a placeholder and has to point at the real spreadsheet.
data=pd.read_excel('/file_path/training_data.xlsx')
# Strip '*', ordinary spaces and non-breaking spaces (\xa0) from the allele names.
# regex=False makes every pattern a literal string on any pandas version: a bare '*'
# is not a valid regular expression, and the default of `regex` differs between
# pandas releases, so relying on the default is fragile.
data['HLA type']=(
    data['HLA type']
    .str.replace('*','',regex=False)
    .str.replace(' ','',regex=False)
    .str.replace('\xa0','',regex=False)
)
data.head()

In [None]:
# Read the tab-separated HLA lookup file (placeholder path).
# NOTE(review): read_csv treats the first line as a header by default; if the .dat file
# has no header row, its first record is consumed as column names — confirm against the file.
HLA_seq = pd.read_csv('/file_path/MHC_pseudo.dat', sep='\t')
HLA_seq.head()

In [None]:
# Name the two lookup columns so the allele column matches the key used in `data`.
HLA_seq.columns=['HLA type','sequence']
# Left join keeps every training row; rows whose 'HLA type' has no match in HLA_seq
# end up with a missing 'sequence' value.
data=pd.merge(data,HLA_seq,on='HLA type',how='left')
data.head()

In [None]:
# The allele name is no longer needed once the merged 'sequence' column is present.
data=data.drop('HLA type',axis=1)

In [None]:
# From here on 'HLA type' holds the merged sequence string, not the allele name.
# The rename mutates `data` in place.
data.rename(columns={'sequence':'HLA type'},inplace=True)
data.head()

In [None]:
# Target column as a pandas Series; it is converted to a tensor in the feature-building cell.
y=data['label']

In [None]:
# Pick the GPU when one is available, otherwise the CPU.
# NOTE(review): the model-building and training cells shown in this notebook never call
# `.to(device)`, so this value is only displayed here.
device = torch.device('cuda'if torch.cuda.is_available() else 'cpu')
device

In [None]:
def trans_Mutated(x):
    """Right-pad a peptide string with 'X' up to 11 characters.

    Strings that already have 11 or more characters come back unchanged,
    because repeating 'X' a non-positive number of times yields ''.
    """
    missing = 11 - len(x)
    padding = 'X' * missing
    return x + padding

# 'M' is the mutated peptide padded with 'X' to a minimum length of 11.
data['M']=data['Mutated Peptide'].apply(trans_Mutated)
data.head()

In [None]:
def getKmers(sequence, size):
    """Return the overlapping k-mers of one row as a space-separated string.

    `sequence` is a mapping-like row providing 'M' and 'HLA type'. The two
    strings are joined as HLA type followed by M, cut into every window of
    `size` consecutive characters (stride 1), lower-cased, and joined with
    single spaces. When the joined string is shorter than `size` the result
    is ''.
    """
    peptide, hla = sequence['M'], sequence['HLA type']
    combined = hla + peptide
    window_count = len(combined) - size + 1
    kmers = []
    for start in range(window_count):
        kmers.append(combined[start:start + size].lower())
    return ' '.join(kmers)
# Row-wise: build the space-separated 2-mer "sentence" from 'HLA type' + 'M' for the tokenizer.
data['trans']=data.apply(lambda row:getKmers(row, 2),axis=1)
data.head()

In [None]:
# Tokenize every k-mer sentence with the pretrained BioBERT tokenizer (fetched by name).
tokenizer = AutoTokenizer.from_pretrained("monologg/biobert_v1.1_pubmed")
input_ids = []
attention_masks = []

for text in data['trans']:
    # Each sentence is padded or truncated to exactly 65 token ids, special tokens included.
    encoded = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=65,
        padding='max_length',
        return_tensors='pt',
        truncation=True
    )
    input_ids.append(encoded['input_ids'])
    attention_masks.append(encoded['attention_mask'])

# Concatenate the per-row id tensors along dim 0 (expected to give one row of ids per sample
# — TODO confirm shape). `attention_masks` is left as a Python list and is not concatenated here.
input_ids = torch.cat(input_ids, dim=0)

In [None]:
# Numeric feature columns: everything in `data` except the text columns and the label.
f=data.drop(['Mutated Peptide','label','M','trans','HLA type'],axis=1).values
# Token ids are placed in front of the numeric columns; both are scaled together below.
total_features=np.hstack((input_ids.cpu().numpy(),f))

torch.manual_seed(44)
# np.asarray accepts the label Series as well as an already-converted tensor, so the
# conversion does not depend on the Series index and the cell can be re-run safely.
labels=np.asarray(y)

# Split the data into training, validation, and test sets (64% / 16% / 20%, stratified).
# The split is done on the *unscaled* features so that the scaler can be fitted on the
# training rows only. The row assignment depends only on the labels and random_state.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    total_features, labels, test_size=0.2, random_state=42, stratify=labels
)

X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X_train_raw, y_train, test_size=0.2, random_state=42, stratify=y_train
)

# Fit the scaler on training rows only: fitting on the full matrix would let the
# validation/test means and variances leak into preprocessing.
s=StandardScaler()
s.fit(X_train_raw)

X_train = torch.tensor(s.transform(X_train_raw), dtype=torch.float32)
X_val = torch.tensor(s.transform(X_val_raw), dtype=torch.float32)
X_test = torch.tensor(s.transform(X_test_raw), dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.long)
y_val = torch.tensor(y_val, dtype=torch.long)
y_test = torch.tensor(y_test, dtype=torch.long)

# Full scaled matrix and labels stay available under their original names
# (X.shape[1] is used later as the network input size).
x_scale=s.transform(total_features)
X = torch.tensor(x_scale, dtype=torch.float32)
y = torch.tensor(labels, dtype=torch.long)

# SMOTE oversampling on training data only
smote = SMOTE(random_state=1)
x_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)


class CustomDataset(Dataset):
    """Pairs an indexable feature container with an indexable label container."""

    def __init__(self, X, y):
        # Both containers are stored as given; no copy or conversion happens here.
        self.X, self.y = X, y

    def __len__(self):
        """Number of samples, taken from the label container."""
        n_samples = len(self.y)
        return n_samples

    def __getitem__(self, index):
        """Return the (features, label) pair stored at `index`."""
        features = self.X[index]
        label = self.y[index]
        return features, label


# Network and optimisation settings.
input_size = X.shape[1]  # one input unit per feature column
hidden_size = 53
output_size = 2  # two output scores, one per class
batch_size = 32
learning_rate = 0.0001
num_epochs = 31
# Create data loaders: training uses the SMOTE-resampled set; validation and test
# use the untouched splits.
train_dataset = CustomDataset(x_train_resampled, y_train_resampled)
val_dataset = CustomDataset(X_val, y_val)
test_dataset = CustomDataset(X_test, y_test)

# Only the training loader shuffles; validation/test keep the dataset order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Start of the wall-clock measurement reported after training.
time1=time.time()


class FullyConnectedModel(nn.Module):
    """Two-layer perceptron: Linear -> ReLU -> Linear, returning raw class scores."""

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)
        # NOTE(review): this dropout layer is registered but forward() never applies it.
        self.drop = nn.Dropout(p=0.5)

    def forward(self, x):
        """Map a batch of feature rows to unnormalised scores (no softmax)."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)


# Per-epoch history, appended to by train() and read by the plotting cells.
train_losses = []
val_losses = []
train_accuracies = []
val_accuracies = []

def train(model, train_loader, val_loader, num_epochs=31, learning_rate=0.0001):
    """Train `model` with Adam and cross-entropy, validating after every epoch.

    One value per epoch is appended to the module-level lists `train_losses`,
    `val_losses`, `train_accuracies` and `val_accuracies`, and a summary line
    is printed for each phase.

    Args:
        model: network returning one score per class for each input row.
        train_loader: iterable of (inputs, targets) batches used for optimisation.
        val_loader: iterable of (inputs, targets) batches used for validation only.
        num_epochs: number of passes over `train_loader`.
        learning_rate: Adam step size.

    Returns:
        None. Results are exposed through the history lists.
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    # Batches follow whatever device the model lives on, so a model moved to the GPU
    # trains without a device-mismatch error; on CPU this is a no-op.
    device = next(model.parameters()).device

    for epoch in range(num_epochs):

        model.train()
        total_train_correct = 0
        total_train_samples = 0
        total_train_loss = 0.0

        for inputs, targets in train_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, targets)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            _, predicted = torch.max(outputs, 1)
            total_train_samples += targets.size(0)
            total_train_correct += (predicted == targets).sum().item()
            # The batch-mean loss is weighted by batch size and added exactly once;
            # adding it twice per batch would double the reported epoch loss.
            total_train_loss += loss.item() * inputs.size(0)

        train_accuracy = 100 * total_train_correct / total_train_samples
        avg_train_loss = total_train_loss / len(train_loader.dataset)

        train_accuracies.append(train_accuracy)
        train_losses.append(avg_train_loss)

        print(f'Training Epoch [{epoch + 1}/{num_epochs}], Loss: {avg_train_loss:.4f}, Accuracy: {train_accuracy:.2f}%')


        model.eval()
        total_val_correct = 0
        total_val_samples = 0
        total_val_loss = 0.0

        # No gradients are needed while scoring the validation set.
        with torch.no_grad():
            for inputs, targets in val_loader:
                inputs, targets = inputs.to(device), targets.to(device)
                outputs = model(inputs)
                loss = criterion(outputs, targets)

                _, predicted = torch.max(outputs, 1)
                total_val_samples += targets.size(0)
                total_val_correct += (predicted == targets).sum().item()
                total_val_loss += loss.item() * inputs.size(0)

        val_accuracy = 100 * total_val_correct / total_val_samples
        avg_val_loss = total_val_loss / len(val_loader.dataset)

        val_losses.append(avg_val_loss)
        val_accuracies.append(val_accuracy)

        print(f'Validation Epoch [{epoch + 1}/{num_epochs}], Loss: {avg_val_loss:.4f}, Accuracy: {val_accuracy:.2f}%')


# Build the network with the sizes chosen above.
model = FullyConnectedModel(input_size, hidden_size, output_size)

# Train and validate; per-epoch history is collected in the module-level lists.
train(model, train_loader, val_loader, num_epochs=num_epochs, learning_rate=learning_rate)
# Elapsed wall-clock time since `time1` was taken (after the data loaders were created).
time2=time.time()
print('Time used: ',time2-time1)

# Persist only the learned parameters (state_dict), written to the working directory.
torch.save(model.state_dict(), 'FCNN_BioBERT.pth')

In [None]:
# Loss curves: training vs. validation, one point per epoch
loss_fig, loss_ax = plt.subplots(figsize=(10, 6))
epoch_axis = range(1, num_epochs + 1)
loss_ax.plot(epoch_axis, train_losses, label='Training Loss')
loss_ax.plot(epoch_axis, val_losses, label='Validation Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.set_title('Training and Validation Loss Over Epochs')
loss_ax.legend()
loss_ax.grid(True)
plt.show()

In [None]:
# Accuracy curves: training vs. validation, one point per epoch
acc_fig, acc_ax = plt.subplots(figsize=(10, 6))
epoch_axis = range(1, num_epochs + 1)
acc_ax.plot(epoch_axis, train_accuracies, label='Training Accuracy')
acc_ax.plot(epoch_axis, val_accuracies, label='Validation Accuracy')
acc_ax.set_xlabel('Epoch')
acc_ax.set_ylabel('Accuracy (%)')
acc_ax.set_title('Training and Validation Accuracy Over Epochs')
acc_ax.legend()
acc_ax.grid(True)
plt.show()

In [None]:
# Evaluate the model on the test data
def evaluate(model, test_loader):
    """Score `model` on `test_loader` and print/plot binary-classification metrics.

    Shows a confusion-matrix heatmap and prints the classification report, ROC AUC,
    macro precision/recall/F1, sensitivity and specificity.

    Args:
        model: network returning two scores (class 0, class 1) per input row.
        test_loader: iterable of (inputs, targets) batches. The true labels are taken
            from these batches, so the metrics always match the data that was scored.

    Returns:
        None. Results are printed and plotted.
    """
    model.eval()
    # Send batches to the model's device when it has parameters; otherwise leave them as is.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    test_labels = []
    test_predictions = []
    proba = []

    with torch.no_grad():
        for inputs, targets in test_loader:
            if device is not None:
                inputs = inputs.to(device)
            outputs = model(inputs)
            test_labels.extend(targets.tolist())
            _, predicted = torch.max(outputs, 1)
            test_predictions.extend(predicted.tolist())
            # Softmax turns the two scores into the class-1 probability. Ranking by the
            # raw class-1 score alone would ignore the class-0 score and can give a
            # different AUC.
            proba.append(torch.softmax(outputs, dim=1)[:, 1].cpu())

    y_true = np.asarray(test_labels)
    y_pred = np.asarray(test_predictions)
    y_score = torch.cat(proba, dim=0).numpy()

    # Calculate the confusion matrix; labels=[0, 1] keeps it 2x2 even when one class
    # is absent, so the four-way unpacking below cannot fail.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])

    # Plot the confusion matrix
    plt.figure(figsize=(6, 6))
    sns.heatmap(cm, annot=True, cmap='Blues', fmt='d', cbar=False, square=True)
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    plt.title('Confusion Matrix')
    plt.show()

    # Output the classification report
    target_names = ['Class 0', 'Class 1']
    classification_rep = classification_report(
        y_true, y_pred, labels=[0, 1], target_names=target_names
    )
    print('Classification Report:\n', classification_rep)

    auc_value = roc_auc_score(y_true, y_score)

    print(f'AUC : {auc_value:.4f}')
    precision = precision_score(y_true, y_pred, average='macro')
    recall = recall_score(y_true, y_pred, average='macro')
    f1 = f1_score(y_true, y_pred, average='macro')

    print(f'Overall Precision: {precision:.4f}')
    print(f'Overall Recall: {recall:.4f}')
    print(f'Overall F1-score: {f1:.4f}')

    # Calculate true positives, true negatives, false positives, and false negatives
    tn, fp, fn, tp = cm.ravel()

    # Calculate sensitivity (true positive rate)
    sensitivity = tp / (tp + fn)

    # Calculate specificity (true negative rate)
    specificity = tn / (tn + fp)

    print(f'Sensitivity (True Positive Rate): {sensitivity:.4f}')
    print(f'Specificity (True Negative Rate): {specificity:.4f}')

evaluate(model, test_loader)