pLM4VF model for virulence factor prediction

In [None]:
from Bio import SeqIO
import pandas as pd
import numpy as np
def fasta_to_excel(fasta_file, excel_file):
    """Write the records of a FASTA file to an Excel sheet.

    Args:
        fasta_file: path (or handle) of the FASTA input passed to SeqIO.parse.
        excel_file: destination passed to DataFrame.to_excel.

    The sheet has two columns, "ID" (record.id) and "Sequence" (the record
    sequence as a plain string), one row per FASTA record, no index column.
    """
    rows = [
        (entry.id, str(entry.seq))
        for entry in SeqIO.parse(fasta_file, "fasta")
    ]
    table = pd.DataFrame(rows, columns=["ID", "Sequence"])
    table.to_excel(excel_file, index=False)

In [None]:
# Input FASTA file and the spreadsheet it is converted into.
fasta_file, excel_file = "test.fasta", "test.xlsx"
fasta_to_excel(fasta_file, excel_file)

In [None]:
# na_filter=False keeps cell text as-is instead of converting values such as
# "NA" to NaN.
dataset = pd.read_excel('test.xlsx', na_filter=False)
# Keep at most the first 1000 residues of every sequence; slicing leaves
# shorter sequences untouched.
sequence_list = dataset['Sequence'].map(lambda residues: residues[:1000])

Gram-positive prediction model

In [None]:
def esm_embeddings(peptide_sequence_list):
    """Embed sequences with ESM-2 (esm2_t33_650M_UR50D), mean-pooled per sequence.

    Args:
        peptide_sequence_list: iterable of ``(label, sequence)`` pairs, the
            input format consumed by the ESM batch converter.

    Returns:
        pandas.DataFrame with one row per input pair. Each row is the layer-33
        token representation averaged over the sequence positions (1280
        columns for this 650M-parameter model).
    """
    import torch
    import esm
    import pandas as pd

    model, alphabet = esm.pretrained.esm2_t33_650M_UR50D()
    batch_converter = alphabet.get_batch_converter()
    model.eval()  # inference mode: disables dropout

    embeddings_results = []
    for peptide_sequence in peptide_sequence_list:
        # One sequence per forward pass, so memory is bounded by the longest
        # single sequence rather than by a padded batch.
        _, _, batch_tokens = batch_converter([peptide_sequence])
        # Number of non-padding tokens in this (single-row) batch.
        tokens_len = int((batch_tokens != alphabet.padding_idx).sum())

        with torch.no_grad():
            # Only the layer-33 representations are used, so contact
            # prediction (which also materialises every attention map) is
            # not requested.
            # NOTE(review): expected to match the former return_contacts=True
            # output up to floating-point rounding; re-check against
            # reference predictions.
            results = model(batch_tokens, repr_layers=[33], return_contacts=False)
        token_representations = results["representations"][33]

        # Drop the first and last non-padding tokens (the special tokens
        # wrapped around the sequence) and average the remaining positions.
        pooled = token_representations[0, 1:tokens_len - 1].mean(0)
        embeddings_results.append(pooled.tolist())

    return pd.DataFrame(embeddings_results)

In [None]:
# esm_embeddings feeds each item to the ESM batch converter, which takes
# (label, sequence) pairs; the sequence itself doubles as its label here.
peptide_sequence_list = [(seq, seq) for seq in sequence_list]

In [None]:
# Compute one embedding row per sequence with the esm_embeddings currently
# defined in the notebook.
embeddings_results = esm_embeddings(peptide_sequence_list)
# Put the record IDs in the first column so every embedding row carries the
# identifier of the sequence it was computed from.
embeddings_results.insert(0, 'ID', dataset['ID'])

In [None]:
import joblib 
from sklearn.preprocessing import StandardScaler
# Load the serialized Gram-positive classifier.
# NOTE(review): joblib.load unpickles the file and can execute arbitrary code;
# only load model files obtained from a trusted source.
model = joblib.load('pLM4VF(G+)_model.pkl')

In [None]:
# Columns 1..1280 are the embedding values (column 0 is the 'ID' column).
col = embeddings_results.columns[1:1281].tolist()
embeddings_pr = embeddings_results[col]
# NOTE(review): the scaler is fitted on the very rows being predicted, so the
# standardized features depend on which other sequences are in the input
# file. Confirm whether the scaler fitted at training time should be loaded
# and applied here instead.
scaler = StandardScaler()
scaler.fit(embeddings_pr)
predicted_pr = scaler.transform(embeddings_pr)

In [None]:
# Class labels and per-class probabilities for every scaled embedding row.
prediction = model.predict(predicted_pr)
predicted_probabilities = model.predict_proba(predicted_pr)

# Result table: ID, predicted label, then probability columns 0 and 1 of
# predict_proba.
prediction_results = pd.DataFrame(prediction, columns=['pLM4VF'])
prediction_results.insert(0, 'ID', embeddings_results['ID'])
for class_index, score_column in ((0, 'Prediction_Scores_0'),
                                  (1, 'Prediction_Scores_1')):
    prediction_results[score_column] = [
        prob[class_index] for prob in predicted_probabilities
    ]
prediction_results.to_csv('pLM4VF_G+.csv', index=False)

Gram-negative prediction model

In [None]:
def esm_embeddings(peptide_sequence_list):
    """Embed sequences with ESM-1b (esm1b_t33_650M_UR50S), mean-pooled per sequence.

    Args:
        peptide_sequence_list: iterable of ``(label, sequence)`` pairs, the
            input format consumed by the ESM batch converter.

    Returns:
        pandas.DataFrame with one row per input pair. Each row is the layer-33
        token representation averaged over the sequence positions (1280
        columns for this 650M-parameter model).
    """
    import torch
    import esm
    import pandas as pd

    model, alphabet = esm.pretrained.esm1b_t33_650M_UR50S()
    batch_converter = alphabet.get_batch_converter()
    model.eval()  # inference mode: disables dropout

    embeddings_results = []
    for peptide_sequence in peptide_sequence_list:
        # One sequence per forward pass, so memory is bounded by the longest
        # single sequence rather than by a padded batch.
        _, _, batch_tokens = batch_converter([peptide_sequence])
        # Number of non-padding tokens in this (single-row) batch.
        tokens_len = int((batch_tokens != alphabet.padding_idx).sum())

        with torch.no_grad():
            # Only the layer-33 representations are used, so contact
            # prediction (which also materialises every attention map) is
            # not requested.
            # NOTE(review): expected to match the former return_contacts=True
            # output up to floating-point rounding; re-check against
            # reference predictions.
            results = model(batch_tokens, repr_layers=[33], return_contacts=False)
        token_representations = results["representations"][33]

        # Drop the first and last non-padding tokens (the special tokens
        # wrapped around the sequence) and average the remaining positions.
        pooled = token_representations[0, 1:tokens_len - 1].mean(0)
        embeddings_results.append(pooled.tolist())

    return pd.DataFrame(embeddings_results)

In [None]:
# esm_embeddings feeds each item to the ESM batch converter, which takes
# (label, sequence) pairs; the sequence itself doubles as its label here.
peptide_sequence_list = [(seq, seq) for seq in sequence_list]

In [None]:
# Compute one embedding row per sequence with the esm_embeddings currently
# defined in the notebook.
embeddings_results = esm_embeddings(peptide_sequence_list)
# Put the record IDs in the first column so every embedding row carries the
# identifier of the sequence it was computed from.
embeddings_results.insert(0, 'ID', dataset['ID'])

In [None]:
import joblib 
from sklearn.preprocessing import StandardScaler
# Load the serialized Gram-negative classifier.
# NOTE(review): joblib.load unpickles the file and can execute arbitrary code;
# only load model files obtained from a trusted source.
model = joblib.load('pLM4VF(G-)_model.pkl')

In [None]:
# Columns 1..1280 are the embedding values (column 0 is the 'ID' column).
col = embeddings_results.columns[1:1281].tolist()
embeddings_pr = embeddings_results[col]
# NOTE(review): the scaler is fitted on the very rows being predicted, so the
# standardized features depend on which other sequences are in the input
# file. Confirm whether the scaler fitted at training time should be loaded
# and applied here instead.
scaler = StandardScaler()
scaler.fit(embeddings_pr)
predicted_pr = scaler.transform(embeddings_pr)

In [None]:
# Class labels and per-class probabilities for every scaled embedding row.
prediction = model.predict(predicted_pr)
predicted_probabilities = model.predict_proba(predicted_pr)

# Result table: ID, predicted label, then probability columns 0 and 1 of
# predict_proba.
prediction_results = pd.DataFrame(prediction, columns=['pLM4VF'])
prediction_results.insert(0, 'ID', embeddings_results['ID'])
for class_index, score_column in ((0, 'Prediction_Scores_0'),
                                  (1, 'Prediction_Scores_1')):
    prediction_results[score_column] = [
        prob[class_index] for prob in predicted_probabilities
    ]
prediction_results.to_csv('pLM4VF_G-.csv', index=False)