In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import re
#from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
import pickle
import joblib
from preprocess import pfeature_process
import csv

# Procesamiento de las secuencias generadas para clasificarlas como potencialmente positivas

In [None]:
# Input locations, relative to the notebook's working directory.
# Sequence file handed to pfeature_process further down.
cd_hit_path = "../data/processed/generated_seqs_cd_hit.txt"
# Serialized model that is later loaded with joblib.
model_path = "../models/dtr_model.pkl"


In [3]:
# Load the serialized classifier from disk.
# NOTE(review): joblib.load unpickles the file, which can run arbitrary code;
# only point model_path at a trusted artifact.
model = joblib.load(model_path)


In [4]:
# Show the feature names stored on the loaded model (PC1..PC18 in the output
# below); presumably the columns the model expects at prediction time.
model.feature_names_in_

array(['PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7', 'PC8', 'PC9',
       'PC10', 'PC11', 'PC12', 'PC13', 'PC14', 'PC15', 'PC16', 'PC17',
       'PC18'], dtype=object)

In [5]:
# Turn the sequence file into the model's input table.
# pfeature_process comes from the project-local `preprocess` module and its
# internals are not visible here; the output below shows one row per sequence
# (sequence as index) with columns PC1..PC18 - TODO confirm against the helper.
test_seqs = pfeature_process(cd_hit_path)
test_seqs


Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,PC11,PC12,PC13,PC14,PC15,PC16,PC17,PC18
LDLDDWYTVDRDAMSM,1.837949,4.840107,-1.346732,-2.656623,-7.338777,-2.767022,-2.165848,1.325158,2.312710,-0.523462,3.584420,-0.935730,-3.049925,0.432451,1.875415,-2.619759,-0.539151,1.527780
KEAKEGATEWCPIVIN,-0.568820,-0.202947,-5.232013,3.124897,-1.335624,0.048295,2.267388,-0.566788,-1.088166,1.337338,-0.222420,0.119635,1.454117,1.709013,0.716852,1.834802,0.391952,0.424481
IYMYQNPQADYQKTVV,-3.017279,-0.494642,-0.121134,-1.773381,3.953475,-0.026840,2.132437,0.567830,1.115806,0.202249,1.469811,2.944729,2.262455,-0.125601,-0.613322,0.553605,-1.834790,-0.133967
YYIENVMHVAMPMYYK,-9.005871,-3.115693,6.102775,-2.664138,1.976713,-2.158310,-2.036858,-1.237642,0.749774,0.945715,0.729138,-0.266035,-0.652751,1.761443,-0.942270,-0.642509,1.217351,1.163466
DPAMEFDNAEIIDDDD,8.979129,3.274629,-8.581540,-4.372976,-6.572749,-2.914250,-2.991464,-1.163220,1.694494,0.896497,-1.789092,0.307943,-0.014749,0.049112,-0.403571,-0.675637,-2.292907,1.037990
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
YVYMMYYMYMVRMCHD,-12.379661,-5.053110,12.332297,-5.327318,-1.421743,-3.704995,-2.403012,-1.897894,2.784054,-2.361387,0.044854,1.520783,-0.673718,1.189455,-2.282317,1.665998,-0.140066,-0.677912
DNKHYYDYDTKFNYVV,0.790335,-0.722205,5.335459,-7.882551,-2.099492,1.918256,2.882969,1.351551,-0.544125,0.956576,-1.562458,1.895937,0.518845,-1.007319,-0.165245,-0.739353,-1.007673,-0.296222
WEHEQQHDNQDDGKDN,15.646699,-1.677398,-1.730841,-3.529699,0.123181,-0.802543,-1.732002,-0.947516,-2.755949,-2.185678,1.238209,0.235637,1.678585,-0.481345,0.680570,-0.111502,-1.893974,-0.307282
YYCIMNKMTDKHFFAA,-5.927433,-0.592338,7.104879,-0.959287,-3.463211,-0.251894,-0.099833,-1.084665,0.107046,-2.813702,0.429181,0.811459,1.130484,1.123109,0.232948,0.608606,-1.785464,-0.744345


In [6]:
# Predict one label per row of test_seqs; the output below shows an array of
# 0/1 values in the same row order as test_seqs.
predict_test = model.predict(test_seqs)
predict_test

array([1, 1, 1, ..., 0, 1, 0])

In [7]:
# Wrap the prediction array in a one-column DataFrame (column name 0).
# NOTE(review): this rebinds `predict_test` from an array to a DataFrame, so
# the cell is order-dependent and should not be re-run on its own result.
predict_test = pd.DataFrame(predict_test)

In [8]:
# Label every prediction with its sequence: adopt the index of test_seqs, then
# move that index into an ordinary column (named 'index') so the frame goes
# back to a positional index. Both steps mutate predict_test in place.
predict_test.set_index(test_seqs.index, inplace=True)
predict_test.reset_index(inplace=True)
predict_test.shape

(1365, 2)

In [9]:
# Give the two columns readable names, in place.
# NOTE(review): 'Secuence' is a misspelling of 'Sequence', but the same key is
# used again when the positive sequences are exported to CSV, so it has to be
# renamed everywhere at once or not at all.
predict_test.rename(columns={'index':'Secuence', 0:'Label'}, inplace=True)



In [10]:
# Display the labelled predictions (one row per sequence with its label).
predict_test

Unnamed: 0,Secuence,Label
0,LDLDDWYTVDRDAMSM,1
1,KEAKEGATEWCPIVIN,1
2,IYMYQNPQADYQKTVV,1
3,YYIENVMHVAMPMYYK,1
4,DPAMEFDNAEIIDDDD,0
...,...,...
1360,YVYMMYYMYMVRMCHD,1
1361,DNKHYYDYDTKFNYVV,1
1362,WEHEQQHDNQDDGKDN,0
1363,YYCIMNKMTDKHFFAA,1


In [11]:
# Keep only the rows whose predicted label is 1 (original row labels are kept).
predict_positive = predict_test.loc[predict_test['Label'].eq(1)]
predict_positive

Unnamed: 0,Secuence,Label
0,LDLDDWYTVDRDAMSM,1
1,KEAKEGATEWCPIVIN,1
2,IYMYQNPQADYQKTVV,1
3,YYIENVMHVAMPMYYK,1
9,LERGNRVEKRWCCCSR,1
...,...,...
1357,EFEFEYFKKDYMYNRI,1
1359,LDEKWHRDAKCYNIKN,1
1360,YVYMMYYMYMVRMCHD,1
1361,DNKHYYDYDTKFNYVV,1


In [12]:
# Only the sequence column is stored as CSV, because the following steps only
# need that column. The file keeps the 'Secuence' header row and drops the index.
predict_positive['Secuence'].to_csv('../data/processed/predicted_positive.csv', index = False)

# peptideBERT


In [13]:
import torch
import yaml
import sys
sys.path.insert(1, '../')
from models.network import create_model

In [14]:
def load_bert_model(feature, device):
    """Build the model for one property and load its trained weights.

    Args:
        feature: Name of the sub-directory of ``../models/`` that contains
            ``config.yaml`` and ``model.pt`` (for example ``'hemo'``).
        device: Target device. It is stored in the config under ``'device'``
            before ``create_model`` is called and is also used as the
            ``map_location`` when the checkpoint is read.

    Returns:
        The object returned by ``create_model(config)`` with the checkpoint's
        ``'model_state_dict'`` loaded, switched to evaluation mode.

    Raises:
        FileNotFoundError: If the config or checkpoint file does not exist.
        KeyError: If the checkpoint has no ``'model_state_dict'`` entry.
    """
    model_dir = f'../models/{feature}'

    # Context manager so the config file handle is closed deterministically.
    with open(f'{model_dir}/config.yaml', 'r') as config_file:
        # NOTE(review): FullLoader can build more than plain data types; only
        # load config files from a trusted source.
        config = yaml.load(config_file, Loader=yaml.FullLoader)
    config['device'] = device

    model = create_model(config)

    # map_location lets a checkpoint saved on a GPU be loaded on a CPU-only host.
    # NOTE(review): weights_only=False unpickles arbitrary objects; the
    # checkpoint must come from a trusted source.
    checkpoint = torch.load(
        f'{model_dir}/model.pt',
        map_location=device,
        weights_only=False,
    )
    # NOTE(review): strict=False silently ignores missing/unexpected keys, so a
    # mismatched checkpoint would load without any error.
    model.load_state_dict(checkpoint['model_state_dict'], strict=False)

    # Modules start in training mode; inference must run with dropout disabled,
    # otherwise predictions are not deterministic.
    model.eval()
    return model
  


In [43]:
def predict_peptidebert(sequences, output_path='../data/peptideBert_results.csv'):
    """Score peptide sequences with the 'hemo', 'sol' and 'nf' models.

    Each sequence is converted to token ids, right-padded with the ``[PAD]`` id
    to the length of the longest sequence, and passed through each model one
    sequence at a time. A model output greater than 0.5 is recorded as 1,
    otherwise 0.

    Args:
        sequences: Iterable of peptide strings made of the 20 residue letters
            in the vocabulary below. The caller's container is not modified.
        output_path: Where the result table is written as CSV (no index
            column). Pass ``None`` to skip writing. Defaults to the original
            hard-coded location.

    Returns:
        pandas.DataFrame with a ``'Sequence'`` column followed by one integer
        column per model (``'hemo'``, ``'sol'``, ``'nf'``), in input order.
        Empty input yields an empty frame with the same columns, and no model
        is loaded.

    Raises:
        KeyError: If a sequence contains a character missing from the vocabulary.
    """
    # Private snapshot of the input: previously the caller's list was
    # overwritten in place with token ids.
    peptides = list(sequences)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Token vocabulary; a token's id is its position in this list.
    vocab = [
        '[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]', 'L',
        'A', 'G', 'V', 'E', 'S', 'I', 'K', 'R', 'D', 'T', 'P', 'N',
        'Q', 'F', 'Y', 'M', 'H', 'C', 'W',
    ]
    mapping = {token: idx for idx, token in enumerate(vocab)}
    pad_id = mapping['[PAD]']

    # default=0 keeps empty input from raising inside max().
    max_len = max(map(len, peptides), default=0)

    encoded = []
    for peptide in peptides:
        token_ids = [mapping[residue] for residue in peptide]
        token_ids.extend([pad_id] * (max_len - len(token_ids)))  # right-pad
        encoded.append(token_ids)

    results = pd.DataFrame({'Sequence': peptides})
    feats = ['hemo', 'sol', 'nf']
    with torch.inference_mode():
        for feat in feats:
            preds = []
            if encoded:  # nothing to score -> do not load a checkpoint
                model = load_bert_model(feat, device)
                # Inference must run with dropout disabled.
                model.eval()
                for token_ids in encoded:
                    input_ids = torch.tensor([token_ids]).to(device)
                    # 1.0 for real tokens, 0.0 for padding positions.
                    attention_mask = (input_ids != pad_id).float()
                    preds.append(int(model(input_ids, attention_mask)[0] > 0.5))
            # Add the column directly instead of concatenating frames in a loop.
            results[feat] = pd.Series(preds, dtype=int)

    if output_path is not None:
        results.to_csv(output_path, index=False)
    return results
    

In [44]:
# Read back the sequences that were exported as predicted-positive and score
# them with the PeptideBERT models.
# newline='' is what the csv module expects for files handed to csv.reader.
with open('../data/processed/predicted_positive.csv', newline='') as fp:
    reader = csv.reader(fp, delimiter=',', quotechar='"')
    next(reader, None)  # skip the header row; tolerate an empty file
    # Take the first (only) field of each row directly instead of stringifying
    # the row list and stripping brackets/quotes with a regex. Blank rows are
    # skipped so no empty "sequence" reaches the models.
    seqs = [row[0].strip() for row in reader if row and row[0].strip()]
bert_results = predict_peptidebert(seqs)

In [45]:
# Display the per-sequence predictions: one 0/1 column each for hemo, sol and nf.
bert_results

Unnamed: 0,Sequence,hemo,sol,nf
0,LDLDDWYTVDRDAMSM,0,1,0
1,KEAKEGATEWCPIVIN,0,1,0
2,IYMYQNPQADYQKTVV,0,1,0
3,YYIENVMHVAMPMYYK,0,0,0
4,LERGNRVEKRWCCCSR,0,1,0
...,...,...,...,...
507,EFEFEYFKKDYMYNRI,0,1,0
508,LDEKWHRDAKCYNIKN,0,1,0
509,YVYMMYYMYMVRMCHD,0,0,0
510,DNKHYYDYDTKFNYVV,0,1,0
