### GPU setting

In [1]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:

  print('Not connected to a GPU')
else:
  print(gpu_info)

Mon May 19 13:23:13 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.161.07             Driver Version: 535.161.07   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off | 00000000:10:00.0 Off |                    0 |
| N/A   28C    P0              54W / 400W |      0MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA A100-SXM4-40GB          Off | 00000000:16:00.0 Off |  

In [2]:
%env CUDA_VISIBLE_DEVICES=0

env: CUDA_VISIBLE_DEVICES=0


In [3]:
import os
import sys
import json
import shutil
import pickle
import random
import numpy as np
import pandas as pd
import seaborn as sns
from PIL import Image
from typing import List
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
import requests
import torch
from torch import nn, optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import ExponentialLR, StepLR
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
sys.path.append('../../code/Common_modules')
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')

In [4]:
from Utils import set_randomness
set_randomness()
pd.options.display.max_colwidth = 999
SEED = 2021

In [5]:
from typing import List
from dataclasses import dataclass, asdict
from transformers import EsmTokenizer, EsmModel
from Tokenize_modules import Vocabulary, PeptideTokenizer, locate_specials, locate_non_standard_AA

tokenizer = EsmTokenizer.from_pretrained("facebook/esm2_t12_35M_UR50D")
vocab = Vocabulary(file_name = os.path.join('../../data','vocab/vocab.txt'))
peptide_tokenizer = PeptideTokenizer(vocab)

In [6]:
from GPT_modules import GPTConfig
gpt_conf = GPTConfig(voc = vocab)
data_path = gpt_conf.data_path

### AMP dataset

In [7]:
AMP_train_df = pd.read_csv(os.path.join(data_path,"multi_train_35_0.8.csv"))
remove_idx = AMP_train_df[AMP_train_df['sequence'].str.contains('U|Z|B|X')]
AMP_train_df.drop(remove_idx.index,inplace=True)
AMP_train_df

Unnamed: 0,sequence,label,species
0,GIAAGIIIKIKK,1.505150,Escherichia coli
1,VDKKPYRPRPRPPRRIYNR,-0.853976,Escherichia coli
2,ACDTATCVTHRLAGLLSRSGGVVKNNFVPTNVGSKAF,-0.256684,Escherichia coli
3,GWWRRTVAKVRNAGRK,0.491362,Escherichia coli
4,FLGVVFKSASKVFPAVFGKV,2.080356,Escherichia coli
...,...,...,...
29389,VLSAFHKVIKIIHHISHF,1.177475,Staphylococcus haemolyticus
29390,YRGGYTGPIPRPPPIGRPPFRPVCNACYRLSVSDARNCCIKFGSCCHLVK,1.602060,Staphylococcus haemolyticus
29391,FLGLIFHGLVHAGKLIHGLIHRNRG,0.875061,Staphylococcus haemolyticus
29392,RKFRKILHRARKWI,0.622432,Staphylococcus haemolyticus


In [8]:
AMP_val_df = pd.read_csv(os.path.join(data_path,"multi_val_35_0.8.csv"))
remove_idx = AMP_val_df[AMP_val_df['sequence'].str.contains('U|Z|B|X')]
AMP_val_df.drop(remove_idx.index,inplace=True)
AMP_val_df

Unnamed: 0,sequence,label,species
0,GLFNIIKKTIGKLR,1.601792,Escherichia coli
1,LLKELWTKMKGAGKAVLGKIKGLL,-0.026872,Escherichia coli
2,FLGVVFKGASKVFPAVVGKV,2.096592,Escherichia coli
3,KRKILIKRK,1.806180,Escherichia coli
4,GFMKYIKPLIPHAVKAIKKLI,0.795880,Escherichia coli
...,...,...,...
3678,RVRRFWPLVPVAINTVAAGINLYKAIRRK,-0.154902,Staphylococcus haemolyticus
3679,GLVTSLIKGAGKLLGGLFGSVTG,0.795880,Staphylococcus haemolyticus
3680,EDWNHLGAAVHTLKHVYK,1.781212,Staphylococcus haemolyticus
3681,GLVTGLLKTAGKLLGDLFGSLSG,1.397940,Staphylococcus haemolyticus


### PeptideAtlas

In [9]:
Peptideatlas_df = pd.read_csv(os.path.join(data_path,"peptide_atlas_90_2024_02.csv"))
Peptideatlas_df

Unnamed: 0,sequence,organism,length,cluster
0,KVDTHVHHSACMNQK,Zea mays,15,180
1,ICLDILKDKWSPALQIR,Zea mays,17,305
2,AGATAVVPETLEPSLQLAAAVLAQAK,Zea mays,26,615
3,AVAGEAGVPFFSCAASEFVELFVGVGASR,Zea mays,29,631
4,AVGVSNYSEKR,Zea mays,11,632
...,...,...,...,...
1725296,IIPDDILDFILVAFPLAILGAR,Streptococcus pneumoniae D39,22,5909192
1725297,RNEILQLLMLEPTFALLDEIDSGLDIDALKVVSK,Streptococcus pneumoniae D39,34,5909193
1725298,IQLIITLIHEPDLIILDEPFSGLDPVNTELLK,Streptococcus pneumoniae D39,32,5909194
1725299,ELLQQMAGLGLLDEVINIILLLTFNK,Streptococcus pneumoniae D39,26,5909196


### HemoDL (GPU 사용)

In [10]:
import sys
from features import fs_encode
import lightgbm as lgb
import numpy as np
import torch
from transformers import T5Tokenizer, T5Model,T5EncoderModel
import re
from Bio import SeqIO
import argparse

model_esm, alphabet = torch.hub.load("facebookresearch/esm:main", "esm2_t33_650M_UR50D")
model_esm = model_esm.to(device)

tokenizer = T5Tokenizer.from_pretrained('Rostlab/prot_t5_xl_uniref50',do_lower_case=False)
model_t5 = T5EncoderModel.from_pretrained("Rostlab/prot_t5_xl_uniref50").to(device)

model_fs = lgb.Booster(model_file="../../Hemolysis_predictor/source/models/model.fs")
model_tr = lgb.Booster(model_file="../../Hemolysis_predictor/source/models/model.transformer")

Using cache found in /scratch/slurm-biillab/juntae/.cache/torch/hub/facebookresearch_esm_main
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


### Species select

In [11]:
from MIC_predictor import get_features, RegressionModel
from Utils import get_classify, classify_AMP, ClassificationModel
genome_features = torch.load(gpt_conf.genome_feature_path)
species_35 = pd.read_csv(gpt_conf.species_path)
species = species_35['species'].unique()
genome_feats = get_features([species[0]], genome_features)

In [12]:
reg_model = RegressionModel(hidden_feat = 256, pooling = 'CLS')
reg_model.load_state_dict(torch.load('../../MIC_predictor/pepESM_90_35_species_500.pth',map_location=device),strict=False)
reg_model.to(device)

Some weights of EsmModel were not initialized from the model checkpoint at ../../MIC_predictor/pepESM_90_50K and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RegressionModel(
  (bert): EsmModel(
    (embeddings): EsmEmbeddings(
      (word_embeddings): Embedding(33, 480, padding_idx=1)
      (dropout): Dropout(p=0.0, inplace=False)
      (position_embeddings): Embedding(1026, 480, padding_idx=1)
    )
    (encoder): EsmEncoder(
      (layer): ModuleList(
        (0-11): 12 x EsmLayer(
          (attention): EsmAttention(
            (self): EsmSelfAttention(
              (query): Linear(in_features=480, out_features=480, bias=True)
              (key): Linear(in_features=480, out_features=480, bias=True)
              (value): Linear(in_features=480, out_features=480, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
              (rotary_embeddings): RotaryEmbedding()
            )
            (output): EsmSelfOutput(
              (dense): Linear(in_features=480, out_features=480, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (LayerNorm): LayerNorm((480,), eps=1e-05, elementwi

In [13]:
cls_model = ClassificationModel(hidden_feat = 256, pooling = 'CLS')
cls_model.load_state_dict(torch.load('../../AMP_classifier/LMPred.pth',map_location=gpt_conf.device),strict=False)
cls_model.to(device)

Some weights of EsmModel were not initialized from the model checkpoint at ../../MIC_predictor/pepESM_90_50K and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ClassificationModel(
  (bert): EsmModel(
    (embeddings): EsmEmbeddings(
      (word_embeddings): Embedding(33, 480, padding_idx=1)
      (dropout): Dropout(p=0.0, inplace=False)
      (position_embeddings): Embedding(1026, 480, padding_idx=1)
    )
    (encoder): EsmEncoder(
      (layer): ModuleList(
        (0-11): 12 x EsmLayer(
          (attention): EsmAttention(
            (self): EsmSelfAttention(
              (query): Linear(in_features=480, out_features=480, bias=True)
              (key): Linear(in_features=480, out_features=480, bias=True)
              (value): Linear(in_features=480, out_features=480, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
              (rotary_embeddings): RotaryEmbedding()
            )
            (output): EsmSelfOutput(
              (dense): Linear(in_features=480, out_features=480, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (LayerNorm): LayerNorm((480,), eps=1e-05, eleme

### LoRA PeptideAtlas Pretraining

In [16]:
os.makedirs('../../ckpt/Pretrain/Peptide_pretrain/', exist_ok = True)
Pep_pretrained_path = '../../ckpt/Pretrain/Peptide_pretrain/Best_perplexity_pretrained_model.ckpt'
ckpt_path = Pep_pretrained_path

In [17]:
from GPT_modules import GPTGeneratorConfig, BaseGPTWrapper, GPTGenerator

conf = GPTGeneratorConfig(gpt_conf=GPTConfig(voc = vocab), ckpt_path = ckpt_path, lr_mult=0.95)
basegpt = BaseGPTWrapper(conf.gpt_conf)
generator = GPTGenerator(basegpt, conf)
generator.config.ckpt_path = ckpt_path
generator.prog_num = 0

for param in generator.base_gpt.gpt.parameters():
    param.requires_grad = True
for target_module in generator.base_gpt.gpt.lora_layers:
    for param in generator.base_gpt.gpt.lora_layers[target_module].parameters():
        param.requires_grad = True

In [18]:
from Utils import count_parameters
total_params, trainable_params = count_parameters(generator.base_gpt.gpt)
print(f'trainable parameter : {trainable_params}, total parameter : {total_params}, ratio : {trainable_params / total_params}')

trainable parameter : 95172127, total parameter : 95172127, ratio : 1.0


In [19]:
train_pepAtlas_df = Peptideatlas_df.sample(frac=0.96, random_state=42)
test_pepAtlas_df = Peptideatlas_df.drop(train_pepAtlas_df.index)

In [22]:
from Tokenize_modules import StringDataset
dataset = StringDataset(voc = vocab, peptide_tokenizer = peptide_tokenizer, strings = train_pepAtlas_df.sequence.unique().tolist())
valid_dataset = StringDataset(voc = vocab, peptide_tokenizer = peptide_tokenizer, strings = test_pepAtlas_df.sequence.unique().tolist())

In [23]:
from tqdm import tqdm
set_randomness()
pd.options.display.max_colwidth = 999
SEED = 2021
generator.batch_size = gpt_conf.batch_size
generator.train_n_epochs(dataset, valid_dataset, epochs=100, save_period=1, debug=None, save_path = Pep_pretrained_path)

- epoch: 1  - progress: 0


2it [00:01,  1.64it/s]


-- epoch loss: tensor(58.5400)
-- Training PPL: tensor(17.8838)
-- Validation PPL: tensor(16.2600)
model saved to:  ../../ckpt/Pretrain/Peptide_pretrain/temp_/Best_perplexity_pretrained_model.ckpt
Epoch :  0
New best model saved with perplexity: 16.260019302368164 at epoch 1
- epoch: 2  - progress: 1


2it [00:01,  1.64it/s]


-- epoch loss: tensor(57.9217)
-- Training PPL: tensor(17.2722)
-- Validation PPL: tensor(15.8418)
model saved to:  ../../ckpt/Pretrain/Peptide_pretrain/temp_/Best_perplexity_pretrained_model.ckpt
Epoch :  1
New best model saved with perplexity: 15.841769218444824 at epoch 2
- epoch: 3  - progress: 2


0it [00:00, ?it/s]


KeyboardInterrupt: 

In [25]:
AMP_pretrained_path = '../../ckpt/Pretrain/Peptide_pretrain/Best_perplexity_pretrained_model.ckpt'
ckpt_path = AMP_pretrained_path

In [26]:
with torch.no_grad():
    conf = GPTGeneratorConfig(gpt_conf=GPTConfig(voc = vocab),ckpt_path = ckpt_path,lr_mult=0.95)
    basegpt = BaseGPTWrapper(conf.gpt_conf)
    generator = GPTGenerator(basegpt, conf)
    generator = generator.construct_by_ckpt_dict(GPTConfig(voc = vocab), torch.load(AMP_pretrained_path),vocab)

    for param in generator.base_gpt.gpt.parameters():
        param.requires_grad = False
    for target_module in generator.base_gpt.gpt.lora_layers:
        for param in generator.base_gpt.gpt.lora_layers[target_module].parameters():
            param.requires_grad = True
    generator.base_gpt.gpt.eval()
    sampled = generator.sample_decode(ssize=1000, msl=50, bs=20)
    display(sampled)
generator.base_gpt.gpt.train()
generate = list(set(sampled))



['FFAGISDKYKL',
 'LTSPKQRDQEEIAKE',
 'ATAEIEAPTQWPEK',
 'ADPYKECIEANDVK',
 'KLFINEMQELF',
 'DKYYAGDV',
 'PGKQLIITPIGRSEKG',
 'GGRIKDRTPLTKGPEPISIFDHRPVEAERAHK',
 'REKPIDLLSYQGEHETNVEPLRGMYR',
 'HDLKPKRTKGNTRYGLGNIVTVQHLLLSDQTAHSIY',
 'LEAGGESALATQPRSDG',
 'NFEIEAEFPEKDTERVIRRHFLGIRERFQGDE',
 'GPGKD',
 'SSNPM',
 'EQQVDVKAY',
 'PIHGPANQESQGKRKKINQQGPLFFG',
 'GVVSPTGGGKHVG',
 'NKDTLN',
 'VLGTQRDVQ',
 'ELTQVERRYFQLAPGNLETENLQEQGK',
 'NIEWPLG',
 'LVTTPSATPLDGRTVEEATEAAKWMEEQREIFIPNILNITRSGQYTR',
 'LQRVEVGSAVEAF',
 'TRYEEERPATPLMP',
 'ASKLG',
 'QIISQGVDVAWFGPL',
 'PGADRGAQYDT',
 'GVGGKETNELKYQTGKQ',
 'GPLLSETAQLLKR',
 'GDLTLPS',
 'TREEQFSPGYIKKYGTDHQ',
 'LMFAGNGQVLGQTIYHVENP',
 'ENYKVNYIDGQLNVE',
 'HTQAVRKELE',
 'EEFEGPNP',
 'GAGRGPKEGPLRLLFQTEKRG',
 'VLDGPGYCLDKDREE',
 'KTAWDNRCPD',
 'QPPVQDERDHASNTY',
 'RPGYRLQLKIICAIVEKS',
 'DGRGIL',
 'WSVQGKRGEK',
 'VRRENRIALHETAKSRKIKVLYRDRHIA',
 'TQNKNTGARGIDRTKLPDSP',
 'TMCESH',
 'KTGGIFNRFGLEMGRGLADSGADPF',
 'PQARIAQPHAFYSM',
 'ANKGPYRPLK',
 'FTFTLT'

### LoRA AMP Finetuning

In [39]:
os.makedirs('../../ckpt/Pretrain/AMP_pretrain/', exist_ok = True)
AMP_pretrained_path = '../../ckpt/Pretrain/Peptide_pretrain/Best_perplexity_pretrained_model.ckpt'
AMP_finetuned_path = '../../ckpt/Pretrain/AMP_pretrain/Finetune_Pareto_ckpt20.ckpt'
ckpt_path = AMP_finetuned_path

In [40]:
from GPT_modules import GPTGeneratorConfig, BaseGPTWrapper, GPTGenerator

conf = GPTGeneratorConfig(gpt_conf=GPTConfig(voc = vocab),ckpt_path = ckpt_path, lr_mult=0.95)
basegpt = BaseGPTWrapper(conf.gpt_conf)
generator = GPTGenerator(basegpt, conf)
generator = generator.construct_by_ckpt_dict(GPTConfig(voc = vocab), torch.load(AMP_pretrained_path),vocab)
generator.config.ckpt_path = ckpt_path
generator.prog_num = 0

for param in generator.base_gpt.gpt.parameters():
    param.requires_grad = True
for target_module in generator.base_gpt.gpt.lora_layers:
    for param in generator.base_gpt.gpt.lora_layers[target_module].parameters():
        param.requires_grad = True

In [41]:
total_params, trainable_params = count_parameters(generator.base_gpt.gpt)
print(f'trainable parameter : {trainable_params}, total parameter : {total_params}, ratio : {trainable_params / total_params}')

trainable parameter : 95172127, total parameter : 95172127, ratio : 1.0


In [42]:
AMP_train_seqs = [seq.strip() for seq in AMP_train_df.sequence.unique().tolist()[:500]]
AMP_val_seqs = [seq.strip() for seq in AMP_val_df.sequence.unique().tolist()[:500]]

In [43]:
dataset = StringDataset(voc = vocab, peptide_tokenizer = peptide_tokenizer, strings = AMP_train_seqs)
valid_dataset = StringDataset(voc = vocab, peptide_tokenizer = peptide_tokenizer, strings = AMP_val_seqs)

In [44]:
generator.train_n_epochs(dataset, valid_dataset, epochs=100, save_period=5, debug=None, save_path = AMP_finetuned_path)

- epoch: 1  - progress: 0


2it [00:01,  1.59it/s]


-- epoch loss: tensor(60.3531)
-- Training PPL: tensor(21.0615)
-- Validation PPL: tensor(20.3879)
model saved to:  ../../ckpt/Pretrain/AMP_pretrain/temp_/Finetune_Pareto_ckpt20.ckpt
Epoch :  0
New best model saved with perplexity: 20.387868881225586 at epoch 1
- epoch: 2  - progress: 1


2it [00:01,  1.71it/s]


-- epoch loss: tensor(58.9394)
-- Training PPL: tensor(19.6680)
-- Validation PPL: tensor(17.8363)
model saved to:  ../../ckpt/Pretrain/AMP_pretrain/temp_/Finetune_Pareto_ckpt20.ckpt
Epoch :  1
New best model saved with perplexity: 17.83629035949707 at epoch 2
- epoch: 3  - progress: 2


2it [00:01,  1.75it/s]


-- epoch loss: tensor(56.8977)
-- Training PPL: tensor(17.6966)
-- Validation PPL: tensor(17.1460)
model saved to:  ../../ckpt/Pretrain/AMP_pretrain/temp_/Finetune_Pareto_ckpt20.ckpt
Epoch :  2
New best model saved with perplexity: 17.145999908447266 at epoch 3
- epoch: 4  - progress: 3


1it [00:01,  1.14s/it]


KeyboardInterrupt: 

In [38]:
with torch.no_grad():
    conf = GPTGeneratorConfig(gpt_conf=GPTConfig(voc = vocab),ckpt_path = '',lr_mult=0.95)
    basegpt = BaseGPTWrapper(conf.gpt_conf)
    generator = GPTGenerator(basegpt, conf)
    generator = generator.construct_by_ckpt_dict(GPTConfig(voc = vocab), torch.load(AMP_finetuned_path),vocab)

    for param in generator.base_gpt.gpt.parameters():
        param.requires_grad = False
    for target_module in generator.base_gpt.gpt.lora_layers:
        for param in generator.base_gpt.gpt.lora_layers[target_module].parameters():
            param.requires_grad = True
    generator.base_gpt.gpt.eval()
    sampled = generator.sample_decode(ssize=1000, msl=50, bs=20)
    display(sampled)
generator.base_gpt.gpt.train()
generate = list(set(sampled))

RuntimeError: PytorchStreamReader failed locating file data/8: file not found