In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import pandas as pd
import numpy as np
import random
import transformers
from transformers import Trainer, Trainer, TrainingArguments, AdamW, Adafactor, AutoTokenizer, AutoModelForCausalLM
import requests
from peft import PeftConfig, PeftModel, AutoPeftModelForCausalLM

import torch
torch.cuda.is_available()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

  from .autonotebook import tqdm as notebook_tqdm


cuda


In [2]:
# choose target species to generate AMP sequences for
species_df = pd.read_csv('/data/hansol/jroot/data_DBAASP/Genomes_35/species.csv')
species_list = np.array(species_df['Species Name'])
ind_species = 0
species = species_list[ind_species]
species

'Escherichia coli'

In [3]:
# input the best set of hyperparameters for the chosen target species
r = '32'
alpha = '32'

tokenizer = AutoTokenizer.from_pretrained('/data/hansol/jroot/ProtGPT2/', do_lower_case = False)
base_model_path = '/data/hansol/jroot/ProtGPT2'
lora_model_path = '/data/hansol/jroot/amp_loragen/'+species+'_r'+r+'_alpha'+alpha
config = PeftConfig.from_pretrained(lora_model_path)
base_model = AutoModelForCausalLM.from_pretrained(base_model_path)

# load the LoRA adapter weights for the chosen target species
model = PeftModel.from_pretrained(base_model, lora_model_path)
model = model.to(device)
model = model.eval()

In [4]:
# generation hyperparameters
max_length = 100
min_length = 10
temperature = 1.0
k = 50
p = 1.0
repetition_penalty = 1.2
num_return_sequences = 1

In [5]:
# choose number of sequences to generate
num_gen = 20

# initialize empty numpy array to store generated sequences
gen_seq = np.empty([num_gen,], dtype='<U' + str(max_length-1))

for i in range(num_gen):
    # start sequence with blank space for generation
    sequences_Example = ' '
    ids = tokenizer.encode(sequences_Example, add_special_tokens=False)
    input_ids = torch.tensor(ids).unsqueeze(0).to(device)
            
    output_ids = model.generate(
        input_ids=input_ids,
        max_length=max_length,
        min_length=min_length,
        temperature=temperature,
        top_k=k,
        top_p=p,
        repetition_penalty=repetition_penalty,
        do_sample=True,
        num_return_sequences=num_return_sequences,
        pad_token_id=tokenizer.eos_token_id
    )
    
    output_sequence = tokenizer.decode(output_ids[0], skip_special_tokens = True)
    
    # convert array of output tokens to string in single letter residues
    output_seq = ''
    output_seq += output_sequence[1:]

    # delete newline characters
    output_seq = output_seq.replace('\n', '')
    gen_seq[i] = output_seq
    print(i, output_seq)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


0 IRFSPKLRWRPRWF
1 RRFWKRLFRRFGPKILGVAGAVLKALPFPPIK
2 VFALLILLILLIFLRKKK
3 GLLKILKSIKKAAKKVIKAV
4 FGGSWGGSGGSSGGWWKH
5 KEGWLAWWKKGAKKVVHKAAHVVGKAF
6 INTRKLLKLFKRFLKKG
7 VRLLLRRGIRILKKFRKIFWKS
8 KFGPWFWRRWRWRRPRPWWC
9 SGVWKLIKKLFRMVMDAT
10 ISKCGLKISKFSFRIKCG
11 FYIGKWGDKTFKWHWKKPHHW
12 KLLPWWKKILKKIFKILKKL
13 INLLLNNFKLIWKGLKRLFS
14 RRRRLRWLGHVLRRPPNYIPRGVLFWKWRYVKRRPRTWRPRY
15 KNPKQRNPKWRKPKFRKY
16 ITPWWKVFKKLFGKATGKAVKALANVAGKSVAA
17 ARYKAYAKKPAKKKRWLSKLFKK
18 KNSWKKIGKAVKKAGKKIAKAVAKGAAQVVSQIK
19 KSWIKSKFRKLKKLVKKKLKSLIS


In [6]:
# create dataframe of generated sequences
gen_seq_df = pd.DataFrame(columns = ('sequence', 'length'))
gen_seq_df.sequence = gen_seq

# calculate length of each sequence
for i in range(len(gen_seq)):
    gen_seq_df.length[i] = len(gen_seq_df.sequence[i])

# remove sequences with length > 100, if they exist
gen_seq_df = gen_seq_df[gen_seq_df.length < 101]
print(len(gen_seq_df))

# drop sequences with “B” residue (asparagine OR aspartic acid) or unknown 'X'
drop_list = []
for ind, row in gen_seq_df.iterrows():
    if ('B' in row['sequence']) or ('X' in row['sequence']):
        drop_list.append(ind)
print(len(drop_list))

gen_seq_df.drop(drop_list, inplace=True)
gen_seq_df

20
0


Unnamed: 0,sequence,length
0,IRFSPKLRWRPRWF,14
1,RRFWKRLFRRFGPKILGVAGAVLKALPFPPIK,32
2,VFALLILLILLIFLRKKK,18
3,GLLKILKSIKKAAKKVIKAV,20
4,FGGSWGGSGGSSGGWWKH,18
5,KEGWLAWWKKGAKKVVHKAAHVVGKAF,27
6,INTRKLLKLFKRFLKKG,17
7,VRLLLRRGIRILKKFRKIFWKS,22
8,KFGPWFWRRWRWRRPRPWWC,20
9,SGVWKLIKKLFRMVMDAT,18


In [7]:
# save generated sequences as a csv file
gen_seq_df.to_csv('/data/hansol/jroot/amp_loragen/generated_sequences.csv', header = True)