In [7]:
from generation_manager import GeneratorManager

In [8]:
# Checkpoint to load: for this example, a ProGen model fine-tuned on phage lysozymes.
model_path = 'ckpt/training_ckpt_4/model_only_state_dict_v0Last_lr0001.pth'

# Build the generator manager; this class loads the model and the tokenizer in memory.
# Only top-k is overridden here (top-k = 2); the penalty keeps its default value (0).
generator = GeneratorManager(
    model_path,
    topk=2,
)

MODEL SIZE: 
1280
Found PyTorch checkpoint at  ckpt/training_ckpt_4/model_only_state_dict_v0Last_lr0001.pth
GPU aviable. Previous checkpoint loaded in GPU


In [9]:
# Conditioning keywords given as input: 0 is the phage lysozyme keyword defined during fine-tuning.
keywords = [0]

# Reference sequence: the lysozyme Q37875 (source: https://www.uniprot.org/uniprotkb/Q37875/entry)
sequence = "MKGKTAAGGGAICAIAVMITIVMGNGNVRTNQAGLELIGNAEGCRRDPYMCPAGVWTDGIGNTHGVTPGVRKTDQQIAADWEKNILIAERCINQHFRGKDMPDNAFSAMTSAAFNMGCNSLRTYYSKARGMRVETSIHKWAQKGEWVNMCNHLPDFVNSNGVPLRGLKIRREKERQLCLTGLVNE"

# Number of leading amino acids of the reference sequence given to the model as input.
prefix = 10

# Generation step: with after_n_generation we generate up to the real protein length.
res, tokens_prob, offset = generator.after_n_generation(
    sequence,
    keywords,
    prefix,
)


In [10]:
# Summarize the generation and compare the output with the natural protein.
# NOTE(review): the message reports `offset`, while the comparison slices with `prefix`;
# both are 10 in the recorded run — confirm they always coincide.
natural_tail = sequence[prefix:]  # part of the natural protein that follows the prefix

print('ProGen generated a protein with a prefix of ', offset, 'amino acids.')
# One call, newline-separated, prints each label followed by its sequence on its own line.
print('Generated protein:', res, 'Actual protein:', natural_tail, sep='\n')
print('Is the predicted protein equal to the real one?', res == natural_tail)


ProGen generated a protein with a prefix of  10 amino acids.
Generated protein:
AICAIAVMITIVMGNGNVRTNQAGLELIGNAEGCRRDPYMCPAGVWTDGIGNTHGVTPGVRKTDQQIAADWEKNILIAERCINQHFRGKDMPDNAFSAMTSAAFNMGCNSLRTYYSKARGMRVETSIHKWAQKGEWVNMCNHLPDFVNSNGVPLPGLKIRREKERQLCLTGLVNE
Actual protein:
AICAIAVMITIVMGNGNVRTNQAGLELIGNAEGCRRDPYMCPAGVWTDGIGNTHGVTPGVRKTDQQIAADWEKNILIAERCINQHFRGKDMPDNAFSAMTSAAFNMGCNSLRTYYSKARGMRVETSIHKWAQKGEWVNMCNHLPDFVNSNGVPLRGLKIRREKERQLCLTGLVNE
Is the predicted protein equal to the real one? False


In [11]:
# Collect the positions where the generated amino acid differs from the natural one.
# Indices count from the end of the prefix (0 is the first position after it), and each
# entry has the form (index, [natural amino acid, generated amino acid]).
differing_idx = [
    (pos, [natural_aa, generated_aa])
    for pos, (natural_aa, generated_aa) in enumerate(zip(sequence[prefix:], res))
    if natural_aa != generated_aa
]
print(differing_idx)

[(154, ['R', 'P'])]


In [12]:
# For every mismatching index, report the probabilities output by ProGen for the
# natural amino acid and for the one that was actually generated.
from tokenizer import Tokenizer
tokenizer = Tokenizer()
for position, (true_aa, predicted_aa) in differing_idx:
    # presumably the per-amino-acid probabilities at this position — TODO confirm layout
    position_probs = tokens_prob[position][0]
    # aa_to_probs_index maps an amino acid letter to its index in position_probs
    true_prob = position_probs[tokenizer.aa_to_probs_index[true_aa]]
    predicted_prob = position_probs[tokenizer.aa_to_probs_index[predicted_aa]]
    print(f'The predicted sequence in index {position},'
    f' has probabilities for true aa {true_aa} of {true_prob:.3f},'
    f' and for predicted aa {predicted_aa} of {predicted_prob:.3f}')


The predicted sequence in index 154, has probabilities for true aa R of 0.580, and for predicted aa P of 0.390
