# Model A: Protein Generator Diffusion model conditioned on overall secondary structure content

**Citing this work**

Any publication that discloses findings arising from using this notebook should cite the following work.

B. Ni, D.L. Kaplan, M.J. Buehler, Generative design of de novo proteins based on secondary structure constraints using an attention-based diffusion model, Chem, 2023

In [2]:
import torch
import os

In [2]:
# Count the visible CUDA devices once and reuse the number below.
num_of_gpus = torch.cuda.device_count()

# Use the first GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# One torch.cuda.device handle per visible GPU (empty list on CPU-only hosts).
available_gpus = list(map(torch.cuda.device, range(num_of_gpus)))

print('# of GPUs: ', num_of_gpus)

# # for debug
# device='cpu'

# of GPUs:  1


In [3]:
# Working directory for everything this notebook downloads or writes
# (checkpoints, tokenizer data, generated output).
prefix='Local_Store/output_model_A/'
if not os.path.exists(prefix):
    print('Create new folder for Model A')
    # The path is nested: os.mkdir would raise FileNotFoundError when the
    # parent "Local_Store" folder is missing, so create the whole chain.
    os.makedirs(prefix, exist_ok=True)

#### Define model

In [4]:
# --- Dataset settings -------------------------------------------------------
# ynormfac is forwarded to sample_sequence() further down; the remaining
# values are dataset-loading knobs (presumably batch size, sequence length
# bounds and a cap on the number of records -- confirm against the loader).
ynormfac = 22.0
batch_size_ = 512
min_length, max_length = 0, 64
number = 99999999999999999

# --- Model dimensions -------------------------------------------------------
# cond_dim and embed_dim_position are combined when the model is built
# (text_embed_dim = cond_dim - embed_dim_position).
embed_dim_position, pred_dim, cond_dim = 128, 1, 512


In [5]:
from ProteinDiffusionGenerator.transformer_Model_A import ProteinDesigner_A, params

2023-04-23 22:49:02.289661: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
# Build the Model A designer and move it to the selected device.
model_A = ProteinDesigner_A(
    timesteps=(96),
    dim=768,
    pred_dim=pred_dim,
    loss_type=0,
    elucidated=True,
    padding_idx=0,
    cond_dim=cond_dim,
    # The text embedding width is whatever is left of cond_dim after the
    # positional embedding, so both arguments must come from the same
    # embed_dim_position value (previously one of them was a literal 128).
    text_embed_dim=cond_dim - embed_dim_position,
    embed_dim_position=embed_dim_position,
    max_text_len=8,
    device=device,
).to(device)

Text conditioning is equal to cond_dim - no linear layer used
768 1
Loss type:  0
Channels in=1, channels out=1


In [7]:
# Print total / trainable parameter counts, first for the full designer and
# then for its first U-Net only.
for counted_module in (model_A, model_A.imagen.unets[0]):
    params(counted_module)

Total parameters:  2206419538  trainable parameters:  2206419538
Total parameters:  2206417042  trainable parameters:  2206417042


In [8]:
#@title ####download the trained models...
#@markdown ### If it's the 1st run, this may take tens of minutes

# Download the trained Model A checkpoints into `prefix` unless a copy is
# already cached there.

model_weight_file_final = prefix+'Model_A_final.pt'
model_weight_file_early = prefix+'Model_A_early.pt'

# (local target path, remote URL) for every checkpoint this notebook can load.
checkpoint_downloads = [
    (model_weight_file_final, "https://www.dropbox.com/s/r79cf7uo80z3v15/Model_A_final.pt"),
    (model_weight_file_early, "https://www.dropbox.com/s/mz4afbfs0da4vb2/Model_A_early.pt"),
]

for checkpoint_path, checkpoint_url in checkpoint_downloads:
    file_exists = os.path.exists(checkpoint_path)
    if not file_exists:
        # download things
        print(os.popen(f"wget {checkpoint_url} -P {prefix}").read())
        # wget failures are otherwise silent here; report them now instead of
        # letting the checkpoint-loading cell fail with a less obvious error.
        if not os.path.exists(checkpoint_path):
            print(f"WARNING: download failed, {checkpoint_path} is still missing")

print('Done')

Done


In [9]:
# load the model
which_checkpoint_to_load = 'final' #@param ["final", "earlystopping"]

if which_checkpoint_to_load == 'final':
    fname=f"{prefix}Model_A_final.pt"  #Final checkpoint
else:
    fname=f"{prefix}Model_A_early.pt"  #Early stopping checkpoint

# Always remap the stored tensors onto the active device. `device` is a
# torch.device object, so comparing it with the string "cpu" is not a reliable
# way to detect a CPU-only run; without map_location a checkpoint saved from a
# GPU cannot be deserialized on a machine that has no CUDA.
model_A.load_state_dict(torch.load(fname, map_location=device))

: 

: 

### Inference

In [3]:
# Fetch (if needed) and unpickle the tokenizer data used when sampling.
import pickle

tokenizer_file = f"{prefix}Model_A_tokenizers.dat"

# Source: https://github.com/Bo-Ni/ProteinDiffusionGenerator_Colab/raw/main/Model_A_tokenizers.dat
file_exists = os.path.exists(tokenizer_file)
if not file_exists:
    # Not cached locally yet: pull it into `prefix` with wget.
    print(os.popen(F"wget https://github.com/Bo-Ni/ProteinDiffusionGenerator_Colab/raw/main/Model_A_tokenizers.dat -P {prefix}").read())

# NOTE(review): pickle.load executes arbitrary code from the file; only use
# this with the tokenizer file from the trusted project repository.
with open(tokenizer_file, mode="rb") as f:
    tokenizer_data_1 = pickle.load(f)

# The first entry of the pickled sequence is the tokenizer handed to sample_sequence().
tokenizer_y = tokenizer_data_1[0]

--2023-04-23 23:08:05--  https://github.com/Bo-Ni/ProteinDiffusionGenerator_Colab/raw/main/Model_A_tokenizers.dat
Resolving github.com (github.com)... 140.82.113.3
Connecting to github.com (github.com)|140.82.113.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/Bo-Ni/ProteinDiffusionGenerator_Colab/main/Model_A_tokenizers.dat [following]
--2023-04-23 23:08:06--  https://raw.githubusercontent.com/Bo-Ni/ProteinDiffusionGenerator_Colab/main/Model_A_tokenizers.dat
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 861 [application/octet-stream]
Saving to: ‘Model_A_tokenizers.dat’

     0K                                                       100% 12.6M=0s

2023-04-23 23:08:06 (12.6 MB/s) - ‘Model_A_t




2023-04-23 23:08:11.067093: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
from ProteinDiffusionGenerator.transformer_Model_A import sample_sequence
import numpy as np

In [None]:
#@title ####Example 2: input vectors

#### Generate candidates

# Conditioning vector written as text so it can be edited as a form field.
# Presumably the eight secondary-structure content fractions -- confirm the
# entry order against the training data description.
vec_1 = '[0, 0.7, 0.07, 0.1, 0.01, 0.02, 0.01, 0.11]'

# Parse the bracketed, comma-separated text into a flat float array.
# (np.matrix was used for this before; NumPy no longer recommends that class.)
in_vec_1 = np.array(
    [float(entry) for entry in vec_1.strip().strip('[]').split(',')]
)

norm_flag = False
flag_ref=40000

print('Working on vec_1:')
sample_sequence (model_A,
                 X=[in_vec_1],
                 normalize_input_to_one=norm_flag,
                 flag=flag_ref,cond_scales=1.,foldproteins=True,calc_error=True,
                 # +++++++++++++++++++++++++++++++++++++++=
                 prefix=prefix,
                 ynormfac=ynormfac,
                 tokenizer_y=tokenizer_y,
                )

In [11]:
# model_A

ProteinDesigner_A(
  (fc_embed1): Linear(in_features=8, out_features=64, bias=True)
  (fc_embed2): Linear(in_features=1, out_features=384, bias=True)
  (pos_emb_x): Embedding(9, 128)
  (imagen): ElucidatedImagen(
    (lowres_noise_schedule): GaussianDiffusionContinuousTimes()
    (unets): ModuleList(
      (0): OneD_Unet(
        (init_conv): Conv1d(1, 768, kernel_size=(7,), stride=(1,), padding=(3,))
        (to_time_hiddens): Sequential(
          (0): LearnedSinusoidalPosEmb()
          (1): Linear(in_features=17, out_features=3072, bias=True)
          (2): SiLU()
        )
        (to_time_cond): Sequential(
          (0): Linear(in_features=3072, out_features=3072, bias=True)
        )
        (to_time_tokens): Sequential(
          (0): Linear(in_features=3072, out_features=1024, bias=True)
          (1): Rearrange('b (r d) -> b r d', r=2)
        )
        (norm_cond): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (attn_pool): PerceiverResampler(
          (pos_

In [12]:
# Sanity check: display the device holding the model's first parameter.
next(model_A.parameters()).device

device(type='cpu')