<a target="_blank" href="https://colab.research.google.com/github/Bo-Ni/ProteinMechanicsDiffusionDesign_pLDM/blob/main/notebook_for_colab/pLDM_inferring_standalong_colab.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

## 0. Preparation:
### 0.1. Add packages

In [17]:
# !python --version

In [2]:
# !apt-get install python3.9

In [3]:
# import sys
# sys.path.pop(0)
# sys.path.insert(0, '/usr/bin/python3.9')

In [4]:
import os,sys
import math

#os.environ["CUDA_VISIBLE_DEVICES"] = "-1" #turn off CUDA if needed
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

path_1 = '/opt/bin/'
dssp_file = path_1+'mkdssp'

file_exists = os.path.exists(dssp_file)
if not (file_exists):
  print('\033[1;32m For the 1st run, ')
  # ==============================================
  print('a. Install omegafold...')
  # install omegafold
  # ref: https://github.com/HeliXonProtein/OmegaFold
  !pip install git+https://github.com/HeliXonProtein/OmegaFold.git

  # time-consuming step:
  # Downloading weights from https://helixon.s3.amazonaws.com/release1.pt to /root/.cache/omegafold_ckpt/model.pt
  !mkdir /root/.cache/omegafold_ckpt
  !wget https://helixon.s3.amazonaws.com/release1.pt -O /root/.cache/omegafold_ckpt/model.pt

  print('b. Install DSSP...')
  # download an mkdssp
  # ==============================================
  # download things
  print(os.popen(f"wget https://www.dropbox.com/s/v4azy9z9yojg1c6/mkdssp -P {path_1}").read())
  #
  !chmod u+x /opt/bin/mkdssp

else:
  print('This is not the first run... ')

[1;32m For the 1st run, 
a. Install omegafold...
Collecting git+https://github.com/HeliXonProtein/OmegaFold.git
  Cloning https://github.com/HeliXonProtein/OmegaFold.git to /tmp/pip-req-build-pn2hpkxv
  Running command git clone --filter=blob:none --quiet https://github.com/HeliXonProtein/OmegaFold.git /tmp/pip-req-build-pn2hpkxv
  Resolved https://github.com/HeliXonProtein/OmegaFold.git to commit 313c873ad190b64506a497c926649e15fcd88fcd
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting torch@ https://download.pytorch.org/whl/cu113/torch-1.12.0%2Bcu113-cp310-cp310-linux_x86_64.whl (from OmegaFold==0.0.0)
  Downloading https://download.pytorch.org/whl/cu113/torch-1.12.0%2Bcu113-cp310-cp310-linux_x86_64.whl (1837.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 GB[0m [31m989.4 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting biopython (from OmegaFold==0.0.0)
  Downloading biopython-1.81-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

In [13]:
# Probe each third-party dependency; when the import fails, install the
# matching PyPI distribution with pip and echo pip's output.

def _pip_install(distribution):
  """Run `pip install <distribution>` through the shell and print its stdout."""
  print(os.popen('pip install ' + distribution).read())


try:
  print('\033[1;32m a. on Biopython...')
  from Bio.PDB import PDBParser
except ImportError:
  _pip_install('biopython')

try:
  print('\033[1;32m b. on kornia...')
  import kornia.augmentation
except ImportError:
  _pip_install('kornia')

try:
  print('\033[1;32m c. on einops...')
  from einops import rearrange, repeat, reduce
except ImportError:
  _pip_install('einops')

try:
  from einops_exts import rearrange_many, repeat_many, check_shape
except ImportError:
  _pip_install('einops-exts')

try:
  import pytorch_warmup as warmup
except ImportError:
  _pip_install('pytorch-warmup')

try:
  from ema_pytorch import EMA
except ImportError:
  _pip_install('ema-pytorch')

try:
  from accelerate import Accelerator, DistributedType, DistributedDataParallelKwargs
except ImportError:
  _pip_install('accelerate')

try:
  import py3Dmol
except ImportError:
  _pip_install('py3Dmol')

# packages added later
try:
  import esm
except ImportError:
  _pip_install('fair-esm')

try:
  import torchinfo
except ImportError:
  _pip_install('torchinfo')

[1;32m a. on Biopython...
[1;32m b. on kornia...
[1;32m c. on einops...
Collecting torchinfo
  Downloading torchinfo-1.8.0-py3-none-any.whl (23 kB)
Installing collected packages: torchinfo
Successfully installed torchinfo-1.8.0



### 0.2. Copy the source code from GitHub

In [6]:
import json, time, os, sys, glob

# The repositories are used as plain scripts; nothing is pip-installed.

def _clone_if_missing(repo_url, repo_dir):
  """Clone `repo_url` into the current directory unless `repo_dir` already exists.

  A non-zero exit status from git is reported instead of being silently
  dropped, so a failed clone is visible before the imports further down fail.
  """
  if os.path.isdir(repo_dir):
    return
  status = os.system(f"git clone -q {repo_url}")
  if status != 0:
    print(f"WARNING: 'git clone {repo_url}' exited with status {status}")


_clone_if_missing("https://github.com/dauparas/ProteinMPNN.git", "ProteinMPNN")
# sys.path.append('/content/ProteinMPNN/')

# ===================================================================

_clone_if_missing(
  "https://github.com/Bo-Ni/ProteinMechanicsDiffusionDesign_pLDM.git",
  "ProteinMechanicsDiffusionDesign_pLDM",
)

# The clone lands in the current working directory, so derive the import path
# from it instead of assuming the kernel runs in /content (on Colab both agree).
pldm_repo_path = os.path.abspath("ProteinMechanicsDiffusionDesign_pLDM")
# Skip the append when the cell is re-run so sys.path does not grow duplicates.
if pldm_repo_path not in sys.path:
  sys.path.append(pldm_repo_path)

In [7]:
# a silent test: the cloned package modules should import without error
import ProteinMechanicsDiffusionDesign.UtilityPack as UtilityPack
import ProteinMechanicsDiffusionDesign.DataSetPack as DataSetPack
import ProteinMechanicsDiffusionDesign.ModelPack as ModelPack
import ProteinMechanicsDiffusionDesign.TrainerPack as TrainerPack
import ProteinMechanicsDiffusionDesign.PostMDPack as PostMDPack

### 0.3. Download the model files

In [65]:
# Fetch the trained model and the processed data pack (plain downloads, no install).
import subprocess

this_working_path = '/content/Trained_model/'

# File name -> Dropbox share link. `dl=1` asks Dropbox for the file itself
# rather than the preview page.
MODEL_FILE_LINKS = {
  'model_pack.pickle': 'https://www.dropbox.com/scl/fi/i2sull7ftjwrrzeaxo8v1/model_pack.pickle?rlkey=7wy5zynrl6m8azufklq3fy8ql&dl=1',
  'data_pack.pickle': 'https://www.dropbox.com/scl/fi/z7sz0q2nsjn85kyh68p86/data_pack.pickle?rlkey=bwm9fgf29ze8o516r155zg4gl&dl=1',
}

if not os.path.isdir(this_working_path):
  print('Creating working path...')
  os.makedirs(this_working_path, exist_ok=True)
  print('Done.')

for file_name, this_link in MODEL_FILE_LINKS.items():
  this_file = this_working_path+file_name
  if os.path.exists(this_file):
    # already downloaded by an earlier run
    continue

  print(f'Downloading {file_name} ...')
  # Pass the arguments as a list (no shell): the links contain '&' and '?',
  # which a shell would treat as a background operator / glob pattern.
  wget_run = subprocess.run(
    ['wget', '-nv', '-O', this_file, this_link],
    capture_output=True, text=True,
  )
  print(wget_run.stdout)

  downloaded_ok = (
    wget_run.returncode == 0
    and os.path.exists(this_file)
    and os.path.getsize(this_file) > 0
  )
  if not downloaded_ok:
    # `wget -O` creates the target even when the transfer fails; remove it so
    # the existence check above does not skip the download on the next run.
    if os.path.exists(this_file):
      os.remove(this_file)
    raise RuntimeError(
      f'Could not download {file_name} from {this_link}:\n{wget_run.stderr}'
    )



  # os.system("git clone -q https://github.com/dauparas/ProteinMPNN.git")
# sys.path.append('/content/ProteinMPNN/')





## 1. Working part

### 1.0. Check the environment

In [19]:
import os, sys

In [20]:
# Show where the kernel is running ...
current_dir_text = os.popen('pwd').read()
print('Here is : \n', current_dir_text)
# ... and what `nvidia-smi` reports about the GPU.
gpu_status_text = os.popen('nvidia-smi').read()
print('What we get in hardware: \n', gpu_status_text)

Here is : 
 /content

What we get in hardware: 
 Sun Oct  1 02:37:53 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   30C    P0    44W / 400W |      3MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+--------------------------------------

In [21]:
import torch

# Report the installed PyTorch build and the interpreter version.
torch_build = torch.__version__
print("What we have in software: \n Torch version:", torch_build)
python_build = sys.version
print('Python: ', python_build)

What we have in software: 
 Torch version: 1.12.0+cu113
Python:  3.10.12 (main, Jun 11 2023, 05:26:28) [GCC 11.4.0]


In [8]:
# NOTE(review): this cell repeats the software check of the preceding cell
# line for line (same statements, same output); it looks like a leftover from
# out-of-order execution - confirm whether it can be dropped.
import torch
# Print the PyTorch version string.
print("What we have in software: \n Torch version:", torch.__version__)
# Print the Python interpreter version string.
print('Python: ', sys.version) # no switch case code

What we have in software: 
 Torch version: 1.12.0+cu113
Python:  3.10.12 (main, Jun 11 2023, 05:26:28) [GCC 11.4.0]


In [22]:
# Choose the compute device: the first CUDA GPU when PyTorch can see one,
# otherwise the CPU; then report how many GPUs PyTorch counts.
print('What hardware the software see:')
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

num_of_gpus = torch.cuda.device_count()
print("# of GPU", num_of_gpus)

What hardware the software see:
cuda:0
# of GPU 1


In [23]:
# Ask PyTorch to release the GPU memory it is caching but not currently using.
torch.cuda.empty_cache()

### 1.1. Setup the problem

In [24]:
# prepare for package debugging
# for debug
import importlib
import json
import matplotlib.pyplot as plt

In [25]:
# import PD_pLMProbXDiff.UtilityPack as UtilityPack
# # run this when updating the package
# importlib.reload(UtilityPack)

In [30]:
# ===============================================
# Global control keys
# ===============================================
# This dictionary is the one place to edit when switching tasks.
CKeys = {
    # 1-local:engaging cluster, 2-supercloud cluster, 3-google colab, 4-local ubuntu
    'Running_Type': 2,
    # 1-training, 2-sampling for test
    'Working_Mode': 2,
    # 1-1st run of training; otherwise, # of training run
    'IF_FirstRun': 2,
    # 1-SecStr-ModelB, 2-MD-ModelB, 3-SecStr-ModelA, 4-MD-ModelA
    # 5-SecStr-ModelB-Embedding, 6-MD-ModelB, 7-SecStr-ModelA-pLM, 8-MD-ModelA
    # 9-MD-Predictor-ModelB, 10-
    # 11-MD-ModelB
    'Problem_ID': 11,
    # 1-debug mode on; finer-grained debug keys are added per block later
    'Debug': 0,
    'Debug_DataSet': 1,
    'Debug_Model': 1,
}

In [31]:

# Fill in the remaining control keys according to the Debug switch.
if CKeys['Debug'] != 1:
    # real working run
    CKeys.update({
        'SlientRun': 1,
        'Debug_DataPack': 0,
        'Debug_ModelPack': 0,
        'Debug_TrainerPack': 0,
        # training schedule
        'epochs': 4000-3250,
        'print_loss_every_this_epochs': 50,
        'sample_every_this_epochs': 100,
        'save_model_every_this_epochs': 50,
        # 0.15 was used for the small ForcPath problem, 0.10 for the large one
        'testratio': 0.10,
    })
else:
    # debug run
    CKeys.update({
        'SlientRun': 0,  # 1-save figure into files; 0-show figures
        # detailed debug keys (values used for the trainer part)
        'Debug_DataPack': 1,
        'Debug_ModelPack': 0,
        'Debug_TrainerPack': 3,
        # 0.15 was used for the small ForcPath problem, 0.10 for the large one
        'testratio': 0.10,
    })

# show the complete key set for a visual check
print(json.dumps(CKeys, indent=4))


{
    "Running_Type": 2,
    "Working_Mode": 2,
    "IF_FirstRun": 2,
    "Problem_ID": 11,
    "Debug": 0,
    "Debug_DataSet": 1,
    "Debug_Model": 1,
    "SlientRun": 1,
    "Debug_DataPack": 0,
    "Debug_ModelPack": 0,
    "Debug_TrainerPack": 0,
    "epochs": 750,
    "print_loss_every_this_epochs": 50,
    "sample_every_this_epochs": 100,
    "save_model_every_this_epochs": 50,
    "testratio": 0.1
}


In [32]:

# Echo the three keys that decide what this run does.
for summary_label, summary_key in (
    ('Problem type: ', 'Problem_ID'),
    ('Debug mode: ', 'Debug'),
    ('Working mode: ', 'Working_Mode'),
):
    print(summary_label, CKeys[summary_key])


Problem type:  11
Debug mode:  0
Working mode:  2


In [38]:

# ===========================================================
# Parameter keys
# ===========================================================
PKeys = {}

# Storage root on the original cluster; later cells build the csv/pk file
# paths from it.
root_path = '/home/gridsan/bni/Test_ground/jupyter/1_git_project/sort_pdb_database_0/Local_Store/'

# Working directory ("prefix") of the run:
#   debug run    -> an absolute path below root_path
#   working run  -> the folder that holds the downloaded model files
if CKeys['Debug']==1:
    PKeys['prefix'] = root_path+'For_20_0/'
else:
    PKeys['prefix'] = this_working_path

# Locations of the processed data pack and of the model pack.
PKeys.update({
    'pk_data_pack': PKeys['prefix']+'data_pack.pickle',
    'pk_model_pack': PKeys['prefix']+'model_pack.pickle',
})

In [40]:
# print(CKeys['Running_Type'])
# print(CKeys['Working_Mode']==1 and CKeys['IF_FirstRun']==1)
# print(PKeys['prefix'])

In [41]:

# On the first training run, remove EVERYTHING under the working directory
# and recreate it empty. Any other mode leaves the directory untouched.
is_first_training_run = CKeys['Working_Mode']==1 and CKeys['IF_FirstRun']==1
if is_first_training_run:
    working_dir = PKeys['prefix']
    if os.path.exists(working_dir):
        cmd_line = "rm -r " + working_dir
        print("clean the slade...")
        print(f"excute {cmd_line}")
        os.popen(cmd_line).read()
    # create dir for working space
    UtilityPack.create_path(working_dir)

In [42]:
# ========================================================================
# prepare the csv files
# ========================================================================

# Secondary-structure problems (1, 3, 5, 7) read the SecStr csv file.
if CKeys['Problem_ID'] in (1, 3, 5, 7):
    # root_path already ends with 'Local_Store/', so the file name is appended
    # directly; adding another 'Local_Store/' would point at a non-existent
    # '.../Local_Store/Local_Store/' folder.
    SS_csv_file = root_path+'PROTEIN_Mar18_2022_SECSTR_ALL.csv'

# MD problems (2, 4, 6, 8, 11) read the smoothed SMD force-displacement records.
if CKeys['Problem_ID'] in (2, 4, 6, 8, 11):
    # csv version, first debugged with the LE_64 set
    MD_smo_csv_file = root_path+'For_1/ForTrain_recon_BSDB_LE_64_smd_disp_forc_df_smo.csv'
    # pickle version, LE_128 set (supersedes the earlier LE_64 "_shared" pickle)
    MD_smo_pk_file  = root_path+'For_1/ForTrain_recon_BSDB_LE_128_smd_disp_forc_df_smo.pk'

### 1.2. Handle the dataset

In [43]:
import pickle
import numpy as np
import pandas as pd

In [45]:
# import PD_pLMProbXDiff.DataSetPack as DataSetPack
# importlib.reload(DataSetPack)

In [46]:
# Show which problem type the dataset cells below will handle.
print('On Problem: ', CKeys['Problem_ID'])

On Problem:  11


In [37]:
# Show the folder that holds the downloaded model and data packs.
print(this_working_path)

/content/Trained_model/


In [47]:
# ====================================================
# Assemble DataKeys: one dictionary that carries every dataset parameter
# ====================================================
# Data washing only happens in the 1st training cycle
# (Working_Mode==1 together with IF_FirstRun==1). In any other mode this cell
# assigns nothing, so DataKeys / df_raw / protein_df are not created here.

# The two "MD ForcePath -> AA sequence" problems share one recipe and differ
# only in: problem id -> (banner text, ESM-2 model name, image_channels).
_FORCEPATH_VARIANTS = {
    6: ("6, input MD ForcePath, output AA sequence", 'esm2_t12_35M_UR50D', 480),
    # 33 channels: only the probability part is used
    11: ("11, input MD ForcePath, output AA sequence", 'esm2_t30_150M_UR50D', 33),
}


def _screen_md_and_save(data_keys):
    """Screen the MD records and pickle the two resulting objects.

    Calls DataSetPack.screen_dataset_MD with pk_file=MD_smo_pk_file (no csv
    file), handing over `data_keys` and the global CKeys, then writes the
    results to 'protein_df.pk' and 'df_raw.pk' under data_keys['data_dir'].

    Returns the pair in the order the screening function returns it:
    (df_raw, protein_df).
    """
    raw_frame, screened_frame = DataSetPack.screen_dataset_MD(
        csv_file=None,
        pk_file=MD_smo_pk_file,
        PKeys=data_keys, # to be updated
        CKeys=CKeys,
    )
    # save the dataframes
    pd.to_pickle(screened_frame, data_keys['data_dir']+'protein_df.pk')
    pd.to_pickle(raw_frame, data_keys['data_dir']+'df_raw.pk')
    return raw_frame, screened_frame


if CKeys['Working_Mode']==1 and CKeys['IF_FirstRun']==1:

    if CKeys['Problem_ID'] in (1, 3, 4, 5, 7):
        # These problem types (SecStr variants and the text-conditioned MD
        # ModelA) have no active dataset setup in this notebook.
        pass

    elif CKeys['Problem_ID']==2:
        # only announces the problem id; no dataset setup is active
        print("2")

    elif CKeys['Problem_ID'] in (6, 11):
        # MD record as the tokenized input sequence (11 was copied from 6)
        _banner, _esm_model, _image_channels = _FORCEPATH_VARIANTS[CKeys['Problem_ID']]
        print(_banner)

        # keys for "screen_dataset_MD"
        DataKeys = {'data_dir': PKeys['prefix']+'0_dataprocess_MD/'}
        UtilityPack.create_path(DataKeys['data_dir'])
        # screening rules
        DataKeys.update({
            'min_AA_seq_len': 0,
            'max_AA_seq_len': 128,
            'max_Force_cap': 1000,
        })

        df_raw, protein_df = _screen_md_and_save(DataKeys)

        # keys for the 2nd function
        DataKeys['X_Key'] = 'sample_FORCEpN_data' # or 'Max_Smo_Force'
        DataKeys['tokenizer_X'] = None # will not be used
        DataKeys['tokenizer_y'] = None # to be created
        # fixed normalization factor; np.max(protein_df['Max_Smo_Force']) is
        # the data-driven alternative - update if necessary
        DataKeys['Xnormfac'] = 750.
        print('Normalization factor for force: ', DataKeys['Xnormfac'])

        DataKeys.update({
            'ynormfac': 1., # not used as esm is used
            'batch_size': 256,
            'testset_ratio': 0.15,
            'maxdata': 99999999991000,
            # pLM: ESM model used for the AA embedding and its channel count
            'ESM-2_Model': _esm_model,
            'image_channels': _image_channels,
        })

    elif CKeys['Problem_ID']==8:
        print("8: input text condition, output sequence...")
        # MD record as the input sequence

        # keys for "screen_dataset_MD"
        DataKeys = {'data_dir': PKeys['prefix']+'0_dataprocess_MD/'}
        UtilityPack.create_path(DataKeys['data_dir'])
        # screening rules
        DataKeys.update({
            'min_AA_seq_len': 0,
            'max_AA_seq_len': 128,
            'max_text_len': 2,
            'max_Force_cap': 1000,
        })

        df_raw, protein_df = _screen_md_and_save(DataKeys)

        # keys for the 2nd function
        DataKeys['X_Key'] = ['Max_Smo_Force','Int_Smo_ForcPull']
        DataKeys['tokenizer_X'] = None # will not be used
        DataKeys['tokenizer_y'] = None # will not be used

        # normalization factors: column maxima of force and toughness
        _force_max = np.max(protein_df['Max_Smo_Force'])
        print('Normalization factor for force: ', _force_max)
        _toughness_max = np.max(protein_df['Int_Smo_ForcPull'])
        print('Normalization factor for toughness: ', _toughness_max)
        DataKeys['Xnormfac'] = np.array([_force_max, _toughness_max])

        DataKeys.update({
            'ynormfac': 1.0, # not used in esm
            'batch_size': 256,
            'testset_ratio': CKeys['testratio'],
            'maxdata': 99999999991000,
            # ESM model used for the AA embedding and its embedding width
            'ESM-2_Model': 'esm2_t33_650M_UR50D',
            'image_channels': 1280,
        })

    else:
        print('No Problem Type found...')

In [48]:
# Show the control keys (CKeys) currently in effect; the cell below branches
# on 'Working_Mode', 'IF_FirstRun' and 'Problem_ID' from this dict.
print(CKeys)

{'Running_Type': 2, 'Working_Mode': 2, 'IF_FirstRun': 2, 'Problem_ID': 11, 'Debug': 0, 'Debug_DataSet': 1, 'Debug_Model': 1, 'SlientRun': 1, 'Debug_DataPack': 0, 'Debug_ModelPack': 0, 'Debug_TrainerPack': 0, 'epochs': 750, 'print_loss_every_this_epochs': 50, 'sample_every_this_epochs': 100, 'save_model_every_this_epochs': 50, 'testratio': 0.1}


In [67]:


# ====================================================
# convert into datasets
# ====================================================
# First training run (Working_Mode==1 and IF_FirstRun==1): build the data
# loaders from protein_df and cache them, together with the key dicts, in the
# pickle at PKeys['pk_data_pack'].
# Any other run: restore that cached package instead of rebuilding it.
#
# Names defined for the following cells in both cases:
#   train_loader, train_loader_noshuffle, test_loader,
#   tokenizer_X, tokenizer_y, data_pack
if CKeys['Working_Mode']==1 and CKeys['IF_FirstRun']==1:
    problem_id = CKeys['Problem_ID']

    # Problem_IDs whose loaders are disabled in this standalone notebook
    # (DataSetPack function each one used to call, kept for reference):
    #    1 -> load_data_set_SS_InSeqToOuSeq            (file_path=SS_csv_file)
    #    2 -> load_data_set_from_df_SMD                (protein_df)
    #    3 -> load_data_set_seq2seq_SecStr_ModelA      (file_path=SS_csv_file)
    #    4 -> load_data_set_text2seq_MD_ModelA         (protein_df)
    #    5 -> load_data_set_SS_InSeqToOuSeq_pLM        (file_path=SS_csv_file)
    #    7 -> load_data_set_seq2seq_SecStr_ModelA_pLM  (file_path=SS_csv_file)
    #   12 -> no loader was ever specified
    # Note carried over from the disabled Problem_ID 5 loader: it triggers a
    # download of the ESM-2 weights into the torch hub checkpoint cache. On a
    # node without internet, fetch both files into that cache beforehand:
    #   https://dl.fbaipublicfiles.com/fair-esm/models/esm2_t33_650M_UR50D.pt
    #   https://dl.fbaipublicfiles.com/fair-esm/regression/esm2_t33_650M_UR50D-contact-regression.pt
    disabled_problem_ids = (1, 2, 3, 4, 5, 7, 12)

    # ///////////////////////////////////////////////////////////////
    #  embedding (pLM) cases that are wired up
    # \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\
    if problem_id==6 or problem_id==11:
        # Problem 6 and 11 call the same loader with the same arguments.
        dataset_builder = DataSetPack.load_data_set_from_df_SMD_pLM
    elif problem_id==8:
        dataset_builder = DataSetPack.load_data_set_text2seq_MD_ModelA_pLM
    elif problem_id in disabled_problem_ids:
        # Previously these branches were a bare `pass`, so execution fell
        # through to the save step and either crashed with a NameError on
        # `train_loader` or pickled stale loaders left in the kernel.
        raise NotImplementedError(
            f"Problem_ID={problem_id} has no dataset loader enabled in this "
            "notebook; supported Problem_IDs are 6, 8 and 11."
        )
    else:
        print('No Problem Type found...')
        raise ValueError(
            f"Unknown Problem_ID={problem_id!r}; nothing to convert or save."
        )

    train_loader, \
    train_loader_noshuffle, \
    test_loader, \
    tokenizer_y, \
    tokenizer_X = dataset_builder(
        protein_df,
        PKeys=DataKeys, # to be updated
        CKeys=CKeys,
    )

    print("==========================================")
    print("Save the datasets ...")
    print("==========================================")
    # save the dataset for the 1st run
    data_pack = {
        'train_loader': train_loader,
        'train_loader_noshuffle': train_loader_noshuffle,
        'test_loader': test_loader,
        'tokenizer_X': tokenizer_X,
        'tokenizer_y': tokenizer_y,
        # keys (CKeys is deliberately not stored: it is set per run)
        'DataKeys': DataKeys,
        'PKeys': PKeys,
    }
    with open(PKeys['pk_data_pack'], 'wb') as handle:
        pickle.dump(data_pack, handle, protocol=pickle.HIGHEST_PROTOCOL)

else: # work both for training and testing

    print('This is not the first run')
    print('Load back in the data packages...')
    # SECURITY: unpickling executes code embedded in the file. Only load a
    # data pack that you produced yourself or obtained from a trusted source.
    try:
        with open(PKeys['pk_data_pack'], 'rb') as handle:
            data_pack = pickle.load(handle)
    except ModuleNotFoundError as err:
        # The pack stores objects by reference to the module that defines
        # their classes; that module must be importable in this kernel.
        raise ModuleNotFoundError(
            f"Cannot unpickle {PKeys['pk_data_pack']!r}: module {err.name!r} "
            "is not importable. Install it or add its folder to sys.path "
            "(the same package layout used when the pack was saved), "
            "then re-run this cell.",
            name=err.name,
        ) from err

    # deliver the results
    train_loader=data_pack['train_loader']
    train_loader_noshuffle=data_pack['train_loader_noshuffle']
    test_loader=data_pack['test_loader']
    tokenizer_X=data_pack['tokenizer_X']
    tokenizer_y=data_pack['tokenizer_y']
    # keys (create or update); CKeys is intentionally kept from this session
    DataKeys=data_pack['DataKeys']
    PKeys=data_pack['PKeys']
    # add some for specific problem: these also need the screened dataframes
    if CKeys['Problem_ID'] in (2, 6, 11):
        protein_df = pd.read_pickle(DataKeys['data_dir']+'protein_df.pk')
        df_raw = pd.read_pickle(DataKeys['data_dir']+'df_raw.pk')
    print('Done.')



This is not the first run
Load back in the data packages...


ModuleNotFoundError: ignored

In [52]:
# Show the path of the pickled data pack that the dataset cell reads/writes.
print(PKeys['pk_data_pack'])

/content/Trained_model/data_pack.pickle


In [50]:
# Same path check as the cell directly above (duplicate cell).
# NOTE(review): one of the two can probably be removed.
print(PKeys['pk_data_pack'])


/content/Trained_model/data_pack.pickle


In [None]:
import ModelPack

In [3]:
# Prints a fixed string; looks like a leftover scratch cell.
# NOTE(review): confirm it is safe to delete before sharing the notebook.
print('test')

test
