In [4]:
import tensorflow as tf
import numpy as np
import pandas as pd

In [12]:
from models.models import TransformerModel 

model = TransformerModel(
    running_units=256,
    d=64,
    h=4,
    ffn_mult=1,
    depth=3,
    pos_type='learned', # learned
    prec_type="embed_input", # embed_input | pretoken | inject_pre | inject_ffn
    learned_pos=True,
    prenorm=False,
    norm_type="layer",
    penultimate_units=None,
    output_units=174,
    max_charge=6,
    sequence_length=30,
    alphabet=False,
    dropout=0,
    precursor_units=None,
    inject_position="all" # all | pre | post    
)


from dlomix.losses import masked_spectral_distance, masked_pearson_correlation_distance

optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer='adam', 
            loss=masked_spectral_distance,
            metrics=[masked_pearson_correlation_distance])

model.build(input_shape=(None, 30))

model.summary()
#models

ModuleNotFoundError: No module named 'tensorflow.keras.layers.experimental'

In [1]:
# Prosit dataset

import os
os.environ['HF_HOME'] = "/cmnfs/proj/prosit_astral"
os.environ['HF_DATASETS_CACHE'] = "/cmnfs/proj/prosit_astral/datasets"

print("[UNIMOD:1]-K[UNIMOD:1]".count('[UNIMOD:' + '1' + ']'))

import numpy as np
from dlomix.data import FragmentIonIntensityDataset
import pandas as pd

from datasets import disable_caching
disable_caching()

PTMS_ALPHABET = {
    "A": 1,
    "C": 2,
    "D": 3,
    "E": 4,
    "F": 5,
    "G": 6,
    "H": 7,
    "I": 8,
    "K": 9,
    "L": 10,
    "M": 11,
    "N": 12,
    "P": 13,
    "Q": 14,
    "R": 15,
    "S": 16,
    "T": 17,
    "V": 18,
    "W": 19,
    "Y": 20,
    "M[UNIMOD:35]": 21,
    "R[UNIMOD:7]":22,
    "C[UNIMOD:4]": 2,
    "Q[UNIMOD:7]":4,
    "N[UNIMOD:7]":3,
}

rt_data = FragmentIonIntensityDataset(
    data_source="/cmnfs/data/proteomics/Prosit_PTMs/Transformer_Train/clean_train.parquet",
    val_data_source="/cmnfs/data/proteomics/Prosit_PTMs/Transformer_Train/clean_val.parquet",
    test_data_source="/cmnfs/data/proteomics/Prosit_PTMs/Transformer_Train/clean_test.parquet",
    data_format="parquet", 
    val_ratio=0.2, max_seq_len=30, encoding_scheme="naive-mods",
    vocab=PTMS_ALPHABET,
    model_features=["precursor_charge_onehot", "collision_energy_aligned_normed","method_nbr"],
    batch_size=2048
)

2


  from .autonotebook import tqdm as notebook_tqdm



Avaliable feature extractors are (use the key of the following dict and pass it to features_to_extract in the Dataset Class):
{
   "atom_count": "Atom count of PTM.",
   "delta_mass": "Delta mass of PTM.",
   "mod_gain": "Gain of atoms due to PTM.",
   "mod_loss": "Loss of atoms due to PTM."
}.
When writing your own feature extractor, you can either
    (1) use the FeatureExtractor class or
    (2) write a function that can be mapped to the Hugging Face dataset.
In both cases, you can access the parsed sequence information from the dataset using the following keys, which all provide python lists:
    - _parsed_sequence: The parsed sequence
    - _n_term_mods: The N-terminal modifications
    - _c_term_mods: The C-terminal modifications



                    Multiple data sources provided {'train': '/cmnfs/data/proteomics/Prosit_PTMs/Transformer_Train/clean_train.parquet', 'val': '/cmnfs/data/proteomics/Prosit_PTMs/Transformer_Train/clean_val.parquet', 'test': '/cmnfs/data/proteomics/Prosit_PTMs/Transformer_Train/clean_test.parquet'}, please ensure that the data sources are already split into train, val and test sets
                    since no splitting will happen. If not, please provide only one data source and set the val_ratio to split the data into train and val sets."
                    
Mapping SequenceParsingProcessor (num_proc=2): 100%|████████████████████████████████████████████████████████████████████████████████████████████| 2836/2836 [00:00<00:00, 13932.44 examples/s]
Mapping SequenceParsingProcessor (num_proc=2): 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 814/814 [00:00<00:00, 5401.02 examples/s]
Mapping SequenceParsingProcessor (num_proc=2): 10

In [18]:
rt_data["train"]

Dataset({
    features: ['modified_sequence', 'intensities_raw', 'precursor_charge_onehot', 'collision_energy_aligned_normed', 'method_nbr', '_parsed_sequence', '_n_term_mods', '_c_term_mods'],
    num_rows: 2836
})

In [9]:
rt_data["train"]['_parsed_sequence']

[['D', 'S', 'Y', 'D', 'S', 'Y', 'A', 'T', 'H', 'N', 'E'],
 ['R',
  'D',
  'K',
  'C[UNIMOD:4]',
  'E',
  'L',
  'M',
  'A',
  'S',
  'V',
  'T',
  'S',
  'H',
  'K'],
 ['D',
  'T',
  'A',
  'S',
  'S',
  'R',
  'C[UNIMOD:4]',
  'E',
  'S',
  'C[UNIMOD:4]',
  'S',
  'E',
  'R',
  'E',
  'E',
  'A',
  'G',
  'K'],
 ['D', 'F', 'R', 'K', 'A', 'Y', 'D', 'Y', 'I', 'R'],
 ['V', 'A', 'G', 'S', 'M', 'G', 'F', 'D', 'V', 'D', 'Y', 'P', 'K', 'M'],
 ['D',
  'T',
  'M',
  'S',
  'L',
  'L',
  'A',
  'A',
  'D',
  'N',
  'L',
  'L',
  'A',
  'G',
  'L',
  'R'],
 ['I', 'S', 'A', 'I', 'L', 'E', 'K'],
 ['S', 'R', 'W', 'E', 'T', 'G', 'E', 'V', 'Q', 'A', 'E', 'S', 'A', 'A', 'K'],
 ['S',
  'E',
  'L',
  'S',
  'E',
  'D',
  'A',
  'E',
  'P',
  'A',
  'G',
  'S',
  'Q',
  'E',
  'T',
  'K'],
 ['T',
  'L',
  'N',
  'I',
  'S',
  'G',
  'N',
  'E',
  'I',
  'E',
  'R',
  'L',
  'P',
  'Q',
  'M',
  'L',
  'A',
  'H',
  'V',
  'R'],
 ['E', 'R[UNIMOD:7]', 'A', 'E', 'I', 'Q', 'P', 'R', 'H', 'R'],
 ['S', 'F', 'S

In [18]:
from models.models import TransformerModel 

model = TransformerModel(
    running_units=256,
    d=64,
    h=4,
    ffn_mult=1,
    depth=3,
    pos_type='learned', # learned
    prec_type="embed_input", # embed_input | pretoken | inject_pre | inject_ffn
    learned_pos=True,
    prenorm=False,
    norm_type="layer",
    penultimate_units=None,
    output_units=174,
    max_charge=6,
    sequence_length=30,
    alphabet=False,
    dropout=0,
    precursor_units=None,
    inject_position="all" # all | pre | post    
)


from dlomix.losses import masked_spectral_distance, masked_pearson_correlation_distance

optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer='adam', 
            loss=masked_spectral_distance,
            metrics=[masked_pearson_correlation_distance])


optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from callbacks import CyclicLR, LearningRateLogging

cyclicLR = CyclicLR(base_lr=0.000001, max_lr=0.0002, step_size=2, mode='triangular',
                 gamma=0.95)

early_stopping = EarlyStopping(
    monitor="val_loss",
    min_delta=0.001,
    patience=20,
    restore_best_weights=True)

learningRate = LearningRateLogging()


model.fit(
    rt_data.tensor_train_data,
    validation_data=rt_data.tensor_val_data,
    epochs=100,
    callbacks=[
#        WandbCallback(save_model=False),
        cyclicLR,
        early_stopping,
        #save_best,
        learningRate
    ]
)

model.summary()

Epoch 1/100


ValueError: in user code:

    File "/nfs/home/students/d.lochert/miniconda3/envs/astral/lib/python3.10/site-packages/keras/src/engine/training.py", line 1377, in train_function  *
        return step_function(self, iterator)
    File "/nfs/home/students/d.lochert/miniconda3/envs/astral/lib/python3.10/site-packages/keras/src/engine/training.py", line 1360, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/nfs/home/students/d.lochert/miniconda3/envs/astral/lib/python3.10/site-packages/keras/src/engine/training.py", line 1349, in run_step  **
        outputs = model.train_step(data)
    File "/nfs/home/students/d.lochert/miniconda3/envs/astral/lib/python3.10/site-packages/keras/src/engine/training.py", line 1126, in train_step
        y_pred = self(x, training=True)
    File "/nfs/home/students/d.lochert/miniconda3/envs/astral/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/tmp/__autograph_generated_filej7__7af1.py", line 10, in tf__call
        out = ag__.converted_call(ag__.ld(self).EmbedInputs, (ag__.ld(x)['modified_sequence'], ag__.ld(x)['precursor_charge_onehot'], ag__.ld(x)['collision_energy_aligned_normed']), None, fscope)
    File "/tmp/__autograph_generated_filelac6qojg.py", line 31, in tf__EmbedInputs
        ag__.if_stmt(ag__.ld(self).prec_type == 'embed_input', if_body, else_body, get_state, set_state, ('input_embedding',), 1)
    File "/tmp/__autograph_generated_filelac6qojg.py", line 23, in if_body
        ce_emb = ag__.converted_call(ag__.ld(tf).tile, (ag__.ld(collision_energy)[:, None], [1, ag__.ld(length), 1]), None, fscope)

    ValueError: Exception encountered when calling layer 'transformer_model_6' (type TransformerModel).
    
    in user code:
    
        File "/nfs/home/students/d.lochert/projects/astral/dlomix/my_scripts/models/models.py", line 139, in call  *
            out = self.EmbedInputs(x['modified_sequence'], x['precursor_charge_onehot'], x['collision_energy_aligned_normed'])
        File "/nfs/home/students/d.lochert/projects/astral/dlomix/my_scripts/models/models.py", line 117, in EmbedInputs  *
            ce_emb = tf.tile(collision_energy[:,None], [1, length, 1])
    
        ValueError: Shape must be rank 2 but is rank 3 for '{{node transformer_model_6/Tile_1}} = Tile[T=DT_FLOAT, Tmultiples=DT_INT32](transformer_model_6/strided_slice_1, transformer_model_6/Tile_1/multiples)' with input shapes: [?,1], [3].
    
    
    Call arguments received by layer 'transformer_model_6' (type TransformerModel):
      • x={'modified_sequence': 'tf.Tensor(shape=(None, 30), dtype=float32)', 'precursor_charge_onehot': 'tf.Tensor(shape=(None, 6), dtype=int64)', 'collision_energy_aligned_normed': 'tf.Tensor(shape=(None,), dtype=float32)', 'method_nbr': 'tf.Tensor(shape=(None,), dtype=int64)'}
      • training=True


In [None]:
intdata[

In [16]:
ALPHABET_UNMOD = {
    "A": 1,
    "C": 2,
    "D": 3,
    "E": 4,
    "F": 5,
    "G": 6,
    "H": 7,
    "I": 8,
    "K": 9,
    "L": 10,
    "M": 11,
    "N": 12,
    "P": 13,
    "Q": 14,
    "R": 15,
    "S": 16,
    "T": 17,
    "V": 18,
    "W": 19,
    "Y": 20,
    "M[UNIMOD:35]": 21,
    "R[UNIMOD:7]":22,
    "C[UNIMOD:4]": 2,
    "Q[UNIMOD:7]":4,
    "N[UNIMOD:7]":3,
}

from tensorflow.keras.layers.experimental import preprocessing
string_lookup = preprocessing.StringLookup(vocabulary=list(ALPHABET_UNMOD.keys()))


input_embedding = tf.one_hot(string_lookup(["A", "S", "V"]), len(ALPHABET_UNMOD))

input_embedding

<tf.Tensor: shape=(3, 25), dtype=float32, numpy=
array([[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 1., 0., 0., 0., 0., 0., 0.]], dtype=float32)>