In [2]:
# https://github.com/CSBiology/deepStabP/blob/main/src/Api/requirements.txt
from pydantic import BaseModel
from transformers import  T5EncoderModel, T5Tokenizer
from tqdm.auto import *
import gc
from app.predictor import * 

In [3]:
# mirrored in dotnet Shared/DeepStabP.Types.fs
class FastaRecord(BaseModel):
    """One FASTA entry: a header and the sequence that belongs to it."""

    # Record name (the example in this notebook uses "ExampleName").
    header: str
    # Sequence text as a plain string.
    sequence: str

# mirrored in dotnet Shared/DeepStabP.Types.fs
class PredictorInfo(BaseModel):
    """Inputs for one prediction request: growth temperature, mode and FASTA records."""

    # Growth temperature as an integer (units are not stated here — TODO confirm).
    growth_temp: int
    # "Lysate" or "Cell"
    mt_mode: str
    # The records to predict for.
    fasta: list[FastaRecord]

# Pretrained-model identifier handed to `from_pretrained`. Kept in one place so
# the tokenizer and the encoder are always loaded from the same source.
PROT_T5_MODEL_ID = "Rostlab/prot_t5_xl_uniref50"

# https://github.com/CSBiology/deepStabP/tree/main/src/Api/trained_model/b25_sampled_10k_tuned_2_d01
# Relative path: it is resolved against the kernel's current working directory.
DEEPSTABP_CHECKPOINT = "trained_model/b25_sampled_10k_tuned_2_d01/checkpoints/epoch=1-step=2316.ckpt"

tokenizer = T5Tokenizer.from_pretrained(PROT_T5_MODEL_ID, do_lower_case=False)
model = T5EncoderModel.from_pretrained(PROT_T5_MODEL_ID)
gc.collect()

prediction_net = deepSTAPpMLP.load_from_checkpoint(DEEPSTABP_CHECKPOINT)
prediction_net.to("cpu")
# This notebook only runs inference. The network contains dropout layers, so put
# it in evaluation mode explicitly instead of relying on downstream code to do so.
# `eval()` returns the module itself, so the cell still displays the network.
prediction_net.eval()

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Lightning automatically upgraded your loaded checkpoint from v1.7.7 to v2.5.0.post0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint trained_model/b25_sampled_10k_tuned_2_d01/checkpoints/epoch=1-step=2316.ckpt`


deepSTAPpMLP(
  (zero_layer): Linear(in_features=1064, out_features=4098, bias=True)
  (zero_dropout): Dropout1d(p=0.1, inplace=False)
  (first_layer): Linear(in_features=4098, out_features=512, bias=True)
  (first_dropout): Dropout1d(p=0.1, inplace=False)
  (second_layer): Linear(in_features=512, out_features=256, bias=True)
  (second_dropout): Dropout1d(p=0.1, inplace=False)
  (third_layer): Linear(in_features=256, out_features=128, bias=True)
  (third_dropout): Dropout1d(p=0.1, inplace=False)
  (seventh_layer): Linear(in_features=128, out_features=1, bias=True)
  (species_layer_one): Linear(in_features=1, out_features=20, bias=True)
  (species_layer_two): Linear(in_features=20, out_features=20, bias=True)
  (species_dropout): Dropout1d(p=0.1, inplace=False)
  (batch_norm0): LayerNorm((4098,), eps=1e-05, elementwise_affine=True)
  (batch_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  (batch_norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
  (batch_norm3)

In [4]:
# The processing can be found here: https://github.com/CSBiology/deepStabP/blob/main/src/Server/FastaReader.fs
# Minimal single-record FASTA text: one ">" header line followed by the sequence.
SingleFastaMinimal = """>ExampleName
MAQYHQQHEMKQTMAETQYVTAPPPMGYPVMMKDSPQTVQPPHEGQSKGSGGFLRGCLAAMCCCCVLDCVF"""

# Derive header and sequence from the FASTA text above instead of repeating the
# sequence by hand, so the two can never get out of sync.
header_line, *sequence_lines = SingleFastaMinimal.strip().splitlines()
example_header = header_line.removeprefix(">").strip()
raw_sequence = "".join(line.strip() for line in sequence_lines)

# Replace 'O', 'U', 'J', 'Z', 'B' with "X" (None in this example)
SeqTransform = raw_sequence.translate(str.maketrans("OUJZB", "XXXXX"))
# Separate the residues with single spaces (no leading or trailing space).
SeqTransform = " ".join(SeqTransform)
fasta_record_1 = FastaRecord(header=example_header, sequence=SeqTransform)

predictor_info = PredictorInfo(
    growth_temp=37,
    mt_mode="Lysate",
    fasta=[fasta_record_1],
)

In [5]:
# Display the device that the T5 encoder (`model`) is currently placed on.
model.device

device(type='cpu')

In [6]:
# Display the FASTA records held by `predictor_info` to check the prepared sequence.
predictor_info.fasta

[FastaRecord(header='ExampleName', sequence='M A Q Y H Q Q H E M K Q T M A E T Q Y V T A P P P M G Y P V M M K D S P Q T V Q P P H E G Q S K G S G G F L R G C L A A M C C C C V L D C V F')]

In [7]:
# Display the device that the checkpoint-loaded prediction network is placed on.
prediction_net.device

device(type='cpu')

In [8]:
# Run the prediction for the example record; the result is displayed below.
prediction = determine_tm(
    predictor_info.fasta,
    predictor_info.mt_mode,
    predictor_info.growth_temp,
    model,
    prediction_net,
    # NOTE(review): `new_features` is not defined in this notebook; presumably it
    # comes from `from app.predictor import *` — confirm.
    new_features,
    tokenizer,
)
prediction


Unnamed: 0,Protein,Tm
0,ExampleName,53.441023


Expected result (the `Tm` value computed above matches it to within 1e-5):

| index | Protein     | Melting temperature [°C] |
|-------|-------------|--------------------------|
| 0     | ExampleName | 53.441018546246454       |