In [4]:
from transformers import AutoTokenizer
import json

In [20]:
def get_tokenizer():
    """Load the "aatok" tokenizer and install a CLS/SEP/EOS post-processor.

    Any of the pad, sep, cls, eos and mask special tokens that the loaded
    tokenizer does not define is registered with a default literal
    (<PAD>, <MIS>, <CLS>, <EOS>, <MASK>). The backend post-processor is then
    replaced so that a single sequence is wrapped as ``cls A eos`` and a pair
    as ``cls A sep B eos`` (B and the trailing eos carry type id 1).

    The tokens and ids used in the template are read back from the tokenizer
    rather than hard-coded, so the template stays correct whatever ids the
    vocabulary assigns to the special tokens.

    Returns:
        The configured tokenizer returned by ``AutoTokenizer.from_pretrained``.
    """
    from tokenizers.processors import TemplateProcessing

    tokenizer = AutoTokenizer.from_pretrained("aatok")

    # Order matters: tokens that are genuinely new get consecutive ids in the
    # order they are added, so keep pad, sep, cls, eos, mask.
    default_special_tokens = {
        'pad_token': '<PAD>',
        'sep_token': '<MIS>',
        'cls_token': '<CLS>',
        'eos_token': '<EOS>',
        'mask_token': '<MASK>',
    }
    for attribute, literal in default_special_tokens.items():
        if getattr(tokenizer, attribute) is None:
            tokenizer.add_special_tokens({attribute: literal})

    cls_token = tokenizer.cls_token
    sep_token = tokenizer.sep_token
    eos_token = tokenizer.eos_token

    # Build the template from the tokenizer's own special tokens and ids
    # instead of assuming fixed vocabulary positions.
    tokenizer._tokenizer.post_processor = TemplateProcessing(
        single=f"{cls_token} $A {eos_token}",
        pair=f"{cls_token} $A {sep_token} $B:1 {eos_token}:1",
        special_tokens=[
            (eos_token, tokenizer.eos_token_id),
            (cls_token, tokenizer.cls_token_id),
            (sep_token, tokenizer.sep_token_id),
        ],
    )
    return tokenizer


In [21]:
# Instantiate the tokenizer configured by get_tokenizer().
tokenizer = get_tokenizer()

In [10]:
# Encode the literal "<MIS>" string to inspect the ids it is mapped to.
tokenizer("<MIS>")

{'input_ids': [4, 2], 'token_type_ids': [0, 0], 'attention_mask': [1, 1]}

In [13]:
# Load the model hyperparameters from the JSON config file.
# The encoding is explicit so the read does not depend on the platform's
# locale default (JSON files are UTF-8).
with open("configs/shallow.config.json", "r", encoding="utf-8") as read_file:
    modelconfig = json.load(read_file)


loading hyperparameter


In [11]:
# Number of entries in the backend tokenizer's vocabulary; passed to
# BertConfig as vocab_size below.
vocabsize = len(tokenizer._tokenizer.get_vocab())


In [14]:
from transformers import BertConfig

# Decoder configuration: layer/head/hidden sizes come from the loaded JSON
# hyperparameters; vocabulary size and padding id come from the tokenizer.
decoder_config = BertConfig(
    vocab_size=vocabsize,
    max_position_embeddings=50,  # this should be some large value
    num_attention_heads=modelconfig["num_attn_heads"],
    num_hidden_layers=modelconfig["num_hidden_layers"],
    hidden_size=modelconfig["hidden_size"],
    type_vocab_size=1,
    is_decoder=True,
    pad_token_id=tokenizer.pad_token_id,  # Very Important
)


In [16]:
from src.multiTrans import TulipPetal

In [17]:
# Instantiate TulipPetal from the decoder BertConfig built above.
tulip_petal = TulipPetal(config=decoder_config)

self.pad_token_id None


In [18]:
# Calling the module with no arguments raises a TypeError (see the recorded
# output). Catch and display it so the demonstration is kept but
# "Restart & Run All" can continue past this cell.
# NOTE(review): presumably the forward pass needs input tensors - confirm
# against src.multiTrans.TulipPetal.
try:
    tulip_petal()
except TypeError as err:
    print(f"TypeError: {err}")

TypeError: sum() received an invalid combination of arguments - got (bool, dim=int), but expected one of:
 * (Tensor input, *, torch.dtype dtype = None)
 * (Tensor input, tuple of ints dim, bool keepdim = False, *, torch.dtype dtype = None, Tensor out = None)
 * (Tensor input, tuple of names dim, bool keepdim = False, *, torch.dtype dtype = None, Tensor out = None)


In [None]:
from src.multiTrans import Tulip

# Bare expression: shows the repr of Tulip.generate when the cell is run
# (nothing is called here).
Tulip.generate

Making a dataset

In [2]:
from src.multiTrans import TCRDataset
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [24]:
# Input CSV and the device string handed to the dataset.
test_path = "data/VDJ_test_single_example.csv"
device = "cpu"

# Use the first distinct value of the "peptide" column as the target peptide.
target_peptide = pd.read_csv(test_path)["peptide"].unique()[0]

# A second tokenizer, loaded from "mhctok/", is passed to the dataset as mhctok.
mhctok = AutoTokenizer.from_pretrained("mhctok/")
tcr_dataset = TCRDataset(
    test_path,
    tokenizer,
    device,
    target_peptide=target_peptide,
    mhctok=mhctok,
)

# Request the unconditional variant with only mask_peptide enabled.
tcr_dataset_masked_peptide = tcr_dataset.generate_unconditional_data(
    mask_alpha=False,
    mask_beta=False,
    mask_peptide=True,
    mask_mhc=False,
)



Loading the data ...


In [16]:
# First item of the dataset produced with mask_peptide=True.
tcr_dataset_masked_peptide[0]

('DGT', 'CAT', '<MIS>', 1, 'HLA-A*02:01')

Example of using sample_tcr_denovo

In [17]:
from src.multiTrans import sample_tcr_denovo

Difference between get_starting_batch and get_starting_batch_from_chain

In [18]:
from src.multiTrans import get_starting_batch, get_starting_batch_from_chain

In [22]:
# Starting batch built from the peptide alone. Reuse the `device` variable
# defined with the dataset instead of repeating the "cpu" literal, so the
# device is configured in one place.
get_starting_batch(peptide=target_peptide, tokenizer=tokenizer, mhctok=mhctok, device=device)

({'input_ids': tensor([[ 3, 12, 19, 13,  2]]),
  'token_type_ids': tensor([[0, 0, 0, 0, 0]]),
  'attention_mask': tensor([[1, 1, 1, 1, 1]])},
 {'input_ids': tensor([[3, 4, 2]]),
  'token_type_ids': tensor([[0, 0, 0]]),
  'attention_mask': tensor([[1, 1, 1]])},
 {'input_ids': tensor([[3, 4, 2]]),
  'token_type_ids': tensor([[0, 0, 0]]),
  'attention_mask': tensor([[1, 1, 1]])},
 tensor([0]),
 {'input_ids': tensor([[1]]),
  'token_type_ids': tensor([[0]]),
  'attention_mask': tensor([[1]])})

In [25]:
# Same peptide, but the batch is derived from tcr_dataset with chain="alpha".
get_starting_batch_from_chain(peptide=target_peptide, datainit=tcr_dataset, chain="alpha")

({'input_ids': tensor([[ 3, 12, 19, 13,  2]]),
  'token_type_ids': tensor([[0, 0, 0, 0, 0]]),
  'attention_mask': tensor([[1, 1, 1, 1, 1]])},
 {'input_ids': tensor([[ 3, 14,  7, 15,  2]]),
  'token_type_ids': tensor([[0, 0, 0, 0, 0]]),
  'attention_mask': tensor([[1, 1, 1, 1, 1]])},
 {'input_ids': tensor([[3, 4, 2]]),
  'token_type_ids': tensor([[0, 0, 0]]),
  'attention_mask': tensor([[1, 1, 1]])},
 tensor([1]),
 {'input_ids': tensor([[1]]),
  'token_type_ids': tensor([[0]]),
  'attention_mask': tensor([[1]])})

Example of TCRDataset.select_chain

In [32]:
# Call select_chain on the dataset with target_chain="both".
tcr_dataset_select_chain = tcr_dataset.select_chain(target_chain="both")


In [31]:
# First item of the result of select_chain(target_chain="both").
tcr_dataset_select_chain[0]

('DGT', 'CAT', 'KFR', 1, 'HLA-A*02:01')

In [36]:
# Call select_peptide with the target peptide read from the CSV.
tcr_dataset_select_peptide = tcr_dataset.select_peptide(target_peptide=target_peptide)

In [37]:
# First item of the result of select_peptide(target_peptide=target_peptide).
tcr_dataset_select_peptide[0]

('DGT', 'CAT', 'KFR', 1, 'HLA-A*02:01')