In [45]:
from my_tokenizer import SMILES_SPE_Tokenizer

# Build the SPE tokenizer from the vocabulary and merge tables on disk.
tokenizer = SMILES_SPE_Tokenizer(
    vocab_file='build_tokenizer/vocab.txt',
    spe_file='build_tokenizer/merges.txt',
)

# Long peptide SMILES string used as the tokenization example.
sequence = 'N1[C@@H](CCC1)C(=O)N[C@H](CC(=O)O)C(=O)N[C@@H](CC1=CNC=N1)C(=O)N[C@@H](CO)C(=O)N[C@@H]([C@H](CC)C)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CC(=O)N3)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CO)C(=O)N[C@@H](Cc1ccccc1)C(=O)NCC(=O)N[C@H](CCC[C@H](C(=O)O)N)C(=O)N[C@@H](CC(=O)N)C(=O)N[C@H](CCC[C@H](C(=O)O)N)C(=O)N[C@@H](CC(=CN2)C1=C2C=CC=C1)C(=O)N[C@@H](CCSC)C(=O)N[C@@H](CCSC)C(=O)N[C@@H]([C@H](O)C)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CO)C(=O)N[C@@H](C)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](C(C)C)C(=O)N[C@@H](CC(=O)N)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CC(=O)O)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CC1=CNC=N1)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CCSC)C(=O)N[C@@H](CC1=CNC=N1)C(=O)N[C@@H](C(C)C)C(=O)N[C@@H]([C@H](O)C(=O)O)C(=O)N[C@@H](CC1=CNC=N1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](C)C(=O)N[C@H](CS)C(=O)N[C@@H](Cc1ccccc1Cl)C(=O)N[C@@H](CCC(=O)N)C(=O)N[C@@H](CCSC)C(=O)N[C@@H](CCC(=O)N)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](Cc1c[nH]c2c1c(N)ccc2)C(=O)O'

# Character count of the raw string, reported before tokenizing.
char_count = len(sequence)
print(char_count)

# Token count after SPE tokenization, for comparison with the character count.
tokens = tokenizer.tokenize(sequence)
token_count = len(tokens)
print(token_count)


972
309


  warn("Couldn't import ipywidgets properly, progress bar will use console behavior")


In [46]:
import json

# Convert the plain-text vocabulary (one token per line) into a
# token -> id JSON mapping.
vocab_file = "build_tokenizer/vocab.txt"
vocab_json_file = "build_tokenizer/vocab.json"

# Keep every non-blank line, with surrounding whitespace trimmed.
tokens = []
with open(vocab_file, "r", encoding="utf-8") as vocab_txt:
    for raw_line in vocab_txt:
        stripped = raw_line.strip()
        if stripped:
            tokens.append(stripped)

# A token's id is its position among the kept lines.
vocab_dict = dict(zip(tokens, range(len(tokens))))
keys = list(vocab_dict)

# Write the mapping out as vocab.json.
with open(vocab_json_file, "w", encoding="utf-8") as vocab_json:
    json.dump(vocab_dict, vocab_json, ensure_ascii=False, indent=4)

print(f"vocab.json created successfully with {len(vocab_dict)} tokens!")

vocab.json created successfully with 405 tokens!


In [9]:
from tokenizers import Tokenizer, Regex
from tokenizers.models import BPE
from tokenizers import pre_tokenizers
from tokenizers import processors

# Build a BPE tokenizer from the vocabulary / merge files, save it, and run a
# small tokenization check.
#
# NOTE(review): the recorded run of this cell produces [UNK] for 'Br', 'Cl'
# and the bracket atoms even though they are in the vocabulary. Presumably the
# BPE model starts from single characters ('r', 'l', '[' ...) that the
# vocabulary/merges cannot rebuild -- confirm before relying on this tokenizer.

# Paths to your files
vocab_path = "build_tokenizer/vocab.json"
merges_path = "build_tokenizer/merges_cleaned.txt"

# Step 1: Load BPE model with vocab and merges.
# BPE.from_file is the supported way to build the model from files; passing
# file paths straight to BPE(...) is deprecated and emits a warning.
tokenizer = Tokenizer(BPE.from_file(
    vocab_path,
    merges_path,
    unk_token="[UNK]",
    continuing_subword_prefix="",
    end_of_word_suffix="",
    fuse_unk=True,
    byte_fallback=False,
    ignore_merges=False,
))

# Step 2: Define and set pre-tokenizer using regex pattern.
# The backslash alternative must be a literal `\\`: an optional `\\?` can match
# the empty string, which would shadow every alternative listed after it.
regex_pattern = r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/\/?|:|~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])"
pre_tokenizer = pre_tokenizers.Split(Regex(regex_pattern), behavior="isolated")
tokenizer.pre_tokenizer = pre_tokenizer

# Step 3: Resolve special-token ids from the loaded vocabulary instead of
# assuming they equal the positions in this list.
special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
special_token_ids = {token: tokenizer.token_to_id(token) for token in special_tokens}
missing_special_tokens = [token for token, token_id in special_token_ids.items() if token_id is None]
if missing_special_tokens:
    raise ValueError(f"Special tokens missing from vocabulary: {missing_special_tokens}")

# Step 4: Configure padding/truncation (optional)
tokenizer.enable_padding(pad_id=special_token_ids["[PAD]"], pad_token="[PAD]")
tokenizer.enable_truncation(max_length=768)

# Step 5: Add post-processor for special tokens
seq_processor = processors.TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", special_token_ids["[CLS]"]),
        ("[SEP]", special_token_ids["[SEP]"]),
    ],
)

tokenizer.post_processor = seq_processor

# Step 6: Save tokenizer in Hugging Face format (optional)
tokenizer.save("build_tokenizer/tokenizer.json")

# Test tokenizer functionality
test_sequence = "BrClCC[C@@H][C@H]CCC"
pre_tokenized_output = tokenizer.pre_tokenizer.pre_tokenize_str(test_sequence)
print("Pre-tokenized output:", pre_tokenized_output)

encoded_output = tokenizer.encode(test_sequence)
print("Encoded tokens:", encoded_output.tokens)

# Debugging: Check if specific tokens exist in the vocabulary
vocab = tokenizer.get_vocab()
for probe_token in ("Br", "Cl", "C", "[C@@H]", "[C@H]"):
    print(f"Is '{probe_token}' in vocab?", probe_token in vocab)


Pre-tokenized output: [('Br', (0, 2)), ('Cl', (2, 4)), ('C', (4, 5)), ('C', (5, 6)), ('[C@@H]', (6, 12)), ('[C@H]', (12, 17)), ('C', (17, 18)), ('C', (18, 19)), ('C', (19, 20))]
Encoded tokens: ['[CLS]', 'B', '[UNK]', 'C', '[UNK]', 'C', 'C', '[UNK]', 'C', '[UNK]', '[UNK]', 'C', '[UNK]', 'C', 'C', 'C', '[SEP]']
Is 'Br' in vocab? True
Is 'Cl' in vocab? True
Is 'C' in vocab? True
Is '[C@@H]' in vocab? True
Is '[C@H]' in vocab? True


  tokenizer = Tokenizer(BPE(vocab=vocab_path,


In [14]:
# Regex is used for the pre-tokenizer below, so it must be imported in this
# cell; otherwise the cell only runs when an earlier cell leaked the name.
from tokenizers import Tokenizer, Regex
from tokenizers.models import WordPiece  # Let's try WordPiece instead of BPE for now
from tokenizers import pre_tokenizers
from tokenizers import processors

# First, let's load and check the vocabulary
import json
# Read with an explicit UTF-8 encoding rather than the platform default, so
# non-ASCII tokens survive regardless of locale.
with open("build_tokenizer/vocab.json", "r", encoding="utf-8") as f:
    vocab = json.load(f)

# Create tokenizer with WordPiece model
tokenizer = Tokenizer(WordPiece(vocab=vocab, unk_token="[UNK]"))

# Set up pre-tokenizer: isolate each atom / bond / ring-closure symbol
regex_pattern = r"(\[[^\]]+]|Br|Cl|N|O|S|P|F|I|B|C|n|o|s|p|c|\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9])"
pre_tokenizer = pre_tokenizers.Split(Regex(regex_pattern), behavior="isolated")
tokenizer.pre_tokenizer = pre_tokenizer

# Resolve special-token ids from the vocabulary instead of assuming they
# equal the positions in this list.
special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
special_token_ids = {token: tokenizer.token_to_id(token) for token in special_tokens}
missing_special_tokens = [token for token, token_id in special_token_ids.items() if token_id is None]
if missing_special_tokens:
    raise ValueError(f"Special tokens missing from vocabulary: {missing_special_tokens}")

# Configure padding/truncation
tokenizer.enable_padding(pad_id=special_token_ids["[PAD]"], pad_token="[PAD]")
tokenizer.enable_truncation(max_length=768)

# Wrap every encoded sequence in [CLS] ... [SEP]
seq_processor = processors.TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", special_token_ids["[CLS]"]),
        ("[SEP]", special_token_ids["[SEP]"]),
    ],
)
tokenizer.post_processor = seq_processor

# Test
test_sequence = "BrClCC[C@@H][C@H]CCC"
print("\nTesting tokenization:")
print("Pre-tokenized:", tokenizer.pre_tokenizer.pre_tokenize_str(test_sequence))
encoded = tokenizer.encode(test_sequence)
print("Encoded tokens:", encoded.tokens)
print("Token IDs:", encoded.ids)

Vocabulary sample: [('[PAD]', 0), ('[UNK]', 1), ('[CLS]', 2), ('[SEP]', 3), ('[MASK]', 4)]

Testing tokenization:
Pre-tokenized: [('Br', (0, 2)), ('Cl', (2, 4)), ('C', (4, 5)), ('C', (5, 6)), ('[C@@H]', (6, 12)), ('[C@H]', (12, 17)), ('C', (17, 18)), ('C', (18, 19)), ('C', (19, 20))]
Encoded tokens: ['[CLS]', 'Br', 'Cl', 'C', 'C', '[C@@H]', '[C@H]', 'C', 'C', 'C', '[SEP]']
Token IDs: [2, 97, 42, 83, 83, 67, 141, 83, 83, 83, 3]


In [17]:
from tokenizers import Tokenizer, Regex
from tokenizers.models import BPE
from tokenizers import pre_tokenizers
from tokenizers import processors

# Build a BPE tokenizer from an in-memory vocabulary and merge list, then run
# a small tokenization check.

# First, let's check our input files
import json
# Read with an explicit UTF-8 encoding rather than the platform default.
with open("build_tokenizer/vocab.json", "r", encoding="utf-8") as f:
    vocab = json.load(f)
    print("Vocabulary sample:", list(vocab.items())[:5])

# Each merge line holds two whitespace-separated tokens; turn it into a
# (first, second) tuple. Blank lines are skipped (they would otherwise become
# empty tuples) and anything that is not a pair is reported with its line number.
merges = []
with open("build_tokenizer/merges_cleaned.txt", "r", encoding="utf-8") as f:
    for line_number, line in enumerate(f, start=1):
        parts = line.split()
        if not parts:
            continue
        if len(parts) != 2:
            raise ValueError(
                f"merges_cleaned.txt line {line_number}: expected 2 tokens, got {len(parts)}: {line!r}"
            )
        merges.append((parts[0], parts[1]))
print("Merges sample:", merges[:5])

# Create BPE model with explicit vocab and merges
tokenizer = Tokenizer(BPE(
    vocab=vocab,
    merges=merges,
    cache_capacity=10000,
    dropout=None,
    unk_token="[UNK]",
    continuing_subword_prefix="",
    end_of_word_suffix="",
    fuse_unk=True
))

# Set up pre-tokenizer: isolate each atom / bond / ring-closure symbol
regex_pattern = r"(\[[^\]]+]|Br|Cl|N|O|S|P|F|I|B|C|n|o|s|p|c|\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9])"
pre_tokenizer = pre_tokenizers.Split(Regex(regex_pattern), behavior="isolated")
tokenizer.pre_tokenizer = pre_tokenizer

# Resolve special-token ids from the vocabulary instead of assuming they
# equal the positions in this list.
special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
special_token_ids = {token: tokenizer.token_to_id(token) for token in special_tokens}
missing_special_tokens = [token for token, token_id in special_token_ids.items() if token_id is None]
if missing_special_tokens:
    raise ValueError(f"Special tokens missing from vocabulary: {missing_special_tokens}")

# Configure padding/truncation
tokenizer.enable_padding(pad_id=special_token_ids["[PAD]"], pad_token="[PAD]")
tokenizer.enable_truncation(max_length=768)

# Wrap every encoded sequence in [CLS] ... [SEP]
seq_processor = processors.TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", special_token_ids["[CLS]"]),
        ("[SEP]", special_token_ids["[SEP]"]),
    ],
)
tokenizer.post_processor = seq_processor

# Test
test_sequence = "BrClCC[C@@H][C@H]CCC"
print("\nTesting tokenization:")
print("Pre-tokenized:", tokenizer.pre_tokenizer.pre_tokenize_str(test_sequence))
encoded = tokenizer.encode(test_sequence)
print("Encoded tokens:", encoded.tokens)
print("Token IDs:", encoded.ids)

Vocabulary sample: [('[PAD]', 0), ('[UNK]', 1), ('[CLS]', 2), ('[SEP]', 3), ('[MASK]', 4)]
Merges sample: [('=', 'O'), ('C', 'C'), ('N', 'C'), ('C', 'O'), ('c', 'c')]

Testing tokenization:
Pre-tokenized: [('Br', (0, 2)), ('Cl', (2, 4)), ('C', (4, 5)), ('C', (5, 6)), ('[C@@H]', (6, 12)), ('[C@H]', (12, 17)), ('C', (17, 18)), ('C', (18, 19)), ('C', (19, 20))]
Encoded tokens: ['[CLS]', 'B', '[UNK]', 'C', '[UNK]', 'C', 'C', '[UNK]', 'C', '[UNK]', '[UNK]', 'C', '[UNK]', 'C', 'C', 'C', '[SEP]']
Token IDs: [2, 122, 1, 83, 1, 83, 83, 1, 83, 1, 1, 83, 1, 83, 83, 83, 3]


In [47]:
import json

from tokenizers import Tokenizer, Regex, processors
from tokenizers.models import WordPiece
from tokenizers import pre_tokenizers

# Load the vocabulary explicitly so this cell does not depend on a `vocab`
# variable left behind in the kernel by another cell.
with open("build_tokenizer/vocab.json", "r", encoding="utf-8") as f:
    vocab = json.load(f)

# Create WordPiece tokenizer
tokenizer = Tokenizer(WordPiece(
    vocab=vocab,
    unk_token="[UNK]",
    max_input_chars_per_word=100
))

# Chemical regex pattern with correct escaping.
# Alternatives are listed longest-first so that multi-character fragments are
# matched before their shorter prefixes.
chemical_pattern = r"(\[[^\]]+]|C\(=N\)N|CCC\(C\)|\(CCCN\)|NC\(=O\)|C\(C\)=O|=C\(N\)N|N=C\(N\)|NC\(=N\)|C\(=O\)C|CS\(=O\)|OC\(=O\)|C\(=O\)c|c\(=O\)n|C\(=O\)O|C\(N\)=O|cc\(Br\)|CC\(=O\)|C\(=O\)N|ccc\(C\)|ccc\(F\)|c\(=O\)|C\(=N\)|c\(O\)c|NC\(C\)|n\(C\)c|CC\(O\)|cc\(N\)|CC\(C\)|cc\(C\)|C\(=O\)|cc\(O\)|c\(N\)c|c\(Cl\)|C\(N\)N|N\(C\)C|NC\(N\)|=C\(N\)|C\(O\)C|c\(OC\)|\(C#N\)|C\(C\)C|CC\(N\)|C\(C\)N|c\(CO\)|c\(Br\)|\(CCO\)|C\(CC\)|S\(=O\)|c\(C\)c|\(=N\)|c\(O\)|\(Br\)|\(CS\)|c\(C\)|\(CC\)|c\(I\)|C\(C\)|N\(C\)|C\(O\)|C\(I\)|C\(F\)|\(Cl\)|n\(C\)|\(OC\)|\(=O\)|c\(F\)|CCCN\)|\(=S\)|c\(N\)|\(CO\)|C\(N\)|\(C\)|ccccc|\(S\)|\(F\)|\(O\)|C#N\)|CCO\)|\(N\)|C\(=N|\(I\)|CSSC|=N\)|CC=O|CCCO|Cl\)|CCNO|=O\)|CCSC|\(=N|CO\)|CCNC|CCCC|=S\)|CN=C|CCCS|cccc|CCCN|Br\)|cccn|CS\)|C=CC|OC\)|CC=C|cnn|=NC|COC|OCC|\(O|CCS|CNc|#Cc|=CC|ccn|C=C|CSc|ccc|NCc|CCO|N=C|cnc|I\)|CCc|OCc|CCl|ccs|COc|CCn|CSC|SCC|NCC|CCN|CNC|C#C|C=O|CNO|CCC|SSC|C#N|O=C|NOC|S\)|csc|ncc|C\)|N\)|\(C|ncn|F\)|O\)|N#C|nnc|CSS|cco|Cl|NC|nc|co|CS|CO|no|cc|CN|cn|SS|OC|\)|SN|nn|CC|#C|NO|=S|NS|cs|=C|Oc|=O|oc|Nc|Cc|=N|NN|C=|C#|\(|SC|sc|Br|N#|#N|p|O|I|N|C|s|=|c|B|S|F|n|P|#|o)"

tokenizer.pre_tokenizer = pre_tokenizers.Split(Regex(chemical_pattern), behavior="isolated")

# Resolve the ids of the wrapping tokens from the vocabulary and fail early
# with a clear message if either is absent.
cls_id = tokenizer.token_to_id("[CLS]")
sep_id = tokenizer.token_to_id("[SEP]")
if cls_id is None or sep_id is None:
    raise ValueError("Vocabulary must contain both '[CLS]' and '[SEP]'")

# Add post-processor for special tokens
tokenizer.post_processor = processors.TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", cls_id),
        ("[SEP]", sep_id),
    ],
)

# save the tokenizer
tokenizer.save("build_tokenizer/tokenizer.json")

# Test sequences
test_sequences = [
    "CCC",
    "CCCC",
    "BrCCCB",
    "BrClCC[C@@H][C@H]CCC",
    "C(=O)N",  # Test parentheses
    "CC(=O)CC"  # Test more complex structure
]

for seq in test_sequences:
    print(f"\nTesting: {seq}")
    pre_tokens = tokenizer.pre_tokenizer.pre_tokenize_str(seq)
    print("Pre-tokenized:", pre_tokens)
    encoded = tokenizer.encode(seq)
    print("Final tokens:", encoded.tokens)


Testing: CCC
Pre-tokenized: [('CCC', (0, 3))]
Final tokens: ['[CLS]', 'CCC', '[SEP]']

Testing: CCCC
Pre-tokenized: [('CCCC', (0, 4))]
Final tokens: ['[CLS]', 'CCCC', '[SEP]']

Testing: BrCCCB
Pre-tokenized: [('Br', (0, 2)), ('CCC', (2, 5)), ('B', (5, 6))]
Final tokens: ['[CLS]', 'Br', 'CCC', 'B', '[SEP]']

Testing: BrClCC[C@@H][C@H]CCC
Pre-tokenized: [('Br', (0, 2)), ('Cl', (2, 4)), ('CC', (4, 6)), ('[C@@H]', (6, 12)), ('[C@H]', (12, 17)), ('CCC', (17, 20))]
Final tokens: ['[CLS]', 'Br', 'Cl', 'CC', '[C@@H]', '[C@H]', 'CCC', '[SEP]']

Testing: C(=O)N
Pre-tokenized: [('C(=O)N', (0, 6))]
Final tokens: ['[CLS]', 'C(=O)N', '[SEP]']

Testing: CC(=O)CC
Pre-tokenized: [('CC(=O)', (0, 6)), ('CC', (6, 8))]
Final tokens: ['[CLS]', 'CC(=O)', 'CC', '[SEP]']


In [2]:
from tokenizers import Tokenizer

# Reload the tokenizer that was serialized to disk.
tokenizer = Tokenizer.from_file("build_tokenizer/tokenizer.json")

# Long peptide SMILES string used for the round-trip check.
sequence = 'N1[C@@H](CCC1)C(=O)N[C@H](CC(=O)O)C(=O)N[C@@H](CC1=CNC=N1)C(=O)N[C@@H](CO)C(=O)N[C@@H]([C@H](CC)C)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CC(=O)N3)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CO)C(=O)N[C@@H](Cc1ccccc1)C(=O)NCC(=O)N[C@H](CCC[C@H](C(=O)O)N)C(=O)N[C@@H](CC(=O)N)C(=O)N[C@H](CCC[C@H](C(=O)O)N)C(=O)N[C@@H](CC(=CN2)C1=C2C=CC=C1)C(=O)N[C@@H](CCSC)C(=O)N[C@@H](CCSC)C(=O)N[C@@H]([C@H](O)C)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CO)C(=O)N[C@@H](C)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](C(C)C)C(=O)N[C@@H](CC(=O)N)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CC(=O)O)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CC1=CNC=N1)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CCSC)C(=O)N[C@@H](CC1=CNC=N1)C(=O)N[C@@H](C(C)C)C(=O)N[C@@H]([C@H](O)C(=O)O)C(=O)N[C@@H](CC1=CNC=N1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](C)C(=O)N[C@H](CS)C(=O)N[C@@H](Cc1ccccc1Cl)C(=O)N[C@@H](CCC(=O)N)C(=O)N[C@@H](CCSC)C(=O)N[C@@H](CCC(=O)N)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](Cc1c[nH]c2c1c(N)ccc2)C(=O)O'
char_count = len(sequence)
print(char_count)

# Encode the sequence and report the length of the resulting encoding.
tokens = tokenizer.encode(sequence)
token_count = len(tokens)
print(token_count)

# Decode the ids back to text, then strip the separating spaces and the
# [CLS] / [SEP] markers so the result can be compared with the input.
decoded = tokenizer.decode(tokens.ids)
for unwanted in (" ", "[CLS]", "[SEP]"):
    decoded = decoded.replace(unwanted, "")

print(decoded)


# Round-trip check: the cleaned decoding should equal the original string.
roundtrip_ok = decoded == sequence
print(roundtrip_ok)


972
286
N1[C@@H](CCC1)C(=O)N[C@H](CC(=O)O)C(=O)N[C@@H](CC1=CNC=N1)C(=O)N[C@@H](CO)C(=O)N[C@@H]([C@H](CC)C)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CC(=O)N3)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CO)C(=O)N[C@@H](Cc1ccccc1)C(=O)NCC(=O)N[C@H](CCC[C@H](C(=O)O)N)C(=O)N[C@@H](CC(=O)N)C(=O)N[C@H](CCC[C@H](C(=O)O)N)C(=O)N[C@@H](CC(=CN2)C1=C2C=CC=C1)C(=O)N[C@@H](CCSC)C(=O)N[C@@H](CCSC)C(=O)N[C@@H]([C@H](O)C)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CO)C(=O)N[C@@H](C)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](C(C)C)C(=O)N[C@@H](CC(=O)N)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CC(=O)O)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CC1=CNC=N1)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CCSC)C(=O)N[C@@H](CC1=CNC=N1)C(=O)N[C@@H](C(C)C)C(=O)N[C@@H]([C@H](O)C(=O)O)C(=O)N[C@@H](CC1=CNC=N1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](C)C(=O)N[C@H](CS)C(=O)N[C@@H](Cc1ccccc1Cl)C(=O)N[C@@H](CCC(=O)N)C(=O)N[C@@H](CCSC)C(=O)N[C@@H](CCC(=O)N)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](Cc1c[nH]c2c1c(N)ccc2)C(=O)O
True


In [12]:
from tokenizers import Tokenizer, Regex, processors
from tokenizers.models import WordPiece
from tokenizers import pre_tokenizers
from transformers import PreTrainedTokenizerFast

# Wrap the `tokenizers` object in a PreTrainedTokenizerFast, declaring which
# vocabulary entries play the special-token roles.
special_token_roles = {
    "unk_token": "[UNK]",
    "pad_token": "[PAD]",
    "cls_token": "[CLS]",
    "sep_token": "[SEP]",
    "mask_token": "[MASK]",
}
fast_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    **special_token_roles
)

# Attach tokenizer metadata
fast_tokenizer.name_or_path = "chemical-wordpiece-tokenizer"
fast_tokenizer.model_max_length = 768  # or whatever max length you want

# Uploading requires an authenticated session (make sure you're logged in first)
from huggingface_hub import login

# Upload to the Hub; the call is the cell's last expression, so the returned
# commit info is displayed as the cell output.
hub_repo_id = "aaronfeller/PeptideMTR"  # e.g., "pharmapsychotic/chemical-wordpiece"
fast_tokenizer.push_to_hub(
    hub_repo_id,
    commit_message="Add chemical WordPiece tokenizer"
)

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/aaronfeller/PeptideMTR/commit/bbd6290716d787242ecbfa325211474cbb8b6161', commit_message='Add chemical WordPiece tokenizer', commit_description='', oid='bbd6290716d787242ecbfa325211474cbb8b6161', pr_url=None, repo_url=RepoUrl('https://huggingface.co/aaronfeller/PeptideMTR', endpoint='https://huggingface.co', repo_type='model', repo_id='aaronfeller/PeptideMTR'), pr_revision=None, pr_num=None)

In [21]:
from transformers import PreTrainedTokenizerFast

# Load the published tokenizer back from the Hugging Face Hub.
tokenizer = PreTrainedTokenizerFast.from_pretrained("aaronfeller/PeptideMTR")

# Quick smoke test on a tiny molecule. Only the last expression of a cell is
# displayed, so print the results explicitly instead of discarding them.
smoke_ids = tokenizer.encode("CCC")
print(smoke_ids)
print(tokenizer.decode(smoke_ids))

# Long peptide SMILES string used for the round-trip check.
sequence = 'N1[C@@H](CCC1)C(=O)N[C@H](CC(=O)O)C(=O)N[C@@H](CC1=CNC=N1)C(=O)N[C@@H](CO)C(=O)N[C@@H]([C@H](CC)C)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CC(=O)N3)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CO)C(=O)N[C@@H](Cc1ccccc1)C(=O)NCC(=O)N[C@H](CCC[C@H](C(=O)O)N)C(=O)N[C@@H](CC(=O)N)C(=O)N[C@H](CCC[C@H](C(=O)O)N)C(=O)N[C@@H](CC(=CN2)C1=C2C=CC=C1)C(=O)N[C@@H](CCSC)C(=O)N[C@@H](CCSC)C(=O)N[C@@H]([C@H](O)C)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CO)C(=O)N[C@@H](C)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](C(C)C)C(=O)N[C@@H](CC(=O)N)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CC(=O)O)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CC1=CNC=N1)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CCSC)C(=O)N[C@@H](CC1=CNC=N1)C(=O)N[C@@H](C(C)C)C(=O)N[C@@H]([C@H](O)C(=O)O)C(=O)N[C@@H](CC1=CNC=N1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](C)C(=O)N[C@H](CS)C(=O)N[C@@H](Cc1ccccc1Cl)C(=O)N[C@@H](CCC(=O)N)C(=O)N[C@@H](CCSC)C(=O)N[C@@H](CCC(=O)N)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](Cc1c[nH]c2c1c(N)ccc2)C(=O)O'

# Encode to token ids (special tokens included).
tokens = tokenizer.encode(sequence)
print(tokens)

# Decode without the special tokens.
decoded = tokenizer.decode(tokens, skip_special_tokens=True)
print(decoded)

# remove spaces in decoded so it can be compared with the original string
decoded = decoded.replace(" ", "")

print(decoded == sequence)


[2, 72, 107, 67, 239, 219, 107, 117, 237, 141, 239, 244, 117, 237, 67, 239, 83, 107, 278, 220, 253, 107, 117, 237, 67, 250, 237, 67, 149, 141, 376, 269, 237, 67, 239, 65, 107, 231, 107, 117, 237, 67, 239, 242, 117, 237, 67, 239, 237, 26, 117, 237, 67, 239, 242, 117, 237, 67, 250, 237, 67, 239, 65, 107, 231, 107, 117, 237, 243, 72, 141, 239, 219, 141, 239, 235, 238, 232, 237, 67, 239, 237, 117, 237, 141, 239, 219, 141, 239, 235, 238, 232, 237, 67, 239, 83, 149, 278, 72, 69, 117, 83, 107, 278, 69, 381, 278, 107, 117, 237, 67, 239, 277, 117, 237, 67, 239, 277, 117, 237, 67, 149, 141, 270, 269, 237, 67, 239, 242, 117, 237, 67, 250, 237, 67, 240, 237, 67, 239, 243, 238, 237, 67, 239, 254, 257, 232, 237, 67, 239, 240, 269, 237, 67, 239, 237, 117, 237, 67, 239, 254, 257, 232, 237, 67, 239, 244, 117, 237, 67, 239, 65, 107, 231, 107, 117, 237, 67, 239, 83, 107, 278, 220, 253, 107, 117, 237, 67, 239, 254, 257, 232, 237, 67, 239, 277, 117, 237, 67, 239, 83, 107, 278, 220, 253, 107, 117, 237, 67, 