In [None]:
!pip install sentencepiece
!pip install transformers



### transformers tokenizers Implementation

In [None]:
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, processors
from transformers import PreTrainedTokenizerFast
import json

In [None]:
# Step 0: Prepare toy sequence data and Iterator
# The toy "genome" is a single DNA motif repeated 100 times.
genome_sequence = "AAAAAAACACGCTAATTGCCCGCTTAGATCCCGATTGCTGCTCGTGCTGCTGCTATATATATATATACCCCGTTACTTGAACTGGCA" * 100
def batch_iterator(genome_sequence, batch_size=75):
    """Yield consecutive slices of a sequence, each wrapped in a one-item list.

    Args:
        genome_sequence: Sliceable sequence (a string in this notebook).
        batch_size: Maximum length of each slice; the last slice may be shorter.

    Yields:
        A list holding exactly one slice of ``genome_sequence``.
    """
    total_length = len(genome_sequence)
    for start in range(0, total_length, batch_size):
        stop = start + batch_size
        chunk = genome_sequence[start:stop]
        yield [chunk]

In [None]:
# Step 1: Initialize the BPE tokenizer
# The unknown token must be registered on the model itself; listing "[UNK]"
# among the trainer's special tokens only reserves an id for it. Without this,
# characters never seen during training cannot be mapped to [UNK].
tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))

# Step 2: Set pre-tokenizer
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

# Step 3: Train the tokenizer
# 4096 is the requested vocabulary size (the DNABERT-2 size); on this toy
# corpus the recorded run ends up with a much smaller vocabulary.
# The order of special_tokens fixes their ids: [UNK]=0 ... [MASK]=4.
trainer = trainers.BpeTrainer(
    vocab_size=4096,
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
)
tokenizer.train_from_iterator(batch_iterator(genome_sequence), trainer=trainer)

# Step 4: Set the post-processor
# Wrap every encoded sequence (or sequence pair) in [CLS] ... [SEP]; the ids of
# the two special tokens are looked up from the trained vocabulary.
tokenizer.post_processor = processors.TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        (token, tokenizer.token_to_id(token)) for token in ("[CLS]", "[SEP]")
    ],
)

# Step 5: Save the trained tokenizer and config
# tokenizer.json holds the trained tokenizer; tokenizer_config.json records
# which token string plays each special-token role.
tokenizer.save("tokenizer.json")

config = dict(
    unk_token="[UNK]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    pad_token="[PAD]",
    mask_token="[MASK]",
)
with open("tokenizer_config.json", "w") as f:
    json.dump(config, f)

In [None]:
# Read back the special-token mapping written to tokenizer_config.json
with open("tokenizer_config.json") as f:
    config = json.load(f)

# Wrap the saved tokenizer file in the transformers fast-tokenizer class,
# passing the special-token roles as keyword arguments
toy_tokenizer = PreTrainedTokenizerFast(tokenizer_file="tokenizer.json", **config)

# Round trip: encode a short sequence to ids, decode the ids back to text,
# then show both (one per line)
encoded = toy_tokenizer("ACTGACTGACTG")
decoded = toy_tokenizer.decode(encoded["input_ids"])
print(encoded, decoded, sep="\n")

{'input_ids': [1, 5, 25, 5, 25, 5, 25, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}
[CLS] A CTG A CTG A CTG [SEP]


#### Let's check the toy Tokenizer



In [None]:
# Display the tokenizer's repr (vocabulary size, special tokens, added tokens)
toy_tokenizer

PreTrainedTokenizerFast(name_or_path='', vocab_size=153, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

#### Let's check the real DNABERT-2 Tokenizer
It is the same as our toy tokenizer, except for the vocabulary size.

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel

# Download the published DNABERT-2 tokenizer from the Hugging Face Hub.
# NOTE(review): trust_remote_code=True allows Python code shipped in the model
# repository to run locally -- only use it with repositories you trust, and
# consider pinning a specific revision.
real_tokenizer = AutoTokenizer.from_pretrained("zhihan1996/DNABERT-2-117M", trust_remote_code=True)
# Display its repr for comparison with the toy tokenizer
real_tokenizer

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


PreTrainedTokenizerFast(name_or_path='zhihan1996/DNABERT-2-117M', vocab_size=4096, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

### SentencePiece Style

In [None]:
import sentencepiece as spm

In [None]:
# Write a short DNA sequence to toy_genome.txt (no trailing newline); this file
# is the training corpus for the SentencePiece model.
with open("toy_genome.txt", "w", encoding="utf-8") as genome_file:
    genome_file.write(
        "AACGCTTGCTAGCTAGCAATTGCCCGCTTAGATCCCGATTGCTGCTCGTGCTGCTGCTATATATATATATACCCCGTTACTTGAACTGGCA"
    )

In [None]:
# Train a small SentencePiece BPE model on the toy genome file.
# The special-token names ([UNK], [CLS], [SEP], [PAD], [MASK]) are the ones
# used by the DNABERT-2 tokenizer.
import os

options = {
    # Input
    "input": "toy_genome.txt",              # training corpus file
    "input_format": "text",                 # corpus format

    # Output
    "model_prefix": "dnabert2",             # prefix of the output files

    # Algorithm
    "model_type": "bpe",                    # byte-pair encoding
    "vocab_size": 50,                       # toy size; DNABERT-2 uses 4096

    # Normalization
    "normalization_rule_name": "identity",  # leave the input text unchanged

    # Rare-symbol treatment
    "byte_fallback": False,                 # no byte-level fallback

    # Merge rules
    "max_sentencepiece_length": 16,         # maximum length of one token
    "add_dummy_prefix": False,              # no prefix symbol at the beginning of the sequence

    # Special tokens and their fixed ids
    "unk_piece": "[UNK]",
    "bos_piece": "[CLS]",
    "eos_piece": "[SEP]",
    "pad_piece": "[PAD]",
    "unk_id": 0,
    "bos_id": 1,
    "eos_id": 2,
    "pad_id": 3,
    "user_defined_symbols": '[MASK]',       # extra special token needed for DNABERT-2

    # System
    "num_threads": os.cpu_count(),          # use all available CPU cores
}

spm.SentencePieceTrainer.train(**options)


In [None]:
# Load the trained model and collect every [piece, id] pair of its vocabulary
sp = spm.SentencePieceProcessor()
sp.load('dnabert2.model')
vocab = [
    [sp.id_to_piece(piece_id), piece_id]
    for piece_id in range(sp.get_piece_size())
]
#vocab

In [None]:
# Encode a short DNA string with the trained SentencePiece model and show the ids
ids = sp.encode("AATATCGATTC")
print(ids)

[10, 6, 26, 18, 46, 47]


In [None]:
# Map each id back to its piece to see how the sequence was segmented
print(list(map(sp.id_to_piece, ids)))

['AA', 'TA', 'TCG', 'AT', 'T', 'C']
