In [13]:
import tiktoken
from tiktoken.load import load_tiktoken_bpe

In [48]:
PAT_STR = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
ENDOFTEXT = "<|endoftext|>"
IMSTART = "<|im_start|>"
IMEND = "<|im_end|>"

In [64]:
EXTRAS = tuple((f"<|extra_{i}|>" for i in range(205)))
SPECIAL_TOKENS = (
    ENDOFTEXT,
    IMSTART,
    IMEND,
) + EXTRAS

In [71]:
def get_encoder(tiktoken_tokenizer_path: str):
    mergeable_ranks = load_tiktoken_bpe(tiktoken_tokenizer_path)
    special_tokens = {
        token: index
        for index, token in enumerate(
            SPECIAL_TOKENS, start=len(mergeable_ranks))
    }
    encoder = tiktoken.Encoding(
        "qwen",
        pat_str=PAT_STR,
        mergeable_ranks=mergeable_ranks,
        special_tokens=special_tokens,
    )
    return encoder

In [66]:
def bytes_to_unicode():
    bs = (
        list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
    )
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8 + n)
            n += 1
    cs = [chr(n) for n in cs]
    return dict(zip(bs, cs))

In [67]:
def token_bytes_to_string(b):
    byte_encoder = bytes_to_unicode()
    return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')])

In [68]:
def bpe(mergeable_ranks: dict, token: bytes, max_rank: Optional[int] = None) -> list:
    parts = [bytes([b]) for b in token]
    while True:
        min_idx = None
        min_rank = None
        for i, pair in enumerate(zip(parts[:-1], parts[1:])):
            rank = mergeable_ranks.get(pair[0] + pair[1])
            if rank is not None and (min_rank is None or rank < min_rank):
                min_idx = i
                min_rank = rank
        if min_rank is None or (max_rank is not None and min_rank >= max_rank):
            break
        assert min_idx is not None
        parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:]
    return parts

In [69]:
def generate_vocab_and_merges(encoder):
    mergeable_ranks = encoder._mergeable_ranks

    merges = []
    vocab = {}
    for token, rank in mergeable_ranks.items():
        vocab[token_bytes_to_string(token)] = rank

        if len(token) == 1:
            continue
        merged = tuple(bpe(mergeable_ranks, token, max_rank=rank))
        assert len(merged) == 2

        merges.append(' '.join(map(token_bytes_to_string, merged)))

    # also add special tokens
    vocab.update(encoder._special_tokens)

    return vocab, merges

In [73]:
import json
import os

def convert_tiktoken(tiktoken_tokenizer_path: str, model_name, output_dir=None):
    
    if output_dir is None:
        output_dir = model_name
        
    encoder = get_encoder(tiktoken_tokenizer_path)

    vocab, merges = generate_vocab_and_merges(encoder)

    added_tokens = [{
            "id": id,
            "content": content,
            "single_word": False,
            "lstrip": False,
            "rstrip": False,
            "normalized": False,
            "special": True}
        for content, id in encoder._special_tokens.items()
    ]
    
    tokenizer_template = {
        "version": "1.0",
        "truncation": None,
        "padding": None,
        "added_tokens": added_tokens,
        "normalizer": None,
        "pre_tokenizer": {
          "type": "ByteLevel",
          "add_prefix_space": False,
          "trim_offsets": True,
          "use_regex": True
        },
        "post_processor": {
          "type": "ByteLevel",
          "add_prefix_space": True,
          "trim_offsets": False,
          "use_regex": True
        },
        "decoder": {
          "type": "ByteLevel",
          "add_prefix_space": True,
          "trim_offsets": True,
          "use_regex": True
        },
        "model": {
          "type": "BPE",
          "dropout": None,
          "unk_token": None,
          "continuing_subword_prefix": "",
          "end_of_word_suffix": "",
          "fuse_unk": False,
          "byte_fallback": False,
          "vocab": vocab,
          "merges": merges
        }
    }
    
    
    tokenizer_config_template = {
        "model_max_length": 8192,
        "tokenizer_class": "AutoTokenizer"
    }

    os.makedirs(output_dir, exist_ok=True)

    # Save to files
    with open(os.path.join(output_dir, 'vocab.json'), 'w', encoding='utf-8') as fp:
        json.dump(vocab, fp, indent=2, ensure_ascii=False)

    with open(os.path.join(output_dir, 'tokenizer.json'), 'w', encoding='utf-8') as fp:
        json.dump(tokenizer_template, fp, indent=2, ensure_ascii=False)

    with open(os.path.join(output_dir, 'tokenizer_config.json'), 'w', encoding='utf-8') as fp:
        json.dump(tokenizer_config_template, fp, indent=2, ensure_ascii=False)
        
output_dir = './tokenizer'
tiktoken_tokenizer_path = "./qwen_tokenizer/qwen.tiktoken"
convert_tiktoken(tiktoken_tokenizer_path, "qwen", output_dir)

In [75]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("tokenizer", trust_remote_code=True)

RecursionError: maximum recursion depth exceeded while calling a Python object