In [20]:
import tiktoken

class Tokenizer:
    """Small convenience wrapper around a ``tiktoken`` encoding object.

    The wrapped encoding is stored on ``self.encoding``; every method simply
    delegates to it.
    """

    def __init__(self, encoding_name: str = "cl100k_base", model_name: str = None):
        """Select the encoding to wrap.

        Args:
            encoding_name: Name passed to ``tiktoken.get_encoding``. Only
                used when ``model_name`` is falsy.
            model_name: When truthy, the encoding is looked up with
                ``tiktoken.encoding_for_model`` and ``encoding_name`` is
                ignored.
        """
        # A truthy model_name always wins over the explicit encoding name.
        self.encoding = (
            tiktoken.encoding_for_model(model_name)
            if model_name
            else tiktoken.get_encoding(encoding_name)
        )

    def encode(self, text: str) -> list:
        """Return the result of encoding ``text`` with the wrapped encoding."""
        token_ids = self.encoding.encode(text)
        return token_ids

    def decode(self, tokens: list) -> str:
        """Return the result of decoding ``tokens`` with the wrapped encoding."""
        text = self.encoding.decode(tokens)
        return text

    def token_count(self, text: str) -> int:
        """Return how many tokens ``text`` encodes to."""
        token_ids = self.encoding.encode(text)
        return len(token_ids)

    def __call__(self, text: str) -> list:
        """Make the instance callable; equivalent to ``encode(text)``."""
        return self.encode(text)

# Demo: encode a sample string, count its tokens, and decode one slice back.
if __name__ == "__main__":
    sample_text = "Testing tokenizer!."
    tokenizer = Tokenizer(model_name="gpt-4")

    # Calling the instance goes through __call__, which delegates to encode().
    tokens = tokenizer(sample_text)
    print("Encoded tokens:", tokens)

    # token_count() encodes the text again and reports the number of tokens.
    count = tokenizer.token_count(sample_text)
    print("Token count:", count)

    # Only the slice [2:3] (at most the third token) is decoded here,
    # not the whole sequence.
    decoded_text = tokenizer.decode(tokens[2:3])
    print("Decoded text:", decoded_text)


Encoded tokens: [16856, 47058, 15725]
Token count: 3
Decoded text: !.
