## Install transformers

In [None]:
!pip install transformers[sentencepiece]

In [3]:
input_text = "My name is Dipankar Dey"
tokenized_text = input_text.split()
print(tokenized_text)

['My', 'name', 'is', 'Dipankar', 'Dey']


## Tokenization using BertTokenizer

In [22]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
tokenized_text = tokenizer(input_text)
tokenized_text

{'input_ids': [101, 1422, 1271, 1110, 12120, 10224, 6610, 3177, 1183, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [23]:
tokenizer.convert_tokens_to_ids(input_text)

100

In [24]:
tokenizer.decode(tokenized_text['input_ids'])

'[CLS] My name is Dipankar Dey [SEP]'

In [25]:
tokenizer.tokenize(input_text)

['My', 'name', 'is', 'Di', '##pan', '##kar', 'De', '##y']

## Tokenization using Albert tokenizer

In [17]:
from transformers import AlbertTokenizer
tokenizer = AlbertTokenizer.from_pretrained("albert-base-v1")
tokenized_text = tokenizer(input_text)
tokenized_text

Downloading:   0%|          | 0.00/760k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/684 [00:00<?, ?B/s]

{'input_ids': [2, 51, 204, 25, 926, 3206, 2423, 121, 93, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [19]:
tokenizer.decode(tokenized_text['input_ids'])

'my name is dipankar dey'

In [21]:
tokenizer.tokenize(input_text)

['▁my', '▁name', '▁is', '▁di', 'pan', 'kar', '▁de', 'y']

## Tokenization using AutoTokenizer class

In [26]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [27]:
tokenizer("Using a Transformer network is simple")

{'input_ids': [101, 7993, 170, 13809, 23763, 2443, 1110, 3014, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [28]:
tokenizer.tokenize(input_text)

['My', 'name', 'is', 'Di', '##pan', '##kar', 'De', '##y']

In [29]:
tokenizer.save_pretrained("directory_on_my_computer") # saving pretrained model in local drive

('directory_on_my_computer/tokenizer_config.json',
 'directory_on_my_computer/special_tokens_map.json',
 'directory_on_my_computer/vocab.txt',
 'directory_on_my_computer/added_tokens.json',
 'directory_on_my_computer/tokenizer.json')

In [31]:
tokenizer = AutoTokenizer.from_pretrained('directory_on_my_computer') # loading pretrained tokenizer from local drive

In [34]:
input_text = 'This is my house and it is at 10 minutes by walk from Kalighat metro station'
tokens = tokenizer.tokenize(input_text)
ids = tokenizer.convert_tokens_to_ids(tokens)
tokenized_text = tokenizer(input_text)

print(ids)
print(tokenizer.decode(tokenized_text['input_ids']))

[1188, 1110, 1139, 1402, 1105, 1122, 1110, 1120, 1275, 1904, 1118, 2647, 1121, 22576, 5084, 2980, 16411, 1466]
[CLS] This is my house and it is at 10 minutes by walk from Kalighat metro station [SEP]


In [36]:
decoded_string = tokenizer.decode([1188, 1110, 1139, 1402, 1105, 1122, 1110, 1120, 1275, 1904, 1118, 2647, 1121, 22576, 5084, 2980, 16411, 1466])
print(decoded_string)

This is my house and it is at 10 minutes by walk from Kalighat metro station


In [38]:
print(tokenizer.convert_ids_to_tokens(ids))

['This', 'is', 'my', 'house', 'and', 'it', 'is', 'at', '10', 'minutes', 'by', 'walk', 'from', 'Kali', '##gh', '##at', 'metro', 'station']


- tokenizer.tokenize(input_text)

- tokenizer.covert_tokens_to_ids(token)

- tokenizer.decode(list(ids))

- tokenizer.save_pretrained('local_directory')