In [1]:
from datasets import load_dataset
import torch
# Pick the GPU when PyTorch can see one, otherwise fall back to the CPU.
# The result is bound to `device` so later cells can reuse it; previously the
# object was created and immediately discarded.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [2]:
# Load the "Nan-Do/code-search-net-python" dataset through `datasets.load_dataset`.
# The result is indexed by split name in the cells below (e.g. raw_dataset['train']).
raw_dataset = load_dataset("Nan-Do/code-search-net-python")

In [3]:
# Inspect the train split; its repr lists the feature columns and the row count.
raw_dataset['train']

Dataset({
    features: ['repo', 'path', 'func_name', 'original_string', 'language', 'code', 'code_tokens', 'docstring', 'docstring_tokens', 'sha', 'url', 'partition', 'summary'],
    num_rows: 455243
})

In [4]:
# Look at the 'original_string' field of one arbitrarily chosen training row.
raw_dataset['train'][123543]['original_string']

'def _api_on_write_error(self, status_code, **kwargs):\n        """\n        Catches errors and renders it as a JSON message. Adds the traceback if\n        debug is enabled.\n        """\n\n        return_error = { "code": self.get_status() }\n        exc_info = kwargs.get("exc_info")\n\n        if exc_info and isinstance(exc_info[1], oz.json_api.ApiError):\n            return_error["error"] = exc_info[1].message\n        else:\n            return_error["error"] = API_ERROR_CODE_MAP.get(self.get_status(), "Unknown error")\n\n        if oz.settings.get("debug"):\n            return_error["trace"] = "".join(traceback.format_exception(*exc_info))\n\n        self.finish(return_error)\n        return oz.break_trigger'

In [5]:
# Hand the tokenizer batches (lists) of texts rather than one string at a time.
# Being a generator expression, this materialises just one 1000-text slice per
# step, so the whole column never has to sit in memory at once.
training_corpus = (
    raw_dataset['train'][start:start + 1000]['original_string']
    for start in range(0, len(raw_dataset['train']), 1000)
)

In [6]:
# A generator can be consumed only once: the first list() call drains it, so
# the second call finds nothing left and produces an empty list.
gen = (value for value in range(10))
print(list(gen), list(gen), sep='\n')

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
[]


In [7]:
# A generator is exhausted after one pass, so wrap its construction in a
# function: every call hands back a fresh, unconsumed generator.
def get_training_corpus(dataset=None, batch_size=1000):
    """Return a generator over batches of raw source strings.

    Args:
        dataset: Object supporting ``len()`` and slice indexing, where a
            slice returns a mapping with an ``'original_string'`` entry.
            Defaults to ``raw_dataset['train']``.
        batch_size: Number of rows per batch. Must be positive.

    Returns:
        A generator whose items are the ``'original_string'`` values of
        consecutive ``batch_size``-row slices of ``dataset``.

    Raises:
        ValueError: If ``batch_size`` is not positive (raised at call time).
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")
    if dataset is None:
        # Resolved at call time so the function follows the notebook's current dataset.
        dataset = raw_dataset['train']
    return (
        dataset[start:start + batch_size]['original_string']
        for start in range(0, len(dataset), batch_size)
    )

In [8]:
# Rebind `training_corpus` to a fresh generator obtained from get_training_corpus().
training_corpus = get_training_corpus()

In [9]:
# The same function written as a generator function: a `for` loop with
# `yield` instead of returning a generator expression.
def get_training_corpus(dataset=None, batch_size=1000):
    """Yield batches of raw source strings.

    Args:
        dataset: Object supporting ``len()`` and slice indexing, where a
            slice returns a mapping with an ``'original_string'`` entry.
            Defaults to ``raw_dataset['train']``.
        batch_size: Number of rows per batch. Must be positive.

    Yields:
        The ``'original_string'`` values of each consecutive
        ``batch_size``-row slice of ``dataset``.

    Raises:
        ValueError: If ``batch_size`` is not positive. Because this is a
            generator function, the error surfaces on the first iteration,
            not when the function is called.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")
    if dataset is None:
        # Resolved when iteration starts so the notebook's current dataset is used.
        dataset = raw_dataset['train']
    for start_idx in range(0, len(dataset), batch_size):
        samples = dataset[start_idx:start_idx + batch_size]
        yield samples['original_string']

## Training A New Tokenizer

In [10]:
from transformers import AutoTokenizer

# Load the pretrained 'gpt2' tokenizer; it serves as the baseline and as the
# starting point from which the new tokenizer is trained further below.
old_tokenizer = AutoTokenizer.from_pretrained('gpt2')

In [11]:
# Baseline: tokenize a small code-like snippet with the original GPT-2 tokenizer.
# NOTE(review): the snippet is not valid Python (the description line is not
# quoted and `return` is over-indented). It is left untouched because the
# recorded token lists and counts depend on its exact text.
sentences = '''def addition(x,y):
    Adding two numbers x and y
        return x+y'''

tokens = old_tokenizer.tokenize(sentences)
print(tokens)

['def', 'Ġaddition', '(', 'x', ',', 'y', '):', 'Ċ', 'Ġ', 'Ġ', 'Ġ', 'ĠAdding', 'Ġtwo', 'Ġnumbers', 'Ġx', 'Ġand', 'Ġy', 'Ċ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġreturn', 'Ġx', '+', 'y']


In [12]:
# Train a new tokenizer from `old_tokenizer` on our corpus, with a target
# vocabulary size of 52000.
# A fresh generator is requested here instead of reusing `training_corpus`:
# a generator can be consumed only once, so re-running this cell with the old
# object would hand the trainer an already-exhausted (empty) corpus.
new_tokenizer = old_tokenizer.train_new_from_iterator(
    get_training_corpus(),
    vocab_size=52000,
)

In [13]:
# Tokenize the same snippet with the newly trained tokenizer for comparison.
new_tokens = new_tokenizer.tokenize(sentences)

In [14]:
# Show the new tokenization; in the recorded output each newline-plus-indentation
# run is a single token instead of many separate ones.
print(new_tokens)

['def', 'Ġaddition', '(', 'x', ',', 'y', '):', 'ĊĠĠĠ', 'ĠAdding', 'Ġtwo', 'Ġnumbers', 'Ġx', 'Ġand', 'Ġy', 'ĊĠĠĠĠĠĠĠ', 'Ġreturn', 'Ġx', '+', 'y']


In [15]:
# Compare token counts for the same snippet: old GPT-2 tokenizer on the first
# line, newly trained tokenizer on the second.
print(len(tokens), len(new_tokens), sep='\n')

29
19


In [22]:
# Save the trained tokenizer into the local 'code_search_net_tokenizer_gpt2'
# directory; the call returns the paths of the files it wrote.
new_tokenizer.save_pretrained('code_search_net_tokenizer_gpt2')

('code_search_net_tokenizer_gpt2\\tokenizer_config.json',
 'code_search_net_tokenizer_gpt2\\special_tokens_map.json',
 'code_search_net_tokenizer_gpt2\\vocab.json',
 'code_search_net_tokenizer_gpt2\\merges.txt',
 'code_search_net_tokenizer_gpt2\\added_tokens.json',
 'code_search_net_tokenizer_gpt2\\tokenizer.json')

In [23]:
from huggingface_hub import notebook_login
# Log in to the Hugging Face Hub from inside the notebook (renders a login widget).
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [24]:
# Command-line login via the Hugging Face CLI.
# NOTE(review): the recorded output shows this command was interrupted (^C), and
# notebook_login() is already called in the previous cell — presumably this cell
# is redundant; confirm and consider removing it.
!huggingface-cli login

^C


In [None]:
# Upload the trained tokenizer to the Hugging Face Hub under the repository name
# 'code_search_net_tokenizer_gpt2'. NOTE(review): presumably requires a
# successful login in one of the cells above — this cell has no recorded run.
new_tokenizer.push_to_hub('code_search_net_tokenizer_gpt2')