In [3]:
%%capture
!pip install datasets

%matplotlib inline

import os
import re
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import torch
from datasets import Dataset
from tokenizers import (
    decoders,
    models,
    processors,
    Tokenizer,
    pre_tokenizers,
)
from torch.nn.functional import cross_entropy, mse_loss
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel, GPT2Model, GPT2Config, AutoTokenizer

In [4]:
# Mount Google Drive so the files stored there are reachable from the Colab runtime.

from google.colab import drive
drive.mount('/content/drive')

# Base folder of the data files; later cells build their paths from this string.
drive_path = '/content/drive/MyDrive/NN/data'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Description of the Implementation:


The aim of this project is to reproduce the tokenization technique proposed in the paper *“xVal: A Continuous Number Encoding for Large Language Models”*.
xVal embeds all numerical values along a single learnable direction of the embedding space. Given a string made of text and numbers, every numerical value is extracted and collected in a separate list called x_num, and each number in the string is replaced with the same token <font color='olive'>[NUM]</font>, which acts as a placeholder.

Among the datasets proposed by the authors of the paper, I chose the one on planetary motion. I obtained the dataset from simulations generated with the <font color='olive'>REBOUND</font> library. This is an example of what it contains:

`{'description':{'planet0':{'m':1.3125619386712204,'a':1.944262696822503,'e':1.89780665411419} `

As the sample above shows, the dataset is made of a few keys and a lot of numbers. Each sample, i.e. each row of the dataset, represents one simulation. Each simulation contains several planets (at least 2), and each planet is described by 3 attributes: mass <font color='orange'>m</font>, eccentricity <font color='orange'>e</font> and semi-major axis <font color='orange'>a</font>.


## Import Data

In [5]:
"""
All data are obtained through a simulation that uses REBOUND library
training_data a list composed by 300k sublists
Each sublists has a variable number of samples
"""
training_data = Dataset.from_text(drive_path + '/training')
test_data = Dataset.from_text(drive_path + '/test')


print(training_data)
print('\n')
print(training_data.info)
print('\n')
print(test_data)
print('\n')
# what the first 100 characters of the 1st row look like:
print(training_data['text'][0][:100])

Dataset({
    features: ['text'],
    num_rows: 300000
})


DatasetInfo(description='', citation='', homepage='', license='', features={'text': Value(dtype='string', id=None)}, post_processed=None, supervised_keys=None, task_templates=None, builder_name='text', dataset_name='text', config_name='default', version=0.0.0, splits={'train': SplitInfo(name='train', num_bytes=2031998548, num_examples=300000, shard_lengths=[74235, 74445, 74508, 74296, 2516], dataset_name='text')}, download_checksums={'/content/drive/MyDrive/NN/data/training': {'num_bytes': 2031098548, 'checksum': None}}, download_size=2031098548, post_processing_size=None, dataset_size=2031998548, size_in_bytes=4063097096)


Dataset({
    features: ['text'],
    num_rows: 125000
})


{'description':{'planet0':{'m':3.8050266189989066,'a':1.2661963245440728,'e':0.5255877225805115},'pl


In [None]:
def keys_extraction(text):
    """Return the list of words found between single quotes in ``text``.

    In the dataset every key is written as 'key', so the non-greedy pattern
    captures the shortest run of characters enclosed by each pair of quotes.
    Matches are returned in order of appearance, duplicates included.
    """
    quoted_word = re.compile(r"'(.*?)'")
    return [match.group(1) for match in quoted_word.finditer(text)]

In [None]:
"""
As previously stated in the Report, the Dataset is composed by few words and a lot of numbers.
We are going to see how many different words we have inside our Dataset (some of them are fixed, while other can be specified
during the simulation like the number of planets)

"""

keys_occurrences = training_data.map( # by doing this, we will have all keys (words)
    lambda x: {"keys": keys_extraction(x['text'])}, # create a column of keys
    remove_columns=['text'] # get rid of the text column
)

# inside keys we have all the occurrences of the words
# now we create a set of them and see how many different words we have inside our dataset
keys = set()
for row in keys_occurrences['keys']:
    for word in row:
        keys.add(word)

print(f"We have a Total of {len(keys)} Keys, which are: {keys}")

# count the number of occurrences of all the keys
m = 0
desc = 0
pl0 = 0
pl1 = 0
pl2 = 0
pl3 = 0
pl4 = 0

for row in keys_occurrences['keys']:
    m += row.count('m')
    desc += row.count('description')
    pl0 += row.count('planet0')
    pl1 += row.count('planet1')
    pl2 += row.count('planet2')
    pl3 += row.count('planet3')
    pl4 += row.count('planet4')

print(f"Number of occurrences of all the keys 'm', 'a', 'e': {m}")
print(f"Number of occurrences of all the keys 'description', 'data', 'stepsize': {desc}")
print(f"Number of occurrences of all the keys 'planet0': {pl0}, 'planet1': {pl1}, 'planet2': {pl2}, 'planet3': {pl3}, 'planet4': {pl4}")

Map:   0%|          | 0/300000 [00:00<?, ? examples/s]

We have a Total of 11 Keys, which are: {'m', 'data', 'description', 'planet3', 'planet1', 'planet0', 'e', 'planet4', 'planet2', 'stepsize', 'a'}
Number of occurrences of all the keys 'm', 'a', 'e': 1049975
Number of occurrences of all the keys 'description', 'data', 'stepsize': 300000
Number of occurrences of all the keys 'planet0': 300000, 'planet1': 300000, 'planet2': 225177, 'planet3': 149920, 'planet4': 74878


As the numbers above show, each simulation contains exactly one occurrence of each of the keys <font color="cyan">description</font>, <font color='cyan'>data</font> and <font color='cyan'>stepsize</font>.
Each simulation also has at least 2 planets (<font color='orange'>planet0</font> and <font color='orange'>planet1</font>). The keys <font color='olive'>m</font>, <font color='olive'>a</font>, <font color='olive'>e</font> occur once for every planet involved in a simulation.

In [6]:
# Pattern shared by extraction and substitution so the two can never disagree.
# It matches an optionally negative number written either with a decimal point
# (any number of integer digits, optional exponent: 3.5, -12.25, 1.5e-05) or
# with an exponent only (1e-05). Bare integers are deliberately NOT matched:
# key names such as 'planet0' contain digits and must stay untouched.
_NUMBER_RE = re.compile(r"-?[0-9]+(?:\.[0-9]*(?:[eE][-+]?[0-9]+)?|[eE][-+]?[0-9]+)")


def numbers_extraction(data):
    """Return every number found in the string ``data``, in order of appearance.

    The numbers are returned as the substrings matched in the text (not
    converted to float).
    """
    return _NUMBER_RE.findall(data)


def numbers_substitution(data):
    """Return ``data`` with every number replaced by the '[NUM]' placeholder.

    Uses the same pattern as ``numbers_extraction``, so the i-th '[NUM]' in the
    result corresponds to the i-th extracted number.
    """
    return _NUMBER_RE.sub('[NUM]', data)

def numerical_processing(data):
    """Replace the 'text' column of ``data`` with two derived columns.

    For every row:
      * 'nums'    - the numbers returned by ``numbers_extraction`` for the text
      * 'no_nums' - the text returned by ``numbers_substitution``, i.e. with
                    each number replaced by the [NUM] token

    Returns the mapped dataset; the original 'text' column is removed.
    """
    def _split_row(row):
        text = row['text']
        return {
            'nums': numbers_extraction(text),
            'no_nums': numbers_substitution(text),
        }

    return data.map(_split_row, remove_columns=['text'])

In [7]:
# Split every training row into its extracted numbers ('nums') and its text with [NUM] placeholders ('no_nums').
processed_numbers = numerical_processing(training_data)

Map:   0%|          | 0/300000 [00:00<?, ? examples/s]

In [8]:
# Fetch the first row once. Column-first indexing
# (`processed_numbers['no_nums'][0]`) would load the entire column just to
# preview a single row.
first_row = processed_numbers[0]
print(f"First Hundred characters after replacing numbers: {first_row['no_nums'][:100]}")
print(f"Numbers replaced by [NUM]: {first_row['nums'][:6]}")

First Hundred characters after replacing numbers: {'description':{'planet0':{'m':[NUM],'a':[NUM],'e':[NUM]},'planet1':{'m':[NUM],'a':[NUM],'e':[NUM]},
Numbers replaced by [NUM]: ['3.8050266189989066', '1.2661963245440728', '0.5255877225805115', '2.9670398477427886', '1.5323926490881454', '0.9370771996085853']


# Tokenizer

Use BPE algorithm:

1. Initialize the vocabulary with all the bytes or characters in the text corpus.
2. Calculate the frequency of each byte or character in the text corpus.
3. Repeat the following steps until the desired vocabulary size is reached:
    4. Find the most frequent pair of consecutive bytes or characters in the text corpus.
    5. Merge the pair to create a new subword unit.
    6. Update the frequency counts of all the pairs affected by the merge.
    7. Add the new subword unit to the vocabulary.

In [None]:
"""
As we have seen, the samples are composed mostly by the same words.
We have to create the vocabulary of our Tokenizer :
"""
# each word of the keys inside our data is between ' ' and before a : (e.g 'description':)

vocabulary = [
    "'description':",
    "'data':",
    "'stepsize':",
    "'planet0':",
    "'planet1':",
    "'planet2':",
    "'planet3':",
    "'planet4':",
    "'m':",
    "'a':",
    "'e':",
    "[",
    "[[",
    "[[[",
    "]",
    "]]",
    "]]]",
    "{",
    "}",
    ",",
    "],[",
    "]],[[",
    "]]],[[[",
]

tokenizer = Tokenizer(models.BPE(vocab={}, merges=[]))
tokenizer.add_tokens(vocabulary)
tokenizer.add_special_tokens(['[NUM]', '[PAD]', '[UNK]', '[END]'])
# since all words are lowercase and there isn't any accent or space, we don't need to apply any normalization
tokenizer.save(drive_path + '../src/my_tokenizer.json')

In [9]:
# Wrap the saved tokenizer file in a transformers fast tokenizer and declare
# which tokens play the special roles.
# [UNK] and [PAD] are among the special tokens registered when the file was
# built; [CLS], [SEP] and [MASK] are not, and show up as extra entries
# (ids 27-29) in the vocabulary printed below.
my_tokenizer = PreTrainedTokenizerFast(
    tokenizer_file='/content/drive/MyDrive/NN/src/my_tokenizer.json',
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)

In [None]:
# Inspect the token -> id mapping of the loaded tokenizer.
my_tokenizer.vocab

{"'m':": 8,
 '{': 17,
 ']]]': 16,
 "'planet3':": 6,
 '[[[': 13,
 '[SEP]': 27,
 '[NUM]': 23,
 "'data':": 1,
 '[UNK]': 25,
 '[': 11,
 ']]],[[[': 22,
 "'a':": 9,
 '[MASK]': 29,
 ']]': 15,
 "'planet0':": 3,
 "'e':": 10,
 "'planet4':": 7,
 "'planet1':": 4,
 "'description':": 0,
 ']],[[': 21,
 '[PAD]': 24,
 '[CLS]': 28,
 ',': 19,
 '[END]': 26,
 '}': 18,
 ']': 14,
 "'stepsize':": 2,
 '[[': 12,
 "'planet2':": 5,
 '],[': 20}

In [10]:
# Encode the first 100 characters of the first row as a quick sanity check.
# Row-first indexing reads one row; `processed_numbers['no_nums'][0]` would
# load the entire column first.
enc = my_tokenizer.encode(processed_numbers[0]['no_nums'][:100])
print(enc)

[17, 0, 17, 3, 17, 8, 23, 19, 9, 23, 19, 10, 23, 18, 19, 4, 17, 8, 23, 19, 9, 23, 19, 10, 23, 18, 19]


In [11]:
# Decode the ids back to text as a round-trip check of the encoding.
dec = my_tokenizer.decode(enc)
print(dec)

{ 'description': { 'planet0': {'m': [NUM], 'a': [NUM], 'e': [NUM] }, 'planet1': {'m': [NUM], 'a': [NUM], 'e': [NUM] },


In [12]:
def tokenization(data):
    """Return ``data`` with an extra 'token_ids' column.

    Each row's 'no_nums' text (numbers already replaced by [NUM]) is encoded
    with ``my_tokenizer``; the resulting list of ids is stored under
    'token_ids'. Existing columns are kept.
    """
    def _encode_row(row):
        return {"token_ids": my_tokenizer.encode(row['no_nums'])}

    return data.map(_encode_row)


# Encode the [NUM]-substituted text of every row; the ids are stored in a 'token_ids' column.
tokenized_data = tokenization(processed_numbers)

Map:   0%|          | 0/300000 [00:00<?, ? examples/s]

In [15]:
# Row-first indexing: `tokenized_data['token_ids'][0]` would load the whole column to show one row.
print(f"20 tokenized characters: {tokenized_data[0]['token_ids'][:20]}")

20 tokenized characters: [17, 0, 17, 3, 17, 8, 23, 19, 9, 23, 19, 10, 23, 18, 19, 4, 17, 8, 23, 19]


In [None]:
"""
Definition of a Collator function to be used by the DataLoader in order to prepare our data for training
"""

def collator(batch):
    # translation to pytorch tensor
    x_id = [torch.tensor(sample['tokenized']) for sample in batch]  # all token ids
    x_num = [torch.tensor(sample['nums']) for sample in batch]      # all original numbers

    # since we have variable length for each simulation, it might be a good idea to extract the maximum one

    max_length = max([len(sample) for sample in x_id])

    # add padding to all samples in order to have them of equal length
    # my_tokenizer.pad_token_id aka 24
    x_id_pad = torch.full((len(batch), max_length), fill_value=my_tokenizer.pad_token_id, dtype=torch.long)
    for idx, sample in enumerate(x_id):
        x_id_pad[idx, :len(sample)] = torch.tensor(sample)

    # add padding to all original_numbers in order to have them of equal length
    # when you don't have a number you will have -100
    x_num_pad = torch.full((len(batch), max_length), fill_value=-100, dtype=torch.float)
    for i, sample in enumerate(x_id):     # extract all original number of each sample
        for j, token_id in enumerate(x_id):   # extract all numbers from the lis
            if token_id==23:    # if that value is a number
                x_num_pad[i, j] = x_num[i][j]

    return x_id_pad, x_num_pad


# Keep a second handle on the tokenized dataset.
pre_processed_data = tokenized_data

# `tokenized_data` already carries the 'nums' column: it is created by
# numerical_processing and kept by tokenization. Re-adding a column with the
# same name is redundant (and is expected to be rejected as a duplicate), so
# the dataset is passed to the DataLoader as it is.
train_loader = DataLoader(tokenized_data, batch_size=32, collate_fn=collator, shuffle=True)