In [2]:
%%capture
!pip install datasets

%matplotlib inline

import os
import re
from collections import Counter

import torch
import numpy as np
from tqdm import tqdm
from datasets import Dataset
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
from torch.nn.functional import cross_entropy, mse_loss
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel, GPT2Tokenizer

from tokenizers import (
    decoders,
    models,
    processors,
    Tokenizer,
    pre_tokenizers,
)

In [3]:
# Mount Google Drive into the runtime (this cell relies on `google.colab`, so it only runs on Colab).

from google.colab import drive
drive.mount('/content/drive')

# Folder on the mounted Drive that holds the data; the loading cell below appends
# '/training' and '/test' to it.
drive_path = '/content/drive/MyDrive/NN/data'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Description of the Implementation:


The aim of this project was to reproduce the tokenization technique proposed in the paper *“xVal: A Continuous Number Encoding for Large Language Models”*.
xVal embeds all numerical values along a specific learnable direction of the embedding space. Given a string composed of text and numbers, all numerical values are extracted and collected in a separate list called x_num. The string is then modified by replacing every number with the same token <font color='olive'>[NUM]</font>, which acts as a placeholder.

Among the datasets proposed by the authors of the paper, I chose the one on planetary motion. I obtained the dataset from simulations generated with the <font color='olive'>REBOUND</font> library. This is an example of what it contains:

`{'description':{'planet0':{'m':1.3125619386712204,'a':1.944262696822503,'e':1.89780665411419} `

As we can see from the sample above, the dataset consists of a few keys and a lot of numbers. Each sample, which is represented by a row in our dataset, represents a simulation. Each simulation is composed of several planets (at least 2), each described by 3 attributes: mass <font color='orange'>m</font>, eccentricity <font color='orange'>e</font> and semi-major axis <font color='orange'>a</font>.


## Import Data

In [13]:
"""
All data are obtained through a simulation that uses the REBOUND library.
training_data is a list composed of 300k sublists.
Each sublist has a variable number of samples.
"""
# Each split is read with `Dataset.from_text`, which yields a single 'text' column
# with one row per line of the file.
training_data = Dataset.from_text(drive_path + '/training')
test_data = Dataset.from_text(drive_path + '/test')


print(training_data)
print('\n')
print(training_data.info)
print('\n')
print(test_data)
print('\n')
# What the first 100 characters of the 1st row look like.
# Index the row first and the column second: `training_data['text'][0]` has to
# fetch the entire ~2 GB 'text' column before it can keep the first element,
# whereas `training_data[0]` reads a single row.
print(training_data[0]['text'][:100])

Dataset({
    features: ['text'],
    num_rows: 300000
})


DatasetInfo(description='', citation='', homepage='', license='', features={'text': Value(dtype='string', id=None)}, post_processed=None, supervised_keys=None, task_templates=None, builder_name='text', dataset_name='text', config_name='default', version=0.0.0, splits={'train': SplitInfo(name='train', num_bytes=2031998548, num_examples=300000, shard_lengths=[74235, 74445, 74508, 74296, 2516], dataset_name='text')}, download_checksums={'/content/drive/MyDrive/NN/data/training': {'num_bytes': 2031098548, 'checksum': None}}, download_size=2031098548, post_processing_size=None, dataset_size=2031998548, size_in_bytes=4063097096)


Dataset({
    features: ['text'],
    num_rows: 125000
})


{'description':{'planet0':{'m':3.8050266189989066,'a':1.2661963245440728,'e':0.5255877225805115},'pl


In [5]:
def keys_extraction(text):
    """Return every substring of `text` that is enclosed in single quotes.

    In the dataset shown in this notebook the quoted substrings are the
    dictionary keys (e.g. 'description', 'planet0', 'm'), so the result is the
    list of keys in order of appearance, duplicates included.
    """
    # Non-greedy body: each match stops at the nearest closing quote.
    quoted = re.compile(r"'(.*?)'")
    return [match.group(1) for match in quoted.finditer(text)]

In [6]:
"""
As previously stated in the report, the dataset is composed of a few words and a lot of numbers.
We are going to see how many different words we have inside our dataset (some of them are fixed, while others can be specified
during the simulation, like the number of planets)

"""

keys_occurrences = training_data.map( # by doing this, we will have all keys (words)
    lambda x: {"keys": keys_extraction(x['text'])}, # create a column of keys
    remove_columns=['text'] # get rid of the text column
)

# Tally every key in one pass. The 'keys' column is read a single time, because
# each access to `keys_occurrences['keys']` goes through all the rows again.
key_counts = Counter()
for row in keys_occurrences['keys']:
    key_counts.update(row)

# the distinct words of the dataset are exactly the keys of the tally
keys = set(key_counts)

print(f"We have a Total of {len(keys)} Keys, which are: {keys}")

# The summary lines below print ONE number for a whole group of keys
# (the count of 'm' stands for 'm', 'a', 'e'; the count of 'description' stands
# for 'description', 'data', 'stepsize'). Verify that assumption instead of
# trusting it: nothing is printed when the group members agree.
for group in (('m', 'a', 'e'), ('description', 'data', 'stepsize')):
    group_counts = {key: key_counts[key] for key in group}
    if len(set(group_counts.values())) > 1:
        print(f"WARNING: keys {group} do not occur equally often: {group_counts}")

# count the number of occurrences of all the keys
# (a Counter returns 0 for a key that never appears, like list.count would)
m = key_counts['m']
desc = key_counts['description']
pl0 = key_counts['planet0']
pl1 = key_counts['planet1']
pl2 = key_counts['planet2']
pl3 = key_counts['planet3']
pl4 = key_counts['planet4']

print(f"Number of occurrences of all the keys 'm', 'a', 'e': {m}")
print(f"Number of occurrences of all the keys 'description', 'data', 'stepsize': {desc}")
print(f"Number of occurrences of all the keys 'planet0': {pl0}, 'planet1': {pl1}, 'planet2': {pl2}, 'planet3': {pl3}, 'planet4': {pl4}")

We have a Total of 11 Keys, which are: {'stepsize', 'description', 'planet3', 'planet1', 'planet0', 'a', 'planet4', 'planet2', 'data', 'e', 'm'}
Number of occurrences of all the keys 'm', 'a', 'e': 1049975
Number of occurrences of all the keys 'description', 'data', 'stepsize': 300000
Number of occurrences of all the keys 'planet0': 300000, 'planet1': 300000, 'planet2': 225177, 'planet3': 149920, 'planet4': 74878


As we can see from the numbers above, each simulation contains exactly one occurrence of the keys <font color="cyan">description</font> , <font color='cyan'>data</font>  and <font color='cyan'>stepsize</font> .
Every simulation also has at least 2 planets (<font color='orange'>planet0</font> and <font color='orange'>planet1</font>). The keys <font color='olive'>m</font>, <font color='olive'>a</font>, <font color='olive'>e</font>  occur once for each of the planets involved in that simulation.

In [14]:
# One numeric literal: optional minus sign, one or more integer digits, then
# either a fractional part with an optional exponent ("12.5", "-0.3", "1.5e-05")
# or a bare exponent ("1e-05").
# A '.' or an exponent is mandatory, so the digit inside a key such as 'planet0'
# is never mistaken for a value. Only non-capturing groups are used, so
# `findall` returns the whole literal rather than a sub-group.
# Shared by both functions below so extraction and substitution always agree.
_NUMBER_PATTERN = re.compile(
    r"-?[0-9]+(?:\.[0-9]*(?:[eE][-+]?[0-9]+)?|[eE][-+]?[0-9]+)"
)


def numbers_extraction(data):
    """Return all numeric literals found in `data`, in order of appearance.

    Args:
        data: text of one sample (keys mixed with numbers).

    Returns:
        A list of strings, one per number, exactly as written in `data`
        (sign, every integer digit and any exponent included).
    """
    return _NUMBER_PATTERN.findall(data)


def numbers_substitution(data):
    """Return `data` with every numeric literal replaced by the '[NUM]' token.

    The same pattern as `numbers_extraction` is used, so the i-th '[NUM]' in the
    result corresponds to the i-th extracted number.
    """
    return _NUMBER_PATTERN.sub('[NUM]', data)

In [15]:
# Replace the 'text' column with a 'nums' column that holds, for every row,
# the list of numbers (kept as strings) extracted from its text.
def _text_to_nums(example):
    return {'nums': numbers_extraction(example['text'])}

numbers = training_data.map(_text_to_nums, remove_columns=['text'])

Map:   0%|          | 0/300000 [00:00<?, ? examples/s]

In [18]:
# Replace the 'text' column with a 'no_nums' column that holds the same text
# with every number swapped for the [NUM] placeholder.
def _text_to_placeholders(example):
    return {'no_nums': numbers_substitution(example['text'])}

training_without_numbers = training_data.map(_text_to_placeholders, remove_columns=['text'])

Map:   0%|          | 0/300000 [00:00<?, ? examples/s]

In [17]:
# First 8 numbers of the first row. Row first, column second: `numbers['nums'][0]`
# would fetch the 'nums' lists of all rows just to keep the first one.
numbers[0]['nums'][:8]

['3.8050266189989066',
 '1.2661963245440728',
 '0.5255877225805115',
 '2.9670398477427886',
 '1.5323926490881454',
 '0.9370771996085853',
 '1.0876850360386103',
 '1.798588973632218']

In [20]:
# First 100 characters of the first row after substitution. Row first, column
# second, so only one row is read instead of the whole 'no_nums' column.
training_without_numbers[0]['no_nums'][:100]

"{'description':{'planet0':{'m':[NUM],'a':[NUM],'e':[NUM]},'planet1':{'m':[NUM],'a':[NUM],'e':[NUM]},"