In [61]:
import torch
import itertools
import numpy as np
import pandas as pd
import re

# Hyperparameters

In [44]:
# Largest number used in the generated lists. The create_data_* helpers below
# iterate over range(n_digits + 1), so the values 0..D_VOCAB inclusive occur.
D_VOCAB = 10 # BERT tokenizer uses single tokens for these numericals

# Util functions

In [50]:
def shuffle_data(data):
  """Return the entries of `data` in a random order.

  A random permutation of the indices is drawn from NumPy's global RNG and
  used to index `data`; the input object itself is not modified.

  Args:
    data: Collection supporting `len()` and integer-array indexing (the
      notebook passes NumPy arrays and torch tensors).

  Returns:
    The result of indexing `data` with the shuffled index array.
  """
  # np.random.permutation always yields an integer index array, even for
  # length 0. np.array(range(0)) is float64, which makes the indexing below
  # raise for empty input.
  order = np.random.permutation(len(data))
  return data[order]

def split_data(data, train_ratio=0.8):
  """Split `data` into training and validation sets according to a ratio.

  The first int(len(data) * train_ratio) entries form the training set and
  the remaining entries form the validation set. No shuffling happens here.
  """
  cut = int(len(data) * train_ratio)
  return data[:cut], data[cut:]

def add_ground_truths(data):
    """Append the ground-truth label to every row of `data`.

    The label is the maximum of the row; it is attached as one extra,
    last column.
    """
    # keepdim=True leaves the maxima as a column, ready to concatenate.
    row_max = data.max(dim=1, keepdim=True).values
    return torch.cat((data, row_max), dim=1)

# Experiment 1:

We generate training and test data such that the exact same list never appears in both sets. Trivial lists [x,x] are included.

Example:    if [3,2] in train => [3,2] not in test <br>
            if [3,2] in train => [2,3] can be in test

In [35]:
def create_data_1(n_digits=D_VOCAB, sequence_length=2):
  """Build the exhaustive set of ordered lists over the numbers 0..n_digits.

  The rows are the Cartesian product of range(n_digits + 1) with itself,
  `sequence_length` times, returned as a torch tensor with one list per row.
  """
  values = range(n_digits + 1)
  rows = [combo for combo in itertools.product(values, repeat=sequence_length)]
  return torch.tensor(rows)

def generate_data_1(n_digits=D_VOCAB, sequence_length=2, train_ratio=0.8):
  """Create the Experiment 1 train/validation split.

  All ordered lists are generated once, shuffled, and cut by `train_ratio`,
  so an identical list never ends up in both sets.

  Returns:
    A (train_data, val_data) tuple.
  """
  shuffled = shuffle_data(create_data_1(n_digits, sequence_length))
  return split_data(shuffled, train_ratio)

In [53]:
# Display one random (train, validation) split produced for Experiment 1.
generate_data_1()

(tensor([[1, 2],
         [3, 2],
         [0, 1],
         [0, 2],
         [2, 2],
         [2, 1],
         [2, 0],
         [0, 0],
         [3, 0],
         [1, 0],
         [1, 1],
         [0, 3]]),
 tensor([[2, 3],
         [3, 3],
         [1, 3],
         [3, 1]]))

# Experiment 2:
We generate training and test data such that a list and its permutation never end up in different sets.

Example: if [2,3] in train => [3,2] not in test

In [37]:
def create_data_2(n_digits=D_VOCAB, sequence_length=2):
    """Build all combinations of distinct numbers from 0..n_digits.

    itertools.combinations emits each group of `sequence_length` numbers only
    once, so no two rows are permutations of one another.
    """
    values = range(n_digits + 1)
    unordered = [combo for combo in itertools.combinations(values, sequence_length)]
    return torch.tensor(unordered)

def add_permutations_2(train_data, val_data):
    """Extend each set with the swapped version of every pair it contains.

    Run after the train/validation split, so a pair and its swap stay in the
    same set. The swapped rows are appended below the original rows.
    """
    swapped_columns = [1, 0]  # column order that reverses each pair

    def with_swaps(pairs):
        return torch.cat((pairs, pairs[:, swapped_columns]), dim=0)

    return with_swaps(train_data), with_swaps(val_data)

def add_same_numbers_2(n_digits, train_data, val_data, train_ratio=0.8):
    """Add the same-number lists [x, x] to the train and validation sets.

    One list [x, x] is built for every x in 0..n_digits. These lists are
    shuffled, divided according to `train_ratio`, and appended below the rows
    already present in each set.

    Args:
        n_digits: Largest number x for which [x, x] is generated.
        train_data: Tensor of training pairs to extend.
        val_data: Tensor of validation pairs to extend.
        train_ratio: Fraction of the same-number lists given to training.

    Returns:
        A (train_data, val_data) tuple of the extended tensors.
    """
    # Create the same-number lists [x, x] as two identical columns.
    numbers = torch.arange(n_digits + 1).unsqueeze(1)
    same_number_lists = torch.cat((numbers, numbers), dim=1)

    # Shuffle and split with the notebook's shared helpers instead of
    # repeating their logic here, so every split follows the same rule.
    extra_train, extra_val = split_data(shuffle_data(same_number_lists), train_ratio)

    train_data = torch.cat((train_data, extra_train), dim=0)
    val_data = torch.cat((val_data, extra_val), dim=0)
    return train_data, val_data

def generate_data_2(n_digits=D_VOCAB, sequence_length=2, train_ratio=0.8):
    """Create the Experiment 2 train/validation split.

    Combinations of distinct numbers are split first; swapped pairs are added
    to each side afterwards, so a pair and its permutation never sit in
    different sets. The same-number lists [x, x] are then distributed with the
    same ratio, and both sets are shuffled once more.

    Returns:
        A (train_data, val_data) tuple.
    """
    unordered = shuffle_data(create_data_2(n_digits, sequence_length))
    train_part, val_part = split_data(unordered, train_ratio)
    train_part, val_part = add_permutations_2(train_part, val_part)
    train_part, val_part = add_same_numbers_2(n_digits, train_part, val_part, train_ratio)
    return shuffle_data(train_part), shuffle_data(val_part)

(tensor([[1, 1],
        [1, 0],
        [2, 3],
        [2, 2],
        [2, 1],
        [1, 2],
        [3, 2],
        [1, 3],
        [0, 1],
        [3, 1],
        [3, 3]]), tensor([[0, 2],
        [3, 0],
        [2, 0],
        [0, 3],
        [0, 0]]))


In [52]:
# Display one random (train, validation) split produced for Experiment 2.
generate_data_2()

(tensor([[0, 1],
         [0, 0],
         [2, 3],
         [1, 2],
         [3, 0],
         [0, 3],
         [1, 0],
         [2, 2],
         [2, 1],
         [3, 2],
         [3, 3]]),
 tensor([[1, 1],
         [0, 2],
         [3, 1],
         [2, 0],
         [1, 3]]))

# Experiment 3

We generate training and test data such that no number appears in both the training set and the test set.

In [48]:
def create_data_3(n_digits=D_VOCAB, sequence_length=2):
    """Return the numbers 0..n_digits as a one-dimensional NumPy array.

    `sequence_length` is part of the signature but is not used in this
    function.
    """
    numbers = range(n_digits + 1)
    return np.array(numbers)

def generate_data_3(n_digits=D_VOCAB, sequence_length=2, train_ratio=0.8):
    """Create the Experiment 3 train/validation split.

    The individual numbers are shuffled and split first; pairs are then built
    separately from each side, so no number occurs in both sets. Both sets of
    pairs are shuffled before being returned.

    Returns:
        A (train_data, val_data) tuple.
    """
    numbers = shuffle_data(create_data_3(n_digits, sequence_length))
    train_numbers, val_numbers = split_data(numbers, train_ratio)
    train_pairs, val_pairs = create_data_4(train_numbers, val_numbers)
    return shuffle_data(train_pairs), shuffle_data(val_pairs)

def create_data_4(train_data, val_data):
    """Expand two collections of numbers into tensors of ordered pairs.

    Each side is expanded independently into the Cartesian product of its
    numbers with themselves, so pairs only combine numbers from the same side.

    Args:
        train_data: Iterable of numbers reserved for training.
        val_data: Iterable of numbers reserved for validation.

    Returns:
        A (train_pairs, val_pairs) tuple of tensors with one pair per row.
    """
    return _all_pairs(train_data), _all_pairs(val_data)


def _all_pairs(numbers):
    """Return every ordered pair drawn from `numbers`, one pair per row."""
    pairs = list(itertools.product(numbers, repeat=2))
    if not pairs:
        # torch.tensor([]) would be a 1-D float tensor; keep the two-column
        # integer layout so later column-wise operations still work.
        return torch.empty((0, 2), dtype=torch.long)
    return torch.tensor(pairs)

(tensor([[ 1,  4],
        [ 9,  9],
        [ 4,  7],
        [ 7, 10],
        [ 9,  5],
        [10,  0],
        [ 1,  1],
        [ 0,  7],
        [ 6,  0],
        [10,  9],
        [ 6,  6],
        [ 1,  9],
        [ 0,  5],
        [ 5,  6],
        [ 4,  9],
        [ 7,  4],
        [10,  6],
        [ 0,  1],
        [10,  5],
        [ 5,  4],
        [10,  1],
        [ 9,  6],
        [10,  7],
        [10, 10],
        [ 9,  7],
        [ 4,  6],
        [ 4, 10],
        [ 6,  4],
        [ 6,  7],
        [ 5,  0],
        [ 6,  9],
        [ 9,  4],
        [ 6,  5],
        [ 5,  9],
        [ 0,  6],
        [ 1,  7],
        [ 0,  4],
        [ 0, 10],
        [ 0,  0],
        [ 1,  6],
        [ 7,  7],
        [10,  4],
        [ 6,  1],
        [ 6, 10],
        [ 4,  4],
        [ 0,  9],
        [ 1,  5],
        [ 1, 10],
        [ 9, 10],
        [ 7,  5],
        [ 4,  1],
        [ 4,  5],
        [ 5, 10],
        [ 7,  6],
        [ 7,  9],
        [

In [51]:
# Display one random (train, validation) split produced for Experiment 3.
generate_data_3()

(tensor([[4, 1],
         [0, 2],
         [8, 7],
         [7, 8],
         [2, 0],
         [9, 5],
         [5, 2],
         [9, 7],
         [1, 4],
         [8, 4],
         [7, 9],
         [1, 1],
         [0, 8],
         [7, 7],
         [2, 7],
         [5, 1],
         [4, 7],
         [9, 2],
         [0, 1],
         [5, 0],
         [7, 1],
         [1, 2],
         [8, 8],
         [8, 2],
         [0, 7],
         [5, 7],
         [5, 8],
         [1, 9],
         [9, 9],
         [2, 8],
         [7, 0],
         [2, 2],
         [0, 5],
         [7, 4],
         [8, 0],
         [0, 9],
         [2, 9],
         [9, 4],
         [4, 4],
         [9, 0],
         [0, 4],
         [9, 8],
         [5, 4],
         [1, 8],
         [5, 9],
         [4, 0],
         [4, 9],
         [4, 5],
         [1, 7],
         [8, 9],
         [7, 2],
         [7, 5],
         [2, 4],
         [0, 0],
         [9, 1],
         [1, 5],
         [1, 0],
         [4, 8],
         [2, 1

# Parse to CSV files
The format is:
input, output

Example:
 "START 2 3 STOP [MASK]", "3"

In [96]:
def parse_train_data_to_csv(data, file_name):
    """Write the rows of `data` to a CSV file with `input` and `output` columns.

    For every row, `input` is the text "START <row[0]> <row[1]> STOP [MASK]"
    and `output` is row[2] converted to text. The file has a header line and
    no index column.

    Args:
        data: Torch tensor whose rows provide at least three entries; only
            the first three are used.
        file_name: Destination path handed to DataFrame.to_csv.
    """
    # detach() and cpu() let the NumPy conversion succeed for tensors that
    # track gradients or live on another device; plain CPU tensors are unaffected.
    rows = data.detach().cpu().numpy()

    # Build both columns in a single pass over the rows.
    records = [
        {
            'input': 'START {} {} STOP [MASK]'.format(str(row[0]), str(row[1])),
            'output': str(row[2]),
        }
        for row in rows
    ]

    # Naming the columns explicitly keeps the header even when there are no rows.
    df = pd.DataFrame(records, columns=['input', 'output'])

    # Write the DataFrame to a CSV file
    df.to_csv(file_name, index=False)

# Small hand-written sample (two numbers followed by their maximum) used to
# try out the CSV export.
tensor = torch.tensor(
    [
        [2, 3, 3],
        [3, 6, 6],
        [8, 10, 10],
    ]
)
parse_train_data_to_csv(tensor, "test.csv")