# Process Data

## Utilities

In [1]:
import torch
import numpy as np
from tqdm import tqdm

In [2]:
def tokenize_q(text):
    l = len(text)
    return [char_to_idx[c] for c in text] + [eos_token] + [empty_token] * (max_q_len - l - 1)

def tokenize_a(text):
    l = len(text)
    token_a = [char_to_idx[c] for c in text]
    token_a = [start_token] + token_a + [eos_token] + [empty_token] * (max_a_len - l - 1)
    token_target = token_a[:-1]
    token_label = token_a[1:]
    return token_target, token_label

def invert_tokenization(idx):
    return [idx_to_char[i] for i in idx]

In [3]:
def load_qa(filename):
    with open(f'{data_path}/{tr_l}/{task}.txt') as f:
        text = f.read().splitlines()
        quess = text[::2]
        anss = text[1::2]

    return quess, anss

def tokenize_qa(quess, anss):
    tokenized_source = [tokenize_q(text) for text in tqdm(quess)]

    tokenized_target, tokenized_label = [], []
    for text in tqdm(anss):
        tt, tl = tokenize_a(text)
        tokenized_target.append(tt)
        tokenized_label.append(tl)

    tokenized_source = torch.tensor(tokenized_source)
    tokenized_target = torch.tensor(tokenized_target)
    tokenized_label = torch.tensor(tokenized_label)

    return tokenized_source, tokenized_target, tokenized_label

In [4]:
def create_ds(fname):
    quess, anss = load_qa(fname)

    tokenized_source, tokenized_target, tokenized_label = tokenize_qa(quess, anss)

    return torch.utils.data.TensorDataset(tokenized_source, tokenized_target, tokenized_label)

In [5]:
data_path = '../../data/math/mathematics_dataset-v1.0'
out_path = 'tokenized_data'

In [6]:
with open('text_vectorizer/vocabulary.txt') as f:
    vocab = f.read().splitlines()

idx_to_char = {i: c for i, c in enumerate(vocab)}
char_to_idx = {c: i for i, c in enumerate(vocab)}

empty_token = char_to_idx['']
eos_token = char_to_idx[';']
start_token = char_to_idx['@']

max_q_len, max_a_len = 161, 31

## Process Task data

In [7]:
task = 'polynomials__expand'

In [8]:
train_levels = ['train-easy', 'train-medium', 'train-hard']

### Train ds

In [9]:
tss, tts, tls = [], [], []

for tr_l in tqdm(train_levels):
    fname = f'{data_path}/{tr_l}/{task}.txt'

    quess, anss = load_qa(fname)

    tokenized_source, tokenized_target, tokenized_label = tokenize_qa(quess, anss)
    tss.append(tokenized_source)
    tts.append(tokenized_target)
    tls.append(tokenized_label)

    del quess, anss

100%|██████████| 666666/666666 [00:04<00:00, 154544.75it/s]
100%|██████████| 666666/666666 [00:04<00:00, 160722.16it/s]
100%|██████████| 666666/666666 [00:03<00:00, 169745.24it/s]
100%|██████████| 666666/666666 [00:03<00:00, 191003.56it/s]
100%|██████████| 666666/666666 [00:04<00:00, 159092.56it/s]
100%|██████████| 666666/666666 [00:03<00:00, 191631.62it/s]
100%|██████████| 3/3 [01:05<00:00, 21.85s/it]


In [10]:
ts = torch.concat(tss)
tt = torch.concat(tts)
tl = torch.concat(tls)

del tss, tts, tls

In [11]:
train_ds = torch.utils.data.TensorDataset(ts, tt, tl)

In [12]:
torch.save(train_ds, f'{out_path}/{task}_train.pt')

### Test ds

In [13]:
interpolate_ds = create_ds(f'{data_path}/interpolate/{task}.txt')
torch.save(interpolate_ds, f'{out_path}/{task}_interpolate.pt')
interpolate_ds = create_ds(f'{data_path}/extrapolate/{task}.txt')
torch.save(interpolate_ds, f'{out_path}/{task}_extrapolate.pt')

100%|██████████| 666666/666666 [00:04<00:00, 141784.27it/s]
100%|██████████| 666666/666666 [00:03<00:00, 183483.86it/s]
100%|██████████| 666666/666666 [00:04<00:00, 151822.61it/s]
100%|██████████| 666666/666666 [00:03<00:00, 190589.63it/s]
