# Data Generation

## Addition Problem
Source: https://github.com/batzner/indrnn/blob/master/examples/addition_rnn.py

Time-step settings (sequence lengths) are taken from: https://arxiv.org/abs/1803.04831

Batch-size settings are taken from: https://arxiv.org/pdf/1511.06464.pdf

In [1]:
import pandas as pd
import os, sys
import numpy as np

from random import randint
from numpy import array

# Batch sizes swept by the generation loop; each value ends up as `bs=` in an
# output file name.
batch_size_arr = [80, 50, 100, 180, 200]
# Sequence lengths (time steps) swept by the generation loop; each value ends
# up as `ts=` in an output file name.
time_steps_arr = [100, 500, 1000, 5000, 10000, 15000]

def generateAddingProblemData(batch_size, time_steps):
    """Create one random data set for the adding problem.

    Every sample is a sequence of ``time_steps`` uniform random values in
    [0, 1) paired with a 0/1 marker sequence of the same length. Exactly two
    positions are marked per sample: one drawn from the first half of the
    sequence and one from the second half. The target of a sample is the sum
    of the two marked values.

    Args:
        batch_size: Number of sequences (rows) to generate.
        time_steps: Length of each sequence.

    Returns:
        A tuple ``(inputs, targets, data)`` where

        * ``inputs`` has shape ``(batch_size, time_steps, 2)``; the last axis
          holds ``(value, marker)``,
        * ``targets`` has shape ``(batch_size,)``,
        * ``data`` has shape ``(batch_size, 2 * time_steps + 1)``: the
          flattened inputs of a sample followed by its target.
    """
    values = np.random.rand(batch_size, time_steps)
    markers = np.zeros((batch_size, time_steps), dtype=int)

    midpoint = time_steps // 2
    for marker_row in markers:
        # Draw the first-half position before the second-half one so that the
        # random stream is consumed in a fixed order for a given seed.
        early_pos = np.random.randint(midpoint)
        late_pos = np.random.randint(midpoint, time_steps)
        # Each row is a view into `markers`, so this writes in place.
        marker_row[early_pos] = 1
        marker_row[late_pos] = 1

    # Pair every value with its marker along a new trailing axis.
    inputs = np.stack((values, markers), axis=-1)
    targets = (values * markers).sum(axis=1)

    # One flat row per sample: value/marker pairs in time order, then target.
    flat_inputs = inputs.reshape(batch_size, 2 * time_steps)
    data = np.hstack((flat_inputs, targets[:, np.newaxis]))
    return inputs, targets, data

# Write one CSV file per (batch size, sequence length) combination.
for bs in batch_size_arr:
    for ts in time_steps_arr:
        # Only the flattened table (third return value) is saved. Note that
        # 2 * bs samples are generated for a nominal batch size of bs.
        addingproblemdata = generateAddingProblemData(bs * 2, ts)[2]
        csv_path = f"../../Datasets/2_addingproblem/addingProblem.bs={bs}.ts={ts}.csv"
        with open(csv_path, 'w') as csvfile:
            # The first line is the integer header row "2,1"; presumably the
            # number of inputs per time step and outputs per sample -- confirm
            # against the code that reads these files.
            np.savetxt(csvfile, np.array([[2, 1]]), fmt='%d', delimiter=",")
            # Data rows follow directly, four decimals per value.
            np.savetxt(csvfile, addingproblemdata, fmt='%.4f', delimiter=",")

## MNIST Problem

Source: https://github.com/batzner/indrnn/blob/8239a819100c40d5662f0d7440bfa7b539366b7f/examples/sequential_mnist.py#L258

Hyperparams: https://arxiv.org/abs/1803.04831 and https://github.com/Sunnydreamrain/IndRNN_Theano_Lasagne

In [40]:
import tensorflow as tf
from sklearn.preprocessing import OneHotEncoder

# Data dimensions: every 28x28 image is described as 28 time steps of 28 inputs.
num_input = 28          # inputs per time step (one image row of 28 pixels)
timesteps = 28          # number of time steps (image rows)
n_classes = 10          # number of classes, one per digit

(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()

# Scale the pixel values by 1/255.
x_train = x_train / 255.0
x_test = x_test / 255.0

# One-hot encode the labels by picking rows of a float32 identity matrix.
# The width is derived from the largest label present in each split.
y_train_oh = np.eye(int(y_train.max()) + 1, dtype=np.float32)[y_train]
y_test_oh = np.eye(int(y_test.max()) + 1, dtype=np.float32)[y_test]

# One row per image: flattened pixels followed by the one-hot label.
trainset = np.hstack((x_train.reshape(len(x_train), -1), y_train_oh))
testset = np.hstack((x_test.reshape(len(x_test), -1), y_test_oh))

# Training rows first, test rows after them, in a single table.
mnist_problemdata = np.concatenate((trainset, testset), axis=0)
display(trainset.shape)
display(testset.shape)
display(mnist_problemdata.shape)

mnist_csv_path = f"../../Datasets/3_mnist/mnist.ni={num_input}.no={n_classes}.ts={timesteps}.train={60000}.test={10000}.csv"
with open(mnist_csv_path, 'w') as csvfile:
    # Header row "num_input,n_classes", then the data rows with four decimals.
    np.savetxt(csvfile, np.array([[num_input, n_classes]]), fmt='%d', delimiter=",")
    np.savetxt(csvfile, mnist_problemdata, fmt='%.4f', delimiter=",")

(60000, 794)

(10000, 794)

(70000, 794)

## Penn Treebank (PTB) Problem

Source: https://catalog.ldc.upenn.edu/LDC95T7

Hyperparams: https://arxiv.org/abs/1803.04831

In [3]:
ptbdataset_path = '../../Datasets/4_ptb/ptbdataset'


def _read_ptb_tokens(file_name):
    """Read one PTB text file from ``ptbdataset_path`` as a list of tokens.

    Every newline is replaced by the marker ``<eos>`` and the text is then
    split on single spaces.
    """
    with open(f'{ptbdataset_path}/{file_name}', 'r') as handle:
        text = handle.read()
    # NOTE(review): splitting on a single space keeps empty strings (the
    # recorded output of this cell starts with ''); confirm that downstream
    # code expects those tokens.
    return text.replace('\n', '<eos>').split(' ')


ptb_train = _read_ptb_tokens('ptb.train.txt')
ptb_valid = _read_ptb_tokens('ptb.valid.txt')
ptb_test = _read_ptb_tokens('ptb.test.txt')

display(ptb_train)


['',
 'aer',
 'banknote',
 'berlitz',
 'calloway',
 'centrust',
 'cluett',
 'fromstein',
 'gitano',
 'guterman',
 'hydro-quebec',
 'ipo',
 'kia',
 'memotec',
 'mlx',
 'nahb',
 'punts',
 'rake',
 'regatta',
 'rubens',
 'sim',
 'snack-food',
 'ssangyong',
 'swapo',
 'wachter',
 '<eos>',
 'pierre',
 '<unk>',
 'N',
 'years',
 'old',
 'will',
 'join',
 'the',
 'board',
 'as',
 'a',
 'nonexecutive',
 'director',
 'nov.',
 'N',
 '<eos>',
 'mr.',
 '<unk>',
 'is',
 'chairman',
 'of',
 '<unk>',
 'n.v.',
 'the',
 'dutch',
 'publishing',
 'group',
 '<eos>',
 'rudolph',
 '<unk>',
 'N',
 'years',
 'old',
 'and',
 'former',
 'chairman',
 'of',
 'consolidated',
 'gold',
 'fields',
 'plc',
 'was',
 'named',
 'a',
 'nonexecutive',
 'director',
 'of',
 'this',
 'british',
 'industrial',
 'conglomerate',
 '<eos>',
 'a',
 'form',
 'of',
 'asbestos',
 'once',
 'used',
 'to',
 'make',
 'kent',
 'cigarette',
 'filters',
 'has',
 'caused',
 'a',
 'high',
 'percentage',
 'of',
 'cancer',
 'deaths',
 'among',
 '