# Data Generation

## Addition Problem
Source: https://github.com/batzner/indrnn/blob/master/examples/addition_rnn.py

Timesteps params: https://arxiv.org/abs/1803.04831

BatchSize params: https://arxiv.org/pdf/1511.06464.pdf

In [1]:
import pandas as pd
import os, sys
import numpy as np

from random import randint
from numpy import array

# Parameter grid for the adding problem: the generation loop further down
# writes one CSV file for every (batch size, time steps) combination.
batch_size_arr = [80, 50, 100, 180, 200]
# Sequence lengths (time steps per sample) to generate.
time_steps_arr = [100, 500, 1000, 5000, 10000, 15000]

def generateAddingProblemData(batch_size, time_steps):
    """Create one random batch for the adding problem.

    Each sample pairs a sequence of ``time_steps`` random values (drawn with
    ``np.random.rand``) with a 0/1 marker sequence. Two markers are set per
    sample: one at a random position in the first half of the sequence and
    one in the second half. The target is the sum of the marked values.

    Args:
        batch_size: Number of samples to generate.
        time_steps: Length of every sequence.

    Returns:
        Tuple ``(inputs, targets, data)``:
        ``inputs`` has shape ``(batch_size, time_steps, 2)`` with the value
        in channel 0 and the marker in channel 1; ``targets`` has shape
        ``(batch_size,)``; ``data`` has shape
        ``(batch_size, 2 * time_steps + 1)`` and holds each flattened input
        row followed by its target in the last column.
    """
    values = np.random.rand(batch_size, time_steps)
    markers = np.zeros_like(values, dtype=int)

    midpoint = int(time_steps / 2)
    for row in range(batch_size):
        # Draw the first-half position before the second-half one so that a
        # seeded generator yields the same batch as before.
        early = np.random.randint(midpoint)
        late = np.random.randint(midpoint, time_steps)
        markers[row, early] = 1
        markers[row, late] = 1

    # Pair every value with its marker along a third axis.
    inputs = np.dstack((values, markers))
    targets = (values * markers).sum(axis=1)

    flat_inputs = inputs.reshape(batch_size, 2 * time_steps)
    data = np.column_stack((flat_inputs, targets))
    return inputs, targets, data

# Write one CSV per (batch size, time steps) combination. Every file starts
# with a "2,1" header row (presumably the per-step input and output widths —
# TODO confirm against the CSV consumer) followed by one sample per row: the
# flattened (value, marker) pairs and, in the last column, the target.
for bs in batch_size_arr:
    for ts in time_steps_arr:
        # Twice the nominal batch size is generated for each file.
        _, _, addingproblemdata = generateAddingProblemData(bs * 2, ts)
        csv_path = f"../../Datasets/2_addingproblem/addingProblem.bs={bs}.ts={ts}.csv"
        # Header and data are written through a single handle instead of
        # reopening the same file in append mode; the file contents are the same.
        with open(csv_path, 'w') as csvfile:
            np.savetxt(csvfile, np.array([[2, 1]]), fmt='%d', delimiter=",")
            np.savetxt(csvfile, addingproblemdata, fmt='%.4f', delimiter=",")

## MNIST Problem

Source: https://github.com/batzner/indrnn/blob/8239a819100c40d5662f0d7440bfa7b539366b7f/examples/sequential_mnist.py#L258

Hyperparams: https://arxiv.org/abs/1803.04831 and https://github.com/Sunnydreamrain/IndRNN_Theano_Lasagne

In [40]:
import tensorflow as tf
from sklearn.preprocessing import OneHotEncoder

# Data Dimension
num_input = 28          # MNIST data input (image shape: 28x28)
timesteps = 28          # Timesteps
n_classes = 10          # Number of classes, one class per digit

(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
# Divide the pixel values by 255.0 (this also converts the images to float).
x_train, x_test = x_train / 255.0, x_test / 255.0

# One-hot encode the labels. The width is fixed to n_classes rather than
# derived from the largest label present, so the train and test encodings
# always have the same number of columns.
y_train_oh = np.zeros((y_train.shape[0], n_classes), dtype=np.float32)
y_train_oh[np.arange(y_train.shape[0]), y_train] = 1
y_test_oh = np.zeros((y_test.shape[0], n_classes), dtype=np.float32)
y_test_oh[np.arange(y_test.shape[0]), y_test] = 1

# One row per image: flattened pixels followed by the one-hot label.
trainset = np.column_stack((x_train.reshape(x_train.shape[0], -1), y_train_oh))
testset = np.column_stack((x_test.reshape(x_test.shape[0], -1), y_test_oh))
# Training rows first, test rows appended after them.
mnist_problemdata = np.vstack((trainset, testset))
display(trainset.shape)
display(testset.shape)
display(mnist_problemdata.shape)

# The row counts in the file name come from the arrays themselves so the name
# cannot drift from the content (60000 / 10000 for the standard MNIST split).
mnist_csv_path = (
    f"../../Datasets/3_mnist/mnist.ni={num_input}.no={n_classes}.ts={timesteps}"
    f".train={trainset.shape[0]}.test={testset.shape[0]}.csv"
)
# Header row (num_input, n_classes) and data are written through one handle
# instead of reopening the file in append mode.
with open(mnist_csv_path, 'w') as csvfile:
    np.savetxt(csvfile, np.array([[num_input, n_classes]]), fmt='%d', delimiter=",")
    np.savetxt(csvfile, mnist_problemdata, fmt='%.4f', delimiter=",")

(60000, 794)

(10000, 794)

(70000, 794)

## Penn Treebank (PTB) Problem

Source: 
* https://catalog.ldc.upenn.edu/LDC95T7 
* https://github.com/Sunnydreamrain/IndRNN_pytorch/tree/master/PTB 
* https://gist.github.com/tmatha/905ae0c0d304119851d7432e5b359330

Hyperparams: https://arxiv.org/abs/1803.04831

In [60]:
import numpy as np
import collections
import tensorflow as tf

# Folding parameters passed to features_labels() below; seq_len is also used
# for the tf.data batching and in the output file names.
batch_size=20
seq_len=20
# Training hyperparameters. They are not used by the data-generation code in
# this section; presumably a training step elsewhere consumes them — TODO confirm.
clip_norm=5
learning_rate=1.
decay=0.5
epochs=13
epochs_no_decay=4

# Directory that must contain ptb.train.txt, ptb.valid.txt and ptb.test.txt.
ptbdataset_path = '../../Datasets/4_ptb/ptbdataset'

#MIT License - Copyright (c) 2018 tmatha
def features_labels(data_array,batch_size,seq_len,batch_first=True):
    """Fold a 1-d id stream into next-element (features, labels) arrays.

    The stream is truncated to ``batch_size * seq_len * steps`` elements,
    where ``steps`` is the largest count that still leaves one trailing
    element for the labels. Labels are the same stream shifted by one.

    Args:
        data_array: 1-d array of ids.
        batch_size: Number of contiguous rows the stream is cut into.
        seq_len: Number of elements per row and step.
        batch_first: If True, the ``steps`` blocks of shape
            ``(batch_size, seq_len)`` are stacked along axis 0, giving
            ``(steps * batch_size, seq_len)``. If False, the folded array is
            transposed instead, giving ``(seq_len * steps, batch_size)``.

    Returns:
        ``(Data(features, labels), steps, data_np)`` where ``data_np`` is
        features and labels concatenated along axis 1.

    Raises:
        ValueError: If ``data_array`` is not 1-dimensional.
    """
    if len(data_array.shape) != 1:
        raise ValueError(
            'Expected 1-d data array, instead data array shape is {} '.format(
                data_array.shape))

    steps = (data_array.shape[0] - 1) // (batch_size * seq_len)
    row_width = seq_len * steps
    used = batch_size * row_width

    def fold(stream):
        # One contiguous slice of the stream per row.
        rows = np.reshape(stream, (batch_size, row_width), order='C')
        if not batch_first:
            return rows.T
        blocks = np.split(rows, steps, axis=1)
        return np.concatenate(blocks, axis=0)

    features = fold(data_array[:used])
    # Same window shifted by one element: each label is the next id.
    labels = fold(data_array[1:used + 1])

    Data = collections.namedtuple('Data', ['features', 'labels'])
    data_np = np.concatenate((features, labels), axis=1)
    return Data(features=features, labels=labels), steps, data_np

# Read the three PTB splits. Newlines are replaced by an explicit <eos> token
# and the text is split on single spaces.
with open(f'{ptbdataset_path}/ptb.train.txt','r') as f1,open(f'{ptbdataset_path}/ptb.valid.txt','r') as f2,open(
    f'{ptbdataset_path}/ptb.test.txt','r') as f3:
    seq_train=f1.read().replace('\n','<eos>').split(' ')
    seq_valid=f2.read().replace('\n','<eos>').split(' ')
    seq_test=f3.read().replace('\n','<eos>').split(' ')

# Drop the empty strings produced by consecutive spaces.
seq_train=list(filter(None,seq_train))
seq_valid=list(filter(None,seq_valid))
seq_test=list(filter(None,seq_test))

vocab_train=set(seq_train)
vocab_valid=set(seq_valid)
vocab_test=set(seq_test)

# Every validation/test word must be known from training, because the id
# mapping below is built from the training vocabulary only.
assert vocab_valid.issubset(vocab_train)
assert vocab_test.issubset(vocab_train)
print('vocab_train {}, vocab_valid {}, vocab_test {}'.format(
    len(vocab_train),len(vocab_valid),len(vocab_test)))

vocab_train=sorted(vocab_train)#must have deterministic ordering, so word2id dictionary is reproducible across invocations

word2id={w:i for i,w in enumerate(vocab_train)}
id2word={i:w for i,w in enumerate(vocab_train)}

# np.array(..., copy=False) raises ValueError on NumPy >= 2.0 when a copy is
# unavoidable, which is always the case for a Python list, so the flag is dropped.
ids_train=np.array([word2id[word] for word in seq_train],order='C')
ids_valid=np.array([word2id[word] for word in seq_valid],order='C')
ids_test=np.array([word2id[word] for word in seq_test],order='C')

data_train, steps_train, trainset_np = features_labels(ids_train, batch_size, seq_len, batch_first=False)
data_valid, steps_valid, valset_np   = features_labels(ids_valid, batch_size, seq_len, batch_first=False)
data_test,  steps_test, testset_np   = features_labels(ids_test,  batch_size, seq_len, batch_first=False)

# Scale ids by the size of the vocabulary they index. All three splits use
# ids from word2id (the training vocabulary), so they must share one divisor.
# Dividing the validation/test ids by their own, smaller vocabulary sizes put
# the splits on different scales and produced values above 1.
vocab_size = len(vocab_train)
trainset_np = trainset_np / vocab_size
valset_np   = valset_np / vocab_size
testset_np  = testset_np / vocab_size

dataset_train = tf.data.Dataset.from_tensor_slices(data_train).batch(seq_len, drop_remainder=True)
dataset_valid = tf.data.Dataset.from_tensor_slices(data_valid).batch(seq_len, drop_remainder=True)
dataset_test  = tf.data.Dataset.from_tensor_slices(data_test).batch(seq_len,  drop_remainder=True)

# Write one CSV per split: a "seq_len,seq_len" header row followed by the
# scaled feature/label rows. Header and data share one file handle instead of
# reopening the file in append mode.
# NOTE(review): with batch_first=False each row holds batch_size feature
# columns and batch_size label columns; the header matches only because
# batch_size == seq_len here — confirm what the consumer expects.
for split_suffix, split_steps, split_rows in (
        ('train', steps_train, trainset_np),
        ('val', steps_valid, valset_np),
        ('test', steps_test, testset_np)):
    split_path = f"../../Datasets/4_ptb/ptb.ni={seq_len}.no={seq_len}.ts={1}.bs={split_steps}.{split_suffix}.csv"
    with open(split_path, 'w') as csvfile:
        np.savetxt(csvfile, np.array([[seq_len, seq_len]]), fmt='%d', delimiter=",")
        np.savetxt(csvfile, split_rows, fmt='%.4f', delimiter=",")

vocab_train 10000, vocab_valid 6022, vocab_test 6049


## Skeleton based Action Recognition (NTU RGB+D) Problem

Source: 
* https://github.com/zibeu/Independently-Recurrent-Neural-Network---IndRNN 
* https://github.com/Sunnydreamrain/IndRNN_pytorch/
* https://gist.github.com/tmatha/905ae0c0d304119851d7432e5b359330

Hyperparams: https://arxiv.org/abs/1803.04831

Data Information: https://rose1.ntu.edu.sg/dataset/actionRecognition/

NTU RGB+D is a large-scale dataset for RGB-D human action recognition. It contains 56,880 samples of 60 action classes collected from 40 subjects. The actions can be broadly divided into three categories: 40 daily actions (e.g., drinking, eating, reading), nine health-related actions (e.g., sneezing, staggering, falling down), and 11 mutual actions (e.g., punching, kicking, hugging). These actions take place under 17 different scene conditions corresponding to 17 video sequences (i.e., S001–S017). The actions were captured using three cameras with different horizontal imaging viewpoints, namely −45°, 0°, and +45°. Multi-modality information is provided for action characterization, including depth maps, 3D skeleton joint positions, RGB frames, and infrared sequences. Performance is evaluated with a cross-subject test, which splits the 40 subjects into training and test groups, and a cross-view test, which uses one camera (+45°) for testing and the other two cameras for training.

In [None]:
import glob
import os
import numpy as np
import csv

SKELETON_DIR = '../../Datasets/5_nturgb+d/nturgb+d_skeletons'
# NOTE(review): "nturgp+d_csv" may be a typo for "nturgb+d_csv"; it is left
# unchanged because it has to match whatever directory downstream code reads.
NPY_DIR = '../../Datasets/5_nturgb+d/nturgp+d_csv/nturgb_npy'
TRAIN_DS = '_train.csv'
TEST_DS = '_test.csv'

skeleton_files_mask = os.path.join(SKELETON_DIR, '*.skeleton')
# glob returns matches in arbitrary, filesystem-dependent order. Sorting makes
# the positional train/test split below reproducible across machines and runs.
# NOTE(review): this is a split by file-name order, not the cross-subject or
# cross-view protocol — confirm that this is intended.
skeleton_files = sorted(glob.glob(skeleton_files_mask))


max_frame_count = 300
# Every frame is zero-padded to this many joint rows (the per-body joint count
# read from the files is presumably 25, so this fits two bodies — TODO confirm).
max_joints = 50
# Number of leading samples that go into the training index file.
TRAIN_SAMPLE_COUNT = 40320

full_ds = []

# np.save does not create missing directories.
os.makedirs(NPY_DIR, exist_ok=True)

for idx, file_name in enumerate(skeleton_files):
    if idx%100 == 0:
        print(idx)  # progress indicator
    basename = os.path.basename(file_name)
    name = os.path.splitext(basename)[0]
    # The label is the text after the first 'A' in the file name.
    label = name.split('A')[1]
    with open(file_name) as f:
        framecount = int(f.readline())

        sequence_frames = []

        for frame in range(framecount):
            body_count = int(f.readline())
            if body_count <= 0 or body_count>2:
                # Stop reading this file at the first frame with no body or
                # more than two bodies; frames collected so far are kept.
                print('continue, no body')
                break
            joints_xyz = []
            for body in range(body_count):
                f.readline()  # per-body info line, not used
                joint_counts = int(f.readline())
                for joint in range(joint_counts):
                    # Only the first three fields (x, y, z) of a joint line are kept.
                    x, y, z = f.readline().split()[:3]
                    joints_xyz.append([float(x), float(y), float(z)])
            # Zero-pad the joint rows so every frame has shape (max_joints, 3).
            pad_joints = max_joints - len(joints_xyz)
            frame_xyz = np.pad(np.array(joints_xyz), ((0, pad_joints), (0, 0)), mode='constant')
            sequence_frames.append(frame_xyz)

    # Files that yielded no usable frame are skipped entirely.
    if sequence_frames:
        npy_path = os.path.join(NPY_DIR, name + '.npy')
        # Index entry: array file name and zero-based class id.
        full_ds.append([name + '.npy', int(label) - 1])
        np.save(npy_path, np.array(sequence_frames))

train_ds = full_ds[:TRAIN_SAMPLE_COUNT]
test_ds = full_ds[TRAIN_SAMPLE_COUNT:]

# newline='' lets the csv writer control line endings itself, as the csv
# module documentation requires for files passed to csv.writer.
with open(os.path.join(NPY_DIR, TRAIN_DS), 'w', newline='') as train_ds_file:
    writer = csv.writer(train_ds_file, lineterminator='\n')
    writer.writerows(train_ds)

with open(os.path.join(NPY_DIR, TEST_DS), 'w', newline='') as test_ds_file:
    writer = csv.writer(test_ds_file, lineterminator='\n')
    writer.writerows(test_ds)