<div class="alert" style="background-color:#29C5F6; color:white; padding:0px 10px; border-radius:5px;">
    <h1 style='margin:15px 15px; color:#000000; font-size:32px'><b>Data Generation (Processing)</b></h1>
        <h2 style='margin:15px 15px; color:#000000; font-size:24px'>Addition, MNIST, PTB, and NTU RGB+D Problem</h2>
</div>

This work is part of the **Master's thesis** by **Chau Tran**, supervised by **Prof. Roland Olsson**.

<div class="alert" style="background-color:#29C5F6; border-radius:5px; padding:0px 10px; "><h3 style='margin:15px 15px'>2. Addition Problem</h3></div>

Source: https://github.com/batzner/indrnn/blob/master/examples/addition_rnn.py

Timesteps params: https://arxiv.org/abs/1803.04831

BatchSize params: https://arxiv.org/pdf/1511.06464.pdf

In [1]:
import pandas as pd
import os, sys
import numpy as np

from random import randint
from numpy import array

# Sweep grid for the adding problem: the generation loop below writes one CSV
# per (batch size, time steps) combination taken from these two lists.
batch_size_arr = [80, 50, 100, 180, 200]
time_steps_arr = [100, 500, 1000, 5000, 10000, 15000]

def generateAddingProblemData(batch_size, time_steps):
    """Create one random batch for the adding problem.

    Every sample pairs a sequence of uniform random values in [0, 1) with a
    0/1 marker sequence that flags exactly two positions, one drawn from each
    half of the sequence. The target is the sum of the two flagged values.

    Args:
        batch_size: number of samples (rows) to generate.
        time_steps: length of every sequence.

    Returns:
        inputs:  array of shape (batch_size, time_steps, 2) holding
                 (value, marker) pairs.
        targets: array of shape (batch_size,) with the flagged-value sums.
        data:    array of shape (batch_size, time_steps * 2 + 1); the
                 flattened inputs followed by the target in the last column.
    """
    values = np.random.rand(batch_size, time_steps)
    markers = np.zeros_like(values, dtype=int)
    midpoint = int(time_steps / 2)

    # Draw the first-half position before the second-half one for every row,
    # so the global random stream is consumed in the same order as before.
    for row in range(batch_size):
        pos_first = np.random.randint(midpoint)
        pos_second = np.random.randint(midpoint, time_steps)
        markers[row, pos_first] = 1
        markers[row, pos_second] = 1

    # Pair values and markers along a new trailing axis.
    inputs = np.stack((values, markers), axis=-1)
    targets = (values * markers).sum(axis=1)

    flat_inputs = inputs.reshape(batch_size, time_steps * 2)
    data = np.hstack((flat_inputs, targets.reshape(-1, 1)))
    return inputs, targets, data

# Write one CSV per (batch size, time steps) combination. The first line of
# each file is the integer header "2,1"; every following line is one flattened
# sample (value/marker pairs) followed by its target, with 4 decimals.
# NOTE(review): the header presumably encodes (inputs per step, outputs) —
# confirm against the consumer of these files.
for bs in batch_size_arr:
    for ts in time_steps_arr:
        # Twice the nominal batch size is generated for every file.
        _, _, addingproblemdata = generateAddingProblemData(bs * 2, ts)
        csv_path = f"../../Datasets/2_addingproblem/addingProblem.bs={bs}.ts={ts}.csv"
        # A single handle writes header and data; there is no need to reopen
        # the same path in append mode.
        with open(csv_path, 'w') as csvfile:
            np.savetxt(csvfile, np.array([[2, 1]]), fmt='%d', delimiter=",")
            np.savetxt(csvfile, addingproblemdata, fmt='%.4f', delimiter=",")

<div class="alert" style="background-color:#29C5F6; border-radius:5px; padding:0px 10px; "><h3 style='margin:15px 15px'>3. MNIST Problem</h3></div>

Source: https://github.com/batzner/indrnn/blob/8239a819100c40d5662f0d7440bfa7b539366b7f/examples/sequential_mnist.py#L258

Hyperparams: https://arxiv.org/abs/1803.04831 and https://github.com/Sunnydreamrain/IndRNN_Theano_Lasagne

In [None]:
import tensorflow as tf
from sklearn.preprocessing import OneHotEncoder

# Data Dimension
num_input = 28          # MNIST data input (image shape: 28x28)
timesteps = 28          # Timesteps
n_classes = 10          # Number of classes, one class per digit

(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
# Scale pixel intensities from [0, 255] to [0, 1].
x_train, x_test = x_train / 255.0, x_test / 255.0

# One-hot encode against the fixed class count rather than each split's own
# maximum label, so train and test are guaranteed to have the same width.
one_hot_lookup = np.eye(n_classes, dtype=np.float32)
y_train_oh = one_hot_lookup[y_train]
y_test_oh = one_hot_lookup[y_test]

# Each row: flattened image pixels followed by the one-hot label.
trainset = np.column_stack((x_train.reshape(x_train.shape[0], -1), y_train_oh))
testset = np.column_stack((x_test.reshape(x_test.shape[0], -1), y_test_oh))
mnist_problemdata = np.vstack((trainset, testset))
display(trainset.shape)
display(testset.shape)
display(mnist_problemdata.shape)

# The train/test counts in the file name are taken from the arrays actually
# written instead of being hard-coded.
mnist_csv_path = (
    f"../../Datasets/3_mnist/mnist.ni={num_input}.no={n_classes}.ts={timesteps}"
    f".train={trainset.shape[0]}.test={testset.shape[0]}.csv"
)
# Header line (inputs per step, classes) followed by all rows, written through
# a single file handle.
with open(mnist_csv_path, 'w') as csvfile:
    np.savetxt(csvfile, np.array([[num_input, n_classes]]), fmt='%d', delimiter=",")
    np.savetxt(csvfile, mnist_problemdata, fmt='%.4f', delimiter=",")

<div class="alert" style="background-color:#29C5F6; border-radius:5px; padding:0px 10px; "><h3 style='margin:15px 15px'>4. Penn Treebank (PTB) Problem</h3></div>

Source: 
* https://catalog.ldc.upenn.edu/LDC95T7 
* https://github.com/Sunnydreamrain/IndRNN_pytorch/tree/master/PTB 
* https://gist.github.com/tmatha/905ae0c0d304119851d7432e5b359330

Hyperparams: https://arxiv.org/abs/1803.04831

In [None]:
import numpy as np
import collections
import tensorflow as tf

# batch_size and seq_len are passed to features_labels() below to fold the
# token-id streams; seq_len is also used in the CSV headers and file names.
batch_size=20
seq_len=20
# NOTE(review): the following five values are not referenced anywhere in the
# visible part of this notebook — presumably training hyperparameters kept
# from the reference implementation; confirm before relying on them.
clip_norm=5
learning_rate=1.
decay=0.5
epochs=13
epochs_no_decay=4

# Directory expected to contain ptb.train.txt, ptb.valid.txt and ptb.test.txt.
ptbdataset_path = '../../Datasets/4_ptb/ptbdataset'

#MIT License - Copyright (c) 2018 tmatha
def features_labels(data_array,batch_size,seq_len,batch_first=True):
    """Fold a 1-d id stream into aligned (features, labels) matrices.

    Labels are the same stream shifted one position ahead, so each label is
    the element that follows the corresponding feature. Only the largest
    prefix that fills whole (batch_size * seq_len) blocks is used.

    Args:
        data_array: 1-d numpy array of ids.
        batch_size: number of parallel streams the data is cut into.
        seq_len: number of consecutive elements per step.
        batch_first: if True, return arrays of shape
            (batch_size * steps, seq_len); otherwise return the transposed
            fold of shape (seq_len * steps, batch_size).

    Returns:
        A tuple (Data(features, labels), steps, data_np) where data_np is
        features and labels concatenated column-wise.

    Raises:
        ValueError: if data_array is not one-dimensional.
    """
    if data_array.ndim != 1:
        raise ValueError('Expected 1-d data array, '
                     'instead data array shape is {} '.format(data_array.shape))

    # One element is held back so the shifted label stream has the same length.
    steps = (data_array.shape[0] - 1) // (batch_size * seq_len)
    n_used = batch_size * seq_len * steps

    def _arrange(chunk):
        rows = np.reshape(chunk, (batch_size, seq_len * steps), order='C')
        if not batch_first:
            return rows.T
        # Cut every stream into `steps` pieces and stack the pieces step by step.
        return np.vstack(np.split(rows, steps, axis=1))

    features = _arrange(data_array[:n_used])
    labels = _arrange(data_array[1:n_used + 1])

    Data = collections.namedtuple('Data', ['features', 'labels'])
    data_np = np.hstack((features, labels))
    return Data(features=features, labels=labels), steps, data_np

# Read the three PTB splits; every newline becomes an explicit <eos> token.
with open(f'{ptbdataset_path}/ptb.train.txt','r') as f1,open(f'{ptbdataset_path}/ptb.valid.txt','r') as f2,open(
    f'{ptbdataset_path}/ptb.test.txt','r') as f3:
    seq_train=f1.read().replace('\n','<eos>').split(' ')
    seq_valid=f2.read().replace('\n','<eos>').split(' ')
    seq_test=f3.read().replace('\n','<eos>').split(' ')

# Drop the empty strings produced by consecutive spaces.
seq_train=list(filter(None,seq_train))
seq_valid=list(filter(None,seq_valid))
seq_test=list(filter(None,seq_test))

vocab_train=set(seq_train)
vocab_valid=set(seq_valid)
vocab_test=set(seq_test)

# The id mapping is built from the training vocabulary only, so every
# validation/test word must already be present in it.
assert vocab_valid.issubset(vocab_train)
assert vocab_test.issubset(vocab_train)
print('vocab_train {}, vocab_valid {}, vocab_test {}'.format(
    len(vocab_train),len(vocab_valid),len(vocab_test)))

vocab_train=sorted(vocab_train)#must have deterministic ordering, so word2id dictionary is reproducible across invocations

word2id={w:i for i,w in enumerate(vocab_train)} #id2word={i:w for i,w in enumerate(vocab_train)}

# np.array(list, copy=False) raises on NumPy >= 2 because converting a list
# always needs a copy; a plain conversion gives the same array on all versions.
ids_train=np.array([word2id[word] for word in seq_train],order='C')
ids_valid=np.array([word2id[word] for word in seq_valid],order='C')
ids_test=np.array([word2id[word] for word in seq_test],order='C')

data_train, steps_train, trainset_np = features_labels(ids_train, batch_size, seq_len, batch_first=False)
data_valid, steps_valid, valset_np   = features_labels(ids_valid, batch_size, seq_len, batch_first=False)
data_test,  steps_test, testset_np   = features_labels(ids_test,  batch_size, seq_len, batch_first=False)

# All ids come from word2id (the training vocabulary), so all three splits are
# scaled by the same vocabulary size. Dividing the validation/test ids by
# their own, smaller vocabulary sizes would map the same word to different
# values per split and push values above 1.
vocab_size = len(vocab_train)
trainset_np = trainset_np / vocab_size
valset_np   = valset_np / vocab_size
testset_np  = testset_np / vocab_size

dataset_train = tf.data.Dataset.from_tensor_slices(data_train).batch(seq_len, drop_remainder=True)
dataset_valid = tf.data.Dataset.from_tensor_slices(data_valid).batch(seq_len, drop_remainder=True)
dataset_test  = tf.data.Dataset.from_tensor_slices(data_test).batch(seq_len,  drop_remainder=True)

# One CSV per split: "<seq_len>,<seq_len>" header line followed by the
# normalised rows, written through a single file handle.
for split_name, split_steps, split_np in (
        ('train', steps_train, trainset_np),
        ('val', steps_valid, valset_np),
        ('test', steps_test, testset_np)):
    csv_path = f"../../Datasets/4_ptb/ptb.ni={seq_len}.no={seq_len}.ts={1}.bs={split_steps}.{split_name}.csv"
    with open(csv_path,'w') as csvfile:
        np.savetxt(csvfile, np.array([[seq_len, seq_len]]),fmt='%d', delimiter=",")
        np.savetxt(csvfile, split_np, fmt='%.4f', delimiter=",")

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
import nltk
from nltk.tokenize import word_tokenize
import numpy as np
import re
from tensorflow.keras.utils import to_categorical
from doc3 import training_doc3
# nltk.download('punkt')
# Keras pieces needed by the model and the prediction step, imported from the
# same tensorflow.keras namespace as Tokenizer/to_categorical in this cell.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.preprocessing.sequence import pad_sequences

display(training_doc3)

# Normalise the text: collapse every run of non-word characters into a single
# space, lower-case, then tokenise.
cleaned = re.sub(r'\W+', ' ', training_doc3).lower()
tokens = word_tokenize(cleaned)

# Sliding windows of train_len tokens: the first train_len - 1 tokens are the
# model input and the last one is the prediction target.
train_len = 4
text_sequences = [tokens[i-train_len:i] for i in range(train_len, len(tokens))]
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences)
sequences = tokenizer.texts_to_sequences(text_sequences)
vocabulary_size = len(tokenizer.word_counts)+1 #increased by 1 for the cause of padding
n_sequences = np.array(sequences, dtype='int32')

train_inputs, train_targets = n_sequences[:,:-1], n_sequences[:,-1]
train_targets = to_categorical(train_targets, num_classes=vocabulary_size)
seq_len = train_inputs.shape[1]

# The prediction loop below needs a trained `model`. The definition used to be
# commented out, which made this cell fail with a NameError on a fresh kernel,
# so it is restored here.
model = Sequential()
model.add(Embedding(vocabulary_size, seq_len))
model.add(LSTM(50,return_sequences=True))
model.add(LSTM(50))
model.add(Dense(50,activation='relu'))
model.add(Dense(vocabulary_size, activation='softmax'))
# compiling the network
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(train_inputs,train_targets,epochs=500,verbose=0)

# Interactive check: read a phrase and print the three most likely next words.
input_text = input().strip().lower()
encoded_text = tokenizer.texts_to_sequences([input_text])[0]
pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')
print(encoded_text, pad_encoded)
for i in (model.predict(pad_encoded)[0]).argsort()[-3:][::-1]:
    # Index 0 is reserved for padding and has no entry in index_word.
    pred_word = tokenizer.index_word.get(i)
    if pred_word is None:
        continue
    print("Next word suggestion:",pred_word)

In [None]:
import numpy as np
import collections
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
import nltk
from nltk.tokenize import word_tokenize
import numpy as np
import re
from tensorflow.keras.utils import to_categorical

ptbdataset_path = '../../Datasets/4_ptb/ptbdataset'
# Newlines are replaced by a space (not removed) so the last word of one line
# can never be glued to the first word of the next.
with open(f'{ptbdataset_path}/ptb.train.txt','r') as f1,open(f'{ptbdataset_path}/ptb.valid.txt','r') as f2,open(
    f'{ptbdataset_path}/ptb.test.txt','r') as f3:
    seq_train=f1.read().replace('\n',' ')
    seq_valid=f2.read().replace('\n',' ')
    seq_test=f3.read().replace('\n',' ')

# 1.Pre-processing data
# Collapse runs of non-word characters into ONE SPACE. Replacing them with an
# empty string would delete the whitespace too and fuse the corpus into a
# single token.
seq_train_cleaned = re.sub(r'\W+', ' ', seq_train).lower()
# Drop words of one to three characters.
seq_train_cleaned = re.sub(r'\W*\b\w{1,3}\b', '', seq_train_cleaned)
seq_train_tokens = word_tokenize(seq_train_cleaned)
# Preview only; the full token list is far too long to display.
display(seq_train_tokens[:20])

# Sliding windows of train_len tokens over this cell's own token list (not
# over `tokens` left behind by another cell).
train_len = 4
text_seq_train = [seq_train_tokens[i-train_len:i] for i in range(train_len, len(seq_train_tokens))]
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_seq_train)
seq_train = tokenizer.texts_to_sequences(text_seq_train)
# word_counts lives on the fitted tokenizer, not on the token list.
vocabulary_size = len(tokenizer.word_counts)+1 #increased by 1 for the cause of padding
# Build the array from this cell's sequences rather than the leaked `sequences`.
n_sequences = np.array(seq_train, dtype='int32')

<div class="alert" style="background-color:#29C5F6; border-radius:5px; padding:0px 10px; "><h3 style='margin:15px 15px'>5. Skeleton based Action Recognition (NTU RGB+D) Problem</h3></div>

Source: 
* https://github.com/zibeu/Independently-Recurrent-Neural-Network---IndRNN 
* https://github.com/Sunnydreamrain/IndRNN_pytorch/
* https://gist.github.com/tmatha/905ae0c0d304119851d7432e5b359330

Hyperparams: https://arxiv.org/abs/1803.04831

Data Information: https://rose1.ntu.edu.sg/dataset/actionRecognition/

NTU RGB+D is a large-scale dataset for RGB-D human action recognition. It involves 56,880 samples of 60 action classes collected from 40 subjects. The actions can be generally divided into three categories: 40 daily actions (e.g., drinking, eating, reading), nine health-related actions (e.g., sneezing, staggering, falling down), and 11 mutual actions (e.g., punching, kicking, hugging). These actions take place under 17 different scene conditions corresponding to 17 video sequences (i.e., S001–S017). The actions were captured using three cameras with different horizontal imaging viewpoints, namely −45°, 0°, and +45°. Multi-modality information is provided for action characterization, including depth maps, 3D skeleton joint positions, RGB frames, and infrared sequences. Performance is evaluated by a cross-subject test, which splits the 40 subjects into training and test groups, and by a cross-view test, which uses one camera (+45°) for testing and the other two cameras for training.

In [None]:
import glob
import os
import numpy as np
import csv

SKELETON_DIR = '../../Datasets/5_nturgb+d/nturgb+d_skeletons'
NPY_DIR = '../../Datasets/5_nturgb+d/nturgb+d_npy/'
TRAIN_DS = '_train.csv'
TEST_DS = '_test.csv'

skeleton_files_mask = os.path.join(SKELETON_DIR, '*.skeleton')
# glob returns files in arbitrary, platform-dependent order; sorting makes the
# position-based train/test split below reproducible.
skeleton_files = sorted(glob.glob(skeleton_files_mask))


max_frame_count = 300
max_joints = 25

# np.save does not create missing directories.
os.makedirs(NPY_DIR, exist_ok=True)

full_ds = []

#for idx, file_name in enumerate(skeleton_files[:568]):
for idx, file_name in enumerate(skeleton_files):
    basename = os.path.basename(file_name)
    name = os.path.splitext(basename)[0]
    # The action id is the part of the file name after the 'A'.
    label = name.split('A')[1]
    sequence_frames = []
    with open(file_name) as f:
        framecount = int(f.readline())

        for frame in range(framecount):
            body_count = int(f.readline())
            if body_count <= 0 or body_count>2:
                # Stop reading this file at the first frame without a usable
                # body count; frames collected so far are kept.
                break
            joints_xyz = []
            for body in range(body_count):
                skeleton_info = f.readline()  # per-body info line, not used
                joint_counts = int(f.readline()) #25
                for joint in range(joint_counts):
                    # Only the first three fields (x, y, z) of a joint line are kept.
                    x, y, z = f.readline().split()[:3]
                    joints_xyz.append([float(x), float(y), float(z)])
            # Every frame is stored as a fixed (max_joints, 3) array: fewer
            # joints are zero-padded, extra joints are dropped. Two-body
            # frames used to produce a negative pad width, which made np.pad
            # raise.
            # NOTE(review): with two bodies only the first max_joints joints
            # (the first listed body) are kept — confirm this is the intended
            # handling of two-person actions.
            frame_xyz = np.zeros((max_joints, 3))
            kept_joints = joints_xyz[:max_joints]
            if kept_joints:
                frame_xyz[:len(kept_joints)] = kept_joints
            sequence_frames.append(frame_xyz)

    if len(sequence_frames) > 0:
        npy_path = os.path.join(NPY_DIR, name+ '.npy')
        # Index entry: npy file name and zero-based action label.
        sample = [name+'.npy', int(label)-1]
        full_ds.append(sample)
        np.save(npy_path, np.array(sequence_frames))

#train_ds = full_ds[:380]
#test_ds = full_ds[380:]

# Position-based split: the first 40320 converted samples go to training and
# the rest to testing.
train_ds = full_ds[:40320]
test_ds = full_ds[40320:]

with open(os.path.join(NPY_DIR, TRAIN_DS), 'w') as train_ds_file:
    writer = csv.writer(train_ds_file, lineterminator='\n')
    writer.writerows(train_ds)

with open(os.path.join(NPY_DIR, TEST_DS), 'w') as test_ds_file:
    writer = csv.writer(test_ds_file, lineterminator='\n')
    writer.writerows(test_ds)

# Map every npy file in NPY_DIR to its number of frames. The dict is created
# here (it used to be assigned into without ever being defined), and the
# arrays are memory-mapped because only their shape is needed.
frame_dict = {}
for file_name in sorted(glob.glob(os.path.join(NPY_DIR, '*.npy'))):
    frame_dict[file_name] = np.load(file_name, mmap_mode='r').shape[0]

In [None]:
dataset = np.array([])
seq_no_arr = [20, 30, 40, 50, 60]
for seq_no in seq_no_arr:
    # Collected per window length: rows of different seq_no values have
    # different widths and cannot be stacked into one array.
    file_blocks = []
    for k, v in frame_dict.items():
        # Only recordings longer than one window are used.
        if v > seq_no:
            file = np.load(k)  # indexed below as (frames, joints, coords)
            n_windows = v // seq_no
            n_joints, n_coords = file.shape[1], file.shape[2]

            # Cut the recording into consecutive, non-overlapping windows of
            # seq_no frames and drop the incomplete tail. Each output row is
            # one joint within one window: seq_no frames x n_coords values.
            # The previous frame-by-frame concatenation appended every frame
            # after the first twice, so windows contained duplicated frames.
            windowed = file[:n_windows * seq_no].reshape(n_windows, seq_no, n_joints, n_coords)
            fileset = windowed.transpose(0, 2, 1, 3).reshape(n_windows * n_joints, seq_no * n_coords)

            # Last column: the action id parsed from the file name (text after 'A').
            action_id = int(k.split('.')[-2].split('A')[-1])
            fileset = np.concatenate((fileset, np.full((fileset.shape[0], 1), action_id)), axis=1)
            file_blocks.append(fileset)

    if not file_blocks:
        print(f"No recording longer than {seq_no} frames; nothing written for ts={seq_no}")
        continue

    # Concatenate once per window length instead of growing the array per file.
    dataset = np.concatenate(file_blocks, axis=0)
    display(dataset.shape)

    csv_path = f"../../Datasets/5_nturgb+d/nturgb+d.ni={3}.no={60}.ts={seq_no}.bs={50}.csv"
    with open(csv_path,'w') as csvfile:
        np.savetxt(csvfile, np.array([[3, 60]]),fmt='%d', delimiter=",")
        # Write the NTU windows built above (the PTB array `testset_np` was
        # being written here by mistake).
        np.savetxt(csvfile, dataset, fmt='%.4f', delimiter=",")

## Wireless Sensor Data Mining (WISDM) - Human Activity Recognition Problem

Source: 
* https://www.cis.fordham.edu/wisdm/dataset.php
* https://github.com/AchillesProject/MLCourse2020/blob/main/Project2/HAR_WISDM_MLCourse_v0_DataExploration.ipynb (access required)

Data Format: **[user],[activity],[timestamp],[x-acceleration],[y-accel],[z-accel]**

Number of examples: 1,098,207

Fields:
* user: 1..36
* activity: {Walking, Jogging, Sitting, Standing, Upstairs, Downstairs}
* timestamp: nanoseconds
* x-acceleration: floating-point values between -20 .. 20
* y-accel: floating-point values between -20 .. 20
* z-accel: floating-point values between -20 .. 20

The acceleration in the x direction as measured by the android phone's accelerometer. A value of 10 = 1g = 9.81 m/s^2, and 0 = no acceleration. The acceleration recorded includes gravitational acceleration toward the center of the Earth, so that when the phone is at rest on a flat surface the vertical axis will register +-10.

Data version 2 Information: https://archive.ics.uci.edu/ml/datasets/WISDM+Smartphone+and+Smartwatch+Activity+and+Biometrics+Dataset+


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, RobustScaler
import sys

# Window lengths (in samples) to generate datasets for.
TIME_STEPS_arr = [90, 60, 50, 40]
# True: the stride is half the window length rounded to the nearest ten
# (overlapping windows); False: the stride equals the window length.
isSTEPS_arr = [True, False]
# Fraction of every (user, activity) group that goes to the training split.
SPLIT = 0.5

def divideData_perUser(data, per=0.5):
    """Split the rows of every (user, activity) group into train and validation parts.

    Within each group the first ``per`` fraction of rows (in their existing
    order) goes to the training frame and the remainder to the validation
    frame. Groups are visited in sorted user / sorted activity order and the
    original row index is preserved.

    Args:
        data: DataFrame with at least the columns 'user' and 'activity'.
        per: fraction of each group assigned to the training split.

    Returns:
        (X_df, train_df, val_df): all rows regrouped by user and activity,
        the training rows, and the validation rows. Empty DataFrames are
        returned when ``data`` has no rows.
    """
    all_parts, train_parts, val_parts = [], [], []
    for user in np.unique(data['user']):
        dataPerUser = data[data['user'] == user]
        for tag in np.unique(dataPerUser['activity']):
            dataPerActivity = dataPerUser[dataPerUser['activity'] == tag]
            cut = int(len(dataPerActivity) * per)
            train_parts.append(dataPerActivity.iloc[:cut])
            val_parts.append(dataPerActivity.iloc[cut:])
            all_parts.append(dataPerActivity)

    def _combine(parts):
        # DataFrame.append was removed in pandas 2.0; collecting the pieces
        # and concatenating once is also linear instead of quadratic.
        return pd.concat(parts) if parts else pd.DataFrame()

    return _combine(all_parts), _combine(train_parts), _combine(val_parts)

# Utils functions for segmenting windows
def windows(data,window_size,step):
    """Yield (start, end) integer bounds of successive windows over ``data``.

    Starts advance by ``step`` from 0 for as long as the start is below
    ``data.count()``; ``end`` is ``start + window_size`` and may run past the
    end of the data — callers are expected to handle that.
    """
    offset = 0
    while True:
        if offset >= data.count():
            return
        yield int(offset), int(offset + window_size)
        offset = offset + step
def segment_signal(data, window_size = 90, step=40,columns=()):
    """Cut the signal of every (user, activity) group into fixed-length windows.

    Args:
        data: DataFrame with 'user', 'activity' and the signal columns.
        window_size: number of rows per window.
        step: stride between consecutive window starts.
        columns: names of the signal columns to extract (immutable default
            instead of a shared mutable list).

    Returns:
        segments: array of shape (n_windows, window_size, len(columns)).
        labels: array of shape (n_windows, 1) with each window's activity tag.
    """
    segment_list = []
    label_list = []
    for user in np.unique(data['user']):
        userdata = data[(data.user == user)]
        for tag in np.unique(userdata['activity']):
            sub_class_data = userdata[(userdata.activity == tag)]
            n_rows = sub_class_data.shape[0]
            for (start, end) in windows(pd.Series(sub_class_data.index.values),window_size,step):
                if end > n_rows - 1:
                    # Window runs past the group: clamp the end and move the
                    # start back so the window still has window_size rows.
                    # NOTE(review): every start position past the last full
                    # window is mapped to the same final window, so the tail
                    # window can be emitted more than once. Kept as is so
                    # generated datasets do not change.
                    end = n_rows
                    start -= window_size - (end - start)
                window = sub_class_data.iloc[start:end]
                if window.isnull().values.any():
                    print(window.isnull().sum())
                # Groups shorter than window_size yield a short slice and are skipped.
                if window.shape[0] == window_size:
                    segment_list.append(np.dstack([window[column] for column in columns]))
                    label_list.append(tag)

    # Stack once at the end instead of re-allocating both arrays per window;
    # the leading empty arrays keep the original shapes and dtypes.
    segments = np.vstack([np.empty((0, window_size, len(columns)))] + segment_list)
    labels = np.append(np.empty((0)), label_list)
    return segments, labels.reshape(-1, 1)

wisdmdataset_path = '../../Datasets/6_wisdm/WISDM_ar_v1.1'
COLUMNS = ['x_axis', 'y_axis', 'z_axis']

rdf = pd.read_csv(f'{wisdmdataset_path}/WISDM_ar_v1.1_raw.txt', header=None, names=['user', 'activity', 'timestamp', 'x_axis', 'y_axis', 'z_axis'])
# Strip the ';' record terminator from the last field. The result is assigned
# back: an in-place replace on the column accessor is a chained assignment
# that is not guaranteed to modify rdf.
rdf['z_axis'] = rdf['z_axis'].replace(regex=True, to_replace=r';', value=r'')
rdf['x_axis'] = rdf.x_axis.astype(np.float64)
rdf['y_axis'] = rdf.y_axis.astype(np.float64)
rdf['z_axis'] = rdf.z_axis.astype(np.float64)
# Keep the converted timestamps (the conversion result used to be discarded).
rdf['timestamp'] = rdf['timestamp'].apply(lambda x: float(x))
rdf = rdf.dropna(axis=0, how='any')
# Encode activity names as integer class ids.
rdf['activity'] = LabelEncoder().fit(np.unique(rdf['activity'])).transform(rdf['activity'])

X_df, train_df, val_df = divideData_perUser(rdf, SPLIT)

for isSTEPS in isSTEPS_arr:
    for TIME_STEPS in TIME_STEPS_arr:
        # Overlapping windows use half the window length rounded to tens.
        STEP = int(round(TIME_STEPS/2,-1)) if isSTEPS else TIME_STEPS
        print(TIME_STEPS, STEP)

        X, y = segment_signal(X_df, window_size=TIME_STEPS, step=STEP,columns=COLUMNS)
        X_train, y_train = segment_signal(train_df, window_size=TIME_STEPS, step=STEP,columns=COLUMNS)
        X_val, y_val = segment_signal(val_df, window_size=TIME_STEPS, step=STEP,columns=COLUMNS)

        # Fit ONE encoder on the labels of the full set and reuse it for the
        # splits, so every file has the same class columns in the same order
        # even if a split happens to miss a class.
        encoder = OneHotEncoder().fit(y)
        y_train = encoder.transform(y_train).toarray()
        y_val = encoder.transform(y_val).toarray()
        y = encoder.transform(y).toarray()

        # Repeat each window's one-hot label for every time step:
        # (n, classes) -> (n, TIME_STEPS, classes).
        y_train = np.repeat(y_train[:, np.newaxis, :], TIME_STEPS, axis=1)
        y_val   = np.repeat(y_val[:, np.newaxis, :], TIME_STEPS, axis=1)
        y       = np.repeat(y[:, np.newaxis, :], TIME_STEPS, axis=1)

        # Per time step: signal values followed by the one-hot label, then
        # flattened to one row per window.
        df_train = np.concatenate((X_train, y_train), axis=2).reshape((X_train.shape[0], -1))
        df_val = np.concatenate((X_val, y_val), axis=2).reshape((X_val.shape[0], -1))
        df = np.concatenate((X,y), axis=2).reshape((X.shape[0], -1))

        print(X_train.shape, y_train.shape, df_train.shape)
        print(X_val.shape, y_val.shape, df_val.shape)
        print(X.shape, y.shape, df.shape)

        # NOTE(review): "spit" in the file names looks like a typo for
        # "split"; it is kept because other code may load these exact names.
        outputs = (
            (f"{wisdmdataset_path}/../wisdm.ni={3}.no={6}.ts={TIME_STEPS}.os={STEP}.spit={0}.all.csv", df),
            (f"{wisdmdataset_path}/../wisdm.ni={3}.no={6}.ts={TIME_STEPS}.os={STEP}.spit={int(SPLIT*100)}.train.csv", df_train),
            (f"{wisdmdataset_path}/../wisdm.ni={3}.no={6}.ts={TIME_STEPS}.os={STEP}.spit={int(SPLIT*100)}.val.csv", df_val),
        )
        for csv_path, rows in outputs:
            # "3,6" header line followed by the rows, through one file handle.
            with open(csv_path,'w') as csvfile:
                np.savetxt(csvfile, np.array([[3, 6]]),fmt='%d', delimiter=",")
                np.savetxt(csvfile, rows, fmt='%.4f', delimiter=",")

## Wireless Sensor Data Mining (WISDM) - Smartphone & Smartwatch Activity Problem

Source: https://archive.ics.uci.edu/ml/datasets/WISDM+Smartphone+and+Smartwatch+Activity+and+Biometrics+Dataset+

Raw's format: **[subject-id],[activity],[timestamp],[x-accel],[y-accel],[z-accel]**

Number of samples for non-hand-oriented activities (5 activities):
* Phone acceleration: 1,338,067
* Watch acceleration: 1,053,141
* Phone gyroscope:    1,006,749
* Watch gyroscope:      949,933

Fields:
* subject-id: 1600..1650 (51 participants)
* activity: {Walking - A, Jogging - B, Stairs - C, Sitting - D, Standing - E}
* timestamp: microsecond (Unix Time)
* x-acceleration: floating-point (can be positive or negative)
* y-accel: floating-point (can be positive or negative)
* z-accel: floating-point (can be positive or negative)

For the accelerometer sensor, the units are m/s2; while, for the gyroscope sensor, the units are radians/s. The force of gravity on Earth, which affects the accelerometer readings, is 9.8m/s2.

In [10]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, RobustScaler
import sys, glob, os

# Window lengths (in samples) to generate datasets for.
TIME_STEPS_arr = [90, 60, 50, 40]
# True: the stride is half the window length rounded to the nearest ten
# (overlapping windows); False: the stride equals the window length.
isSTEPS_arr = [True, False]
# Fraction of every (user, activity) group that goes to the training split.
SPLIT = 0.5
# Signal columns extracted into every window: accelerometer then gyroscope axes.
COLUMNS = ['x_accel', 'y_accel', 'z_accel', 'x_gyro', 'y_gyro', 'z_gyro']
# Activity codes kept from the raw data; rows with other codes are filtered out below.
activities_arr = ['A', 'B', 'C', 'D', 'E']
def divideData_perUser(data, per=0.5):
    """Split the rows of every (user, activity) group into train and validation parts.

    Within each group the first ``per`` fraction of rows (in their existing
    order) goes to the training frame and the remainder to the validation
    frame. Groups are visited in sorted user / sorted activity order and the
    original row index is preserved.

    Args:
        data: DataFrame with at least the columns 'user' and 'activity'.
        per: fraction of each group assigned to the training split.

    Returns:
        (X_df, train_df, val_df): all rows regrouped by user and activity,
        the training rows, and the validation rows. Empty DataFrames are
        returned when ``data`` has no rows.
    """
    all_parts, train_parts, val_parts = [], [], []
    for user in np.unique(data['user']):
        dataPerUser = data[data['user'] == user]
        for tag in np.unique(dataPerUser['activity']):
            dataPerActivity = dataPerUser[dataPerUser['activity'] == tag]
            cut = int(len(dataPerActivity) * per)
            train_parts.append(dataPerActivity.iloc[:cut])
            val_parts.append(dataPerActivity.iloc[cut:])
            all_parts.append(dataPerActivity)

    def _combine(parts):
        # DataFrame.append was removed in pandas 2.0; collecting the pieces
        # and concatenating once is also linear instead of quadratic.
        return pd.concat(parts) if parts else pd.DataFrame()

    return _combine(all_parts), _combine(train_parts), _combine(val_parts)

# Utils functions for segmenting windows
def windows(data,window_size,step):
    """Yield (start, end) integer bounds of successive windows over ``data``.

    Starts advance by ``step`` from 0 for as long as the start is below
    ``data.count()``; ``end`` is ``start + window_size`` and may run past the
    end of the data — callers are expected to handle that.
    """
    offset = 0
    while True:
        if offset >= data.count():
            return
        yield int(offset), int(offset + window_size)
        offset = offset + step
def segment_signal(data, window_size = 90, step=40,columns=()):
    """Cut the signal of every (user, activity) group into fixed-length windows.

    Args:
        data: DataFrame with 'user', 'activity' and the signal columns.
        window_size: number of rows per window.
        step: stride between consecutive window starts.
        columns: names of the signal columns to extract (immutable default
            instead of a shared mutable list).

    Returns:
        segments: array of shape (n_windows, window_size, len(columns)).
        labels: array of shape (n_windows, 1) with each window's activity tag.
    """
    segment_list = []
    label_list = []
    for user in np.unique(data['user']):
        userdata = data[(data.user == user)]
        for tag in np.unique(userdata['activity']):
            sub_class_data = userdata[(userdata.activity == tag)]
            n_rows = sub_class_data.shape[0]
            for (start, end) in windows(pd.Series(sub_class_data.index.values),window_size,step):
                if end > n_rows - 1:
                    # Window runs past the group: clamp the end and move the
                    # start back so the window still has window_size rows.
                    # NOTE(review): every start position past the last full
                    # window is mapped to the same final window, so the tail
                    # window can be emitted more than once. Kept as is so
                    # generated datasets do not change.
                    end = n_rows
                    start -= window_size - (end - start)
                window = sub_class_data.iloc[start:end]
                if window.isnull().values.any():
                    print(window.isnull().sum())
                # Groups shorter than window_size yield a short slice and are skipped.
                if window.shape[0] == window_size:
                    segment_list.append(np.dstack([window[column] for column in columns]))
                    label_list.append(tag)

    # Stack once at the end instead of re-allocating both arrays per window;
    # the leading empty arrays keep the original shapes and dtypes.
    segments = np.vstack([np.empty((0, window_size, len(columns)))] + segment_list)
    labels = np.append(np.empty((0)), label_list)
    return segments, labels.reshape(-1, 1)

wisdm_phone_path = '../../Datasets/6_wisdm/WISDM_ar_v2.0/wisdm-dataset/'
wisdm_phone_accel_path = '../../Datasets/6_wisdm/WISDM_ar_v2.0/wisdm-dataset/raw/phone/accel'
wisdm_phone_accel_files_mask = os.path.join(wisdm_phone_accel_path, '*.txt')
wisdm_phone_accel_files = sorted(glob.glob(wisdm_phone_accel_files_mask))

wisdm_phone_gyro_path = '../../Datasets/6_wisdm/WISDM_ar_v2.0/wisdm-dataset/raw/phone/gyro'
wisdm_phone_gyro_files_mask = os.path.join(wisdm_phone_gyro_path, '*.txt')
wisdm_phone_gyro_files = sorted(glob.glob(wisdm_phone_gyro_files_mask))

# Accelerometer and gyroscope files are paired by sorted position and joined
# on (user, activity, timestamp); only timestamps present in both are kept.
# NOTE(review): the pairing assumes both directories list the same subjects
# in the same order — confirm.
user_frames = []
for accel_file, gyro_file in zip(wisdm_phone_accel_files, wisdm_phone_gyro_files):
    accel_data = pd.read_csv(accel_file, header=None, names=['user', 'activity', 'timestamp', 'x_accel', 'y_accel', 'z_accel'], index_col=['user', 'activity', 'timestamp'])
    # Strip the ';' record terminator; assigned back because an in-place
    # replace on the column accessor is a chained assignment that is not
    # guaranteed to modify the frame.
    accel_data['z_accel'] = accel_data['z_accel'].replace(regex=True, to_replace=r';', value=r'')
    accel_data = accel_data.loc[~accel_data.index.duplicated(keep='first')]
    gyro_data = pd.read_csv(gyro_file, header=None, names=['user', 'activity', 'timestamp', 'x_gyro', 'y_gyro', 'z_gyro'], index_col=['user', 'activity', 'timestamp'])
    gyro_data['z_gyro'] = gyro_data['z_gyro'].replace(regex=True, to_replace=r';', value=r'')
    gyro_data = gyro_data.loc[~gyro_data.index.duplicated(keep='first')]
    user_frames.append(pd.concat([accel_data, gyro_data], axis=1).dropna())

if not user_frames:
    raise FileNotFoundError(f"No accel/gyro file pairs found under {wisdm_phone_path}")

# DataFrame.append was removed in pandas 2.0; concatenate all subjects once.
wisdm_phone_data = pd.concat(user_frames).reset_index()
wisdm_phone_data['x_accel'] = wisdm_phone_data.x_accel.astype(np.float64)
wisdm_phone_data['y_accel'] = wisdm_phone_data.y_accel.astype(np.float64)
wisdm_phone_data['z_accel'] = wisdm_phone_data.z_accel.astype(np.float64)
wisdm_phone_data['x_gyro'] = wisdm_phone_data.x_gyro.astype(np.float64)
wisdm_phone_data['y_gyro'] = wisdm_phone_data.y_gyro.astype(np.float64)
# Convert the gyroscope z axis from its own column (it was being overwritten
# with the accelerometer z axis).
wisdm_phone_data['z_gyro'] = wisdm_phone_data.z_gyro.astype(np.float64)
# Keep the converted timestamps (the conversion result used to be discarded).
wisdm_phone_data['timestamp'] = wisdm_phone_data['timestamp'].apply(lambda x: float(x))
wisdm_phone_data = wisdm_phone_data.dropna(axis=0, how='any')
# Keep only the selected activities; drop=True avoids adding a stray 'index' column.
wisdm_phone_data = wisdm_phone_data[wisdm_phone_data.activity.isin(activities_arr)].reset_index(drop=True)
wisdm_phone_data['activity'] = LabelEncoder().fit(np.unique(wisdm_phone_data['activity'])).transform(wisdm_phone_data['activity'])

X_df, train_df, val_df = divideData_perUser(wisdm_phone_data, SPLIT)

for isSTEPS in isSTEPS_arr:
    for TIME_STEPS in TIME_STEPS_arr:
        # Overlapping windows use half the window length rounded to tens.
        STEP = int(round(TIME_STEPS/2,-1)) if isSTEPS else TIME_STEPS
        print(TIME_STEPS, STEP)

        X_train, y_train = segment_signal(train_df, window_size=TIME_STEPS, step=STEP,columns=COLUMNS)
        X_val, y_val = segment_signal(val_df, window_size=TIME_STEPS, step=STEP,columns=COLUMNS)
        X, y = segment_signal(X_df, window_size=TIME_STEPS, step=STEP,columns=COLUMNS)

        # Fit ONE encoder on the labels of the full set and reuse it for the
        # splits, so every file has the same class columns in the same order
        # even if a split happens to miss a class.
        encoder = OneHotEncoder().fit(y)
        y_train = encoder.transform(y_train).toarray()
        y_val = encoder.transform(y_val).toarray()
        y = encoder.transform(y).toarray()

        # Repeat each window's one-hot label for every time step:
        # (n, classes) -> (n, TIME_STEPS, classes).
        y_train = np.repeat(y_train[:, np.newaxis, :], TIME_STEPS, axis=1)
        y_val   = np.repeat(y_val[:, np.newaxis, :], TIME_STEPS, axis=1)
        y       = np.repeat(y[:, np.newaxis, :], TIME_STEPS, axis=1)

        # Per time step: signal values followed by the one-hot label, then
        # flattened to one row per window.
        df_train = np.concatenate((X_train, y_train), axis=2).reshape((X_train.shape[0], -1))
        df_val = np.concatenate((X_val, y_val), axis=2).reshape((X_val.shape[0], -1))
        df = np.concatenate((X,y), axis=2).reshape((X.shape[0], -1))

        print(X_train.shape, y_train.shape, df_train.shape)
        print(X_val.shape, y_val.shape, df_val.shape)
        print(X.shape, y.shape, df.shape)

        # NOTE(review): "spit" in the file names looks like a typo for
        # "split"; it is kept because other code may load these exact names.
        outputs = (
            (f"{wisdm_phone_path}/wisdm.ni={6}.no={5}.ts={TIME_STEPS}.os={STEP}.spit={0}.all.csv", df),
            (f"{wisdm_phone_path}/wisdm.ni={6}.no={5}.ts={TIME_STEPS}.os={STEP}.spit={int(SPLIT*100)}.train.csv", df_train),
            (f"{wisdm_phone_path}/wisdm.ni={6}.no={5}.ts={TIME_STEPS}.os={STEP}.spit={int(SPLIT*100)}.val.csv", df_val),
        )
        for csv_path, rows in outputs:
            # "6,5" header line followed by the rows, through one file handle.
            with open(csv_path,'w') as csvfile:
                np.savetxt(csvfile, np.array([[6, 5]]),fmt='%d', delimiter=",")
                np.savetxt(csvfile, rows, fmt='%.4f', delimiter=",")

90 40
(9561, 90, 6) (9561, 90, 5) (9561, 990)
(9561, 90, 6) (9561, 90, 5) (9561, 990)
(19064, 90, 6) (19064, 90, 5) (19064, 990)
60 30
(12742, 60, 6) (12742, 60, 5) (12742, 660)
(12742, 60, 6) (12742, 60, 5) (12742, 660)
(25311, 60, 6) (25311, 60, 5) (25311, 660)
50 20
(19061, 50, 6) (19061, 50, 5) (19061, 550)
(19061, 50, 6) (19061, 50, 5) (19061, 550)
(38022, 50, 6) (38022, 50, 5) (38022, 550)
40 20
(19064, 40, 6) (19064, 40, 5) (19064, 440)
(19064, 40, 6) (19064, 40, 5) (19064, 440)
(38025, 40, 6) (38025, 40, 5) (38025, 440)
90 90
(4287, 90, 6) (4287, 90, 5) (4287, 990)
(4287, 90, 6) (4287, 90, 5) (4287, 990)
(8517, 90, 6) (8517, 90, 5) (8517, 990)
60 60
(6380, 60, 6) (6380, 60, 5) (6380, 660)
(6380, 60, 6) (6380, 60, 5) (6380, 660)
(12744, 60, 6) (12744, 60, 5) (12744, 660)
50 50
(7671, 50, 6) (7671, 50, 5) (7671, 550)
(7671, 50, 6) (7671, 50, 5) (7671, 550)
(15300, 50, 6) (15300, 50, 5) (15300, 550)
40 40
(9565, 40, 6) (9565, 40, 5) (9565, 440)
(9565, 40, 6) (9565, 40, 5) (9565, 4