In [3]:
import ast
import datetime
import os, os.path
import random
import re
import time

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split

# Save to Single File

Let's change our directory to the training data, then go through all folders and files, appending all the data to a single data frame. Finally, export that data frame to a CSV file.

In [2]:
cd ../Training_Data

/app/data/Training_Data


In [3]:
# Gesture folders; each folder name doubles as the label of the recordings inside it.
folders = ["fist_pump","single_wave","speed_mode","random_motion"]
files = []
# Collect one record per recording and build the frame once at the end:
# growing a DataFrame with pd.concat inside the loop copies it on every iteration.
records = []
for folder in folders:
    # os.listdir returns entries in arbitrary order; sort so the row order (and
    # therefore the seeded train/validation/test split) is reproducible.
    files = sorted(os.listdir(folder))
    for file in files:
        df_temp = pd.read_csv(os.path.join(folder, file))
        # Keep only the three accelerometer columns, as a list of [x, y, z] rows.
        x = df_temp[['Acc_X','Acc_Y','Acc_Z']].to_numpy()
        records.append({'gesture': folder, 'acceleration': x.tolist()})
completedf = pd.DataFrame(records, columns=['gesture','acceleration'])
completedf.to_csv('complete_data.csv', index=False)

# Split data into Training, Validation, and Testing

## Using Scikit Learn

In [25]:
# Target share of the full data set for each split.
train_ratio = 0.75
validation_ratio = 0.15
test_ratio = 0.10

# First cut: 75% of the rows go to training, the remaining 25% are held out.
holdout_fraction = 1 - train_ratio
train_set, holdout_set = train_test_split(
    completedf, test_size=holdout_fraction, random_state=0)

# Second cut: divide the held-out rows so that validation ends up with 15%
# and test with 10% of the initial data set.
test_share_of_holdout = test_ratio / (test_ratio + validation_ratio)
val_set, test_set = train_test_split(
    holdout_set, test_size=test_share_of_holdout, random_state=0)

print(f'len of train_set: {len(train_set)}')
print(f'len of test_set: {len(test_set)}')
print(f'len of val_set: {len(val_set)}')

# Persist every split so the processing section can reload it from disk.
train_set.to_csv('train_set.csv', index=False)
test_set.to_csv('test_set.csv', index=False)
val_set.to_csv('val_set.csv', index=False)

len of train_set: 168
len of test_set: 23
len of val_set: 34


In [28]:
# Per-gesture row counts for each split.
# Each pair is (label shown in the report, value stored in the `gesture` column).
gesture_labels = [('Speed mode', 'speed_mode'),
                  ('fist_pump', 'fist_pump'),
                  ('single_wave', 'single_wave'),
                  ('random_motion', 'random_motion')]
named_splits = [('train_set', train_set), ('val_set', val_set), ('test_set', test_set)]

for label, gesture in gesture_labels:
    for split_name, split in named_splits:
        count = len(split.query(f'gesture == "{gesture}"'))
        print(f'len of {split_name} {label}: {count}')

len of train_set Speed mode: 18
len of val_set Speed mode: 4
len of test_set Speed mode: 4
len of train_set fist_pump: 49
len of val_set fist_pump: 6
len of test_set fist_pump: 5
len of train_set single_wave: 43
len of val_set single_wave: 11
len of test_set single_wave: 7
len of train_set random_motion: 58
len of val_set random_motion: 13
len of test_set random_motion: 7


## Using Pandas

In [None]:
# Same 75/15/10 split as above, done with DataFrame.sample / DataFrame.drop.
completedf_copy = completedf.copy()
# 75% of the rows for training.
train_set1 = completedf_copy.sample(frac=0.75, random_state=4)
# Everything that was not sampled for training is held out (25%).
holdout_set1 = completedf_copy.drop(train_set1.index)
# 60% of the hold-out (15% overall) becomes the validation set...
val_set1 = holdout_set1.sample(frac=0.6, random_state=4)
# ...and the rest (10% overall) is the test set. The rows to remove must be the
# ones drawn into val_set1, not the index of the scikit-learn `val_set`.
test_set1 = holdout_set1.drop(val_set1.index)

In [None]:
# Report on the pandas-based split (train_set1 / val_set1 / test_set1).
# The previous version referenced an undefined `x_val` and otherwise printed
# the scikit-learn split again.
print(len(train_set1),len(val_set1),len(test_set1))

# Each pair is (label shown in the report, value stored in the `gesture` column).
gesture_labels = [('Speed mode', 'speed_mode'),
                  ('fist_pump', 'fist_pump'),
                  ('single_wave', 'single_wave'),
                  ('random_motion', 'random_motion')]
named_splits = [('x_train', train_set1), ('x_val', val_set1), ('x_test', test_set1)]

for label, gesture in gesture_labels:
    for split_name, split in named_splits:
        count = len(split.query(f'gesture == "{gesture}"'))
        print(f'len of {split_name} {label}: {count}')

# Data Processing

## Load Data From CSV

In [2]:
cd ../Training_Data

/app/data/Training_Data


In [4]:
# The `acceleration` column was written as the text form of a nested Python list.
# Parse it with ast.literal_eval, which only accepts literals, instead of eval,
# which would execute any expression found in the CSV file.
list_converter = {'acceleration': ast.literal_eval}
train_set = pd.read_csv('train_set.csv', converters=list_converter)
test_set = pd.read_csv('test_set.csv', converters=list_converter)
val_set = pd.read_csv('val_set.csv', converters=list_converter)

## Augment Training Data

Why am I augmenting my training data? At the time of writing, I have 168 gesture recordings, each consisting of roughly 750 samples. That is too small a data set to really train a model. I want to create a TensorFlow Lite model that I can put onto my microcontroller, so how do I get more gesture recordings?

1) I can take more data points, but this will take me some time

2) I can manipulate and save that manipulated recording as a new sample. 

Augmentation has another advantage over just collecting more data: it helps to reduce overfitting.


What augmentation makes sense for my data? My data is a time series of x, y, z accelerations of my arm moving. Thinking about myself, I can do the gesture faster or slower, more theatrically or more reservedly, and more cleanly or more sloppily. Luckily, I can mimic those kinds of changes with different algorithms.

1) Increase and decrease the magnitudes of the xyz data

2) Shift the data to complete faster or slower. Time stretch/shrink

3) Add some noise to the data points

4) Increase and decrease the xyz data uniformly (shift every value up or down by the same offset)

5) Shift the time window around the data, making the data start sooner or later

In [5]:
# (numerator, denominator) pairs; each pair yields one augmented copy per recording.
fract=[(3, 2), (5, 3), (2, 3), (3, 4), (9, 5), (6, 5), (4, 5)]
# Magnitude scaling: multiply every acceleration value by numerator/denominator.
accel_sets = train_set['acceleration'].to_numpy()
magnitude_set = []
magnitude_labels=[]
# Collect the records and build the frame once; pd.concat inside the loop
# re-copies the whole frame for every new row.
magnitude_records = []
# Pair labels and recordings by position. Looking the label up with
# train_set['gesture'][idx] is label-based and only matches the position while
# the index is a fresh 0..n-1 range.
for gesture, aset in zip(train_set['gesture'], accel_sets):
    samples = np.array(aset, dtype=np.float32)
    for molecule, denominator in fract:
        magnitude_records.append({'gesture': gesture,
                                  'acceleration': (samples * molecule / denominator).tolist()})
magnitudedf = pd.DataFrame(magnitude_records, columns=['gesture','acceleration'])

In [6]:
# Time stretch and shrink
def time_wrapping(molecule, denominator, data):
    """Resample ``data`` along time by a factor of ``molecule / denominator``.

    ``molecule`` is the numerator of the speed ratio. The input is consumed in
    steps of ``molecule`` rows and every step produces ``denominator`` output
    rows, so the result has ``(int(len(data) / molecule) - 1) * denominator``
    rows. Each output row is a weighted mix of two neighbouring input rows,
    with weights ``(denominator - k) / denominator`` and ``k / denominator``.

    Args:
        molecule: input rows advanced per step.
        denominator: output rows generated per step.
        data: sequence of equally long rows of numbers.

    Returns:
        A new list of rows (lists of floats); ``data`` is not modified.
    """
    n_steps = int(len(data) / molecule) - 1
    # Pre-size the result; the row width is taken from the first input row.
    stretched = [[0] * len(data[0]) for _ in range(n_steps * denominator)]
    for step in range(n_steps):
        src = molecule * step      # first input row read in this step
        dst = denominator * step   # first output row written in this step
        for axis in range(len(data[step])):
            for k in range(denominator):
                near = data[src + k][axis]
                far = data[src + k + 1][axis]
                stretched[dst + k][axis] = (near * (denominator - k) + far * k) / denominator
    return stretched

# One time-stretched/shrunk copy of every training recording per ratio in `fract`.
# Records are collected first and the frame is built once; pd.concat inside the
# loop re-copies the whole frame for every new row.
time_records = []
# Labels are paired with recordings by position rather than looked up with
# train_set['gesture'][idx], which is label-based.
for gesture, aset in zip(train_set['gesture'], accel_sets):
    for molecule, denominator in fract:
        shiftedAccels = time_wrapping(molecule, denominator, aset)
        time_records.append({'gesture': gesture, 'acceleration': shiftedAccels})
timedf = pd.DataFrame(time_records, columns=['gesture','acceleration'])

In [7]:
# Add noise: five noisy copies of every training recording.
noiseyAccels =[]
# Records are collected first and the frame is built once; pd.concat inside the
# loop re-copies the whole frame for every new row.
noise_records = []
# Labels are paired with recordings by position rather than looked up with
# train_set['gesture'][idx], which is label-based.
for gesture, aset in zip(train_set['gesture'], accel_sets):
    for t in range(5):
        # NOTE(review): 4 * random.random() lies in [0, 4), so the noise is not
        # centred on zero and also raises every value by about 2 on average.
        # Confirm whether a symmetric perturbation was intended.
        tmp_data = [[value + 4 * random.random() for value in row] for row in aset]
        noise_records.append({'gesture': gesture, 'acceleration': tmp_data})
noisedf = pd.DataFrame(noise_records, columns=['gesture','acceleration'])

In [8]:
# Shift data uniformly up or down in magnitude: five copies of every training
# recording, each with a single random offset in [-25, 25) added to all values.
# Records are collected first and the frame is built once; pd.concat inside the
# loop re-copies the whole frame for every new row.
shift_records = []
# Labels are paired with recordings by position rather than looked up with
# train_set['gesture'][idx], which is label-based.
for gesture, aset in zip(train_set['gesture'], accel_sets):
    samples = np.array(aset, dtype=np.float32)
    for i in range(5):
        offset = (random.random() - 0.5) * 50
        shift_records.append({'gesture': gesture,
                              'acceleration': (samples + offset).tolist()})
shiftdf = pd.DataFrame(shift_records, columns=['gesture','acceleration'])

Let's add all the data frames together.

In [9]:
# Stack the original training recordings and every augmented variant into one
# frame with a fresh 0..n-1 index, then save it.
augmented_parts = [train_set, magnitudedf, timedf, noisedf, shiftdf]
processedTrain_set = pd.concat(augmented_parts, ignore_index=True)
processedTrain_set.to_csv('augment_train_data.csv', index=False)

## Gesture Length's Inconsistency 

You can see in the Arduino sketch and the subsequent Python data-logging script that I did not set a sampling rate (the frequency at which samples are taken). As a result, I have an inconsistent number of data points for each motion captured. With an unknown number of data points in each sample, it can become difficult to train a machine learning model. So why did I choose to do this to my data?

1. I wanted to test the consistency of my sensor and the real maximum data acquisition rate of my microcontroller. With this knowledge I can set a fast but realistic sampling rate.
2. The real world is filled with messy data. I wanted to gather a data set that I would have to process to use in model training. 


From the data exploration, it seems that each motion is captured at roughly 250-260 samples per second, or 750-780 samples for a single gesture.
The output of the processed data should have a consistent length, because we need a known number of inputs into our model. This number will be 760 data points.

760.8633333333333


In [12]:
# Average number of samples per recording in the augmented training set.
proc_acc = processedTrain_set['acceleration'].to_numpy()
lensum = sum(len(gest) for gest in proc_acc)
print(lensum/len(proc_acc))

In [14]:
  def pad(data, seq_length, dim):
    """Return two fixed-length variants of ``data`` filled by neighbour padding.

    Both results are arrays of shape ``(seq_length, dim)``. At most the first
    ``seq_length`` rows of ``data`` are used. Missing rows are filled with
    uniform noise of width ``noise_level`` centred on the nearest real row:

    * first result: data placed at the end, filler around ``data[0]`` in front;
    * second result: data placed at the start, filler around ``data[-1]`` behind.

    Args:
        data: sequence of rows, each holding ``dim`` values.
        seq_length: number of rows in each returned array.
        dim: number of values per row.

    Returns:
        A list ``[front_padded, back_padded]`` of two NumPy arrays.
    """
    noise_level = 1
    kept = min(len(data), seq_length)  # rows of real data that fit
    head = data[:kept]

    # Padding before the data: noise around the first sample, data right-aligned.
    front_padded = (np.random.rand(seq_length, dim) - 0.5) * noise_level + data[0]
    front_padded[seq_length - kept:] = head

    # Padding after the data: noise around the last sample, data left-aligned.
    back_padded = (np.random.rand(seq_length, dim) - 0.5) * noise_level + data[-1]
    back_padded[:kept] = head

    return [front_padded, back_padded]

In [15]:
# Bring every augmented training recording to 760 samples x 3 axes. Every array
# returned by pad() becomes its own row with the recording's label.
# Records are collected first and the frame is built once; pd.concat inside the
# loop re-copies the whole frame for every new row.
pad_train_records = []
# Read labels and recordings straight from processedTrain_set, paired by
# position, so this cell does not depend on which cell last assigned `proc_acc`.
for gesture, proacc in zip(processedTrain_set['gesture'], processedTrain_set['acceleration']):
    for half in pad(proacc, 760, 3):
        pad_train_records.append({'gesture': gesture, 'acceleration': half.tolist()})
pad_train_df = pd.DataFrame(pad_train_records, columns=['gesture','acceleration'])
    

In [16]:
# Bring every test recording to 760 samples x 3 axes. Every array returned by
# pad() becomes its own row with the recording's label.
proc_acc = test_set['acceleration'].to_numpy()

# Records are collected first and the frame is built once; pd.concat inside the
# loop re-copies the whole frame for every new row.
pad_test_records = []
# Labels are paired with recordings by position rather than looked up with
# test_set['gesture'][idx], which is label-based.
for gesture, proacc in zip(test_set['gesture'], proc_acc):
    for half in pad(proacc, 760, 3):
        pad_test_records.append({'gesture': gesture, 'acceleration': half.tolist()})
pad_test_df = pd.DataFrame(pad_test_records, columns=['gesture','acceleration'])

In [17]:
# Bring every validation recording to 760 samples x 3 axes. Every array returned
# by pad() becomes its own row with the recording's label.
proc_acc = val_set['acceleration'].to_numpy()

# Records are collected first and the frame is built once; pd.concat inside the
# loop re-copies the whole frame for every new row.
pad_val_records = []
# Labels are paired with recordings by position rather than looked up with
# val_set['gesture'][idx], which is label-based.
for gesture, proacc in zip(val_set['gesture'], proc_acc):
    for half in pad(proacc, 760, 3):
        pad_val_records.append({'gesture': gesture, 'acceleration': half.tolist()})
pad_val_df = pd.DataFrame(pad_val_records, columns=['gesture','acceleration'])

In [18]:
# Integer class id for every gesture name.
gest_id = {'single_wave': 0, 'fist_pump': 1, 'random_motion': 2, 'speed_mode': 3}
# Replace the gesture names with their ids in all three padded sets.
# A name missing from gest_id raises KeyError.
for padded_df in (pad_train_df, pad_test_df, pad_val_df):
    padded_df['gesture'] = padded_df['gesture'].apply(lambda name: gest_id[name])

In [19]:
# Write the padded, label-encoded sets to disk (validation, test, then train).
processed_outputs = (
    ('processed_val_set.csv', pad_val_df),
    ('processed_test_set.csv', pad_test_df),
    ('processed_train_set.csv', pad_train_df),
)
for csv_name, padded_df in processed_outputs:
    padded_df.to_csv(csv_name, index=False)