In [1]:
import numpy as np 
import pandas as pd 
import datetime
import re
import os, os.path
import time
from sklearn.model_selection import train_test_split

# Save to Single File

Let's change our directory to the training data. Then we go through all folders and files, appending all the data to a single DataFrame. Finally, we export that DataFrame to a CSV file.

In [2]:
cd ../Training_Data

/app/data/Training_Data


In [3]:
# Gesture classes; each one is a sub-folder holding one CSV file per recording.
folders = ["fist_pump","single_wave","speed_mode","random_motion"]

# Collect one record per recording and build the DataFrame once at the end.
# Growing a DataFrame with pd.concat inside the loop re-copies every previous
# row on each iteration and concatenates onto an empty frame.
records = []
for folder in folders:
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order (and therefore the seeded splits below) reproducible.
    files = sorted(os.listdir(folder))
    for file in files:
        df_temp = pd.read_csv(os.path.join(folder, file))
        # Keep only the three accelerometer axes as an (n_samples, 3) array.
        x = df_temp[['Acc_X','Acc_Y','Acc_Z']].to_numpy()
        records.append({'gesture': folder, 'acceleration': x.tolist()})

# `columns=` keeps the expected schema even if no recordings were found.
completedf = pd.DataFrame(records, columns=['gesture','acceleration'])
completedf.to_csv('complete_data.csv', index=False)

# Split data into Training, Validation, and Testing

## Using Scikit Learn

In [5]:
train_ratio = 0.75
validation_ratio = 0.15
test_ratio = 0.10
# The three ratios must describe the whole data set.
assert abs(train_ratio + validation_ratio + test_ratio - 1.0) < 1e-9

# First split: train is 75% of the entire data set; the remaining 25% is a
# hold-out pool that is divided into validation and test below.
train_set, holdout_set = train_test_split(completedf, test_size=1 - train_ratio, random_state=0)

# Second split of the hold-out pool:
# validation ends up as 15% and test as 10% of the initial data set.
val_set, test_set = train_test_split(holdout_set, test_size=test_ratio/(test_ratio + validation_ratio), random_state=0)

# Every row must land in exactly one of the three splits.
assert len(train_set) + len(val_set) + len(test_set) == len(completedf)

print(f'len of train_set: {len(train_set)}')
print(f'len of test_set: {len(test_set)}')
print(f'len of val_set: {len(val_set)}')

# Persist the splits so later sections can reload them from disk.
train_set.to_csv('train_set.csv', index=False)
test_set.to_csv('test_set.csv', index=False)
val_set.to_csv('val_set.csv', index=False)

len of train_set: 168
len of test_set: 23
len of val_set: 34


In [8]:
# Report how many recordings of each gesture landed in each split.
# Pairs are (label shown in the output, value stored in the 'gesture' column).
gesture_labels = [
    ('Speed mode', 'speed_mode'),
    ('fist_pump', 'fist_pump'),
    ('single_wave', 'single_wave'),
    ('random_motion', 'random_motion'),
]
named_splits = [('train_set', train_set), ('val_set', val_set), ('test_set', test_set)]

for label, gesture in gesture_labels:
    for split_name, split_df in named_splits:
        n_rows = len(split_df.query(f'gesture == "{gesture}"'))
        print(f'len of {split_name} {label}: {n_rows}')

len of train_set Speed mode: 18
len of val_set Speed mode: 4
len of test_set Speed mode: 4
len of train_set fist_pump: 49
len of val_set fist_pump: 6
len of test_set fist_pump: 5
len of train_set single_wave: 43
len of val_set single_wave: 11
len of test_set single_wave: 7
len of train_set random_motion: 58
len of val_set random_motion: 13
len of test_set random_motion: 7


## Using Pandas

In [None]:
# Same 75 / 15 / 10 split as above, done with pandas sampling only.
completedf_copy = completedf.copy()

# 75% of all rows go to training.
train_set1 = completedf_copy.sample(frac=0.75, random_state=4)

# The remaining 25% is the hold-out pool.
test_set1 = completedf_copy.drop(train_set1.index)

# 60% of the hold-out pool (15% of the data) becomes the validation set.
val_set1 = test_set1.sample(frac=0.6, random_state=4)

# Whatever is left (10% of the data) is the test set. Drop the rows sampled
# into val_set1 -- not the sklearn `val_set`, whose index is unrelated.
test_set1 = test_set1.drop(val_set1.index)

In [None]:
# Report the pandas-based split built in this section (train_set1 / val_set1 /
# test_set1). The earlier version referenced an undefined `x_val` and the
# sklearn frames instead.
print(len(train_set1),len(val_set1),len(test_set1))

# Pairs are (label shown in the output, value stored in the 'gesture' column).
gesture_labels = [
    ('Speed mode', 'speed_mode'),
    ('fist_pump', 'fist_pump'),
    ('single_wave', 'single_wave'),
    ('random_motion', 'random_motion'),
]
named_splits = [('x_train', train_set1), ('x_val', val_set1), ('x_test', test_set1)]

for label, gesture in gesture_labels:
    for split_name, split_df in named_splits:
        n_rows = len(split_df.query(f'gesture == "{gesture}"'))
        print(f'len of {split_name} {label}: {n_rows}')

# Data Preprocess

## Load Data From CSV

In [None]:
cd ../Training_Data

In [9]:
# Reload the three splits from the CSV files in the current directory.
# NOTE(review): the list-valued 'acceleration' cells presumably come back from
# CSV as text rather than Python lists -- confirm they are parsed before use.
train_set, test_set, val_set = (
    pd.read_csv(csv_name)
    for csv_name in ('train_set.csv', 'test_set.csv', 'val_set.csv')
)

           gesture                                       acceleration
71     single_wave  [[-7.851, -1.138, 6.209], [-7.851, -1.138, 6.2...
22       fist_pump  [[-8.13, -0.575, 3.906], [-8.13, -0.575, 3.906...
204  random_motion  [[-11.991, -1.032, 4.165], [-12.01700000000000...
45       fist_pump  [[-8.477, -2.352, 2.727], [-8.477, -2.352, 2.7...
199  random_motion  [[-10.081, 1.054, 2.576], [-10.081, 1.054, 2.5...
..             ...                                                ...
67     single_wave  [[-5.607, -5.888, 4.223], [-5.607, -5.888, 4.2...
192  random_motion  [[-9.102, -2.658, 2.386], [-9.102, -2.658, 2.3...
117    single_wave  [[-7.3660000000000005, -2.628, 5.154], [-7.627...
47       fist_pump  [[-3.992, -2.7510000000000003, 3.002], [-3.992...
172  random_motion  [[-8.097000000000001, 0.529, 3.385], [-8.09700...

[168 rows x 2 columns]            gesture                                       acceleration
44       fist_pump  [[-12.818, -5.899, 8.401], [-12.304, -5.94, 7.

## Data Sampling Inconsistencies 
You can see in the Arduino sketch and the subsequent Python data-logging sketch that I did not set a sampling rate (the frequency at which samples are taken). As a result, I will have an inconsistent number of data points for each motion captured. With an unknown number of data points in each sample, it can become difficult to train a machine learning model. So why did I choose to do this to my data?

1. I wanted to test the consistency of the real maximum data-acquisition rate of my sensor and microcontroller. With this knowledge I can set a fast but realistic sampling rate.
2. The real world is filled with messy data. I wanted to gather a data set that I would have to clean and preprocess to use in model training. 


From the data exploration, it seems that the number of samples for each motion is roughly 250-260 samples per second, or 750-780 samples for a single gesture.
The output of the processed data should be a consistent number, because we need a known number of inputs into our model. This number will be 750 data points.