In [14]:
import glob
import os

import numpy as np
import pandas as pd

In [2]:
# Glob patterns matching every csv trial file of the training / test split.
TRAIN_FILEPATH = "../capstone_data/train4/*.csv"
TEST_FILEPATH = "../capstone_data/test4/*.csv"

# Column names assigned, in file order, to the header-less sensor csv files.
# NOTE(review): presumably ax/ay/az are accelerometer axes and y/p/r are
# yaw/pitch/roll -- confirm against the sensor firmware.
SENSOR_COLS = [
    "ax", "ay", "az",
    "y", "p", "r",
    "start_move",
    "checksum",
]

# Dance name -> integer class label; the label is the position in this list,
# so the order below must not change.
DANCE_TO_NUM_MAP = {
    dance: label
    for label, dance in enumerate([
        'dab',
        'jamesbond',
        'mermaid',
        'scarecrow',
        'pushback',
        'cowboy',
        'window360',
        'snake',
        'logout',
        'logout2',
    ])
}

In [3]:
def load_data_paths(location):
    """
    Expand a glob pattern into the csv file paths it matches.
    Input: location i.e. glob pattern string such as "../some_dir/*.csv"
    Return: list of matching file paths, in the order glob reports them
            (empty list when nothing matches)
    """
    # glob.glob already yields every match; list() just hands back a fresh list.
    return list(glob.glob(location))

In [4]:
def gen_rawData(given_filepaths, sensor_cols=None):
    """
    Read every csv trial file into its own DataFrame and collect them in a dict.

    File names must look like "{dance}_{subject}_{trialNum}.csv", e.g.
    "dab_sean_1.csv". Only the file name is parsed, so the directory part of
    the path may contain any number of underscores, dots or path separators.

    Input:
        given_filepaths: iterable of csv file paths
        sensor_cols: column names to assign to the header-less csv files;
            defaults to the module-level SENSOR_COLS. Must include "y",
            "start_move" and "checksum", which are dropped after loading.
    Return: dictionary of raw dfs, with key being {subjectName}_{dance}_{trialNum}
    Raises: ValueError if a file name does not have exactly three
            "_"-separated fields or the trial number is not an integer.
    """
    if sensor_cols is None:
        sensor_cols = SENSOR_COLS

    frames = {}
    for filepath in given_filepaths:
        # Parse metadata from the base name only; splitting the whole path
        # breaks as soon as a parent directory contains "_" or ".".
        stem, _ = os.path.splitext(os.path.basename(filepath))
        fields = stem.split("_")
        if len(fields) != 3:
            raise ValueError(
                f"Expected a file name like 'dance_subject_trialNum.csv', got: {filepath!r}"
            )
        dance, subjectName, trialNum = fields

        raw_df = (
            pd.read_csv(filepath, names=sensor_cols, header=None, index_col=None)
            # Drop incomplete rows BEFORE removing columns, so a missing value
            # in any column (including the ones removed next) discards the row.
            .dropna()
            .drop(columns=["y", "start_move", "checksum"])
            .reset_index(drop=True)
        )
        raw_df["subject"] = subjectName
        raw_df["trialNum"] = int(trialNum)
        raw_df["dance"] = dance
        frames[f"{subjectName}_{dance}_{trialNum}"] = raw_df

    return frames

In [5]:
def concatenator(raw_dic):
    """
    Stack every DataFrame held in raw_dic on top of each other.
    Input: raw_dic i.e. dict whose values are DataFrames (keys are ignored)
    Return: one DataFrame with all rows, in dict order, re-indexed from 0
    """
    per_trial_frames = list(raw_dic.values())
    # ignore_index discards each frame's own row labels so the result has a
    # single continuous 0..n-1 index.
    return pd.concat(per_trial_frames, axis=0, ignore_index=True)

In [6]:
def getTrainAndTestData():
    """
    Load the training and test splits and attach the numeric class label.

    Each split is read from its glob pattern (TRAIN_FILEPATH / TEST_FILEPATH),
    concatenated into one frame, and given a "target" column obtained by
    mapping the "dance" column through DANCE_TO_NUM_MAP. Dance names missing
    from the map end up as NaN in "target".

    Return: (train_df, test_df)
    """
    labelled_splits = []
    for pattern in (TRAIN_FILEPATH, TEST_FILEPATH):
        split_df = concatenator(gen_rawData(load_data_paths(pattern)))
        split_df["target"] = split_df["dance"].map(DANCE_TO_NUM_MAP)
        labelled_splits.append(split_df)

    train_df, test_df = labelled_splits
    return train_df, test_df

In [7]:
# Build the labelled training and test frames once; the cells below inspect them.
raw_train_df, raw_test_df = getTrainAndTestData()

In [8]:
# Summary statistics of the numeric columns in the training split.
raw_train_df.describe()

Unnamed: 0,ax,ay,az,p,r,trialNum,target
count,241200.0,241200.0,241200.0,241200.0,241200.0,241200.0,241200.0
mean,-45.133955,21.678997,13.909648,35.518545,-209.223835,3.776119,3.641791
std,73.29868,80.720217,69.525137,155.091769,213.856568,2.163617,2.933946
min,-392.0,-226.0,-302.0,-732.0,-752.0,1.0,0.0
25%,-86.0,-42.0,-30.0,-44.0,-379.0,2.0,1.0
50%,-45.0,4.0,12.0,49.0,-227.0,4.0,3.0
75%,-8.0,75.0,60.0,142.0,-79.0,5.0,6.0
max,399.0,399.0,317.0,412.0,532.0,10.0,9.0


In [9]:
# Summary statistics of the numeric columns in the test split, for comparison with training.
raw_test_df.describe()

Unnamed: 0,ax,ay,az,p,r,trialNum,target
count,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0
mean,-53.548667,10.76395,16.977117,9.013817,-217.137467,3.0,4.5
std,96.116135,87.25214,96.776882,158.378049,192.926923,1.414225,2.872305
min,-400.0,-247.0,-344.0,-573.0,-717.0,1.0,0.0
25%,-110.0,-57.0,-49.0,-92.0,-364.0,2.0,2.0
50%,-49.0,-3.0,16.0,28.0,-224.0,3.0,4.5
75%,3.0,65.0,81.0,124.0,-86.0,4.0,7.0
max,394.0,399.0,371.0,403.0,361.0,5.0,9.0


In [20]:
# List the distinct dance labels present in the test split.
raw_test_df["dance"].unique()

array(['snake', 'logout', 'mermaid', 'cowboy', 'jamesbond', 'logout2',
       'dab', 'window360', 'scarecrow', 'pushback'], dtype=object)

In [11]:
# raw_train_df