In [1]:
import math
import pathlib
import csv
import tqdm # This library enables the timer in the "Main Script" block.
import pandas as pd # pandas used to normalize the data.

# Control Variables

In [2]:
test_only = False
    # If this is enabled, then no files will be created.
    # Instead only the header will be printed so you can see the columns you'll get.
    # The file generator also stops after the first subject when this is set.
    
include_activity_letter     = True
    # When True, every output row (and the header) starts with the activity letter.
include_binned_distrubution = False
    # When True, the x/y/z binned-distribution columns are appended to every row.
number_of_bins = 10 # 10 is used by the publishers.

base_filename = 'corrected_categories_no_bins_'
    # Use a base_filename to differentiate datasets; it is the prefix of
    # both the unnormalized and the normalized output CSV names.
    # The name should end with an underscore.  Example:
    #     'no_bins_'

# Utility Functions

The utility functions do not contribute significantly to an understanding of the data model, so you should not have to read the code in this section unless there is an issue.  The name of each utility function begins with an underscore to distinguish it from similarly-named functions in other libraries.

In [3]:
def _mean( data):
    return sum( data)/ float( len( data))

def _std_dev(data):
    mean = _mean( data)
    return math.sqrt(sum( [(mean - d)**2 for d in data])) / len(data)

def _abs_dev( data):
    '''Return the mean absolute deviation of `data` about its mean.'''
    center = _mean( data)
    total_deviation = 0
    # Accumulate |mean - value| left to right, then average over the count.
    for value in data:
        total_deviation += abs( center - value)
    return total_deviation / len( data)

def _min_max( data): 
    '''Helper function for self.generate_bins()'''
    _min = data[0]
    _max = data[0]
    for d in data:
        if d < _min:
            _min = d
        if d > _max:
            _max = d
    return _min, _max

def _bin_upper_edges( data, n):
    '''Helper function for self.generate_bins()'''
    lower_bound, upper_bound = _min_max( data)
    _range = upper_bound - lower_bound
    interval = _range / float( n)
    return [ lower_bound + i * interval for i in range( n-1)]+[upper_bound]

def _bin_proportions( data, n):
    '''Helper function for self.generate_bins()

    Return a list of `n` floats: the fraction of the values in `data`
    that fall into each bin described by _bin_upper_edges( data, n).
    '''
    edges = _bin_upper_edges( data, n)
    tallies = [0] * n
    for value in data:
        # Default to the last bin; it catches everything above the
        # n-1 edges that are checked explicitly.
        slot = -1
        for i in range( n-1):
            if value <= edges[ i]:
                slot = i
                break
        tallies[ slot] += 1
    size = float( len( data))
    return [tally / size for tally in tallies]


# Data Structures

The primary transformation is chunking the data into ten-second intervals, so there are two custom data structures: a **RawTimeSeriesData** object will contain a single time-series data point, and a **TenSecondInterval** object will control access to the transformed data.

## Data Structures: RawTimeSeriesData

In [4]:
class RawTimeSeriesData:
    ''' RawTimeSeriesData encapsulates access to raw data.

    One instance holds a single sensor reading parsed from one row of a
    raw log file.

    data_list : a sequence of six strings, in this order:
        subject id, activity letter, timestamp, x, y, z
        The z field may carry the semicolon that terminates each raw line.

    Raises IndexError when fewer than six fields are supplied and
    ValueError when a numeric field cannot be parsed.
    '''
    def __init__( self, data_list):
        self.subject_id = int(   data_list[0]) # an integer, 1600-1650
        self.activity   = str(   data_list[1]) # a letter, A-S
        self.timestamp  = int(   data_list[2]) # an integer, Linux Time
        self.x          = float( data_list[3]) # x, y, z are numbers, possibly negative
        self.y          = float( data_list[4])
        # Lines end with a semicolon.  Strip it explicitly instead of
        # slicing off the last character, so a line that lacks the
        # semicolon does not silently lose its final digit.
        self.z          = float( data_list[5].strip().rstrip( ';'))
        
    def __str__( self):
        return f"Record of {self.subject_id} at {self.timestamp}: {self.x}, {self.y}, {self.z}"

## Data Structures: TenSecondInterval

Technically, the interval may not be exactly 10 seconds long - instead the size is exactly 200 data points.  Sensors are supposed to be polled 20 times/second but the actual frequency may vary, since the device being used may prioritize other tasks for CPU time.  A choice had to be made between using the same time interval for each chunk and ensuring that each chunk has the same number of data points.  We went with the second option for statistical reasons - it keeps the resulting dataset-of-chunks homoskedastic.

In [5]:
class TenSecondInterval:
    ''' TenSecondInterval encapsulates access to transformed data.

    An instance is built from a list of RawTimeSeriesData records that all
    belong to one activity; the activity letter is taken from the first
    record.  str( instance) renders one comma-separated output row.

    Target Attributes
    -----------------
        activity         : a letter , A-S
        activity_onehot  : a list of 18 ints, one-hot encoding of the letter
        cat_nonhand      : a boolean, is this a non-hand activity?
        cat_hand_general : a boolean, is this a hand/ general activity?
        cat_hand_eating  : a boolean, is this a hand/ eating activity?
        cat_hand_all     : a boolean, is this any hand activity?

    Data Attributes
    ----------------
        Summary Statistics   : 10 variables
        Binned Distributions : 3 * number_of_bins variables (30 by default)
    '''
    def __init__( self, raw_data_list):
        ''' raw_data_list : a non-empty list of RawTimeSeriesData objects
        '''
        self.data = raw_data_list
        self.activity = self.data[0].activity
        self.generate_summary_stats()
        self.generate_bins()

    @property
    def activity_onehot( self):
        '''A list of 18 ints with a single 1 marking the activity letter.

        Letters A-M map to positions 0-12.  Letters from 'N' onward are
        shifted down by one position (O-S map to 13-17), which matches the
        18 activity columns of the output header, where no 'n' column exists.
        '''
        onehot = [0] * 18
        offset = ord( 'A') if self.activity < 'N' else ord( 'A') + 1
        onehot[ ord( self.activity) - offset] = 1
        return onehot

    @property
    def cat_nonhand( self):
        '''True when the activity letter is in the non-hand group.'''
        return self.activity in { 'A', 'B', 'C', 'D', 'E', 'M'}

    @property
    def cat_hand_general( self):
        '''True when the activity letter is in the hand/ general group.'''
        # 'R' must not carry a leading space, otherwise activity R never
        # matches and is left without any category.
        return self.activity in { 'F', 'G', 'O', 'P', 'Q', 'R', 'S'}

    @property
    def cat_hand_eating( self):
        '''True when the activity letter is in the hand/ eating group.'''
        return self.activity in { 'H', 'I', 'J', 'K', 'L'}

    @property
    def cat_hand_all( self):
        '''True when the activity is in either hand group.'''
        return self.cat_hand_general or self.cat_hand_eating

    @property
    def targets( self):
        '''The 22 target values: the one-hot letter, then the four categories as 0/1.'''
        return self.activity_onehot + [
            int(self.cat_nonhand),
            int(self.cat_hand_general),
            int(self.cat_hand_eating),
            int(self.cat_hand_all)]

    def generate_bins( self):
        '''
        Binned Distributions (3 * number_of_bins variables, 30 by default)
        ------------------------------------------------------------------
                # The bins divide (max - min) into number_of_bins segments.
            x_bin_0, x_bin_1, ..., x_bin_9 : a proportion
            y_bin_0, y_bin_1, ..., y_bin_9 : a proportion
            z_bin_0, z_bin_1, ..., z_bin_9 : a proportion

        The bin count is read from the module-level `number_of_bins`.
        '''
        self.x_bin = _bin_proportions( [d.x for d in self.data], number_of_bins)
        self.y_bin = _bin_proportions( [d.y for d in self.data], number_of_bins)
        self.z_bin = _bin_proportions( [d.z for d in self.data], number_of_bins)

    def generate_summary_stats( self):
        '''
        Summary Statistics (10 variables)
        ---------------------------------
            {x,y,z}_mean    : the mean value
            {x,y,z}_std_dev : standard deviation
            {x,y,z}_abs_dev : mean absolute deviation
            resultant       : mean resultant value where resultant =
                = sqrt( x^2 + y^2 + z^2) # averaged over all data points
        '''
        xs = [d.x for d in self.data]
        self.x_mean    = _mean(    xs)
        self.x_std_dev = _std_dev( xs)
        self.x_abs_dev = _abs_dev( xs)

        ys = [d.y for d in self.data]
        self.y_mean    = _mean(    ys)
        self.y_std_dev = _std_dev( ys)
        self.y_abs_dev = _abs_dev( ys)

        zs = [d.z for d in self.data]
        self.z_mean    = _mean(    zs)
        self.z_std_dev = _std_dev( zs)
        self.z_abs_dev = _abs_dev( zs)

        # Average over the records themselves rather than over whichever
        # per-axis list happened to be assigned last.
        magnitudes = [math.sqrt( d.x**2 + d.y**2 + d.z**2) for d in self.data]
        self.resultant = sum( magnitudes) / float( len( self.data))

    def __str__( self):
        '''Render this interval as one comma-separated output row.

        Which columns appear is controlled by the module-level flags
        `include_activity_letter` and `include_binned_distrubution`.
        '''
        fields = list()
        if include_activity_letter:
            # append, not +=: extending a list with a string would add
            # one element per character.
            fields.append( self.activity)
        fields += self.targets
        fields += [self.x_mean, self.y_mean, self.z_mean]
        fields += [self.x_std_dev, self.y_std_dev, self.z_std_dev]
        fields += [self.x_abs_dev, self.y_abs_dev, self.z_abs_dev]
        fields += [self.resultant]
        if include_binned_distrubution:
            fields += self.x_bin
            fields += self.y_bin
            fields += self.z_bin
        return ','.join( [str(field) for field in fields])

# Generators

The memory footprint is kept manageable by a couple of generators: the **File Generator**, which provides the path to one file at a time, and the **Time-series Data Chunker**, which puts together a new chunk one line at a time when requested so the whole file is never in memory.

## File Generator

In [6]:
def file_generator( device, sensor):
    '''Yield the path of each subject's raw log file, one at a time.

    Paths have the form <cwd>/raw/<device>/<sensor>/data_<user>_<sensor>_<device>.txt
    for user ids 1600 through 1650.  When the module-level `test_only`
    flag is set, only the first path is produced.
    '''
    user = 1600
    while user <= 1650:
        yield pathlib.Path.cwd().joinpath(
            'raw', device, sensor, f"data_{user}_{sensor}_{device}.txt")

        if test_only:
            return
        user += 1

## Time-series Data Chunker

In [7]:
def ten_second_intervals( csv_file_path, chunk_size=200):
    '''This generator yields one TenSecondInterval per full chunk of records.

    csv_file_path - path to a raw log file whose comma-separated rows hold
        subject id, activity letter, timestamp, x, y, z (see RawTimeSeriesData).
    chunk_size    - number of consecutive same-activity records per chunk.

    Records are read one row at a time, so the whole file is never held in
    memory.  When the activity letter changes, the partially filled chunk
    of the previous activity is discarded and a new chunk starts with the
    record that triggered the change.  A partial chunk left over at the
    end of the file is discarded as well.
    '''
    # newline='' is the open mode the csv module documents for its readers.
    with open( csv_file_path, newline='') as fh:
        current_activity = None
        current_chunk = list()
        for line in csv.reader( fh):
            if not line:
                # csv.reader yields an empty list for a blank line.
                continue
            raw_data = RawTimeSeriesData( line)
            if raw_data.activity != current_activity:
                # In this case, there were not enough data to make a chunk.
                current_activity = raw_data.activity
                current_chunk = list()
            # The record is kept even when it opens a new activity, so the
            # first reading of each activity is not lost.
            current_chunk.append( raw_data)
            if len( current_chunk) == chunk_size:
                yield TenSecondInterval( current_chunk)
                current_chunk = list()

# Main Script

In [8]:
# Build the CSV header.  The column order must match TenSecondInterval.__str__.
labels = ''
if include_activity_letter:
    labels += 'activity_letter,'
labels += 'activity_a,activity_b,activity_c,activity_d,activity_e,activity_f,activity_g,activity_h,activity_i,activity_j,activity_k,activity_l,activity_m,activity_o,activity_p,activity_q,activity_r,activity_s,cat_nonhand,cat_hand_general,cat_hand_eating,cat_hand_all,x_mean,y_mean,z_mean,x_std_dev,y_std_dev,z_std_dev,x_abs_dev,y_abs_dev,z_abs_dev,resultant'
if include_binned_distrubution:
    labels += ','
    labels += ','.join(
        [f'x_bin_{i}' for i in range(number_of_bins)] +
        [f'y_bin_{i}' for i in range(number_of_bins)] +
        [f'z_bin_{i}' for i in range(number_of_bins)]
    )

# Target columns are labels, not measurements, so they are never normalized.
# The range covers 'a' through 's'; 'activity_n' is not a real column, which is harmless.
targets = set(map( lambda i: f"activity_{chr(i)}", range( 97, 116))) | {'activity_letter', 'cat_nonhand', 'cat_hand_general', 'cat_hand_eating', 'cat_hand_all'}

for device in ( 'phone', 'watch'):
    if test_only:
        # Test mode: show the columns and create no files.
        print( labels)
        break
        
    for sensor in ( 'accel', 'gyro'):
        print( f"Building {device} {sensor} dataset...")
        unnormalized_data = f"{base_filename}unnormalized_{device}_{sensor}.csv"
        
        # Pass 1: stream every chunk of every subject file into the unnormalized CSV.
        with open( unnormalized_data, "w") as fh:
            fh.write( labels + '\n')
            for sensor_log in tqdm.tqdm(file_generator( device, sensor)):
                for chunk in ten_second_intervals( sensor_log):
                    fh.write( str(chunk) + "\n")
 
        # Pass 2: z-score every non-target column and write the normalized CSV.
        df = pd.read_csv( unnormalized_data)
        for col in df:
            if col not in targets:
                col_mean = df[col].mean()
                col_std  = df[col].std()
                if col_std > 0:
                    df[ col] = (df[ col] - col_mean) / col_std
                else:
                    # A constant column (std of 0, or NaN for a single row)
                    # would otherwise turn into NaN; center it only.
                    df[ col] = df[ col] - col_mean
        df.to_csv( f'{base_filename}normalized_{device}_{sensor}.csv', index=False)


0it [00:00, ?it/s]

Building phone accel dataset...


51it [01:17,  1.52s/it]
0it [00:00, ?it/s]

Building phone gyro dataset...


51it [00:59,  1.17s/it]
0it [00:00, ?it/s]

Building watch accel dataset...


51it [01:19,  1.57s/it]
0it [00:00, ?it/s]

Building watch gyro dataset...


51it [01:24,  1.66s/it]
