In [28]:
import pandas as pd
import numpy as np
import dask.dataframe as dd

from dtw import dtw

import math
import ctypes

import pickle 

import multiprocessing
from joblib import Parallel, delayed

In [5]:
# Open every parquet part of the merged table as a single Dask DataFrame.
# The path is a plain string: the original f-string had no placeholders.
dd_the_big_table = dd.read_parquet(
    '../data/db_merged.parquet/*.parquet',
    engine='fastparquet',
)

In [6]:
# Display the Dask frame's structure (column names, dtypes and partition count).
dd_the_big_table

Unnamed: 0_level_0,activity,collar_id,u_id_coleira,gx,gy,gz,ax,ay,az,temp,time_stamp,pet_id,size,race,age,genre,owner_id,lat,long,date_time
npartitions=50,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
,object,int64,object,float64,float64,float64,int64,int64,int64,int64,int64,int64,object,object,object,object,int64,object,object,datetime64[ns]
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [8]:
# Materialise the distinct pet identifiers found in the table.
pet_ids = (
    dd_the_big_table['pet_id']
    .unique()
    .compute()
)

In [13]:
# Spot-check a single pet id.
pet_ids[5]

1578500180

In [18]:
# Helpers that convert raw accelerometer / gyroscope readings into
# gravity (G) and degrees per second (DPS) respectively.


def _as_int16(raw):
    """Truncate ``raw`` to an int and reinterpret it as a signed 16-bit value."""
    return ctypes.c_int16(int(raw)).value


def accel(x):
    """Convert a raw accelerometer reading to G (Gravity).

    The signed 16-bit reading is scaled by 32 / 65536
    (presumably a +/-16 g full-scale range; confirm against the sensor setup).
    """
    return (_as_int16(x) * 32) / 65536


def gyro(x):
    """Convert a raw gyroscope reading to DPS (Degree per second).

    The signed 16-bit reading is scaled by 4000 / 65536
    (presumably a +/-2000 dps full-scale range; confirm against the sensor setup).
    """
    return (_as_int16(x) * 4000) / 65536

def convert_values(df, conversion_type):
    """Apply ``conversion_type`` to every non-missing value of a Series.

    Parameters
    ----------
    df : pandas.Series
        One-dimensional values to convert. Despite the name this is a Series,
        e.g. a single row handed over by ``DataFrame.apply(..., axis=1)``.
    conversion_type : callable
        Function taking one scalar and returning its converted value
        (for example ``accel`` or ``gyro``).

    Returns
    -------
    pandas.Series
        A new Series with the same index and name as ``df``. Missing entries
        (``None`` / ``NaN``) are passed through unconverted.

    Notes
    -----
    The input is never modified. Writing converted floats back into the
    caller's (possibly integer-typed) Series relied on silent dtype upcasting
    and mutated the object that ``DataFrame.apply`` passes in.
    """
    # Nothing to convert; also avoids dtype inference on an empty list.
    if len(df) == 0:
        return df.copy()

    converted = [
        value if pd.isna(value) else conversion_type(value)
        for value in df
    ]
    return pd.Series(converted, index=df.index, name=df.name)

In [25]:
def slide_data(pet_id, jump=10):
    """Label consecutive accelerometer windows of one pet and pickle the result.

    The pet's rows are read from the module-level ``dd_the_big_table``, the
    raw ``ax``/``ay``/``az`` readings are converted with ``convert_values`` and
    ``accel``, and the magnitude ``a_xyz`` is computed. The rows are then
    ordered by ``date_time`` and cut into consecutive windows of ``jump`` rows.
    Each window is compared with the window that follows it using DTW with an
    absolute-difference cost; the resulting distance and the variance of the
    window are stored on every row of the window (``dist`` / ``variance``).

    Rows that do not belong to a window with a complete following window are
    dropped. The remaining rows get a ``type`` column:

    * ``'constant'``   -- dist < 0.1 and variance < 0.00001
    * ``'repetitive'`` -- dist < 1 and variance > 0.1
    * ``'ordinary'``   -- everything else

    Parameters
    ----------
    pet_id
        Value matched against the ``pet_id`` column.
    jump : int, optional
        Number of rows per window. Defaults to 10, the window used for the
        10Hz sensor.

    Returns
    -------
    None
        The labelled frame is written to ``../data/dtw/{pet_id}.p``.

    Raises
    ------
    ValueError
        If ``jump`` is smaller than 1.
    """
    if jump < 1:
        raise ValueError("jump must be a positive number of rows")

    # Thresholds separating low-variance and repetitive windows from ordinary ones.
    low_var_max_dist = 0.1
    low_var_max_variance = 0.00001
    repetition_max_dist = 1
    repetition_min_variance = 0.1

    def manhattan_distance(x, y):
        # Manhattan distance is used to calculate the similarity of two sequences
        return np.abs(x - y)

    # Keep only this pet's rows, partition by partition, then load them.
    df = dd_the_big_table.map_partitions(lambda part: part[part['pet_id'] == pet_id]).compute()
    df[['ax', 'ay', 'az']] = df[['ax', 'ay', 'az']].apply(convert_values, conversion_type=accel, axis=1)
    df['a_xyz'] = np.sqrt(df['ax']**2 + df['ay']**2 + df['az']**2)
    df = df.sort_values(by=['date_time']).reset_index()

    magnitude = df['a_xyz'].values
    n_rows = len(df)

    # Per-row results; NaN marks rows whose window was never compared.
    window_dist = np.full(n_rows, np.nan)
    window_variance = np.full(n_rows, np.nan)

    # Only start a window when both it and the following window are complete.
    # This replaces the label-based lookup that ran past the end of the index
    # and the `continue` that never advanced the cursor.
    for start in range(0, n_rows - 2 * jump + 1, jump):
        middle = start + jump
        current = magnitude[start:middle]
        following = magnitude[middle:middle + jump]

        # calculates similarity between the two sequences
        d, _, _, _ = dtw(current, following, dist=manhattan_distance)

        window_dist[start:middle] = d
        window_variance[start:middle] = np.var(current)

    # Both columns always exist, even when no window could be formed.
    df['dist'] = window_dist
    df['variance'] = window_variance

    # Defines a threshold to determine if a wave is a repetition or a low variance movement.
    df = df.dropna(subset=['dist']).copy()
    is_low_var = (df['dist'] < low_var_max_dist) & (df['variance'] < low_var_max_variance)
    is_repetition = (df['dist'] < repetition_max_dist) & (df['variance'] > repetition_min_variance)

    df['type'] = 'ordinary'
    df.loc[is_low_var, 'type'] = 'constant'
    df.loc[is_repetition, 'type'] = 'repetitive'

    # Context manager guarantees the file is flushed and closed.
    with open(f"../data/dtw/{pet_id}.p", "wb") as handle:
        pickle.dump(df, handle)

    

In [26]:
# CPU count reported by the OS; used below as the number of joblib jobs.
num_cores = multiprocessing.cpu_count()

In [39]:
# Number of distinct pets, i.e. how many slide_data jobs will be submitted.
len(pet_ids)

44

In [40]:
%%time
# Use wisely: this submits one slide_data job per pet id, with num_cores jobs
# running at a time. Every job calls .compute() on the big table and writes a
# pickle under ../data/dtw/, so it is heavy on CPU, memory and disk (the
# recorded run took about 45 minutes of wall time).
# slide_data returns nothing, so the displayed result is a list of None.
Parallel(n_jobs=num_cores)(delayed(slide_data)(i) for i in pet_ids)



CPU times: user 3.31 s, sys: 772 ms, total: 4.09 s
Wall time: 45min 38s


[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]