In [2]:
import pandas as pd
import numpy as np
import dask.dataframe as dd
from dtw import dtw
import math
import ctypes
import pickle 
import multiprocessing
from joblib import Parallel, delayed

In [2]:
# Open every parquet part of the merged dataset as a single (lazy) Dask DataFrame.
# The path has no placeholders, so it is a plain string rather than an f-string.
dd_the_big_table = dd.read_parquet('../data/db_merged.parquet/*.parquet', engine='fastparquet')

In [3]:
# Display the lazy Dask DataFrame: only column names, dtypes and the partition
# count are shown; no rows are loaded at this point.
dd_the_big_table

Unnamed: 0_level_0,activity,collar_id,u_id_coleira,gx,gy,gz,ax,ay,az,temp,time_stamp,pet_id,size,race,age,genre,owner_id,lat,long,date_time
npartitions=50,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
,object,int64,object,float64,float64,float64,int64,int64,int64,int64,int64,int64,object,object,object,object,int64,object,object,datetime64[ns]
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [6]:
# (rows, columns) of the full table. Counting rows with len() avoids
# materialising the entire dataset in memory just to read its shape.
(len(dd_the_big_table), len(dd_the_big_table.columns))

(3847684, 20)

In [7]:
# Distinct pet identifiers present in the table (materialised as a pandas object);
# each one is later processed independently by slide_data.
pet_ids = (
    dd_the_big_table['pet_id']
    .unique()
    .compute()
)

In [15]:
def manhattan_distance(x, y):
    """Return the absolute difference ``|x - y|`` (element-wise for arrays).

    Passed to ``dtw`` as the point-to-point cost function.
    """
    return np.abs(x - y)


def agg(df):
    """Collapse one group of rows into a single record.

    Meant to be used with ``groupby(...).apply(agg)``: all ``a_xyz`` values and
    all index labels of the group are packed into one array each, so a single
    output row carries the information of the whole group (10 rows per group
    in this notebook).

    Parameters
    ----------
    df : pandas.DataFrame
        The rows of one group; must contain an ``a_xyz`` column.

    Returns
    -------
    pandas.Series
        ``l_xyz`` - array with the group's ``a_xyz`` values;
        ``idxs`` - array with the group's index labels.
    """
    packed = {
        'l_xyz': df['a_xyz'].values,
        'idxs': df.index.values,
    }
    return pd.Series(packed)

def time_wraping(x, df_full, dist_threshold=1.3, var_threshold=0.001):
    """Compare one window with the following one via DTW and classify it.

    Designed for ``df_full.apply(time_wraping, df_full=df_full, axis=1)``.

    Parameters
    ----------
    x : pandas.Series
        One row of ``df_full``; must provide ``index`` (the row's label) and
        ``l_xyz`` (the window's samples).
    df_full : pandas.DataFrame
        The complete frame of windows, indexed so that ``x['index'] + 1`` is
        the label of the next window.
    dist_threshold : float, default 1.3
        A window can only be 'repetitive' if its DTW distance to the next
        window is below this value.
    var_threshold : float, default 0.001
        A window can only be 'repetitive' if the variance of its samples is
        above this value.

    Returns
    -------
    pandas.Series
        ``[distance, variance, label]`` where label is ``'repetitive'`` or
        ``'ordinary'``. For the last window (no successor) all three entries
        are missing values.
    """
    next_idx = x['index'] + 1

    # The last window has nothing to be compared with. Return an explicit row
    # of missing values instead of an implicit None, so DataFrame.apply always
    # receives a 3-element Series (a frame with a single window would
    # otherwise produce a result that cannot be assigned to three columns).
    if next_idx not in df_full.index:
        return pd.Series([np.nan, np.nan, None])

    window = x['l_xyz']
    next_window = df_full.loc[next_idx, 'l_xyz']

    # Only the first element returned by dtw (the distance) is used.
    d, _, _, _ = dtw(window, next_window, dist=manhattan_distance)
    var = np.var(window)

    # A movement is repetitive when consecutive windows are similar (small DTW
    # distance) while the signal is not flat (variance above the threshold).
    if d < dist_threshold and var > var_threshold:
        type_mov = 'repetitive'
    else:
        type_mov = 'ordinary'

    return pd.Series([d, var, type_mov])

In [18]:
def slide_data(pet_id, window_size=10):
    """Compute and persist DTW-based movement labels for one pet.

    The pet's rows are pulled from the module-level ``dd_the_big_table``,
    ordered by time, cut into consecutive windows of ``window_size`` samples
    and each window is compared with the next one by ``time_wraping``.
    The resulting frame (columns ``l_xyz``, ``idxs``, ``index``, ``dist``,
    ``variance``, ``type``) is pickled to ``../data/dtw_new/{pet_id}.p``.

    Parameters
    ----------
    pet_id : int
        Value of the ``pet_id`` column selecting the rows to process.
    window_size : int, default 10
        Number of consecutive samples packed into one window.

    Returns
    -------
    None
        The result is written to disk only.
    """
    # Load a single pet into memory; the filter runs on each Dask partition.
    df = dd_the_big_table.map_partitions(
        lambda part: part[part['pet_id'] == pet_id]
    ).compute()

    # Epoch milliseconds -> timestamps, converted in one vectorised call.
    df['date_time'] = pd.to_datetime(df['time_stamp'], unit='ms')

    # Convert the raw accelerometer readings: reinterpret each value as a
    # signed 16-bit integer (wraps exactly like ctypes.c_int16), then scale by
    # 32/65536. The cast to float happens before the multiplication so the
    # product cannot overflow the 16-bit type.
    for axis in ('ax', 'ay', 'az'):
        signed = df[axis].to_numpy(dtype=np.int64).astype(np.int16)
        df[axis] = signed.astype(np.float64) * 32 / 65536

    # Magnitude of the 3-axis acceleration vector.
    df['a_xyz'] = np.sqrt(df['ax']**2 + df['ay']**2 + df['az']**2)

    # Chronological order with a fresh 0..n-1 index, which the integer
    # division below relies on to form the windows.
    df = df.sort_values(by=['date_time']).reset_index(drop=True)

    # Pack every `window_size` consecutive rows into one record.
    df_grouped = df.groupby(df.index // window_size).apply(agg)

    # time_wraping needs each row's own label to look up its successor.
    df_grouped['index'] = df_grouped.index

    # Distance to the next window, variance, and the ordinary/repetitive label.
    # (Previously this applied over an undefined name `a`, which only worked
    # when that variable happened to be left over in the kernel.)
    df_grouped[['dist', 'variance', 'type']] = df_grouped.apply(
        time_wraping, df_full=df_grouped, axis=1
    )

    # Context manager guarantees the file is flushed and closed.
    with open(f"../data/dtw_new/{pet_id}.p", "wb") as fh:
        pickle.dump(df_grouped, fh)

In [19]:
# Number of CPUs reported by the OS; used as the number of parallel joblib workers.
num_cores = multiprocessing.cpu_count()

In [20]:
# Number of distinct pets, i.e. how many slide_data jobs will be dispatched.
len(pet_ids)

44

In [21]:
%%time
# Use wisely
Parallel(n_jobs=num_cores)(delayed(slide_data)(i) for i in pet_ids)



CPU times: user 1.59 s, sys: 279 ms, total: 1.87 s
Wall time: 4min 35s


[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [5]:
# Load one pet's result back for inspection. The context manager closes the
# file handle, which a bare open() inside pickle.load() never does.
# NOTE: only unpickle files produced by this notebook - pickle.load can run
# arbitrary code from untrusted files.
with open('../data/dtw_new/1569850690.p', 'rb') as fh:
    df = pickle.load(fh)

In [6]:
# Preview the first windows: packed samples, their row ids, the window label,
# and the DTW distance / variance / movement type computed for each.
df.head()

Unnamed: 0,l_xyz,idxs,index,dist,variance,type
0,"[0.16917377525890814, 1.054921442153391, 0.920...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]",0,1.921539,0.092907,ordinary
1,"[1.0404718922196532, 1.0077247285695419, 0.999...","[10, 11, 12, 13, 14, 15, 16, 17, 18, 19]",1,0.787001,0.004046,repetitive
2,"[1.068288071953602, 1.0530215068189455, 1.0660...","[20, 21, 22, 23, 24, 25, 26, 27, 28, 29]",2,1.82509,0.013561,ordinary
3,"[0.8446561398119916, 0.9670023998321452, 1.022...","[30, 31, 32, 33, 34, 35, 36, 37, 38, 39]",3,1.343677,0.03421,ordinary
4,"[1.1122970781744221, 1.0009802543758137, 0.782...","[40, 41, 42, 43, 44, 45, 46, 47, 48, 49]",4,0.839506,0.012414,repetitive
