In [1]:
import os
import numpy as np
import pandas as pd
from datetime import datetime as dt
import gc
from tqdm import tqdm
import pyarrow.parquet as pq
import pyarrow as pa

# 각 데이터별 날짜 단위로 모으기

In [3]:
data_dir ="../ETRI_data/test/raw"

# 스마트폰에서 인식된 행동 분류값. 1분마다 1회씩 기록
file_name = "ch2024_test_m_activity.parquet.gzip"
act = pd.read_parquet(os.path.join(data_dir, file_name))

# 스마트워치에서 측정된 심박 데이터. 1초 간격으로 측정
file_name = "ch2024_test_w_heart_rate.parquet.gzip"
hr = pd.read_parquet(os.path.join(data_dir, file_name))

#스마트폰에서 산출된 GPS 좌표 정보(단, 위도 및 경도는 상대 좌표로 변환됨). 5초 간격(1분당 약 12회)으로 측정됨.
file_name = "ch2024_test_m_gps.parquet.gzip"
gps = pd.read_parquet(os.path.join(data_dir, file_name))

# heart_rate

In [5]:
hr['date'] = hr['timestamp'].astype(str).apply(lambda x:x.split(" ")[0])
hr_pt = pd.pivot_table(hr,
                        values=['heart_rate'],
                        index=['subject_id','date'],
                        aggfunc={'heart_rate':list})
hr_pt['heart_rate'].apply(np.array)
hr_pt.reset_index(inplace=True)
hr_pt.columns = ['user', 'date', 'hr']

# parquet으로 저장
# print('Saving..')
# table = pa.Table.from_pandas(hr_pt)
# save_dir = os.path.join(sav_dir, target+'_day.parquet')
# pq.write_table(table, save_dir)

In [6]:
hr_pt.head()

Unnamed: 0,user,date,hr
0,5,2023-11-05,"[90, 65, 86, 89, 80, 94, 91, 90, 89, 92, 86, 8..."
1,5,2023-11-06,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,5,2023-11-07,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,5,2023-11-08,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,5,2023-11-09,"[74, 68, 72, 70, 70, 65, 71, 71, 73, 86, 87, 7..."


# GPS

In [7]:
gps['date'] =gps['timestamp'].astype(str).apply(lambda x:x.split(" ")[0])

In [8]:
gps.head()

Unnamed: 0,subject_id,timestamp,altitude,latitude,longitude,speed,date
0,5,2023-11-05 02:25:01,95.734328,0.03837,0.028696,0.126473,2023-11-05
1,5,2023-11-05 02:25:06,95.734328,0.038373,0.028697,0.03271,2023-11-05
2,5,2023-11-05 02:25:09,95.734328,0.038373,0.028704,0.237968,2023-11-05
3,5,2023-11-05 02:25:12,95.734328,0.038367,0.028711,0.146265,2023-11-05
4,5,2023-11-05 02:25:15,95.734328,0.038362,0.028713,0.051259,2023-11-05


In [9]:
import math
def measure(lat1, lon1, lat2, lon2):
    R = 6378.137  # Radius of earth in KM
    dLat = lat2 * math.pi / 180 - lat1 * math.pi / 180
    dLon = lon2 * math.pi / 180 - lon1 * math.pi / 180
    a = math.sin(dLat / 2) * math.sin(dLat / 2) + \
        math.cos(lat1 * math.pi / 180) * math.cos(lat2 * math.pi / 180) * \
        math.sin(dLon / 2) * math.sin(dLon / 2)
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    d = R * c
    return d * 1000  # meters

In [10]:
gps_distance = []
for (k1, k2), df in gps.groupby(['subject_id', 'date']):
    lat = df['latitude'].values
    lon = df['longitude'].values
    gps_map = []
    for i in range(len(lat)-1):
        lat1, lon1 = lat[i], lon[i]
        lat2, lon2 = lat[i+1], lon[i+1]
        distance = measure(lat1, lon1, lat2, lon2)
        gps_map.append(distance)
    gps_map.insert(0, 0)
    gps_distance.append(gps_map)

# gps_distance
gps_result = sum(gps_distance, [])
gps['distance'] = gps_result
gps.head()

Unnamed: 0,subject_id,timestamp,altitude,latitude,longitude,speed,date,distance
0,5,2023-11-05 02:25:01,95.734328,0.03837,0.028696,0.126473,2023-11-05,0.0
1,5,2023-11-05 02:25:06,95.734328,0.038373,0.028697,0.03271,2023-11-05,0.347951
2,5,2023-11-05 02:25:09,95.734328,0.038373,0.028704,0.237968,2023-11-05,0.836083
3,5,2023-11-05 02:25:12,95.734328,0.038367,0.028711,0.146265,2023-11-05,0.991931
4,5,2023-11-05 02:25:15,95.734328,0.038362,0.028713,0.051259,2023-11-05,0.632271


In [12]:
gps_pt = pd.pivot_table(gps,
                        values=['latitude', 'longitude', 'distance'],
                        index=['subject_id','date'],
                        aggfunc={'latitude':list,
                                'longitude':list,
                                'distance':sum,})
gps_pt['latitude'].apply(np.array)
gps_pt['longitude'].apply(np.array)
gps_pt.reset_index(inplace=True)
gps_pt.columns = ['user', 'date', 'distance', 'lat', 'lon']
display(gps_pt.head())
# gps_pt['distance'] = [sum(x) for x in gps_pt['distance']]


# parquet으로 저장
# print('Saving..')
# table = pa.Table.from_pandas(gps_pt)
# save_dir = os.path.join(sav_dir, target+'_day.parquet')
# pq.write_table(table, save_dir)

  gps_pt = pd.pivot_table(gps,


Unnamed: 0,user,date,distance,lat,lon
0,5,2023-11-05,38208.796994,"[0.03836989999999929, 0.03837299999999999, 0.0...","[0.02869619999999884, 0.028696600000003514, 0...."
1,5,2023-11-06,44822.136676,"[0.038427099999999825, 0.03844070000000244, 0....","[0.02876349999999661, 0.028766099999998573, 0...."
2,5,2023-11-07,46311.068886,"[0.03864459999999781, 0.03863460000000174, 0.0...","[0.028709000000006313, 0.028709100000000376, 0..."
3,5,2023-11-08,38487.174434,"[0.038544600000001594, 0.03853850000000136, 0....","[0.028833899999995083, 0.028838399999997932, 0..."
4,5,2023-11-09,42850.587606,"[0.03850620000000049, 0.038504599999996, 0.038...","[0.02893129999999644, 0.02893240000000219, 0.0..."


# Activity

In [13]:
act['date'] =act['timestamp'].astype(str).apply(lambda x:x.split(" ")[0])
act_pt = pd.pivot_table(act,
                        values=['m_activity'],
                        index=['subject_id','date'],
                        aggfunc={'m_activity':list,})
act_pt['m_activity'].apply(np.array)
act_pt.reset_index(inplace=True)
act_pt.columns = ['user', 'date', 'activity']

# # parquet으로 저장
# print('Saving..')
# table = pa.Table.from_pandas(act_pt)
# save_dir = os.path.join(sav_dir, target+'_day.parquet')
# pq.write_table(table, save_dir)

In [14]:
act_pt.head()

Unnamed: 0,user,date,activity
0,5,2023-11-05,"[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ..."
1,5,2023-11-06,"[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ..."
2,5,2023-11-07,"[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ..."
3,5,2023-11-08,"[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ..."
4,5,2023-11-09,"[3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, ..."


In [15]:
#<activity 전처리>
act_pt['len']=act_pt['activity'].apply(len)

# 하루 중 해당 activity의 비율
for i in range(0,7):
    act_pt['act'+str(i)]=[list(x).count(i) for x in act_pt['activity']]/act_pt['len']

act_pt = act_pt.drop(columns=['activity', 'len'])
act_pt

Unnamed: 0,user,date,act0,act1,act2,act3,act4,act5,act6
0,5,2023-11-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5,2023-11-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5,2023-11-07,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5,2023-11-08,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,2023-11-09,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
110,8,2023-11-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0
111,8,2023-11-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0
112,8,2023-11-07,0.0,0.0,0.0,0.0,0.0,0.0,0.0
113,8,2023-11-08,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
act_pt.to_parquet('../ETRI_data/test/test_day/act_day.parquet', engine='pyarrow', compression='gzip')
hr_pt.to_parquet('../ETRI_data/test/test_day/hr_day.parquet', engine='pyarrow', compression='gzip')
gps_pt.to_parquet('../ETRI_data/test/test_day/gps_day.parquet', engine='pyarrow', compression='gzip')

# Accelerator

In [18]:
data_path ="../ETRI_data/test/raw"
for i in tqdm(range(5,9)):
    file_name = "ch2024_test__m_acc_part_"+str(i)+".parquet.gzip"
    acc_df = pd.read_parquet(os.path.join(data_path, file_name))
    acc_df['date']=acc_df['timestamp'].astype(str).apply(lambda x:x.split(".")[0])
    
    # 데이터 프레임에 수정한 timestamp 추가
    acc_sample_df = acc_df.groupby('date').apply(lambda x:x.iloc[0]).reset_index(drop=True)
    acc_sample_df.drop(columns = ['date'], inplace=True)
    display(acc_sample_df.head())
    acc_sample_df.to_parquet('../ETRI_data/test/acc/acc'+str(i)+'_sample.parquet.gzip', engine='pyarrow', compression='gzip')

    del file_name, acc_sample_df, acc_df
    gc.collect()

  0%|                                                                                                                                                                                                | 0/4 [00:00<?, ?it/s]

Unnamed: 0,subject_id,timestamp,x,y,z
0,5,2023-11-05 00:00:00.022,-0.088585,0.105345,9.749489
1,5,2023-11-05 00:00:01.021,-0.081403,0.126893,9.751883
2,5,2023-11-05 00:00:02.000,-0.055067,0.126893,9.759066
3,5,2023-11-05 00:00:03.001,-0.083797,0.095768,9.756671
4,5,2023-11-05 00:00:04.019,-0.055067,0.110133,9.759066


 25%|█████████████████████████████████████████████▊                                                                                                                                         | 1/4 [02:05<06:17, 125.94s/it]

Unnamed: 0,subject_id,timestamp,x,y,z
0,6,2023-10-06 00:00:00.020,0.069392,0.040678,9.822537
1,6,2023-10-06 00:00:01.018,0.057428,0.045464,9.817751
2,6,2023-10-06 00:00:02.016,0.055035,0.045464,9.829716
3,6,2023-10-06 00:00:03.015,0.055035,0.035892,9.82493
4,6,2023-10-06 00:00:04.013,0.055035,0.064606,9.803394


 50%|███████████████████████████████████████████████████████████████████████████████████████████▌                                                                                           | 2/4 [05:34<05:48, 174.35s/it]

Unnamed: 0,subject_id,timestamp,x,y,z
0,7,2023-11-02 00:10:00.003,9.236296,0.334098,-3.279962
1,7,2023-11-02 00:10:01.000,9.230313,0.360419,-3.299104
2,7,2023-11-02 00:10:02.017,9.233903,0.332901,-3.289533
3,7,2023-11-02 00:10:03.016,9.235099,0.349651,-3.289533
4,7,2023-11-02 00:10:04.014,9.247063,0.344866,-3.309872


 75%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                             | 3/4 [08:56<03:07, 187.22s/it]

Unnamed: 0,subject_id,timestamp,x,y,z
0,8,2023-10-08 00:00:00.025,0.249005,0.098165,9.478934
1,8,2023-10-08 00:00:01.005,0.227456,0.10056,9.510059
2,8,2023-10-08 00:00:02.003,0.241822,0.122108,9.493299
3,8,2023-10-08 00:00:03.002,0.201119,0.086194,9.536397
4,8,2023-10-08 00:00:04.004,0.237033,0.114925,9.514848


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [11:59<00:00, 179.78s/it]


In [19]:
from scipy.fftpack import fft  
from scipy.fftpack import fftfreq
from scipy.fftpack import ifft
import math 
import scipy as sp

def median(signal):
    array=np.array(signal)   
    med_filtered=sp.signal.medfilt(array, kernel_size=3)
    return  med_filtered

sampling_freq = 50
def components_selection_one_signal(t_signal, sampling_freq):
    t_signal=np.array(t_signal)
    t_signal_length=len(t_signal)
    f_signal=fft(t_signal)
    freqs=np.array(sp.fftpack.fftfreq(t_signal_length, d=1/float(sampling_freq)))# frequency values between [-25hz:+25hz]

    f_DC_signal=[] # DC_component in freq domain
    f_body_signal=[] # body component in freq domain numpy.append(a, a[0])
    f_noise_signal=[] # noise in freq domain

    for i in range(len(freqs)):# iterate over all available frequencies

        # selecting the frequency value
        freq=freqs[i]

        # selecting the f_signal value associated to freq
        value= f_signal[i]

        # Selecting DC_component values 
        if abs(freq)>0.3:# testing if freq is outside DC_component frequency ranges
            f_DC_signal.append(float(0)) # add 0 to  the  list if it was the case (the value should not be added)                                       
        else: # if freq is inside DC_component frequency ranges 
            f_DC_signal.append(value) # add f_signal value to f_DC_signal list

        # Selecting noise component values 
        if (abs(freq)<=20):# testing if freq is outside noise frequency ranges 
            f_noise_signal.append(float(0)) # # add 0 to  f_noise_signal list if it was the case 
        else:# if freq is inside noise frequency ranges 
            f_noise_signal.append(value) # add f_signal value to f_noise_signal

        # Selecting body_component values 
        if (abs(freq)<=0.3 or abs(freq)>20):# testing if freq is outside Body_component frequency ranges
            f_body_signal.append(float(0))# add 0 to  f_body_signal list
        else:# if freq is inside Body_component frequency ranges
            f_body_signal.append(value) # add f_signal value to f_body_signal list

    t_DC_component= ifft(np.array(f_DC_signal)).real
    t_body_component= ifft(np.array(f_body_signal)).real
    t_noise=ifft(np.array(f_noise_signal)).real

    total_component=t_signal-t_noise # extracting the total component(filtered from noise) 

    return (total_component,t_DC_component,t_body_component,t_noise) 

In [22]:
data_path ="../ETRI_data/test/acc"
for i in tqdm(range(5, 9)):
    file_name = "acc"+str(i)+"_sample.parquet.gzip"
    acc_df = pd.read_parquet(os.path.join(data_path, file_name))
    display(acc_df.head())

    x = acc_df['x']
    y = acc_df['y']
    z = acc_df['z']

    #median
    med_f_x = median(x)
    med_f_y = median(y)
    med_f_z = median(z)
    acc_df['x_med'] = med_f_x
    acc_df['y_med'] = med_f_y
    acc_df['z_med'] = med_f_z

    #butter pca
    total_x, gravity_x, body_x, noise_x = components_selection_one_signal(med_f_x, sampling_freq)
    total_y, gravity_y, body_y, noise_y = components_selection_one_signal(med_f_y, sampling_freq)
    total_z, gravity_z, body_z, noise_z = components_selection_one_signal(med_f_z, sampling_freq)
    acc_df['x_gravity'] = gravity_x
    acc_df['y_gravity'] = gravity_y
    acc_df['z_gravity'] = gravity_z
    acc_df['x_body'] = body_x
    acc_df['y_body'] = body_y
    acc_df['z_body'] = body_z

    display(acc_df.head())
    acc_df.to_parquet('../ETRI_data/test/acc/acc'+str(i)+'_final.parquet', engine='pyarrow', compression='gzip')
    
    del file_name, acc_df, x, y, z, med_f_x, med_f_y, med_f_z, total_x, gravity_x, body_x, noise_x, total_y, gravity_y, body_y, noise_y, total_z, gravity_z, body_z, noise_z
    gc.collect()

  0%|                                                                                                                                                                                                | 0/4 [00:00<?, ?it/s]

Unnamed: 0,subject_id,timestamp,x,y,z
0,5,2023-11-05 00:00:00.022,-0.088585,0.105345,9.749489
1,5,2023-11-05 00:00:01.021,-0.081403,0.126893,9.751883
2,5,2023-11-05 00:00:02.000,-0.055067,0.126893,9.759066
3,5,2023-11-05 00:00:03.001,-0.083797,0.095768,9.756671
4,5,2023-11-05 00:00:04.019,-0.055067,0.110133,9.759066


Unnamed: 0,subject_id,timestamp,x,y,z,x_med,y_med,z_med,x_gravity,y_gravity,z_gravity,x_body,y_body,z_body
0,5,2023-11-05 00:00:00.022,-0.088585,0.105345,9.749489,-0.081403,0.105345,9.749489,-0.764265,3.739148,7.725961,0.590733,-2.735827,1.425648
1,5,2023-11-05 00:00:01.021,-0.081403,0.126893,9.751883,-0.081403,0.126893,9.751883,-0.763931,3.640294,7.782702,0.766562,-4.31634,2.508522
2,5,2023-11-05 00:00:02.000,-0.055067,0.126893,9.759066,-0.081403,0.126893,9.756671,-0.76298,3.542039,7.838744,0.616146,-2.804437,1.496831
3,5,2023-11-05 00:00:03.001,-0.083797,0.095768,9.756671,-0.055067,0.110133,9.759066,-0.761418,3.444431,7.894069,0.746488,-3.697305,2.130868
4,5,2023-11-05 00:00:04.019,-0.055067,0.110133,9.759066,-0.081403,0.110133,9.759066,-0.759249,3.347514,7.948656,0.664347,-3.124166,1.706232


 25%|██████████████████████████████████████████████                                                                                                                                          | 1/4 [00:07<00:21,  7.08s/it]

Unnamed: 0,subject_id,timestamp,x,y,z
0,6,2023-10-06 00:00:00.020,0.069392,0.040678,9.822537
1,6,2023-10-06 00:00:01.018,0.057428,0.045464,9.817751
2,6,2023-10-06 00:00:02.016,0.055035,0.045464,9.829716
3,6,2023-10-06 00:00:03.015,0.055035,0.035892,9.82493
4,6,2023-10-06 00:00:04.013,0.055035,0.064606,9.803394


Unnamed: 0,subject_id,timestamp,x,y,z,x_med,y_med,z_med,x_gravity,y_gravity,z_gravity,x_body,y_body,z_body
0,6,2023-10-06 00:00:00.020,0.069392,0.040678,9.822537,0.057428,0.040678,9.817751,0.087256,0.143909,9.772252,-0.027558,-0.104451,0.044947
1,6,2023-10-06 00:00:01.018,0.057428,0.045464,9.817751,0.057428,0.045464,9.822537,0.090188,0.142855,9.775608,-0.034083,-0.097562,0.047728
2,6,2023-10-06 00:00:02.016,0.055035,0.045464,9.829716,0.055035,0.045464,9.82493,0.09309,0.141648,9.779025,-0.038216,-0.094923,0.045072
3,6,2023-10-06 00:00:03.015,0.055035,0.035892,9.82493,0.055035,0.045464,9.82493,0.095956,0.140292,9.782499,-0.039383,-0.096582,0.043107
4,6,2023-10-06 00:00:04.013,0.055035,0.064606,9.803394,0.055035,0.050249,9.82493,0.098783,0.138789,9.786023,-0.045952,-0.086947,0.038469


 50%|████████████████████████████████████████████████████████████████████████████████████████████                                                                                            | 2/4 [00:18<00:18,  9.42s/it]

Unnamed: 0,subject_id,timestamp,x,y,z
0,7,2023-11-02 00:10:00.003,9.236296,0.334098,-3.279962
1,7,2023-11-02 00:10:01.000,9.230313,0.360419,-3.299104
2,7,2023-11-02 00:10:02.017,9.233903,0.332901,-3.289533
3,7,2023-11-02 00:10:03.016,9.235099,0.349651,-3.289533
4,7,2023-11-02 00:10:04.014,9.247063,0.344866,-3.309872


Unnamed: 0,subject_id,timestamp,x,y,z,x_med,y_med,z_med,x_gravity,y_gravity,z_gravity,x_body,y_body,z_body
0,7,2023-11-02 00:10:00.003,9.236296,0.334098,-3.279962,9.230313,0.334098,-3.279962,4.550727,-0.287852,3.269875,3.731561,0.517392,-5.248849
1,7,2023-11-02 00:10:01.000,9.230313,0.360419,-3.299104,9.233903,0.334098,-3.289533,4.661628,-0.267435,3.11383,5.402027,0.69146,-7.535253
2,7,2023-11-02 00:10:02.017,9.233903,0.332901,-3.289533,9.233903,0.349651,-3.289533,4.772498,-0.246923,2.95775,3.847571,0.533918,-5.41331
3,7,2023-11-02 00:10:03.016,9.235099,0.349651,-3.289533,9.235099,0.344866,-3.289533,4.883285,-0.226335,2.801708,4.700933,0.601147,-6.565612
4,7,2023-11-02 00:10:04.014,9.247063,0.344866,-3.309872,9.247063,0.344866,-3.296711,4.99394,-0.205692,2.645779,4.160033,0.550535,-5.812464


 75%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                              | 3/4 [00:31<00:11, 11.04s/it]

Unnamed: 0,subject_id,timestamp,x,y,z
0,8,2023-10-08 00:00:00.025,0.249005,0.098165,9.478934
1,8,2023-10-08 00:00:01.005,0.227456,0.10056,9.510059
2,8,2023-10-08 00:00:02.003,0.241822,0.122108,9.493299
3,8,2023-10-08 00:00:03.002,0.201119,0.086194,9.536397
4,8,2023-10-08 00:00:04.004,0.237033,0.114925,9.514848


Unnamed: 0,subject_id,timestamp,x,y,z,x_med,y_med,z_med,x_gravity,y_gravity,z_gravity,x_body,y_body,z_body
0,8,2023-10-08 00:00:00.025,0.249005,0.098165,9.478934,0.227456,0.098165,9.478934,0.096833,0.111548,9.522628,0.116259,-0.006782,-0.038516
1,8,2023-10-08 00:00:01.005,0.227456,0.10056,9.510059,0.241822,0.10056,9.493299,0.099313,0.116471,9.523413,0.152229,-0.024746,-0.033698
2,8,2023-10-08 00:00:02.003,0.241822,0.122108,9.493299,0.227456,0.10056,9.510059,0.101859,0.121502,9.524227,0.121809,-0.009063,-0.01121
3,8,2023-10-08 00:00:03.002,0.201119,0.086194,9.536397,0.237033,0.114925,9.514848,0.104467,0.126634,9.525069,0.130626,-0.026027,-0.01337
4,8,2023-10-08 00:00:04.004,0.237033,0.114925,9.514848,0.222668,0.114925,9.514848,0.107136,0.13186,9.525935,0.121587,-0.002198,-0.007503


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:41<00:00, 10.30s/it]


In [23]:
data_path ="../ETRI_data/test/acc"
sav_dir = "../ETRI_data/test/test_day"
for i in tqdm(range(4,8)):
    file_name = "acc"+str(i+1)+"_final.parquet"
    user_data = pd.read_parquet(os.path.join(data_path, file_name))
    user_data['date'] = user_data['timestamp'].astype(str).apply(lambda x:x.split(" ")[0])
    user_pt = pd.pivot_table(user_data,
                            values=['x_med','y_med', 'z_med', 
                                    'x_gravity', 'x_body', 'y_gravity', 'y_body', 'z_gravity', 'z_body'],
                            index=['subject_id','date'],
                            aggfunc={'x_med':list,
                                    'y_med':list,
                                    'z_med':list,
                                    'x_gravity':list,
                                    'x_body':list,
                                    'y_gravity':list,
                                    'y_body':list,
                                    'z_gravity':list,
                                    'z_body':list,})
    user_pt['x_med'].apply(np.array)
    user_pt['y_med'].apply(np.array)
    user_pt['z_med'].apply(np.array)
    user_pt['x_gravity'].apply(np.array)
    user_pt['y_gravity'].apply(np.array)
    user_pt['z_gravity'].apply(np.array)
    user_pt['x_body'].apply(np.array)
    user_pt['y_body'].apply(np.array)
    user_pt['z_body'].apply(np.array)
    user_pt.reset_index(inplace=True)
    user_pt.rename(columns = {'subject_id':'user'}, inplace=True)

    # parquet으로 저장
    print('Saving..')
    table = pa.Table.from_pandas(user_pt)
    save_dir = os.path.join(sav_dir, 'user'+str(i+1)+'_'+'acc_day.parquet')
    pq.write_table(table, save_dir)

  0%|                                                                                                                                                                                                | 0/4 [00:00<?, ?it/s]

Saving..


 25%|██████████████████████████████████████████████                                                                                                                                          | 1/4 [00:02<00:06,  2.14s/it]

Saving..


 50%|████████████████████████████████████████████████████████████████████████████████████████████                                                                                            | 2/4 [00:05<00:05,  3.00s/it]

Saving..


 75%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                              | 3/4 [00:09<00:03,  3.36s/it]

Saving..


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:12<00:00,  3.19s/it]


In [24]:
# Acc 통합 파일 생성
data_dir = '../ETRI_data/test/test_day'
user_list = [x.split('_')[0] for x in os.listdir(os.path.join(data_dir)) if 'user' in x]
user_list

['user8', 'user5', 'user7', 'user6']

In [25]:
acc = None
for user in user_list:
    user_acc = pd.read_parquet(os.path.join(data_dir, user+'_acc_day.parquet'))
    if acc is None:
        acc = user_acc
    else:
        acc = pd.concat([acc, user_acc], axis=0)
        
table = pa.Table.from_pandas(acc)
save_dir = os.path.join(data_dir, 'acc_day.parquet')
pq.write_table(table, save_dir)

In [26]:
acc_pt = pd.read_parquet(os.path.join(data_dir, 'acc_day.parquet'))
acc_pt.head()

Unnamed: 0,user,date,x_body,x_gravity,x_med,y_body,y_gravity,y_med,z_body,z_gravity,z_med
0,8,2023-10-08,"[0.11625876611516058, 0.1522292823301403, 0.12...","[0.09683282193466974, 0.09931328205175206, 0.1...","[0.22745611, 0.24182175, 0.22745611, 0.2370332...","[-0.006782113651272816, -0.024745783228710756,...","[0.11154809992911509, 0.1164705879748072, 0.12...","[0.098165266, 0.10055954, 0.10055954, 0.114925...","[-0.038515943486048636, -0.03369831021591531, ...","[9.522627954864719, 9.523412814668664, 9.52422...","[9.478934, 9.4932995, 9.510059, 9.514848, 9.51..."
1,8,2023-10-09,"[1.4673782257656796, 1.282860442961731, 1.4722...","[-1.5390851872429105, -1.4145766691511252, -1....","[-0.09337672, -0.09337672, 0.14605077, 0.14605...","[2.483225325013676, 2.3641493234230464, 2.3347...","[7.402447914746968, 7.492428883406657, 7.58208...","[9.842864, 9.900327, 9.883567, 9.883567, 9.826...","[-1.7652852540700728, -1.3515598996011524, -1....","[-0.6516593582128787, -0.6736209164928868, -0....","[-2.8683412, -1.6089526, -2.6001825, -2.502017..."
2,8,2023-10-10,"[1.5248215057003718, 1.7184884883778389, 1.613...","[2.3137219702071326, 2.385166121155055, 2.4577...","[3.9098508, 4.017593, 4.1373067, 4.187587, 4.2...","[5.558119172751997, 5.309198706632944, 4.75592...","[-4.818885247012737, -4.731542917225931, -4.64...","[0.81644773, 0.45730647, 0.22985038, 0.1747820...","[-3.142818789518796, -2.864775464566731, -2.90...","[-6.436449304003791, -6.511451068541658, -6.58...","[-9.483723, -9.45978, -9.44302, -9.423865, -9...."
3,8,2023-10-11,"[6.074954071925732, 6.489206868684749, 6.18590...","[0.09594784002159537, 0.21543767808950537, 0.3...","[6.4765134, 6.4765134, 6.610593, 6.7470665, 6....","[0.6059936576234247, 0.5293042196737482, 0.397...","[5.339956892969487, 5.367654640590536, 5.39605...","[6.002447, 5.8659735, 5.794145, 5.707951, 5.65...","[-0.3312897979368726, -0.36102212181172416, -0...","[-4.381574062646698, -4.387419067931266, -4.39...","[-4.7191157, -4.747847, -4.7191157, -4.6281333..."
4,8,2023-10-13,"[7.672746466859709, 10.764559854312903, 8.0339...","[-0.628370950075233, -0.49874702678613425, -0....","[8.7534685, 8.775017, 8.775017, 8.775017, 8.78...","[-1.8872447926015956, -2.7462705396314497, -1....","[1.1181719978331792, 1.0512271667432598, 0.984...","[-1.2641771, -1.2665714, -1.2665714, -1.266571...","[-0.28649883256134034, -0.47450769940267956, -...","[-3.8280857500651297, -3.8317630258500985, -3....","[-4.2187123, -4.2187123, -4.2187123, -4.218712..."


# test 유효한 날짜 구하기

In [27]:
acc_date = acc_pt[['user', 'date']]
hr_date = hr_pt[['user', 'date']]
gps_date = gps_pt[['user', 'date']]
act_date = act_pt[['user', 'date']]

In [28]:
from functools import reduce
test_date = reduce(lambda x,y: pd.merge(x, y, on=['user', 'date'], how='outer'),[acc_date, hr_date, gps_date, act_date])
test_date

Unnamed: 0,user,date
0,8,2023-10-08
1,8,2023-10-09
2,8,2023-10-10
3,8,2023-10-11
4,8,2023-10-13
...,...,...
110,6,2023-11-03
111,6,2023-11-04
112,6,2023-11-05
113,6,2023-11-06


In [31]:
# gps 데이터 빈 날짜 채우기
gps_pt = pd.merge(test_date, gps_pt, on=("user","date"), how="outer")#.reset_index(inplace=True)
gps_pt.reset_index(drop=True, inplace=True)
display(gps_pt.head())
print(gps_pt.shape)

# gps 패딩한 길이만큼 배열 생성
test_nan = np.zeros(13590)
mean_distance = np.mean(gps_pt['distance'])

# nan값 대체
gps_pt['lat'] = [x if np.sum(pd.isna(x))<=0 else test_nan for x in gps_pt['lat'] ]
print(gps_pt['lat'].isnull().sum())

gps_pt['lon'] = [x if np.sum(pd.isna(x))<=0 else test_nan for x in gps_pt['lon'] ]
print(gps_pt['lon'].isnull().sum())

gps_pt['distance'] = [x if np.sum(pd.isna(x))<=0 else mean_distance for x in gps_pt['distance'] ]
print(gps_pt['distance'].isnull().sum())

Unnamed: 0,user,date,distance,lat,lon
0,8,2023-10-08,31375.427912,"[1.5571827999999996, 1.557249200000001, 1.5574...","[0.9651157999999924, 0.9650685999999951, 0.965..."
1,8,2023-10-09,66198.197163,"[1.5943275999999997, 1.594319800000001, 1.5943...","[0.9294708999999983, 0.929462700000002, 0.9294..."
2,8,2023-10-10,27181.015946,"[1.594778600000005, 1.594824800000005, 1.59484...","[0.929512299999999, 0.9295051000000001, 0.9295..."
3,8,2023-10-11,42135.734471,"[1.5943427999999997, 1.5943461000000028, 1.594...","[0.9294731999999897, 0.9294883999999968, 0.929..."
4,8,2023-10-13,65799.167304,"[1.7029155000000031, 1.7030918000000028, 1.703...","[0.864241199999995, 0.8644353999999908, 0.8643..."


(115, 5)
0
0
0


In [32]:
data_dir = '../ETRI_data/test'
# test 데이터 저장
pq.write_table(pa.Table.from_pandas(acc_pt), os.path.join(data_dir, 'acc_final.parquet'))
pq.write_table(pa.Table.from_pandas(hr_pt), os.path.join(data_dir, 'hr_final.parquet'))
pq.write_table(pa.Table.from_pandas(act_pt), os.path.join(data_dir, 'act_final.parquet'))
pq.write_table(pa.Table.from_pandas(gps_pt), os.path.join(data_dir, 'gps_final.parquet'))