In [1]:
import os
import numpy as np
import pandas as pd
from datetime import datetime as dt
import gc
from tqdm import tqdm
import pyarrow.parquet as pq
import pyarrow as pa

# 각 데이터별 날짜 단위로 모으기

In [2]:
data_dir ="../ETRI_data/valid/raw"

# 스마트폰에서 인식된 행동 분류값. 1분마다 1회씩 기록
file_name = "ch2024_val__m_activity.parquet.gzip"
act = pd.read_parquet(os.path.join(data_dir, file_name))

# 스마트워치에서 측정된 심박 데이터. 1초 간격으로 측정
file_name = "ch2024_val__w_heart_rate.parquet.gzip"
hr = pd.read_parquet(os.path.join(data_dir, file_name))

#스마트폰에서 산출된 GPS 좌표 정보(단, 위도 및 경도는 상대 좌표로 변환됨). 5초 간격(1분당 약 12회)으로 측정됨.
file_name = "ch2024_val__m_gps.parquet.gzip"
gps = pd.read_parquet(os.path.join(data_dir, file_name))

# heart_rate

In [3]:
hr['date'] = hr['timestamp'].astype(str).apply(lambda x:x.split(" ")[0])
hr_pt = pd.pivot_table(hr,
                        values=['heart_rate'],
                        index=['subject_id','date'],
                        aggfunc={'heart_rate':list})

hr_pt['heart_rate'].apply(np.array)
hr_pt.reset_index(inplace=True)
hr_pt.columns = ['user', 'date', 'hr']

# parquet으로 저장
# print('Saving..')
# table = pa.Table.from_pandas(hr_pt)
# save_dir = os.path.join(sav_dir, target+'_day.parquet')
# pq.write_table(table, save_dir)

In [4]:
hr_pt.head()

Unnamed: 0,user,date,hr
0,1,2023-08-20,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,1,2023-08-21,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,1,2023-08-22,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,1,2023-08-23,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,1,2023-08-24,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


# GPS

In [5]:
gps['date'] =gps['timestamp'].astype(str).apply(lambda x:x.split(" ")[0])

In [6]:
gps.head()

Unnamed: 0,subject_id,timestamp,altitude,latitude,longitude,speed,date
0,1,2023-08-20 00:00:08,144.217651,0.016095,0.926485,0.143791,2023-08-20
1,1,2023-08-20 00:00:13,144.217651,0.01609,0.926477,0.160771,2023-08-20
2,1,2023-08-20 00:00:18,144.217651,0.016091,0.926478,0.006571,2023-08-20
3,1,2023-08-20 00:00:23,144.217651,0.016091,0.926474,0.05931,2023-08-20
4,1,2023-08-20 00:00:28,144.217651,0.016092,0.926477,0.049454,2023-08-20


In [7]:
import math
def measure(lat1, lon1, lat2, lon2):
    R = 6378.137  # Radius of earth in KM
    dLat = lat2 * math.pi / 180 - lat1 * math.pi / 180
    dLon = lon2 * math.pi / 180 - lon1 * math.pi / 180
    a = math.sin(dLat / 2) * math.sin(dLat / 2) + \
        math.cos(lat1 * math.pi / 180) * math.cos(lat2 * math.pi / 180) * \
        math.sin(dLon / 2) * math.sin(dLon / 2)
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    d = R * c
    return d * 1000  # meters

In [8]:
gps_distance = []
for (k1, k2), df in gps.groupby(['subject_id', 'date']):
    lat = df['latitude'].values
    lon = df['longitude'].values
    gps_map = []
    for i in range(len(lat)-1):
        lat1, lon1 = lat[i], lon[i]
        lat2, lon2 = lat[i+1], lon[i+1]
        distance = measure(lat1, lon1, lat2, lon2)
        gps_map.append(distance)
    gps_map.insert(0, 0)
    gps_distance.append(gps_map)

# gps_distance
gps_result = sum(gps_distance, [])
gps['distance'] = gps_result
gps.head()

Unnamed: 0,subject_id,timestamp,altitude,latitude,longitude,speed,date,distance
0,1,2023-08-20 00:00:08,144.217651,0.016095,0.926485,0.143791,2023-08-20,0.0
1,1,2023-08-20 00:00:13,144.217651,0.01609,0.926477,0.160771,2023-08-20,1.059642
2,1,2023-08-20 00:00:18,144.217651,0.016091,0.926478,0.006571,2023-08-20,0.102631
3,1,2023-08-20 00:00:23,144.217651,0.016091,0.926474,0.05931,2023-08-20,0.412033
4,1,2023-08-20 00:00:28,144.217651,0.016092,0.926477,0.049454,2023-08-20,0.353779


In [9]:
gps_pt = pd.pivot_table(gps,
                        values=['latitude', 'longitude', 'distance'],
                        index=['subject_id','date'],
                        aggfunc={'latitude':list,
                                'longitude':list,
                                'distance':sum,})
gps_pt['latitude'].apply(np.array)
gps_pt['longitude'].apply(np.array)
gps_pt.reset_index(inplace=True)
gps_pt.columns = ['user', 'date', 'distance', 'lat', 'lon']
display(gps_pt.head())
# gps_pt['distance'] = [sum(x) for x in gps_pt['distance']]


# parquet으로 저장
# print('Saving..')
# table = pa.Table.from_pandas(gps_pt)
# save_dir = os.path.join(sav_dir, target+'_day.parquet')
# pq.write_table(table, save_dir)

  gps_pt = pd.pivot_table(gps,


Unnamed: 0,user,date,distance,lat,lon
0,1,2023-08-20,22664.190846,"[0.016095200000002308, 0.01609020000000072, 0....","[0.9264852000000019, 0.9264770999999996, 0.926..."
1,1,2023-08-21,21082.568862,"[0.0160977000000031, 0.016097899999998333, 0.0...","[0.9264854000000042, 0.9264861000000053, 0.926..."
2,1,2023-08-22,22571.905286,"[0.016094500000001233, 0.01609229999999684, 0....","[0.9264813000000061, 0.9264809999999954, 0.926..."
3,1,2023-08-23,28640.403809,"[0.016086000000001377, 0.016083299999998246, 0...","[0.9264792999999969, 0.9264821999999953, 0.926..."
4,1,2023-08-24,15519.20267,"[0.01608759999999876, 0.016084800000001565, 0....","[0.9264881999999943, 0.9264823999999976, 0.926..."


# activity

In [10]:
act['date'] =act['timestamp'].astype(str).apply(lambda x:x.split(" ")[0])
act_pt = pd.pivot_table(act,
                        values=['m_activity'],
                        index=['subject_id','date'],
                        aggfunc={'m_activity':list,})
act_pt['m_activity'].apply(np.array)
act_pt.reset_index(inplace=True)
act_pt.columns = ['user', 'date', 'activity']

# # parquet으로 저장
# print('Saving..')
# table = pa.Table.from_pandas(act_pt)
# save_dir = os.path.join(sav_dir, target+'_day.parquet')
# pq.write_table(table, save_dir)

In [11]:
act_pt.head()

Unnamed: 0,user,date,activity
0,1,2023-08-20,"[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ..."
1,1,2023-08-21,"[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ..."
2,1,2023-08-22,"[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ..."
3,1,2023-08-23,"[7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, ..."
4,1,2023-08-24,"[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ..."


In [12]:
#<activity 전처리>
act_pt['len']=act_pt['activity'].apply(len)

# 하루 중 해당 activity의 비율
for i in range(0,7):
    act_pt['act'+str(i)]=[list(x).count(i) for x in act_pt['activity']]/act_pt['len']

act_pt = act_pt.drop(columns=['activity', 'len'])
act_pt

Unnamed: 0,user,date,act0,act1,act2,act3,act4,act5,act6
0,1,2023-08-20,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,2023-08-21,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,2023-08-22,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,2023-08-23,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,2023-08-24,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
100,4,2023-10-27,0.0,0.0,0.0,0.0,0.0,0.0,0.0
101,4,2023-10-28,0.0,0.0,0.0,0.0,0.0,0.0,0.0
102,4,2023-10-29,0.0,0.0,0.0,0.0,0.0,0.0,0.0
103,4,2023-10-30,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
act_pt.to_parquet('../ETRI_data/valid/valid_day/act_day.parquet', engine='pyarrow', compression='gzip')
hr_pt.to_parquet('../ETRI_data/valid/valid_day/hr_day.parquet', engine='pyarrow', compression='gzip')
gps_pt.to_parquet('../ETRI_data/valid/valid_day/gps_day.parquet', engine='pyarrow', compression='gzip')

# Accelerator

In [19]:
data_path ="../ETRI_data/valid/raw"
for i in tqdm(range(4)):
    file_name = "ch2024_val__m_acc_part_"+str(i+1)+".parquet.gzip"
    acc_df = pd.read_parquet(os.path.join(data_path, file_name))
    acc_df['date'] =acc_df['timestamp'].astype(str).apply(lambda x:x.split(".")[0])

    acc_sample_df = acc_df.groupby('date').apply(lambda x:x.iloc[0]).reset_index(drop=True)
    acc_sample_df.drop(columns = ['date'], inplace=True)
    display(acc_sample_df.head())
    acc_sample_df.to_parquet('../ETRI_data/valid/acc/acc'+str(i+1)+'_sample.parquet.gzip', engine='pyarrow', compression='gzip')

    del file_name, acc_sample_df, acc_df
    gc.collect()

  0%|                                                                                                                                                                                                | 0/4 [00:00<?, ?it/s]

Unnamed: 0,subject_id,timestamp,x,y,z
0,1,2023-08-20 00:00:00.025,0.933201,-3.522235,9.164511
1,1,2023-08-20 00:00:01.000,0.933201,-3.517449,9.159725
2,1,2023-08-20 00:00:02.001,0.933201,-3.507878,9.164511
3,1,2023-08-20 00:00:03.003,0.942772,-3.503092,9.193225
4,1,2023-08-20 00:00:04.003,0.952343,-3.498307,9.174082


 25%|█████████████████████████████████████████████▊                                                                                                                                         | 1/4 [04:11<12:35, 251.99s/it]

Unnamed: 0,subject_id,timestamp,x,y,z
0,2,2023-09-12 00:00:00.035,8.037635,-4.836288,-2.104503
1,2,2023-09-12 00:00:01.000,8.03524,-7.522881,0.28491
2,2,2023-09-12 00:00:02.008,10.704775,-9.126996,-0.9912
3,2,2023-09-12 00:00:03.008,6.68491,-9.251494,1.170765
4,2,2023-09-12 00:00:04.008,6.988974,-9.402329,0.272939


 50%|███████████████████████████████████████████████████████████████████████████████████████████▌                                                                                           | 2/4 [05:59<05:33, 166.89s/it]

Unnamed: 0,subject_id,timestamp,x,y,z
0,3,2023-09-08 00:00:00.029,-2.044711,-9.349643,1.142069
1,3,2023-09-08 00:00:01.000,-2.147664,-9.44302,1.695147
2,3,2023-09-08 00:00:02.012,-2.226675,-9.840469,1.163618
3,3,2023-09-08 00:00:03.011,-2.552297,-9.110215,2.126116
4,3,2023-09-08 00:00:04.000,-2.221887,-8.7822,1.642473


 75%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                             | 3/4 [08:03<02:27, 147.47s/it]

Unnamed: 0,subject_id,timestamp,x,y,z
0,4,2023-10-05 00:00:00.012,2.085345,1.870888,-9.445967
1,4,2023-10-05 00:00:01.005,2.098505,1.873281,-9.44836
2,4,2023-10-05 00:00:02.002,2.087738,1.885245,-9.449555
3,4,2023-10-05 00:00:03.003,2.084148,1.881656,-9.441181
4,4,2023-10-05 00:00:04.018,2.078167,1.882852,-9.443574


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [10:47<00:00, 161.86s/it]


In [23]:
from scipy.fftpack import fft  
from scipy.fftpack import fftfreq
from scipy.fftpack import ifft
import math 
import scipy as sp

def median(signal):
    array=np.array(signal)   
    med_filtered=sp.signal.medfilt(array, kernel_size=3)
    return  med_filtered

sampling_freq = 50
def components_selection_one_signal(t_signal, sampling_freq):
    t_signal=np.array(t_signal)
    t_signal_length=len(t_signal)
    f_signal=fft(t_signal)
    freqs=np.array(sp.fftpack.fftfreq(t_signal_length, d=1/float(sampling_freq)))# frequency values between [-25hz:+25hz]

    f_DC_signal=[] # DC_component in freq domain
    f_body_signal=[] # body component in freq domain numpy.append(a, a[0])
    f_noise_signal=[] # noise in freq domain

    for i in range(len(freqs)):# iterate over all available frequencies

        # selecting the frequency value
        freq=freqs[i]

        # selecting the f_signal value associated to freq
        value= f_signal[i]

        # Selecting DC_component values 
        if abs(freq)>0.3:# testing if freq is outside DC_component frequency ranges
            f_DC_signal.append(float(0)) # add 0 to  the  list if it was the case (the value should not be added)                                       
        else: # if freq is inside DC_component frequency ranges 
            f_DC_signal.append(value) # add f_signal value to f_DC_signal list

        # Selecting noise component values 
        if (abs(freq)<=20):# testing if freq is outside noise frequency ranges 
            f_noise_signal.append(float(0)) # # add 0 to  f_noise_signal list if it was the case 
        else:# if freq is inside noise frequency ranges 
            f_noise_signal.append(value) # add f_signal value to f_noise_signal

        # Selecting body_component values 
        if (abs(freq)<=0.3 or abs(freq)>20):# testing if freq is outside Body_component frequency ranges
            f_body_signal.append(float(0))# add 0 to  f_body_signal list
        else:# if freq is inside Body_component frequency ranges
            f_body_signal.append(value) # add f_signal value to f_body_signal list

    t_DC_component= ifft(np.array(f_DC_signal)).real
    t_body_component= ifft(np.array(f_body_signal)).real
    t_noise=ifft(np.array(f_noise_signal)).real

    total_component=t_signal-t_noise # extracting the total component(filtered from noise) 

    return (total_component,t_DC_component,t_body_component,t_noise) 

In [24]:
data_path ="../ETRI_data/valid/acc"
for i in tqdm(range(4)):
    file_name = "acc"+str(i+1)+"_sample.parquet.gzip"
    acc_df = pd.read_parquet(os.path.join(data_path, file_name))
    display(acc_df.head())

    x = acc_df['x']
    y = acc_df['y']
    z = acc_df['z']

    #median
    med_f_x = median(x)
    med_f_y = median(y)
    med_f_z = median(z)
    acc_df['x_med'] = med_f_x
    acc_df['y_med'] = med_f_y
    acc_df['z_med'] = med_f_z

    #butter pca
    total_x, gravity_x, body_x, noise_x = components_selection_one_signal(med_f_x, sampling_freq)
    total_y, gravity_y, body_y, noise_y = components_selection_one_signal(med_f_y, sampling_freq)
    total_z, gravity_z, body_z, noise_z = components_selection_one_signal(med_f_z, sampling_freq)
    acc_df['x_gravity'] = gravity_x
    acc_df['y_gravity'] = gravity_y
    acc_df['z_gravity'] = gravity_z
    acc_df['x_body'] = body_x
    acc_df['y_body'] = body_y
    acc_df['z_body'] = body_z

    display(acc_df.head())
    acc_df.to_parquet('../ETRI_data/valid/acc/acc'+str(i+1)+'_final.parquet', engine='pyarrow')
    
    del file_name, acc_df, x, y, z, med_f_x, med_f_y, med_f_z, total_x, gravity_x, body_x, noise_x, total_y, gravity_y, body_y, noise_y, total_z, gravity_z, body_z, noise_z
    gc.collect()

  0%|                                                                                                                                                                                                | 0/4 [00:00<?, ?it/s]

Unnamed: 0,subject_id,timestamp,x,y,z
0,1,2023-08-20 00:00:00.025,0.933201,-3.522235,9.164511
1,1,2023-08-20 00:00:01.000,0.933201,-3.517449,9.159725
2,1,2023-08-20 00:00:02.001,0.933201,-3.507878,9.164511
3,1,2023-08-20 00:00:03.003,0.942772,-3.503092,9.193225
4,1,2023-08-20 00:00:04.003,0.952343,-3.498307,9.174082


Unnamed: 0,subject_id,timestamp,x,y,z,x_med,y_med,z_med,x_gravity,y_gravity,z_gravity,x_body,y_body,z_body
0,1,2023-08-20 00:00:00.025,0.933201,-3.522235,9.164511,0.933201,-3.517449,9.159725,0.209231,0.279428,8.968776,1.376205,-3.493572,-0.688619
1,1,2023-08-20 00:00:01.000,0.933201,-3.517449,9.159725,0.933201,-3.517449,9.164511,0.221515,0.193323,8.982017,0.129065,-3.99836,0.858477
2,1,2023-08-20 00:00:02.001,0.933201,-3.507878,9.164511,0.933201,-3.507878,9.164511,0.234091,0.10692,8.995046,1.140433,-3.386566,-0.252339
3,1,2023-08-20 00:00:03.003,0.942772,-3.503092,9.193225,0.942772,-3.503092,9.174082,0.246953,0.02027,9.007853,0.434108,-3.665149,0.337339
4,1,2023-08-20 00:00:04.003,0.952343,-3.498307,9.174082,0.952343,-3.503092,9.183653,0.260094,-0.066575,9.020428,0.776772,-3.386639,0.191833


 25%|██████████████████████████████████████████████                                                                                                                                          | 1/4 [00:14<00:44, 14.82s/it]

Unnamed: 0,subject_id,timestamp,x,y,z
0,2,2023-09-12 00:00:00.035,8.037635,-4.836288,-2.104503
1,2,2023-09-12 00:00:01.000,8.03524,-7.522881,0.28491
2,2,2023-09-12 00:00:02.008,10.704775,-9.126996,-0.9912
3,2,2023-09-12 00:00:03.008,6.68491,-9.251494,1.170765
4,2,2023-09-12 00:00:04.008,6.988974,-9.402329,0.272939


Unnamed: 0,subject_id,timestamp,x,y,z,x_med,y_med,z_med,x_gravity,y_gravity,z_gravity,x_body,y_body,z_body
0,2,2023-09-12 00:00:00.035,8.037635,-4.836288,-2.104503,8.03524,-4.836288,0.0,4.185926,-0.600782,1.714311,3.006626,-3.816936,-1.046625
1,2,2023-09-12 00:00:01.000,8.03524,-7.522881,0.28491,8.037635,-7.522881,-0.9912,4.320035,-0.76956,1.651099,4.421636,-7.121981,-3.155002
2,2,2023-09-12 00:00:02.008,10.704775,-9.126996,-0.9912,8.03524,-9.126996,0.28491,4.453315,-0.93638,1.587504,3.088824,-7.920174,-0.986075
3,2,2023-09-12 00:00:03.008,6.68491,-9.251494,1.170765,6.988974,-9.251494,0.272939,4.585667,-1.101145,1.523605,2.664696,-8.295793,-1.373125
4,2,2023-09-12 00:00:04.008,6.988974,-9.402329,0.272939,6.68491,-9.251494,1.170765,4.716992,-1.263764,1.459478,1.908461,-7.968492,-0.319773


 50%|████████████████████████████████████████████████████████████████████████████████████████████                                                                                            | 2/4 [00:17<00:15,  7.79s/it]

Unnamed: 0,subject_id,timestamp,x,y,z
0,3,2023-09-08 00:00:00.029,-2.044711,-9.349643,1.142069
1,3,2023-09-08 00:00:01.000,-2.147664,-9.44302,1.695147
2,3,2023-09-08 00:00:02.012,-2.226675,-9.840469,1.163618
3,3,2023-09-08 00:00:03.011,-2.552297,-9.110215,2.126116
4,3,2023-09-08 00:00:04.000,-2.221887,-8.7822,1.642473


Unnamed: 0,subject_id,timestamp,x,y,z,x_med,y_med,z_med,x_gravity,y_gravity,z_gravity,x_body,y_body,z_body
0,3,2023-09-08 00:00:00.029,-2.044711,-9.349643,1.142069,-2.044711,-9.349643,1.142069,-1.765673,-0.798389,5.61517,-0.049837,-7.695076,-3.645124
1,3,2023-09-08 00:00:01.000,-2.147664,-9.44302,1.695147,-2.147664,-9.44302,1.163618,-1.84297,-0.723551,5.521655,-0.507219,-9.445853,-5.037635
2,3,2023-09-08 00:00:02.012,-2.226675,-9.840469,1.163618,-2.226675,-9.44302,1.695147,-1.920876,-0.646383,5.428171,-0.144137,-8.248378,-3.286604
3,3,2023-09-08 00:00:03.011,-2.552297,-9.110215,2.126116,-2.226675,-9.110215,1.642473,-1.999341,-0.566913,5.334745,-0.344849,-8.906903,-3.873189
4,3,2023-09-08 00:00:04.000,-2.221887,-8.7822,1.642473,-2.528354,-9.110215,2.015979,-2.078314,-0.485171,5.241405,-0.370804,-8.419709,-3.284887


 75%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                              | 3/4 [00:21<00:05,  5.94s/it]

Unnamed: 0,subject_id,timestamp,x,y,z
0,4,2023-10-05 00:00:00.012,2.085345,1.870888,-9.445967
1,4,2023-10-05 00:00:01.005,2.098505,1.873281,-9.44836
2,4,2023-10-05 00:00:02.002,2.087738,1.885245,-9.449555
3,4,2023-10-05 00:00:03.003,2.084148,1.881656,-9.441181
4,4,2023-10-05 00:00:04.018,2.078167,1.882852,-9.443574


Unnamed: 0,subject_id,timestamp,x,y,z,x_med,y_med,z_med,x_gravity,y_gravity,z_gravity,x_body,y_body,z_body
0,4,2023-10-05 00:00:00.012,2.085345,1.870888,-9.445967,2.085345,1.870888,-9.445967,5.648188,1.065704,-6.368291,-2.843242,0.64197,-2.455158
1,4,2023-10-05 00:00:01.005,2.098505,1.873281,-9.44836,2.087738,1.873281,-9.44836,5.56201,1.084388,-6.442514,-4.10016,0.930979,-3.549386
2,4,2023-10-05 00:00:02.002,2.087738,1.885245,-9.449555,2.087738,1.881656,-9.44836,5.475871,1.103072,-6.516685,-2.927541,0.673643,-2.529402
3,4,2023-10-05 00:00:03.003,2.084148,1.881656,-9.441181,2.084148,1.882852,-9.443574,5.389814,1.121748,-6.590769,-3.566543,0.821202,-3.083205
4,4,2023-10-05 00:00:04.018,2.078167,1.882852,-9.443574,2.084148,1.882852,-9.441181,5.303878,1.140407,-6.664732,-3.150245,0.725368,-2.711742


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:26<00:00,  6.55s/it]


In [29]:
data_path ="../ETRI_data/valid/acc"
sav_dir = "../ETRI_data/valid/valid_day"
for i in tqdm(range(4)):
    file_name = "acc"+str(i+1)+"_final.parquet"
    user_data = pd.read_parquet(os.path.join(data_path, file_name))
    user_data['date'] = user_data['timestamp'].astype(str).apply(lambda x:x.split(" ")[0])
    user_pt = pd.pivot_table(user_data,
                            values=['x_med','y_med', 'z_med', 
                                    'x_gravity', 'x_body', 'y_gravity', 'y_body', 'z_gravity', 'z_body'],
                            index=['subject_id','date'],
                            aggfunc={'x_med':list,
                                    'y_med':list,
                                    'z_med':list,
                                    'x_gravity':list,
                                    'x_body':list,
                                    'y_gravity':list,
                                    'y_body':list,
                                    'z_gravity':list,
                                    'z_body':list,})
    user_pt['x_med'].apply(np.array)
    user_pt['y_med'].apply(np.array)
    user_pt['z_med'].apply(np.array)
    user_pt['x_gravity'].apply(np.array)
    user_pt['y_gravity'].apply(np.array)
    user_pt['z_gravity'].apply(np.array)
    user_pt['x_body'].apply(np.array)
    user_pt['y_body'].apply(np.array)
    user_pt['z_body'].apply(np.array)

    user_pt.reset_index(inplace=True)
    user_pt.rename(columns = {'subject_id':'user'}, inplace=True)

    # parquet으로 저장
    print('Saving..')
    table = pa.Table.from_pandas(user_pt)
    save_dir = os.path.join(sav_dir, 'user'+str(i+1)+'_'+'acc_day.parquet')
    pq.write_table(table, save_dir)

  0%|                                                                                                                                                                                                | 0/4 [00:00<?, ?it/s]

Saving..


 25%|██████████████████████████████████████████████                                                                                                                                          | 1/4 [00:04<00:12,  4.26s/it]

Saving..


 50%|████████████████████████████████████████████████████████████████████████████████████████████                                                                                            | 2/4 [00:06<00:05,  2.95s/it]

Saving..


 75%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                              | 3/4 [00:08<00:02,  2.64s/it]

Saving..


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:11<00:00,  2.89s/it]


In [31]:
# Acc 통합 파일 생성
data_dir = '../ETRI_data/valid/valid_day'
user_list = [x.split('_')[0] for x in os.listdir(os.path.join(data_dir)) if 'user' in x]
user_list

['user4', 'user2', 'user3', 'user1']

In [32]:
acc = None
for user in user_list:
    user_acc = pd.read_parquet(os.path.join(data_dir, user+'_acc_day.parquet'))
    if acc is None:
        acc = user_acc
    else:
        acc = pd.concat([acc, user_acc], axis=0)
        
table = pa.Table.from_pandas(acc)
save_dir = os.path.join(data_dir, 'acc_day.parquet')
pq.write_table(table, save_dir)

In [33]:
acc_pt = pd.read_parquet(os.path.join(data_dir, 'acc_day.parquet'))
acc_pt.head()

Unnamed: 0,user,date,x_body,x_gravity,x_med,y_body,y_gravity,y_med,z_body,z_gravity,z_med
0,4,2023-10-05,"[-2.843242035714697, -4.100159519663082, -2.92...","[5.648188213992783, 5.562009699185556, 5.47587...","[2.085345, 2.0877378, 2.0877378, 2.0841484, 2....","[0.6419704231766171, 0.9309794927525092, 0.673...","[1.06570411174384, 1.0843884205156702, 1.10307...","[1.8708882, 1.873281, 1.8816559, 1.8828523, 1....","[-2.45515785165495, -3.549385977366609, -2.529...","[-6.368291108796945, -6.442514206652606, -6.51...","[-9.445967, -9.4483595, -9.4483595, -9.443574,..."
1,4,2023-10-06,"[-0.2657094210209332, -0.27839262505446316, -0...","[-1.360124861846307, -1.3560397148278274, -1.3...","[-1.6292131, -1.6304095, -1.6328024, -1.632802...","[-0.2013936290836716, -0.16535285512555434, -0...","[3.726850739380659, 3.722467121790583, 3.71780...","[3.5336008, 3.5503507, 3.5503507, 3.5491543, 3...","[-0.16788764834370787, -0.16905012293708968, -...","[-8.949083848552299, -8.913241613854941, -8.87...","[-9.098708, -9.0951185, -9.0951185, -9.116654,..."
2,4,2023-10-07,"[-0.017179139077145657, -0.027804372321925674,...","[-0.29578374412283215, -0.2971562944874837, -0...","[-0.31974092, -0.31974092, -0.3269194, -0.3269...","[0.08515203802235018, 0.09016235616345471, 0.0...","[0.32429429378930896, 0.3224087920396755, 0.32...","[0.40947178, 0.4106682, 0.4142574, 0.4214359, ...","[0.038433562068290135, 0.01957793310844592, 0....","[-9.869384186008626, -9.869932376872542, -9.87...","[-9.841082, -9.841082, -9.841082, -9.841082, -..."
3,4,2023-10-08,"[-0.0009804511448509414, -0.004886243106895436...","[-0.0044227824368331765, -0.001743862108083933...","[-0.011964113, -0.0047856453, -0.0047856453, -...","[-0.07333106274202315, -0.05678774537216136, -...","[-0.42716371500670547, -0.43081621635429057, -...","[-0.4968098, -0.49082774, -0.49082774, -0.4908...","[0.06652960094575348, 0.06678734797383659, 0.0...","[9.67493095494238, 9.677020039698146, 9.679198...","[9.740284, 9.743873, 9.743873, 9.7450695, 9.74..."
4,4,2023-10-09,"[0.06909993033147345, 0.06216169847278393, 0.0...","[-0.07221442742009418, -0.07379880687775228, -...","[-0.0023928226, -0.013160525, -0.015553347, -0...","[-0.03110573420150428, -0.024731207764935276, ...","[0.11097903396199685, 0.11074758300664948, 0.1...","[0.08015956, 0.0849452, 0.086141616, 0.0861416...","[-0.05749965531189401, -0.060020133662384914, ...","[-9.79855464029878, -9.79744630012038, -9.7964...","[-9.856635, -9.856635, -9.861421, -9.861421, -..."


# vaild 유효한 날짜 구하기

In [35]:
val_label = pd.read_csv("../ETRI_data/val_label.csv")
val_label.rename(columns = {'subject_id':'user'}, inplace=True)

In [36]:
acc_date = acc_pt[['user', 'date']]
hr_date = hr_pt[['user', 'date']]
gps_date = gps_pt[['user', 'date']]
act_date = act_pt[['user', 'date']]
lable_date = val_label[['user', 'date']]

In [37]:
from functools import reduce

val_date = reduce(lambda x,y: pd.merge(x, y, on=['user', 'date'], how='outer'), [acc_date, hr_date, gps_date, act_date, lable_date])
val_date

Unnamed: 0,user,date
0,4,2023-10-05
1,4,2023-10-06
2,4,2023-10-07
3,4,2023-10-08
4,4,2023-10-09
...,...,...
100,1,2023-09-25
101,1,2023-09-26
102,1,2023-09-27
103,1,2023-09-28


In [41]:
# gps 데이터 빈 날짜 채우기
gps_pt = pd.merge(val_date, gps_pt, on=("user","date"), how="outer")#.reset_index(inplace=True)
gps_pt.reset_index(drop=True, inplace=True)
display(gps_pt.head())
print(gps_pt.shape)

# gps 패딩한 길이만큼 배열 생성
val_nan = np.zeros(16202)
mean_distance = np.mean(gps_pt['distance'])

# nan값 대체
gps_pt['lat'] = [x if np.sum(pd.isna(x))<=0 else val_nan for x in gps_pt['lat'] ]
print(gps_pt['lat'].isnull().sum())

gps_pt['lon'] = [x if np.sum(pd.isna(x))<=0 else val_nan for x in gps_pt['lon'] ]
print(gps_pt['lon'].isnull().sum())

gps_pt['distance'] = [x if np.sum(pd.isna(x))<=0 else mean_distance for x in gps_pt['distance'] ]
print(gps_pt['distance'].isnull().sum())

display(gps_pt.head())

Unnamed: 0,user,date,distance,lat,lon
0,4,2023-10-05,3656.519151,"[0.5239363000000026, 0.523933999999997, 0.5239...","[0.18187559999999792, 0.18189619999999707, 0.1..."
1,4,2023-10-06,31956.886634,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,4,2023-10-07,214.304096,"[0.49205010000000016, 0.49210149999999686, 0.4...","[0.23755710000000363, 0.23767999999999745, 0.2..."
3,4,2023-10-08,5800.192502,"[0.5083223000000032, 0.5083193999999978, 0.508...","[0.18963080000000332, 0.18961609999999496, 0.1..."
4,4,2023-10-09,11297.852775,"[0.49205109999999763, 0.4920468999999983, 0.49...","[0.2375667999999962, 0.23757659999999703, 0.23..."


(105, 5)
0
0
0


Unnamed: 0,user,date,distance,lat,lon
0,4,2023-10-05,3656.519151,"[0.5239363000000026, 0.523933999999997, 0.5239...","[0.18187559999999792, 0.18189619999999707, 0.1..."
1,4,2023-10-06,31956.886634,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,4,2023-10-07,214.304096,"[0.49205010000000016, 0.49210149999999686, 0.4...","[0.23755710000000363, 0.23767999999999745, 0.2..."
3,4,2023-10-08,5800.192502,"[0.5083223000000032, 0.5083193999999978, 0.508...","[0.18963080000000332, 0.18961609999999496, 0.1..."
4,4,2023-10-09,11297.852775,"[0.49205109999999763, 0.4920468999999983, 0.49...","[0.2375667999999962, 0.23757659999999703, 0.23..."


In [43]:
data_dir = '../ETRI_data/valid'
# valid 데이터 저장
pq.write_table(pa.Table.from_pandas(acc_pt), os.path.join(data_dir, 'acc_final.parquet'))
pq.write_table(pa.Table.from_pandas(hr_pt), os.path.join(data_dir, 'hr_final.parquet'))
pq.write_table(pa.Table.from_pandas(act_pt), os.path.join(data_dir, 'act_final.parquet'))
pq.write_table(pa.Table.from_pandas(gps_pt), os.path.join(data_dir, 'gps_final.parquet'))

# Vaild 데이터 레이블 처리

In [44]:
val_date['user-date'] = [str(x) + '_' + str(y) for x, y in zip(val_date['user'], val_date['date'])]
val_label['user-date'] = [str(x) + '_' + str(y) for x, y in zip(val_label['user'], val_label['date'])]
val_label_valid = val_label[val_label['user-date'].isin(val_date['user-date'])]
val_label_valid.drop(columns=['user-date'], inplace=True)

In [45]:
# 모든 날짜가 유효함
print(val_label_valid.shape)
display(val_label_valid.head())

(105, 9)


Unnamed: 0,user,date,Q1,Q2,Q3,S1,S2,S3,S4
0,1,2023-08-20,1,1,1,0,0,0,0
1,1,2023-08-21,1,1,1,0,0,1,0
2,1,2023-08-22,0,1,1,0,1,1,0
3,1,2023-08-23,0,1,1,0,0,1,0
4,1,2023-08-24,1,1,1,0,0,1,0


In [46]:
val_label_valid.to_csv('../ETRI_data/val_label_valid.csv', index=False)