In [None]:
from scipy.signal import medfilt
import numpy as np
import pandas as pd
import os
import scipy as sp
from scipy.fftpack import fft  
from scipy.fftpack import fftfreq
from scipy.fftpack import ifft
import math 
from tqdm import tqdm
import pyarrow.parquet as pq
import pyarrow as pa
from datetime import datetime
import gc
from functools import reduce
import math
from pathlib import Path

In [None]:
train_datadir = '../ETRI_dataset'
label_dir = '../ETRI_dataset/train'
data_dir = '../ETRI_dataset/train'
acc_dir = os.path.join(data_dir, 'mAcc')

# 1. Parquet 데이터로 변환

In [None]:
paths = [x for x in os.listdir(train_datadir) if 'user' in x and '-' in x]
target_list = ['e4Hr', 'e4Temp', 'mAcc', 'mGps']
# 유저 그룹 폴더
for path in paths:
    user_group =os.path.join(train_datadir, path)
    
    # 개별 유저 폴더
    for user in os.listdir(user_group):
        print('='*10 + user + '='*10)
        ts_group = os.path.join(user_group, user)
        
        # 타임스탬프 폴더
        for timestamp in tqdm(os.listdir(ts_group)):
            timestamp_path = os.path.join(ts_group, timestamp)
            ts_to_date = datetime.strftime(datetime.fromtimestamp(int(timestamp)), '%Y-%m-%d')

            for target in target_list:
                target_path = os.path.join(timestamp_path, target)
                target_dirlist = os.listdir(target_path)

                if len(target_dirlist) == 0:
                    continue
                
                # 타겟 파일 폴더
                timestamp_df = None
                ts_list = []
                date_list = []
                user_list = []
                for t_file in target_dirlist:
                    t_sample = pd.read_csv(os.path.join(target_path, t_file))
                    ts_list.extend([t_file.replace('.csv','')]*len(t_sample))
                    date_list.extend([ts_to_date]*len(t_sample))
                    user_list.extend([user]*len(t_sample))

                    if timestamp_df is None:
                        timestamp_df = t_sample
                    else:
                        timestamp_df = pd.concat([timestamp_df, t_sample])

                timestamp_df['timestamp_head'] = ts_list
                timestamp_df['user'] = user_list
                timestamp_df['date'] = date_list
                timestamp_df['timestamp_sum'] = timestamp_df['timestamp_head'].apply(int)+timestamp_df['timestamp']
                timestamp_df = timestamp_df[['user', 'timestamp_head', 'timestamp_sum', 'date']+list(timestamp_df.columns)[:-4]]

                # 타임스탬프당 하나씩 생성
                table = pa.Table.from_pandas(timestamp_df)
                save_dir = os.path.join(timestamp_path, timestamp+'_'+target+'.parquet')
                pq.write_table(table, save_dir)

# 유저별 데이터 통합

In [None]:
paths = [x for x in os.listdir(train_datadir) if 'user' in x and '-' in x]

target_list = ['e4Hr', 'mAcc', 'mGps']
timestamps = []
for path in paths:
    user_group =os.path.join(train_datadir, path)
    
    for user in [x for x in os.listdir(user_group) if 'parquet' not in x]:
        print('='*10 + user + '='*10)
        ts_group = os.path.join(user_group, user)
        for target in target_list:
            user_total = None
            for timestamp in tqdm(os.listdir(ts_group)):
                t_file = timestamp + '_' +target+'.parquet'
                if not os.path.isfile(os.path.join(ts_group, timestamp, t_file)):
                    continue

                timestamp_df = pd.read_parquet(os.path.join(ts_group, timestamp, t_file))
                if user_total is None:
                    user_total = timestamp_df
                else:
                    user_total = pd.concat([user_total, timestamp_df])

            # parquet으로 저장
            table = pa.Table.from_pandas(user_total)
            save_dir = os.path.join(data_dir, target, user+'_'+target+'_v1.parquet')
            pq.write_table(table, save_dir)

# 유저 모아서 하나의 파일로 통합

In [None]:
targets = ['mGps', 'e4Hr'] # mAcc는 메모리 오류 발생
timestamps = []
version = 'v1'

for target in targets:
    print('='*10 + target + '='*10)
    target_path = os.path.join(data_dir, target)
    all_total = None
    for user_data in tqdm([x for x in os.listdir(target_path) if version in x]):
        user_df = pd.read_parquet(os.path.join(target_path, user_data))
        if all_total is None:
            all_total = user_df
        else:
            all_total = pd.concat([all_total, user_df], axis=0)

    table = pa.Table.from_pandas(all_total)
    save_dir = os.path.join(data_dir, target+'_'+version+'.parquet')
    pq.write_table(table, save_dir)

# Heatrate 전처리
## 1) 분 단위 샘플링

In [27]:
hr = pd.read_parquet(os.path.join(data_dir, 'e4Hr_v1.parquet'))
hr['datetime'] = hr['timestamp_sum'].apply(datetime.fromtimestamp).apply(str)
hr['datetime'] = [':'.join(x.split(':')[:-1]) for x in hr['datetime']]
hr = hr.groupby(by=['user','datetime']).apply(lambda x:x.iloc[0]).reset_index(drop=True)
hr.columns.name=None

# 데이터 저장하기
table = pa.Table.from_pandas(hr)
save_dir = os.path.join(data_dir, 'e4Hr_v2.parquet')
pq.write_table(table, save_dir)

## 2) 일(day) 단위 그룹핑

In [58]:
hr = pd.read_parquet(os.path.join(data_dir, 'e4Hr_v2.parquet'))
hr['date'] = hr['datetime'].astype(str).apply(lambda x:x.split(" ")[0])
hr_pt = pd.pivot_table(hr,
                        values=['hr'],
                        index=['user','date'],
                        aggfunc={'hr':list})
hr_pt['hr'].apply(np.array)
hr_pt.reset_index(inplace=True)
hr_pt.head()

# parquet으로 저장
print('Saving..')
table = pa.Table.from_pandas(hr_pt)
save_dir = os.path.join(data_dir, 'hr_day.parquet')
pq.write_table(table, save_dir)

Saving..


# GPS 전처리
## 1) 분 단위 샘플링

In [28]:
gps = pd.read_parquet(os.path.join(data_dir, 'mGps_v1.parquet'))
gps['datetime'] = gps['timestamp_sum'].apply(datetime.fromtimestamp).apply(str)
# h은 분단위로 샘플링
gps['second'] = [(int(x.split(':')[-1])%5)*5 for x in tqdm(gps['datetime'])]
gps['datetime'] = [':'.join(x.split(':')[:-1])+':'+('0'+str(y) if y<10 else str(y)) for x, y in tqdm(zip(gps['datetime'], gps['second']))]
gps = gps.groupby(by=['user','datetime']).apply(lambda x:x.iloc[0]).reset_index(drop=True)
gps.columns.name=None

# 데이터 저장하기
table = pa.Table.from_pandas(gps)
save_dir = os.path.join(data_dir, 'mGps_v2.parquet')
pq.write_table(table, save_dir)

100%|███████████████████████████████████████████████████████████████████| 5538301/5538301 [00:03<00:00, 1715855.45it/s]
5538301it [00:05, 1034578.24it/s]


## 2) 위도, 경도 데이터를 거리 데이터로 변경

In [60]:
gps = pd.read_parquet(os.path.join(data_dir, 'mGps_v2.parquet'))
gps['date'] =gps['datetime'].astype(str).apply(lambda x:x.split(" ")[0])


def measure(lat1, lon1, lat2, lon2):
    R = 6378.137  # Radius of earth in KM
    dLat = lat2 * math.pi / 180 - lat1 * math.pi / 180
    dLon = lon2 * math.pi / 180 - lon1 * math.pi / 180
    a = math.sin(dLat / 2) * math.sin(dLat / 2) + \
        math.cos(lat1 * math.pi / 180) * math.cos(lat2 * math.pi / 180) * \
        math.sin(dLon / 2) * math.sin(dLon / 2)
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    d = R * c
    return d * 1000  # meters

gps_distance = []
for (k1, k2), df in gps.groupby(['user', 'date']):
    lat = df['lat'].values
    lon = df['lon'].values
    gps_map = []
    for i in range(len(lat)-1):
        lat1, lon1 = lat[i], lon[i]
        lat2, lon2 = lat[i+1], lon[i+1]
        distance = measure(lat1, lon1, lat2, lon2)
        gps_map.append(distance)
    gps_map.insert(0, 0)
    gps_distance.append(gps_map)

# gps_distance
gps_result = sum(gps_distance, [])
gps['distance'] = gps_result

## 3) 일(day) 단위 그룹핑

In [61]:
gps_pt = pd.pivot_table(gps,
                        values=['lat', 'lon', 'distance'],
                        index=['user','date'],
                        aggfunc={'lat':list,
                                'lon':list,
                                'distance':sum,})
gps_pt['lat'].apply(np.array)
gps_pt['lon'].apply(np.array)
gps_pt.reset_index(inplace=True)

display(gps_pt)
# gps_pt['distance'] = [sum(x) for x in gps_pt['distance']]

# parquet으로 저장
print('Saving..')
table = pa.Table.from_pandas(gps_pt)
save_dir = os.path.join(data_dir, 'gps_day.parquet')
pq.write_table(table, save_dir)

Unnamed: 0,user,date,distance,lat,lon
0,user01,2020-08-30,40845.066277,"[37.5449524, 37.5448505, 37.5448253, 37.544725...","[127.0543909, 127.0544208, 127.0544846, 127.05..."
1,user01,2020-08-31,162968.472317,"[37.482426, 37.4824378, 37.4824378, 37.4824278...","[126.9562902, 126.9563047, 126.9563047, 126.95..."
2,user01,2020-09-05,73245.140240,"[37.4824361, 37.482437, 37.4824358, 37.4824369...","[126.9562791, 126.9562791, 126.9562818, 126.95..."
3,user01,2020-09-07,77638.140799,"[37.4783778, 37.4782302, 37.4783796, 37.478275...","[126.9589058, 126.9588167, 126.9588194, 126.95..."
4,user01,2020-09-08,42525.744187,"[37.4824293, 37.4824099, 37.4824084, 37.482417...","[126.9562864, 126.9562735, 126.9562748, 126.95..."
...,...,...,...,...,...
529,user30,2020-09-23,19653.356680,"[37.4870101, 37.4869945, 37.4870033, 37.486874...","[126.8534891, 126.8534442, 126.8534882, 126.85..."
530,user30,2020-09-24,22787.895150,"[37.4873852, 37.4873713, 37.4873286, 37.487309...","[126.8535275, 126.8535532, 126.8535211, 126.85..."
531,user30,2020-09-25,37377.087670,"[37.4869972, 37.486993, 37.4869952, 37.4870105...","[126.8535627, 126.8535065, 126.8535046, 126.85..."
532,user30,2020-09-26,1886.150746,"[37.4869938, 37.4869938, 37.4869949, 37.487002...","[126.8534891, 126.8534891, 126.853489, 126.853..."


Saving..


# Activity 전처리
## 1) 유저 별 통합

In [None]:
# Activity 전처리
paths = [x for x in os.listdir(data_dir) if 'user' in x and '-' in x]

user_label = None
print('Get labels from each user...')
# 유저 그룹 폴더
for path in paths:
    user_group =os.path.join(path)
    
    # 개별 유저 폴더
    for user in os.listdir(user_group):
        print('='*10 + user + '='*10)
        ts_group = os.path.join(user_group, user)
        
        # 타임스탬프 폴더
        for timestamp in tqdm(os.listdir(ts_group)):
            labels = pd.read_csv(os.path.join(ts_group, timestamp, timestamp+'_label.csv'))
            user_list = [user]*len(labels)
            datetime_list = [':'.join(str(datetime.fromtimestamp(x)).split(':')[:-1]) for x in labels['ts']]
            
            labels['user'] = user_list
            labels['datetime'] = datetime_list
            
            labels = labels[['user','datetime', 'activity']]
            
            if user_label is None:
                user_label = labels
            else:
                user_label = pd.concat([user_label, labels], axis=0)
                
    del user_group, ts_group, user_list, datetime_list, labels
    gc.collect()
    
print('v1 Saving...')
table = pa.Table.from_pandas(user_label)
save_dir = os.path.join(data_dir, 'labels_v1.parquet')
pq.write_table(table, save_dir)

# 유저 라벨 리스트화
print('Conver to list...')
label_pt = pd.pivot_table(user_label,
                        values=['activity'],
                        index=['user','datetime'],
                        aggfunc={'activity':list,})
label_pt.reset_index(inplace=True)

                
# 유저 통합 라벨 생성
print('v2 Saving...')
table = pa.Table.from_pandas(label_pt)
save_dir = os.path.join(data_dir, 'activity_v2.parquet')
pq.write_table(table, save_dir)

label_pt.head()

## 2) Activity가 2개 이상인 경우
 - 빈도 수가 적은 값으로 대체

In [None]:
def select_activity(x):
    for k in x:
        if k not in activity_count.keys():
            return k
    
    if activity_count[x[0]] < activity_count[x[1]]:
        return x[0]
    else:
        return x[1]

label_pt = pd.read_parquet(os.path.join(data_dir, 'activity_v2.parquet'))
label_pt['activity'] = [select_activity(x) if len(x)==2 else x[0] for x in label_pt['activity']]

# 유저 통합 라벨 생성
print('v3 Saving...')
table = pa.Table.from_pandas(label_pt)
save_dir = os.path.join(data_dir, 'activity_v3.parquet')
pq.write_table(table, save_dir)

## 3) 일(day 단위로 합치기)

In [62]:
act = pd.read_parquet(os.path.join(data_dir, 'activity_v3.parquet'))
act['date'] =act['datetime'].astype(str).apply(lambda x:x.split(" ")[0])
act_pt = pd.pivot_table(act,
                        values=['activity'],
                        index=['user','date'],
                        aggfunc={'activity':list,})
act_pt['activity'].apply(np.array)
act_pt.reset_index(inplace=True)

# parquet으로 저장
print('Saving..')
table = pa.Table.from_pandas(act_pt)
save_dir = os.path.join(data_dir, 'act_day.parquet')
pq.write_table(table, save_dir)

Saving..


## 4) 비율로 변환

In [81]:
act_df = pd.read_parquet(os.path.join(data_dir,'act_day.parquet'))
#<activity 전처리>
act_df['len']=act_df['activity'].apply(len)
# 하루 중 해당 activity의 비율
for i in range(0,7):
    act_df['act'+str(i)]=[list(x).count(i) for x in act_df['activity']]/act_df['len']

act_df = act_df.drop(columns=['activity', 'len'])

table = pa.Table.from_pandas(act_df)
save_dir = os.path.join(data_dir, 'act_day.parquet')
pq.write_table(table, save_dir)

# Accelerator 전처리
## 1) 초 단위 샘플링

In [None]:
user_list = [x for x in os.listdir(acc_dir) if 'v1' in x]

for user in tqdm(user_list):
    user_data = pd.read_parquet(os.path.join(acc_dir, user))
    user_nm = str(user).split('_')[0]
    
    user_data['datetime'] = [datetime.fromtimestamp(x) for x in user_data['timestamp_sum']]
    user_data['datetime'] = user_data['datetime'].apply(str)
    user_data['datetime'] = [x.split('.')[0] for x in user_data['datetime']]
    
    user_data = user_data.groupby('datetime').apply(lambda x:x.iloc[0]).reset_index(drop=True)
    
    table = pa.Table.from_pandas(user_data)
    save_dir = os.path.join(acc_dir, user_nm+'_mAcc_v2.parquet')
    pq.write_table(table, save_dir)
    
    del user_data, user_nm, table, save_dir
    gc.collect()

## 2) median 필터 및 버터워스 필터 적용

In [31]:
def median(signal):
    array=np.array(signal)   
    med_filtered=sp.signal.medfilt(array, kernel_size=3)
    return  med_filtered

In [32]:
sampling_freq = 30
nyq=sampling_freq/float(2)
freq1 = 0.3
freq2 = 20

# Function name: components_selection_one_signal

# Inputs: t_signal:1D numpy array (time domain signal); 

# Outputs: (total_component,t_DC_component , t_body_component, t_noise) 
#           type(1D array,1D array, 1D array)

# cases to discuss: if the t_signal is an acceleration signal then the t_DC_component is the gravity component [Grav_acc]
#                   if the t_signal is a gyro signal then the t_DC_component is not useful
# t_noise component is not useful
# if the t_signal is an acceleration signal then the t_body_component is the body's acceleration component [Body_acc]
# if the t_signal is a gyro signal then the t_body_component is the body's angular velocity component [Body_gyro]

def components_selection_one_signal(t_signal,freq1,freq2):
    t_signal=np.array(t_signal)
    t_signal_length=len(t_signal)
    f_signal=fft(t_signal)
    freqs=np.array(sp.fftpack.fftfreq(t_signal_length, d=1/float(sampling_freq)))# frequency values between [-25hz:+25hz]
    
    # DC_component: f_signal values having freq between [-0.3 hz to 0 hz] and from [0 hz to 0.3hz] 
    #                                                             (-0.3 and 0.3 are included)
    
    # noise components: f_signal values having freq between [-25 hz to 20 hz[ and from ] 20 hz to 25 hz] 
    #                                                               (-25 and 25 hz inculded 20hz and -20hz not included)
    
    # selecting body_component: f_signal values having freq between [-20 hz to -0.3 hz] and from [0.3 hz to 20 hz] 
    #                                                               (-0.3 and 0.3 not included , -20hz and 20 hz included)
    
    
    f_DC_signal=[] # DC_component in freq domain
    f_body_signal=[] # body component in freq domain numpy.append(a, a[0])
    f_noise_signal=[] # noise in freq domain
    
    for i in range(len(freqs)):# iterate over all available frequencies
        
        # selecting the frequency value
        freq=freqs[i]
        
        # selecting the f_signal value associated to freq
        value= f_signal[i]
        
        # Selecting DC_component values 
        if abs(freq)>0.3:# testing if freq is outside DC_component frequency ranges
            f_DC_signal.append(float(0)) # add 0 to  the  list if it was the case (the value should not be added)                                       
        else: # if freq is inside DC_component frequency ranges 
            f_DC_signal.append(value) # add f_signal value to f_DC_signal list
    
        # Selecting noise component values 
        if (abs(freq)<=20):# testing if freq is outside noise frequency ranges 
            f_noise_signal.append(float(0)) # # add 0 to  f_noise_signal list if it was the case 
        else:# if freq is inside noise frequency ranges 
            f_noise_signal.append(value) # add f_signal value to f_noise_signal

        # Selecting body_component values 
        if (abs(freq)<=0.3 or abs(freq)>20):# testing if freq is outside Body_component frequency ranges
            f_body_signal.append(float(0))# add 0 to  f_body_signal list
        else:# if freq is inside Body_component frequency ranges
            f_body_signal.append(value) # add f_signal value to f_body_signal list
    
    ################### Inverse the transformation of signals in freq domain ########################
    # applying the inverse fft(ifft) to signals in freq domain and put them in float format
    t_DC_component= ifft(np.array(f_DC_signal)).real
    t_body_component= ifft(np.array(f_body_signal)).real
    t_noise=ifft(np.array(f_noise_signal)).real
    
    total_component=t_signal-t_noise # extracting the total component(filtered from noise) 
                                     #  by substracting noise from t_signal (the original signal).
    
    # return outputs mentioned earlier
    return (total_component,t_DC_component,t_body_component,t_noise) 

In [None]:
user_list = [x for x in os.listdir(acc_dir) if 'v2' in x]

for user in tqdm(user_list):
    user_data = pd.read_parquet(os.path.join(acc_dir, user))
    user_nm = str(user).split('_')[0]
    
    # median filter 적용
    user_data['x'] = median(user_data['x'])
    user_data['y'] = median(user_data['y'])
    user_data['z'] = median(user_data['z'])
    
    user_data.rename(columns={'x':'x_med', 'y':'y_med', 'z':'z_med'}, inplace=True)
    
    # 성분 분해
    (_,t_DC_component,t_body_component,_) = components_selection_one_signal(user_data['x_med'], freq1, freq2)
    user_data['x_gravity'] = t_DC_component
    user_data['x_body'] = t_body_component
    (_,t_DC_component,t_body_component,_) = components_selection_one_signal(user_data['y_med'], freq1, freq2)
    user_data['y_gravity'] = t_DC_component
    user_data['y_body'] = t_body_component
    (_,t_DC_component,t_body_component,_) = components_selection_one_signal(user_data['z_med'], freq1, freq2)
    user_data['z_gravity'] = t_DC_component
    user_data['z_body'] = t_body_component
    
    table = pa.Table.from_pandas(user_data)
    save_dir = os.path.join(acc_dir, user_nm+'_mAcc_v3.parquet')
    pq.write_table(table, save_dir)
    
    del user_data, user_nm, t_DC_component, t_body_component, table, save_dir
    gc.collect()

## 3) 일(day 단위로 합치기)

In [37]:
user_list = [x for x in os.listdir(acc_dir) if 'v3' in x]

for user in user_list:
    user_nm = user.split('_')[0]
    print('='*10+user_nm+'='*10)
    user_data = pd.read_parquet(os.path.join(acc_dir, user))
    user_data['date'] = user_data['datetime'].astype(str).apply(lambda x:x.split(" ")[0])
    user_pt = pd.pivot_table(user_data,
                            values=['x_med','y_med', 'z_med', 
                                    'x_gravity', 'x_body', 'y_gravity', 'y_body', 'z_gravity', 'z_body', ],
                            index=['user','date'],
                            aggfunc={'x_med':list,
                                    'y_med':list,
                                    'z_med':list,
                                    'x_gravity':list,
                                    'x_body':list,
                                    'y_gravity':list,
                                    'y_body':list,
                                    'z_gravity':list,
                                    'z_body':list,})
    user_pt['x_med'].apply(np.array)
    user_pt['y_med'].apply(np.array)
    user_pt['z_med'].apply(np.array)
    user_pt['x_gravity'].apply(np.array)
    user_pt['y_gravity'].apply(np.array)
    user_pt['z_gravity'].apply(np.array)
    user_pt['x_body'].apply(np.array)
    user_pt['y_body'].apply(np.array)
    user_pt['z_body'].apply(np.array)
    user_pt.reset_index(inplace=True)

    # parquet으로 저장
    print('Saving..')
    table = pa.Table.from_pandas(user_pt)
    save_dir = os.path.join(data_dir, 'mAcc', user_nm+'_acc_day.parquet')
    pq.write_table(table, save_dir)

Saving..
Saving..
Saving..
Saving..
Saving..
Saving..
Saving..
Saving..
Saving..
Saving..
Saving..
Saving..
Saving..
Saving..
Saving..
Saving..
Saving..
Saving..
Saving..
Saving..
Saving..
Saving..


## 4) Accelerator 파일 하나로 합치기

In [41]:
# Acc 통합 파일 생성
user_list = [x.split('_')[0] for x in os.listdir(acc_dir) if 'user' in x and 'day' in x]

acc = None
for user in user_list:
    user_acc = pd.read_parquet(os.path.join(acc_dir, user+'_acc_day.parquet'))
    if acc is None:
        acc = user_acc
    else:
        acc = pd.concat([acc, user_acc], axis=0)
        
table = pa.Table.from_pandas(acc)
save_dir = os.path.join(data_dir, 'acc_day.parquet')
pq.write_table(table, save_dir)

# train 유효한 유저-날짜 구하기
 - heatrate, accelerator, activity가 모두 있는 데이터를 사용

In [66]:
# Accelerator
acc_date = pd.read_parquet(os.path.join(data_dir, 'acc_day.parquet'))       
acc_date = acc_date.groupby(['user', 'date'])['x_med'].count().reset_index()[['user', 'date']]
acc_date.rename(columns={'date':'date'}, inplace=True)

# Heartrate
hr_df = pd.read_parquet(os.path.join(data_dir, 'hr_day.parquet'))
hr_df['datetime'] = hr_df['date'].apply(lambda x: x.split()[0])
hr_date = hr_df.groupby(['user', 'date']).count().reset_index()[['user', 'date']]

# GPS - gps 데이터 제외
# gps_df = pd.read_parquet(os.path.join(data_dir, 'gps_day.parquet'))
# gps_date = gps_df.groupby(['user', 'date'])['timestamp'].count().reset_index()[['user', 'date']]
# gps_date.rename(columns={'date':'datetime'}, inplace=True)

# Activity
activity_df = pd.read_parquet(os.path.join(data_dir, 'act_day.parquet'))
activity_df['date'] = activity_df['date'].apply(lambda x:x.split()[0])
activity_date = activity_df.groupby(['user', 'date']).count().reset_index()[['user', 'date']]

# train 레이블
label_valid_date = pd.read_csv(os.path.join(label_dir, 'train_label.csv'), index_col=[0])
label_valid_date = label_valid_date[['subject_id', 'date']]
label_valid_date.rename(columns={'subject_id':'user'}, inplace=True)

# 유효한 날짜 구하기
valid_date = reduce(lambda x,y: pd.merge(x, y, on=['user', 'date'], how='inner'),[acc_date, hr_date, activity_date, label_valid_date])

valid_date.to_csv(os.path.join(data_dir, 'train_valid_date.csv'), index=False)

## 데이터 최종 정리

In [88]:
valid_date = pd.read_csv(os.path.join(data_dir, 'train_valid_date.csv'))
acc_df = pd.read_parquet(os.path.join(data_dir, 'acc_day.parquet'))
hr_df = pd.read_parquet(os.path.join(data_dir, 'hr_day.parquet'))
act_df = pd.read_parquet(os.path.join(data_dir, 'act_day.parquet'))
gps_df = pd.read_parquet(os.path.join(data_dir, 'gps_day.parquet'))

acc_df = acc_df[['user', 'date', 'x_med',
       'y_med', 'z_med', 'x_gravity', 'x_body', 'y_gravity',
       'y_body', 'z_gravity', 'z_body']]

# hr_df['date'] = hr_df['datetime'].apply(lambda x: x.split()[0])
# act_df['date'] = act_df['datetime'].apply(lambda x: x.split()[0])
# gps_df = gps_df[['user','timestamp_sum', 'date', 'lat', 'lon', 'accuracy']]

# 유효한 날짜만 남기기
valid_date['user-date'] = [str(x) + '_' + str(y) for x, y in zip(valid_date['user'], valid_date['date'])]
acc_df['user-date'] = [str(x) + '_' + str(y) for x, y in zip(acc_df['user'], acc_df['date'])]
hr_df['user-date'] = [str(x) + '_' + str(y) for x, y in zip(hr_df['user'], hr_df['date'])]
act_df['user-date'] = [str(x) + '_' + str(y) for x, y in zip(act_df['user'], act_df['date'])]
gps_df['user-date'] = [str(x) + '_' + str(y) for x, y in zip(gps_df['user'], gps_df['date'])]

# valid_date에 해당하는 데이터만 추출
acc_valid = acc_df[acc_df['user-date'].isin(valid_date['user-date'])]
hr_valid = hr_df[hr_df['user-date'].isin(valid_date['user-date'])]
act_valid = act_df[act_df['user-date'].isin(valid_date['user-date'])]
gps_valid = gps_df[gps_df['user-date'].isin(valid_date['user-date'])]

acc_valid.drop(columns=['user-date'], inplace=True)
hr_valid.drop(columns=['user-date'], inplace=True)
act_valid.drop(columns=['user-date'], inplace=True)
gps_valid.drop(columns=['user-date'], inplace=True)

# valid 데이터 저장
pq.write_table(pa.Table.from_pandas(acc_valid), os.path.join(data_dir, 'acc_final.parquet'))
pq.write_table(pa.Table.from_pandas(hr_valid), os.path.join(data_dir, 'hr_final.parquet'))
pq.write_table(pa.Table.from_pandas(act_valid), os.path.join(data_dir, 'act_final.parquet'))
pq.write_table(pa.Table.from_pandas(gps_valid), os.path.join(data_dir, 'gps_final.parquet'))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  acc_valid.drop(columns=['user-date'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hr_valid.drop(columns=['user-date'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  act_valid.drop(columns=['user-date'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gps_valid.dro