In [15]:
import pandas as pd
import numpy as np
import ast


In [2]:
""" mActivity 데이터를 하루 단위로 요약한 후 활동 비율 feature를 반환"""
def process_m_activity(file_path: str) -> pd.DataFrame:
    """Summarise mActivity records into per-day activity-share features.

    Reads the parquet file at ``file_path``, counts how often each
    ``m_activity`` value occurs per ``(subject_id, date)`` and converts those
    counts into row-wise fractions.

    Args:
        file_path: Path to a parquet file providing the ``subject_id``,
            ``timestamp`` and ``m_activity`` columns.

    Returns:
        One row per ``(subject_id, date)``: the two key columns followed by an
        ``activity_ratio_<value>`` column for every activity value present in
        the data. The ratio columns of each row sum to 1.
    """
    records = pd.read_parquet(file_path)
    # Only the calendar day is needed as a grouping key, so strip the time part.
    records['timestamp'] = pd.to_datetime(records['timestamp'])
    records['date'] = records['timestamp'].dt.date

    # Occurrences per (subject, day, activity). unstack() pivots the activity
    # values into columns; combinations that never occurred are filled with 0.
    per_activity = records.groupby(['subject_id', 'date', 'm_activity']).size()
    counts_wide = per_activity.unstack(fill_value=0)

    # Normalise every row by its own total: shares are comparable across days
    # with different numbers of samples, raw counts are not.
    day_totals = counts_wide.sum(axis=1)
    shares = counts_wide.div(day_totals, axis=0)

    # Prefix the activity columns while the keys still live in the index, then
    # move the keys back out as regular columns.
    shares.columns = [f'activity_ratio_{label}' for label in shares.columns]
    return shares.reset_index()

In [3]:
# Build the per-day activity-ratio features and preview the first rows.
activity_df = process_m_activity(file_path="../data/ch2025_mActivity.parquet")
activity_df.head()

Unnamed: 0,subject_id,date,activity_ratio_0,activity_ratio_1,activity_ratio_3,activity_ratio_4,activity_ratio_7,activity_ratio_8
0,id01,2024-06-26,0.125176,0.001406,0.672293,0.157525,0.043601,0.0
1,id01,2024-06-27,0.146528,0.0,0.611111,0.220833,0.021528,0.0
2,id01,2024-06-28,0.111806,0.000694,0.861806,0.000694,0.025,0.0
3,id01,2024-06-29,0.065972,0.0,0.916667,0.0,0.017361,0.0
4,id01,2024-06-30,0.138194,0.0,0.853472,0.0,0.008333,0.0


In [4]:
""" mLight 데이터를 하루 단위로 요약"""
def process_m_light(file_path: str) -> pd.DataFrame:
    """Summarise mLight readings into one row per subject and day.

    Args:
        file_path: Path to a parquet file providing the ``subject_id``,
            ``timestamp`` and ``m_light`` columns.

    Returns:
        A frame with the columns ``subject_id``, ``date``, ``light_mean``,
        ``light_std``, ``light_min``, ``light_max`` and ``dark_ratio``, where
        ``dark_ratio`` is the fraction of that day's readings whose
        ``m_light`` equals 0.
    """
    readings = pd.read_parquet(file_path)
    readings['timestamp'] = pd.to_datetime(readings['timestamp'])
    readings['date'] = readings['timestamp'].dt.date

    # A reading of exactly 0 is treated as a dark environment; stored as 0/1 so
    # that its per-day mean can be read as a ratio.
    readings['is_dark'] = (readings['m_light'] == 0).astype(int)

    per_day = readings.groupby(['subject_id', 'date'])
    # Named aggregation yields the final flat column names directly.
    daily_summary = per_day.agg(
        light_mean=('m_light', 'mean'),
        light_std=('m_light', 'std'),
        light_min=('m_light', 'min'),
        light_max=('m_light', 'max'),
        dark_ratio=('is_dark', 'mean'),
    )
    return daily_summary.reset_index()

In [5]:
# Build the per-day ambient-light summary and preview the first rows.
m_light_df = process_m_light(file_path="../data/ch2025_mLight.parquet")
m_light_df.head()

Unnamed: 0,subject_id,date,light_mean,light_std,light_min,light_max,dark_ratio
0,id01,2024-06-26,364.506849,395.65944,0.0,1886.0,0.136986
1,id01,2024-06-27,332.069444,1300.535681,0.0,11248.0,0.326389
2,id01,2024-06-28,219.201389,260.6829,0.0,1834.0,0.305556
3,id01,2024-06-29,91.416667,312.065205,0.0,3498.0,0.347222
4,id01,2024-06-30,98.909722,300.448148,0.0,2691.0,0.416667


In [23]:
""" wHr 데이터를 하루 단위로 요약"""
def process_w_hr(file_path: str) -> pd.DataFrame:
    """Summarise wHr heart-rate records into one row per subject and day.

    Each input record carries a sequence of heart-rate samples in its
    ``heart_rate`` cell. The function first reduces every record to six
    statistics and then averages those per-record statistics over each
    ``(subject_id, date)`` group. The daily columns are therefore means of
    per-record values (e.g. ``hr_min`` is the average of the record minima,
    not the minimum over the whole day).

    Args:
        file_path: Path to a parquet file providing the ``subject_id``,
            ``timestamp`` and ``heart_rate`` columns.

    Returns:
        A frame with the columns ``subject_id``, ``date``, ``hr_mean``,
        ``hr_std``, ``hr_min``, ``hr_max``, ``hr_q1``, ``hr_q3`` and
        ``hr_iqr`` (``hr_q3 - hr_q1``). An empty input yields an empty frame
        with these same columns.
    """
    stat_cols = ['hr_mean', 'hr_std', 'hr_min', 'hr_max', 'hr_q1', 'hr_q3']

    def _record_stats(samples):
        """Reduce one record's samples to a tuple ordered like ``stat_cols``."""
        # Array-valued cells are normalised to lists first.
        if isinstance(samples, np.ndarray):
            samples = samples.tolist()
        # Missing, empty or non-list cells contribute NaN; the daily mean
        # below skips NaN.
        if not isinstance(samples, list) or len(samples) == 0:
            return (np.nan,) * len(stat_cols)
        q1, q3 = np.percentile(samples, [25, 75])
        # np.std uses its default ddof=0 (population standard deviation).
        return (np.mean(samples), np.std(samples), np.min(samples), np.max(samples), q1, q3)

    df = pd.read_parquet(file_path)
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['date'] = df['timestamp'].dt.date

    # Build the statistics frame in one go from plain tuples. Returning a
    # pd.Series from Series.apply allocates one Series per row (slow), and on
    # an empty input it yields an empty Series without the hr_* columns.
    stats_df = pd.DataFrame(
        [_record_stats(samples) for samples in df['heart_rate']],
        index=df.index,
        columns=stat_cols,
        dtype=float,
    )
    per_record = pd.concat([df[['subject_id', 'date']], stats_df], axis=1)

    # Collapse to one row per day by averaging the per-record statistics.
    daily_summary = per_record.groupby(['subject_id', 'date'])[stat_cols].mean().reset_index()
    daily_summary['hr_iqr'] = daily_summary['hr_q3'] - daily_summary['hr_q1']

    return daily_summary

In [24]:
# Build the per-day heart-rate summary and preview the first rows.
w_hr_df = process_w_hr(file_path="../data/ch2025_wHr.parquet")
w_hr_df.head()

Unnamed: 0,subject_id,date,hr_mean,hr_std,hr_min,hr_max,hr_q1,hr_q3,hr_iqr
0,id01,2024-06-26,81.40295,3.491878,75.164151,88.009434,78.798113,84.045283,5.24717
1,id01,2024-06-27,81.074421,3.986584,75.056575,87.237003,78.732798,83.441896,4.709098
2,id01,2024-06-28,80.164122,3.450684,74.633427,85.941011,77.826896,82.411166,4.58427
3,id01,2024-06-29,80.4,1.855499,77.0,82.0,80.375,81.375,1.0
4,id01,2024-06-30,89.986993,3.973682,83.143529,96.898824,87.138824,92.825882,5.687059
