In [1]:
import pandas as pd
import os
from typing import Optional

In [2]:
# Root folder: expected to contain one sub-folder per participant.
BASE_DIRECTORY = 'pmdata'

# Name of the CSV file written at the end of the pipeline.
OUTPUT_FILENAME = 'PMData0.csv'

# Participant IDs are the sub-directories of the root folder whose name
# starts with 'p'; participant 'p12' is deliberately left out.
try:
    all_dirs = [
        entry
        for entry in os.listdir(BASE_DIRECTORY)
        if os.path.isdir(os.path.join(BASE_DIRECTORY, entry))
    ]
except FileNotFoundError:
    # Without the root folder there is nothing to process; keep the list
    # empty so later cells can detect that and stop.
    print(f"Error: Directory '{BASE_DIRECTORY}' not found. Please ensure you are running the script in the correct folder.")
    PARTICIPANT_IDS = []
else:
    PARTICIPANT_IDS = sorted(
        name for name in all_dirs if name.startswith('p') and name != 'p12'
    )
    print(f"Participants found for processing: {PARTICIPANT_IDS}")

Participants found for processing: ['p01', 'p02', 'p03', 'p04', 'p05', 'p06', 'p07', 'p08', 'p09', 'p10', 'p11', 'p13', 'p14', 'p15', 'p16']


In [3]:
# process json files
def process_activity_json(file_path: str, column_name: str) -> Optional[pd.DataFrame]:
    """Aggregate one activity JSON file into a single total per calendar day.

    The file is read with ``pd.read_json`` and must provide a ``dateTime``
    and a ``value`` column. All values that share a calendar date are summed.

    Args:
        file_path: Path of the JSON file to read.
        column_name: Name given to the column of daily totals in the result.

    Returns:
        A DataFrame with a datetime ``date`` column and a ``column_name``
        column (one row per day), or ``None`` when the file does not exist
        or does not contain the required columns (e.g. an empty JSON array).
    """
    if not os.path.exists(file_path):
        print(f"Warning: File {file_path} not found.")
        return None

    df = pd.read_json(file_path)

    # An empty or malformed export has no 'dateTime'/'value' columns. Report
    # it and skip the file (as process_sleep_csv does) instead of aborting
    # the whole run with a KeyError.
    required_columns = ('dateTime', 'value')
    if not all(col in df.columns for col in required_columns):
        print(f"Error: All required columns are not present in file {file_path}.")
        return None

    df['dateTime'] = pd.to_datetime(df['dateTime'])
    # Reduce each timestamp to its calendar date so intraday samples collapse
    # into one group per day.
    df['date'] = df['dateTime'].dt.date
    df['value'] = pd.to_numeric(df['value'])

    daily_data = (
        df.groupby('date')['value']
        .sum()
        .reset_index()
        .rename(columns={'value': column_name})
    )
    # Convert the date objects back to datetime64 so every per-file frame
    # shares the same key dtype when merged on 'date'.
    daily_data['date'] = pd.to_datetime(daily_data['date'])
    return daily_data

In [4]:
# process sleep csv files
def process_sleep_csv(file_path: str) -> Optional[pd.DataFrame]:
    """Load the sleep CSV and key its selected metrics by calendar date.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A DataFrame holding ``duration_score``, ``deep_sleep_in_minutes``,
        ``resting_heart_rate``, ``restlessness`` and a datetime ``date``
        column derived from ``timestamp``; ``None`` when the file does not
        exist or any of the required columns is absent.
    """
    if not os.path.exists(file_path):
        print(f"Warning: File {file_path} not found.")
        return None

    required = ['timestamp', 'duration_score', 'deep_sleep_in_minutes', 'resting_heart_rate', 'restlessness']
    raw = pd.read_csv(file_path)

    # Refuse files that lack any of the columns used below.
    absent = [name for name in required if name not in raw.columns]
    if absent:
        print(f"Error: All required columns are not present in file {file_path}.")
        return None

    # Keep only the calendar date of each timestamp; the time of day is dropped.
    calendar_dates = pd.to_datetime(raw['timestamp']).dt.date

    metrics = raw[required].drop(columns=['timestamp'])
    # Stored as datetime64 (not date objects) so it can be merged on 'date'.
    metrics['date'] = pd.to_datetime(calendar_dates)
    return metrics

In [5]:
def process_participant_data(participant_id: str, base_dir: str) -> Optional[pd.DataFrame]:
    """Build one per-day table for a participant from their 'fitbit' folder.

    Five activity JSON files and the sleep CSV are each loaded into a
    per-day frame; the frames that could be loaded are outer-merged on
    ``date`` and tagged with the participant ID.

    Args:
        participant_id: Name of the participant's folder inside ``base_dir``.
        base_dir: Root folder holding the participant folders.

    Returns:
        The merged DataFrame with an added ``item_id`` column, or ``None``
        when the 'fitbit' folder is missing, fewer than two sources could be
        loaded, or the merged result is empty.
    """
    print(f"--- Starting processing for participant: {participant_id} ---")

    # All source files live in <base_dir>/<participant_id>/fitbit
    fitbit_dir = os.path.join(base_dir, participant_id, 'fitbit')
    if not os.path.isdir(fitbit_dir):
        print(f"Warning: Fitbit folder for {participant_id} not found at '{fitbit_dir}'.")
        return None

    # (file name, output column) for every activity source, in load order
    activity_sources = (
        ('steps.json', 'steps'),
        ('sedentary_minutes.json', 'sedentary'),
        ('lightly_active_minutes.json', 'LPA'),
        ('moderately_active_minutes.json', 'MPA'),
        ('very_active_minutes.json', 'VPA'),
    )
    frames = [
        process_activity_json(os.path.join(fitbit_dir, file_name), column)
        for file_name, column in activity_sources
    ]
    frames.append(process_sleep_csv(os.path.join(fitbit_dir, 'sleep_score.csv')))

    # Sources that could not be loaded come back as None and are skipped.
    available = [frame for frame in frames if frame is not None]
    if len(available) < 2:
        print(f"Warning: Not enough data found for merging for participant {participant_id}.")
        return None

    # Outer joins keep days that appear in only some of the sources.
    combined, *remaining = available
    for frame in remaining:
        combined = combined.merge(frame, on='date', how='outer')

    if combined.empty:
        print(f"Warning: No data remaining after merging for participant {participant_id}.")
        return None

    combined['item_id'] = participant_id
    return combined

In [6]:
def main():
    """Run the whole pipeline and write the combined dataset to disk.

    Every participant in ``PARTICIPANT_IDS`` is processed with
    ``process_participant_data``; the resulting frames are concatenated,
    a ``sleep_duration`` column is derived, the columns are put in a fixed
    order, the rows are sorted by participant and date, and the result is
    written to ``OUTPUT_FILENAME`` as CSV (without the index).

    Returns early, writing nothing, when there are no participants or when
    no participant produced any data.
    """
    if not PARTICIPANT_IDS:
        return

    all_participants_data = []
    for pid in PARTICIPANT_IDS:
        participant_df = process_participant_data(pid, BASE_DIRECTORY)
        if participant_df is not None:
            all_participants_data.append(participant_df)

    if not all_participants_data:
        print("No data found for processing. The program will terminate.")
        return

    final_df = pd.concat(all_participants_data, ignore_index=True)

    # Fixed output layout, including the derived 'sleep_duration' column.
    final_columns = [
        'item_id', 'date', 'steps', 'sedentary', 'LPA', 'MPA', 'VPA',
        'sleep_duration',
        'duration_score', 'deep_sleep_in_minutes', 'resting_heart_rate', 'restlessness'
    ]
    # Reindex BEFORE deriving 'sleep_duration': reindexing creates (as NaN)
    # any column that no participant provided, so the arithmetic below can
    # no longer fail with a KeyError when an activity source is missing
    # everywhere.
    final_df = final_df.reindex(columns=final_columns)
    final_df['date'] = pd.to_datetime(final_df['date'], format='%Y-%m-%d', errors='coerce')

    # 1440 is the number of minutes in a day; 'sleep_duration' is what is
    # left of the day after the four activity columns are subtracted.
    minutes_per_day = 1440
    final_df['sleep_duration'] = minutes_per_day - (
        final_df['VPA'] + final_df['MPA'] + final_df['LPA'] + final_df['sedentary']
    )

    # Final sorting by item_id and date
    final_df = final_df.sort_values(by=['item_id', 'date'])

    final_df.to_csv(OUTPUT_FILENAME, index=False)
    print("\n" + "="*55)
    print("Processing completed successfully!")
    print(f"Final file saved at: {os.path.abspath(OUTPUT_FILENAME)}")
    print(f"Total number of data rows: {len(final_df)}")
    print("="*55)

if __name__ == '__main__':
    main()

--- Starting processing for participant: p01 ---
--- Starting processing for participant: p02 ---
--- Starting processing for participant: p03 ---
--- Starting processing for participant: p04 ---
--- Starting processing for participant: p05 ---
--- Starting processing for participant: p06 ---
--- Starting processing for participant: p07 ---
--- Starting processing for participant: p08 ---
--- Starting processing for participant: p09 ---
--- Starting processing for participant: p10 ---
--- Starting processing for participant: p11 ---
--- Starting processing for participant: p13 ---
--- Starting processing for participant: p14 ---
--- Starting processing for participant: p15 ---
--- Starting processing for participant: p16 ---

Processing completed successfully!
Final file saved at: /Users/fatemeh/Thises/exprimentalResult/ActivityClassification/PMData_Analysis/Classification/PMData0.csv
Total number of data rows: 2250


In [7]:
# Reload the exported dataset from the file named by OUTPUT_FILENAME (rather
# than a second hard-coded copy of the name) so this cell always inspects the
# file the pipeline writes, even if the constant is changed.
df = pd.read_csv(OUTPUT_FILENAME)
df.head()

Unnamed: 0,item_id,date,steps,sedentary,LPA,MPA,VPA,sleep_duration,duration_score,deep_sleep_in_minutes,resting_heart_rate,restlessness
0,p01,2019-11-01,17873.0,636,245,58,72,429,38.0,27.0,53.0,0.053635
1,p01,2019-11-02,13118.0,709,202,43,56,430,37.0,36.0,53.0,0.078978
2,p01,2019-11-03,14312.0,648,262,63,45,422,38.0,38.0,52.0,0.093491
3,p01,2019-11-04,10970.0,760,230,23,28,399,38.0,37.0,53.0,0.064851
4,p01,2019-11-05,16186.0,737,263,22,56,362,36.0,26.0,54.0,0.093185


In [8]:
# General information about the data: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2250 entries, 0 to 2249
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   item_id                2250 non-null   object 
 1   date                   2250 non-null   object 
 2   steps                  2056 non-null   float64
 3   sedentary              2250 non-null   int64  
 4   LPA                    2250 non-null   int64  
 5   MPA                    2250 non-null   int64  
 6   VPA                    2250 non-null   int64  
 7   sleep_duration         2250 non-null   int64  
 8   duration_score         1835 non-null   float64
 9   deep_sleep_in_minutes  1835 non-null   float64
 10  resting_heart_rate     1835 non-null   float64
 11  restlessness           1835 non-null   float64
dtypes: float64(5), int64(5), object(2)
memory usage: 211.1+ KB


In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

Unnamed: 0,steps,sedentary,LPA,MPA,VPA,sleep_duration,duration_score,deep_sleep_in_minutes,resting_heart_rate,restlessness
count,2056.0,2250.0,2250.0,2250.0,2250.0,2250.0,1835.0,1835.0,1835.0,1835.0
mean,11254.648833,790.165333,191.121778,20.538222,42.66,395.514667,38.249046,73.239782,58.582561,0.090708
std,6425.950745,288.829593,113.764116,23.098348,46.26357,198.174315,5.14507,27.724424,7.091328,0.03751
min,0.0,204.0,0.0,0.0,0.0,0.0,3.0,0.0,44.0,0.015385
25%,6687.0,599.25,119.0,2.0,2.0,364.0,35.0,55.0,53.0,0.062411
50%,11196.0,702.0,196.0,14.0,30.0,448.5,39.0,72.0,59.0,0.084524
75%,15791.75,874.75,261.0,30.0,68.0,515.75,42.0,91.0,65.0,0.110441
max,39835.0,1440.0,621.0,209.0,336.0,953.0,47.0,183.0,76.0,0.294766
