In [8]:
import pandas as pd
import requests
import numpy as np

In [14]:
participant_ids = ["LL", "ZM", "VV", "YL", "YT"] 
activity_ids = ["sitting", "walking", "running", "lying"]
sensor_ids = ["ax", "ay", "az"]
columns_order = ['participant_id', 'activity_id', 'timestamp', 'ax', 'ay', 'az']

In [3]:
def get_file_list():
    """
    Fetch the list of files from the GitHub repository
    """
    url = f"https://api.github.com/repos/COEN498-691-PROJECT/ML_project/contents/data/raw?ref=main"
    response = requests.get(url)
    files = response.json()
    return files

def filter_files(files, participant_ids, activity_ids, sensor_ids):
    """
    Filter the list of files based on the list of participant IDs, activity IDs, and sensor IDs
    """
    filtered_files_url = []
    for file in files:
        file_name = file['name']
        if any(pid.upper() in file_name for pid in participant_ids) and \
           any(aid in file_name for aid in activity_ids) and \
           any(sid.upper() in file_name for sid in sensor_ids):
            filtered_files_url.append( (file['download_url'], file_name) )
    return filtered_files_url

def add_columns(df, file_url):
    """
    Add participant ID and activity ID to the dataframe
    """
    parts = file_url.split('_')
    participant_id = parts[0]
    activity_id = parts[1]
    df['participant_id'] = participant_id
    df['activity_id'] = activity_id
    return df

def load_dataframes(file_urls: tuple):
    """
    Load dataframes from the list of file URLs and add participant and activity IDs columns
    """
    df_list = []
    for file_url, file_name in file_urls:
        df = pd.read_csv(file_url)
        df = add_columns(df, file_name)
        df_list.append(df)
    return df_list

In [None]:
def align_timestamps(df_list):
    """
    Merge axes per participant-activity on a single timeline by using interpolation
    """
    # Step 1: identify all participant-activity combinations
    combos = {(df['participant_id'].iloc[0], df['activity_id'].iloc[0]) for df in df_list}
    
    merged_dfs = []

    for pid, aid in combos:
        # Select AX, AY, AZ for this participant-activity
        dfs = [df for df in df_list if df['participant_id'].iloc[0] == pid and df['activity_id'].iloc[0] == aid]
        
        ax_df = next(df for df in dfs if 'AX' in df.columns)
        ay_df = next(df for df in dfs if 'AY' in df.columns)
        az_df = next(df for df in dfs if 'AZ' in df.columns)

        # Sort by timestamp
        ax_df = ax_df.sort_values('LocalTimestamp').set_index('LocalTimestamp')
        ay_df = ay_df.sort_values('LocalTimestamp').set_index('LocalTimestamp')
        az_df = az_df.sort_values('LocalTimestamp').set_index('LocalTimestamp')

        # Interpolate AY and AZ to AX timestamps
        df_acc = pd.DataFrame(index=ax_df.index)
        df_acc['ax'] = ax_df['AX'].values
        df_acc['ay'] = np.interp(ax_df.index.values, ay_df.index.values, ay_df['AY'].values)
        df_acc['az'] = np.interp(ax_df.index.values, az_df.index.values, az_df['AZ'].values)

        # Add participant_id and activity_id
        df_acc['participant_id'] = pid
        df_acc['activity_id'] = aid

        # Reset index and rename timestamp
        df_acc.reset_index(inplace=True)
        df_acc.rename(columns={'index':'timestamp'}, inplace=True)

        # Only append if timestamp column exists
        if 'timestamp' in df_acc.columns:
            # Safe reorder (skip missing columns)
            df_acc = df_acc[[col for col in columns_order if col in df_acc.columns]]
            merged_dfs.append(df_acc)
        else:
            print(f"Warning: participant {pid}, activity {aid} skipped because 'timestamp' missing")

    # Combine all
    combined_df = pd.concat(merged_dfs, ignore_index=True)

    # Sort by timestamp
    combined_df.sort_values(by=['timestamp'], inplace=True)

    # Save CSV
    combined_df.to_csv('../../data/processed/COEN498-691_HAR_dataset.csv',
                       index=False, date_format='%Y-%m-%d %H:%M:%S.%f')

    return combined_df


In [None]:
def create_dataset_file(df_list):
    """
    Create the final dataset file by combining all dataframes, cleaning, and saving to CSV
    """
    combined_df = pd.concat(df_list, ignore_index=True) # Combine all dataframes
    combined_df.drop(columns=['PacketNumber', 'DataLength', 'TypeTag', 'ProtocolVersion', 'EmotiBitTimestamp', 'DataReliability'], errors='ignore', inplace=True) # Drop unnecessary columns
    combined_df.rename(columns={'LocalTimestamp': 'timestamp',
                                'AX': 'ax',
                                'AY': 'ay',
                                'AZ': 'az',}, inplace=True) # Rename columns for consistency
    combined_df = combined_df[columns_order] # Reorder columns
    combined_df = combined_df.groupby(['timestamp', 'activity_id', 'participant_id']).mean().reset_index() # Handle duplicates by averaging
    combined_df.sort_values(by=['timestamp'], inplace=True) # Sort by timestamp (ascending)
    combined_df.to_csv('../../data/processed/COEN498-691_HAR_dataset.csv', index=False, date_format='%Y-%m-%d %H:%M:%S.%f') # Save to CSV
    return combined_df

In [5]:
file_list = get_file_list()
filtered_files_url = filter_files(file_list, participant_ids, activity_ids, sensor_ids)

In [6]:
df_list = load_dataframes(filtered_files_url)
print(df_list[0].head(10))

   LocalTimestamp  EmotiBitTimestamp  PacketNumber  DataLength TypeTag  \
0    1.760382e+09          1325026.0         21641           1      AX   
1    1.760382e+09          1325066.0         21657           2      AX   
2    1.760382e+09          1325106.0         21657           2      AX   
3    1.760382e+09          1325146.0         21675           3      AX   
4    1.760382e+09          1325186.0         21675           3      AX   
5    1.760382e+09          1325226.0         21675           3      AX   
6    1.760382e+09          1325266.0         21691           2      AX   
7    1.760382e+09          1325306.0         21691           2      AX   
8    1.760382e+09          1325346.0         21708           3      AX   
9    1.760382e+09          1325386.0         21708           3      AX   

   ProtocolVersion  DataReliability     AX participant_id activity_id  
0                1              100  0.596             LL       lying  
1                1              100  0.71

In [16]:
df_final = align_timestamps(df_list)
print(df_final.head(10))

KeyError: 'timestamp'