In [1]:
import pandas as pd
import requests

In [2]:
participant_ids = ["LL", "ZM", "VV", "YL", "YT"] 
activity_ids = ["sitting", "walking", "running", "lying"]
sensor_ids = ["ax", "ay", "az", "gx", "gy", "gz","mx", "my", "mz"]
columns_order = ['participant_id', 'activity_id', 'timestamp', 'ax', 'ay', 'az', 'gx', 'gy', 'gz', 'mx', 'my', 'mz']

In [None]:
def get_file_list():
    """
    Fetch the list of files from the GitHub repository
    """
    url = f"https://api.github.com/repos/COEN498-691-PROJECT/ML_project/contents/data/raw?ref=main"
    response = requests.get(url)
    files = response.json()
    return files

def filter_files(files, participant_ids, activity_ids, sensor_ids):
    """
    Filter the list of files based on the list of participant IDs, activity IDs, and sensor IDs
    """
    filtered_files_url = []
    for file in files:
        file_name = file['name']
        if any(pid.upper() in file_name for pid in participant_ids) and \
           any(aid in file_name for aid in activity_ids) and \
           any(sid.upper() in file_name for sid in sensor_ids):
            filtered_files_url.append( (file['download_url'], file_name) )
    return filtered_files_url

def add_columns(df, file_url):
    """
    Add participant ID and activity ID to the dataframe
    """
    parts = file_url.split('_')
    participant_id = parts[0]
    activity_id = parts[1]
    df['participant_id'] = participant_id
    df['activity_id'] = activity_id
    return df

def load_dataframes(file_urls: tuple):
    """
    Load dataframes from the list of file URLs and add participant and activity IDs columns
    """
    df_list = []
    for file_url, file_name in file_urls:
        df = pd.read_csv(file_url)
        df = add_columns(df, file_name)
        df_list.append(df)
    return df_list

In [None]:
def create_dataset_file(df_list):
    """
    Create the final dataset file by combining all dataframes, cleaning, and saving to CSV
    """
    combined_df = pd.concat(df_list, ignore_index=True) # Combine all dataframes
    combined_df.drop(columns=['PacketNumber', 'DataLength', 'TypeTag', 'ProtocolVersion', 'EmotiBitTimestamp', 'DataReliability'], errors='ignore', inplace=True) # Drop unnecessary columns
    combined_df.rename(columns={'LocalTimestamp': 'timestamp',
                                'AX': 'ax',
                                'AY': 'ay',
                                'AZ': 'az',
                                'GX': 'gx',
                                'GY': 'gy',
                                'GZ': 'gz',
                                'MX': 'mx',
                                'MY': 'my',
                                'MZ': 'mz'}, inplace=True) # Rename columns for consistency
    combined_df = combined_df[columns_order] # Reorder columns
    combined_df = combined_df.groupby(['timestamp', 'activity_id', 'participant_id']).mean().reset_index() # Handle duplicates by averaging
    combined_df.sort_values(by=['timestamp'], inplace=True) # Sort by timestamp (ascending)
    combined_df.to_csv('../../data/processed/COEN498-691_HAR_dataset.csv', index=False, date_format='%Y-%m-%d %H:%M:%S.%f') # Save to CSV
    return combined_df

In [6]:
file_list = get_file_list()
filtered_files_url = filter_files(file_list, participant_ids, activity_ids, sensor_ids)

In [7]:
df_list = load_dataframes(filtered_files_url)
print(df_list[0].head(10))

   LocalTimestamp  EmotiBitTimestamp  PacketNumber  DataLength TypeTag  \
0    1.760382e+09          1325026.0         21641           1      AX   
1    1.760382e+09          1325066.0         21657           2      AX   
2    1.760382e+09          1325106.0         21657           2      AX   
3    1.760382e+09          1325146.0         21675           3      AX   
4    1.760382e+09          1325186.0         21675           3      AX   
5    1.760382e+09          1325226.0         21675           3      AX   
6    1.760382e+09          1325266.0         21691           2      AX   
7    1.760382e+09          1325306.0         21691           2      AX   
8    1.760382e+09          1325346.0         21708           3      AX   
9    1.760382e+09          1325386.0         21708           3      AX   

   ProtocolVersion  DataReliability     AX participant_id activity_id  
0                1              100  0.596             LL       lying  
1                1              100  0.71

In [9]:
df_final = create_dataset_file(df_list)
print(df_final.head(10))

      timestamp activity_id participant_id     ax     ay        az        gx  \
0  1.760380e+09     sitting             ZM -0.001  0.849  0.484333 -0.569667   
1  1.760380e+09     sitting             ZM -0.008  0.850  0.483000 -0.610000   
2  1.760380e+09     sitting             ZM  0.006  0.852  0.480000 -0.519000   
3  1.760380e+09     sitting             ZM  0.001  0.851  0.477000 -0.610000   
4  1.760380e+09     sitting             ZM -0.002  0.853  0.481000 -0.671000   
5  1.760380e+09     sitting             ZM  0.008  0.841  0.479000 -0.427000   
6  1.760380e+09     sitting             ZM -0.005  0.851  0.484000 -0.610000   
7  1.760380e+09     sitting             ZM -0.002  0.857  0.481000 -0.366000   
8  1.760380e+09     sitting             ZM  0.002  0.847  0.478000 -0.519000   
9  1.760380e+09     sitting             ZM -0.004  0.853  0.467000 -0.977000   

      gy        gz    mx         my    mz  
0 -0.305  0.305333  91.0  14.666667 -27.0  
1 -0.214  0.214000  90.0  15.00