In [1]:
import pandas as pd
import numpy as np
import os
import re
from pathlib import Path
from datetime import datetime, timedelta

In [2]:
activpal_folder = r"C:\Users\Panda\OneDrive\Desktop\WAVES Data\activPal Data"
pals_folder = r"C:\Users\Panda\OneDrive\Desktop\WAVES Data\Pals Extracted Time-Series Data"

In [3]:
# pipeline to process all files (returns 30 mismatched ID's due to time format)

# extract ID from filename (5 digits after "PALS")
def extract_id(filename):
    match = re.search(r'PALS(\d{5})', filename, re.IGNORECASE)
    return match.group(1) if match else None

# process a single file pair
def process_pair(pals_file, activpal_file, id_value):
    # PALS file
    pals_df = pd.read_csv(pals_file)
    pals_df = pals_df[["time", "bicycling"]]
    pals_df = pals_df.rename(columns={"bicycling": "cycle_pals"})
    pals_df["time"] = pals_df["time"].str.replace(".500", "", regex=False)
    pals_df["cycle_pals"] = pals_df["cycle_pals"] * 30
    
    # ActivPal file
    activpal_df = pd.read_csv(activpal_file, sep=";", skiprows=1)
    activpal_df = activpal_df[["Time(approx)", "Cycling Time (s)"]]
    activpal_df = activpal_df.rename(columns={
        "Time(approx)": "time",
        "Cycling Time (s)": "cycle_activPal"
    })
    
    # Merge
    merged_df = pd.merge(pals_df, activpal_df, on="time", how="left")
    merged_df.dropna(inplace=True)
    
    # Add ID column
    merged_df["ID"] = id_value
    
    return merged_df

# Get all files and organize by ID
pals_files = {extract_id(f): os.path.join(pals_folder, f) 
              for f in os.listdir(pals_folder) if f.endswith('.csv')}
activpal_files = {extract_id(f): os.path.join(activpal_folder, f) 
                  for f in os.listdir(activpal_folder) if f.endswith('.csv')}

# Find matching IDs
matching_ids = set(pals_files.keys()) & set(activpal_files.keys())
print(f"Found {len(matching_ids)} matching file pairs")

# Process all pairs
all_merged = []
for id_val in sorted(matching_ids):
    try:
        merged_df = process_pair(pals_files[id_val], activpal_files[id_val], id_val)
        all_merged.append(merged_df)
        print(f"Processed ID {id_val}: {len(merged_df)} rows")
    except Exception as e:
        print(f"Error processing ID {id_val}: {e}")

# Combine all DataFrames
final_df = pd.concat(all_merged, ignore_index=True)
print(f"\nTotal rows in final merged DataFrame: {len(final_df)}")
print(f"Columns: {list(final_df.columns)}")

# Export to CSV
final_df.to_csv("PALS_Cycling_Merged.csv", index=False)
print(f"Exported to PALS_Cycling_Merged.csv")

final_df.head()

Found 78 matching file pairs
Processed ID 10001: 0 rows
Processed ID 10002: 0 rows
Processed ID 10003: 28561 rows
Processed ID 10005: 23698 rows
Processed ID 10007: 0 rows
Processed ID 10010: 34181 rows
Processed ID 10011: 7384 rows
Processed ID 10014: 0 rows
Processed ID 10017: 0 rows
Processed ID 10020: 29242 rows
Processed ID 10022: 32279 rows
Processed ID 10024: 0 rows
Processed ID 10027: 0 rows
Processed ID 10033: 31099 rows
Processed ID 10036: 20241 rows
Processed ID 10037: 0 rows
Processed ID 10046: 0 rows
Processed ID 10050: 29229 rows
Processed ID 10051: 32144 rows
Processed ID 10054: 21687 rows
Processed ID 10059: 32728 rows
Processed ID 10061: 28777 rows
Processed ID 10071: 28940 rows
Processed ID 10072: 30999 rows
Processed ID 10073: 32354 rows
Processed ID 10074: 5121 rows
Processed ID 10079: 0 rows
Processed ID 10083: 0 rows
Processed ID 10084: 31530 rows
Processed ID 10085: 15235 rows
Processed ID 10086: 1 rows
Processed ID 10087: 33298 rows
Processed ID 10088: 31760 row

Unnamed: 0,time,cycle_pals,cycle_activPal,ID
0,2022-03-10 10:00:00,0.0,0.0,10003
1,2022-03-10 10:00:30,0.0,0.0,10003
2,2022-03-10 10:01:00,0.0,0.0,10003
3,2022-03-10 10:01:30,0.0,0.0,10003
4,2022-03-10 10:02:00,0.0,0.0,10003


In [4]:
# summary for "pipeline to process all files (returns 30 mismatched ID's due to time format)"
def get_event_groups(series, min_duration=4):
    """Return dict mapping event_id to set of row indices"""
    is_biking = (series > 0).astype(int)
    groups = (is_biking != is_biking.shift()).cumsum()
    event_mask = is_biking == 1
    event_groups = groups[event_mask]
    event_lengths = event_groups.value_counts()
    valid_events = event_lengths[event_lengths >= min_duration].index
    return {eid: set(event_groups[event_groups == eid].index) for eid in valid_events}

# each ID
summary_results = []

for id_val in sorted(final_df["ID"].unique()):
    id_df = final_df[final_df["ID"] == id_val].copy()
    
    # events for each column
    pals_events = get_event_groups(id_df["cycle_pals"])
    ap_events = get_event_groups(id_df["cycle_activPal"])
    
    # overlapping events
    pals_matched = set()
    ap_matched = set()
    for pals_id, pals_indices in pals_events.items():
        for ap_id, ap_indices in ap_events.items():
            if pals_indices & ap_indices:  # any overlap
                pals_matched.add(pals_id)
                ap_matched.add(ap_id)
    
    both_count = len(pals_matched)
    only_pals_count = len(pals_events) - both_count
    only_ap_count = len(ap_events) - both_count
    
    # total cycling time
    total_pals_time = id_df["cycle_pals"].sum().round()
    total_ap_time = id_df["cycle_activPal"].sum().round()
    
    summary_results.append({
        "ID": id_val,
        "Total events AG (cycle_pals)": len(pals_events),
        "Total events AP (cycle_activPal)": len(ap_events),
        "Events both identified": both_count,
        "Events AP identified but not AG": only_ap_count,
        "Events AG identified but not AP": only_pals_count,
        "Total cycling time AG (s)": total_pals_time,
        "Total cycling time AP (s)": total_ap_time
    })

# summary DataFrame
summary_df = pd.DataFrame(summary_results)

# summary display
print("Summary per ID:\n")
print(summary_df.to_string(index=False))

# Export to CSV
summary_df.to_csv("PALS_Cycling_Summary.csv", index=False)
print(f"\nExported to PALS_Cycling_Summary.csv")


Summary per ID:

   ID  Total events AG (cycle_pals)  Total events AP (cycle_activPal)  Events both identified  Events AP identified but not AG  Events AG identified but not AP  Total cycling time AG (s)  Total cycling time AP (s)
10003                             0                                 0                       0                                0                                0                        0.0                        0.0
10005                             2                                 0                       0                                0                                2                     1200.0                        0.0
10010                             3                                 0                       0                                0                                3                      840.0                        0.0
10011                             0                                 1                       0                                1 

In [5]:
# Pipeline with time rounding to nearest 30 seconds + Summary

# extract ID from filename (5 digits after "PALS")
def extract_id(filename):
    match = re.search(r'PALS(\d{5})', filename, re.IGNORECASE)
    return match.group(1) if match else None

# round time to nearest 30 seconds
def round_time_to_30sec(time_str):
    """Round time string to nearest 30-second mark. Handles both 'HH:MM:SS' and 'YYYY-MM-DD HH:MM:SS' formats."""
    try:
        # Try parsing as datetime with date first (PALS format)
        try:
            time_obj = datetime.strptime(time_str, "%Y-%m-%d %H:%M:%S")
            has_date = True
        except:
            # Try parsing as time only (ActivPal format)
            time_obj = datetime.strptime(time_str, "%H:%M:%S")
            has_date = False
        
        seconds = time_obj.second
        # Round to nearest 30 seconds
        if seconds < 15:
            rounded_seconds = 0
        elif seconds < 45:
            rounded_seconds = 30
        else:
            rounded_seconds = 0
            time_obj += timedelta(minutes=1)  # Round up to next minute
        
        rounded_time = time_obj.replace(second=rounded_seconds)
        
        if has_date:
            return rounded_time.strftime("%Y-%m-%d %H:%M:%S")
        else:
            return rounded_time.strftime("%H:%M:%S")
    except:
        # If parsing fails, return original
        return time_str

# process a single file pair with time rounding
def process_pair_rounded(pals_file, activpal_file, id_value):
    # PALS file
    pals_df = pd.read_csv(pals_file)
    pals_df = pals_df[["time", "bicycling"]]
    pals_df = pals_df.rename(columns={"bicycling": "cycle_pals"})
    pals_df["time"] = pals_df["time"].str.replace(".500", "", regex=False)
    # Round time to nearest 30 seconds
    pals_df["time"] = pals_df["time"].apply(round_time_to_30sec)
    pals_df["cycle_pals"] = pals_df["cycle_pals"] * 30
    
    # ActivPal file
    activpal_df = pd.read_csv(activpal_file, sep=";", skiprows=1)
    activpal_df = activpal_df[["Time(approx)", "Cycling Time (s)"]]
    activpal_df = activpal_df.rename(columns={
        "Time(approx)": "time",
        "Cycling Time (s)": "cycle_activPal"
    })
    # Round time to nearest 30 seconds
    activpal_df["time"] = activpal_df["time"].apply(round_time_to_30sec)
    
    # Merge
    merged_df = pd.merge(pals_df, activpal_df, on="time", how="left")
    merged_df.dropna(inplace=True)
    
    # ID column
    merged_df["ID"] = id_value
    
    return merged_df

# all files and organize by ID
pals_files = {extract_id(f): os.path.join(pals_folder, f) 
              for f in os.listdir(pals_folder) if f.endswith('.csv')}
activpal_files = {extract_id(f): os.path.join(activpal_folder, f) 
                  for f in os.listdir(activpal_folder) if f.endswith('.csv')}

# matching IDs
matching_ids = set(pals_files.keys()) & set(activpal_files.keys())
print(f"Found {len(matching_ids)} matching file pairs")

# all pairs
all_merged_assumed = []
for id_val in sorted(matching_ids):
    try:
        merged_df = process_pair_rounded(pals_files[id_val], activpal_files[id_val], id_val)
        all_merged_assumed.append(merged_df)
        print(f"Processed ID {id_val}: {len(merged_df)} rows")
    except Exception as e:
        print(f"Error processing ID {id_val}: {e}")

# Combine all DataFrames
final_df_assumed = pd.concat(all_merged_assumed, ignore_index=True)
print(f"\nTotal rows in final merged DataFrame: {len(final_df_assumed)}")
print(f"Columns: {list(final_df_assumed.columns)}")

# Export to CSV
final_df_assumed.to_csv("ASSUMED_PALS_Cycling_Merged.csv", index=False)
print(f"Exported to ASSUMED_PALS_Cycling_Merged.csv")

final_df_assumed.head()

# Summary per ID: Events and Total Cycling Time (for assumed/rounded data)
def get_event_groups(series, min_duration=4):
    """Return dict mapping event_id to set of row indices"""
    is_biking = (series > 0).astype(int)
    groups = (is_biking != is_biking.shift()).cumsum()
    event_mask = is_biking == 1
    event_groups = groups[event_mask]
    event_lengths = event_groups.value_counts()
    valid_events = event_lengths[event_lengths >= min_duration].index
    return {eid: set(event_groups[event_groups == eid].index) for eid in valid_events}

# Process each ID
summary_results_assumed = []

for id_val in sorted(final_df_assumed["ID"].unique()):
    id_df = final_df_assumed[final_df_assumed["ID"] == id_val].copy()
    
    # Get events for each column
    pals_events = get_event_groups(id_df["cycle_pals"])
    ap_events = get_event_groups(id_df["cycle_activPal"])
    
    # Find overlapping events
    pals_matched = set()
    ap_matched = set()
    for pals_id, pals_indices in pals_events.items():
        for ap_id, ap_indices in ap_events.items():
            if pals_indices & ap_indices:  # any overlap
                pals_matched.add(pals_id)
                ap_matched.add(ap_id)
    
    both_count = len(pals_matched)
    only_pals_count = len(pals_events) - both_count
    only_ap_count = len(ap_events) - both_count
    
    # Calculate total cycling time
    total_pals_time = id_df["cycle_pals"].sum().round()
    total_ap_time = id_df["cycle_activPal"].sum().round()
    
    summary_results_assumed.append({
        "ID": id_val,
        "Total events AG (cycle_pals)": len(pals_events),
        "Total events AP (cycle_activPal)": len(ap_events),
        "Events both identified": both_count,
        "Events AP identified but not AG": only_ap_count,
        "Events AG identified but not AP": only_pals_count,
        "Total cycling time AG (s)": total_pals_time,
        "Total cycling time AP (s)": total_ap_time
    })

# summary DataFrame
summary_df_assumed = pd.DataFrame(summary_results_assumed)

# Count IDs with 0 rows vs > 0 rows
row_counts_per_id = final_df_assumed.groupby("ID").size()
ids_with_rows = len(row_counts_per_id[row_counts_per_id > 0])
# IDs with 0 rows are those in matching_ids but not in final_df_assumed
ids_in_final = set(final_df_assumed["ID"].unique())
ids_with_zero_rows = len(matching_ids) - len(ids_in_final)

# summary display
print("\nSummary per ID (with time rounding):\n")
print(summary_df_assumed.to_string(index=False))
print(f"\nTotal IDs with 0 rows: {ids_with_zero_rows}")
print(f"Total IDs with > 0 rows: {ids_with_rows}")

# Export to CSV
summary_df_assumed.to_csv("ASSUMED_PALS_Cycling_Summary.csv", index=False)
print(f"\nExported to ASSUMED_PALS_Cycling_Summary.csv")


Found 78 matching file pairs
Processed ID 10001: 30381 rows
Processed ID 10002: 24745 rows
Processed ID 10003: 28562 rows
Processed ID 10005: 23699 rows
Processed ID 10007: 30469 rows
Processed ID 10010: 34181 rows
Processed ID 10011: 7384 rows
Processed ID 10014: 5106 rows
Processed ID 10017: 34795 rows
Processed ID 10020: 29243 rows
Processed ID 10022: 32279 rows
Processed ID 10024: 32290 rows
Processed ID 10027: 31630 rows
Processed ID 10033: 31099 rows
Processed ID 10036: 20242 rows
Processed ID 10037: 31363 rows
Processed ID 10046: 13565 rows
Processed ID 10050: 29229 rows
Processed ID 10051: 32144 rows
Processed ID 10054: 21687 rows
Processed ID 10059: 32728 rows
Processed ID 10061: 28777 rows
Processed ID 10071: 28940 rows
Processed ID 10072: 30999 rows
Processed ID 10073: 32354 rows
Processed ID 10074: 5121 rows
Processed ID 10079: 28599 rows
Processed ID 10083: 30121 rows
Processed ID 10084: 31530 rows
Processed ID 10085: 15235 rows
Processed ID 10086: 34774 rows
Processed ID 

In [6]:
# check for issues with file pairs

activpal_folder = r"C:\Users\Panda\OneDrive\Desktop\WAVES Data\activPal Data"
pals_folder = r"C:\Users\Panda\OneDrive\Desktop\WAVES Data\Pals Extracted Time-Series Data"

def extract_id(filename):
    match = re.search(r'PALS(\d{5})', filename, re.IGNORECASE)
    return match.group(1) if match else None

pals_files = {extract_id(f): os.path.join(pals_folder, f) 
              for f in os.listdir(pals_folder) if f.endswith('.csv')}
activpal_files = {extract_id(f): os.path.join(activpal_folder, f) 
                  for f in os.listdir(activpal_folder) if f.endswith('.csv')}
matching_ids = set(pals_files.keys()) & set(activpal_files.keys())

# checks
zero_row_ids = []
time_type_mismatch_ids = []
time_format_issue_ids = []
empty_file_ids = []

for id_val in sorted(matching_ids):
    try:
        # PALS file
        pals_df = pd.read_csv(pals_files[id_val])
        pals_empty = len(pals_df) == 0
        pals_has_time = "time" in pals_df.columns
        pals_has_bicycling = "bicycling" in pals_df.columns
        
        # ActivPal file
        activpal_df = pd.read_csv(activpal_files[id_val], sep=";", skiprows=1)
        ap_empty = len(activpal_df) == 0
        ap_has_time = "Time(approx)" in activpal_df.columns
        ap_has_cycling = "Cycling Time (s)" in activpal_df.columns
        
        # Check for empty files
        if pals_empty or ap_empty:
            empty_file_ids.append(id_val)
            if pals_empty:
                print(f"ID {id_val}: PALS file is empty")
            if ap_empty:
                print(f"ID {id_val}: ActivPal file is empty")
            continue
        
        # Check for missing columns
        if not (pals_has_time and pals_has_bicycling):
            time_format_issue_ids.append(id_val)
            print(f"ID {id_val}: PALS file missing expected columns. Has 'time': {pals_has_time}, Has 'bicycling': {pals_has_bicycling}")
            continue
            
        if not (ap_has_time and ap_has_cycling):
            time_format_issue_ids.append(id_val)
            print(f"ID {id_val}: ActivPal file missing expected columns. Has 'Time(approx)': {ap_has_time}, Has 'Cycling Time (s)': {ap_has_cycling}")
            continue
        
        # Process files to check time types
        pals_df_proc = pals_df[["time", "bicycling"]].copy()
        pals_df_proc["time"] = pals_df_proc["time"].str.replace(".500", "", regex=False)
        
        activpal_df_proc = activpal_df[["Time(approx)", "Cycling Time (s)"]].copy()
        activpal_df_proc = activpal_df_proc.rename(columns={
            "Time(approx)": "time",
            "Cycling Time (s)": "cycle_activPal"
        })
        
        # Check time column dtypes
        pals_time_dtype = str(pals_df_proc["time"].dtype)
        ap_time_dtype = str(activpal_df_proc["time"].dtype)
        
        # Check for time type mismatch
        if pals_time_dtype != ap_time_dtype:
            time_type_mismatch_ids.append(id_val)
            print(f"ID {id_val}: Time type mismatch - PALS: {pals_time_dtype}, ActivPal: {ap_time_dtype}")
        
        # Check if merge results in 0 rows
        merged_df = pd.merge(pals_df_proc, activpal_df_proc, on="time", how="left")
        merged_df.dropna(inplace=True)
        
        if len(merged_df) == 0:
            zero_row_ids.append(id_val)
            # Show sample time values to help debug
            print(f"ID {id_val}: Merge resulted in 0 rows")
            print(f"  PALS time sample (first 3): {pals_df_proc['time'].head(3).tolist()}")
            print(f"  ActivPal time sample (first 3): {activpal_df_proc['time'].head(3).tolist()}")
            print(f"  PALS unique times: {pals_df_proc['time'].nunique()}, ActivPal unique times: {activpal_df_proc['time'].nunique()}")
            # Check if there's any overlap
            pals_times = set(pals_df_proc['time'].astype(str))
            ap_times = set(activpal_df_proc['time'].astype(str))
            overlap = pals_times & ap_times
            print(f"  Overlapping time values: {len(overlap)}")
            
    except Exception as e:
        print(f"ID {id_val}: Error during debugging - {e}")

# output
print("\n" + "="*60)
print("SUMMARY:")
print(f"IDs with 0 rows after merge: {len(zero_row_ids)}")
if zero_row_ids:
    print(f"  IDs: {zero_row_ids}")
    
print(f"\nIDs with time type mismatch: {len(time_type_mismatch_ids)}")
if time_type_mismatch_ids:
    print(f"  IDs: {time_type_mismatch_ids}")
    
print(f"\nIDs with time format/column issues: {len(time_format_issue_ids)}")
if time_format_issue_ids:
    print(f"  IDs: {time_format_issue_ids}")
    
print(f"\nIDs with empty files: {len(empty_file_ids)}")
if empty_file_ids:
    print(f"  IDs: {empty_file_ids}")


ID 10001: Merge resulted in 0 rows
  PALS time sample (first 3): ['2022-02-25 11:23:22', '2022-02-25 11:23:52', '2022-02-25 11:24:22']
  ActivPal time sample (first 3): ['2022-02-25 11:26:42', '2022-02-25 11:27:00', '2022-02-25 11:27:30']
  PALS unique times: 31709, ActivPal unique times: 31701
  Overlapping time values: 0
ID 10002: Merge resulted in 0 rows
  PALS time sample (first 3): ['2022-03-01 10:49:02', '2022-03-01 10:49:32', '2022-03-01 10:50:02']
  ActivPal time sample (first 3): ['2022-03-01 10:50:49', '2022-03-01 10:51:00', '2022-03-01 10:51:30']
  PALS unique times: 29985, ActivPal unique times: 39984
  Overlapping time values: 0
ID 10007: Merge resulted in 0 rows
  PALS time sample (first 3): ['2022-04-19 13:13:32', '2022-04-19 13:14:02', '2022-04-19 13:14:32']
  ActivPal time sample (first 3): ['2022-04-20 07:59:52', '2022-04-20 08:00:00', '2022-04-20 08:00:30']
  PALS unique times: 40320, ActivPal unique times: 40338
  Overlapping time values: 0
ID 10014: Merge resulted 