In [None]:
import os
import re
import zipfile
import pandas as pd

# Configuration: folder containing the zip files
DATA_PARENT_DIR = '../data_cleaned/'
SOURCE_DIR = 'Timo'
DATA_DIR = os.path.join(DATA_PARENT_DIR, SOURCE_DIR)

# Pattern to identify and parse filenames
# Examples:
#   base 2025-06-14 12-25-22_trimmed.zip
#   360ml 2025-06-14 12-57-02_trimmed.zip
FNAME_REGEX = re.compile(r'^(?P<label>base|(?P<intake>\d+)ml)\s+(?P<date>\d{4}-\d{2}-\d{2})')

# Sensor file names inside each zip, mapped to the prefix used for their
# x/y/z columns in the merged output
SENSOR_FILES = {
    'Accelerometer.csv': 'accelerometer',
    'Gyroscope.csv': 'gyroscope',
    'Linear Acceleration.csv': 'linear_acceleration'
}

# Columns placed first in every merged frame, in this order
_LEADING_COLUMNS = ['experiment_id', 'base', 'caffeine_ml', 'source', 'time']


def _read_sensor_frames(zip_path, fname):
    """Read every known sensor CSV from one zip archive.

    Args:
        zip_path: Path of the zip archive to open.
        fname: Archive file name, used only in warning messages.

    Returns:
        A list of DataFrames, one per sensor file that was present and had a
        'time' column, in SENSOR_FILES order. The x/y/z columns of each frame
        are prefixed with the sensor key.

    Raises:
        zipfile.BadZipFile: If the archive cannot be read as a zip file.
    """
    frames = []
    with zipfile.ZipFile(zip_path, 'r') as archive:
        for internal_name, sensor_key in SENSOR_FILES.items():
            try:
                with archive.open(internal_name) as handle:
                    df = pd.read_csv(handle)
            except KeyError:
                # ZipFile.open raises KeyError for a member that does not exist
                print(f"  - Warning: {internal_name} not found in {fname}")
                continue

            # The frames are merged on 'time'; one without it cannot be joined
            if 'time' not in df.columns:
                print(f"  - Warning: {internal_name} in {fname} has no 'time' column")
                continue

            # Rename x/y/z columns to include sensor prefix (labels that are
            # absent from the frame are ignored by rename)
            df = df.rename(columns={axis: f"{sensor_key}_{axis}"
                                    for axis in ('x', 'y', 'z')})
            frames.append(df)
    return frames


def collect_daily_data(data_dir, source):
    """Collect merged sensor data from every experiment zip under data_dir.

    Subfolders are included. Files and folders are visited in sorted order so
    that the result does not depend on filesystem listing order.

    Args:
        data_dir: Directory to walk for '*.zip' files.
        source: Value stored in the 'source' column of every frame.

    Returns:
        A dict mapping the experiment date string parsed from the file name
        to the list of merged DataFrames recorded on that date.
    """
    daily = {}

    if not os.path.isdir(data_dir):
        # os.walk silently yields nothing for a missing directory
        print(f"Warning: data directory not found: {data_dir}")
        return daily

    for root, dirs, files in os.walk(data_dir):
        dirs.sort()  # in-place edit steers os.walk's traversal order

        print(f"Processing source: {source} in {root}")

        for fname in sorted(files):
            if not fname.lower().endswith('.zip'):
                continue

            match = FNAME_REGEX.match(fname)
            if not match:
                print(f"Skipping unknown-format file: {fname} in {root}")
                continue

            info = match.groupdict()
            date_key = info['date']           # experiment_id
            is_base = 1 if info['label'] == 'base' else 0
            intake = 0 if is_base else int(info['intake'])

            try:
                frames = _read_sensor_frames(os.path.join(root, fname), fname)
            except zipfile.BadZipFile:
                # One corrupt archive should not abort the whole batch
                print(f"Skipping unreadable zip file: {fname} in {root}")
                continue

            if not frames:
                print(f"No sensor data for {fname}, skipping.")
                continue

            # Merge all sensor data on 'time'.
            # NOTE: non-key columns shared by several sensor files (other than
            # x/y/z) receive pandas' default '_x'/'_y' merge suffixes.
            merged = frames[0]
            for df in frames[1:]:
                merged = pd.merge(merged, df, on='time', how='outer')

            # Add experiment metadata
            merged['experiment_id'] = date_key
            merged['base'] = is_base
            merged['caffeine_ml'] = intake
            merged['source'] = source

            # Reorder columns: experiment_id, base, caffeine_ml, source, time, sensors...
            sensor_cols = [c for c in merged.columns if c not in _LEADING_COLUMNS]
            merged = merged[_LEADING_COLUMNS + sensor_cols]

            # Store for daily aggregation
            daily.setdefault(date_key, []).append(merged)

    return daily


def write_daily_csvs(daily, out_dir):
    """Write one combined CSV per experiment date.

    Args:
        daily: Mapping of date string to list of DataFrames, as returned by
            collect_daily_data.
        out_dir: Directory that receives the 'output-<date>.csv' files.

    Returns:
        The list of written file paths, ordered by date string.
    """
    written = []
    for date_key in sorted(daily):
        combined = pd.concat(daily[date_key], ignore_index=True)
        out_path = os.path.join(out_dir, f"output-{date_key}.csv")
        combined.to_csv(out_path, index=False)
        print(f"Wrote {out_path}")
        written.append(out_path)
    return written


# Collect data per experiment date, then write combined CSV per day
daily_data = collect_daily_data(DATA_DIR, SOURCE_DIR)
write_daily_csvs(daily_data, DATA_DIR)



Processing source: Timo in ../data_cleaned/Timo
Wrote ../data_cleaned/Timo/output-2025-06-11.csv
Wrote ../data_cleaned/Timo/output-2025-06-10.csv
Wrote ../data_cleaned/Timo/output-2025-06-08.csv
Wrote ../data_cleaned/Timo/output-2025-06-14.csv
Wrote ../data_cleaned/Timo/output-2025-06-12.csv
Wrote ../data_cleaned/Timo/output-2025-06-07.csv
Wrote ../data_cleaned/Timo/output-2025-06-09.csv
Wrote ../data_cleaned/Timo/output-2025-06-13.csv
Wrote ../data_cleaned/Timo/output-2025-06-06.csv
