In [15]:
import os
import re
import zipfile
import pandas as pd

# Configuration: folder containing the zip files
DATA_PARENT_DIR = '../data_cleaned/'
SOURCE_DIR = 'Timo'
DATA_DIR = os.path.join(DATA_PARENT_DIR, SOURCE_DIR)

# NEW: define where to write the Parquet outputs
# e.g. OUTPUT_DIR = '../data_processed/parquet_outputs'
OUTPUT_DIR = os.path.join(DATA_PARENT_DIR, 'parquet_outputs', SOURCE_DIR)

# Ensure output directory exists
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Pattern to identify and parse filenames
FNAME_REGEX = re.compile(r'^(?P<label>base|(?P<intake>\d+)ml)\s+(?P<date>\d{4}-\d{2}-\d{2})')

# Sensor file names inside each zip
SENSOR_FILES = {
    'Accelerometer.csv': 'accelerometer',
    'Gyroscope.csv': 'gyroscope',
    'Linear Acceleration.csv': 'linear_acceleration'
}

# Collect data per experiment date
daily_data = {}

# Walk through DATA_DIR to include subfolders (sources)
for root, dirs, files in os.walk(DATA_DIR):
    source = SOURCE_DIR
    print(f"Processing source: {source} in {root}")

    for fname in files:
        if not fname.lower().endswith('.zip'):
            continue

        match = FNAME_REGEX.match(fname)
        if not match:
            print(f"Skipping unknown-format file: {fname} in {root}")
            continue

        info = match.groupdict()
        date_key = info['date']
        is_base = 1 if info['label'] == 'base' else 0
        intake = 0 if is_base else int(info['intake'])

        zip_path = os.path.join(root, fname)
        dfs = []

        with zipfile.ZipFile(zip_path, 'r') as z:
            for internal_name, sensor_key in SENSOR_FILES.items():
                try:
                    with z.open(internal_name) as f:
                        df = pd.read_csv(f)
                except KeyError:
                    print(f"  - Warning: {internal_name} not found in {fname}")
                    continue

                # Rename x/y/z columns to include sensor prefix
                rename_map = {
                    col: f"{sensor_key}_{col}"
                    for col in ['x', 'y', 'z']
                    if col in df.columns
                }
                if rename_map:
                    df = df.rename(columns=rename_map)

                dfs.append(df)

        if not dfs:
            print(f"No sensor data for {fname}, skipping.")
            continue

        # Merge all sensor data on 'time'
        merged = dfs[0]
        for df in dfs[1:]:
            merged = pd.merge(merged, df, on='time', how='outer')

        # Add experiment metadata
        merged['experiment_id'] = date_key
        merged['base'] = is_base
        merged['caffeine_ml'] = intake
        merged['source'] = source

        # Reorder columns
        cols = [
            'experiment_id', 'base', 'caffeine_ml', 'source', 'time'
        ] + [
            c for c in merged.columns
            if c not in ['experiment_id', 'base', 'caffeine_ml', 'source', 'time']
        ]
        merged = merged[cols]

        daily_data.setdefault(date_key, []).append(merged)

# Write combined Parquet per day into OUTPUT_DIR
for date_key, frames in daily_data.items():
    combined = pd.concat(frames, ignore_index=True)
    out_fname = f"output-{date_key}.parquet"
    out_path = os.path.join(OUTPUT_DIR, out_fname)
    combined.to_parquet(out_path, index=False, compression='snappy')
    print(f"Wrote {out_path}")

    out_csv = os.path.join(OUTPUT_DIR, f"output-{date_key}.csv")
    combined.to_csv(out_csv, index=False)
    print(f"Wrote {out_csv}")


# 2) Write master Parquet and master CSV combining all days
all_frames = [df for frames in daily_data.values() for df in frames]
if all_frames:
    all_combined = pd.concat(all_frames, ignore_index=True)

    # Master Parquet
    master_parquet = os.path.join(OUTPUT_DIR, 'output-all.parquet')
    all_combined.to_parquet(master_parquet, index=False, compression='snappy')
    print(f"Wrote master Parquet: {master_parquet}")

    # Master CSV
    master_csv = os.path.join(OUTPUT_DIR, 'output-all.csv')
    all_combined.to_csv(master_csv, index=False)
    print(f"Wrote master CSV: {master_csv}")
else:
    print("No data available to write master files.")

Processing source: Timo in ../data_cleaned/Timo
Wrote ../data_cleaned/parquet_outputs/Timo/output-2025-06-11.parquet
Wrote ../data_cleaned/parquet_outputs/Timo/output-2025-06-11.csv
Wrote ../data_cleaned/parquet_outputs/Timo/output-2025-06-10.parquet
Wrote ../data_cleaned/parquet_outputs/Timo/output-2025-06-10.csv
Wrote ../data_cleaned/parquet_outputs/Timo/output-2025-06-08.parquet
Wrote ../data_cleaned/parquet_outputs/Timo/output-2025-06-08.csv
Wrote ../data_cleaned/parquet_outputs/Timo/output-2025-06-14.parquet
Wrote ../data_cleaned/parquet_outputs/Timo/output-2025-06-14.csv
Wrote ../data_cleaned/parquet_outputs/Timo/output-2025-06-12.parquet
Wrote ../data_cleaned/parquet_outputs/Timo/output-2025-06-12.csv
Wrote ../data_cleaned/parquet_outputs/Timo/output-2025-06-07.parquet
Wrote ../data_cleaned/parquet_outputs/Timo/output-2025-06-07.csv
Wrote ../data_cleaned/parquet_outputs/Timo/output-2025-06-09.parquet
Wrote ../data_cleaned/parquet_outputs/Timo/output-2025-06-09.csv
Wrote ../data_