# Compile Training Chunks into a Single File
Read all `chunk_*.parquet` files from `../training_chunks/`, concatenate them into a single DataFrame, and save the result as one consolidated Parquet file, `../training_set.parquet`.

In [1]:
import pandas as pd
from pathlib import Path

# Consolidate the per-chunk Parquet files into a single training-set file.
# Both paths are relative to the notebook's working directory.
CHUNKS_DIR = Path("../training_chunks")
OUTPUT_FILE = Path("../training_set.parquet")

# Discover the chunk files. sorted() orders paths lexicographically, so
# "chunk_10" sorts before "chunk_2" unless the numbers are zero-padded.
# NOTE(review): confirm the chunk naming scheme if row order matters downstream.
chunk_files = sorted(CHUNKS_DIR.glob("chunk_*.parquet"))
print(f"Found {len(chunk_files)} chunk files")

# Fail early with an actionable message: pd.concat([]) would otherwise raise
# an opaque "No objects to concatenate" ValueError.
if not chunk_files:
    raise FileNotFoundError(
        f"No files matching 'chunk_*.parquet' found in {CHUNKS_DIR.resolve()}"
    )

# Read every chunk and stack them row-wise; ignore_index=True discards the
# per-chunk indexes in favour of one fresh 0..n-1 index.
df = pd.concat([pd.read_parquet(f) for f in chunk_files], ignore_index=True)
print(f"Combined DataFrame shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
print(f"\nLabel distribution:\n{df['label'].value_counts()}")

# Save as a single Parquet file (index=False: the index is not written out).
df.to_parquet(OUTPUT_FILE, index=False)
print(f"\nSaved consolidated training set to: {OUTPUT_FILE.resolve()}")
# st_size is in bytes; dividing by 1024**2 reports the size in MiB.
print(f"File size: {OUTPUT_FILE.stat().st_size / (1024**2):.1f} MB")

Found 31 chunk files
Combined DataFrame shape: (4405108, 76)
Columns: ['ch1_LS', 'ch1_MFL', 'ch1_MSR', 'ch1_WAMP', 'ch1_ZC', 'ch1_RMS', 'ch1_IAV', 'ch1_DASDV', 'ch1_VAR', 'ch2_LS', 'ch2_MFL', 'ch2_MSR', 'ch2_WAMP', 'ch2_ZC', 'ch2_RMS', 'ch2_IAV', 'ch2_DASDV', 'ch2_VAR', 'ch3_LS', 'ch3_MFL', 'ch3_MSR', 'ch3_WAMP', 'ch3_ZC', 'ch3_RMS', 'ch3_IAV', 'ch3_DASDV', 'ch3_VAR', 'ch4_LS', 'ch4_MFL', 'ch4_MSR', 'ch4_WAMP', 'ch4_ZC', 'ch4_RMS', 'ch4_IAV', 'ch4_DASDV', 'ch4_VAR', 'ch5_LS', 'ch5_MFL', 'ch5_MSR', 'ch5_WAMP', 'ch5_ZC', 'ch5_RMS', 'ch5_IAV', 'ch5_DASDV', 'ch5_VAR', 'ch6_LS', 'ch6_MFL', 'ch6_MSR', 'ch6_WAMP', 'ch6_ZC', 'ch6_RMS', 'ch6_IAV', 'ch6_DASDV', 'ch6_VAR', 'ch7_LS', 'ch7_MFL', 'ch7_MSR', 'ch7_WAMP', 'ch7_ZC', 'ch7_RMS', 'ch7_IAV', 'ch7_DASDV', 'ch7_VAR', 'ch8_LS', 'ch8_MFL', 'ch8_MSR', 'ch8_WAMP', 'ch8_ZC', 'ch8_RMS', 'ch8_IAV', 'ch8_DASDV', 'ch8_VAR', 'label', 'user', 'sample_id', 'window_start']

Label distribution:
label
pinch        734363
noGesture    734239
fist         734