In [6]:
import pandas as pd
import numpy as np
import re
import copy
import os

from pathlib import Path
from datetime import datetime, timedelta


class CreateDataset:

    base_dir = ''
    granularity = 0
    data_table = None

    def __init__(self, base_dir, granularity):
        self.base_dir = base_dir
        self.granularity = granularity

    # Create an initial data table with entries from start till end time, with steps
    # of size granularity. Granularity is specified in milliseconds
    def create_time_index(self, start_time, end_time):
        return np.arange(start_time, end_time + self.granularity, self.granularity)

    def create_dataset(self, start_time, end_time, cols, prefix):
        c = copy.deepcopy(cols)
        if not prefix == '':
            for i in range(0, len(c)):
                c[i] = str(prefix) + str(c[i])
        
        time_index = self.create_time_index(start_time, end_time)
        
        # Create DataFrame with time elapsed as index
        self.data_table = pd.DataFrame(index=time_index, columns=c, dtype=object)
        self.data_table.index.name = 'time_elapsed_seconds'

    # Add numerical data, we assume timestamps in the form of nanoseconds from the epoch
    def add_numerical_dataset(self, file, time_col, value_cols, aggregation='avg', prefix=''):
        print(f'Reading data from {file}')
        dataset = pd.read_csv(self.base_dir / file, skipinitialspace=True)

        # Ensure time column is numeric (seconds)
        dataset[time_col] = pd.to_numeric(dataset[time_col])

        # Create a table based on the times found in the dataset
        if self.data_table is None:
            self.create_dataset(dataset[time_col].min(), dataset[time_col].max(), value_cols, prefix)
        else:
            for col in value_cols:
                self.data_table[str(prefix) + str(col)] = np.nan

        # Over all rows in the new table
        for i in range(0, len(self.data_table.index)):
            current_time = self.data_table.index[i]
            # Select the relevant measurements within the time window
            relevant_rows = dataset[
                (dataset[time_col] >= current_time) &
                (dataset[time_col] < (current_time + self.granularity))
            ]
            for col in value_cols:
                # Take the average value
                if len(relevant_rows) > 0:
                    if aggregation == 'avg':
                        self.data_table.loc[self.data_table.index[i], str(prefix)+str(col)] = np.average(relevant_rows[col])
                    else:
                        raise ValueError(f"Unknown aggregation {aggregation}")
                else:
                    self.data_table.loc[self.data_table.index[i], str(prefix)+str(col)] = np.nan

    # Remove undesired value from the names.
    def clean_name(self, name):
        return re.sub('[^0-9a-zA-Z]+', '', name)

    # Add data in which we have rows that indicate the occurrence of a certain event with a given start and end time.
    # 'aggregation' can be 'sum' or 'binary'.
    def add_event_dataset(self, file, start_timestamp_col, end_timestamp_col, value_col, aggregation='sum'):
        print(f'Reading data from {file}')
        dataset = pd.read_csv(self.base_dir / file)

        # Convert timestamps to datetime.
        dataset[start_timestamp_col] = pd.to_datetime(dataset[start_timestamp_col])
        dataset[end_timestamp_col] = pd.to_datetime(dataset[end_timestamp_col])

        # Clean the event values in the dataset
        dataset[value_col] = dataset[value_col].apply(self.clean_name)
        event_values = dataset[value_col].unique()

        # Add columns for all possible values (or create a new dataset if empty), set the default to 0 occurrences
        if self.data_table is None:
            self.create_dataset(min(dataset[start_timestamp_col]), max(dataset[end_timestamp_col]), event_values, value_col)
        for col in event_values:
            self.data_table[(str(value_col) + str(col))] = 0

        # Now we need to start counting by passing along the rows....
        for i in range(0, len(dataset.index)):
            # identify the time points of the row in our dataset and the value
            start = dataset[start_timestamp_col][i]
            end = dataset[end_timestamp_col][i]
            value = dataset[value_col][i]
            border = (start - timedelta(milliseconds=self.granularity))

            # get the right rows from our data table
            relevant_rows = self.data_table[(start <= (self.data_table.index +timedelta(milliseconds=self.granularity))) & (end > self.data_table.index)]

            # and add 1 to the rows if we take the sum
            if aggregation == 'sum':
                self.data_table.loc[relevant_rows.index, str(value_col) + str(value)] += 1
            # or set to 1 if we just want to know it happened
            elif aggregation == 'binary':
                self.data_table.loc[relevant_rows.index, str(value_col) + str(value)] = 1
            else:
                raise ValueError("Unknown aggregation '" + aggregation + "'")

    # This function returns the column names that have one of the strings expressed by 'ids' in the column name.
    def get_relevant_columns(self, ids):
        relevant_dataset_cols = []
        cols = list(self.data_table.columns)

        for id in ids:
            relevant_dataset_cols.extend([col for col in cols if id in col])

        return relevant_dataset_cols

In [None]:

path = Path('./data_cleaned/Timo')
onlyfiles = [f for f in os.listdir(path) if '.Identifier' not in f]

# Use all zips
zip_files = {}
for filename in onlyfiles:
    name = filename.removesuffix('.zip')
    parts = name.split()
    if len(parts) >= 3:
        key = f"{parts[0]}_{parts[1]}"
        zip_files[key] = filename

print(zip_files)


DATASET_PATH = Path('../data_cleaned/Timo')# for testing
#unzip the dataset if it is not already unzipped

RESULT_PATH = Path('./intermediate_datafiles/')
RESULT_FNAME = 'date_base_or_measure.csv'

# Set a granularity (the discrete step size of our time series data). We'll use a course-grained granularity of one
# instance per minute, and a fine-grained one with four instances per second.
GRANULARITIES = [0.1] #granularity of 10 instances per second


datasets = []
for milliseconds_per_instance in GRANULARITIES:
    print(f'Creating numerical datasets from files in {DATASET_PATH} using granularity {milliseconds_per_instance}.')

    # Create an initial dataset object with the base directory for our data and a granularity
    dataset = CreateDataset(DATASET_PATH, milliseconds_per_instance)

    # Add the selected measurements to it.

    # We add the accelerometer data (continuous numerical measurements) of the phone and the smartwatch
    # and aggregate the values per timestep by averaging the values
    dataset.add_numerical_dataset('Accelerometer.csv', 'time', ['x','y','z'], 'avg', 'Accel_')

    # We add the gyroscope data (continuous numerical measurements) of the phone and the smartwatch
    # and aggregate the values per timestep by averaging the values
    dataset.add_numerical_dataset('Gyroscope.csv', 'time', ['x','y','z'], 'avg', 'Gyro_')

    # We add the heart rate (continuous numerical measurements) and aggregate by averaging again
    dataset.add_numerical_dataset('Linear Acceleration.csv', 'time', ['x','y','z'], 'avg', 'Linear_')
    # Get the resulting pandas data table
    dataset = dataset.data_table
# Finally, store the last dataset we generated (250 ms).
dataset.to_csv(RESULT_FNAME)



{'150ml_2025-06-12': '150ml 2025-06-12 17-30-26_trimmed.zip_Linear Acceleration.png', '200_ml': '200 ml 2025-06-07 12-52-34_trimmed.zip_Linear Acceleration.png', '200ml_2025-06-07': '200ml 2025-06-07 12-52-34_trimmed.zip_Linear Acceleration.png', '230ml_2025-06-06': '230ml 2025-06-06 16-22-45_trimmed.zip_Linear Acceleration.png', '230ml_2025-06-09': '230ml 2025-06-09 11-42-04_trimmed.zip_Linear Acceleration.png', '230ml_2025-06-10': '230ml 2025-06-10 20-56-58_trimmed.zip_Linear Acceleration.png', '230ml_2025-06-11': '230ml 2025-06-11 15-48-34_trimmed.zip_Linear Acceleration.png', '360ml_2025-06-14': '360ml 2025-06-14 12-57-02_trimmed.zip_Linear Acceleration.png', '470ml_2025-06-08': '470ml 2025-06-08 15-38-54_trimmed.zip_Linear Acceleration.png', '85ml_2025-06-13': '85ml 2025-06-13 18-16-35_trimmed.zip_Linear Acceleration.png', 'base_2025-06-06': 'base 2025-06-06 15-45-11_trimmed.zip_Linear Acceleration.png', 'base_2025-06-07': 'base 2025-06-07 12-17-44_trimmed.zip_Linear Acceleration.

FileNotFoundError: [Errno 2] No such file or directory: '..\\data_cleaned\\Timo\\Accelerometer.csv'