In [80]:
from typing import List
from matplotlib import pyplot as plt
import numpy as np
import polars as pl

In [82]:
from pisces.experiments import DataSetObject


sets = DataSetObject.find_data_sets("../data_sets")
hfd = sets['henry_ford_disordered']

In [83]:
feature = "accelerometer"
hf_id0 = hfd.ids[0]
hf0 = hfd.get_feature_data(feature, hf_id0)

In [84]:
hf0.head()

Timestamp,x,y,z
i64,f64,f64,f64
1614393486,-0.200729,-0.513245,-0.959885
1614393486,-0.364395,-0.522568,-0.994354
1614393486,-0.221451,-0.559998,-1.007111
1614393486,-0.036346,-0.347076,-0.85527
1614393486,0.066513,-0.413788,-0.847763


Many (but not _all_ 😅) of the Henry Ford subjects have accelerometer recordings that are ~50 (timestamp, x, y, z) samples with the same unix timestamp (i.e. seconds since 1970-01-01 00:00 UTC+0), with the meaning that these samples were taken approximately equally spaced. For example:

```csv
// timestamp == 1614393486
1614393486,-0.20072937,-0.513244629,-0.959884644
1614393486,-0.364395142,-0.522567749,-0.994354248
// ... 39 lines removed
1614393486,0.041000366,-0.529266357,-0.912322998
1614393486,0.041442871,-0.453567505,-0.929244995
// 1614393486 -> ....87
1614393487,0.091384888,-0.477371216,-0.872970581
1614393487,0.069549561,-0.510955811,-0.865447998
...
```

Our first objective is to process these CSVs such that the timestamp monotonically increases. The 43 samples with timestamp `1614393486` above would instead have timestamps `[1614393486 + i / 43 for i in range(43)]`.

In [85]:
timestamp_col = 'Timestamp'
hf0 = hf0.with_columns(pl.col(timestamp_col).cast(pl.datatypes.Int64)).with_columns(pl.arange(0, hf0.height).alias("index"))
hf_count = hf0.groupby(timestamp_col, maintain_order=True).agg(pl.count(), pl.first("index").alias("group_start_index"))

  hf_count = hf0.groupby(timestamp_col, maintain_order=True).agg(pl.count(), pl.first("index").alias("group_start_index"))
  hf_count = hf0.groupby(timestamp_col, maintain_order=True).agg(pl.count(), pl.first("index").alias("group_start_index"))


In [86]:
hf_count.head()

Timestamp,count,group_start_index
i64,u32,i64
1614393486,42,0
1614393487,51,42
1614393488,50,93
1614393489,49,143
1614393490,51,192


In [87]:
hf0

Timestamp,x,y,z,index
i64,f64,f64,f64,i64
1614393486,-0.200729,-0.513245,-0.959885,0
1614393486,-0.364395,-0.522568,-0.994354,1
1614393486,-0.221451,-0.559998,-1.007111,2
1614393486,-0.036346,-0.347076,-0.85527,3
1614393486,0.066513,-0.413788,-0.847763,4
1614393486,0.14679,-0.553528,-0.758621,5
1614393486,0.050262,-0.559738,-0.766861,6
1614393486,0.082016,-0.518539,-0.856354,7
1614393486,0.163315,-0.437668,-0.845825,8
1614393486,0.08284,-0.424179,-0.805283,9


In [88]:
hf0_group_rank = hf0.join(hf_count, on=timestamp_col).with_columns((pl.col("index") - pl.col("group_start_index")).alias("rank_in_group")).drop('group_start_index')

In [89]:
hf0_group_rank

Timestamp,x,y,z,index,count,rank_in_group
i64,f64,f64,f64,i64,u32,i64
1614393486,-0.200729,-0.513245,-0.959885,0,42,0
1614393486,-0.364395,-0.522568,-0.994354,1,42,1
1614393486,-0.221451,-0.559998,-1.007111,2,42,2
1614393486,-0.036346,-0.347076,-0.85527,3,42,3
1614393486,0.066513,-0.413788,-0.847763,4,42,4
1614393486,0.14679,-0.553528,-0.758621,5,42,5
1614393486,0.050262,-0.559738,-0.766861,6,42,6
1614393486,0.082016,-0.518539,-0.856354,7,42,7
1614393486,0.163315,-0.437668,-0.845825,8,42,8
1614393486,0.08284,-0.424179,-0.805283,9,42,9


In [90]:
hf0_fixed = hf0_group_rank.with_columns(pl.col(timestamp_col) + pl.col("rank_in_group") / pl.col("count")).drop("rank_in_group", "count")

In [91]:
[hf0_fixed[j, 0] for j in range(10)]

[1614393486.0,
 1614393486.0238094,
 1614393486.047619,
 1614393486.0714285,
 1614393486.0952382,
 1614393486.1190476,
 1614393486.142857,
 1614393486.1666667,
 1614393486.1904762,
 1614393486.2142856]

In [102]:
# Let's record this process in a function

def fix_timestamps(df: pl.DataFrame, timestamp_col: str = 'timestamp') -> pl.DataFrame:
    # get an absolute row index, to keep the samples in order
    df = df.with_columns(pl.col(timestamp_col).cast(pl.datatypes.Int64)).with_columns(pl.arange(0, df.height).alias("index"))

    # count the number of samples in each group, and the index at which the group starts
    df_count = df.groupby(timestamp_col, maintain_order=True).agg(pl.count(), pl.first("index").alias("group_start_index"))

    # calculate the rank of each sample in its group
    df_group_rank = df.join(df_count, on=timestamp_col).with_columns((pl.col("index") - pl.col("group_start_index")).alias("rank_in_group")).drop('group_start_index')

    # calculate the new timestamp
    df_fixed = df_group_rank.with_columns(pl.col(timestamp_col) + pl.col("rank_in_group") / pl.col("count")).drop("rank_in_group", "count", "index")

    return df_fixed

In [103]:
hfd_fixed = {
    id: fix_timestamps(hfd.get_feature_data("accelerometer", id), timestamp_col)
    for id in hfd.ids
}

  df_count = df.groupby(timestamp_col, maintain_order=True).agg(pl.count(), pl.first("index").alias("group_start_index"))
  df_count = df.groupby(timestamp_col, maintain_order=True).agg(pl.count(), pl.first("index").alias("group_start_index"))


In [104]:
hfd.get_feature_files("accelerometer")

{'AWS001': 'AWS001_motion_data.csv',
 'AWS002': 'AWS002_motion_data.csv',
 'AWS003': 'AWS003_motion_data.csv',
 'AWS004': 'AWS004_motion_data.csv',
 'AWS005': 'AWS005_motion_data.csv',
 'AWS006': 'AWS006_motion_data.csv',
 'AWS007': 'AWS007_motion_data.csv',
 'AWS008': 'AWS008_motion_data.csv',
 'AWS009': 'AWS009_motion_data.csv',
 'AWS010': 'AWS010_motion_data.csv',
 'AWS011': 'AWS011_motion_data.csv',
 'AWS012': 'AWS012_motion_data.csv',
 'AWS013': 'AWS013_motion_data.csv',
 'AWS014': 'AWS014_motion_data.csv',
 'AWS015': 'AWS015_motion_data.csv',
 'AWS016': 'AWS016_motion_data.csv',
 'AWS017': 'AWS017_motion_data.csv',
 'AWS018': 'AWS018_motion_data.csv',
 'AWS019': 'AWS019_motion_data.csv',
 'AWS020': 'AWS020_motion_data.csv',
 'AWS021': 'AWS021_motion_data.csv',
 'AWS022': 'AWS022_motion_data.csv',
 'AWS023': 'AWS023_motion_data.csv',
 'AWS024': 'AWS024_motion_data.csv',
 'AWS025': 'AWS025_motion_data.csv',
 'AWS026': 'AWS026_motion_data.csv',
 'AWS028': 'AWS028_motion_data.csv',
 

In [105]:
hfd.get_feature_path("accelerometer")

PosixPath('../data_sets/henry_ford_disordered/cleaned_accelerometer')

In [106]:
paths = {
    id: hfd.get_feature_path("accelerometer").joinpath(hfd.get_feature_files("accelerometer")[id])
    for id in hfd.ids
}

In [107]:
for id in hfd.ids:
    hfd_fixed[id].write_csv(paths[id], include_header=True, float_precision=9)