# Readers

> Defines several methods for analyzing, plotting, and exporting wearable data, including a Pandas accessor for wearable dataframes

In [None]:
#| default_exp readers

In [None]:
#| hide 
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
#| hide
from nbdev.showdoc import *
from fastcore.test import *
from fastcore.basics import *

In [None]:
#| export
import json
import numpy as np
import pandas as pd
from typing import Dict

#| hide
# Pandas Accessor

In [None]:
#| export
#| hide
# Column names recognised as wereable data streams. A dataframe needs at
# least one of these columns to pass the accessor's column validation.
VALID_WEREABLE_STREAMS = [
    'steps',
    'heartrate',
    'wake',
    'light_estimate',
    'activity',
]

In [None]:
#| export
#| hide
@pd.api.extensions.register_dataframe_accessor("wereable")
class WereableData:
    "pd.DataFrame accessor implementing wereable-specific methods"
    def __init__(self, pandas_obj):
        # Validate on construction so `df.wereable` is only usable on wereable-shaped dataframes
        self._validate_columns(pandas_obj)
        self._obj = pandas_obj

    @staticmethod
    def _validate_columns(obj):
        "Raise `AttributeError` unless `obj` has time columns and at least one wereable stream column"
        columns = obj.columns
        has_datetime = 'datetime' in columns
        # Interval data needs BOTH bounds; a lone 'start' or 'end' is not usable
        has_interval = 'start' in columns and 'end' in columns
        if not (has_datetime or has_interval):
            raise AttributeError("DataFrame must have 'datetime' column or 'start' and 'end' columns")

        if not any(col in columns for col in VALID_WEREABLE_STREAMS):
            raise AttributeError(f"DataFrame must have at least one wereable data column from: {VALID_WEREABLE_STREAMS}.")

    @staticmethod
    def _validate_metadata(metadata):
        "Raise `AttributeError` if a non-empty `metadata` is not a dict of strings with `data_id` or `subject_id`"
        # Empty or missing metadata is accepted as-is
        if not metadata:
            return
        if not isinstance(metadata, dict):
            raise AttributeError("Metadata must be a dictionary.")
        if not any(key in metadata for key in ('data_id', 'subject_id')):
            raise AttributeError("Metadata must have at least one of the following keys: data_id, subject_id.")
        if not all(isinstance(value, str) for value in metadata.values()):
            raise AttributeError("Metadata values must be strings.")

    @staticmethod
    def rename_columns(df,
                       inplace: bool = False
                       ):
        "Standardize column names by making them lowercase and replacing spaces with underscores"
        columns = [col.lower().replace(' ', '_') for col in df.columns]
        if inplace:
            # Mutates `df` and returns None
            df.columns = columns
        else:
            new_df = df.copy()
            new_df.columns = columns
            return new_df

    def is_valid(self):
        "Return True if columns and `attrs` metadata are valid; raises `AttributeError` otherwise"
        self._validate_columns(self._obj)
        self._validate_metadata(self._obj.attrs)
        return True

    def add_metadata(self,
                     metadata: Dict[str, str], # metadata containing data_id, subject_id, or other_info
                     inplace: bool = False, # whether to return a new dataframe or modify the current one
                     ):
        "Store `metadata` in the dataframe's `attrs`; returns a modified copy unless `inplace` is True"
        self._validate_metadata(metadata)
        target = self._obj if inplace else self._obj.copy()
        # `metadata or {}` makes None/empty metadata a no-op instead of an error
        target.attrs.update(metadata or {})
        if not inplace:
            return target

  class WereableData:


#| hide
# Load files

In [None]:
#| export
#| hide
def load_json(filepath: str, # path to file
              metadata: Dict[str, str] = None, # metadata containing data_id, subject_id, or other_info
              ) -> Dict[str, pd.DataFrame]: # dictionary of wereable dataframes, one key:value pair per wereable data stream
    "Create a dataframe from a json containing a single or multiple streams of wereable data"
    # validate inputs
    if not isinstance(filepath, str):
        raise AttributeError("Filepath must be a string.")
    if metadata is not None:
        WereableData._validate_metadata(metadata)
    else:
        # placeholder metadata used when the caller supplies none
        metadata = {'data_id': 'unknown', 'subject_id': 'unknown'}
    # load json; the context manager closes the file even if parsing fails
    with open(filepath, 'r') as json_file:
        jdict = json.load(json_file)
    if not isinstance(jdict, dict):
        raise AttributeError("JSON file must contain an object mapping wereable stream names to their data.")
    # require at least one valid stream; other keys are skipped below rather than rejected
    if not any(key in VALID_WEREABLE_STREAMS for key in jdict):
        raise AttributeError("Invalid keys in JSON file. At least one key must be steps, heartrate, wake, light_estimate, or activity.")
    # create a df for each wereable stream
    df_dict = {}
    for key, records in jdict.items():
        if key not in VALID_WEREABLE_STREAMS:
            print(f"Excluded key: {key} because it's not a valid wereable stream column name.")
            continue
        df = pd.DataFrame.from_dict(records)
        # time values are interpreted as unix seconds
        if 'timestamp' in df.columns:
            df['datetime'] = pd.to_datetime(df['timestamp'], unit='s')
        elif 'start' in df.columns and 'end' in df.columns:
            df['start'] = pd.to_datetime(df['start'], unit='s')
            df['end'] = pd.to_datetime(df['end'], unit='s')
        df.wereable.add_metadata(metadata, inplace=True)
        df_dict[key] = df
    return df_dict

In [None]:
#| export
#| hide
def load_csv(filepath: str, # full path to csv file to be loaded
             metadata: Dict[str, str] = None, # metadata containing data_id, subject_id, or other_info
             timestamp_col: str = None, # name of the column to be used as timestamp. If None, it is assumed that a `datetime` column exists
             *args, # arguments to pass to pd.read_csv
             **kwargs, # keyword arguments to pass to pd.read_csv
             ):
    "Create a dataframe from a csv containing wereable data"
    # validate inputs
    if not isinstance(filepath, str):
        raise AttributeError("Filepath must be a string.")
    if timestamp_col is not None and not isinstance(timestamp_col, str):
        raise AttributeError("Timestamp column must be a string.")
    if metadata is not None:
        WereableData._validate_metadata(metadata)
    else:
        # placeholder metadata used when the caller supplies none
        metadata = {'data_id': 'unknown', 'subject_id': 'unknown'}
    # load csv
    df = pd.read_csv(filepath, *args, **kwargs)
    # create/parse the time columns
    if timestamp_col is not None:
        # the timestamp column is interpreted as unix seconds
        df['datetime'] = pd.to_datetime(df[timestamp_col], unit='s')
    elif 'datetime' in df.columns:
        df['datetime'] = pd.to_datetime(df['datetime'])
    elif 'start' in df.columns and 'end' in df.columns:
        df['start'] = pd.to_datetime(df['start'])
        df['end'] = pd.to_datetime(df['end'])
    else:
        # also reached when only one of 'start'/'end' is present: interval data needs both
        raise AttributeError("CSV file must have a column named 'datetime' or 'start' and 'end'")
    # add metadata
    df.wereable.add_metadata(metadata, inplace=True)
    return df

In [None]:
#| export
#| hide
# Actiwatch export column header -> standard wereable stream column name
ACTIWATCH_COLUMN_RENAMING = {
    'White Light': 'light_estimate',
    'Sleep/Wake': 'wake',
    'Activity': 'activity',
}

In [None]:
#| export
#| hide
def load_actiwatch(filepath: str, # full path to csv file to be loaded
                   metadata: Dict[str, str] = None, # metadata containing data_id, subject_id, or other_info
                   *args, # arguments to pass to pd.read_csv
                   **kwargs, # keyword arguments to pass to pd.read_csv
                   ) -> pd.DataFrame: # dataframe with the wereable data
    "Read an actiwatch csv export into a wereable dataframe"
    # guard clauses for the inputs
    if not isinstance(filepath, str):
        raise AttributeError("Filepath must be a string.")
    if metadata is not None:
        WereableData._validate_metadata(metadata)
    # read the raw export
    raw = pd.read_csv(filepath, *args, **kwargs)
    # join the separate date and time strings into a single datetime column
    raw['datetime'] = pd.to_datetime(raw['Date'] + " " + raw['Time'])
    # discard the source columns and switch to the standard stream names
    df = raw.drop(columns=['Date', 'Time']).rename(columns=ACTIWATCH_COLUMN_RENAMING)
    # attach the caller's metadata, or placeholders when none was given
    if metadata is None:
        df.wereable.add_metadata({'data_id': 'unknown', 'subject_id': 'unknown'}, inplace=True)
    else:
        df.wereable.add_metadata(metadata, inplace=True)
    return df

#| hide
# Resample

In [None]:
#| export
#| hide
def resample_df(df: pd.DataFrame, # dataframe to be resampled
                name: str, # name of the wereable data to resample (one of steps, heartrate, wake, light_estimate, or activity)
                freq: str, # frequency to resample to
                agg_method: str, # aggregation method to use when resampling
                initial_datetime: pd.Timestamp = None, # initial datetime to use when resampling. If None, the minimum datetime in the dataframe is used
                final_datetime: pd.Timestamp = None, # final datetime to use when resampling. If None, the maximum datetime in the dataframe is used
                ) -> pd.DataFrame: # resampled dataframe
    "Resample a wereable dataframe. If data is specified in intervals, returns the density of the quantity per minute."
    # validate inputs; the type check comes first so a non-dataframe gets a meaningful error
    if not isinstance(df, pd.DataFrame):
        raise AttributeError("Dataframe must be a pandas dataframe.")
    if not df.wereable.is_valid():
        raise AttributeError("Dataframe must be a valid wereable dataframe.")
    if not isinstance(freq, str):
        raise AttributeError("Frequency must be a string.")
    if name is not None and name not in VALID_WEREABLE_STREAMS:
        raise AttributeError(f"Name must be one of: {VALID_WEREABLE_STREAMS}.")
    if name not in df.columns:
        raise AttributeError(f"Name must be one of: {df.columns}.")
    if agg_method not in ['sum', 'mean', 'max', 'min']:
        raise AttributeError("Aggregation method must be one of: sum, mean, max, min.")
    if initial_datetime is not None and not isinstance(initial_datetime, pd.Timestamp):
        raise AttributeError("Initial datetime must be a pandas timestamp.")
    if final_datetime is not None and not isinstance(final_datetime, pd.Timestamp):
        raise AttributeError("Final datetime must be a pandas timestamp.")
    # resample
    values = df[name]
    # width of each output bin; loop-invariant, so computed once
    bin_width = pd.to_timedelta(freq)
    if 'start' in df.columns and 'end' in df.columns:
        # data is specified in intervals
        starts = df['start']
        stops = df['end']
        if initial_datetime is None:
            initial_datetime = starts.min()
        if final_datetime is None:
            final_datetime = stops.max()
        new_datetime = pd.date_range(initial_datetime, final_datetime, freq=freq)
        new_values = np.zeros(len(new_datetime))
        # NOTE: returns the density of the quantity per minute.
        # total_seconds() is used because Timedelta.seconds drops whole days.
        interval_minutes = (stops - starts).dt.total_seconds() / 60.0
        density = values / interval_minutes
        for idx, bin_start in enumerate(new_datetime):
            bin_end = bin_start + bin_width
            # half-open bin [bin_start, bin_end): an interval starting exactly at bin_end belongs to the next bin
            mask = (starts < bin_end) & (stops > bin_start)
            if mask.any():
                new_values[idx] = density[mask].agg(agg_method)
    else:
        # data is specified per datetime
        data_datetimes = df['datetime']
        if initial_datetime is None:
            initial_datetime = data_datetimes.min()
        if final_datetime is None:
            final_datetime = data_datetimes.max()
        new_datetime = pd.date_range(initial_datetime, final_datetime, freq=freq)
        new_values = np.zeros(len(new_datetime))
        for idx, bin_start in enumerate(new_datetime):
            bin_end = bin_start + bin_width
            # half-open bin [bin_start, bin_end) so a boundary sample is not counted in two bins
            mask = (data_datetimes >= bin_start) & (data_datetimes < bin_end)
            if mask.any():
                new_values[idx] = values[mask].agg(agg_method)

    # bins with no matching data keep their initial value of 0
    return pd.DataFrame({'datetime': new_datetime, name: new_values})

#| hide
# Combine

In [None]:
#| export
#| hide
# Aggregation applied to each wereable stream when dataframes are combined
WEREABLE_RESAMPLE_METHOD = {
    'steps': 'sum',
    'wake': 'max',
    'heartrate': 'mean',
    'light_estimate': 'mean',
    'activity': 'mean',
}

In [None]:
#| export
#| hide
def combine_wereable_dataframes(df_dict: Dict[str, pd.DataFrame], # dictionary of wereable dataframes 
                                resample_freq: str, # resampling frequency (e.g. '10min' for 10 minutes, see Pandas Offset aliases: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases)
                                metadata: Dict[str, str] = None, # metadata for the combined dataframe
                                ) -> pd.DataFrame: # combined wereable dataframe
    "Combine a dictionary of wereable dataframes into a single dataframe with resampling"
    # an empty input would otherwise fail with an opaque `min()` error below
    if not df_dict:
        raise ValueError("df_dict must contain at least one wereable dataframe.")
    # find common initial and final datetimes
    initial_datetimes = []
    final_datetimes = []
    for df in df_dict.values():
        df.wereable.is_valid()
        # same interval test as resample_df: both bounds must be present
        if 'start' in df.columns and 'end' in df.columns:
            initial_datetimes.append(df['start'].min())
            final_datetimes.append(df['end'].max())
        else:
            initial_datetimes.append(df['datetime'].min())
            final_datetimes.append(df['datetime'].max())
    initial_datetime = min(initial_datetimes)
    final_datetime = max(final_datetimes)
    # resample each df over the shared time span so every result has the same bins
    df_list = [
        resample_df(df, name, resample_freq,
                    WEREABLE_RESAMPLE_METHOD[name],
                    initial_datetime=initial_datetime,
                    final_datetime=final_datetime)
        for name, df in df_dict.items()
    ]
    # merge all dfs by datetime
    combined = df_list[0]
    for other in df_list[1:]:
        combined = combined.merge(other, on='datetime', how='outer')
    # sort by datetime
    combined.sort_values(by='datetime', inplace=True)
    # add metadata
    if metadata is not None:
        combined.wereable.add_metadata(metadata, inplace=True)
    else:
        combined.wereable.add_metadata({'data_id': 'combined_dataframe'}, inplace=True)
    return combined

# Overview

The `circadian.readers` module contains several methods for working with wearable data such as step counts, heart rate, and sleep. It also defines a Pandas accessor called `WereableData` to standardize and validate wearable dataframes.

# Loading wereable data

The `circadian.readers` module provides functionality to import files in several formats, including raw CSV counts, JSON files, and data coming from Actiwatch readers in CSV format. For example, to load a CSV file with heart rate data we can do:

```python
from circadian.readers import load_csv
file_path = 'circadian/sample_data/hr_data.csv'
df_hr = load_csv(file_path, timestamp_col='timestamp')
```

In [None]:
#| echo: false
# Load the sample heart-rate CSV; `load_csv` builds `datetime` from the `timestamp` column
file_path = '../../circadian/sample_data/hr_data.csv'
df_hr = load_csv(file_path, timestamp_col='timestamp')
display(df_hr)

Unnamed: 0,heartrate,timestamp,datetime
0,79.0,4.688359e+07,1971-06-27 15:13:12.693424232
1,80.0,4.688329e+07,1971-06-27 15:08:09.693448064
2,81.0,4.688306e+07,1971-06-27 15:04:20.692736632
3,80.0,4.688273e+07,1971-06-27 14:58:46.686474800
4,85.0,4.688257e+07,1971-06-27 14:56:08.187120912
...,...,...,...
99995,97.0,3.271680e+07,1971-01-14 15:59:56.779711960
99996,95.0,3.271679e+07,1971-01-14 15:59:49.779711960
99997,95.0,3.271679e+07,1971-01-14 15:59:48.779711960
99998,95.0,3.271678e+07,1971-01-14 15:59:43.779711960


By indicating which column contains the Unix timestamp information, `load_csv` automatically generates a new column with the datetime information. If no timestamp column is provided, it is assumed that a column named 'datetime' (or 'start' and 'end') is present in the file. For data specified via time intervals, such as step counts, no new column is generated and the user can choose how to process the data. For example, to load a CSV file with step counts we can do:

```python
file_path = 'circadian/sample_data/steps_data.csv'
df_steps = load_csv(file_path)
```

In [None]:
#| echo: false
# Load the sample step-count CSV; no timestamp column is given, so existing time columns are parsed
file_path = '../../circadian/sample_data/steps_data.csv'
df_steps = load_csv(file_path)
display(df_steps)

Unnamed: 0,start,end,steps
0,1970-01-01 00:00:00,1970-01-01 00:01:00,21.000000
1,1970-01-01 00:49:00,1970-01-01 00:50:00,8.183578
2,1970-01-01 00:50:00,1970-01-01 00:51:00,19.816422
3,1970-01-01 01:51:00,1970-01-01 01:52:00,0.571419
4,1970-01-01 01:52:00,1970-01-01 01:53:00,26.499032
...,...,...,...
222765,1971-06-27 14:24:00,1971-06-27 14:25:00,28.006870
222766,1971-06-27 14:25:00,1971-06-27 14:26:00,15.957981
222767,1971-06-27 14:26:00,1971-06-27 14:27:00,14.000000
222768,1971-06-27 14:37:00,1971-06-27 14:38:00,72.642453


Additionally, we can import data in JSON format. For example, to load a JSON file with multiple streams of wearable data we can do:

```python
file_path = 'circadian/sample_data/sample_data.json'
df_dict = load_json(file_path)
print(df_dict.keys())
```

In [None]:
#| echo: false
# Load the multi-stream sample JSON with explicit metadata and list the streams found
file_path = '../../circadian/sample_data/sample_data.json'
df_dict = load_json(file_path, metadata={'data_id': 'sample_wereable_data', 'subject_id': 'sample_subject'})
print(df_dict.keys())

dict_keys(['wake', 'steps', 'heartrate'])


where `df_dict` is a dictionary with the dataframes for each stream. The keys of the dictionary are the names of the streams. For example, to access the dataframe with the wake data we can do:

```python
df_wake = df_dict['wake']
```

In [None]:
#| echo: false
# Pick the `wake` stream out of the dictionary returned by `load_json`
df_wake = df_dict['wake']
display(df_wake)

Unnamed: 0,start,end,wake
0,1970-02-03 04:49:01.000000,1970-02-03 09:01:00.000000,0
1,1970-02-03 09:02:00.000000,1970-02-03 11:25:00.000000,0
2,1970-02-04 04:51:01.000000,1970-02-04 12:35:00.000000,0
3,1970-02-04 12:36:00.000000,1970-02-04 12:37:00.000000,0
4,1970-02-04 12:38:00.000000,1970-02-04 12:39:00.000000,0
...,...,...,...
2750,1971-06-27 07:38:31.105829,1971-06-27 08:01:01.105829,0
2751,1971-06-27 08:03:01.105829,1971-06-27 08:55:31.105829,0
2752,1971-06-27 09:05:31.105829,1971-06-27 09:07:01.105829,0
2753,1971-06-27 09:08:01.105829,1971-06-27 12:06:01.105829,0


The `circadian.readers` module only accepts specific column names for wearable data. The accepted column names are stored in `VALID_WEREABLE_STREAMS`:

In [None]:
#| echo: false
# Show the accepted stream column names
print(VALID_WEREABLE_STREAMS)

['steps', 'heartrate', 'wake', 'light_estimate', 'activity']


Finally, we can import data from Actiwatch readers. For example, to load a CSV file with data from an Actiwatch reader we can do:

```python
file_path = 'circadian/sample_data/sample_actiwatch.csv'
df_actiwatch = load_actiwatch(file_path)
```

In [None]:
#| echo: false
file_path = '../../circadian/sample_data/sample_actiwatch.csv'
df_actiwatch = load_actiwatch(file_path)
display(df_actiwatch)

Unnamed: 0,activity,light_estimate,wake,datetime
0,91.0,318.16,1.0,2019-02-20 12:32:00
1,125.0,285.38,1.0,2019-02-20 12:32:30
2,154.0,312.05,1.0,2019-02-20 12:33:00
3,424.0,294.61,1.0,2019-02-20 12:33:30
4,385.0,285.06,1.0,2019-02-20 12:34:00
...,...,...,...,...
55646,0.0,5.02,0.0,2019-03-11 08:15:00
55647,56.0,4.56,1.0,2019-03-11 08:15:30
55648,30.0,2.85,1.0,2019-03-11 08:16:00
55649,9.0,2.39,0.0,2019-03-11 08:16:30


Note that `load_actiwatch` automatically generates a new column with the datetime information and standardizes column names.

# Resampling wereable data

The `circadian.readers` module provides functionality to resample data specified either via time intervals or via timestamps. For example, to resample a dataframe with step counts we can do:
```python
name = 'steps'
resample_freq = '1D'
agg_method = 'sum'
resampled_steps = resample_df(df_steps, name, resample_freq, agg_method)
```

In [None]:
#| echo: false
# Resample the interval-based step counts into daily bins, aggregating with `sum`
name = 'steps'
resample_freq = '1D'
agg_method = 'sum'
resampled_steps = resample_df(df_steps, name, resample_freq, agg_method)
display(resampled_steps)

Unnamed: 0,datetime,steps
0,1970-01-01,847.000000
1,1970-01-02,1097.000000
2,1970-01-03,1064.000000
3,1970-01-04,2076.000000
4,1970-01-05,2007.000000
...,...,...
538,1971-06-23,9372.098478
539,1971-06-24,10142.402971
540,1971-06-25,15012.305396
541,1971-06-26,5747.457876


where `resample_freq` is a string indicating the frequency of the resampling in [Pandas offset aliases notation](https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases). Under `name`, the column to be resampled is specified and the `agg_method` parameter indicates how to aggregate the data.

# Combining wereable data

We can combine wearable data from different streams into a single dataframe. To achieve this we can use the `combine_wereable_dataframes` method, which resamples and aggregates data to produce a dataframe with a single `datetime` column and one column for each stream. For example, to combine all the loaded dataframes from the previous section we would do:

```python
df_dict = {
    'heartrate': df_hr,
    'steps': df_steps,
    'wake': df_wake
}
resample_freq = '1W'
combined_data = combine_wereable_dataframes(df_dict, resample_freq)
```

In [None]:
#| echo: false
# Combine the three loaded streams into one dataframe, resampled into weekly ('1W') bins.
# Each key must match the stream column in its dataframe; it is passed to `resample_df` as `name`.
df_dict = {
    'heartrate': df_hr,
    'steps': df_steps,
    'wake': df_wake
}
resample_freq = '1W'
combined_data = combine_wereable_dataframes(df_dict, resample_freq)
display(combined_data)

Unnamed: 0,datetime,heartrate,steps,wake
0,1970-01-04,0.000000,16188.000000,0.0
1,1970-01-11,0.000000,19199.000000,0.0
2,1970-01-18,0.000000,17888.000000,0.0
3,1970-01-25,0.000000,31880.133432,0.0
4,1970-02-01,0.000000,55150.172358,0.0
...,...,...,...,...
73,1971-05-30,79.914844,63341.399888,0.0
74,1971-06-06,97.080529,96297.437512,0.0
75,1971-06-13,93.772603,58357.605829,0.0
76,1971-06-20,99.018829,75479.093737,0.0


For resampling, each wearable stream has a default aggregation method. The default methods are defined in the variable `WEREABLE_RESAMPLE_METHOD`:

In [None]:
#| echo: false
# Show the per-stream aggregation methods used by `combine_wereable_dataframes`
print(WEREABLE_RESAMPLE_METHOD)

{'steps': 'sum', 'wake': 'max', 'heartrate': 'mean', 'light_estimate': 'mean', 'activity': 'mean'}


In [None]:
#| hide
# Run nbdev's export step for this notebook
import nbdev; nbdev.nbdev_export()