# Activity Monitor

This notebook serves as an introduction to working with the Activity Monitor data in mHealth format. It opens the files listed in the manifest for one participant and makes a few plots.

Information on this format may be found at https://github.com/openmhealth


File organization is expected to follow this pattern:

pilot_data_root           
&emsp;wearable_activity_monitor    
&emsp;&emsp;manifest.tsv    
&emsp;&emsp;heart_rate    
&emsp;&emsp;&emsp;garmin_vivosmart5    
&emsp;&emsp;&emsp;&emsp;0001    
&emsp;&emsp;&emsp;&emsp;&emsp;0001_heartrate.json     
&emsp;&emsp;&emsp;&emsp;0002    
&emsp;&emsp;&emsp;&emsp;&emsp;0002_heartrate.json   
&emsp;&emsp;sleep    
&emsp;&emsp;&emsp;garmin_vivosmart5    
&emsp;&emsp;&emsp;&emsp;0001    
&emsp;&emsp;&emsp;&emsp;&emsp;0001_sleep.json     
&emsp;&emsp;&emsp;&emsp;0002    
&emsp;&emsp;&emsp;&emsp;&emsp;0002_sleep.json  
&emsp;&emsp;&emsp;&emsp;... etc.

In [None]:
import json
from datetime import datetime, timedelta

import matplotlib.dates as mdates  # to use ConciseDateFormatter
import matplotlib.pyplot as plt  # to make plots
import pandas as pd

## custom path -- change to match your file structure

In [None]:
# Root folder of the dataset; the manifest path and every manifest file path are appended to this string.
data_root = "/Volumes/data/datasets/AIREADI/YEAR2"  # change this to your own path

# Read the manifest

In [None]:
# The manifest sits inside the wearable_activity_monitor folder of the data root
manifest_path = f"{data_root}/wearable_activity_monitor/manifest.tsv"
print(manifest_path)

In [None]:
# The manifest is a tab-separated file; load it and list the available columns
dfm = pd.read_csv(manifest_path, delimiter="\t")
print(dfm.columns)

In [None]:
# Manifest columns whose name contains "filepath" hold file paths relative to data_root
filepath_cols = list(filter(lambda col: "filepath" in col, dfm.columns))
print(filepath_cols)

In [None]:
# Distinct participant IDs listed in the manifest, and how many there are
npid_list = dfm["participant_id"].unique()
npid = dfm["participant_id"].nunique()  # number of unique participants
print(f"{npid} unique participant_ids:\n{npid_list}")

In [None]:
# Preview the first five rows of the manifest
dfm.head(n=5)

In [None]:
# optionally view only a few columns
key_columns = ["participant_id", "sensor_sampling_duration_days", "average_heartrate_bpm"]

dfm.loc[:, key_columns].head(n=2)

# Select a set of data to explore

In [None]:
pid = 4038  # participant ID whose files are read and plotted in the cells below
# 1043, 4038 and 7060 are examples of participants with missing data

In [None]:
def mk_full(xpath):
    """Turn a manifest file-path entry into a full path under data_root.

    Args:
        xpath: manifest cell value; a path string, a missing value, or the text "None".
    Returns:
        str: data_root + xpath, or the placeholder "no_file" when the entry is
        missing (NaN/None) or is the literal string "None".
    """
    entry_is_missing = pd.isna(xpath) or xpath == "None"
    return "no_file" if entry_is_missing else data_root + xpath

In [None]:
# Look up the selected participant's manifest row once instead of re-filtering per column
pid_rows = dfm.loc[dfm["participant_id"] == pid]
if pid_rows.empty:
    # fail with a clear message instead of an IndexError on the first lookup
    raise ValueError(f"participant_id {pid} was not found in the manifest")

# Map each filepath column name to the participant's full file path (or "no_file")
full_file_dict = {c: mk_full(pid_rows[c].iloc[0]) for c in filepath_cols}

In [None]:
# Show which file (or the "no_file" placeholder) was resolved for each manifest column
for k in full_file_dict:
    v = full_file_dict[k]
    print(f"{k}  : {v}\n")

## read and explore the data

### supporting functions and information

In [None]:
def open_mhealth_json(filepath, verbose=False):
    """Read an mHealth-formatted JSON file and return its header and observations.

    Args:
        filepath (str): path to the JSON file, written with "/" separators.
        verbose (bool): when True, print the file name together with the
            top-level keys and the keys found under "body".
    Returns:
        tuple: (header, body). header is data["header"]; body is the value stored
        under the first key of data["body"], or an empty list when data["body"]
        has no keys.
    """
    # JSON text is UTF-8; do not rely on the platform's default encoding
    with open(filepath, "r", encoding="utf-8") as f:
        data = json.load(f)

    if verbose:
        # the short file name is only needed for this message
        f_simple = filepath.split("/")[-1]
        print(
            f'{f_simple} has keys {data.keys()} and body has keys {data["body"].keys()}'
        )

    header = data["header"]

    # all of the activity files currently have one more key to get to the body information;
    # fall back to an empty list so callers can treat an empty body as "no observations"
    body = next(iter(data["body"].values()), [])

    return header, body

In [None]:
def flatten_json(y):
    """Flatten nested dicts and lists into a single-level dict.

    Keys of nested dicts and positions of list items are joined with "_",
    e.g. {"a": {"b": [10, 20]}} becomes {"a_b_0": 10, "a_b_1": 20}.
    Empty dicts and empty lists contribute no keys. A non-container input is
    stored under the empty-string key.

    Args:
        y: a dict, a list, or a scalar value.
    Returns:
        dict: flattened key -> leaf value mapping.
    """
    out = {}

    def flatten(x, name=""):
        # isinstance also accepts dict/list subclasses (e.g. OrderedDict)
        if isinstance(x, dict):
            for key, value in x.items():
                flatten(value, f"{name}{key}_")
        elif isinstance(x, list):
            for index, item in enumerate(x):
                flatten(item, f"{name}{index}_")
        else:
            # leaf value: drop the trailing "_" left by the last prefix
            out[name[:-1]] = x

    flatten(y)
    return out

In [None]:
def convert_time_string_to_datetime(t_str):
    """Converts time string to datetime format. Does not convert to local time.
    Args:
        t_str (str): UTC time string such as 2023-08-01T20:39:33Z
    Returns: datetime object, or None when t_str matches none of the known
        formats (a message is printed in that case)
    """
    known_formats = (
        "%Y-%m-%dT%H:%M:%SZ",  # 4 digit Year
        # accepted for now to allow progress while the missing T, Z values are being fixed
        "%Y-%m-%d %H:%M:%S",
    )
    for time_format in known_formats:
        try:
            return datetime.strptime(t_str, time_format)
        except (ValueError, TypeError):
            # ValueError: text does not match this format; TypeError: t_str is not a string
            continue

    print(f"Unknown date format: {t_str}")
    return None

In [None]:
def create_dataframe_from_body(b, verbose=False):
    """Build a DataFrame from the list of observations found in an mHealth body.

    Each observation is flattened with flatten_json, the long time-stamp column
    names are shortened, and a parsed "start_dtime" column is added.

    Args:
        b (list): observations, each a (possibly nested) dict.
        verbose (bool): when True, print the shape and columns after each step.
    Returns:
        pandas.DataFrame: one row per observation, without the rows whose
        activity_name is the empty string.
    Raises:
        KeyError: if the observations provide no start time stamp.
    """
    interval_start_col = "effective_time_frame_time_interval_start_date_time"
    interval_end_col = "effective_time_frame_time_interval_end_date_time"
    single_time_col = "effective_time_frame_date_time"

    # Activity observations are in a list of nested dicts; flatten these
    df = pd.DataFrame.from_records([flatten_json(observation) for observation in b])
    if verbose:
        print(f"  df {df.shape} with columns {df.columns}")

    # If both time-stamp styles are present, the rename below would create two
    # "start_time" columns; merge them into a single column first
    if interval_start_col in df.columns and single_time_col in df.columns:
        df[interval_start_col] = df[interval_start_col].fillna(df[single_time_col])
        df = df.drop(columns=[single_time_col])

    # Flattening the dict creates some very long column names; shorten these
    df = df.rename(
        columns={
            # some activities use 2 time stamps
            interval_start_col: "start_time",
            interval_end_col: "end_time",
            # others use only 1 time stamp
            single_time_col: "start_time",
        }
    )
    if "start_time" not in df.columns:
        raise KeyError("start_time (no effective_time_frame start time stamp found)")

    # note that this notebook will use only the start_time for making plots
    df["start_dtime"] = [convert_time_string_to_datetime(t) for t in df["start_time"]]
    if verbose:
        print(f"  df {df.shape} with renamed columns {df.columns}")

    # physical_activity may contain an empty activity with an empty value; remove that data
    if "activity_name" in df.columns:
        df = df[df["activity_name"] != ""]
        if verbose:
            print(f"  df {df.shape} after dropping empty activities {df.columns}")

    # calories may contain a field called duration_value which is not helpful; change the name
    if "duration_value" in df.columns and "duration_unit" in df.columns:
        unit_counts = df["duration_unit"].value_counts()
        # value_counts() is empty when every unit is missing, so guard the lookup
        if not unit_counts.empty and unit_counts.index[0] == "kcal":
            df = df.rename(columns={"duration_value": "kcalorie_value"})

    return df

In [None]:
def plot_activity(df, value_col, pid=0, title=None):
    """Scatter-plot one value column against the observation start time.

    Args:
        df (pandas.DataFrame): must contain "start_dtime" and value_col columns.
        value_col (str): name of the column plotted on the y axis.
        pid (int): participant ID; accepted but not currently used by the plot.
        title (str, optional): plot title; when None, a default title showing
            the earliest and latest start_dtime is used.
    Returns:
        tuple: (min_dtime, max_dtime, fig) - earliest start_dtime, latest
        start_dtime, and the matplotlib figure.
    """
    fig, ax = plt.subplots(1, 1, figsize=(8, 3))

    ax.xaxis.set_major_locator(mdates.HourLocator(byhour=12))  # each day at noon
    ax.xaxis.set_major_formatter(mdates.DateFormatter("%Y-%m-%d %H:%M"))

    # df.plot.line(x='start_dtime', y=value_col, ax=ax, legend=False)  # another option for plotting
    df.plot.scatter(x="start_dtime", y=value_col, ax=ax, legend=False)

    # get a few stats to include in the title
    min_dtime = df["start_dtime"].min()
    max_dtime = df["start_dtime"].max()

    if title is None:
        title = f"{value_col} vs. timestamp.\nMin_date: {min_dtime}    Max_date: {max_dtime}"
    ax.set_title(title)

    # style this figure's own axes rather than whatever pyplot considers "current"
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
    fig.tight_layout()

    return min_dtime, max_dtime, fig

In [None]:
### some notes on physical_activity
# "activity_name": "",
# "activity_name": "generic",
# "activity_name": "running",
# "activity_name": "sedentary",
# "activity_name": "walking",

### open and read mHealth files

In [None]:
# set this to True if you want more information as you decipher this notebook
verbose = True

In [None]:
# For each file of the selected participant: read it, build a DataFrame, and plot
# every column whose name contains "_value" against the observation start time.
for k, v in full_file_dict.items():
    if v == "no_file":
        print(f"  No file associated with {k}")
    else:
        h, b = open_mhealth_json(v, verbose=verbose)
        # print(f'  header has keys {h.keys()}')  # not used here, but can be explored
        if verbose:
            print(f"  body has {len(b)} elements")
        if len(b) > 0:

            df = create_dataframe_from_body(b, verbose=verbose)

            value_fields = [x for x in df.columns if "_value" in x]
            if verbose:
                print(f"  plottable value fields: {len(value_fields)} {value_fields}")

            for value_col in value_fields:
                min_dtime, max_dtime, fig = plot_activity(
                    df, value_col, pid=pid, title=None
                )
                print(
                    f"   data earliest timestamp: {min_dtime}    data latest timestamp: {max_dtime}"
                )
                # Render the figure next to its printed summary, then release it so
                # open figures do not accumulate across the loop
                plt.show()
                plt.close(fig)
        else:
            print(f"  No observation data so no plot for {v}")
    print("\n", "-" * 40, "\n")

# A few final notes

Note that the watch keeps recording after the participant takes it off, and some variables continue to be saved during that time: heart rate, stress, respiratory, and movement.

Other values stop being saved once the watch is off, so they can be used to help locate only the data of interest: oxygen_saturation, sleep (shown above), and kcalorie.

In [None]:
# Final cell: prints a marker when execution reaches the end of the notebook
print("Done")