Skip to content

Commit

Permalink
fix: more progress on load_all_df, added derived.sleep, finished …
Browse files Browse the repository at this point in the history
…Whoop sleep loading, fixed location loading, and many other fixes
  • Loading branch information
ErikBjare committed May 23, 2023
1 parent ebe720b commit 3de8677
Show file tree
Hide file tree
Showing 11 changed files with 380 additions and 95 deletions.
13 changes: 6 additions & 7 deletions config.example.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,18 +8,17 @@ name = "john"
date_offset_hours = 5

[data]
categories= "categories.example.toml"
habitbull = "~/Downloads/HabitBullData.csv"
location = "~/location"
oura = "~/Downloads/oura_2020-02-27T09-07-47.json"
categories = "~/work/quantifiedme/quantifiedme/categories.example.toml"
#habitbull = "~/Downloads/HabitBullData.csv"
#location = "~/location"
#oura = "~/Downloads/oura_2020-02-27T09-07-47.json"

[data.activitywatch]
port = 5666
hostnames = ["fakedata"]

[data.smartertime_buckets]
example-hostname = '~/data/smartertime/smartertime_export_example-hostname_2020-01-01_bb7f26aa.awbucket.json'

#[data.smartertime_buckets]
#example-hostname = '~/data/smartertime/smartertime_export_example-hostname_2020-01-01_bb7f26aa.awbucket.json'

[locations]
[locations.gym]
Expand Down
137 changes: 130 additions & 7 deletions src/quantifiedme/derived/all_df.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,136 @@
import os
import logging
from typing import Literal, TypeAlias
from datetime import date, datetime, timedelta, timezone

import pandas as pd

from aw_core import Event
from typing import Literal

from .heartrate import load_heartrate_daily_df
from .screentime import load_category_df
from ..load.location import load_daily_df as load_location_daily_df
from ..load.qslang import load_daily_df as load_drugs_df

from .heartrate import load_heartrate_summary_df
from .screentime import load_screentime_cached, load_category_df
from .sleep import load_sleep_df

Sources = Literal["screentime", "heartrate", "drugs", "location", "sleep"]


def load_all_df(
    fast=True,
    screentime_events: list[Event] | None = None,
    ignore: list[Sources] | None = None,
) -> pd.DataFrame:
    """
    Loads a bunch of data into a single dataframe with one row per day.
    Serves as a useful starting point for further analysis.

    :param fast: if True, only load the last 30 days (otherwise ~2 years).
    :param screentime_events: pre-loaded screentime events; loaded from the
        pickle cache when None.
    :param ignore: sources to skip; any of "screentime", "heartrate",
        "drugs", "location", "sleep".
    """
    # NOTE: default is None rather than a mutable [] to avoid sharing state across calls
    if ignore is None:
        ignore = []
    df = pd.DataFrame()
    since = datetime.now(tz=timezone.utc) - timedelta(days=30 if fast else 2 * 365)

    if "screentime" not in ignore:
        print("Adding screentime")
        if screentime_events is None:
            screentime_events = load_screentime_cached(fast=fast, since=since)
        df_time = load_category_df(screentime_events)
        df_time = df_time[["Work", "Media", "ActivityWatch"]]
        df = join(df, df_time.add_prefix("time:"))

    if "heartrate" not in ignore:
        print("Adding heartrate")
        df_hr = load_heartrate_summary_df(freq="D")
        # translate daily datetime column to a date column
        df_hr.index = df_hr.index.date  # type: ignore
        df = join(df, df_hr)

    if "drugs" not in ignore:
        print("Adding drugs")
        # keep only columns starting with "tag"
        df_drugs = load_drugs_df()
        df_drugs = df_drugs[df_drugs.columns[df_drugs.columns.str.startswith("tag")]]
        df = join(df, df_drugs)

    if "location" not in ignore:
        print("Adding location")
        # TODO: add boolean for if sleeping together
        df_location = load_location_daily_df()
        df_location.index = df_location.index.date  # type: ignore
        df = join(df, df_location.add_prefix("loc:"))

    if "sleep" not in ignore:
        print("Adding sleep")
        df_sleep = load_sleep_df()
        df = join(df, df_sleep.add_prefix("sleep:"))

    # look for all-na columns, emit a warning, and drop them
    na_cols = df.columns[df.isna().all()]
    if len(na_cols) > 0:
        print(f"Warning: dropping all-NA columns: {str(list(na_cols))}")
        df = df.drop(columns=na_cols)

    return df


def join(df_target: pd.DataFrame, df_source: pd.DataFrame) -> pd.DataFrame:
    """
    Join a new source dataframe onto the accumulated target dataframe.

    If the target is empty (first source added), the source is returned
    as-is; otherwise warns when the source doesn't cover the target's
    date range, prints the newly added columns, and joins on the index.
    """
    # early return avoids evaluating df_target.empty twice (original checked it
    # both before and inside the final ternary)
    if df_target.empty:
        return df_source
    check_new_data_in_range(df_source, df_target)
    print(
        f"Adding new columns: {str(list(df_source.columns.difference(df_target.columns)))}"
    )
    return df_target.join(df_source)


DateLike: TypeAlias = datetime | date | pd.Timestamp


def datelike_to_date(d: DateLike) -> date:
if isinstance(d, datetime) or isinstance(d, pd.Timestamp):
return d.date()
elif isinstance(d, date):
return d
else:
raise ValueError(f"Invalid type for datelike: {type(d)}")


def check_new_data_in_range(df_source: pd.DataFrame, df_target: pd.DataFrame) -> None:
    """Emit a warning (via print) when df_source's index fails to cover df_target's date range."""
    to_date = datelike_to_date
    source_start, source_end = to_date(df_source.index.min()), to_date(df_source.index.max())
    target_start, target_end = to_date(df_target.index.min()), to_date(df_target.index.max())

    # worst case first: the two ranges are completely disjoint
    disjoint = source_start > target_end or source_end < target_start
    if disjoint:
        print(
            f"Warning: source data does not cover ANY of target data: ({source_start}/{source_end}) not in ({target_start}/{target_end})"
        )
    elif source_start > target_start:
        print(
            f"Warning: source data starts after target data (partial): {source_start} > {target_start}"
        )
    elif source_end < target_end:
        print(
            f"Warning: source data ends before target data (partial): {source_end} < {target_end}"
        )


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    # print a summary of all data
    # FAST=0 in the environment disables fast mode (loads the full ~2-year range)
    df = load_all_df(fast=os.environ.get("FAST", "1") == "1")
    print(df)
    print(df.describe())

    # check for missing data
    df_days_na = df.isna().sum()
    df_days_na = df_days_na[df_days_na > 0]
    if len(df_days_na) > 0:
        print(f"Missing data for {len(df_days_na)} out of {len(df.columns)} columns")
        print(df_days_na)
    print("Total days: ", len(df))

    # keep days with full coverage
    # (drops any day where at least one source has no data)
    df = df.dropna()
    print("Total days with full coverage: ", len(df))

    print("Final dataframe:")
    print(df)
38 changes: 24 additions & 14 deletions src/quantifiedme/derived/heartrate.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,27 +26,37 @@ def load_heartrate_df() -> pd.DataFrame:
return df


def load_heartrate_minutes_df():
    """We consider using minute-resolution a decent starting point for summary heartrate data.

    NOTE: ignores source, combines all sources into a single point per freq.
    """
    # drop the source column so samples from all devices are merged before resampling
    df = load_heartrate_df().drop(columns=["source"])
    df = df.resample("1min").mean()
    return df


def load_heartrate_summary_df(
    zones: dict[str, int] | None = None, freq="D"
) -> pd.DataFrame:
    """
    Load heartrates, group into freq, bin by zone, and return a dataframe.

    :param zones: mapping of zone name -> lower bound in bpm; the top zone is
        capped at 300 bpm. Defaults to resting/low/med/high.
    :param freq: pandas offset alias used for grouping (default daily).
    """
    # NOTE: default is None rather than a mutable dict to avoid shared-state bugs
    if zones is None:
        zones = {"resting": 0, "low": 100, "med": 140, "high": 160}
    source_df = load_heartrate_minutes_df()
    df = pd.DataFrame()
    df["hr_mean"] = source_df["hr"].groupby(pd.Grouper(freq=freq)).mean()

    # compute time spent in each zone
    # (each minute-resolution sample counts as one minute)
    df_zones = pd.cut(
        source_df["hr"], bins=[*zones.values(), 300], labels=[*zones.keys()]
    )
    for zone in zones.keys():
        df[f"hr_duration_{zone}"] = df_zones[df_zones == zone].groupby(
            pd.Grouper(freq=freq)
        ).count() * pd.Timedelta(minutes=1)
    return df


if __name__ == "__main__":
    # smoke test: print raw heartrate data, then the daily zone summary
    df = load_heartrate_df()
    print(df)
    print(df.describe())

    df = load_heartrate_summary_df()
    print(df)
27 changes: 23 additions & 4 deletions src/quantifiedme/derived/screentime.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import pickle
import logging
from datetime import datetime, timezone, timedelta
from pathlib import Path
Expand Down Expand Up @@ -37,10 +38,10 @@ def _get_aw_client(testing: bool) -> ActivityWatchClient:


def load_screentime(
since: datetime | None,
datasources: list[DatasourceType] | None,
hostnames: list[str] | None,
personal: bool,
since: datetime | None = None,
datasources: list[DatasourceType] | None = None,
hostnames: list[str] | None = None,
personal: bool = True,
cache: bool = True,
awc: ActivityWatchClient | None = None,
) -> list[Event]:
Expand Down Expand Up @@ -122,6 +123,24 @@ def load_screentime(

return events

def load_screentime_cached(*args, since: datetime | None = None, fast = False, **kwargs) -> list[Event]:
    """
    Return screentime events from the pickled cache produced by Dashboard.ipynb
    (or by this module), falling back to a full load when no cache file exists.

    :param since: only return events with timestamp at or after this time.
    :param fast: prefer the smaller "fast" cache file.
    """
    path = Path(__file__).parent.parent.parent.parent / "notebooks" / ("events_fast.pickle" if fast else "events.pickle")
    if path.exists():
        print(f"Loading from cache: {path}")
        with open(path, "rb") as f:
            events = pickle.load(f)
        # if fast didn't get us enough data to satisfy the query, we need to load the rest
        # NOTE: guard against an empty cache; assumes events are sorted by timestamp
        if fast and since and events and events[-1].timestamp < since:
            print("Fast couldn't satisfy since, trying again without fast")
            # forward *args and since so the fallback honours the same query
            events = load_screentime_cached(*args, since=since, fast=False, **kwargs)
        # trim according to since
        if since:
            events = [e for e in events if e.timestamp >= since]
        return events
    else:
        return load_screentime(*args, **kwargs)


def _join_events(
old_events: list[Event], new_events: list[Event], source: str
Expand Down
58 changes: 58 additions & 0 deletions src/quantifiedme/derived/sleep.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
"""
Aggregates sleep data from Fitbit, Oura, and Whoop into a single dataframe.
"""

from datetime import datetime, timedelta, timezone

import pandas as pd

from ..load.fitbit import load_sleep_df as load_fitbit_sleep_df
from ..load.oura import load_sleep_df as load_oura_sleep_df
from ..load.whoop import load_sleep_df as load_whoop_sleep_df


def load_sleep_df(ignore: list[str] = []) -> pd.DataFrame:
"""
Loads sleep data from Fitbit, Oura, and Whoop into a single dataframe.
"""
df = pd.DataFrame()

# Fitbit
#df = join(df, load_fitbit_sleep_df(), rsuffix="_fitbit")

# Oura
if "oura" not in ignore:
df_oura = load_oura_sleep_df()
df = join(df, df_oura.add_suffix("_oura"))

# Whoop
if "whoop" not in ignore:
df_whoop = load_whoop_sleep_df()
df = join(df, df_whoop.add_suffix("_whoop"))

# perform some aggregations
keys = list(set(col.split("_")[0] for col in df.columns) & {"duration", "score"})
for key in keys:
subkeys = df.columns[df.columns.str.startswith(key)]
df[key] = df[subkeys].mean(axis=1)
df = df[keys]

return df


def join(df_target, df_source, **kwargs) -> pd.DataFrame:
    """Join df_source onto df_target; when the target is empty, return the source unchanged."""
    return df_source if df_target.empty else df_target.join(df_source, **kwargs)


if __name__ == "__main__":
    # smoke test: load and print the combined sleep dataframe
    df = load_sleep_df()
    print(df)
    # disabled plotting snippet, kept for interactive debugging
    """
    df["duration_whoop"].plot()
    import matplotlib.pyplot as plt
    plt.show()
    """
4 changes: 4 additions & 0 deletions src/quantifiedme/load/fitbit.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@
import pandas as pd


def load_sleep_df() -> pd.DataFrame:
    """Load Fitbit sleep data as a dataframe (not yet implemented)."""
    raise NotImplementedError


def _load_heartrate_file(filepath):
# print(f"Loading {filepath}...")
# json format is {"dateTime": "2020-01-01", "value": {"bpm": 60, "confidence": 0}}
Expand Down
Loading

0 comments on commit 3de8677

Please sign in to comment.