### Purpose

The intent of this notebook is to serve as a rapid testing ground for new utilities. Any logic written here should migrate to the src/ directory as proper functions.

#### Imports and Constants

In [None]:
from datetime import timedelta
from typing import Callable, List, Union

import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.ticker as ticker
import numpy as np
import pandas as pd
from pandas.core.generic import NDFrame  # This is the generic type that encompasses Series and DataFrame
from scipy.optimize import curve_fit
from scipy.ndimage import uniform_filter1d

# Alias for "any real number", used in type annotations in this notebook
number = Union[float, int]               # Something like this should be part of the standard library IMO

# Column names
DATE = "date"
WORKOUT_TYPE = "workout_type"
DATA_DURATION = "duration(HH:mm:ss)"     # Human-readable version -- converted to seconds and renamed to DURATION while loading
DURATION = "duration(s)"                 # Convert the human-readable durations to seconds for computational simplicity
DISTANCE = "distance(km)"
STEPS = "steps"
ELEVATION = "elevation(m)"
AVG_HEART_RATE = "avg_heart_rate"
MAX_HEART_RATE = "max_heart_rate"
RESTING_HEART_RATE = "resting_heart_rate(bpm)"
NOTES = "notes"
LOCATION = "location"
EXERCISE = "exercise"
REPS = "reps"
WEIGHT = "weight(lbs)"
RATING = "rating"
# Computed columns: assigned on cardio_workouts from distance/elevation and duration
PACE = "pace (m/s)"
RATE_OF_CLIMB = "rate of climb (m/h)"

# Workout Types (values compared against the WORKOUT_TYPE column of the cardio data)
WALK_TREADMILL = "walk (treadmill)"
WALK_OUTDOOR = "walk (outdoor)"
BIKE_STATIONARY = "bike (stationary)"

#### Load Data

In [None]:
# Read each raw CSV export into its own DataFrame
health_metrics = pd.read_csv("../data/health_metrics.csv")
travel_days = pd.read_csv("../data/travel_days.csv")
cardio_workouts = pd.read_csv("../data/cardio_workouts.csv")
weight_training_workouts = pd.read_csv("../data/weight_training_workouts.csv")
weight_training_sets = pd.read_csv("../data/weight_training_sets.csv")

# Normalise column types in place on every frame
for df in (health_metrics, travel_days, cardio_workouts, weight_training_workouts, weight_training_sets):
    df[DATE] = pd.to_datetime(df[DATE])
    if DATA_DURATION in df.columns:
        # Parse the human-readable duration, keep whole seconds, then expose it under the seconds column name
        df[DATA_DURATION] = pd.to_timedelta(df[DATA_DURATION]).apply(lambda delta: int(delta.total_seconds()))
        df.rename(columns={DATA_DURATION: DURATION}, inplace=True)
    if NOTES in df.columns:
        # Missing notes become empty strings
        df[NOTES] = df[NOTES].fillna("")

# Every date on which something was logged; used later to pad all datasets to a consistent date range
all_dates = pd.concat([cardio_workouts[DATE], weight_training_workouts[DATE], travel_days[DATE]])

# Travel days carry no workout type of their own, so label them explicitly
travel_days[WORKOUT_TYPE] = "Travel"

# Keep only health-metric rows that have at least one of weight / resting heart rate
health_metrics = health_metrics.dropna(subset=[WEIGHT, RESTING_HEART_RATE], how="all")

#### Transform

In [None]:
def join_with_comma(items: List[str]):
    """Collapse a list of strs into one comma-separated str (no spaces around the commas)."""
    separator = ","
    return separator.join(items)

def det_workout_type(joined_workout_types: str):
    """Reduce a day's comma-joined workout types to a single label.

    Returns the type itself when every entry after the first non-empty one matches it,
    "Mixed" as soon as any later entry differs, and "" when there is no non-empty entry.
    """
    entries = joined_workout_types.split(",")
    # Leading blank entries are skipped: the first non-empty entry is the reference type
    start = next((pos for pos, entry in enumerate(entries) if entry), None)
    if start is None:
        return ""
    reference = entries[start]
    if any(entry != reference for entry in entries[start + 1:]):
        return "Mixed"
    return reference

# Initialize the dataframe including all dates spanning the range of the data (rest days are missing in the workout data)
all_workouts = pd.DataFrame({DATE: pd.date_range(all_dates.min(), all_dates.max())}).set_index(DATE)

# Populate the total daily workout duration.
# Each source is first summed per date, then the per-source totals are summed per date again.
# GroupBy.sum() is used rather than .agg(sum): passing the Python builtin to agg is deprecated in recent pandas.
total_durations = pd.concat([
    weight_training_workouts.groupby(DATE)[DURATION].sum(),
    cardio_workouts.groupby(DATE)[DURATION].sum(),
    travel_days.groupby(DATE)[DURATION].sum(),
]).groupby(DATE).sum()
total_durations.index = pd.DatetimeIndex(total_durations.index)
# Days with no logged activity get a duration of 0
total_durations = total_durations.reindex(all_workouts.index, fill_value=0)
all_workouts[DURATION] = total_durations

# Populate the workout types (there can be more than one per day or rest days, this smooths that out)
workout_types = pd.concat([
    weight_training_workouts.groupby(DATE)[WORKOUT_TYPE].agg(join_with_comma),
    cardio_workouts.groupby(DATE)[WORKOUT_TYPE].agg(join_with_comma),
    travel_days.groupby(DATE)[WORKOUT_TYPE].agg(join_with_comma),
]).groupby(DATE).agg(join_with_comma)
# Collapse each day's comma-joined types to a single label
workout_types = workout_types.apply(det_workout_type)
workout_types.index = pd.DatetimeIndex(workout_types.index)
# Days with no logged activity are labelled as rest days
workout_types = workout_types.reindex(all_workouts.index, fill_value="Rest Day")
all_workouts[WORKOUT_TYPE] = workout_types
del workout_types

# Populate computed fields
# Pace: km per second, scaled by 1000 to get m/s
cardio_workouts[PACE] = cardio_workouts[DISTANCE] / cardio_workouts[DURATION] * 1000
# Rate of climb: m per second, scaled by 3600 to get m/h
cardio_workouts[RATE_OF_CLIMB] = cardio_workouts[ELEVATION] / cardio_workouts[DURATION] * (60 * 60)

# Move the date back from the index into a regular column
all_workouts = all_workouts.reset_index()

#### Compute Trends

In [None]:
# Number of extra days appended after the data when projecting trendlines
EXTRAPOLATE_DAYS = 100

def f_log_curve(t, a, b, c):
    """Logarithmic model a * ln(b * t) + c; a, b and c are the parameters Scipy's curve_fit will fit."""
    scaled_log = np.log(b * t)
    return a * scaled_log + c

def f_affine(t, a, b):
    """Straight-line model a * t + b; a (slope) and b (intercept) are the parameters Scipy's curve_fit will fit."""
    slope_term = a * t
    return slope_term + b

def get_padded_dates(df: pd.DataFrame, num_days_to_pad: int):
    """Build a Series of consecutive daily dates starting at the date of df's first row.

    The result has one entry per row of df plus num_days_to_pad extra days, is named DATE,
    and is indexed by consecutive integers beginning at df's first index label.
    NOTE(review): this presumes df has an integer index and one row per consecutive day -- confirm.
    """
    start_label = df.index[0]
    total_days = len(df) + num_days_to_pad
    daily_dates = pd.date_range(df[DATE].iloc[0], periods=total_days, freq='1d')
    return pd.Series(
        daily_dates,
        index=pd.RangeIndex(start=start_label, stop=start_label + total_days),
        name=DATE,
    )

def compute_trendline(df: pd.DataFrame, key: str, f_to_fit: Callable, num_days_to_extrapolate: int):
    """Fit a trendline with the given function to the column `key` of the given DataFrame.

    Rows where `key` is null are ignored. The row index labels are used as the x values
    (NOTE(review): presumes a numeric index -- confirm) and the column values as y.

    Returns a numpy array with the fitted function evaluated over the padded index
    produced by get_padded_dates, i.e. one value per non-null row plus
    num_days_to_extrapolate extra points.
    """
    nonnulls = df[df[key].notnull()]
    # Hand plain numpy arrays to scipy so the model function sees ndarray inputs
    x = nonnulls.index.to_numpy()
    y = nonnulls[key].to_numpy()
    fitted_params, _ = curve_fit(f_to_fit, x, y)
    # Only the padded integer index is needed here; the dates themselves are plotted by the caller
    padded_index = get_padded_dates(nonnulls, num_days_to_extrapolate).index.to_numpy()
    # np.asarray accepts whatever array-like f_to_fit returns (ndarray or pandas object)
    return np.asarray(f_to_fit(padded_index, *fitted_params))

# Target daily activity in minutes. The weekly baseline is 150 minutes; 150/7 is roughly 21.4,
# so this value sits slightly above the exact daily share.
MIN_DAILY_ACTIVE_MINUTES = 22.5
# Window length (in days) of the moving average below
N_DAYS_TO_AVG = 8
# Smoothing the daily durations gives a sense of whether activity stays above that relatively low baseline
n_day_avg_workout_duration = uniform_filter1d(input=all_workouts[DURATION], size=N_DAYS_TO_AVG)

# Fit relevant trendlines: linear for weight, logarithmic for resting heart rate
weight_trendline = compute_trendline(
    df=health_metrics,
    key=WEIGHT,
    f_to_fit=f_affine,
    num_days_to_extrapolate=EXTRAPOLATE_DAYS,
)
heart_rate_trendline = compute_trendline(
    df=health_metrics,
    key=RESTING_HEART_RATE,
    f_to_fit=f_log_curve,
    num_days_to_extrapolate=EXTRAPOLATE_DAYS,
)

### Build Visuals

In [None]:
# TODO Build at least these visuals:
# * Walking data (max distance, max elevation gain, max duration, pace graph)
# * Strength Metrics (PB + 10-rep; format TBD, likely better as two separate graphs or perhaps grouped by workout)
#    - Ideally: drop-down menu to select between various workouts, each plots its PB and 10-rep over time

# Vertical position used for both the plot title (plt.title y=...) and the legend anchor,
# so that both are placed above the plot area
ABOVE_TABLE = 1.15

def convert_pd_to_np(obj: NDFrame) -> np.ndarray:
    """Copy a pandas object (or any array-like) into a numpy array with a new length-1 axis at position 1."""
    as_array = np.array(obj)
    return as_array[:, np.newaxis]

def convert_mins_to_hour_mins(mins: number, _ = None) -> str:
    """Format a number of minutes as "<m>m" when under an hour, otherwise as "<h>h <m>m".

    The second positional argument is accepted and ignored, so the function can be
    passed to ticker.FuncFormatter, which calls its function with two arguments.
    Fractional minutes are truncated by int().
    """
    if mins >= 60:
        whole_hours, leftover = divmod(mins, 60)
        return f"{int(whole_hours)}h {int(leftover)}m"
    return f"{int(mins)}m"

### Utility function, not used in main path ###
def show_gcf_corners(plot: plt):
    """Debug aid: write an "x" at each of the four (0/1, 0/1) corner coordinates of the current figure."""
    corner_coords = [(0, 0), (1, 0), (0, 1), (1, 1)]
    for x_coord, y_coord in corner_coords:
        plot.gcf().text(x_coord, y_coord, "x")

def configure_x_axis_by_month(all_workouts: pd.DataFrame, start_padding_days: int=1, end_padding_days: int=1):
    """Sets the current axes x-axis to major tick by month, minor tick on Sundays, and have MMM-YYYY major labels.

    The x-limits are set from the first row's date minus start_padding_days to the
    last row's date plus end_padding_days (row order of all_workouts is used as-is).
    """
    ax = plt.gca()
    ax.xaxis.set_major_locator(mdates.MonthLocator(interval=1))
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%b %Y'))
    ax.xaxis.set_minor_locator(mdates.WeekdayLocator(byweekday=mdates.SU))
    dates = all_workouts[DATE]
    # Positional access yields scalar timestamps regardless of the frame's index labels;
    # label lookup with [0] breaks on a non-default index and tail(1) gives a Series, not a scalar
    first_date = dates.iloc[0]
    last_date = dates.iloc[-1]
    plt.xlim(
        first_date - timedelta(days=start_padding_days),
        last_date + timedelta(days=end_padding_days)
    )

def create_legend_and_title(title: str, reverse_labels: bool=False, ncol: int=2):
    """Put a title and a frameless legend above the current plot, both positioned via ABOVE_TABLE.

    reverse_labels flips the order of the legend entries; ncol is the number of legend columns.
    """
    plt.title(title, y=ABOVE_TABLE)
    legend_handles, legend_labels = plt.gca().get_legend_handles_labels()
    if reverse_labels:
        legend_handles = list(reversed(legend_handles))
        legend_labels = list(reversed(legend_labels))
    plt.legend(
        legend_handles,
        legend_labels,
        loc='upper center',
        bbox_to_anchor=(0.5, ABOVE_TABLE),
        ncol=ncol,
        frameon=False,
    )

#### Workout Frequency

In [None]:
# x coordinate passed to plt.gcf().text() for the labels drawn next to reference lines
# (presumably tuned by hand so the text lands just right of the axes -- confirm)
RIGHT_OF_AXIS_X_COORD = 0.92
# Offset added to the label's y coordinate below
# (presumably accounts for the part of the figure below the plot area -- confirm)
NON_GRAPH_GCF_PERCENT = 0.06

# Draw the main graph contents and setup the axes
# Durations are stored in seconds; integer-divide by 60 to plot whole minutes
workout_durations_mins = all_workouts[DURATION] // 60
plt.scatter(
    all_workouts[DATE],
    workout_durations_mins,
    s=5,
    label="Workout Duration",
)
plt.plot(
    convert_pd_to_np(all_workouts[DATE]),
    convert_pd_to_np(n_day_avg_workout_duration // 60),
    label="N-Day Avg Daily Duration",
)

# Delineate the ideal minimum daily exercise threshold as a horizontal reference line
plt.axhline(y=MIN_DAILY_ACTIVE_MINUTES, color='r', linestyle='-')
# Estimate the label height as the threshold's share of the largest plotted duration, plus the offset
y_percent_min_daily_active = MIN_DAILY_ACTIVE_MINUTES / max(workout_durations_mins)
y_pos = y_percent_min_daily_active + NON_GRAPH_GCF_PERCENT
plt.gcf().text(RIGHT_OF_AXIS_X_COORD, y_pos, "Target\nMinimum")

# Set up axes
ax = plt.gca()
configure_x_axis_by_month(all_workouts)
# Show y tick labels as "<m>m" / "<h>h <m>m" with minor ticks every 5 minutes
ax.yaxis.set_major_formatter(ticker.FuncFormatter(convert_mins_to_hour_mins))
ax.yaxis.set_minor_locator(ticker.MultipleLocator(5))
plt.grid(visible=True)
plt.grid(visible=True, which="minor", linestyle="--", linewidth="0.25")

# Add in the surrounding information
# NOTE(review): re-assigns the same value ABOVE_TABLE already has from the Build Visuals cell
ABOVE_TABLE = 1.15
create_legend_and_title("Workout Frequency", reverse_labels=True)
# Save before show() so the written image contains the drawn figure
plt.savefig('../img/workout_frequency.png', bbox_inches="tight")
plt.show()

# TODO change colour of rest days to red or orange and MAYBE also color-code cardio vs weights
# Another idea would be generating sub-graphs for each step of the hierarchy. E.g.
# All -> Cardio -> Walk
# All -> Weight -> Chest (per muscle group, not pairs, bc I don't want to marry the visuals to the current splits)

#### Resting Heart Rate

In [None]:
# Fixed y-axis limits for this plot (applied with ax.set_ylim below)
Y_MIN, Y_MAX = 45, 90
# Layout constants used to convert a y data value into the y coordinate given to plt.gcf().text()
# (presumably hand-tuned to line the labels up with the reference lines -- confirm)
BOTTOM_OFFSET = 0.11
NON_GRAPH_AREA_SCALER = 1.3

# Plot the measured resting heart rates and the fitted trendline projected EXTRAPOLATE_DAYS ahead
nonnull_heart_rates = health_metrics[health_metrics[RESTING_HEART_RATE].notnull()]
padded_dates = get_padded_dates(nonnull_heart_rates, EXTRAPOLATE_DAYS)
plt.scatter(
    nonnull_heart_rates[DATE].to_numpy(),
    nonnull_heart_rates[RESTING_HEART_RATE].to_numpy(),
    s=5,
    label="Resting HR"
)
plt.plot(
    padded_dates.to_numpy(),
    heart_rate_trendline,
    linestyle="--",
    label="Projected Resting HR"
)

# Delineate various resting heart rate levels as horizontal reference lines
resting_heart_rate_levels = {
    "Average": 72,
    "Above Average": 68,
    "Good": 63,
    "Excellent": 58,
    "Athlete": 50,
}
for text, hr in resting_heart_rate_levels.items():
    plt.axhline(y=hr, color='k', linestyle='--', linewidth="0.75")
    # Map the level's position within [Y_MIN, Y_MAX] to a y coordinate for the text label
    y_range = Y_MAX - Y_MIN
    y_pos = BOTTOM_OFFSET + (hr - Y_MIN) / (NON_GRAPH_AREA_SCALER*y_range)
    plt.gcf().text(RIGHT_OF_AXIS_X_COORD, y_pos, text)

# Set up axes
ax = plt.gca()
# Extend the x-axis past the last workout date so the projected part of the trendline is visible
configure_x_axis_by_month(all_workouts, end_padding_days=EXTRAPOLATE_DAYS)
ax.yaxis.set_minor_locator(ticker.MultipleLocator(1))
ax.set_ylim([Y_MIN, Y_MAX])
plt.grid(visible=True)
plt.grid(visible=True, which="minor", linestyle="--", linewidth="0.25")

# Add in the surrounding information
create_legend_and_title("Resting Heart Rate", reverse_labels=True)
# Save before show() so the written image contains the drawn figure
plt.savefig('../img/resting_heart_rate.png', bbox_inches="tight")
plt.show()

#### Weight

In [None]:
# Fixed y-axis limits for this plot (applied with ax.set_ylim below).
# This cell also relies on BOTTOM_OFFSET, NON_GRAPH_AREA_SCALER and RIGHT_OF_AXIS_X_COORD
# having been defined by earlier cells.
Y_MIN, Y_MAX = 180, 300

# Plot the measured weights and the fitted trendline projected EXTRAPOLATE_DAYS ahead
nonnull_weights = health_metrics[health_metrics[WEIGHT].notnull()]
padded_dates = get_padded_dates(nonnull_weights, EXTRAPOLATE_DAYS)
plt.scatter(
    nonnull_weights[DATE].to_numpy(),
    nonnull_weights[WEIGHT].to_numpy(),
    s=5,
    label="Weight"
)
plt.plot(
    padded_dates.to_numpy(),
    weight_trendline,
    linestyle="--",
    label="Projected Weight"
)

# Delineate various weight levels as horizontal reference lines
weight_levels = {
    "Healthy": 250,
    "Target": 200,
}
for text, weight in weight_levels.items():
    plt.axhline(y=weight, color='k', linestyle='--', linewidth="0.75")
    # Map the level's position within [Y_MIN, Y_MAX] to a y coordinate for the text label
    y_range = Y_MAX - Y_MIN
    y_pos = BOTTOM_OFFSET + (weight - Y_MIN) / (NON_GRAPH_AREA_SCALER*y_range)
    plt.gcf().text(RIGHT_OF_AXIS_X_COORD, y_pos, text)

# Set up axes
ax = plt.gca()
# Extend the x-axis past the last workout date so the projected part of the trendline is visible
configure_x_axis_by_month(all_workouts, end_padding_days=EXTRAPOLATE_DAYS)
ax.yaxis.set_minor_locator(ticker.MultipleLocator(5))
ax.set_ylim([Y_MIN, Y_MAX])
plt.grid(visible=True)
plt.grid(visible=True, which="minor", linestyle="--", linewidth="0.25")

# Add in the surrounding information
create_legend_and_title("Weight", reverse_labels=True)
# Save before show() so the written image contains the drawn figure
plt.savefig('../img/weight.png', bbox_inches="tight")
plt.show()

In [None]:
# --- Walking: treadmill and outdoor walks combined ---
walk_workouts = cardio_workouts[cardio_workouts[WORKOUT_TYPE].isin({WALK_TREADMILL, WALK_OUTDOOR})]
distance_walked = round(walk_workouts[DISTANCE].to_numpy().sum())
avg_distance = np.average(walk_workouts[DISTANCE].to_numpy())
farthest = walk_workouts[DISTANCE].to_numpy().max()
avg_duration = round(np.average(walk_workouts[DURATION].to_numpy()))
avg_pace = np.average(walk_workouts[PACE].to_numpy())
fastest_pace = walk_workouts[PACE].to_numpy().max()
# Durations are in seconds; convert to whole minutes before formatting as hours/minutes
avg_duration_str = convert_mins_to_hour_mins(avg_duration // 60)
print("Walking Metrics")
print(f"Average distance: {avg_distance:.2f}km")
print(f"Farthest distance: {farthest}km")
print(f"Total distance: {distance_walked}km")
print(f"Average duration: {avg_duration_str}")
print(f"Average pace (m/s): {avg_pace:.2f}")
print(f"Fastest pace (m/s): {fastest_pace:.2f}")

# --- Biking: stationary bike only ---
bike_workouts = cardio_workouts[cardio_workouts[WORKOUT_TYPE] == BIKE_STATIONARY]
distance_biked = round(bike_workouts[DISTANCE].to_numpy().sum())
print()
print("Biking Metrics")
print(f"Total distance: {distance_biked}km")

# --- Overall: sum of the (already rounded) walking and biking totals ---
distance_travelled = distance_walked + distance_biked
print()
print("Summary Metrics")
# Include the unit so this line matches the other distance outputs
print(f"Total distance travelled: {distance_travelled}km")