In [None]:
import datetime
import re
import sys
import itertools as it
import functools as ft
import dataclasses
from dataclasses import dataclass
from pathlib import Path
from typing import *
import scipy as sp
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
sns.set_context("talk")

In [None]:
@dataclass
class Event:
    """A single timestamped observation read from the log file."""

    # Category label; parse() produces "curl", "noncurl" or "sicks".
    kind: str
    # When the event happened: the current header date combined with the
    # entry's HHMM time of day.
    time: datetime.datetime

def parse_date(s):
    """Convert a ``YYYY-MM-DD`` string into a ``datetime.date``.

    Raises ``ValueError`` when ``s`` does not match that format.
    """
    parsed = datetime.datetime.strptime(s, "%Y-%m-%d")
    return parsed.date()

def parse_time(s):
    """Convert a 24-hour ``HHMM`` string (e.g. ``"0930"``) into a ``datetime.time``.

    Raises ``ValueError`` when ``s`` does not match that format.
    """
    parsed = datetime.datetime.strptime(s, "%H%M")
    return parsed.time()

def parse(lines: Iterable[Tuple[int, str]]) -> Iterable[Event]:
    """Turn numbered log lines into a stream of ``Event`` objects.

    Each line is lower-cased and stripped, then matched against three shapes:

    * ``# YYYY-MM-DD`` -- a date header. It sets the date used for the entries
      that follow it *in iteration order*. Headers must be strictly
      increasing; otherwise the line is reported as an error and the previous
      date stays in effect.
    * ``curl|noncurl, <count>, HHMM, HHMM, ...`` -- one event per listed time.
      A mismatch between ``<count>`` and the number of times only produces a
      warning on stderr. Empty time fields (stray commas, or no times at all
      for a count of 0) are ignored.
    * ``HHMM, sicks...`` -- a single ``"sicks"`` event.

    Lines matching none of these are skipped silently.

    Args:
        lines: ``(line_number, text)`` pairs; the number is only used in
            diagnostics.

    Yields:
        One ``Event`` per recorded time.

    Any exception raised while handling a line is printed to stderr together
    with the line number, and parsing continues with the next line.
    """
    date_regex = re.compile(r"^#\s*(\d{4}-\d{2}-\d{2})$")
    curl_regex = re.compile(r"^(curl|noncurl)\s*,\s*(\d+)\s*,\s*(.*)$")
    sicks_regex = re.compile(r"^(\d{4})\s*,\s*sicks")

    date = None
    for line_number, line in lines:
        try:
            line = line.lower().strip()

            if m := date_regex.search(line):
                new_date = parse_date(m.group(1))
                if not (date is None or date < new_date):
                    raise ValueError(f"dates out-of-order {date} < {new_date}")
                date = new_date

            if m := curl_regex.search(line):
                # Fail with a readable message instead of the TypeError that
                # datetime.combine(None, ...) would raise.
                if date is None:
                    raise ValueError("entry appears before any date header")
                # Drop empty fields so trailing/doubled commas and an empty
                # list ("curl, 0,") are not treated as malformed times.
                fields = (s.strip() for s in m.group(3).split(","))
                times = [parse_time(s) for s in fields if s]
                if len(times) != int(m.group(2)):
                    print(f"Warning! Curl counts don't match {len(times)} vs {m.group(2)} at line {line_number}", file=sys.stderr)
                for time in times:
                    yield Event(m.group(1), datetime.datetime.combine(date, time))

            if m := sicks_regex.search(line):
                if date is None:
                    raise ValueError("entry appears before any date header")
                time = parse_time(m.group(1))
                yield Event("sicks", datetime.datetime.combine(date, time))

        except Exception as e:
            # Deliberately best-effort: report the bad line and keep going.
            print(f"Error {e} at line {line_number} {line!r}", file=sys.stderr)

def load(path: Path) -> List[Event]:
    """Read the log at ``path`` and return every event parsed from it.

    Lines are numbered from 1 in file order and then handed to ``parse`` in
    *reverse* order (last line of the file first), so diagnostics still refer
    to the original line numbers.
    """
    numbered = list(enumerate(path.read_text().split("\n"), start=1))
    numbered.reverse()
    return list(parse(numbered))

# Parse the raw log once when this cell runs; `events` is the module-level
# list that events_df() filters by kind.
events = load(Path("data/2022-12-07.txt"))

In [None]:
def events_df(kinds: Set[str], timespan: str) -> pd.DataFrame:
    """Build a time-sorted frame of recent events of the requested kinds.

    Reads the module-level ``events`` list.

    Args:
        kinds: Event kinds to keep (matched against ``Event.kind``).
        timespan: Window length understood by ``pd.to_timedelta`` (e.g.
            ``"52w"``). The window ends at the latest matching event, not at
            the current wall-clock time.

    Returns:
        A DataFrame with columns ``kind``, ``time``, ``date`` (calendar date
        of ``time``) and ``hour`` (hour of day of ``time``), sorted by
        ``time`` and restricted to rows strictly newer than
        ``latest - timespan``. When no event matches, an empty frame with the
        same columns is returned.
    """
    records = [{"kind": e.kind, "time": e.time} for e in events if e.kind in kinds]
    # Naming the columns keeps the frame well-formed when nothing matches;
    # without them the attribute lookups below would fail on an empty frame.
    frame = pd.DataFrame(records, columns=["kind", "time"])
    # No-op for populated frames; gives the empty frame a datetime dtype so
    # the .dt accessors work.
    frame["time"] = pd.to_datetime(frame["time"])
    frame["date"] = frame["time"].dt.date
    frame["hour"] = frame["time"].dt.hour
    frame = frame.sort_values(by="time")
    cutoff = frame["time"].max() - pd.to_timedelta(timespan)
    return frame[frame["time"] > cutoff]

def plot_count(df: pd.DataFrame) -> None:
    """Plot events per calendar day, raw and Gaussian-smoothed, on a new figure.

    Args:
        df: Frame with a ``date`` column holding one calendar date per event
            (as produced by ``events_df``). Must contain at least one row.

    Creates a 14x8 figure with the smoothed daily count (sigma = 3 days) in
    solid black and the raw daily count in translucent black, labels the
    x-axis and removes the legend. Days without events count as zero.
    """
    # Imported explicitly so the function does not depend on ``sp.ndimage``
    # already being loaded as an attribute of the top-level scipy package.
    from scipy.ndimage import gaussian_filter1d

    plt.figure(figsize=(14, 8))
    # Work with real timestamps so the counts line up with the date range by
    # construction instead of relying on date-object/Timestamp index alignment.
    days = pd.to_datetime(df["date"])
    rng = pd.date_range(days.min(), days.max())
    # Cast to float: gaussian_filter1d returns its input dtype, so integer
    # counts would make the smoothed curve integer-truncated.
    count = days.value_counts().reindex(rng, fill_value=0).astype(float)
    g = pd.DataFrame({
        "count": count,
        "smooth_count": gaussian_filter1d(count.to_numpy(), sigma=3),
    }).rename_axis("date").reset_index()
    g.plot(x='date', y='smooth_count', style="k", ax=plt.gca())
    g.plot(x='date', y='count', style="k", alpha=.4, label="_nolabel_", legend=False, ax=plt.gca())
    plt.xlabel("Date")
    plt.xticks(rotation=-15, ha="left")
    plt.gca().get_legend().remove()
    
def plot_hour(df: pd.DataFrame) -> None:
    """Plot the average number of events per day for each hour of the day.

    Args:
        df: Frame with an integer ``hour`` column (0-23) and a ``date`` column
            of calendar dates (as produced by ``events_df``).

    Creates a 14x8 figure with 24 left-aligned bars, one per hour, whose
    height is the event count for that hour divided by the number of calendar
    days covered by ``df``.
    """
    plt.figure(figsize=(14, 8))
    xs = np.arange(24)
    # The first and last dates are both part of the observed period, so the
    # day count is the difference plus one. This also avoids dividing by zero
    # when every event falls on a single date.
    n_days = (df.date.max() - df.date.min()).days + 1
    per_day = np.bincount(df.hour, minlength=24) / n_days
    plt.bar(xs, per_day, color="gray", align="edge")
    plt.xticks(xs, [f"{x:02d}" for x in xs])
    plt.xlim((0, xs.max()+1))
    plt.xlabel("Hour of day")

# Table of annotated dates used by show_dates(), which reads its `type`,
# `date` and `event` columns; `date` is converted with pd.to_datetime on load.
dates = pd.read_csv("data/dates.csv", converters=dict(date=pd.to_datetime))

def show_dates(type_: str) -> None:
    """Mark the annotated dates of one type on the current date-axis plot.

    For every row of the module-level ``dates`` table whose ``type`` equals
    ``type_`` and whose date is not left of the current x-axis lower limit,
    draws a dashed vertical line spanning the current y-limits and writes the
    row's ``event`` text at the top of the line.

    Call this after the y-limits have been fixed, since both the line extent
    and the label position are taken from ``plt.ylim()``.
    """
    # date2num yields Matplotlib's own date number (days since its configured
    # epoch), which is the unit of plt.xlim() on a date axis.
    # datetime.toordinal() counts from year 1 and only matches that unit on
    # Matplotlib releases older than 3.3.
    from matplotlib.dates import date2num

    for _, s in dates[dates.type == type_].iterrows():
        if date2num(s.date) >= plt.xlim()[0]:
            plt.vlines(s.date, *plt.ylim(), linestyles="dashed", color="k")
            plt.annotate(s.event, [s.date, plt.ylim()[1]], fontsize=12,
                         xytext=(5, -20), textcoords="offset pixels",
                         rotation=-10, rotation_mode="anchor")

# Figure 1: daily "sicks" counts over the 52 weeks leading up to the latest
# recorded event.
df = events_df({"sicks"}, "52w")
plot_count(df)
# Fix the y-range before show_dates(), which reads plt.ylim() to size and
# place its markers.
plt.ylim((0, 6))
plt.ylabel("Vomits /day")
plt.title("Vomits over time", pad=10);

# Overlay the annotated dates of type "vomit" on the figure above.
show_dates("vomit")

# Figure 2: hour-of-day distribution over the last 13 weeks of data.
plot_hour(events_df({"sicks"}, "13w"))
plt.ylabel("Vomits /hour /day")
plt.title("Vomits per hour (last 13w)", pad=10);

In [None]:
# Figure 1: daily count of "curl" and "noncurl" events combined, over the
# 52 weeks leading up to the latest recorded event.
df = events_df({"curl", "noncurl"}, "52w")

plot_count(df)
# Fix the y-range before show_dates(), which reads plt.ylim() to size and
# place its markers.
plt.ylim((0, 12))
show_dates("curl")
# plot_count() already sets this x-label; it is set again here explicitly.
plt.xlabel("Date")
plt.ylabel('Curl /day')
plt.title('Curl events over time', pad=10);

# Figure 2: hour-of-day distribution over the last 13 weeks of data.
plot_hour(events_df({"curl", "noncurl"}, "13w"))
plt.ylabel("Curl /hour /day")
plt.title("Curls per hour (last 13w)", pad=10);