In [None]:
import datetime
import re
import sys
import itertools as it
import functools as ft
import dataclasses
from dataclasses import dataclass
from pathlib import Path
from typing import *
import scipy as sp
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
# Apply seaborn's "talk" plotting context to every figure drawn after this cell.
sns.set_context(context="talk")

In [None]:
@dataclasses.dataclass
class Event:
    """One timestamped entry taken from the event log."""

    # Category label; the parser in this file fills it with "curl" or "noncurl".
    kind: str
    # Moment of the event: the log's current date header combined with an HHMM time.
    time: datetime.datetime
    # Optional size annotation; the parser in this file always passes None.
    size: Optional[str]

def parse_date(s: str) -> datetime.date:
    """Convert a ``YYYY-MM-DD`` string into a ``datetime.date``.

    Raises:
        ValueError: propagated from ``strptime`` when ``s`` does not match
            the ``%Y-%m-%d`` format.
    """
    parsed = datetime.datetime.strptime(s, "%Y-%m-%d")
    return parsed.date()

def parse_time(s: str) -> datetime.time:
    """Convert an ``HHMM`` string (24-hour clock, no separator) into a ``datetime.time``.

    Raises:
        ValueError: propagated from ``strptime`` when ``s`` does not match
            the ``%H%M`` format.
    """
    parsed = datetime.datetime.strptime(s, "%H%M")
    return parsed.time()

def parse(lines: Iterable[Tuple[int, str]]) -> Iterable[Event]:
    """Turn numbered log lines into a stream of ``Event`` objects.

    Each line is lower-cased and stripped, then matched against two shapes:

    * ``# YYYY-MM-DD`` -- a date header. It becomes the current date for the
      event lines that follow it in iteration order. Headers must be strictly
      increasing.
    * ``curl, <count>, HHMM, HHMM, ...`` (or ``noncurl``) -- one event per
      listed time on the current date. ``<count>`` is only cross-checked
      against the number of times; a mismatch is a warning, not an error.

    Lines matching neither shape are ignored. Any problem on a line is
    reported on stderr with its line number and the line is skipped; parsing
    then continues with the next line.

    Args:
        lines: ``(line_number, text)`` pairs in the order they should be
            interpreted.

    Yields:
        One ``Event`` per listed time, with ``size`` set to ``None``.
    """
    date_regex = re.compile(r"^#\s*(\d{4}-\d{2}-\d{2})$")
    curl_regex = re.compile(r"^(curl|noncurl)\s*,\s*(\d+)\s*,\s*(.*)$")

    date = None
    for line_number, line in lines:
        try:
            line = line.lower().strip()
            if m := date_regex.search(line):
                new_date = parse_date(m.group(1))
                if not (date is None or date < new_date):
                    raise ValueError(f"dates out-of-order {date} < {new_date}")
                date = new_date
            # A line cannot be both a header ("#...") and an event ("curl..."/"noncurl...").
            elif m := curl_regex.search(line):
                if date is None:
                    # Without this, datetime.combine(None, ...) fails with an opaque TypeError.
                    raise ValueError("event line before any date header")
                raw_times = m.group(3).strip(" ,")
                # "".split(",") is [""], so a zero-event line such as "curl, 0,"
                # must be special-cased or parse_time("") reports a bogus error.
                times = [parse_time(s.strip()) for s in raw_times.split(",")] if raw_times else []
                if len(times) != int(m.group(2)):
                    print(f"Warning! Curl counts don't match {len(times)} vs {m.group(2)} at line {line_number}", file=sys.stderr)
                for time in times:
                    yield Event(m.group(1), datetime.datetime.combine(date, time), None)
        except Exception as e:
            # Best effort: report the bad line and keep parsing the rest of the log.
            print(f"Error {e} at line {line_number} {line!r}", file=sys.stderr)
    
def load(path: Path) -> List[Event]:
    """Read the log file at ``path`` and return every event ``parse`` yields.

    Lines are numbered from 1 in file order, and the numbered lines are then
    handed to ``parse`` last line first.
    """
    numbered_lines = list(enumerate(path.read_text().split("\n"), start=1))
    numbered_lines.reverse()
    return list(parse(numbered_lines))

# Parse the log once; the plotting cell reads ``events``.
EVENT_LOG_PATH = Path("data/2022-03-28.txt")
events = load(EVENT_LOG_PATH)

In [None]:
# Milestones to annotate on the figures: one "ISO-date,description" row per line.
# Read once, outside the loop, because the file does not depend on the timespan.
dates = []
for line in Path("data/dates.csv").read_text().splitlines():
    if not line.strip():
        continue  # tolerate blank lines instead of failing to unpack
    # maxsplit=1 keeps any commas that belong to the description.
    date, description = line.split(",", 1)
    dates.append([datetime.date.fromisoformat(date.strip()), description])

# Line style per event kind; also the set of kinds that get plotted.
kind_styles = dict(curl="k", noncurl="b")
# Fixed y-range so both figures share a scale (daily counts above 12 are clipped).
y_limits = (0, 12)

plot_events = [e for e in events if e.kind in kind_styles]
if not plot_events:
    # An empty frame has no "time" column and would fail below with an AttributeError.
    raise ValueError("no curl/noncurl events to plot")

# One figure per look-back window, measured back from the most recent event.
for timespan in ["36w", "12w"]:
    df = pd.DataFrame(plot_events)
    df["date"] = df.time.dt.date
    df = df.sort_values(by="time")
    df = df[df.time > df.time.max() - pd.to_timedelta(timespan)]

    fig, ax = plt.subplots(figsize=(14, 8))
    for kind, style in kind_styles.items():
        counts = (df[df.kind == kind]
                  .groupby("date")[["kind"]].count()
                  .rename(columns=dict(kind="count")))
        if counts.empty:
            continue  # nothing of this kind inside the window
        # NOTE(review): days with no events of this kind are absent from
        # `counts`, so the filter (sigma is in rows) treats the remaining days
        # as evenly spaced -- confirm whether gaps should be zero-filled.
        g = (counts
             .assign(smooth_count=sp.ndimage.gaussian_filter1d(counts["count"].astype(float), sigma=3))
             .reset_index())
        # Solid line: smoothed daily count; faint line, hidden from the legend: raw daily count.
        g.plot(x='date', y='smooth_count', style=style, label=kind, ax=ax)
        g.plot(x='date', y='count', style=style, alpha=.4, label="_nolabel_", legend=False, ax=ax)
    ax.set_ylim(*y_limits)

    # Mark each milestone that falls inside the plotted window.
    first_day = df.date.min()
    y_bottom, y_top = ax.get_ylim()
    for date, description in dates:
        if date >= first_day:
            ax.vlines(date, y_bottom, y_top, linestyles="dashed", color="k")
            ax.annotate(description, (date, y_top), fontsize=12,
                        xytext=(5, -20), textcoords="offset pixels",
                        rotation=-10, rotation_mode="anchor")

    ax.set_xlabel("Date")
    ax.set_ylabel('Curl /day')
    ax.set_title('Curl events over time', pad=10)
    ax.tick_params(axis="x", labelrotation=-15)
    ax.legend(loc="center left", bbox_to_anchor=(1, 0.5))