# Table of Contents
1. [Setup](#Setup)
2. [Inspecting download counts](#Inspecting-download-counts)
3. [Contributed resources](#Contributed-resources)
4. [Average processing time of proposed contributions](#Average-processing-time-of-proposed-contributions)
5. [Cleanup](#Cleanup)


# Setup 

In [None]:
from tempfile import mkdtemp
from pathlib import Path
from shutil import rmtree
from subprocess import run

import os
import warnings

def cleanup(folder):
    """Recursively delete ``folder`` on a best-effort basis.

    If the current working directory is ``folder`` or lies inside it, the
    process first moves to the parent of ``folder``. A directory that is
    still the working directory cannot be removed on Windows, and on POSIX
    removing it leaves the process in a directory that no longer exists.

    Args:
        folder: Path of the directory tree to remove.

    Failures are reported through ``warnings.warn`` instead of being raised.
    """
    print(f"Cleaning up {folder}")
    try:
        target = Path(folder).resolve()
        cwd = Path.cwd()
        if cwd == target or target in cwd.parents:
            os.chdir(target.parent)
    except OSError as e:
        # e.g. the working directory is already gone; still attempt the removal
        warnings.warn(str(e))

    try:
        rmtree(folder)
    except Exception as e:
        # deliberate best effort: a failed cleanup must not abort the notebook
        warnings.warn(str(e))

# When this cell is re-run in the same kernel, remove the previous checkout first.
if "temp_dir" in locals():
    cleanup(temp_dir)

temp_dir = mkdtemp()

os.chdir(temp_dir)
# Pass the command as an argument list: without shell=True a single command
# string is only split into arguments on Windows; on POSIX it is taken as the
# name of the program to execute.
run(
    [
        "git",
        "clone",
        "https://github.com/bioimage-io/collection-bioimage-io.git",
        "--branch",
        "gh-pages",
        "--single-branch",
    ],
    check=True,
)
# The following cells run git and open files relative to the clone.
os.chdir("collection-bioimage-io")
print(f"working in {Path().absolute()}")

# Inspecting download counts

In [None]:
# One "<commit hash>,<author date in strict ISO 8601>" entry per commit that
# touched download_counts.json. The command is an argument list so that it
# does not depend on platform-specific splitting of a command string.
out = run(
    ["git", "log", "--pretty=format:%H,%aI", "--", "download_counts.json"],
    check=True,
    capture_output=True,
)
# Entries contain no whitespace, so splitting on whitespace yields one item per commit.
log = out.stdout.decode().split()
print(len(log), log[0])

In [None]:
from datetime import date, datetime
from typing import NewType, Dict

Hash = NewType("Hash", str)

# Group all commits by the calendar day of their author date.
all_commits: Dict[date, Dict[datetime, Hash]] = {}
for log_entry in log:
    # "commit_hash" rather than "hash" so the builtin is not shadowed
    commit_hash, iso_datetime = log_entry.split(",")
    dt = datetime.fromisoformat(iso_datetime)
    day = all_commits.setdefault(dt.date(), {})
    if dt in day:
        # two commits with the same timestamp would silently overwrite each other
        raise ValueError(f"more than one commit at {dt.isoformat()}")
    day[dt] = Hash(commit_hash)

# Keep only the latest commit of each day.
commits: Dict[date, Hash] = {d: day[max(day)] for d, day in all_commits.items()}

len(commits)

In [None]:
from subprocess import CalledProcessError 
from tqdm import tqdm

import json

# Total number of downloads per day, read from download_counts.json as it was
# at the selected commit of that day.
all_downloads: Dict[date, int] = {}
try:
    for d, commit_hash in tqdm(commits.items(), total=len(commits)):
        run(["git", "checkout", "--force", commit_hash], check=True, capture_output=True)
        with Path("download_counts.json").open(encoding="utf-8") as f:
            counts = json.load(f)

        all_downloads[d] = sum(counts.values())
except CalledProcessError as e:
    # run() raises before returning, so the output of the failing command is
    # only available on the exception itself.
    print((e.stdout or b"").decode())
    print((e.stderr or b"").decode())
    raise
finally:
    # Always return to the branch head, also after a failure.
    run(["git", "checkout", "--force", "gh-pages"], check=True)

In [None]:
import pandas as pd

# Daily download totals as a pandas Series indexed by date; the last line
# displays the first rows as the cell output.
series = pd.Series(data=all_downloads, name="total downloads")
series.head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="darkgrid", context="talk")
plt.style.use("dark_background")

fig, axs = plt.subplots(figsize=(16, 4))

# Plot the totals in thousands; the y label carries the scale factor.
downloads_in_thousands = series / 1e3
downloads_in_thousands.plot(kind="line", ax=axs, title=series.name)
axs.set_xlabel("date")
axs.set_ylabel("10³")
plt.show()

# Contributed resources

In [None]:
from collections import Counter

# Read as UTF-8 explicitly instead of relying on the platform default encoding.
with Path("collection.json").open(encoding="utf-8") as f:
    collection = json.load(f)

col = collection["collection"]
print("total:", len(col))
# Number of entries per resource type. Counter keeps the types in first-seen
# order, and dict(...) keeps the plain-dict format of the printed result.
per_type = dict(Counter(e["type"] for e in col))

print("per type:", per_type)

# Average processing time of proposed contributions

Here we analyze the time it takes to close a generated PR that proposes to update the bioimage.io collection based on a new Zenodo record (version).

These PRs are created by the [@bioimageiobot](https://github.com/bioimageiobot) and tagged with the 'auto-update' label.
They have to be closed/merged by a (human) bioimage.io maintainer.

In [None]:
from pprint import pprint

import os
import requests

url = "https://api.github.com/graphql"
gh_token = os.getenv("GITHUB_TOKEN")
if gh_token is None:
    # explicit raise: an assert would be skipped when Python runs with -O
    raise RuntimeError("Missing env var 'GITHUB_TOKEN'")

# Closed PRs authored by bioimageiobot, newest first. Only the first page
# (at most 100 PRs) is requested; hasNextPage is not followed.
query = """
{
  search(query: "repo:bioimage-io/collection-bioimage-io is:pr author:bioimageiobot is:closed sort:created-desc", type: ISSUE, first: 100) {
    edges {
      node {
        ... on PullRequest {
          createdAt
          closedAt
        }
      }
    }
    pageInfo {
      hasNextPage
    }
  }
}
"""
# A timeout keeps the cell from hanging forever on an unresponsive connection.
r = requests.post(url, auth=("TOKEN", gh_token), json={'query': query}, timeout=60).json()
if "data" not in r:
    raise RuntimeError(r)
data = r["data"]
edges = data["search"]["edges"][::-1]  # reverse descending order to ascending
if not edges:
    raise RuntimeError("The query returned no closed PRs")

start = edges[0]['node']['createdAt']
# The PR created last is not necessarily the one closed last, so take the
# latest closing time of all PRs.
# NOTE(review): assumes all timestamps share one ISO 8601 UTC format, so that
# string comparison matches chronological order - confirm.
end = max(edge['node']['closedAt'] for edge in edges)
print(f"{len(edges)} PRs from {start} to {end}")

In [None]:
from dateutil.parser import isoparse 
from numpy import busday_count, mean

from holidays import country_holidays

# Public holidays in Baden-Württemberg within the analysed time span.
local_holidays = country_holidays("Germany", subdiv="BW")[start:end]

# One (created, duration) row per PR. A list is used instead of a dict keyed
# by the creation time so that PRs created at the same timestamp do not
# overwrite each other.
_durations = []
for edge in edges:
    created = isoparse(edge["node"]["createdAt"])
    closed = isoparse(edge["node"]["closedAt"])
    # business days from the creation date (inclusive) to the closing date
    # (exclusive), skipping weekends and the local holidays
    delta = busday_count(created.date(), closed.date(), holidays=local_holidays)
    _durations.append((created, delta))

dur_col = "duration [work days in BW]"
durations = pd.DataFrame(_durations, columns=("created", dur_col))
durations[dur_col].mean()

In [None]:
# Scatter plot of each PR's processing duration over its creation time.
fig, axes = plt.subplots(figsize=(16, 4))
durations.plot.scatter(x="created", y=dur_col, ax=axes)
plt.show()

# Cleanup

In [None]:
# Delete the temporary directory created in the Setup section, including the clone inside it.
cleanup(temp_dir)