James Harrison, 2023-05-04

This notebook is used to produce the following aggregates:
- All-pairs trips OD matrix
- Consecutive trips OD matrix
- Subscriber counts
- Event counts
- Active cell counts
- Total active subscribers (admin0)

for each day in the specified date range (by default, the most recently-ended full calendar month before today).

No subscriber subsetting is used.

These aggregates are intended to be produced on an ongoing basis in preparation for crisis response work.

In [None]:
import datetime
from pathlib import Path

import flowclient as fc
import flowmachine as fm
import pandas as pd
from dateutil.relativedelta import relativedelta
from flowclient import (
    consecutive_trips_od_matrix,
    location_event_counts,
    total_network_objects,
    trips_od_matrix,
    unique_subscriber_counts,
)
from get_secret_or_env_var import environ
from utils import (
    find_dates_to_exclude,
    get_date_in_month,
    run_query_and_write_result,
    run_query_and_write_result_async,
)

# Setup

## Parameters

In [None]:
# Capture "now" once; the default `start_date` below is derived from this value.
# NOTE(review): this is a naive local-time timestamp (no tz passed) — confirm that
# is what `get_date_in_month` expects.
datetime_now = datetime.datetime.now()
datetime_now

In [None]:
# All aggregates available to run using this notebook.
# These are the names accepted in `aggregates_to_calculate`; each one is also used
# as the prefix of the corresponding query label when the queries are defined.
all_aggregates = (
    "subscriber-counts",
    "all-trips",
    "consecutive-trips",
    "event-counts",
    "active-cell-counts",
    "total-active-subscribers",
)

In [None]:
# Parameters
# Passed to the result writers as the "author" entry of the additional attributes
author = "James Harrison <james.harrison@flowminder.org>"

# Default interval is derived from `datetime_now` with day_of_month=1 and
# month_offset=-1 (presumably the first day of the previous calendar month —
# confirm against `get_date_in_month` in utils).
start_date = get_date_in_month(
    datetime_now, day_of_month=1, month_offset=-1
)  # Start date of the data interval (inclusive)
end_date = None  # End date of the data interval (exclusive) (defaults to one calendar month after start date)

# Spatial parameters: these four values are passed unchanged to every per-day query
# (the admin0 total-active-subscribers query uses only `mapping_table`).
aggregation_unit = "lon-lat"  # Spatial aggregation unit
mapping_table = "geography.cell_to_admin_via_clusters_1km_20221025"  # presumably the cell-to-location mapping table — TODO confirm
geom_table = "geography.clusters_1km_20221025"  # presumably the table holding the location geometries — TODO confirm
geom_table_join_column = "cluster_id"  # presumably the column joining mapping_table to geom_table — TODO confirm
event_types = ["calls"]  # Event types to use

# Output parameters
shared_data_dir = "./"  # Writable output directory
outputs_subdir = "aggregates/crisis_response"  # Subdirectory of shared data dir to which results of aggregate queries will be written
output_format = "csv"  # 'csv' or 'netcdf'
overwrite = False  # Set True to overwrite previously-saved aggregates for this month (with overwrite=False, conflicting aggregate files will be renamed)
# Subset of `all_aggregates` to compute; any other name is rejected by the validation cell
aggregates_to_calculate = all_aggregates
require_latest_data = True  # If True, computation will not proceed if the last required day of data is later than the most recent available date
use_async_client = False  # Set True to use the asynchronous flowclient

In [None]:
# The start/end date parameters may arrive as strings, so normalise both to datetime.date.
# When no end date is given, the interval covers one calendar month from the start date.
start_date = pd.Timestamp(start_date).date()
end_date = pd.Timestamp(
    start_date + relativedelta(months=1) if end_date is None else end_date
).date()

(start_date, end_date)

In [None]:
# Build the outputs path only; the directory itself is created later, just before
# the first results are written.
outputs_dirname = f"daily_aggregates_{aggregation_unit}_{(end_date):%Y-%m-%d}"
outputs_path = Path(shared_data_dir, outputs_subdir, outputs_dirname)

outputs_path

In [None]:
# Fail early if any requested aggregate is not one this notebook knows how to produce
unknown_aggregates = {
    name for name in aggregates_to_calculate if name not in all_aggregates
}
if len(unknown_aggregates) > 0:
    raise ValueError(f"Unknown aggregate types specified: {unknown_aggregates}")

## Connect

In [None]:
if use_async_client:
    fc_conn = await fc.connect_async(
        url=environ["FLOWAPI_URL"],
        ssl_certificate=False,  # Workaround pending https://github.com/Flowminder/flowpyter-task/issues/35
        token=environ["FLOWAPI_TOKEN"],
    )
else:
    fc_conn = fc.connect(
        url=environ["FLOWAPI_URL"],
        ssl_certificate=False,  # Workaround pending https://github.com/Flowminder/flowpyter-task/issues/35
        token=environ["FLOWAPI_TOKEN"],
    )

In [None]:
# Shouldn't need this, because we're not excluding dates based on temporal truncation,
# but we still want to exclude based on missing dates, and we want to check the latest required date is available,
# both of which are currently handled by `find_dates_to_exclude` which requires a db connection.
# Only connection-pool settings are given here; presumably the remaining connection
# details are picked up from the environment — TODO confirm.
fm.connect(
    flowdb_connection_pool_overflow=20,
    flowdb_connection_pool_size=5,
)

## Check dates

In [None]:
# Ask the helper which dates in the [start_date, end_date) interval should not be processed.
# The result is later removed from a set of 'YYYY-MM-DD' strings, so it is presumably
# a collection of date strings in that format — TODO confirm against utils.
dates_to_skip = find_dates_to_exclude(
    flowdb_connection=fm.core.context.get_db(),
    start_date=start_date,
    end_date=end_date,
    event_types=event_types,
    latest_truncation_threshold="00:00:00",  # Not excluding temporally-truncated data here
    fail_on_missing_latest=require_latest_data,  # See the `require_latest_data` parameter
)
dates_to_skip

In [None]:
# Every day in the interval (end date excluded) as a 'YYYY-MM-DD' string,
# minus the days flagged for skipping, in chronological order.
candidate_dates = {
    str(day.date()) for day in pd.date_range(start_date, end_date, inclusive="left")
}
dates_to_run = sorted(candidate_dates.difference(dates_to_skip))

# FlowKit queries

## Define queries

In [None]:
def _queries_for_date(d):
    """Build the requested queries for the single day starting at ``d``.

    Returns a dict mapping output label to query object. Only aggregates named in
    ``aggregates_to_calculate`` are included, always in the same fixed order.
    """
    day_after = fm.utils.time_period_add(d, 1, "days")
    # Arguments shared by every query that uses the configured aggregation unit
    spatial_kwargs = {
        "connection": fc_conn,
        "start_date": d,
        "end_date": day_after,
        "aggregation_unit": aggregation_unit,
        "mapping_table": mapping_table,
        "geom_table": geom_table,
        "geom_table_join_column": geom_table_join_column,
        "event_types": event_types,
    }
    day_queries = {}

    # Unique subscriber counts
    if "subscriber-counts" in aggregates_to_calculate:
        day_queries[f"subscriber-counts_{d}"] = unique_subscriber_counts(
            **spatial_kwargs
        )

    # Trips OD matrix (directed, all-pairs)
    if "all-trips" in aggregates_to_calculate:
        day_queries[f"all-trips_{d}"] = trips_od_matrix(**spatial_kwargs)

    # Consecutive trips OD matrix
    if "consecutive-trips" in aggregates_to_calculate:
        day_queries[f"consecutive-trips_{d}"] = consecutive_trips_od_matrix(
            **spatial_kwargs
        )

    # Event counts
    if "event-counts" in aggregates_to_calculate:
        day_queries[f"event-counts_{d}"] = location_event_counts(
            **spatial_kwargs,
            count_interval="day",
        )

    # Active cell counts
    if "active-cell-counts" in aggregates_to_calculate:
        day_queries[f"active-cell-counts_{d}"] = total_network_objects(
            **spatial_kwargs,
            total_by="day",
        )

    # Total active subscribers: always at admin0, so the geometry-table arguments are not passed
    if "total-active-subscribers" in aggregates_to_calculate:
        day_queries[f"total-active-subscribers_admin0_{d}"] = unique_subscriber_counts(
            connection=fc_conn,
            start_date=d,
            end_date=day_after,
            aggregation_unit="admin0",
            mapping_table=mapping_table,
            event_types=event_types,
        )

    return day_queries


# One set of queries per day, keyed by "<aggregate>_<date>" labels
api_queries = {}
for run_date in dates_to_run:
    api_queries.update(_queries_for_date(run_date))

## Run queries and write results to files

In [None]:
# Extra attributes passed to the result writer together with every aggregate
additional_attrs = {
    "author": author,
    "redacted": True,  # presumably records that the results are redacted — TODO confirm
}

In [None]:
# Create the output directory (and any missing parents); an already-existing directory is not an error
outputs_path.mkdir(exist_ok=True, parents=True)

In [None]:
# `types.coroutine` is a decorator function, not a type, so it must not be used as an
# annotation; `collections.abc.Coroutine` is the ABC for coroutine objects.
from collections.abc import Coroutine
from dataclasses import dataclass
from typing import Any, Iterable, Iterator


@dataclass
class AggSpec:
    """One aggregate to fetch: the query to run and the path to write its result to."""

    # Destination path handed to the result writer
    filepath: Path
    # Query object from `api_queries`. These objects expose `.run()`, so they are
    # not plain dicts.
    query: Any


def agg_run_task(agg_spec: AggSpec) -> Coroutine:
    """Return the awaitable that runs ``agg_spec.query`` and writes its result.

    The notebook-level ``overwrite``, ``output_format`` and ``additional_attrs``
    settings are read when this function is called. Nothing runs until the
    returned object is awaited.
    """
    return run_query_and_write_result_async(
        agg_spec.query,
        filepath=agg_spec.filepath,
        overwrite=overwrite,
        file_format=output_format,
        additional_attrs=additional_attrs,
    )


def agg_task_generator(agg_specs: Iterable[AggSpec]) -> Iterator[Coroutine]:
    """Lazily yield one run-and-write task per spec, in the order given.

    Tasks are created only as the generator is consumed, so a caller that takes
    them in batches never holds un-awaited tasks beyond the current batch.
    """
    for agg_spec in agg_specs:
        yield agg_run_task(agg_spec)


if not use_async_client:
    # Sync client: set every query running up front, before waiting on any result
    for label, query in api_queries.items():
        print(f"Setting '{label}' query running...")
        query.run()
    print("All queries are running")
else:
    # Async client: only record what needs fetching; the queries are run in a later cell
    aggs_to_run = []
    for label, query in api_queries.items():
        aggs_to_run.append(AggSpec(filepath=outputs_path / label, query=query))

In [None]:
# Shamelessly nicked from the python3.12 docs
from itertools import islice


def batched(iterable, n):
    """Yield successive tuples of up to ``n`` items taken from ``iterable``.

    The final tuple may be shorter, e.g. batched('ABCDEFG', 3) → ABC DEF G.
    Because this is a generator, the ``ValueError`` for ``n < 1`` is raised when
    iteration starts rather than at call time.
    """
    if n < 1:
        raise ValueError("n must be at least one")
    source = iter(iterable)
    while True:
        chunk = tuple(islice(source, n))
        if not chunk:
            # Source exhausted
            return
        yield chunk


if use_async_client:
    import asyncio

    retry_count = 3
    batch_size = 10
    exceptions = []
    for attempt_no in range(0, retry_count):
        print(
            f"Running queries, attempt {attempt_no}. {len(aggs_to_run)} aggregates to fetch."
        )
        for batch in batched(agg_task_generator(aggs_to_run), batch_size):
            exceptions.append(await asyncio.gather(*batch, return_exceptions=True))
        exceptions = [e for e in exceptions if e is not None]
        if not exceptions:
            print("All aggregates successfully fetched")
            break
        aggs_to_run = [agg for agg in aggs_to_run if not agg.filepath.exists()]
        print(f"{len(aggs_to_run)} uploads failed")
        print([agg.filepath for agg in aggs_to_run])
    if exceptions:
        print(exceptions)
        raise Exception(
            f"The following uploads failed: {[ agg.filepath for agg in aggs_to_run ]}"
        )

else:
    for label, query in api_queries.items():
        print(f"Getting result of '{label}' query...")
        run_query_and_write_result(
            query,
            filepath=outputs_path / label,
            overwrite=overwrite,
            file_format=output_format,
            additional_attrs=additional_attrs,
        )

print("All queries completed")