James Harrison, 2022-05-11

This notebook is used to produce the following aggregates:
- Unique subscriber counts per admin3 per day
- Trips OD matrix per pair of admin3s per day
- Consecutive trips OD matrix per pair of admin3s per day
- Unique visitor counts per admin3 per day
- Home-away matrix (from monthly home admin3 to visited admin3) per day
- Event counts per admin3 per day
- Active cell counts per admin3 per day

for each day in the last month.

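By default, 'last month' is the most recently ended full calendar month before today. A different month can be selected with the `start_date` parameter, or a different day on which months start with the `month_start_day` parameter.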

These aggregates can be used to produce mobility indicators related to subscriber presence and trips.

In [None]:
import datetime
import warnings
from pathlib import Path

import flowclient as fc
import flowmachine as fm
import pandas as pd
from flowclient import (
    consecutive_trips_od_matrix,
    flows,
    location_event_counts,
    total_network_objects,
    trips_od_matrix,
    unique_locations_spec,
    unique_subscriber_counts,
)
from flowclient.aggregates import (
    active_at_reference_location_counts_spec,
    unique_subscriber_counts_spec,
)
from get_secret_or_env_var import environ
from utils import (
    check_data_availability_for_home_locations,
    find_dates_to_exclude_monthly,
    get_date_in_month,
    monthly_home_location_spec,
    monthly_subscriber_subset_query,
    run_query_and_write_result,
    run_query_and_write_result_async,
)

# Setup

## Parameters

In [None]:
# Capture the current local (timezone-naive) timestamp once; it is the
# reference point from which the default start date is derived.
datetime_now = datetime.datetime.today()
datetime_now  # Echo the value in the cell output

In [None]:
# All aggregates available to run using this notebook.
# Names requested through 'aggregates_to_calculate' are validated against this
# tuple before any query is defined.
all_aggregates = (
    "subscriber-counts",  # Unique subscriber counts
    "all-trips",  # Trips OD matrix (directed, all pairs)
    "consecutive-trips",  # Consecutive trips OD matrix
    "visitor-counts",  # Unique visitor counts (needs the home location sub-query)
    "home-away-matrix",  # Flows from home location to visited locations
    "event-counts",  # Location event counts, counted per day
    "active-cell-counts",  # Total network objects per day (never subsetted)
)

In [None]:
# Parameters
# NOTE(review): this looks like the cell whose values are overridden when the
# notebook is run through flowpyter-task - keep it to plain assignments.
author = "James Harrison <james.harrison@flowminder.org>"  # Passed to the output writer as the 'author' attribute

start_date = (
    None  # First day of the month to process (defaults to start of first full month before now); cannot be combined with month_start_day
)
month_start_day = None  # If start_date is not specified, start on this day of the month (defaults to 1, i.e. first day of a calendar month)
window_length = (
    7  # Length in days of the rolling window used to compute average call days
)
min_call_days = (
    3  # Minimum number of days in a window, on average, that a subscriber was sighted on
)
min_percent_of_data_dates = 60  # Minimum percentage of days of data that must be present for a month to be included
max_data_gap = (
    6  # Maximum length (in days) of allowed data gap for a month to be included
)
max_empty_windows = (
    None  # Maximum number of entirely-empty windows allowed for a month to be included
)
latest_truncation_threshold = (
    "18:00:00"  # Threshold for excluding temporally-truncated data
)

aggregation_unit = "admin3"  # Spatial aggregation unit
mapping_table = "geography.cell_to_admin_via_clusters_1km_20221025"  # Mapping table passed to the spatial unit and to every API query
geom_table = None  # Optional geometry table passed to the spatial unit and to every API query
geom_table_join_column = None  # Optional join column used together with geom_table
event_types = ["calls"]  # Event types to use
flowmachine_log_level = "info"  # Flowmachine log level
shared_data_dir = "./"  # Writable output directory
outputs_subdir = "aggregates/presence_trips"  # Subdirectory of shared data dir to which results of aggregate queries will be written
output_format = "csv"  # 'csv' or 'netcdf'
overwrite = False  # Set True to overwrite previously-saved aggregates for this month (with overwrite=False, conflicting aggregate files will be renamed)
include_unsubsetted = (
    False  # Set True to also calculate aggregates using the full set of subscribers
)
aggregates_to_calculate = all_aggregates  # Aggregates to produce: any subset of all_aggregates (may also be supplied as a JSON-encoded list string)
require_latest_data = True  # If True, computation will not proceed if the last required day of data is later than the most recent available date
use_async_client = False  # Set True to use the asynchronous flowclient

In [None]:
# Temporary hack until we can use list parameters with flowpyter-task:
# when 'aggregates_to_calculate' arrives as a string it is decoded as JSON
# (expected to be a JSON-encoded list of aggregate names) before being used.
if isinstance(aggregates_to_calculate, str):
    import json

    aggregates_to_calculate = json.loads(aggregates_to_calculate)

In [None]:
# Resolve the month to process. 'start_date' and 'month_start_day' are mutually
# exclusive: either the month is given explicitly, or it is derived from the
# current time and the day of the month on which months start.
if start_date is None:
    if month_start_day is None:
        # Default to calendar months
        month_start_day = 1
    start_date = get_date_in_month(
        datetime_now,
        day_of_month=month_start_day,
        # If today falls before month_start_day, the month that started most
        # recently is the one before the current partial month, so the last
        # complete month began two starts ago (this ignores ingestion time).
        month_offset=(-2 if datetime_now.day < month_start_day else -1),
    )
elif month_start_day is not None:
    raise ValueError("Only one of 'start_date' or 'month_start_day' may be specified")

In [None]:
# Build the directory path that this month's results will be written to.
# Nothing is created on disk here; the directory is only made once the notebook
# is ready to start writing outputs.
outputs_path = Path(shared_data_dir).joinpath(
    outputs_subdir,
    f"presence_trips_aggregates_{start_date}",
)

outputs_path

In [None]:
# TODO: start_date is now specified directly, so the rest of the notebook should
# be rewritten to use it instead of date_today and month_start_day.
# Until then, re-derive both values from start_date: the month starts on
# start_date's day of the month, and "today" is taken to be one month after it.
month_start_day = pd.Timestamp(start_date).day
date_today = pd.Timestamp(start_date) + pd.DateOffset(months=1)

In [None]:
# Reject any requested aggregate name that this notebook does not know about
unknown_aggregates = set(aggregates_to_calculate) - set(all_aggregates)
if len(unknown_aggregates) > 0:
    raise ValueError(f"Unknown aggregate types specified: {unknown_aggregates}")

## Connect

In [None]:
if use_async_client:
    fc_conn = await fc.connect_async(
        url=environ["FLOWAPI_URL"],
        ssl_certificate=False,  # Workaround pending https://github.com/Flowminder/flowpyter-task/issues/35
        token=environ["FLOWAPI_TOKEN"],
    )
else:
    fc_conn = fc.connect(
        url=environ["FLOWAPI_URL"],
        ssl_certificate=False,  # Workaround pending https://github.com/Flowminder/flowpyter-task/issues/35
        token=environ["FLOWAPI_TOKEN"],
    )

In [None]:
# Connect flowmachine directly; the subscriber subsets are defined and stored
# through this connection rather than through FlowAPI.
fm.connect(
    log_level=flowmachine_log_level,
    flowdb_connection_pool_size=5,
    flowdb_connection_pool_overflow=20,
)

## Check dates

In [None]:
# Check whether the data for the target month (offset -1 from date_today) are
# sufficient, and whether an earlier "lookback" month is available.
data_available, lookback_month_available, lookback_n_months = (
    check_data_availability_for_home_locations(
        date_today,
        flowdb_connection=fm.core.context.get_db(),
        month_offset=-1,
        month_start_day=month_start_day,
        window_length=window_length,
        event_types=event_types,
        latest_truncation_threshold=latest_truncation_threshold,
        fail_on_missing_latest=require_latest_data,
        min_percent_of_dates=min_percent_of_data_dates,
        max_allowed_gap=max_data_gap,
        max_empty_windows=max_empty_windows,
        min_median_included_days_per_window=min_call_days,
    )
)

# Decide which monthly subscriber subsets (identified by month offset) are needed
if not data_available:
    # No subscriber subset can be built: carry on only if something that does
    # not depend on the subset has been requested.
    if not (include_unsubsetted or "active-cell-counts" in aggregates_to_calculate):
        raise ValueError("Insufficient data to produce aggregates for this month")
    warnings.warn(
        "Insufficient data to calculate subscriber subset for this month. Only unsubsetted aggregates will be produced."
    )
    required_subsets = []
else:
    # The target month's subset is always required
    required_subsets = [-1]
    # Home locations are only needed for these two aggregates
    if set(aggregates_to_calculate) & {"visitor-counts", "home-away-matrix"}:
        if not lookback_month_available:
            warnings.warn(
                "Home locations will be calculated from just one month's majority location (no lookback)"
            )
        else:
            lookback_month_offset = -1 - lookback_n_months
            required_subsets.append(lookback_month_offset)

In [None]:
# Collect the dates that must be excluded, combined across every month for
# which a subscriber subset is required.
sorted_month_offsets = sorted(required_subsets, reverse=True)
# 'required_subsets' is empty when there are insufficient data for a subscriber
# subset but unsubsetted / active-cell-count outputs were still requested.
# Indexing the first offset would then raise IndexError, so fall back to the
# target month (offset -1), which is the month the per-day queries run over.
# NOTE(review): assumes find_dates_to_exclude_monthly can be evaluated for the
# target month even when the availability check failed - confirm.
excluded_dates_per_month = [
    find_dates_to_exclude_monthly(
        date_today,
        flowdb_connection=fm.core.context.get_db(),
        month_offset=mo,
        month_start_day=month_start_day,
        window_length=window_length,
        event_types=event_types,
        latest_truncation_threshold=latest_truncation_threshold,
        fail_on_missing_latest=require_latest_data,
    )
    for mo in (sorted_month_offsets or [-1])
]
# Union onto the first result (rather than a fresh set) so the collection type
# returned by find_dates_to_exclude_monthly is preserved.
dates_to_skip = excluded_dates_per_month[0].union(*excluded_dates_per_month[1:])
dates_to_skip

# Subscriber subsets

Subscriber subsets have to be defined and run using flowmachine directly, and then the query IDs can be used to subset FlowAPI queries.

## Define subscriber subset queries

In [None]:
# Fully-qualified events table name for each requested event type
tables = [f"events.{type_name}" for type_name in event_types]

In [None]:
# Translate the FlowAPI-style aggregation unit parameters into a flowmachine
# spatial unit. Admin units are passed as type "admin" plus a level taken from
# the last character of the unit name; any other unit name is passed through
# unchanged as the spatial unit type.
spatial_unit = fm.core.spatial_unit.make_spatial_unit(
    **(
        {"spatial_unit_type": "admin", "level": int(aggregation_unit[-1])}
        if "admin" in aggregation_unit
        else {"spatial_unit_type": aggregation_unit}
    ),
    mapping_table=mapping_table,
    geom_table=geom_table,
    geom_table_join_on=geom_table_join_column,
)

In [None]:
# One entry per required month offset. Each entry is unpacked downstream as a
# (subset query, intermediates) pair.
subscriber_subsets_and_intermediates = [
    monthly_subscriber_subset_query(
        date_today,
        month_offset=offset,
        month_start_day=month_start_day,
        window_length=window_length,
        min_call_days=min_call_days,
        tables=tables,
        spatial_unit=spatial_unit,
        dates_to_exclude=dates_to_skip,
    )
    for offset in required_subsets
]

## Run subscriber subset queries

In [None]:
# Start storing every subset query that is not already stored (together with
# its dependencies), keeping the returned futures so they can be waited on.
subset_futs = [
    subset_query.store(store_dependencies=True)
    for subset_query, _ in subscriber_subsets_and_intermediates
    if not subset_query.is_stored
]

Wait until all subsets have been calculated.

In [None]:
import concurrent.futures

# Block until every outstanding store() future has finished. Note that wait()
# does not re-raise exceptions from failed futures.
concurrent.futures.wait(
    subset_futs,
    return_when=concurrent.futures.ALL_COMPLETED,
)

In [None]:
# Show the size of each subscriber subset
[len(subset_query) for subset_query, _ in subscriber_subsets_and_intermediates]

## Wrap in Table objects so that flowmachine server can unpickle

In [None]:
# Swap each subset query for its table object (the intermediates are not needed)
subscriber_subset_tables = [
    subset_query.get_table()
    for subset_query, _ in subscriber_subsets_and_intermediates
]

In [None]:
# Query IDs of the subset tables, in the same order as 'required_subsets'
subscriber_subset_query_ids = [table.query_id for table in subscriber_subset_tables]

In [None]:
# Display the subset query IDs in the cell output
subscriber_subset_query_ids

Subset query ids can now be passed on to API queries.

# FlowAPI side

## Define queries

In [None]:
# Registry of FlowAPI queries to run, keyed by output label
api_queries = dict()

In [None]:
# Every day of the target month as a date string (month start inclusive, next
# month start exclusive), minus the dates that have to be skipped.
dates_to_run = set(
    str(timestamp.date())
    for timestamp in pd.date_range(
        start=get_date_in_month(
            date_today, day_of_month=month_start_day, month_offset=-1
        ),
        end=get_date_in_month(
            date_today, day_of_month=month_start_day, month_offset=0
        ),
        inclusive="left",
    )
).difference(dates_to_skip)

### Home location sub-query

Required for visitor-counts and home-away-matrix aggregates.

In [None]:
# The home location spec is only built when a subscriber subset exists and at
# least one of the aggregates that uses it has been requested.
if data_available and (
    set(aggregates_to_calculate) & {"visitor-counts", "home-away-matrix"}
):
    home_location_spec = monthly_home_location_spec(
        date_today,
        month_offset=-1,
        month_start_day=month_start_day,
        window_length=window_length,
        lookback_n_months=lookback_n_months,
        event_types=event_types,
        dates_to_exclude=dates_to_skip,
        aggregation_unit=aggregation_unit,
        mapping_table=mapping_table,
        geom_table=geom_table,
        geom_table_join_column=geom_table_join_column,
        # First subset ID belongs to the target month; the second (present only
        # when a lookback month is available) belongs to the lookback month.
        this_month_subscriber_subset=subscriber_subset_query_ids[0],
        last_month_subscriber_subset=(
            None if not lookback_month_available else subscriber_subset_query_ids[1]
        ),
    )

### Subsetted queries

In [None]:
# Define the per-day queries restricted to the target month's subscriber subset.
# Queries are registered in 'api_queries' under "<aggregate>_subset_<date>".
if data_available:
    for day in dates_to_run:
        # Each query runs from this day to the following day
        following_day = fm.utils.time_period_add(day, 1, "days")
        # Arguments shared by every subsetted query / query spec for this day
        subset_params = {
            "start_date": day,
            "end_date": following_day,
            "aggregation_unit": aggregation_unit,
            "mapping_table": mapping_table,
            "geom_table": geom_table,
            "geom_table_join_column": geom_table_join_column,
            "event_types": event_types,
            "subscriber_subset": subscriber_subset_query_ids[0],
        }

        # Unique subscriber counts
        if "subscriber-counts" in aggregates_to_calculate:
            api_queries[f"subscriber-counts_subset_{day}"] = unique_subscriber_counts(
                connection=fc_conn, **subset_params
            )

        # Trips OD matrix (directed, all-pairs)
        if "all-trips" in aggregates_to_calculate:
            api_queries[f"all-trips_subset_{day}"] = trips_od_matrix(
                connection=fc_conn, **subset_params
            )

        # Consecutive trips OD matrix
        if "consecutive-trips" in aggregates_to_calculate:
            api_queries[f"consecutive-trips_subset_{day}"] = (
                consecutive_trips_od_matrix(connection=fc_conn, **subset_params)
            )

        # Visitor counts. flowclient has no unique_visitor_counts function, so
        # the API query is assembled by hand from its two component specs.
        if "visitor-counts" in aggregates_to_calculate:
            api_queries[f"visitor-counts_subset_{day}"] = fc_conn.make_api_query(
                parameters={
                    "query_kind": "unique_visitor_counts",
                    "unique_subscriber_counts": unique_subscriber_counts_spec(
                        **subset_params
                    ),
                    "active_at_reference_location_counts": (
                        active_at_reference_location_counts_spec(
                            reference_locations=home_location_spec,
                            unique_locations=unique_locations_spec(**subset_params),
                        )
                    ),
                }
            )

        # Home-away matrix: flows from home location to the locations visited
        if "home-away-matrix" in aggregates_to_calculate:
            api_queries[f"home-away-matrix_subset_{day}"] = flows(
                connection=fc_conn,
                from_location=home_location_spec,
                to_location=unique_locations_spec(**subset_params),
                join_type="full outer",
            )

        # Event counts
        if "event-counts" in aggregates_to_calculate:
            api_queries[f"event-counts_subset_{day}"] = location_event_counts(
                connection=fc_conn,
                count_interval="day",
                **subset_params,
            )

### Unsubsetted queries

In [None]:
# Optionally define the same per-day queries over the full set of subscribers.
# Queries are registered in 'api_queries' under "<aggregate>_nosubset_<date>".
if include_unsubsetted:
    for day in dates_to_run:
        # Each query runs from this day to the following day
        following_day = fm.utils.time_period_add(day, 1, "days")
        # Arguments shared by every unsubsetted query for this day
        # (no subscriber_subset, and the connection is included here)
        full_params = {
            "connection": fc_conn,
            "start_date": day,
            "end_date": following_day,
            "aggregation_unit": aggregation_unit,
            "mapping_table": mapping_table,
            "geom_table": geom_table,
            "geom_table_join_column": geom_table_join_column,
            "event_types": event_types,
        }

        # Unique subscriber counts
        if "subscriber-counts" in aggregates_to_calculate:
            api_queries[f"subscriber-counts_nosubset_{day}"] = (
                unique_subscriber_counts(**full_params)
            )

        # Trips OD matrix (directed, all-pairs)
        if "all-trips" in aggregates_to_calculate:
            api_queries[f"all-trips_nosubset_{day}"] = trips_od_matrix(**full_params)

        # Consecutive trips OD matrix
        if "consecutive-trips" in aggregates_to_calculate:
            api_queries[f"consecutive-trips_nosubset_{day}"] = (
                consecutive_trips_od_matrix(**full_params)
            )

        # Event counts
        if "event-counts" in aggregates_to_calculate:
            api_queries[f"event-counts_nosubset_{day}"] = location_event_counts(
                count_interval="day",
                **full_params,
            )

### Active cell counts

Subsetting is irrelevant for this one, so we run it regardless of whether or not sufficient data are available for the monthly subscriber subset.

In [None]:
# Active cell counts: one total_network_objects query per day, never subsetted.
# Queries are registered in 'api_queries' under "active-cell-counts_<date>".
if "active-cell-counts" in aggregates_to_calculate:
    for day in dates_to_run:
        # Each query runs from this day to the following day
        following_day = fm.utils.time_period_add(day, 1, "days")
        api_queries[f"active-cell-counts_{day}"] = total_network_objects(
            connection=fc_conn,
            start_date=day,
            end_date=following_day,
            total_by="day",
            event_types=event_types,
            aggregation_unit=aggregation_unit,
            mapping_table=mapping_table,
            geom_table=geom_table,
            geom_table_join_column=geom_table_join_column,
        )

## Get results and write to files

In [None]:
# Extra attributes handed to the result writer for every output file
additional_attrs = dict(
    author=author,
    redacted=True,
    excluded_dates=sorted(dates_to_skip),
)

In [None]:
# Create the output directory (and any missing parents); an existing directory
# is left as it is.
outputs_path.mkdir(parents=True, exist_ok=True)

In [None]:
if not use_async_client:
    # With the synchronous client, set every query running up front so that
    # none of them has to wait for an earlier result to be collected.
    for label, query in api_queries.items():
        print(f"Setting '{label}' query running...")
        query.run()
    print("All queries are running")
else:
    # With the asynchronous client, build one awaitable per query; each is
    # given its own output path, named after the query's label.
    awaitables = [
        run_query_and_write_result_async(
            query,
            filepath=outputs_path / label,
            file_format=output_format,
            overwrite=overwrite,
            additional_attrs=additional_attrs,
        )
        for label, query in api_queries.items()
    ]

In [None]:
if use_async_client:
    import asyncio

    await asyncio.gather(*awaitables)
else:
    for label, query in api_queries.items():
        print(f"Getting result of '{label}' query...")
        run_query_and_write_result(
            query,
            filepath=outputs_path / label,
            overwrite=overwrite,
            file_format=output_format,
            additional_attrs=additional_attrs,
        )
print("All queries completed")