James Harrison, 2023-06-08

This is a modified version of 'run_daily_crisis_response_aggregates.ipynb' which runs the aggregates using flowmachine directly, so that aggregates can optionally be unredacted.

This notebook is used to produce the following aggregates:
- All-pairs trips OD matrix
- Consecutive trips OD matrix
- Subscriber counts
- Event counts
- Active cell counts
- Total active subscribers (admin0)

for each day in the specified date range (by default, the most recently-ended full calendar month before today). Dates returned by `find_dates_to_exclude` in the "Check dates" section are skipped.

No subscriber subsetting is used.

These aggregates are intended to be produced on an ongoing basis in preparation for crisis response work.

In [None]:
import concurrent.futures
import datetime
import json
from pathlib import Path

import flowclient as fc
import flowmachine as fm
import pandas as pd
from dateutil.relativedelta import relativedelta
from flowclient.aggregates import (
    consecutive_trips_od_matrix_spec,
    location_event_counts_spec,
    total_network_objects_spec,
    trips_od_matrix_spec,
    unique_subscriber_counts_spec,
)
from utils import (
    _write_query_result,
    find_dates_to_exclude,
    get_date_in_month,
)

# Setup

## Parameters

In [None]:
# Reference timestamp for this run; the default start_date parameter is derived from it
datetime_now = datetime.datetime.now()
datetime_now

In [None]:
# All aggregates available to run using this notebook.
# Each name switches on one query spec per day in the "Define queries" section;
# the trailing comments name the flowclient spec constructor used for that aggregate.
all_aggregates = (
    "subscriber-counts",  # unique_subscriber_counts_spec
    "all-trips",  # trips_od_matrix_spec (directed, all-pairs)
    "consecutive-trips",  # consecutive_trips_od_matrix_spec
    "event-counts",  # location_event_counts_spec with count_interval="day"
    "active-cell-counts",  # total_network_objects_spec with total_by="day"
    "total-active-subscribers",  # unique_subscriber_counts_spec at aggregation_unit="admin0"
)

In [None]:
# Parameters
author = "James Harrison <james.harrison@flowminder.org>"  # Recorded in the attrs written alongside each output

start_date = get_date_in_month(
    datetime_now, day_of_month=1, month_offset=-1
)  # Start date of the data interval (inclusive)
end_date = None  # End date of the data interval (exclusive) (defaults to one calendar month after start date)

aggregation_unit = "lon-lat"  # Spatial aggregation unit
# The next three settings are passed straight through to the query specs.
# NOTE(review): presumably mapping_table maps cells to the units in geom_table,
# joined on geom_table_join_column — confirm against the FlowKit deployment.
mapping_table = "geography.cell_to_admin_via_clusters_1km_20221025"
geom_table = "geography.clusters_1km_20221025"
geom_table_join_column = "cluster_id"
event_types = ["calls"]  # Event types to use

flowmachine_log_level = "info"  # Flowmachine log level
shared_data_dir = "./"  # Writable output directory
outputs_subdir = "aggregates/crisis_response"  # Subdirectory of shared data dir to which results of aggregate queries will be written
output_format = "csv"  # 'csv' or 'netcdf'
overwrite = False  # Set True to overwrite previously-saved aggregates for this month (with overwrite=False, conflicting aggregate files will be renamed)
aggregates_to_calculate = all_aggregates  # Any subset of all_aggregates; unknown names are rejected before connecting
redact = False  # Set True to redact small counts from the aggregate outputs (as would be the case for results retrieved through the API)
require_latest_data = True  # If True, computation will not proceed if the last required day of data is later than the most recent available date
# NOTE(review): use_async_client is not referenced by any cell visible in this notebook
# (queries are run through flowmachine directly) — confirm whether it can be dropped.
use_async_client = False  # Set True to use the asynchronous flowclient

In [None]:
# Normalise the date parameters to datetime.date (either may have been supplied
# as a string). When no end date is given, it falls one calendar month after
# the start date.
start_date = pd.Timestamp(start_date).date()
end_date = pd.Timestamp(
    start_date + relativedelta(months=1) if end_date is None else end_date
).date()

(start_date, end_date)

In [None]:
# Output location: <shared_data_dir>/<outputs_subdir>/<run directory>, where the
# run directory name carries the aggregation unit and the (exclusive) end date.
# Only the path is built here; the directory itself is created later, in the
# "Run queries" section.
outputs_path = Path(
    shared_data_dir,
    outputs_subdir,
    f"daily_aggregates_{aggregation_unit}_{end_date:%Y-%m-%d}",
)

outputs_path

In [None]:
# Reject any requested aggregate name that this notebook does not know how to run
unknown_aggregates = set(aggregates_to_calculate) - set(all_aggregates)
if unknown_aggregates:
    raise ValueError(f"Unknown aggregate types specified: {unknown_aggregates}")

## Connect

In [None]:
# Connect flowmachine directly; log verbosity comes from the
# `flowmachine_log_level` parameter.
# NOTE(review): the connection pool size/overflow values are hard-coded here —
# presumably sized for the concurrent `store` calls made later; confirm.
fm.connect(
    flowdb_connection_pool_overflow=20,
    flowdb_connection_pool_size=5,
    log_level=flowmachine_log_level,
)

## Check dates

In [None]:
# Ask the helper which dates in the requested interval should be left out;
# the result is subtracted from the full date range when building dates_to_run.
# NOTE(review): the exclusion criteria live in utils.find_dates_to_exclude, which is
# not visible here — presumably dates lacking data for `event_types`; confirm.
dates_to_skip = find_dates_to_exclude(
    flowdb_connection=fm.core.context.get_db(),  # Connection obtained from flowmachine's context
    start_date=start_date,
    end_date=end_date,
    event_types=event_types,
    latest_truncation_threshold="00:00:00",  # Not excluding temporally-truncated data here
    fail_on_missing_latest=require_latest_data,  # Driven by the require_latest_data parameter
)
dates_to_skip

In [None]:
# Every day from start_date up to (but not including) end_date, as date strings,
# minus the dates to skip; sorted so later cells process days in order.
# NOTE(review): assumes dates_to_skip holds date strings in the same format — confirm.
dates_to_run = sorted(
    {
        str(day.date())
        for day in pd.date_range(start_date, end_date, inclusive="left")
    }
    - set(dates_to_skip)
)

# FlowKit queries

## Define queries

In [None]:
# Build one FlowKit query spec per requested aggregate per day.
# Keys of api_specs have the form "<label>_<date>"; entries are inserted date by
# date, and within a date in the order of the table below.
api_specs = {}
for d in dates_to_run:
    # Each query covers the single day starting at d
    d_next = fm.utils.time_period_add(d, 1, "days")
    # Arguments shared by every aggregate computed at `aggregation_unit`
    common_args = dict(
        start_date=d,
        end_date=d_next,
        aggregation_unit=aggregation_unit,
        mapping_table=mapping_table,
        geom_table=geom_table,
        geom_table_join_column=geom_table_join_column,
        event_types=event_types,
    )
    # Total active subscribers are counted at admin0 instead, without the
    # geometry table settings
    admin0_args = dict(
        start_date=d,
        end_date=d_next,
        aggregation_unit="admin0",
        mapping_table=mapping_table,
        event_types=event_types,
    )
    # (aggregate name, key in api_specs, spec constructor, keyword arguments)
    spec_table = (
        # Unique subscriber counts
        (
            "subscriber-counts",
            f"subscriber-counts_{d}",
            unique_subscriber_counts_spec,
            common_args,
        ),
        # Trips OD matrix (directed, all-pairs)
        (
            "all-trips",
            f"all-trips_{d}",
            trips_od_matrix_spec,
            common_args,
        ),
        # Consecutive trips OD matrix
        (
            "consecutive-trips",
            f"consecutive-trips_{d}",
            consecutive_trips_od_matrix_spec,
            common_args,
        ),
        # Event counts
        (
            "event-counts",
            f"event-counts_{d}",
            location_event_counts_spec,
            {**common_args, "count_interval": "day"},
        ),
        # Active cell counts
        (
            "active-cell-counts",
            f"active-cell-counts_{d}",
            total_network_objects_spec,
            {**common_args, "total_by": "day"},
        ),
        # Total active subscribers
        (
            "total-active-subscribers",
            f"total-active-subscribers_admin0_{d}",
            unique_subscriber_counts_spec,
            admin0_args,
        ),
    )
    for aggregate, label, build_spec, spec_kwargs in spec_table:
        # Only construct specs for the aggregates that were asked for
        if aggregate in aggregates_to_calculate:
            api_specs[label] = build_spec(**spec_kwargs)

## Run queries

In [None]:
from flowmachine.core.server.query_schemas import FlowmachineQuerySchema

In [None]:
# Create the output directory, including any missing parents; an existing
# directory is left as it is.
outputs_path.mkdir(exist_ok=True, parents=True)

In [None]:
# Turn each API query spec into a flowmachine query object, stored with its spec
# as fm_queries[label] = (query object, spec).
fm_queries = {}
for label, query_spec in api_specs.items():
    print(label)
    fm_query_obj = FlowmachineQuerySchema().load(query_spec)._flowmachine_query_obj
    # The schema-built object is used as-is when redaction is requested, and
    # always for active cell counts; otherwise its `redaction_target` is used.
    use_as_built = redact or label.startswith("active-cell-counts")
    fm_queries[label] = (
        fm_query_obj if use_as_built else fm_query_obj.redaction_target,
        query_spec,
    )

## Get results and write to files

In [None]:
# Start storing every query (together with its dependencies), then block until
# all of the returned futures have finished.
# NOTE: concurrent.futures.wait does not itself raise exceptions from the futures.
futures = [
    query.store(store_dependencies=True) for query, _spec in fm_queries.values()
]
concurrent.futures.wait(futures)

In [None]:
# Fetch each query's result and write it to the output directory, with metadata
# describing how it was produced.
for label, (query, spec) in fm_queries.items():
    print(label)
    is_cell_count = label.startswith("active-cell-counts")
    attrs = {
        "created_at": datetime.datetime.now().isoformat(),
        "flowclient_version": fc.__version__,
        "flowmachine_version": fm.__version__,
        "parameters": json.dumps(spec),
        "author": author,
        "query_id": query.query_id,
    }
    if not is_cell_count:
        # Redaction is not applicable for cell counts
        attrs["redacted"] = str(redact)
    # Unredacted outputs are marked as such in the file name
    filepath = outputs_path / (
        label if (redact or is_cell_count) else f"{label}_unredacted"
    )
    _write_query_result(
        query.get_dataframe(),
        filepath,
        file_format=output_format,
        overwrite=overwrite,
        attrs=attrs,
    )

In [None]:
# Reached only if every preceding cell ran without raising
print("All queries completed")