In [None]:
import sys

# Bare expression at the end of the cell: the notebook displays the interpreter
# version string, so the executed notebook records which Python ran it.
sys.version

In [None]:
import asyncio
import json
from pathlib import Path
from httpx import ReadTimeout, TimeoutException
import os
import httpx
import logging
import csv
from functools import partial

from data_cleaning import (
    csv_to_dataset,
    RelocationsDataset,
    ResidentsDataset,
    MovementsDataset,
    PresenceDataset,
)
from auth import get_token
from params import (
    SridParams,
    TridParams,
    CategoryParams,
    lookup_factory,
    upload_config,
    get_remote_parameter,
    get_local_parameter,
)
from upload import do_upload, AttemptState, set_scopes, store_mdids
from pandas import DataFrame, read_csv

In [None]:
class UploadError(Exception):
    """Raised by this notebook when an upload attempt or the scope-setting step fails."""


class InvalidIndicatorError(Exception):
    """Raised when an indicator file name matches none of the known indicator types."""


class HeartbeatError(Exception):
    """Raised when the backend heartbeat endpoint does not answer with a success status."""

In [None]:
# Load KEY=VALUE pairs from the local ../.env file into the process environment.
# Values from the file overwrite any variable of the same name that is already set.
for raw_line in Path("../.env").read_text().splitlines():
    entry = raw_line.strip()
    # Blank lines and `#` comment lines carry no assignment. Previously a blank
    # line produced an empty key (which os.environ rejects) and a comment line
    # was exported as a bogus variable.
    if not entry or entry.startswith("#"):
        continue
    key, _, value = entry.partition("=")
    key = key.strip()
    if not key:
        # A line such as "=value" has no usable variable name.
        continue
    # Strip surrounding whitespace so "KEY = value" does not leak spaces into
    # the variable name or into a secret.
    os.environ[key] = value.strip()

log = logging.getLogger("upload_notebook")

In [None]:
# Parameters
shared_data_dir = Path(
    "../tests/"
)  # The path to the shared_data_dir for this notebook (see FlowpyterOperator notes)
dagrun_data_dir = Path("../tests/")  # Per-run directory: the json artefact folder, the token cache and the CDR population file are resolved against it
static_dir = Path(
    "../tests"
)  # The path to the static_dir for this notebook (see FlowpyterOperator notes)
INDICATORS_LIST = list(
    (static_dir / "sample_indicators").glob("*.csv")
)  # A list of paths to indicator csvs to upload; each entry is later joined onto shared_data_dir. The default globs the sample csvs under static_dir.
CONFIG_STATIC_PATH = "config.json"  # The path to config.json within static_dir
CHUNK_SIZE = 20  # Number of parallel uploads to attempt at once
RETRY_COUNT = 3  # Number of times a chunk of parallel uploads will retry before failing
DATA_VERSION = os.getenv("DATA_VERSION")  # The data version of this upload
JSON_DATA_SUBDIR = "outputs"  # If set, the path within dagrun_data_dir to cache json upload artefacts. Can be None.
REDACTED_ADMIN_3_STATIC_PATH = "redacted_sections.csv"  # The path to the .csv containing a column 'pcod' of admin3 regions to redact
BASE_URL = "https://api.dev.haiti.mobility-dashboard.org/v1"  # The base URL for the backend api
MDIDS_DATA_PATH = "mdids"  # The path within static_dir that is handed to store_mdids after a successful upload
CDR_POPULATION_FILE = "cdr_pop.csv"  # Name of file containing CDR-derived subscriber population counts (within dagrun dir). Should have columns 'pcod' (admin3) and 'value' (subscriber population count)

In [None]:
# Airflow variables injected via env vars
# Auth0 client ids
AUTH0_CLIENT_ID_ADMIN = os.environ.get("ADMIN_CLIENT")  # Admin client id from Auth0
AUTH0_CLIENT_ID_UPDATER = os.environ.get("UPDATER_CLIENT")  # Updater client id from Auth0
# Auth0 client secrets
AUTH0_CLIENT_SECRET_ADMIN = os.environ.get("ADMIN_SECRET")  # Admin secret from Auth0
AUTH0_CLIENT_SECRET_UPDATER = os.environ.get("UPDATER_SECRET")  # Updater secret from Auth0
# Auth0 domain to request tokens from
AUTH0_DOMAIN = os.environ.get("AUTH0_DOMAIN", "flowminder-dev.eu.auth0.com")
# Domain to request tokens for
AUDIENCE = os.environ.get("AUDIENCE", "https://flowkit-ui-backend.flowminder.org")

In [None]:
# Postprocessing of parameters + af vars
# Injected parameters may arrive as plain strings, so normalise the directories to Path first.
shared_data_dir = Path(shared_data_dir)
dagrun_data_dir = Path(dagrun_data_dir)
static_dir = Path(static_dir)
# JSON_DATA_SUBDIR is documented as optional ("Can be None"); joining a Path with
# None raises TypeError, so only build the folder path when a subdir is given.
# NOTE(review): this assumes do_upload accepts None for its json folder - confirm.
JSON_FOLDER = dagrun_data_dir / JSON_DATA_SUBDIR if JSON_DATA_SUBDIR else None
REDACTED_ADMIN_3_LIST = static_dir / REDACTED_ADMIN_3_STATIC_PATH
CONFIG_PATH = static_dir / CONFIG_STATIC_PATH
INDICATORS_PATHS = [shared_data_dir / ind for ind in INDICATORS_LIST]
MDIDS_PATH = static_dir / MDIDS_DATA_PATH
CACHE_FOLDER = dagrun_data_dir / "token_cache"
CDR_POPULATION_PATH = dagrun_data_dir / CDR_POPULATION_FILE

In [None]:
# Echo every resolved indicator path, one per line, so the run output shows what will be uploaded.
for indicator_path in INDICATORS_PATHS:
    print(indicator_path)

In [None]:
# Redaction rules
def get_redaction_list(redaction_list_path):
    """Read the 'pcod' column of a redaction csv.

    Args:
        redaction_list_path: path to a csv file with a header row containing
            a 'pcod' column.

    Returns:
        A list with the 'pcod' value of every data row, in file order.

    Raises:
        KeyError: if the csv has data rows but no 'pcod' column.
    """
    # newline="" lets the csv module do its own line-ending handling, as its
    # documentation requires; otherwise quoted fields containing line breaks
    # are altered by universal-newline translation.
    with open(redaction_list_path, "r", newline="") as csvfile:
        return [row["pcod"] for row in csv.DictReader(csvfile)]


def cdr_pop_redaction_residents(df: DataFrame) -> DataFrame:
    """Keep only rows whose 'spatial_unit' index level has a CDR population of at least 200.

    The population table is read from CDR_POPULATION_PATH on every call and
    must provide the columns 'pcod' and 'value'.
    """
    population = read_csv(CDR_POPULATION_PATH)
    is_large_enough = population["value"] >= 200
    allowed_pcods = population.loc[is_large_enough, "pcod"]
    keep_rows = df.index.get_level_values("spatial_unit").isin(allowed_pcods)
    return df.loc[keep_rows]


def cdr_pop_redaction_relocations(df: DataFrame) -> DataFrame:
    """Keep only rows whose 'origin' and 'destination' both have a CDR population of at least 200.

    The population table is read from CDR_POPULATION_PATH on every call and
    must provide the columns 'pcod' and 'value'.
    """
    population = read_csv(CDR_POPULATION_PATH)
    is_large_enough = population["value"] >= 200
    allowed_pcods = population.loc[is_large_enough, "pcod"]
    origin_allowed = df.index.get_level_values("origin").isin(allowed_pcods)
    destination_allowed = df.index.get_level_values("destination").isin(allowed_pcods)
    return df.loc[origin_allowed & destination_allowed]


def admin_3_drop_residents(df: DataFrame) -> DataFrame:
    """Drop rows whose index level 1 label appears in the admin3 redaction list.

    Labels from the list that are absent from the frame are ignored.
    """
    redacted_pcods = get_redaction_list(REDACTED_ADMIN_3_LIST)
    remaining = df.drop(redacted_pcods, level=1, errors="ignore")
    return remaining


def admin_3_drop_relocations(df: DataFrame) -> DataFrame:
    """Drop rows whose index level 1 or level 2 label is in the admin3 redaction list.

    Labels from the list that are absent from the frame are ignored.
    NOTE(review): levels 1 and 2 are presumably origin and destination -
    confirm against the dataset's index layout.
    """
    # Read the redaction csv once and reuse it for both levels instead of
    # re-opening and re-parsing the same file for the second drop.
    redacted_pcods = get_redaction_list(REDACTED_ADMIN_3_LIST)
    for level in (1, 2):
        df = df.drop(redacted_pcods, level=level, errors="ignore")
    return df


def relocations_redaction(df: DataFrame) -> DataFrame:
    """Drop every row whose 'relocations' value is 15 or lower."""
    at_or_below_threshold = df["relocations"] <= 15
    labels_to_drop = df.loc[at_or_below_threshold].index.values
    return df.drop(labels_to_drop)


def residents_redaction(df: DataFrame) -> DataFrame:
    """Drop every row whose 'residents' value is 15 or lower."""
    at_or_below_threshold = df["residents"] <= 15
    labels_to_drop = df.loc[at_or_below_threshold].index.values
    return df.drop(labels_to_drop)


def presence_redaction(df: DataFrame) -> DataFrame:
    """Drop every row whose 'presence' value is 15 or lower."""
    at_or_below_threshold = df["presence"] <= 15
    labels_to_drop = df.loc[at_or_below_threshold].index.values
    return df.drop(labels_to_drop)


def trips_redaction(df: DataFrame) -> DataFrame:
    """Drop every row whose 'trips' value is 15 or lower."""
    at_or_below_threshold = df["trips"] <= 15
    labels_to_drop = df.loc[at_or_below_threshold].index.values
    return df.drop(labels_to_drop)


def arrivals_departed_nan(df: DataFrame) -> DataFrame:
    """Drop rows that have a missing value in 'arrived' or in 'departed'."""
    required_columns = ["arrived", "departed"]
    complete_rows = df.dropna(subset=required_columns)
    return complete_rows


def not_implemented_redaction(df):
    """Placeholder redaction rule that always raises NotImplementedError."""
    message = "Redactions not implemented for this dataset"
    raise NotImplementedError(message)


def round_residents(df) -> DataFrame:
    """Round the residents indicator columns to their publication precision.

    Negative precisions round to tens (-1) or hundreds (-2). Columns not
    listed here are returned unchanged.
    """
    rounded_to_tens = (
        "residents_perKm2",
        "arrived",
        "departed",
        "delta_arrived",
        "residents_diffwithref",
    )
    decimals = {
        "residents": -2,
        **dict.fromkeys(rounded_to_tens, -1),
        "abnormality": 3,
        "residents_pctchangewithref": 2,
    }
    return df.round(decimals=decimals)


def round_relocations(df) -> DataFrame:
    """Round the relocations indicator columns to their publication precision.

    Negative precisions round to tens. Columns not listed here are returned
    unchanged.
    """
    decimals = dict(
        relocations=-1,
        relocations_diffwithref=-1,
        abnormality=3,
        relocations_pctchangewithref=2,
    )
    return df.round(decimals=decimals)


def round_movements(df) -> DataFrame:
    """Round the movements (trips) indicator columns to their publication precision.

    Negative precisions round to tens (-1) or hundreds (-2). Columns not
    listed here are returned unchanged.
    """
    decimals = dict(
        trips=-2,
        abnormality=3,
        trips_diffwithref=-1,
        trips_pctchangewithref=2,
    )
    return df.round(decimals=decimals)


def round_presence(df) -> DataFrame:
    """Round the presence indicator columns to their publication precision.

    Negative precisions round to tens (-1) or hundreds (-2). Columns not
    listed here are returned unchanged, and listed columns that are absent
    from the frame are ignored by DataFrame.round.

    Args:
        df: frame holding presence indicator columns.

    Returns:
        A new frame with the listed columns rounded.
    """
    col_dps = {
        "presence": -2,
        "presence_perKm2": -1,
        "trips_in": -1,
        "trips_out": -1,
        "abnormality": 3,
        "presence_diffwithref": -1,
        # Was misspelled "presence_pxtchangewithref", so the percentage-change
        # column silently escaped rounding. The sibling rounding rules all use
        # the "_pctchangewithref" suffix.
        "presence_pctchangewithref": 2,
        # Old misspelled key kept in case an upstream frame really carries a
        # column under that name; it is ignored when the column is absent.
        "presence_pxtchangewithref": 2,
    }
    return df.round(col_dps)


# Ordered redaction pipelines, one per indicator type. redactor_factory applies
# the rules of a pipeline in the order listed here, so every pipeline removes
# rows first and rounds what is left last.

# Residents: also drops rows with missing 'arrived'/'departed' before rounding.
RESIDENTS_REDACTIONS = (
    admin_3_drop_residents,
    cdr_pop_redaction_residents,
    residents_redaction,
    arrivals_departed_nan,
    round_residents,
)

# Relocations: uses the two-level (origin/destination) variants of the drop rules.
RELOCATIONS_REDACTIONS = (
    admin_3_drop_relocations,
    cdr_pop_redaction_relocations,
    relocations_redaction,
    round_relocations,
)

# Presence: reuses the residents drop rules, which filter on a single spatial unit.
PRESENCE_REDACTIONS = (
    admin_3_drop_residents,
    cdr_pop_redaction_residents,
    presence_redaction,
    round_presence,
)

# Movements: reuses the relocations drop rules and thresholds on 'trips'.
MOVEMENTS_REDACTIONS = (
    admin_3_drop_relocations,
    cdr_pop_redaction_relocations,
    trips_redaction,
    round_movements,
)


def redactor_factory(redaction_rules) -> callable:
    """Compose redaction rules into one callable.

    Args:
        redaction_rules: iterable of callables, each taking a frame and
            returning a frame.

    Returns:
        A function that passes its argument through every rule in the order
        given and returns the final result. With no rules it returns its
        argument unchanged.
    """
    # Snapshot the rules now. A one-shot iterable (e.g. a generator) would
    # otherwise be exhausted by the first call, turning every later call into
    # a silent no-op that skips all redaction. This also protects against the
    # caller mutating the sequence afterwards.
    rules = tuple(redaction_rules)

    def inner(df: DataFrame):
        for rule in rules:
            df = rule(df)
        return df

    return inner

In [None]:
def trid_lookup(trid_label):
    """Return the hard-coded numeric id for a temporal resolution label.

    Raises KeyError for a label other than 'years', 'months', 'weeks' or 'days'.
    """
    return dict(years=17, months=18, weeks=19, days=20)[trid_label]


def srid_lookup(srid_label):
    """Return the hard-coded numeric id for a spatial resolution label.

    Raises KeyError for a label other than 'Department', 'Commune' or
    'Communal section'.
    """
    labels = ("Department", "Commune", "Communal section")
    ids = (13, 14, 15)
    return dict(zip(labels, ids))[srid_label]


def category_lookup(cat_label):
    """Return the category type ('flow' or 'single_location') for an indicator label.

    Raises KeyError for a label other than 'relocations', 'movements',
    'residents' or 'presence'.
    """
    category_types = {
        **dict.fromkeys(("relocations", "movements"), "flow"),
        **dict.fromkeys(("residents", "presence"), "single_location"),
    }
    return category_types[cat_label]


async def _check_heartbeat():
    """Check that the backend heartbeat endpoint answers, retrying once after a timeout.

    Raises:
        HeartbeatError: if the endpoint answers with a status code >= 300.
    """
    heartbeat_url = f"{BASE_URL}/heartbeat"
    try:
        response = httpx.get(heartbeat_url, follow_redirects=True)
    except TimeoutException:
        # Any httpx timeout lands here. The working assumption is that the
        # first request caused the server to spin up, so pause briefly and try
        # exactly once more; a second timeout propagates to the caller.
        await asyncio.sleep(1)
        response = httpx.get(heartbeat_url, follow_redirects=True)
    if response.status_code >= 300:
        raise HeartbeatError(
            f"Heartbeat not found (status {response.status_code} from {heartbeat_url})"
        )


def _dataset_and_redactor(indicator):
    """Pick the dataset wrapper and redactor for an indicator csv from its file name.

    The keywords are tested in this fixed order and the first one contained in
    the file name wins.

    Raises:
        InvalidIndicatorError: if the file name contains none of the keywords.
    """
    handlers = (
        ("residents", ResidentsDataset, RESIDENTS_REDACTIONS),
        ("relocations", RelocationsDataset, RELOCATIONS_REDACTIONS),
        ("movements", MovementsDataset, MOVEMENTS_REDACTIONS),
        ("presence", PresenceDataset, PRESENCE_REDACTIONS),
    )
    for keyword, dataset_cls, redaction_rules in handlers:
        if keyword in indicator.name:
            return dataset_cls(indicator), redactor_factory(redaction_rules)
    raise InvalidIndicatorError(
        f"Invalid indicator {indicator.name}; must include one of 'residents', 'relocations', 'movements' or 'presence'."
    )


async def main():
    """Upload every indicator csv in INDICATORS_PATHS to the backend.

    Steps: check the heartbeat, fetch admin and updater tokens, upload the
    config if any remote parameter list is empty, then for each indicator
    build payloads, upload them, set the 'read:preview_data' scope on the
    resulting ids and store those ids.

    Raises:
        HeartbeatError: if the backend heartbeat check fails.
        InvalidIndicatorError: if an indicator file name is not recognised.
        UploadError: if any upload attempt fails or scope setting fails.
    """
    await _check_heartbeat()

    # parents=True so a missing intermediate directory does not abort the run.
    CACHE_FOLDER.mkdir(parents=True, exist_ok=True)
    admin_token = get_token(
        AUTH0_DOMAIN,
        AUTH0_CLIENT_ID_ADMIN,
        AUTH0_CLIENT_SECRET_ADMIN,
        AUDIENCE,
        CACHE_FOLDER,
    )
    updater_token = get_token(
        AUTH0_DOMAIN,
        AUTH0_CLIENT_ID_UPDATER,
        AUTH0_CLIENT_SECRET_UPDATER,
        AUDIENCE,
        CACHE_FOLDER,
    )

    parameter_responses = [
        get_remote_parameter(p.endpoint, admin_token, BASE_URL)
        for p in [SridParams, TridParams, CategoryParams]
    ]
    print(parameter_responses)
    if any(r == [] for r in parameter_responses):
        log.warning(
            f"Config not found: loading modifiers from {CONFIG_PATH} and uploading config"
        )
        upload_config(CONFIG_PATH, admin_token, BASE_URL)
        param_fetcher = partial(get_local_parameter, config_path=CONFIG_PATH)
    else:
        param_fetcher = partial(
            get_remote_parameter, admin_token=admin_token, base_url=BASE_URL
        )
    # NOTE(review): param_fetcher is never consumed; the lookups passed to
    # csv_to_dataset below are the hard-coded module-level functions. It was
    # presumably meant to feed lookup_factory - confirm and wire up or remove.

    for indicator in INDICATORS_PATHS:
        log.info(f"Uploading {indicator}")
        csv_ds, local_redactor = _dataset_and_redactor(indicator)
        payloads = csv_to_dataset(
            csv_ds,
            srid_lookup=srid_lookup,
            trid_lookup=trid_lookup,
            category_type_lookup=category_lookup,
            redactor=local_redactor,
            revision=DATA_VERSION,
        )

        attempts = await do_upload(
            payloads, BASE_URL, updater_token, JSON_FOLDER, CHUNK_SIZE, RETRY_COUNT
        )

        if any(a.state == AttemptState.FAILED for a in attempts):
            raise UploadError("Some uploads failed, see logs")

        # Materialise the ids: a generator can only be walked once, which
        # would break set_scopes if it iterates its argument more than once.
        mdids = [int(a.response.text) for a in attempts]
        log.info("Setting 'read:preview_data' for uploads")
        scope_responses = await set_scopes(
            mdids, "read:preview_data", BASE_URL, admin_token
        )
        if any(r.status_code >= 300 for r in scope_responses):
            # Print raw bodies: decoding a non-JSON error body would raise and
            # mask the UploadError below.
            print([r.text for r in scope_responses])
            raise UploadError("Scope setting failed")

        store_mdids(attempts, MDIDS_PATH)


# Top-level await: valid only where an event loop is already running (such as a
# notebook kernel). A plain Python script would need asyncio.run(main()) instead.
await main()