In [None]:
from pathlib import Path
import json
import csv
from pandas import DataFrame, read_csv
from dataclasses import asdict
from data_cleaning import (
    csv_to_dataset,
    RelocationsDataset,
    ResidentsDataset,
    MovementsDataset,
    PresenceDataset,
    validate_indicator,
)

In [None]:
class InvalidIndicatorError(Exception):
    """Raised when an indicator file name matches none of the supported dataset kinds."""

In [None]:
# Parameters
# The defaults below all point at the ../../tests/ directory. The cells that follow
# accept the directories as plain strings and INDICATORS_FILES as a JSON string,
# presumably because flowpyter-task injects them that way (TODO confirm).
shared_data_dir = Path(
    "../../tests/"
)  # The path to the shared_data_dir for this notebook (see FlowpyterOperator notes)
# Per-run directory: holds the CDR population CSV and the admin3 activity CSV, and
# receives the JSON output folder.
dagrun_data_dir = Path("../../tests/")
# Static directory: holds the validation config and the admin3 redaction list.
static_dir = Path("../../tests/")
INDICATORS_FILES = {
    "sample_indicators/residents_indicators_2020-01_release.csv": [
        *ResidentsDataset("foobar").indicators
    ]
}  # A dict of category_path:[indicators_to_upload]
REDACTED_ADMIN_3_STATIC_PATH = "redacted_sections.csv"  # The path to the .csv containing a column 'pcod' of admin3 regions to redact
# CSV under dagrun_data_dir; its 'pcod' and 'value' columns drive the CDR population redaction
CDR_POPULATION_FILE = "cdr_pop_synth.csv"
# Name of the folder under dagrun_data_dir that the JSON datasets are written to
JSON_OUTPUTS = "outputs"
# File under static_dir that is passed to validate_indicator
CONFIG_STATIC_PATH = "config.json"
# Passed to csv_to_dataset as the `revision` of the generated datasets
DATA_VERSION = "JOHN_TEST_DATA"
IS_FIRST_MONTH = False  # Residents redaction rules are different for the first month
# CSV under dagrun_data_dir; its 'pcod' column lists the admin3s kept by the first-month residents rules
ADMIN3_WITH_ACTIVITY_FILE = "admin3s_with_cdr_activity.csv"

In [None]:
# Temporary hack required until we update flowpyter-task to allow list params:
# the mapping may arrive JSON-encoded as a string, in which case it is decoded here.
# A value that is not a string is left untouched.
indicators_param_is_json = isinstance(INDICATORS_FILES, str)
if indicators_param_is_json:
    INDICATORS_FILES = json.loads(INDICATORS_FILES)

In [None]:
# Postprocessing of parameters + af vars
# Coerce the three base directories to Path so that plain-string values also work.
shared_data_dir, dagrun_data_dir, static_dir = map(
    Path, (shared_data_dir, dagrun_data_dir, static_dir)
)

# Indicator CSVs resolved against the shared data directory: {csv path: [indicators]}
INDICATORS = {
    shared_data_dir / relative_csv: wanted_indicators
    for relative_csv, wanted_indicators in INDICATORS_FILES.items()
}

# Locations inside the per-run directory
CDR_POPULATION_PATH = dagrun_data_dir / CDR_POPULATION_FILE
ADMIN3_WITH_ACTIVITY_PATH = dagrun_data_dir / ADMIN3_WITH_ACTIVITY_FILE
JSON_FOLDER = dagrun_data_dir / JSON_OUTPUTS

# Locations inside the static directory
CONFIG_PATH = static_dir / CONFIG_STATIC_PATH
REDACTED_ADMIN_3_LIST = static_dir / REDACTED_ADMIN_3_STATIC_PATH

In [None]:
# Redaction rules
def get_redaction_list(redaction_list_path):
    """Return the values of the ``pcod`` column of a redaction-list CSV.

    Args:
        redaction_list_path: Path of a CSV file whose header row includes a
            ``pcod`` column.

    Returns:
        list: One ``pcod`` value per data row, in file order (empty when the file
        has no data rows).

    Raises:
        KeyError: If a data row is read but the header has no ``pcod`` column.
    """
    # The csv module expects files opened with newline="" so that it, rather than
    # the text layer, interprets line endings (including ones inside quoted fields).
    with open(redaction_list_path, "r", newline="") as csvfile:
        return [row["pcod"] for row in csv.DictReader(csvfile)]


def cdr_pop_redaction_residents(df: DataFrame) -> DataFrame:
    """Keep only rows whose ``spatial_unit`` has a CDR population value of at least 200.

    The CDR population CSV at ``CDR_POPULATION_PATH`` is read on every call; its
    ``pcod`` column identifies the unit and its ``value`` column is compared with
    the threshold.

    Args:
        df: Frame whose index has a level named ``spatial_unit``.

    Returns:
        The rows of ``df`` whose ``spatial_unit`` is one of the qualifying pcods.
    """
    population = read_csv(CDR_POPULATION_PATH)
    meets_threshold = population["value"] >= 200
    allowed_pcods = population.loc[meets_threshold, "pcod"]
    row_units = df.index.get_level_values("spatial_unit")
    return df.loc[row_units.isin(allowed_pcods)]


def cdr_pop_redaction_relocations(df: DataFrame) -> DataFrame:
    """Keep only rows whose ``origin`` and ``destination`` both have a CDR population value of at least 200.

    The CDR population CSV at ``CDR_POPULATION_PATH`` is read on every call; its
    ``pcod`` column identifies the unit and its ``value`` column is compared with
    the threshold.

    Args:
        df: Frame whose index has levels named ``origin`` and ``destination``.

    Returns:
        The rows of ``df`` for which both ends of the flow are qualifying pcods.
    """
    population = read_csv(CDR_POPULATION_PATH)
    meets_threshold = population["value"] >= 200
    allowed_pcods = population.loc[meets_threshold, "pcod"]
    origin_ok = df.index.get_level_values("origin").isin(allowed_pcods)
    destination_ok = df.index.get_level_values("destination").isin(allowed_pcods)
    return df.loc[origin_ok & destination_ok]


def cdr_activity_redaction_residents(df: DataFrame) -> DataFrame:
    """Keep only rows whose ``spatial_unit`` is listed in the admin3 activity CSV.

    The CSV at ``ADMIN3_WITH_ACTIVITY_PATH`` is read on every call and its ``pcod``
    column is used as the list of units to keep.

    Args:
        df: Frame whose index has a level named ``spatial_unit``.

    Returns:
        The rows of ``df`` whose ``spatial_unit`` appears in the ``pcod`` column.
    """
    active_pcods = read_csv(ADMIN3_WITH_ACTIVITY_PATH)["pcod"]
    row_units = df.index.get_level_values("spatial_unit")
    return df.loc[row_units.isin(active_pcods)]


def admin_3_drop_residents(df: DataFrame) -> DataFrame:
    """Drop rows whose index level 1 value is on the admin3 redaction list.

    The list is read from ``REDACTED_ADMIN_3_LIST`` on every call. Listed pcods
    that do not occur in the index are ignored.
    """
    redacted_pcods = get_redaction_list(REDACTED_ADMIN_3_LIST)
    return df.drop(labels=redacted_pcods, level=1, errors="ignore")


def admin_3_drop_relocations(df: DataFrame) -> DataFrame:
    """Drop rows whose index level 1 or level 2 value is on the admin3 redaction list.

    Levels 1 and 2 are presumably the origin and destination of the flow
    (TODO confirm against the dataset index). Listed pcods that do not occur in
    a level are ignored.

    Args:
        df: Frame with at least three index levels.

    Returns:
        A frame without the rows that involve a redacted pcod at either level.
    """
    # Read and parse the redaction list once and reuse it for both index levels
    redacted_pcods = get_redaction_list(REDACTED_ADMIN_3_LIST)
    for level in (1, 2):
        df = df.drop(redacted_pcods, level=level, errors="ignore")
    return df


def relocations_redaction(df: DataFrame, threshold: float = 15) -> DataFrame:
    """Remove rows whose ``relocations`` value is at or below ``threshold``.

    Rows are filtered with a boolean mask instead of being dropped by index label,
    so a row above the threshold is never removed merely because it shares an index
    label with a row at or below it. Rows where ``relocations`` is missing are kept.

    Args:
        df: Frame with a ``relocations`` column.
        threshold: Largest value that is still redacted (default 15).

    Returns:
        A new frame holding only the rows that are not at or below the threshold.

    Raises:
        KeyError: If ``df`` has no ``relocations`` column.
    """
    # Missing values compare as "not at or below", so those rows are retained
    at_or_below = (df["relocations"] <= threshold).fillna(False)
    return df.loc[~at_or_below]


def residents_redaction(df: DataFrame, threshold: float = 15) -> DataFrame:
    """Remove rows whose ``residents`` value is at or below ``threshold``.

    Rows are filtered with a boolean mask instead of being dropped by index label,
    so a row above the threshold is never removed merely because it shares an index
    label with a row at or below it. Rows where ``residents`` is missing are kept.

    Args:
        df: Frame with a ``residents`` column.
        threshold: Largest value that is still redacted (default 15).

    Returns:
        A new frame holding only the rows that are not at or below the threshold.

    Raises:
        KeyError: If ``df`` has no ``residents`` column.
    """
    # Missing values compare as "not at or below", so those rows are retained
    at_or_below = (df["residents"] <= threshold).fillna(False)
    return df.loc[~at_or_below]


def presence_redaction(df: DataFrame, threshold: float = 15) -> DataFrame:
    """Remove rows whose ``presence`` value is at or below ``threshold``.

    Rows are filtered with a boolean mask instead of being dropped by index label,
    so a row above the threshold is never removed merely because it shares an index
    label with a row at or below it. Rows where ``presence`` is missing are kept.

    Args:
        df: Frame with a ``presence`` column.
        threshold: Largest value that is still redacted (default 15).

    Returns:
        A new frame holding only the rows that are not at or below the threshold.

    Raises:
        KeyError: If ``df`` has no ``presence`` column.
    """
    # Missing values compare as "not at or below", so those rows are retained
    at_or_below = (df["presence"] <= threshold).fillna(False)
    return df.loc[~at_or_below]


def travellers_redaction(df: DataFrame, threshold: float = 15) -> DataFrame:
    """Remove rows whose ``travellers`` value is at or below ``threshold``.

    Rows are filtered with a boolean mask instead of being dropped by index label,
    so a row above the threshold is never removed merely because it shares an index
    label with a row at or below it. Rows where ``travellers`` is missing are kept.

    Args:
        df: Frame with a ``travellers`` column.
        threshold: Largest value that is still redacted (default 15).

    Returns:
        A new frame holding only the rows that are not at or below the threshold.

    Raises:
        KeyError: If ``df`` has no ``travellers`` column.
    """
    # Missing values compare as "not at or below", so those rows are retained
    at_or_below = (df["travellers"] <= threshold).fillna(False)
    return df.loc[~at_or_below]


def arrivals_departed_nan(df: DataFrame) -> DataFrame:
    """Discard every row with a missing value in ``arrived`` or ``departed``."""
    required_columns = ["arrived", "departed"]
    return df.dropna(axis=0, how="any", subset=required_columns)


def not_implemented_redaction(df):
    """Placeholder rule that always raises ``NotImplementedError``; ``df`` is not used."""
    message = "Redactions not implemented for this dataset"
    raise NotImplementedError(message)


def round_residents(df) -> DataFrame:
    """Round the residents indicator columns to their per-column precision.

    Negative precisions round to the left of the decimal point (-2 is the nearest
    hundred, -1 the nearest ten). Columns that are not listed are left as they are.
    """
    decimals_by_column = {
        **dict.fromkeys(["residents"], -2),
        **dict.fromkeys(
            [
                "residents_perKm2",
                "arrived",
                "departed",
                "delta_arrived",
                "residents_diffwithref",
            ],
            -1,
        ),
        "abnormality": 3,
        "residents_pctchangewithref": 2,
    }
    return df.round(decimals=decimals_by_column)


def round_relocations(df) -> DataFrame:
    """Round the relocations indicator columns to their per-column precision.

    Negative precisions round to the left of the decimal point (-1 is the nearest
    ten). Columns that are not listed are left as they are.
    """
    decimals_by_column = {
        **dict.fromkeys(["relocations", "relocations_diffwithref"], -1),
        "abnormality": 3,
        "relocations_pctchangewithref": 2,
    }
    return df.round(decimals=decimals_by_column)


def round_movements(df) -> DataFrame:
    """Round the movements indicator columns to their per-column precision.

    Negative precisions round to the left of the decimal point (-2 is the nearest
    hundred, -1 the nearest ten). Columns that are not listed are left as they are.
    """
    decimals_by_column = dict(
        travellers=-2,
        travellers_diffwithref=-1,
        abnormality=3,
        travellers_pctchangewithref=2,
    )
    return df.round(decimals=decimals_by_column)


def round_presence(df) -> DataFrame:
    """Round the presence indicator columns to their per-column precision.

    Negative precisions round to the left of the decimal point (-2 is the nearest
    hundred, -1 the nearest ten). Columns that are not listed are left as they are,
    and listed names that are not columns of ``df`` are ignored by ``DataFrame.round``.

    Args:
        df: Frame holding some or all of the presence indicator columns.

    Returns:
        A new frame with the listed columns rounded.
    """
    col_dps = {
        "presence": -2,
        "presence_perKm2": -1,
        "travellers_in": -1,
        "travellers_out": -1,
        "abnormality": 3,
        "presence_diffwithref": -1,
        # Spelled "pct" like the other *_pctchangewithref columns; a misspelt key
        # would be silently ignored and leave the column unrounded.
        "presence_pctchangewithref": 2,
    }
    return df.round(col_dps)


# Ordered redaction pipelines, one per dataset kind. Each tuple lists the rules in
# the order in which the function built by redactor_factory applies them.
if IS_FIRST_MONTH:
    # First month: rows are filtered on the admin3 activity list instead of the CDR
    # population threshold, and rows with missing arrived/departed are not dropped.
    RESIDENTS_REDACTIONS = (
        admin_3_drop_residents,
        cdr_activity_redaction_residents,
        residents_redaction,
        round_residents,
    )
else:
    # Later months: CDR population threshold, small-count removal, removal of rows
    # with missing arrived/departed, then rounding.
    RESIDENTS_REDACTIONS = (
        admin_3_drop_residents,
        cdr_pop_redaction_residents,
        residents_redaction,
        arrivals_departed_nan,
        round_residents,
    )

# Flow dataset: admin3 and CDR population rules check both ends of each flow.
RELOCATIONS_REDACTIONS = (
    admin_3_drop_relocations,
    cdr_pop_redaction_relocations,
    relocations_redaction,
    round_relocations,
)

# Presence reuses the residents admin3 and CDR population rules, with its own
# small-count and rounding rules.
PRESENCE_REDACTIONS = (
    admin_3_drop_residents,
    cdr_pop_redaction_residents,
    presence_redaction,
    round_presence,
)

# Movements reuses the relocations admin3 and CDR population rules, with its own
# small-count and rounding rules.
MOVEMENTS_REDACTIONS = (
    admin_3_drop_relocations,
    cdr_pop_redaction_relocations,
    travellers_redaction,
    round_movements,
)


def redactor_factory(redaction_rules) -> callable:
    """Compose redaction rules into a single function.

    Args:
        redaction_rules: Iterable of callables, each taking the current frame and
            returning the frame to hand to the next rule.

    Returns:
        A function that passes its argument through every rule, in iteration
        order, and returns the result of the last one (or the argument itself
        when there are no rules).
    """
    # Snapshot the rules: a one-shot iterable (e.g. a generator) would otherwise be
    # exhausted by the first call, and every later call would apply no redaction.
    rules = tuple(redaction_rules)

    def inner(df: DataFrame):
        for rule in rules:
            df = rule(df)
        return df

    return inner

In [None]:
# NOTE TO FUTURE NOTEBOOK SPELUNKERS: trid and srid are the victims of autogenerated code. They are not set in the db
# until after data is uploaded, but you need them to upload data. This isn't an issue unless you've recently nuked the db.
# If you _have_ nuked the db, these are the default starting values that seemed to work in the past - but if this nb is part
# of a dag, you might need to go digging in `flowkit-ui-backend-db` (hosted on gcloud/sql at time of writing) to find out
# what these actually map to if you're having issues. Love and kisses, John c. 2023


def trid_lookup(trid_label):
    """Return the trid for a label: one of 'years', 'months', 'weeks' or 'days'.

    Raises:
        KeyError: If ``trid_label`` is not one of the known labels.
    """
    return {"years": 1, "months": 2, "weeks": 3, "days": 4}[trid_label]


def srid_lookup(srid_label):
    """Return the srid for a label: 'Department', 'Commune' or 'Communal section'.

    Raises:
        KeyError: If ``srid_label`` is not one of the known labels.
    """
    return {"Department": 1, "Commune": 2, "Communal section": 3}[srid_label]


def category_lookup(cat_label):
    """Return the category type ('flow' or 'single_location') for a dataset label.

    Raises:
        KeyError: If ``cat_label`` is not one of the known labels.
    """
    type_by_label = {
        **dict.fromkeys(("relocations", "movements"), "flow"),
        **dict.fromkeys(("residents", "presence"), "single_location"),
    }
    return type_by_label[cat_label]

In [None]:
# Create the output folder once, before any dataset is converted. The keyword is
# `exist_ok` (Path.mkdir rejects `exists_ok` with a TypeError); `parents=True`
# additionally allows JSON_OUTPUTS to name a nested folder.
JSON_FOLDER.mkdir(parents=True, exist_ok=True)

for indicator_path, indicator_columns in INDICATORS.items():
    indicator_path = Path(indicator_path)
    print(indicator_path, indicator_columns)

    # The dataset kind is taken from the file name; the first matching keyword wins.
    if "residents" in indicator_path.name:
        csv_ds = ResidentsDataset(indicator_path, indicators=indicator_columns)
        local_redactor = redactor_factory(RESIDENTS_REDACTIONS)
    elif "relocations" in indicator_path.name:
        csv_ds = RelocationsDataset(indicator_path, indicators=indicator_columns)
        local_redactor = redactor_factory(RELOCATIONS_REDACTIONS)
    elif "movements" in indicator_path.name:
        csv_ds = MovementsDataset(indicator_path, indicators=indicator_columns)
        local_redactor = redactor_factory(MOVEMENTS_REDACTIONS)
    elif "presence" in indicator_path.name:
        csv_ds = PresenceDataset(indicator_path, indicators=indicator_columns)
        local_redactor = redactor_factory(PRESENCE_REDACTIONS)
    else:
        raise InvalidIndicatorError(
            f"Invalid indicator {indicator_path.name}; must include one of 'residents', 'relocations', 'movements' or 'presence'."
        )

    # Validate against the static config before converting anything
    validate_indicator(csv_ds, CONFIG_PATH)
    print(csv_ds)

    json_datasets = csv_to_dataset(
        csv_ds,
        srid_lookup=srid_lookup,
        trid_lookup=trid_lookup,
        category_type_lookup=category_lookup,
        redactor=local_redactor,
        revision=DATA_VERSION,
        indicators=indicator_columns,
    )

    # One JSON file per generated dataset, named by the dataset's own `filename`
    for ds in json_datasets:
        (JSON_FOLDER / ds.filename).write_text(json.dumps(asdict(ds)))