# Initialise the production of indicators

The first month of indicators is different from the others because it uses non-CDR-derived estimates of population.

## Imports

In [None]:
from pathlib import Path
import pandas as pd
import geopandas as gpd

In [None]:
# Notebook parameters (defaults; date parameters arrive as strings when injected by papermill)

# Directories
shared_data_dir = "data"  # directory holding the data that can change
static_dir = "static"  # directory holding the static data
dagrun_data_dir = "data"  # unique writable directory for this DAG run

# Filenames of auxiliary files in static_dir
geometry_filename = "admin3.geojson"  # geometry file used to compute residents per km2
geometry_admin3_col = "ADM3_PCOD"  # column of the geometry file holding the admin3 code
base_pop_and_growth_rates_filename = "haiti_growth_rates_fixed.csv"  # file describing base population and growth rates
base_pop_column = "est_pop_2020_01"  # column of the above file containing the population estimates

# Subdirectories of shared_data_dir
aggregates_subdir = "aggregates"  # location of the aggregate data
indicators_subdir = "indicators"  # location of the indicators

metric_crs_epsg = 32618  # country specific: projection to use for metre coordinates
residents_reference_date = "2020-01-01"  # platform specific: first date shown on the platform, for which many derived indicators cannot be shown

In [None]:
# Convert date parameters to datetime objects here because papermill date parameters will be strings
residents_reference_date = pd.to_datetime(residents_reference_date)

# Get full path to data subdirs (both live inside shared_data_dir)
aggregates_dir = Path(shared_data_dir) / aggregates_subdir
indicators_dir = Path(shared_data_dir) / indicators_subdir

# Get full path to static files.
# The geometry file is one of the auxiliary files kept in static_dir, so it is
# resolved against static_dir rather than the per-run writable directory.
geometry_filepath = Path(static_dir) / geometry_filename
base_pop_and_growth_rates_filepath = (
    Path(static_dir) / base_pop_and_growth_rates_filename
)

#### What static data is being used?

In [None]:
# Report every static input file this notebook reads: the base population /
# growth rates table and the admin3 geometry file.
print(base_pop_and_growth_rates_filename)
print(geometry_filename)

#### What are we doing with this notebook?

In [None]:
# Month being initialised, formatted as YYYY-MM
reference_month_label = residents_reference_date.date().strftime("%Y-%m")
# Bare expression so that the notebook displays the message as the cell output
f"Computing first set of residents indicators for the month of {reference_month_label}."

## Residents indicators for month 1

##### Residents

Grabs the static population estimates, computes residents per km2, and leaves the remaining derived indicators as NaN or 0, depending on the indicator.

In [None]:
# This is the first month, so the indicators subdir may not exist yet:
# create it (and any missing parents), doing nothing if it is already there.
indicators_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Load the static base population estimates, keeping only the admin3 code and
# the configured population column (relabelled "residents").
base_pop = pd.read_csv(base_pop_and_growth_rates_filepath)
base_pop = base_pop[["pcod", base_pop_column]]
base_pop.columns = ["pcod", "residents"]

# Area of each admin3 in km2: index the geometries by admin3 code, reproject to
# the metric CRS, then scale the resulting areas by 1e-6.
admin3_shapes = gpd.read_file(geometry_filepath).set_index(geometry_admin3_col)
admin3km2 = (admin3_shapes.to_crs(epsg=metric_crs_epsg).area * 1e-6).rename(
    "admin3_area_km2"
)

# Attach the area to each population row (default inner merge on the admin3 code)
base_pop = base_pop.merge(admin3km2, left_on="pcod", right_index=True)

# Full set of output columns: date and pcod, followed by every indicator as a
# regular estimate, then its upper bound (_UB), then its lower bound (_LB).
indicator_names = [
    "residents",
    "residents_perKm2",
    "arrived",
    "departed",
    "delta_arrived",
    "residents_diffwithref",
    "abnormality",
    "residents_pctchangewithref",
]
indicator_columns = ["date", "pcod"] + [
    f"{indicator}{bound}"
    for bound in ("", "_UB", "_LB")
    for indicator in indicator_names
]

# In month 1, we can only compute the population per km2 from the base population estimates.
month1_residents_indicators = pd.DataFrame(columns=indicator_columns)
month1_residents_indicators["pcod"] = base_pop["pcod"]
month1_residents_indicators["date"] = residents_reference_date

# The regular estimate and both bounds take the same values in month 1:
# the base population, its density, and zero difference with the reference.
# Columns not assigned here are left empty.
residents_per_km2 = base_pop["residents"] / base_pop["admin3_area_km2"]
for bound in ("", "_LB", "_UB"):
    month1_residents_indicators[f"residents{bound}"] = base_pop["residents"]
    month1_residents_indicators[f"residents_perKm2{bound}"] = residents_per_km2
    month1_residents_indicators[f"residents_diffwithref{bound}"] = 0
    month1_residents_indicators[f"residents_pctchangewithref{bound}"] = 0

# Write the month's indicators to a file named after the month (YYYY-MM)
output_month = residents_reference_date.date().strftime("%Y-%m")
month1_residents_indicators.to_csv(
    Path(indicators_dir) / f"residents_indicators_{output_month}.csv",
    index=False,
)

## Additional metrics for redaction

Calculate additional per-admin3 metrics to be used later when redacting the indicators

### CDR subscriber resident counts

In [None]:
# Location of the resident counts aggregate for the reference date
reference_day = residents_reference_date.date()
resident_counts_filepath = (
    Path(aggregates_dir)
    / "residence_relocations"
    / f"residence_relocations_aggregates_{reference_day}"
    / f"resident-counts_{reference_day}.csv"
)

# Drop rows containing any missing value, then save a copy into this DAG run's
# data directory for use in later redaction steps.
home_locs_month_X = pd.read_csv(resident_counts_filepath).dropna()
home_locs_month_X.to_csv(Path(dagrun_data_dir) / "cdr_subscriber_population.csv")

### Admin3s with CDR events

In this first month, although the residents indicators are not produced using CDR data, we want to only upload indicators for locations which had at least one CDR event during the month (it would be confusing to upload indicators for locations where we have no data coverage). Although we're not producing an aggregate specifically for this, we can calculate it from the daily active cell counts aggregates (using these ones, rather than subscriber/event counts, because there's no issue with redaction).

In [None]:
# Every day of the reference month: from the reference date up to (and
# including) the day before the same date one month later.
month_X_dates = pd.date_range(
    residents_reference_date,
    residents_reference_date + pd.DateOffset(months=1) - pd.DateOffset(days=1),
)

# Directory holding the daily active cell counts for this month's aggregates
active_cell_counts_dir = (
    Path(aggregates_dir)
    / "presence_trips"
    / f"presence_trips_aggregates_{residents_reference_date.date()}"
)

active_cell_counts_days = []

for date in month_X_dates:
    filepath = active_cell_counts_dir / f"active-cell-counts_{date.date()}.csv"
    # Days without an aggregate file are skipped
    if filepath.is_file():
        active_cell_counts_days.append(pd.read_csv(filepath))

# pd.concat fails with an opaque "No objects to concatenate" on an empty list;
# raise the same exception type with a message that says what is missing.
if not active_cell_counts_days:
    raise ValueError(
        f"No active cell counts files found in {active_cell_counts_dir} "
        f"for any day between {month_X_dates[0].date()} and {month_X_dates[-1].date()}"
    )

active_cell_counts_month_X = pd.concat(active_cell_counts_days)

In [None]:
# Number of rows per admin3 (pcod) across the month's daily active cell counts.
# NOTE(review): presumably each daily file has one row per pcod, so this count
# is the number of days with activity; confirm against the aggregate schema.
admin3_row_counts = active_cell_counts_month_X.groupby("pcod").size()
admin3_active_days = admin3_row_counts.reset_index(name="days_with_activity")

# Save into this DAG run's data directory for use in later redaction steps
admin3_active_days.to_csv(Path(dagrun_data_dir) / "admin3s_with_cdr_activity.csv")