## Imports

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import geopandas as gpd

# Parameters

In [None]:
data_date = "2021-11-01"  # which date this notebook should run for

shared_data_dir = "data"  # where the data that can change lives
static_dir = "static"  # where the static data lives
dagrun_data_dir = "data"  # unique writable directory for this DAG run

# Filenames of auxiliary files
# NOTE(review): the geometry file is resolved against dagrun_data_dir further down,
# while the other two files are resolved against static_dir - confirm this is intended.
geometry_filename = "admin3.geojson"  # admin3 geometries, used for residents per km2
base_pop_and_growth_rates_filename = (
    "haiti_growth_rates_fixed.csv"  # file describing base pop and growth rates
)
scaling_factors_bilateral_pairs_filename = (
    "Haiti all possible bilateral flows with combined factors v2.0.csv"
)

base_pop_column = "est_pop_2020_01"

aggregates_subdir = (
    "aggregates"  # where the aggregate data is (within shared_data_dir)
)
indicators_subdir = "indicators"  # where the indicators are (within shared_data_dir)

metric_crs_epsg = (
    32618  # country specific, what projection to use for metre coordinates
)
# Platform specific: the first date shown on the platform; many derived indicators
# cannot be shown for it.
residents_reference_date = "2020-01-01"
relocations_reference_date = (
    "2020-02-01"  # there is a month offset before we can show flows between months
)

# what spatial unit are we at
geo_spatial_unit = "ADM3_PCODE"
agg_spatial_unit = "pcod"
pop_and_gr_spatial_unit = "pcod"

# Must be a plain column label. A one-element tuple such as ("ADM3_PCODE",) is
# treated by set_index as a single (missing) column key and raises KeyError.
geometry_admin3_col = "ADM3_PCODE"

agg_spatial_unit_to, agg_spatial_unit_from = (
    "pcod_to",
    "pcod_from",
)  # what to and from are called in the aggregates
sf_spatial_unit_to, sf_spatial_unit_from = (
    "to_pcod",
    "from_pcod",
)  # what to and from is called in the scaling factor file

In [None]:
# Convert date parameters to datetime objects here because papermill date parameters will be strings
data_date = pd.to_datetime(data_date)
residents_reference_date = pd.to_datetime(residents_reference_date)
relocations_reference_date = pd.to_datetime(relocations_reference_date)

# Get full path to data subdirs
aggregates_dir = Path(shared_data_dir) / aggregates_subdir
indicators_dir = Path(shared_data_dir) / indicators_subdir
# NOTE(review): the geometry file is resolved against dagrun_data_dir, not static_dir,
# although its filename is listed with the static_dir files above - confirm this is intended.
geometry_filepath = Path(dagrun_data_dir) / geometry_filename

# Get full path to static files
base_pop_and_growth_rates_filepath = (
    Path(static_dir) / base_pop_and_growth_rates_filename
)
scaling_factors_bilateral_pairs_filepath = (
    Path(static_dir) / scaling_factors_bilateral_pairs_filename
)

#### What static data is being used?

In [None]:
print(base_pop_and_growth_rates_filename)

#### What scaling factors are being used?

In [None]:
print(scaling_factors_bilateral_pairs_filename)

#### What are we doing with this notebook?

In [None]:
f"Computing indicators for the month of:"

In [None]:
data_date.date().strftime("%Y-%m")

# Data needed to compute indicators

To compute the scaled residents estimates we need:

Scaled resident estimates
1. Relocations from the previous month to this month
2. Scaling factors to adjust the flows between each corridor
3. The resulting adjusted relocation aggregates computed by scaling the flows from (1) by the factors in (2)
4. The number of residents in the previous month (to which we add arrivals and from which we subtract departures, both calculated from relocations)
5. The monthly growth rates for each admin3 

Derived indicators
1. Residents at month 1, for the pctchange_withref and diff_withref indicators
2. Residents at months X-1 through X-12 (up to a year of history, where available) for the abnormality indicators
3. Area of each admin3, so we can divide the number of residents to get residents_per_KM2

#### Relocation aggregates from month _X-1_ to month _X_

In [None]:
# Flows between month X-1 and month X, stored inside this month's aggregates folder.
# TODO: Add use_unredacted as a choice for this notebook
previous_month_date = (data_date - pd.DateOffset(months=1)).date()
relocations_csv = (
    Path(aggregates_dir)
    / "residence_relocations"
    / f"residence_relocations_aggregates_{data_date.date()}"
    / f"home-relocations_full-outer_from{previous_month_date}_to{data_date.date()}.csv"
)
home_reloc_month_X = pd.read_csv(relocations_csv)

# Keep only rows whose origin differs from the destination, then drop rows with missing values.
is_move = (
    home_reloc_month_X[agg_spatial_unit_from] != home_reloc_month_X[agg_spatial_unit_to]
)
home_reloc_month_X = home_reloc_month_X[is_move].dropna()
home_reloc_month_X

#### Scaling factors

In [None]:
# Per-corridor scaling factors, read from the static file configured in the parameters.
scaling_factors = pd.read_csv(scaling_factors_bilateral_pairs_filepath)
scaling_factors

#### Adjusted relocation aggregates (flows from month _X-1_ to month _X_)

In [None]:
def adjust_flows(df):
    """Scale raw relocation counts by the per-corridor scaling factors.

    `df` is merged with the module-level `scaling_factors` table on the
    (from, to) spatial-unit pair. The merge uses pandas' default inner join,
    so flows without a matching scaling-factor row are dropped. The `value`
    column is multiplied by each of the three factor columns and rounded up.

    Parameters
    ----------
    df : pandas.DataFrame
        Relocation aggregates with the from/to columns named by
        `agg_spatial_unit_from` / `agg_spatial_unit_to` and a `value` column.

    Returns
    -------
    pandas.DataFrame
        Columns: from, to, `relocations`, `relocations_LB`, `relocations_UB`
        and `date` (set to the module-level `data_date`).
    """
    flow_key = [agg_spatial_unit_from, agg_spatial_unit_to]

    scaled = df.merge(
        scaling_factors,
        left_on=flow_key,
        right_on=[sf_spatial_unit_from, sf_spatial_unit_to],
    ).assign(
        relocations=lambda z: np.ceil(z.value * z.combi_movbi_tot),
        relocations_LB=lambda z: np.ceil(z.value * z.combi_movbi_tot_LB),
        relocations_UB=lambda z: np.ceil(z.value * z.combi_movbi_tot_UB),
    )

    # Add the date with assign() on the selected columns instead of item
    # assignment on a slice, which raises SettingWithCopyWarning.
    return scaled[
        flow_key + ["relocations", "relocations_LB", "relocations_UB"]
    ].assign(date=data_date)


adjusted_home_reloc_month_X = adjust_flows(home_reloc_month_X)
adjusted_home_reloc_month_X

#### Base population _(Jan 2020)_ and Growth rates _(monthly per year)_

In [None]:
# Static table indexed by spatial unit; the base population column and the
# growth-rate columns are both taken from it.
base_pop_and_growth_rates = pd.read_csv(base_pop_and_growth_rates_filepath).set_index(
    pop_and_gr_spatial_unit
)

base_pop = base_pop_and_growth_rates[base_pop_column]
base_pop

In [None]:
# Keep only the columns whose name ends with the year of data_date.
growth_rates = base_pop_and_growth_rates.filter(regex=f'.*{data_date.strftime("%Y")}$')
growth_rates

#### Residents at month _X-1_

In [None]:
def month_X_1(residents_col="residents", suffix=""):
    """Load one residents column from the previous month's indicators file.

    Reads `residents_indicators_<YYYY-MM>.csv` for the month before `data_date`
    from `indicators_dir`, indexes it by `agg_spatial_unit` and returns the
    requested column as a one-column DataFrame.

    Parameters
    ----------
    residents_col : str
        Name of the column to return.
    suffix : str
        Accepted for signature compatibility; not used.
    """
    previous_month = (data_date - pd.DateOffset(months=1)).date().strftime("%Y-%m")
    indicators_csv = Path(indicators_dir) / f"residents_indicators_{previous_month}.csv"
    previous_indicators = pd.read_csv(indicators_csv).set_index(agg_spatial_unit)
    return previous_indicators[residents_col].to_frame()


residents_month_X_minus_1 = month_X_1("residents")
residents_month_X_minus_1

In [None]:
residents_month_X_minus_1_UB = month_X_1("residents_UB")
residents_month_X_minus_1_UB

In [None]:
residents_month_X_minus_1_LB = month_X_1("residents_LB")
residents_month_X_minus_1_LB

#### Residents indicators at month _1_

In [None]:
def month_1(residents_col="residents", suffix=""):
    """Load one residents column from the reference month's indicators file.

    Reads `residents_indicators_<YYYY-MM>.csv` for `residents_reference_date`
    from `indicators_dir` and returns the requested column, indexed by `pcod`,
    as a Series named `residents_month_1<suffix>`.

    Parameters
    ----------
    residents_col : str
        Name of the column to return.
    suffix : str
        Appended to the name of the returned Series.
    """
    reference_month = residents_reference_date.strftime("%Y-%m")
    reference_indicators = pd.read_csv(
        Path(indicators_dir) / f"residents_indicators_{reference_month}.csv"
    )
    reference_residents = reference_indicators.set_index("pcod")[residents_col]
    return reference_residents.rename("residents_month_1" + suffix)


residents_month_1 = month_1("residents")
residents_month_1

In [None]:
residents_month_1_UB = month_1("residents_UB", suffix="_UB")
# Display the series just built (the cell previously re-displayed the point estimate).
residents_month_1_UB

In [None]:
residents_month_1_LB = month_1("residents_LB", suffix="_LB")
# Display the series just built (the cell previously re-displayed the point estimate).
residents_month_1_LB

#### Residents indicators at month X-12, ..., X-6, X-5, X-4, X-3, X-2, X-1 (for abnormality rolling window)

##### Updated to diff in residents

In [None]:
# NOTE: despite the "6_months" naming, the window used below covers the previous 12 months; the variable names are left unchanged for now.

In [None]:
def last_6_months(residents_col="residents", suffix=""):
    """Build, per location, the history of month-on-month changes in residents.

    Despite the name, the window covers the 12 months before `data_date`.
    Every `residents_indicators_<YYYY-MM>.csv` found in `indicators_dir` for
    those months is read, oldest first. A month's residents value is set to
    NaN when its `arrived` or `departed` value is zero or missing, and any
    difference that involves a NaN is discarded.

    Parameters
    ----------
    residents_col : str
        Residents column whose history is wanted.
    suffix : str
        Appended to the name of the returned Series.

    Returns
    -------
    pandas.Series
        Indexed by `pcod`, named `residents_past_6_months<suffix>`; each
        element is a NumPy array of the valid consecutive differences.

    Raises
    ------
    ValueError
        If none of the monthly indicator files exists.
    """
    history_months = [
        (data_date - pd.DateOffset(months=i)).date().strftime("%Y-%m")
        for i in range(12, 0, -1)
    ]

    month_dfs = []
    for year_month in history_months:
        # Plain assignment: using a Path as a context manager was deprecated
        # in Python 3.11 and removed in 3.13.
        filepath = Path(indicators_dir) / f"residents_indicators_{year_month}.csv"
        if filepath.is_file():
            print(f"reading {filepath}")
            month_dfs.append(pd.read_csv(filepath))
    ref_periods = len(month_dfs)
    print(f"{ref_periods} months of data for abnormality calculations")

    if not month_dfs:
        # pd.concat([]) would raise ValueError("No objects to concatenate");
        # keep the exception type but say what is actually missing.
        raise ValueError(
            f"No residents_indicators files found in {indicators_dir} for "
            f"{history_months[0]} to {history_months[-1]}"
        )

    # .copy() so the assignment below writes to an independent frame rather
    # than to a slice of the concatenated data.
    residents_past_6_months = pd.concat(month_dfs)[
        ["date", "pcod", residents_col, "arrived", "departed"]
    ].copy()

    print("\n", residents_past_6_months.head(3), "\n\n")

    # Set residents value to null if 'arrived' or 'departed' are zero or null
    no_flow_data = (
        residents_past_6_months["arrived"].isna()
        | residents_past_6_months["arrived"].eq(0)
        | residents_past_6_months["departed"].isna()
        | residents_past_6_months["departed"].eq(0)
    )
    residents_past_6_months[residents_col] = residents_past_6_months[
        residents_col
    ].mask(no_flow_data)

    def _valid_diffs(values):
        # Consecutive differences, dropping those that touch a nulled month.
        diffs = np.diff(np.array(values))
        return diffs[~np.isnan(diffs)]

    return (
        residents_past_6_months.groupby("pcod")[residents_col]
        .apply(_valid_diffs)
        .rename("residents_past_6_months" + suffix)
    )


residents_past_6_months = last_6_months("residents")
residents_past_6_months

In [None]:
residents_past_6_months_UB = last_6_months("residents_UB", suffix="_UB")
# Display the series just built (the cell previously re-displayed the point estimate).
residents_past_6_months_UB

In [None]:
residents_past_6_months_LB = last_6_months("residents_LB", suffix="_LB")
# Display the series just built (the cell previously re-displayed the point estimate).
residents_past_6_months_LB

#### Area of each adm3

In [None]:
# Reproject the geometries to the metric CRS so .area is in square metres,
# then convert to square kilometres.
admin3_geometries = gpd.read_file(geometry_filepath).set_index(geometry_admin3_col)
admin3_area_m2 = admin3_geometries.to_crs(epsg=metric_crs_epsg).area
admin3km2 = admin3_area_m2 * 1e-6
admin3km2.name = "admin3_area_km2"

# Create scaled resident indicators for Month X

## Scaled residents counts

(Previous months residents - |people who left| + |people who arrived|) * growth rate


We cap |people who left| to be at most as large as the resident population in the previous month, to avoid more people leaving an area than we think live there.

### 1. Get arrived departed at each location

In [None]:
# Sum the scaled flows per location: grouped by origin for departures and
# grouped by destination for arrivals.
flows_by_origin = adjusted_home_reloc_month_X.groupby(agg_spatial_unit_from)
flows_by_destination = adjusted_home_reloc_month_X.groupby(agg_spatial_unit_to)

# Estimated arrived, departed
departed = flows_by_origin.relocations.sum().rename("departed")
arrived = flows_by_destination.relocations.sum().rename("arrived")

# Upper bound arrived, departed
departed_UB = flows_by_origin.relocations_UB.sum().rename("departed_UB")
arrived_UB = flows_by_destination.relocations_UB.sum().rename("arrived_UB")

# Lower bound departed (its name is assigned in a later cell)
departed_LB = flows_by_origin.relocations_LB.sum()
departed_LB

In [None]:
# Lower bound arrived: lower-bound scaled flows summed per destination.
arrived_LB = adjusted_home_reloc_month_X.groupby(
    agg_spatial_unit_to
).relocations_LB.sum()
arrived_LB

In [None]:
# Name the lower-bound series so they become correctly labelled columns later on.
departed_LB.name = "departed_LB"
arrived_LB.name = "arrived_LB"

In [None]:
# cap outflows at pop level (more people cannot leave than those who are there)
# NOTE(review): all three variants are capped with the point-estimate residents
# of the previous month (not the LB/UB residents) - confirm this is intended.


def _cap_at_previous_residents(outflow, name):
    """Row-wise minimum of `outflow` and last month's residents.

    Only locations with a value in both inputs are kept.
    """
    paired = pd.concat([outflow, residents_month_X_minus_1], axis=1).dropna()
    return paired.min(axis=1).rename(name)


departed = _cap_at_previous_residents(departed, "departed")

departed_LB = _cap_at_previous_residents(departed_LB, "departed_LB")

departed_UB = _cap_at_previous_residents(departed_UB, "departed_UB")

### 2. Compute net flows to add to previous months estimates

In [None]:
# net flows, UB and LB
# NOTE(review): the subtraction aligns on the index, so a location present in only
# one of arrived/departed gets NaN here; that NaN is later replaced by 0 when the
# residents estimate is computed - confirm this "no net flow" treatment is intended.

net_arrived = arrived - departed
net_arrived.name = "net_arrived"
net_arrived

In [None]:
# Upper bound of net arrivals: upper-bound arrivals minus lower-bound departures.
net_arrived_UB = arrived_UB - departed_LB
net_arrived_UB.name = "net_arrived_UB"
net_arrived_UB

In [None]:
# Lower bound of net arrivals: lower-bound arrivals minus upper-bound departures.
net_arrived_LB = arrived_LB - departed_UB
net_arrived_LB.name = "net_arrived_LB"
net_arrived_LB

In [None]:
# Start from last month's residents and left-join, on the index, the net flows
# and the growth rates, so every location with a previous estimate is kept.
residents_base_components = residents_month_X_minus_1
for extra_columns in (net_arrived, net_arrived_LB, net_arrived_UB, growth_rates):
    residents_base_components = residents_base_components.merge(
        extra_columns, left_index=True, right_index=True, how="left"
    )

In [None]:
residents_base_components

### 3. (Previous month's residents + net arrived) * growth_rate

In [None]:
# Resident estimates: (last month's residents + net arrivals) scaled by this
# year's growth rate; a missing net-arrival figure is counted as zero.
residents_this_month = np.round(
    (
        residents_base_components.residents
        + residents_base_components.net_arrived.replace(np.nan, 0)
    )
    * residents_base_components[f'admin3_growth_{data_date.strftime("%Y")}']
).rename("residents")
residents_this_month

In [None]:
# Lower bound estimates (uses the lower bound of net arrivals)
residents_this_month_LB = (
    (
        np.round(
            (
                residents_base_components.residents
                + residents_base_components.net_arrived_LB.replace(np.nan, 0)
            )
            * residents_base_components[f'admin3_growth_{data_date.strftime("%Y")}']
        )
    )
).rename("residents_LB")
residents_this_month_LB

In [None]:
# Upper bound estimates (uses the upper bound of net arrivals)
residents_this_month_UB = (
    (
        np.round(
            (
                residents_base_components.residents
                + residents_base_components.net_arrived_UB.replace(np.nan, 0)
            )
            * residents_base_components[f'admin3_growth_{data_date.strftime("%Y")}']
        )
    )
).rename("residents_UB")
residents_this_month_UB

### 4. Diff in residents

In [None]:
# Month-on-month change of the point estimate: this month minus the previous month.
residents_diff = (residents_this_month - residents_month_X_minus_1.residents).rename(
    "residents_diff"
)
residents_diff

### 5. Concat all info needed for derived indicators

In [None]:
# Put every per-location series side by side (aligned on the index) so the
# derived indicators below can be computed column-wise.
residents_intermediate_components = pd.concat(
    [
        residents_this_month,
        residents_this_month_LB,
        residents_this_month_UB,
        net_arrived,
        net_arrived_LB,
        net_arrived_UB,
        departed,
        departed_LB,
        departed_UB,
        arrived,
        arrived_LB,
        arrived_UB,
        admin3km2,
        residents_past_6_months,
        residents_past_6_months_UB,
        residents_past_6_months_LB,
        residents_month_1,
        residents_month_1_UB,
        residents_month_1_LB,
        residents_diff,
    ],
    axis=1,
)

residents_intermediate_components

# Derived indicators

### a. Pct change with ref and b. diff with ref

In [None]:
def pct_change_with_ref(
    residents_intermediate_components,
    residents_column,
    residents_month_1_column,
    suffix="",
):
    """Percentage change of a residents column relative to its reference column.

    Computes `100 * (current - reference) / reference` row by row.

    Parameters
    ----------
    residents_intermediate_components : pandas.DataFrame
        Frame holding both columns.
    residents_column : str
        Column with the current values.
    residents_month_1_column : str
        Column with the reference values.
    suffix : str
        Appended to the output column name.

    Returns
    -------
    pandas.DataFrame
        One column named `residents_pctchangewithref<suffix>`.
    """
    current = residents_intermediate_components[residents_column]
    reference = residents_intermediate_components[residents_month_1_column]
    pct_change = 100 * (current - reference) / reference
    return pct_change.rename("residents_pctchangewithref" + suffix).to_frame()


def diff_with_ref(
    residents_intermediate_components,
    residents_column,
    residents_month_1_column,
    suffix="",
):
    """Absolute difference between a residents column and its reference column.

    Parameters
    ----------
    residents_intermediate_components : pandas.DataFrame
        Frame holding both columns.
    residents_column : str
        Column with the current values.
    residents_month_1_column : str
        Column with the reference values.
    suffix : str
        Appended to the output column name.

    Returns
    -------
    pandas.DataFrame
        One column named `residents_diffwithref<suffix>`.
    """
    current = residents_intermediate_components[residents_column]
    reference = residents_intermediate_components[residents_month_1_column]
    difference = current - reference
    return difference.rename("residents_diffwithref" + suffix).to_frame()


# Percentage change versus the reference month, for the point estimate and both bounds.
pct_change_w_ref = pct_change_with_ref(
    residents_intermediate_components, "residents", "residents_month_1"
)
pct_change_w_ref_UB = pct_change_with_ref(
    residents_intermediate_components,
    "residents_UB",
    "residents_month_1_UB",
    suffix="_UB",
)
pct_change_w_ref_LB = pct_change_with_ref(
    residents_intermediate_components,
    "residents_LB",
    "residents_month_1_LB",
    suffix="_LB",
)

# Absolute difference versus the reference month, for the point estimate and both bounds.
diff_w_ref = diff_with_ref(
    residents_intermediate_components, "residents", "residents_month_1"
)
diff_w_ref_UB = diff_with_ref(
    residents_intermediate_components,
    "residents_UB",
    "residents_month_1_UB",
    suffix="_UB",
)
diff_w_ref_LB = diff_with_ref(
    residents_intermediate_components,
    "residents_LB",
    "residents_month_1_LB",
    suffix="_LB",
)

In [None]:
# Attach the reference-month indicators by index (merge defaults to an inner join).
residents_intermediate_components = (
    residents_intermediate_components.merge(
        diff_w_ref, left_index=True, right_index=True
    )
    .merge(diff_w_ref_UB, left_index=True, right_index=True)
    .merge(diff_w_ref_LB, left_index=True, right_index=True)
    .merge(pct_change_w_ref, left_index=True, right_index=True)
    .merge(pct_change_w_ref_UB, left_index=True, right_index=True)
    .merge(pct_change_w_ref_LB, left_index=True, right_index=True)
)
residents_intermediate_components

### b. Residents per km2

In [None]:
def residents_p_adm3(residents_intermediate_components, residents_column, suffix=""):
    """Residents density: a residents column divided by `admin3_area_km2`.

    Parameters
    ----------
    residents_intermediate_components : pandas.DataFrame
        Frame holding `residents_column` and an `admin3_area_km2` column.
    residents_column : str
        Column with the residents counts.
    suffix : str
        Appended to the output column name.

    Returns
    -------
    pandas.DataFrame
        One column named `residents_perKm2<suffix>`.
    """
    residents = residents_intermediate_components[residents_column]
    area_km2 = residents_intermediate_components["admin3_area_km2"]
    density = residents / area_km2
    return density.rename("residents_perKm2" + suffix).to_frame()


# Residents per km2 for the point estimate and both bounds.
residents_per_adm3 = residents_p_adm3(residents_intermediate_components, "residents")
residents_per_adm3_UB = residents_p_adm3(
    residents_intermediate_components, "residents_UB", suffix="_UB"
)
residents_per_adm3_LB = residents_p_adm3(
    residents_intermediate_components, "residents_LB", suffix="_LB"
)

In [None]:
# Attach the density columns by index (merge defaults to an inner join).
residents_intermediate_components = (
    residents_intermediate_components.merge(
        residents_per_adm3, left_index=True, right_index=True
    )
    .merge(residents_per_adm3_LB, left_index=True, right_index=True)
    .merge(residents_per_adm3_UB, left_index=True, right_index=True)
)
residents_intermediate_components

### c. Abnormality

In [None]:
def _mad(baseline: np.array):
    return np.median(np.abs(baseline - np.median(baseline)))


def _meanad(baseline: np.array):
    return np.mean(np.abs(baseline - np.mean(baseline)))


def _mzscore(value, mad, meanad, median):
    if mad != 0:
        abnormality = (value - median) / (1.4826 * mad)
    elif meanad != 0:
        abnormality = (value - median) / (1.253314 * meanad)
    else:
        abnormality = np.nan
    return abnormality

In [None]:
def abnorm(
    residents_intermediate_components,
    residents_column,
    residents_past_6_months_column,
    suffix="",
):
    """Abnormality score of a column against its per-location history.

    For each row, the median, median absolute deviation and mean absolute
    deviation of the history stored in `residents_past_6_months_column` are
    computed, and `_mzscore` is applied to the value in `residents_column`.

    Parameters
    ----------
    residents_intermediate_components : pandas.DataFrame
        Frame holding both columns.
    residents_column : str
        Column with the value to score.
    residents_past_6_months_column : str
        Column whose elements are the historical values for that row.
    suffix : str
        Appended to the output column name.

    Returns
    -------
    pandas.DataFrame
        One column named `abnormality<suffix>`.
    """
    history = residents_intermediate_components[residents_past_6_months_column]

    abnormality_inputs = pd.concat(
        [
            history.map(lambda past: _mad(past)).rename("mad"),
            history.map(lambda past: _meanad(past)).rename("meanad"),
            history.map(lambda past: np.median(past)).rename("median"),
            residents_intermediate_components[residents_column],
        ],
        axis=1,
    )

    def _score_row(row):
        return _mzscore(row[residents_column], row["mad"], row["meanad"], row["median"])

    scores = abnormality_inputs.apply(_score_row, axis=1)
    return scores.rename("abnormality" + suffix).to_frame()


abnormality = abnorm(
    residents_intermediate_components, "residents_diff", "residents_past_6_months"
)
abnormality

In [None]:
# The history column holds month-on-month *changes* in residents_UB, so the value
# scored against it must also be a change (this month minus the previous month),
# as is done for the point estimate with residents_diff. Scoring the residents_UB
# level against a history of changes yields meaningless abnormality values.
residents_diff_UB = (
    residents_intermediate_components["residents_UB"]
    - residents_month_X_minus_1_UB["residents_UB"]
)
abnormality_UB = abnorm(
    residents_intermediate_components.assign(residents_diff_UB=residents_diff_UB),
    "residents_diff_UB",
    "residents_past_6_months_UB",
    suffix="_UB",
)
abnormality_UB

In [None]:
# The history column holds month-on-month *changes* in residents_LB, so the value
# scored against it must also be a change (this month minus the previous month),
# as is done for the point estimate with residents_diff. Scoring the residents_LB
# level against a history of changes yields meaningless abnormality values.
residents_diff_LB = (
    residents_intermediate_components["residents_LB"]
    - residents_month_X_minus_1_LB["residents_LB"]
)
abnormality_LB = abnorm(
    residents_intermediate_components.assign(residents_diff_LB=residents_diff_LB),
    "residents_diff_LB",
    "residents_past_6_months_LB",
    suffix="_LB",
)
abnormality_LB

In [None]:
# Attach the abnormality columns by index (merge defaults to an inner join).
residents_intermediate_components = (
    residents_intermediate_components.merge(
        abnormality, left_index=True, right_index=True
    )
    .merge(abnormality_UB, left_index=True, right_index=True)
    .merge(abnormality_LB, left_index=True, right_index=True)
)
residents_intermediate_components

### Add all derived indicators to intermediary dataframe to then clean later

In [None]:
# Name the index "pcod" and turn it into a regular column.
residents_intermediate_components.index = (
    residents_intermediate_components.index.set_names("pcod")
)

residents_intermediate_components = residents_intermediate_components.reset_index(
    drop=False
)

In [None]:
residents_intermediate_components

# Residents release indicators

Leaving these unredacted as we need to refer to them in the next month. **Redaction needs to be performed at the ingestion stage.**

In [None]:
# Select the release columns (point estimate first, then the LB and UB variants);
# net_arrived is published under the name delta_arrived.
month_residents_indicators_all = residents_intermediate_components[
    [
        "pcod",
        "residents",
        "residents_perKm2",
        "arrived",
        "departed",
        "net_arrived",
        "residents_diffwithref",
        "abnormality",
        "residents_pctchangewithref",
        "residents_LB",
        "residents_perKm2_LB",
        "arrived_LB",
        "departed_LB",
        "net_arrived_LB",
        "residents_diffwithref_LB",
        "abnormality_LB",
        "residents_pctchangewithref_LB",
        "residents_UB",
        "residents_perKm2_UB",
        "arrived_UB",
        "departed_UB",
        "net_arrived_UB",
        "residents_diffwithref_UB",
        "abnormality_UB",
        "residents_pctchangewithref_UB",
    ]
].rename(columns={"net_arrived": "delta_arrived"})
# The date goes in as the first column.
month_residents_indicators_all.insert(0, "date", data_date)

# Written to indicators_dir under the same name pattern the loaders above read back.
month_residents_indicators_all.to_csv(
    Path(indicators_dir) / f'residents_indicators_{data_date.strftime("%Y-%m")}.csv',
    index=False,
)

In [None]:
month_residents_indicators_all

### Write CDR subscriber population to a separate file

This is required later for redaction

In [None]:
# Resident counts for this month from the aggregates folder, without rows that
# contain missing values.
home_locs_month_X = pd.read_csv(
    Path(aggregates_dir)
    / "residence_relocations"
    / f"residence_relocations_aggregates_{data_date.date()}"
    / f"resident-counts_{data_date.date()}.csv"
).dropna()

# NOTE(review): unlike the other to_csv calls in this notebook, index=False is not
# passed here, so the row index is written as an extra first column - confirm the
# consumer expects that.
home_locs_month_X.to_csv(Path(dagrun_data_dir) / "cdr_subscriber_population.csv")


---



# Relocations

# Data needed to compute indicators

NB: some slight reuse of code here modified to be relocations specific, some refactoring could be done to make this notebook a lot cleaner

#### Relocation aggregates at month _X_

In [None]:
# Note: using redacted aggregates this time
# NOTE(review): this path is identical to the one read in the residents section
# above, so the same file is loaded again - confirm which read is meant to use
# unredacted data.
home_reloc_month_X = pd.read_csv(
    Path(aggregates_dir)
    / "residence_relocations"
    / f"residence_relocations_aggregates_{data_date.date()}"
    / f"home-relocations_full-outer_from{(data_date-pd.DateOffset(months=1)).date()}_to{data_date.date()}.csv"
)
home_reloc_month_X

In [None]:
# Keep only rows whose origin differs from the destination, then drop rows with missing values.
home_reloc_month_X = home_reloc_month_X[
    home_reloc_month_X[agg_spatial_unit_from] != home_reloc_month_X[agg_spatial_unit_to]
].dropna()
home_reloc_month_X

#### Scaling factors

In [None]:
scaling_factors = pd.read_csv(scaling_factors_bilateral_pairs_filepath)

#### Adjusted relocation aggregates at month _X_

In [None]:
def adjust_flows(df):
    """Scale raw relocation counts by the per-corridor scaling factors.

    NOTE: this redefines (with the same logic) the `adjust_flows` declared
    earlier in the notebook.

    `df` is merged with the module-level `scaling_factors` table on the
    (from, to) spatial-unit pair. The merge uses pandas' default inner join,
    so flows without a matching scaling-factor row are dropped. The `value`
    column is multiplied by each of the three factor columns and rounded up.

    Parameters
    ----------
    df : pandas.DataFrame
        Relocation aggregates with the from/to columns named by
        `agg_spatial_unit_from` / `agg_spatial_unit_to` and a `value` column.

    Returns
    -------
    pandas.DataFrame
        Columns: from, to, `relocations`, `relocations_LB`, `relocations_UB`
        and `date` (set to the module-level `data_date`).
    """
    flow_key = [agg_spatial_unit_from, agg_spatial_unit_to]

    scaled = df.merge(
        scaling_factors,
        left_on=flow_key,
        right_on=[sf_spatial_unit_from, sf_spatial_unit_to],
    ).assign(
        relocations=lambda z: np.ceil(z.value * z.combi_movbi_tot),
        relocations_LB=lambda z: np.ceil(z.value * z.combi_movbi_tot_LB),
        relocations_UB=lambda z: np.ceil(z.value * z.combi_movbi_tot_UB),
    )

    # Add the date with assign() on the selected columns instead of item
    # assignment on a slice, which raises SettingWithCopyWarning.
    return scaled[
        flow_key + ["relocations", "relocations_LB", "relocations_UB"]
    ].assign(date=data_date)


adjusted_home_reloc_month_X = adjust_flows(home_reloc_month_X)
adjusted_home_reloc_month_X

#### Scaled relocations at month _1_

Used as the baseline. If the reference-month file has not been created yet (for example when this notebook is processing the very first pair of months), we fall back to this month's adjusted flows, so the diff and pct change against the reference are 0.

In [None]:
# Columns needed from the reference-month relocations.
reference_flow_columns = [
    agg_spatial_unit_from,
    agg_spatial_unit_to,
    "relocations",
    "relocations_UB",
    "relocations_LB",
]

try:
    relocations_month_1_all = pd.read_csv(
        Path(indicators_dir)
        / f'relocations_indicators_{(relocations_reference_date - pd.DateOffset(months = 1)).strftime("%Y-%m")}to{relocations_reference_date.strftime("%Y-%m")}.csv'
    )[reference_flow_columns]
except FileNotFoundError:
    # Only a missing reference file triggers the fallback; any other problem
    # (unreadable file, missing columns, interrupt) is allowed to surface.
    # With this month's flows as the baseline, the diff and pct change against
    # the reference are 0 (this really belongs in a separate init notebook).
    relocations_month_1_all = adjusted_home_reloc_month_X[reference_flow_columns]
relocations_month_1_all

In [None]:
# Reference-month upper bound, indexed by (from, to) so it can be joined per flow.
relocations_month_1_UB = relocations_month_1_all.set_index(
    [agg_spatial_unit_from, agg_spatial_unit_to]
).relocations_UB.rename("relocations_month_1_UB")
relocations_month_1_UB

In [None]:
# Reference-month lower bound, indexed by (from, to) so it can be joined per flow.
relocations_month_1_LB = relocations_month_1_all.set_index(
    [agg_spatial_unit_from, agg_spatial_unit_to]
).relocations_LB.rename("relocations_month_1_LB")
relocations_month_1_LB

In [None]:
# Reference-month point estimate, indexed by (from, to) so it can be joined per flow.
relocations_month_1 = relocations_month_1_all.set_index(
    [agg_spatial_unit_from, agg_spatial_unit_to]
).relocations.rename("relocations_month_1")
relocations_month_1

#### Scaled relocations at month X-12, ..., X-6, X-5, X-4, X-3, X-2, X-1

In [None]:
def last_6_mo(relocations_column, suffix=""):
    """Build, per (from, to) flow, the history of a scaled relocations column.

    Despite the name, the window covers the 12 month pairs before `data_date`.
    Every `relocations_indicators_<YYYY-MM>to<YYYY-MM>.csv` found in
    `indicators_dir` for those pairs is read, oldest first. When no file
    exists, this month's adjusted flows are used as a one-element history.

    Each flow's history is right-padded with zeros so that months in which
    the flow does not appear count as 0 in the abnormality calculation.

    Parameters
    ----------
    relocations_column : str
        Relocations column whose history is wanted.
    suffix : str
        Appended to the name of the returned Series.

    Returns
    -------
    pandas.Series
        Indexed by (from, to), named `relocations_past_6_months<suffix>`;
        each element is a NumPy array of the padded history.
    """
    flow_key = [agg_spatial_unit_from, agg_spatial_unit_to]
    month_pairs = [
        (
            (data_date - pd.DateOffset(months=i + 1)).date().strftime("%Y-%m"),
            (data_date - pd.DateOffset(months=i)).date().strftime("%Y-%m"),
        )
        for i in range(12, 0, -1)
    ]

    month_dfs = []
    for month_from, month_to in month_pairs:
        # Plain assignment: using a Path as a context manager was deprecated
        # in Python 3.11 and removed in 3.13.
        filepath = Path(indicators_dir) / (
            f"relocations_indicators_{month_from}to{month_to}.csv"
        )
        if filepath.is_file():
            print(f"reading {filepath}")
            month_dfs.append(pd.read_csv(filepath))
    ref_periods = len(month_dfs)
    print(f"{ref_periods} months of data for abnormality calculations")

    if ref_periods > 0:
        history = pd.concat(month_dfs)
    else:
        # No history yet: fall back to this month's flows.
        history = adjusted_home_reloc_month_X.reset_index()

    relocations_past_6_months = (
        history[flow_key + [relocations_column]]
        .groupby(flow_key)[relocations_column]
        .apply(lambda x: np.array(x))
    )

    # Pad the arrays so months with no data for a flow are included as 0's (this
    # affects the abnormality). The target is the number of months actually read,
    # so the padding is still complete when no single flow appears in every month;
    # the longest observed history is kept as a floor so the pad width is never negative.
    target_len = max(ref_periods, relocations_past_6_months.apply(len).max())

    relocations_past_6_months = relocations_past_6_months.apply(
        lambda z: np.pad(z, (0, target_len - len(z)), "constant")
    )

    return relocations_past_6_months.rename("relocations_past_6_months" + suffix)


relocations_past_6_months_UB = last_6_mo("relocations_UB", suffix="_UB")
relocations_past_6_months_LB = last_6_mo("relocations_LB", suffix="_LB")
relocations_past_6_months = last_6_mo("relocations")

In [None]:
relocations_past_6_months

In [None]:
# Index the flows by their (from, to) pair and left-join the reference-month
# values and the per-flow histories. The key columns are taken from the
# parameters (the same names adjust_flows emits) rather than hard-coded.
# NOTE: this cell replaces adjusted_home_reloc_month_X with its indexed version,
# so it cannot simply be re-run without re-running the cells that build it.
flows_with_references = adjusted_home_reloc_month_X.set_index(
    [agg_spatial_unit_from, agg_spatial_unit_to]
)
for reference_series in (
    relocations_month_1,
    relocations_month_1_UB,
    relocations_month_1_LB,
    relocations_past_6_months,
    relocations_past_6_months_LB,
    relocations_past_6_months_UB,
):
    flows_with_references = flows_with_references.merge(
        reference_series.to_frame(),
        left_index=True,
        right_index=True,
        how="left",
    )
adjusted_home_reloc_month_X = flows_with_references
adjusted_home_reloc_month_X

# Base indicators

We have these from the scaling process:

In [None]:
adjusted_home_reloc_month_X[["relocations", "relocations_LB", "relocations_UB"]]

# Derived indicators

### pct_change_w_ref

In [None]:
def pct_change_with_ref(
    adjusted_home_reloc_month_X,
    relocations_column,
    relocations_month_1_column,
    suffix="",
):
    """Percentage change of a relocations column relative to its reference column.

    NOTE: this redefines the residents version of `pct_change_with_ref`
    declared earlier in the notebook; only the output column name differs.

    Computes `100 * (current - reference) / reference` row by row.

    Parameters
    ----------
    adjusted_home_reloc_month_X : pandas.DataFrame
        Frame holding both columns.
    relocations_column : str
        Column with the current values.
    relocations_month_1_column : str
        Column with the reference values.
    suffix : str
        Appended to the output column name.

    Returns
    -------
    pandas.DataFrame
        One column named `relocations_pctchangewithref<suffix>`.
    """
    current = adjusted_home_reloc_month_X[relocations_column]
    reference = adjusted_home_reloc_month_X[relocations_month_1_column]
    pct_change = 100 * (current - reference) / reference
    return pct_change.rename("relocations_pctchangewithref" + suffix).to_frame()


# Percentage change versus the reference month, for the point estimate and both bounds.
pct_change_w_ref = pct_change_with_ref(
    adjusted_home_reloc_month_X, "relocations", "relocations_month_1"
)
pct_change_w_ref_UB = pct_change_with_ref(
    adjusted_home_reloc_month_X,
    "relocations_UB",
    "relocations_month_1_UB",
    suffix="_UB",
)
pct_change_w_ref_LB = pct_change_with_ref(
    adjusted_home_reloc_month_X,
    "relocations_LB",
    "relocations_month_1_LB",
    suffix="_LB",
)

### diff_w_ref

In [None]:
def diff_with_ref(
    adjusted_home_reloc_month_X,
    relocations_column,
    relocations_month_1_column,
    suffix="",
):
    """Absolute difference between a relocations column and its reference column.

    NOTE: this redefines the residents version of `diff_with_ref` declared
    earlier in the notebook; only the output column name differs.

    Parameters
    ----------
    adjusted_home_reloc_month_X : pandas.DataFrame
        Frame holding both columns.
    relocations_column : str
        Column with the current values.
    relocations_month_1_column : str
        Column with the reference values.
    suffix : str
        Appended to the output column name.

    Returns
    -------
    pandas.DataFrame
        One column named `relocations_diffwithref<suffix>`.
    """
    current = adjusted_home_reloc_month_X[relocations_column]
    reference = adjusted_home_reloc_month_X[relocations_month_1_column]
    difference = current - reference
    return difference.rename("relocations_diffwithref" + suffix).to_frame()


# Absolute difference versus the reference month, for the point estimate and both bounds.
diff_w_ref = diff_with_ref(
    adjusted_home_reloc_month_X, "relocations", "relocations_month_1"
)
diff_w_ref_UB = diff_with_ref(
    adjusted_home_reloc_month_X,
    "relocations_UB",
    "relocations_month_1_UB",
    suffix="_UB",
)
diff_w_ref_LB = diff_with_ref(
    adjusted_home_reloc_month_X,
    "relocations_LB",
    "relocations_month_1_LB",
    suffix="_LB",
)

### abnormality

In [None]:
def abnorm(
    adjusted_home_reloc_month_X,
    relocations_column,
    relocations_past_6_months_column,
    suffix="",
):
    """Abnormality score of a relocations column against its per-flow history.

    NOTE: this redefines the residents version of `abnorm` declared earlier
    in the notebook; the logic is the same.

    For each row, the median, median absolute deviation and mean absolute
    deviation of the history stored in `relocations_past_6_months_column` are
    computed, and `_mzscore` is applied to the value in `relocations_column`.

    Parameters
    ----------
    adjusted_home_reloc_month_X : pandas.DataFrame
        Frame holding both columns.
    relocations_column : str
        Column with the value to score.
    relocations_past_6_months_column : str
        Column whose elements are the historical values for that row.
    suffix : str
        Appended to the output column name.

    Returns
    -------
    pandas.DataFrame
        One column named `abnormality<suffix>`.
    """
    history = adjusted_home_reloc_month_X[relocations_past_6_months_column]

    abnormality_inputs = pd.concat(
        [
            history.map(lambda past: _mad(past)).rename("mad"),
            history.map(lambda past: _meanad(past)).rename("meanad"),
            history.map(lambda past: np.median(past)).rename("median"),
            adjusted_home_reloc_month_X[relocations_column],
        ],
        axis=1,
    )

    def _score_row(row):
        return _mzscore(
            row[relocations_column], row["mad"], row["meanad"], row["median"]
        )

    scores = abnormality_inputs.apply(_score_row, axis=1)
    return scores.rename("abnormality" + suffix).to_frame()


# Abnormality of this month's relocations against each flow's history, for the
# point estimate and both bounds.
abnormality = abnorm(
    adjusted_home_reloc_month_X, "relocations", "relocations_past_6_months"
)
abnormality_UB = abnorm(
    adjusted_home_reloc_month_X,
    "relocations_UB",
    "relocations_past_6_months_UB",
    suffix="_UB",
)
abnormality_LB = abnorm(
    adjusted_home_reloc_month_X,
    "relocations_LB",
    "relocations_past_6_months_LB",
    suffix="_LB",
)

In [None]:
# Attach every derived indicator by index (merge defaults to an inner join).
relocations_intermediate_indicators = (
    adjusted_home_reloc_month_X.merge(abnormality, left_index=True, right_index=True)
    .merge(abnormality_UB, left_index=True, right_index=True)
    .merge(abnormality_LB, left_index=True, right_index=True)
    .merge(diff_w_ref, left_index=True, right_index=True)
    .merge(diff_w_ref_LB, left_index=True, right_index=True)
    .merge(diff_w_ref_UB, left_index=True, right_index=True)
    .merge(pct_change_w_ref, left_index=True, right_index=True)
    .merge(pct_change_w_ref_LB, left_index=True, right_index=True)
    .merge(pct_change_w_ref_UB, left_index=True, right_index=True)
)

# Putting it together (relocations)

In [None]:
# Output path in indicators_dir, named after the (previous month, this month) pair.
full_fp = (
    Path(indicators_dir)
    / f'relocations_indicators_{(data_date - pd.DateOffset(months = 1)).strftime("%Y-%m")}to{data_date.strftime("%Y-%m")}.csv'
)

In [None]:
# Select the release columns. The key columns are taken from the parameters,
# matching the names used when these files are read back as reference/history,
# instead of being hard-coded.
relocations_all_indicators = relocations_intermediate_indicators.reset_index()[
    [
        agg_spatial_unit_from,
        agg_spatial_unit_to,
        "relocations",
        "relocations_diffwithref",
        "abnormality",
        "relocations_pctchangewithref",
        "relocations_UB",
        "relocations_diffwithref_UB",
        "abnormality_UB",
        "relocations_pctchangewithref_UB",
        "relocations_LB",
        "relocations_diffwithref_LB",
        "abnormality_LB",
        "relocations_pctchangewithref_LB",
    ]
]

# The date goes in as the first column.
relocations_all_indicators.insert(0, "date", data_date)

relocations_all_indicators.to_csv(full_fp, index=False)