# Stage E: Active Crisis

##### Useful links

Stages and full technical proposal: https://docs.google.com/document/d/13Ttnu4SgC30OinOnWkCKZk7w9VZrcqBq5zBBPclMxE0/edit#heading=h.hl12xr1wx1f5


In [None]:
# What language are we making the report in:
language = "EN"


# What spatial level the reports are using:
regional_spatial_level = "admin1name"
subregional_spatial_level = "admin3name"

readable_regional_spatial_descriptor = "Department"
readable_subregional_spatial_descriptor = "Communal section"


# Where things are located:
static_dir = "./static/"
spatial_geometry_file = "geo/ht_admin3.geojson"

aggregate_dir = "../unredacted_crisis_response_aggregates/"
# NOTE(review): despite its name this is a glob-style pattern (it uses `*`), and
# nothing in the visible part of this notebook reads it - confirm it is still needed.
relocations_data_regex = (
    "weekly_aggregates_*/home-relocations_consecutive_nosubset_from*_unredacted.csv"
)
affected_areas_file = "affected_areas/affected_areas.geojson"

# What are columns called / what level should we be working at?:
spatial_geometry_unit_column = "admin3pcod"  # column in the geometry file that identifies each spatial unit
date_column = "date"  # column identifying where date for residents counts is stored

# Base directories: `shared_data_dir` is the root for the aggregates, while
# `dagrun_data_dir` holds the geometry file and receives the outputs.
shared_data_dir = "data_dir"
dagrun_data_dir = "dagrun_data_dir"

# When the report is being generated (ISO date string, parsed with pd.to_datetime later):
report_date = "2021-08-21"

# When did the event happen (ISO date string, parsed with pd.to_datetime later)
event_date = "2021-08-14"

In [None]:
from pathlib import Path

# Postprocessing params
# Folder containing the weekly aggregate folders, resolved against the shared data directory.
aggregate_path = Path(shared_data_dir) / aggregate_dir

---

# 0a. Imports and static data loading


In [None]:
import matplotlib

# Ask matplotlib to use Roboto as its sans-serif face and sans-serif as the default family.
matplotlib.rcParams["font.sans-serif"] = "Roboto"
matplotlib.rcParams["font.family"] = "sans-serif"

In [None]:
import glob, re
import os

import numpy as np
import pandas as pd
import geopandas as gpd

from datetime import datetime

from slugify import slugify
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib.ticker as mtick
from matplotlib.lines import Line2D
from matplotlib.patches import Patch
from matplotlib.ticker import ScalarFormatter, FuncFormatter
from cycler import cycler

from common_utilities import map_boundaries, add_scalebar, add_plot_basemap


# register_custom_colormaps() Done in common_utilities
# Palette installed as matplotlib's default property cycle for axes.
color_cycle = cycler(
    color=["#034174", "#CBA45A", "#701F53", "#006E8C", "#BF6799", "#00989A", "#9E6257"]
)
matplotlib.rcParams["axes.prop_cycle"] = color_cycle
# Reciprocal of the figure DPI, i.e. the size of one pixel in inches.
px = 1 / plt.rcParams["figure.dpi"]

In [None]:
def get_current_color(cycler_iterator, cycle=None):
    """Return the next entry of a colour iterator, wrapping round when it runs out.

    Args:
        cycler_iterator: Iterator to take the next entry from.
        cycle: Iterable to restart from once ``cycler_iterator`` is exhausted.
            Defaults to the notebook-level ``color_cycle``.

    Returns:
        The next item of ``cycler_iterator``, or the first item of ``cycle``
        when the iterator has nothing left.

    Note:
        The caller's iterator cannot be reset from inside this function, so an
        exhausted iterator keeps yielding the first entry of ``cycle`` on every
        subsequent call.
    """
    try:
        return next(cycler_iterator)
    except StopIteration:
        # The previous code restarted from `iter(cycler)`, but `cycler` is the
        # factory function imported from the cycler package, not an iterable,
        # so that branch raised TypeError instead of wrapping round.
        restart_from = color_cycle if cycle is None else cycle
        return next(iter(restart_from))

#### Geometries


In [None]:
# Read the file containing the spatial geometry. The columns named in the
# parameters are checked for in the following cell.
spatial_geometry = gpd.read_file(Path(dagrun_data_dir) / spatial_geometry_file)
spatial_geometry

In [None]:
# Validate the geometry file before it is used. Explicit exceptions are raised
# instead of `assert` so the checks still run when Python is started with -O.
for required_column in (
    spatial_geometry_unit_column,
    regional_spatial_level,
    subregional_spatial_level,
):
    if required_column not in spatial_geometry.columns:
        raise ValueError(
            f"Column {required_column} not found in {spatial_geometry_file}."
        )

if "geometry" not in spatial_geometry.columns:
    # The earlier message read "Column <file> does not have ...", naming the file as a column.
    raise ValueError(f"File {spatial_geometry_file} does not have a geometry column.")

#### Affected areas


In [None]:
affected_areas = gpd.read_file(Path(static_dir) / affected_areas_file)
# Later cells look affected areas up by their `name`, so fail here with a clear
# message rather than with a KeyError further down.
if "name" not in affected_areas.columns:
    raise ValueError("Affected area features must have a 'name' column")
# No attribute other than `name` (plus the geometry itself) is allowed.
if any(col not in ["name", "geometry"] for col in affected_areas.columns):
    raise ValueError("Affected area features should only have 'name' column")
affected_areas

---

# 0b. Output folder + dict creation


In [None]:
# Registry of produced files: one section for national outputs and one keyed by affected area.
outputs_dict = {"national": {}, "affected_areas": {}}

In [None]:
# Root folder for everything this notebook writes.
output_path = Path(dagrun_data_dir) / "active_crisis"

# mkdir(parents=True, exist_ok=True) also creates a missing `dagrun_data_dir`
# and does not fail if the folder already exists or appears between a check
# and the creation.
for folder in [
    output_path,
    output_path / "affected_areas",
    output_path / "national",
]:
    folder.mkdir(parents=True, exist_ok=True)

# One sub-folder (slugified name) and one registry entry per affected area.
for region in affected_areas["name"].values:
    folder = output_path / "affected_areas" / slugify(region)
    folder.mkdir(parents=True, exist_ok=True)
    outputs_dict["affected_areas"][region] = {}

---

# 1. Read crisis aggregates, derive residents from net relocations, derive excess residents and relocations for figures.

**Active crisis indicators are derived from the weekly crisis indicators.**


In [None]:
# Reference timestamps derived from the event and report dates.
key_dates = dict(
    event_date_minus_3_months=pd.to_datetime(event_date) - pd.DateOffset(months=3),
    event_date_minus_2_months=pd.to_datetime(event_date) - pd.DateOffset(months=2),
    report_date_minus_1_week=pd.to_datetime(report_date) - pd.DateOffset(weeks=1),
    event_date=pd.to_datetime(event_date),
    report_date=pd.to_datetime(report_date),
)
print(key_dates)

In [None]:
from dataclasses import dataclass
from pprint import pprint
from typing import Generator


# Date format used inside the aggregate file names.
fmt_str = "%Y-%m-%d"
# Folder of this run's weekly aggregates, named after the report date.
run_weekly_agg_folder = f"weekly_aggregates_lon-lat_{report_date}"
# File name template; `yesterday` and `today` are filled in by path_to_date.
# NOTE(review): `relocations_data_regex` in the parameters cell matches names ending in
# `_unredacted.csv`, whereas this template has no such suffix - confirm which is correct.
agg_file = "home-relocations_consecutive_nosubset_from{yesterday}_to{today}.csv"


@dataclass
class DateSet:
    """A pair of timestamps, built by date_generator as a day and the day before it."""

    # Annotated as pd.Timestamp because that is what pd.date_range yields. The
    # previous `datetime.date` annotation resolved to the *method*
    # `datetime.datetime.date`, since the notebook does `from datetime import datetime`.
    today: pd.Timestamp
    yesterday: pd.Timestamp


def date_generator(
    start_date: pd.Timestamp, end_date: pd.Timestamp
) -> Generator["DateSet", None, None]:
    """Yield one DateSet per day, from the day before ``start_date`` up to ``end_date``.

    Args:
        start_date: First day of interest; iteration starts one day earlier.
        end_date: Last day to yield.

    Yields:
        DateSet whose ``today`` is the current day and whose ``yesterday`` is
        the day before it.
    """
    one_day = pd.DateOffset(days=1)
    # We stick the offset in as pd.date_range rounds up
    # NOTE(review): the rounding rationale is carried over from the original comment - confirm.
    for today in pd.date_range(start_date - one_day, end_date, freq="D"):
        yield DateSet(today, today - one_day)


def path_to_date(ds: DateSet) -> Path:
    """Build the path of the aggregate file covering ``ds.yesterday`` to ``ds.today``."""
    file_name = agg_file.format(
        yesterday=ds.yesterday.strftime(fmt_str),
        today=ds.today.strftime(fmt_str),
    )
    return aggregate_path / run_weekly_agg_folder / file_name


# `dates` is a one-shot generator: it is exhausted once `wanted_paths` has been built.
dates = date_generator(key_dates["event_date_minus_3_months"], key_dates["report_date"])
# Expected aggregate file for each day, keyed by that day (`today` of each DateSet).
wanted_paths = {date_set.today: path_to_date(date_set) for date_set in dates}
# Split the expected files into those present on disk and those that are not.
home_relocations_files = {
    date: path for date, path in wanted_paths.items() if path.exists()
}
missing_paths = {date: path for date, path in wanted_paths.items() if not path.exists()}

In [None]:
dates

In [None]:
wanted_paths

In [None]:
home_relocations_files

In [None]:
missing_paths

In [None]:
# Load every daily aggregate found on disk; each frame is tagged with the day it
# covers, taken from the key of `home_relocations_files`.
loaded_files = []

for date, file in home_relocations_files.items():
    loaded_files.append(pd.read_csv(file).assign(date=date))

if not loaded_files:
    # pd.concat would fail here anyway, but only with "No objects to concatenate".
    raise ValueError(
        "No home relocation aggregate files were found; "
        "see `missing_paths` for the locations that were checked."
    )

# Join individually loaded files
home_relocations = pd.concat(loaded_files)

# Drop rows whose origin and destination are the same point. A row is a move as
# soon as EITHER coordinate differs; requiring both to differ (`&`) also threw
# away genuine moves between points sharing a latitude or a longitude.
home_relocations = home_relocations[
    (home_relocations.lat_from != home_relocations.lat_to)
    | (home_relocations.lon_from != home_relocations.lon_to)
]

# Parse date (assign returns a new frame, so the filtered slice is not written to)
home_relocations = home_relocations.assign(
    date=pd.to_datetime(home_relocations.date)
)

# Remove any appeared / disappeared
# (drops every row containing a missing value in any column)
home_relocations = home_relocations.dropna()

# Add geometry objects for 'to' and 'from' cluster
home_relocations["loc_from"] = gpd.points_from_xy(
    home_relocations["lon_from"], home_relocations["lat_from"]
)
home_relocations["loc_to"] = gpd.points_from_xy(
    home_relocations["lon_to"], home_relocations["lat_to"]
)

# GeoDataFrame with the origin point as active geometry, in EPSG:4326
home_relocations = gpd.GeoDataFrame(home_relocations, geometry="loc_from").set_crs(
    epsg=4326
)

In [None]:
home_relocations.date.min(), home_relocations.date.max()

In [None]:
# NOTE: We shouldn't need the filtering now we do that on the filename instead of the data
home_relocations

In [None]:
affected_areas

### relocations from AA


In [None]:
# Keep the relocations whose active geometry (the origin point `loc_from`) lies
# within an affected area; the "AA" suffix is what later cells rely on for the
# `index_AA` column.
# NOTE(review): home_relocations is already set to EPSG:4326, so the trailing
# to_crs looks like a no-op; the CRS of affected_areas is not checked here.
relocations_from_aas = gpd.sjoin(
    home_relocations, affected_areas, predicate="within", rsuffix=("AA")
).to_crs(epsg=4326)
relocations_from_aas

In [None]:
# attach spatial information about clusters where people are going from the AA
# (the active geometry is switched to the destination point `loc_to` before joining)
relocations_from_aas = relocations_from_aas.set_geometry("loc_to").sjoin(
    spatial_geometry
)
relocations_from_aas

### arrivals from AA


In [None]:
# Daily total of `value` arriving from each affected area into each spatial unit.
# NOTE(review): the column names are hard-coded here instead of using the configured
# `subregional_spatial_level` / `spatial_geometry_unit_column`, and `admin2name` is
# not among the columns checked when the geometry file is loaded.
arrivals_from_aa = (
    relocations_from_aas.groupby(
        ["index_AA", "admin2name", "admin3name", "admin3pcod", "date"]
    )
    .value.sum()
    .reset_index()
)
arrivals_from_aa

### excess arrivals from AA


In [None]:
# Every day from three months before the event up to the report date. The index
# is named "date" so frames reindexed on it carry that name in their index.
date_range = pd.date_range(
    key_dates["event_date_minus_3_months"], report_date, name="date"
)

In [None]:
def MAD(df, col):
    """Return the median absolute deviation of ``df[col]`` around its median."""
    series = df[col]
    deviations = (series - series.median()).abs()
    return deviations.median()


def remove_negatives(df, col, inplace=True):
    """Blank out every row of ``df`` whose ``col`` value is zero or negative.

    Despite the name, zeros are blanked as well as negatives, and the whole
    row (not just ``col``) is set to NaN.

    Args:
        df: Frame to clean.
        col: Column whose sign decides which rows are blanked.
        inplace: When True (default) ``df`` is modified and None is returned.
            When False ``df`` is left untouched and a cleaned copy is returned.
            This flag was previously accepted but ignored.

    Returns:
        None when ``inplace`` is True, otherwise the cleaned copy.
    """
    target = df if inplace else df.copy()
    target[target[col] <= 0] = np.nan
    return None if inplace else target


def find_excess(cluster_df):
    """Return how far each ``value`` lies above the pre-event upper threshold.

    The baseline is the rows of ``cluster_df`` indexed from three months before
    the event up to (but excluding) the event date. The threshold is the
    baseline median plus ``3 * 1.486`` times the baseline MAD.

    Returns a one-column frame with the same index as ``cluster_df``; values
    are negative where ``value`` is below the threshold.
    """
    # NOTE(review): the usual normal-consistency factor for the MAD is 1.4826;
    # 1.486 is kept exactly as found - confirm whether it is intentional.
    in_baseline = (cluster_df.index >= key_dates["event_date_minus_3_months"]) & (
        cluster_df.index < key_dates["event_date"]
    )
    baseline = cluster_df[in_baseline]
    upper_threshold = baseline.value.median() + (3 * 1.486) * MAD(baseline, "value")

    excess = pd.DataFrame(cluster_df.value - upper_threshold)
    excess.index = cluster_df.index
    return excess


def find_deficit(cluster_df):
    """Return how far each ``value`` lies below the pre-event lower threshold.

    The baseline is the rows of ``cluster_df`` indexed from three months before
    the event up to (but excluding) the event date. The threshold is the
    baseline median minus ``3 * 1.486`` times the baseline MAD.

    Returns a one-column frame with the same index as ``cluster_df``; values
    are negative where ``value`` is above the threshold.
    """
    # NOTE(review): the usual normal-consistency factor for the MAD is 1.4826;
    # 1.486 is kept exactly as found - confirm whether it is intentional.
    in_baseline = (cluster_df.index >= key_dates["event_date_minus_3_months"]) & (
        cluster_df.index < key_dates["event_date"]
    )
    baseline = cluster_df[in_baseline]
    lower_threshold = baseline.value.median() - (3 * 1.486) * MAD(baseline, "value")

    deficit = pd.DataFrame(lower_threshold - cluster_df.value)
    deficit.index = cluster_df.index
    return deficit


def counts_2_excess(df):
    """Turn dated counts into excess values, with non-positive results blanked to NaN.

    ``df`` is put on the full ``date_range`` (missing days filled with 0)
    before the excess over the baseline threshold is computed.
    """
    daily_counts = df.set_index("date").reindex(date_range).fillna(0)
    excess_df = find_excess(daily_counts)
    remove_negatives(excess_df, "value")
    return excess_df


def counts_2_deficit(df):
    """Turn dated counts into deficit values, with non-positive results blanked to NaN.

    ``df`` is put on the full ``date_range`` (missing days filled with 0)
    before the shortfall below the baseline threshold is computed.
    """
    daily_counts = df.set_index("date").reindex(date_range).fillna(0)
    deficit_df = find_deficit(daily_counts)
    remove_negatives(deficit_df, "value")
    return deficit_df

In [None]:
# Excess daily arrivals for every (affected area, host unit) combination. The
# trailing dropna() discards the rows that remove_negatives blanked out.
excess_arrivals_from_each_aa = (
    arrivals_from_aa.groupby(["index_AA", "admin3name", "admin3pcod", "admin2name"])
    .apply(counts_2_excess)
    .reset_index()
).dropna()
excess_arrivals_from_each_aa

In [None]:
# Restrict the excess arrivals to the week leading up to the report date.
excess_arrivals_past_week = excess_arrivals_from_each_aa.loc[
    excess_arrivals_from_each_aa["date"] >= key_dates["report_date_minus_1_week"]
]
excess_arrivals_past_week

In [None]:
# Keep only the rows whose excess is above 15.
excess_arrivals_past_week = excess_arrivals_past_week.loc[
    excess_arrivals_past_week["value"] > 15
]
excess_arrivals_past_week

In [None]:
excess_arrivals_past_week.date.min(), excess_arrivals_past_week.date.max()

In [None]:
# Past-week excess arrivals summed per (affected area, host unit) ...
total_excess_arrivals_past_week_per_AA = (
    excess_arrivals_past_week.groupby(["index_AA", "admin3pcod"])
    .value.sum()
    .reset_index()
)

# ... and per host unit across all affected areas, with the unit geometry attached.
total_excess_arrivals_past_week = (
    excess_arrivals_past_week.groupby("admin3pcod").value.sum().reset_index()
)
total_excess_arrivals_past_week = gpd.GeoDataFrame(
    total_excess_arrivals_past_week.merge(spatial_geometry, on="admin3pcod")
)

In [None]:
# Excess arrivals from the event date (inclusive) onwards.
excess_arrivals_since_event = excess_arrivals_from_each_aa.loc[
    excess_arrivals_from_each_aa["date"] >= key_dates["event_date"]
]

In [None]:
# Keep only the rows whose excess is above 15.
excess_arrivals_since_event = excess_arrivals_since_event.loc[
    excess_arrivals_since_event["value"] > 15
]
excess_arrivals_since_event

In [None]:
excess_arrivals_since_event.date.min(), excess_arrivals_since_event.date.max()

# host areas per affected area


In [None]:
# Distinct (affected area, host unit) pairs that saw excess arrivals since the
# event, with the affected area's name attached. The sum only serves to collapse
# the rows to one per pair; its result is dropped straight away.
# NOTE(review): the `value > 0` filter looks redundant, since the frame was
# already limited to `value > 15`.
hosts = (
    excess_arrivals_since_event[excess_arrivals_since_event.value > 0]
    .groupby(["index_AA", "admin3pcod"])
    .value.sum()
    .reset_index()
    .drop(columns="value")
    .merge(affected_areas["name"], left_on="index_AA", right_index=True)
)
hosts

In [None]:
# Host pairs with the geometry (and other attributes) of their spatial unit attached.
host_areas = gpd.GeoDataFrame(hosts.merge(spatial_geometry, on="admin3pcod"))
host_areas

### residents (net cumsum)


In [None]:
# arrivals
# Daily total arriving at each destination point, from three months before the
# event onwards; the destination coordinates are renamed to plain lon / lat.
arrivals = (
    home_relocations[home_relocations.date >= key_dates["event_date_minus_3_months"]]
    .groupby(["lon_to", "lat_to", "date"])
    .value.sum()
    .reset_index()
    .rename(columns={"lon_to": "lon", "lat_to": "lat"})
)
arrivals

In [None]:
# departures
# Daily total leaving each origin point, from three months before the event
# onwards; the origin coordinates are renamed to plain lon / lat.
departures = (
    home_relocations[home_relocations.date >= key_dates["event_date_minus_3_months"]]
    .groupby(["lon_from", "lat_from", "date"])
    .value.sum()
    .reset_index()
    .rename(columns={"lon_from": "lon", "lat_from": "lat"})
)

departures

In [None]:
def reindex_dates(df):
    """Return the ``value`` column of ``df`` on the full ``date_range``, missing days as 0."""
    on_full_range = df.set_index("date").reindex(date_range)
    return on_full_range[["value"]].fillna(0)


# Put every point's series on the full date range (days without data become 0).
departures = departures.groupby(["lon", "lat"]).apply(reindex_dates).reset_index()
arrivals = arrivals.groupby(["lon", "lat"]).apply(reindex_dates).reset_index()

# netflows
# An outer merge keeps points that only ever appear as a destination or only as
# an origin; the default inner merge silently dropped them. The side that is
# missing for such a point is counted as 0.
arrivals_departures_net_flows = (
    arrivals.merge(
        departures,
        on=["lon", "lat", "date"],
        how="outer",
        suffixes=("_arrived", "_departed"),
    )
    .fillna({"value_arrived": 0, "value_departed": 0})
    .assign(net_arrived=lambda x: x.value_arrived - x.value_departed)
)

arrivals_departures_net_flows

In [None]:
# netflow cumsum


def netflow_cumsum(df):
    """Return the running total of ``net_arrived`` in date order, indexed by date."""
    by_date = df.set_index("date").sort_index()
    return by_date[["net_arrived"]].cumsum()


# Running net arrivals per point (lat, lon) over time.
net_cumsums = (
    arrivals_departures_net_flows.groupby(["lat", "lon"])
    .apply(netflow_cumsum)
    .reset_index()
    .rename(columns={"net_arrived": "net_arrived_cumsum"})
)

In [None]:
# Point geometry for each row, built from its lon / lat.
net_cumsums["geometry"] = gpd.points_from_xy(net_cumsums.lon, net_cumsums.lat)

In [None]:
# Promote to a GeoDataFrame in EPSG:4326 so it can be spatially joined.
net_cumsums = gpd.GeoDataFrame(net_cumsums).set_crs(epsg=4326)

In [None]:
# Expose the running total under the generic name `value`, which the excess /
# deficit helpers expect. Renaming by name rather than overwriting all labels
# by position means a change in column order cannot silently mislabel the data.
net_cumsums = net_cumsums.rename(columns={"net_arrived_cumsum": "value"})

In [None]:
net_cumsums

In [None]:
# Running net arrivals summed over the points joined to each spatial unit, per day.
net_cumsums_agged = (
    net_cumsums.sjoin(spatial_geometry)
    .groupby([spatial_geometry_unit_column, "date"])
    .value.sum()
    .reset_index()
)
net_cumsums_agged

In [None]:
# Excess of the running net arrivals over the pre-event threshold, per spatial
# unit and day. Days blanked out by remove_negatives become 0.0 here, so every
# unit keeps a row for every date. Unit attributes and geometry are then attached.
excess_residents = (
    net_cumsums_agged.groupby(spatial_geometry_unit_column)
    .apply(counts_2_excess)
    .reset_index()
    .fillna(0.0)
)
excess_residents = excess_residents.merge(
    spatial_geometry, on=spatial_geometry_unit_column
)
excess_residents

In [None]:
# Rows dated strictly after the event day ...
# NOTE(review): the arrivals filter uses `>=` on the event date while this one
# uses `>` - confirm the difference is intended.
excess_residents_hosts_since_event = excess_residents.loc[
    excess_residents["date"] > event_date
]
# ... restricted to the spatial units identified as hosts.
excess_residents_hosts_since_event = excess_residents_hosts_since_event.loc[
    excess_residents_hosts_since_event["admin3pcod"].isin(
        host_areas["admin3pcod"].unique()
    )
]
excess_residents_hosts_since_event

In [None]:
# Latest row per host unit: sort by unit and date, then keep the last row of each unit.
most_recent_excess_residents_per_host = gpd.GeoDataFrame(
    excess_residents_hosts_since_event.sort_values(
        ["admin3pcod", "date"]
    ).drop_duplicates("admin3pcod", keep="last")
)
most_recent_excess_residents_per_host

In [None]:
# Hosts whose latest excess is above 15.
# NOTE(review): the query on host_areas looks redundant, since the input was
# already restricted to host units.
excess_residents_per_host = most_recent_excess_residents_per_host[
    most_recent_excess_residents_per_host.value > 15
].query("admin3pcod in @host_areas.admin3pcod")
excess_residents_per_host

In [None]:
# deficit residents in affected areas

In [None]:
# Running net arrivals summed over the points joined to each affected area, per day.
net_cumsums_aa_agged = (
    net_cumsums.sjoin(affected_areas)
    .groupby(["name", "date"])
    .value.sum()
    .reset_index()
)
net_cumsums_aa_agged

In [None]:
# Shortfall of the running net arrivals below the pre-event threshold, per
# affected area and day; days blanked out by remove_negatives become 0.0.
deficit_aas = (
    net_cumsums_aa_agged.groupby("name")
    .apply(counts_2_deficit)
    .reset_index()
    .fillna(0.0)
)
deficit_aas

In [None]:
# Attach the affected-area geometry to the deficit series, matching on name.
deficit_residents_AA = deficit_aas.merge(affected_areas, on="name")
deficit_residents_AA

---

# 2. Information products

**Produces figures for both national and regional locations depending on the passed parameter '_report_type_'**


###### a. Automated text area with numbers (maybe next to some icons)

1. “total displaced subscribers” (i.e. ‘remaining’ displaced - this is the ‘stocks’), inferred from resident loss in affected areas
2. “total displaced subscribers in known neighbourhoods”, inferred from excess residents in host locations
3. “newly displaced subscribers”, inferred from excess arrivals this week to host locations


In [None]:
# Sum of the past-week excess arrivals over all host units.
newly_displaced = (
    total_excess_arrivals_past_week.value.sum()
)  # newly displaced subscribers from AA
newly_displaced

In [None]:
total_excess_arrivals_past_week_per_AA.groupby("index_AA").sum()

In [None]:
excess_residents_hosts_since_event

In [None]:
# Excess residents summed over all host units, taken on the latest available date.
total_displaced_in_known_areas = (
    excess_residents_hosts_since_event.groupby("date").value.sum().sort_index().iloc[-1]
)  # excess residents found in host areas
total_displaced_in_known_areas

In [None]:
# Resident deficit summed over all affected areas, taken on the latest available date.
total_displaced_from_aa = (
    deficit_residents_AA.groupby("date").value.sum().sort_index().iloc[-1]
)  # deficit residents in affected area
total_displaced_from_aa

In [None]:
# Single-row table holding the three headline figures.
displaced_stats_boxes = pd.DataFrame(
    [[newly_displaced, total_displaced_in_known_areas, total_displaced_from_aa]],
    columns=[
        "newly_displaced",
        "total_displaced_in_known_areas",
        "total_displaced_from_aa",
    ],
)
displaced_stats_boxes

In [None]:
# Write the national headline figures to CSV and record the file in the outputs registry.
out_filepath = output_path / "national" / "displaced_stats_boxes.csv"
outputs_dict["national"]["displaced_stats_boxes"] = str(out_filepath)
displaced_stats_boxes.to_csv(out_filepath)

In [None]:
# deficit per aa

In [None]:
# For each affected area, print and save its deficit on the latest available date.
for AA_name, AA_df in deficit_residents_AA.groupby("name"):
    latest_deficit = AA_df.set_index("date").value.sort_index().iloc[[-1]]
    print(AA_name, latest_deficit)

    out_filepath = (
        output_path / "affected_areas" / slugify(AA_name) / "displaced_stats_boxes.csv"
    )
    outputs_dict["affected_areas"][AA_name]["displaced_stats_boxes"] = str(out_filepath)

    # deficit residents in affected area
    latest_deficit.to_csv(out_filepath)

###### b. Overview map of new arrivals (and showing areas with ‘stocks’, if any, at time of reporting)


In [None]:
# Label for the reporting week, e.g. "<start> and\n<end>".
start_date = key_dates["report_date_minus_1_week"].strftime("%d %b %y")
end_date = key_dates["report_date"].strftime("%d %b %y")
date_string = f"{start_date} and\n{end_date}"

_National_


In [None]:
# NOTE(review): same expression as the earlier `host_areas` assignment - this recomputation looks redundant.
host_areas = gpd.GeoDataFrame(hosts.merge(spatial_geometry, on="admin3pcod"))

In [None]:
# assert that there actually are excesses this week

In [None]:
from visualisations.active_crisis.mapping import (
    plot_national_map,
    plot_regional_map,
    plot_zoomed_regional_map,
    set_default_style_to_flowminder,
)
from visualisations.active_crisis.tabulation import (
    excess_national_arrivals_table,
    excess_regional_arrivals_table,
)
from visualisations.active_crisis.plotting import (
    excess_arrivals_excess_residents_barchart,
    top_3_adm_hosts,
    residents_hosts_vs_aa,
    top_3_adm_hosts_multi_aa,
)

set_default_style_to_flowminder()

# National map; the returned path is recorded in the outputs registry.
national_path = plot_national_map(
    out_folder=output_path,
    spatial_geometry=spatial_geometry,
    total_excess_arrivals_past_week=total_excess_arrivals_past_week,
    host_areas=host_areas,
    affected_areas=affected_areas,
    date_string=date_string,
)
outputs_dict["national"]["all_excess_arrivals"] = str(national_path)

# National table of excess arrivals and excess residents.
excess_national_arrivals_table_path = excess_national_arrivals_table(
    excess_arrivals_past_week=excess_arrivals_past_week,
    most_recent_excess_residents_per_host=most_recent_excess_residents_per_host,
    out_folder=output_path,
)
outputs_dict["national"]["excess_arrivals_excess_residents_table"] = str(
    excess_national_arrivals_table_path
)

# National bar chart built from the same two inputs as the table.
excess_barchart_path = excess_arrivals_excess_residents_barchart(
    excess_arrivals_past_week=excess_arrivals_past_week,
    most_recent_excess_residents_per_host=most_recent_excess_residents_per_host,
    out_folder=output_path,
)
outputs_dict["national"]["excess_arrivals_excess_residents_barchart"] = str(
    excess_barchart_path
)

_regional_


In [None]:
# `date_string` is rebuilt here to span the pre-event baseline: three months
# before the event up to the day before the event.
# NOTE(review): this replaces the reporting-week label used for the national
# outputs - confirm the regional figures are meant to show the baseline period.
start_date = datetime.strftime(key_dates["event_date_minus_3_months"], "%d %b %y")
end_date = datetime.strftime(
    key_dates["event_date"] - pd.DateOffset(days=1), "%d %b %y"
)
date_string = f"{start_date} and\n{end_date}"
# Resident series restricted to host units, and the series per affected area.
residents_hosts = net_cumsums_agged[net_cumsums_agged.admin3pcod.isin(hosts.admin3pcod)]
residents_aa = net_cumsums_aa_agged

# Past-week excess arrivals per (affected area, host unit), with the affected
# area's attributes and the host unit's attributes/geometry attached. On a name
# clash the affected-area column gets a trailing "_" and the unit column keeps its name.
excess_arrivals_past_week_w_geo = gpd.GeoDataFrame(
    total_excess_arrivals_past_week_per_AA.merge(
        affected_areas, left_on="index_AA", right_index=True
    ).merge(
        spatial_geometry,
        left_on="admin3pcod",
        right_on="admin3pcod",
        suffixes=("_", ""),
    )
)
excess_arrivals_past_week_w_geo

In [None]:
# Produce the per-affected-area outputs and record each returned path in the
# outputs registry under that area's name.
for aa, regional_dataframe in excess_arrivals_past_week_w_geo.groupby("name"):
    print(f"Plotting {aa}")

    regional_map_path = plot_regional_map(
        regional_dataframe=regional_dataframe,
        affected_areas=affected_areas,
        spatial_geometry=spatial_geometry,
        host_areas=host_areas,
        output_folder=output_path,
        aa=aa,
        date_str=date_string,
    )
    outputs_dict["affected_areas"][aa]["all_excess_arrivals"] = str(regional_map_path)

    regional_map_path_zoomed = plot_zoomed_regional_map(
        regional_dataframe=regional_dataframe,
        affected_areas=affected_areas,
        spatial_geometry=spatial_geometry,
        host_areas=host_areas,
        output_folder=output_path,
        aa_name=aa,
        date_string=date_string,
    )
    outputs_dict["affected_areas"][aa]["all_excess_arrivals_zoomed"] = str(
        regional_map_path_zoomed
    )

    # TODO: Check if these two dataframes are compatible
    excess_regional_arrivals_table_path = excess_regional_arrivals_table(
        regional_dataframe=regional_dataframe,
        most_recent_excess_residents_per_host=most_recent_excess_residents_per_host,
        aa_name=aa,
        out_folder=output_path,
    )
    outputs_dict["affected_areas"][aa]["excess_arrivals_excess_residents_table"] = str(
        excess_regional_arrivals_table_path
    )

    top_3_adm_hosts_path = top_3_adm_hosts(
        excess_arrivals_df=regional_dataframe,
        arrivals_from_aa=arrivals_from_aa,
        key_dates=key_dates,
        # Use the index of the affected area being plotted in THIS iteration.
        # The earlier expression read the first index of the whole frame (the
        # same area on every pass) and called `.values` on the ndarray returned
        # by Series.unique(), which has no such attribute.
        aa_name=regional_dataframe.index_AA.unique()[0],
        output_folder=output_path,
        date_string=date_string,
    )
    outputs_dict["affected_areas"][aa]["top3_adm_hosts"] = str(top_3_adm_hosts_path)

# National level top3 hosts
# `aa_name` receives the list of every affected-area index present in the frame.
top_3_adm_hosts_multi_aa_path = top_3_adm_hosts_multi_aa(
    excess_arrivals_df=excess_arrivals_past_week_w_geo,
    arrivals_from_aa=arrivals_from_aa,
    key_dates=key_dates,
    aa_name=list(excess_arrivals_past_week_w_geo.index_AA.unique()),
    output_folder=output_path,
    date_string=date_string,
)
outputs_dict["national"]["top3_adm_hosts_multi_aa"] = str(top_3_adm_hosts_multi_aa_path)

# Does this need to be in its own loop?
# This loop iterates over every affected area with resident data, whereas the
# loop above only covers areas that have past-week excess arrivals.
for aa, df in residents_aa.groupby("name"):
    residents_hosts_path = residents_hosts_vs_aa(
        residents_hosts_df=df,
        hosts=hosts,
        residents_hosts=residents_hosts,
        key_dates=key_dates,
        date_string=date_string,
        aa_name=aa,
        output_folder=output_path,
    )

    outputs_dict["affected_areas"][aa]["residents_hosts_vs_aa"] = str(
        residents_hosts_path
    )

In [None]:
most_recent_excess_residents_per_host

In [None]:
excess_arrivals_past_week

###### f. Time series of excess arrivals


###### g. Time series of residents in the AA (departure area) and in the host locations (arrivals area)


# Save the json


In [None]:
import json

In [None]:
# Persist the registry of produced files as JSON next to the outputs.
(output_path / "active_crisis.json").write_text(json.dumps(outputs_dict))

In [None]:
outputs_dict