This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0.

If a copy of the MPL was not distributed with this file, You can obtain one at https://mozilla.org/MPL/2.0/.

# Populating the database

This notebook will guide you through the process of adding data to the database.

First we import the required libraries and check the connection works.

**Note for CGP**:
Because containers are spun up only when "poked", you may need to run this twice in order to give the container time to spin up if you receive a `TimeoutError` after making the request.

In [1]:
import httpx
import csv
import json
import os
import gzip
import asyncio
import time
import itertools
import pandas as pd
from datetime import datetime
from dateutil import parser

# Switch between the synthetic sample files (True) and the real data files (False).
SYNTHETIC = False

# Backend and Auth0 settings come from the environment; each is None when unset.
AUDIENCE = os.environ.get("AUTH0_AUDIENCE")
BASE_URL = os.environ.get("INGESTION_BASE_URL")
AUTH0_DOMAIN = os.environ.get("INGESTION_AUTH0_DOMAIN")
# Credentials of the "updater" machine-to-machine client
AUTH0_CLIENT_ID_UPDATER = os.environ.get("INGESTION_AUTH0_CLIENT_ID_UPDATER")
AUTH0_CLIENT_SECRET_UPDATER = os.environ.get("INGESTION_AUTH0_CLIENT_SECRET_UPDATER")
# Credentials of the "admin" machine-to-machine client
AUTH0_CLIENT_ID_ADMIN = os.environ.get("INGESTION_AUTH0_CLIENT_ID_ADMIN")
AUTH0_CLIENT_SECRET_ADMIN = os.environ.get("INGESTION_AUTH0_CLIENT_SECRET_ADMIN")


def log(response):
    """Print a response body as indented JSON.

    The output is prefixed with ``"<status_code>: "`` unless the status is 200.

    Args:
        response: Object exposing ``status_code`` and, optionally, ``content``
            holding the raw body (used in this notebook with ``httpx`` responses).

    Prints ``<no content>`` when the body is missing or empty, and
    ``<could not decode response>`` when it cannot be parsed as JSON.
    """
    to_print = f"{response.status_code}: " if response.status_code != 200 else ""
    content = getattr(response, "content", None)
    if content is not None and content != b"":
        try:
            to_print += json.dumps(json.loads(content), indent=4)
        except (TypeError, ValueError):
            # JSONDecodeError and UnicodeDecodeError are both ValueError subclasses;
            # TypeError covers a body that is neither str nor bytes.
            to_print += "<could not decode response>"
    else:
        to_print += "<no content>"
    print(to_print)


# Call the heartbeat endpoint to confirm the backend is reachable, then show what it reports.
heartbeat_url = f"{BASE_URL}/heartbeat"
response = httpx.get(heartbeat_url)

app_name = os.getenv("APP_NAME")
print(f"Welcome to {app_name}!")
log(response)

Welcome to flowkit-ui-backend!
{
    "datetime": "2023-07-17T09:21:56.715448+00:00",
    "docker_image": "flowminder/flowkit-ui-backend:013015e",
    "git_branch": "main",
    "git_commit": "013015e",
    "git_tag": null,
    "python_package": "flowkit-ui-backend",
    "python_version": "3.9.17",
    "api_version": "1.3.0",
    "api_version_url_appendix": "v1"
}


Then we obtain M2M tokens to execute the requests.

In [2]:
AUDIENCE

'https://flowkit-ui-backend.flowminder.org'

In [4]:
def _fetch_m2m_token(client_id, client_secret):
    """Request an access token from Auth0 using the client-credentials grant.

    Args:
        client_id: Auth0 client id of the machine-to-machine application.
        client_secret: Matching client secret.

    Returns:
        The ``access_token`` string from the Auth0 response.

    Raises:
        httpx.HTTPStatusError: If Auth0 answers with an error status.
    """
    response = httpx.post(
        url=f"https://{AUTH0_DOMAIN}/oauth/token",
        # `json=` serialises (and escapes) the payload and sets the JSON
        # Content-Type header, so quotes or backslashes in a secret cannot
        # corrupt the request body.
        json={
            "client_id": client_id,
            "client_secret": client_secret,
            "audience": AUDIENCE,
            "grant_type": "client_credentials",
        },
    )
    # Show the status first so a failed request is visible before anything raises.
    print(response)
    response.raise_for_status()
    return json.loads(response.content)["access_token"]


admin_token = _fetch_m2m_token(AUTH0_CLIENT_ID_ADMIN, AUTH0_CLIENT_SECRET_ADMIN)
updater_token = _fetch_m2m_token(AUTH0_CLIENT_ID_UPDATER, AUTH0_CLIENT_SECRET_UPDATER)

<Response [200 OK]>
<Response [200 OK]>


Now we get some info from the backend so we know what's already in the database.
If the database has been re-provisioned, this may come back empty. If that happens, don't worry and proceed to the next step where the cause for this issue will be rectified.

We'll do a quick check for `categories` but you can also check `languages`, `indicators` or any other top-level element in the `config.json` file.

In [7]:
# Ask the backend which categories it currently holds and show the raw answer.
auth_header = {"Authorization": f"Bearer {admin_token}"}
response = httpx.get(url=f"{BASE_URL}/categories", headers=auth_header)
log(response)
payload = json.loads(response.content)
categories = payload["categories"]

{
    "categories": [
        {
            "category_id": "residents",
            "type": "single_location",
            "order": 1,
            "flowgeek_url": "https://www.flowgeek.org/methods/calculating-mobility-indicators/residents-indicators",
            "label": "Residents",
            "description": "Residents-class indicators describe long-term (monthly) changes in the number of people whose home location is within each area.",
            "label_fr": "R\u00e9sidents",
            "description_fr": "Les indicateurs relatifs aux r\u00e9sidents d\u00e9crivent les variations (mensuelles) \u00e0 long terme du nombre de personnes dont le lieu de r\u00e9sidence se trouve dans chaque zone."
        },
        {
            "category_id": "relocations",
            "type": "flow",
            "order": 2,
            "flowgeek_url": "https://www.flowgeek.org/methods/calculating-mobility-indicators/relocation-indicators/",
            "label": "Relocation",
            "description"

If any of categories, indicators, spatial or temporal resolutions are missing, we need to load the config first and then repeat the data retrieval.

Since the payload can get quite large, we'll compress it before sending it to the API. The backend API supports both compressed and uncompressed requests, provided you set the appropriate encoding in the header:

```python
headers={
    # always send the type
    "Content-Type": "application/json",
    # encoding required for gzip-compressed payloads
    "Content-Encoding": "gzip",
    [...]
}
```

In [8]:
# get config directly from the resources
# Read the config straight from the resources folder.
# The encoding is explicit because JSON files are UTF-8 by specification,
# whereas open() would otherwise fall back to the platform's locale encoding.
with open("../impl/resources/config.json", encoding="utf-8") as json_data:
    config = json.load(json_data)

In [None]:
# Upload the whole config to /setup as one gzip-compressed JSON request.
# The body is raw bytes, so it is passed as `content=`; httpx reserves `data=`
# for form fields and only accepts bytes there as a deprecated fallback.
response = httpx.post(
    url=f"{BASE_URL}/setup",
    headers={
        "Content-Type": "application/json",
        "Content-Encoding": "gzip",
        "Authorization": f"Bearer {admin_token}",
    },
    content=gzip.compress(json.dumps(config).encode("utf-8")),
    # generous timeout, kept from the original call
    timeout=3600,
)
log(response)

Either way, the db should now have a basic setup.
Let's check if we have all the metadata we need before we proceed.
While we're at it, we save the categories so we can use them for the ingestion in the next step. We'll do the same for the indicators, and spatial and temporal resolutions as well.

In [9]:
# Fetch the categories again and keep them for the ingestion steps below.
auth_header = {"Authorization": f"Bearer {admin_token}"}
response = httpx.get(url=f"{BASE_URL}/categories", headers=auth_header)
log(response)
categories = json.loads(response.content)["categories"]
# category_id -> category type, as reported by the backend
category_type_lookup = dict(
    (category["category_id"], category["type"]) for category in categories
)

{
    "categories": [
        {
            "category_id": "residents",
            "type": "single_location",
            "order": 1,
            "flowgeek_url": "https://www.flowgeek.org/methods/calculating-mobility-indicators/residents-indicators",
            "label": "Residents",
            "description": "Residents-class indicators describe long-term (monthly) changes in the number of people whose home location is within each area.",
            "label_fr": "R\u00e9sidents",
            "description_fr": "Les indicateurs relatifs aux r\u00e9sidents d\u00e9crivent les variations (mensuelles) \u00e0 long terme du nombre de personnes dont le lieu de r\u00e9sidence se trouve dans chaque zone."
        },
        {
            "category_id": "relocations",
            "type": "flow",
            "order": 2,
            "flowgeek_url": "https://www.flowgeek.org/methods/calculating-mobility-indicators/relocation-indicators/",
            "label": "Relocation",
            "description"

In [10]:
# Fetch the indicator definitions; they are iterated later when the scope mappings are built.
indicators_url = f"{BASE_URL}/indicators"
response = httpx.get(
    url=indicators_url,
    headers={"Authorization": f"Bearer {admin_token}"},
)
log(response)
indicators = json.loads(response.content)["indicators"]

{
    "indicators": [
        {
            "indicator_id": "residents.residents",
            "category_id": "residents",
            "order": 1,
            "flowgeek_url": "https://www.flowgeek.org/methods/calculating-mobility-indicators/residents-indicators#residents",
            "label": "Residents",
            "description": "Estimates the number of people residing in an area during the month selected by the user.",
            "method": "The residents indicator estimates the number of people residing in each area during the month selected by the user.\n\nThe indicator is calculated from the net inflow indicator and the baseline number of residents in the area during a reference period.\n\nA subscriber's home location is determined by the area containing the cell tower which most frequently routed the subscriber's last network event (e.g. call, SMS message, or mobile data) of the day over the previous four weeks, updated monthly.",
            "scale": "sequential",
           

In [11]:
# Fetch the spatial resolutions known to the backend.
auth_header = {"Authorization": f"Bearer {admin_token}"}
response = httpx.get(url=f"{BASE_URL}/spatial_resolutions", headers=auth_header)
log(response)
spatial_resolutions = json.loads(response.content)["spatial_resolutions"]
# "adm<index>" -> srid, e.g. "adm3" for the resolution whose index is 3
srid_lookup = dict(
    (f"adm{resolution['index']}", resolution["srid"])
    for resolution in spatial_resolutions
)

{
    "spatial_resolutions": [
        {
            "srid": 3,
            "label": "Communal section",
            "index": 3,
            "description": "A communal section is a third-level administrative division in Haiti.",
            "boundaries": null,
            "label_fr": "Section communale",
            "description_fr": "La section communale est une division administrative de troisi\u00e8me niveau en Ha\u00efti."
        }
    ]
}


In [12]:
# Fetch the temporal resolutions known to the backend.
auth_header = {"Authorization": f"Bearer {admin_token}"}
response = httpx.get(url=f"{BASE_URL}/temporal_resolutions", headers=auth_header)
log(response)
temporal_resolutions = json.loads(response.content)["temporal_resolutions"]
# relativedelta unit ("months", "days", ...) -> trid
trid_lookup = dict(
    (resolution["relativedelta_unit"], resolution["trid"])
    for resolution in temporal_resolutions
)

{
    "temporal_resolutions": [
        {
            "trid": 2,
            "label": "Month",
            "index": 1,
            "relativedelta_unit": "months",
            "relativedelta_num": 1,
            "date_format": "%Y-%m",
            "default_selected": 12,
            "description": "A calendar month",
            "label_fr": "Mois",
            "description_fr": "Mois civil"
        },
        {
            "trid": 4,
            "label": "Day",
            "index": 3,
            "relativedelta_unit": "days",
            "relativedelta_num": 1,
            "date_format": "%Y-%m-%d",
            "default_selected": 7,
            "description": "A day starting at 00:00:00 and ending at 23:59:59 on the same date",
            "label_fr": "Journ\u00e9e",
            "description_fr": "Une journ\u00e9e commen\u00e7ant \u00e0 00:00:00 et se terminant \u00e0 23:59:59 \u00e0 la m\u00eame date"
        }
    ]
}


## Data cleaning

Each file is a csv containing indicators for multiple dates for one category, at one spatial resolution and one temporal resolution. At present, there are not multiple resolutions per indicator, so we're assuming the files are just named for the category, e.g. `residents.csv`.

We're going to rename the date and spatial columns, then create one dataset file per indicator and date. Each file is a JSON document structured as follows:

```
{
    "metadata": {
        "revision": <version>,
        # adding a date here which will be overwritten later when it is actually added to the db
        # this is to avoid a fastapi.exceptions.RequestValidationError for checking the length of a "None" type
        "date_added": <datetime_now>,
        "category_id": <category_id>,
        "indicator_id": <indicator_id>,
        "srid": <srid>,
        "trid": <trid>,
        "dt": <date>,
    },
    "data_type": <category_type>,
    "data_input": [
        {
            "spatial_unit_ids": <list_of_ids>,
            "data": <value>,
        }
    ],
}
```

In [13]:
import pathlib

In [15]:
# Input CSVs (file name without extension) and the category / resolutions each one holds.
real_data_files = {
    "residents": {"category_id": "residents", "srid": srid_lookup["adm3"], "trid": trid_lookup["months"]},
    "relocations": {"category_id": "relocations", "srid": srid_lookup["adm3"], "trid": trid_lookup["months"]},
    "presence": {"category_id": "presence", "srid": srid_lookup["adm3"], "trid": trid_lookup["days"]},
    "movements": {"category_id": "movements", "srid": srid_lookup["adm3"], "trid": trid_lookup["days"]},
}
synthetic_files = {
    "residents_admin3_monthly_small": {"category_id": "residents", "srid": srid_lookup["adm3"], "trid": trid_lookup["months"]},
    "relocations_admin3_monthly_small": {"category_id": "relocations", "srid": srid_lookup["adm3"], "trid": trid_lookup["months"]},
    "presence_admin3_daily_small": {"category_id": "presence", "srid": srid_lookup["adm3"], "trid": trid_lookup["days"]},
    "movements_admin3_daily_small": {"category_id": "movements", "srid": srid_lookup["adm3"], "trid": trid_lookup["days"]},
}

data_version = "v1.0.2"
files = synthetic_files if SYNTHETIC else real_data_files

parent_dir = "../impl/resources"
data_dir = f"{parent_dir}/data/synthetic" if SYNTHETIC else f"{parent_dir}/data"

# Dataset files go to ./tmp, which is where the ingestion step globs them from.
# Create it up front so the first open(..., "w") cannot fail on a missing directory.
pathlib.Path("./tmp").mkdir(parents=True, exist_ok=True)


def to_su_list(val):
    """Return the spatial unit id(s) of one row as a list.

    A tuple (several index levels, e.g. origin and destination) is unpacked
    into a list; any other value is wrapped in a one-element list.
    """
    if isinstance(val, tuple):
        return [*val]
    else:
        return [val]


for file_name, meta in files.items():
    category_id = meta["category_id"]
    file_path = f"{data_dir}/{file_name}.csv"
    df = pd.read_csv(file_path)  # First column is the date
    # Branch on the category rather than the file name, so the synthetic files
    # (e.g. "residents_admin3_monthly_small") get indexed as well.
    # NOTE(review): assumes the synthetic CSVs use the same column names as the
    # real ones - confirm.
    if category_id in ("residents", "presence"):
        # min columns: date, spatial unit, one data column
        df = df.rename(columns={"pcod": "spatial_unit"})
        df["date"] = pd.to_datetime(df.date)
        df = df.set_index(["date", "spatial_unit"])
    elif category_id in ("relocations", "movements"):
        # min columns: date, 2 spatial units, one data column
        df = df.rename(
            columns={"month": "date", "pcod_from": "origin", "pcod_to": "destination"}
        )
        df["date"] = pd.to_datetime(df.date)
        df = df.set_index(["date", "origin", "destination"])
    else:
        raise ValueError(f"Don't know how to index data for category '{category_id}'")
    df = df.sort_index()
    # Looked up once per file; an unknown category now fails loudly instead of
    # silently producing no dataset files.
    data_type = category_type_lookup[category_id]
    for column in df.columns:
        # Discard inf as well as na. The infinities are converted explicitly
        # because the pandas option `use_inf_as_na` is deprecated.
        indicator_df = (
            df[[column]].replace([float("inf"), float("-inf")], float("nan")).dropna()
        )
        # Only dates that still have rows: `index.levels[0]` would also list
        # dates whose rows were all dropped above.
        dates = indicator_df.index.get_level_values(0).unique()
        for dt in dates:
            dt_str = dt.strftime("%Y-%m-%dT%H:%M:%S")
            fname = f"./tmp/{category_id}_{column}_{meta['srid']}_{meta['trid']}_{dt_str}_{data_version}.json"
            # Files from a previous run are kept as they are.
            if pathlib.Path(fname).exists():
                continue
            dataset = {
                "metadata": {
                    "revision": data_version,
                    # adding a date here which will be overwritten later when it is actually added to the db
                    # this is to avoid a fastapi.exceptions.RequestValidationError for checking the length of a "None" type
                    "date_added": datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S"),
                    "category_id": category_id,
                    "indicator_id": f"{category_id}.{column}",
                    "srid": meta["srid"],
                    "trid": meta["trid"],
                    "dt": dt_str,
                },
                "data_type": data_type,
                "data_input": [
                    {
                        # rw[0] is the remaining index (one id, or an
                        # (origin, destination) tuple); rw[1] is the value
                        "spatial_unit_ids": to_su_list(rw[0]),
                        "data": rw[1],
                    }
                    for rw in indicator_df.loc[dt].itertuples()
                ],
            }
            with open(fname, "w") as fout:
                json.dump(dataset, fout)
            print(f"Wrote {fname}")


## Data ingestion

Now we can load the data we want to ingest. We'll glob all the files for the data version, and post them up. (In fact, we're going to send a PATCH so we replace what's there.)


It's still recommended to compress the request body using `gzip`.

In [51]:
# Count the dataset files for this data version. They are written to (and later
# read from) ./tmp relative to the notebook, not the system-wide /tmp.
sum(1 for _ in pathlib.Path("tmp").glob(f"*_{data_version}.json"))

10384

In [23]:
CHUNK_SIZE = 20

import pathlib

def chunked_iterable(iterable, size):
    """Lazily split *iterable* into consecutive tuples of at most *size* items.

    The final tuple may be shorter than *size*; an empty iterable yields nothing.
    """
    source = iter(iterable)

    def take():
        return tuple(itertools.islice(source, size))

    # iter(callable, sentinel) keeps calling take() until it returns the empty tuple
    yield from iter(take, ())


async def post_async(ds, client):
    """Send one dataset to the ``/data`` endpoint as a gzip-compressed JSON ``PATCH``.

    Args:
        ds: JSON-serialisable dataset. Values the ``json`` module cannot
            encode are stringified (``default=str``).
        client: Async HTTP client used to send the request (an
            ``httpx.AsyncClient`` in this notebook).

    Returns:
        The response returned by ``client.patch``.
    """
    body = gzip.compress(json.dumps(ds, default=str).encode("utf-8"))
    return await client.patch(
        url=f"{BASE_URL}/data",
        headers={
            "Content-Type": "application/json",
            "Content-Encoding": "gzip",
            "Authorization": f"Bearer {admin_token}",
        },
        # raw bytes belong in `content=`; httpx reserves `data=` for form fields
        content=body,
        timeout=3600,
    )

def yield_files_for_version(data_version):
    """Yield the parsed content of every ``tmp/*_<data_version>.json`` file.

    Files that cannot be parsed as JSON are reported (the error, then the
    file name) and skipped.

    Args:
        data_version: Version suffix the file names must end with.

    Yields:
        The decoded JSON document of each readable file.
    """
    for fname in pathlib.Path("tmp").glob(f"*_{data_version}.json"):
        try:
            with open(fname) as fin:
                dataset = json.load(fin)
        except ValueError as exc:
            # JSONDecodeError and UnicodeDecodeError are both ValueError subclasses
            print(exc)
            print(fname)
            continue
        # Yield only after the file is closed and outside the try block, so the
        # handle is not held open while the consumer works and errors raised by
        # the consumer are not mistaken for a bad file.
        yield dataset

def count_files_for_version(data_version):
    """Return how many ``tmp/*_<data_version>.json`` files exist."""
    matches = pathlib.Path("tmp").glob(f"*_{data_version}.json")
    return sum(1 for _ in matches)

async def ingest_data_version(data_version, chunksize):
    """Upload every dataset file of *data_version*, ``chunksize`` requests at a time.

    Each chunk is sent concurrently through one shared ``httpx.AsyncClient``.
    A dot is printed per chunk, every response whose status is not 201 or 204
    is logged, and a summary line is printed at the end.

    Args:
        data_version: Version suffix of the dataset files to upload.
        chunksize: Number of datasets sent concurrently per batch.
    """
    n_datasets = count_files_for_version(data_version)
    print(f"Starting ingestion of data for {n_datasets} indicators...")
    num = 0
    async with httpx.AsyncClient() as client:
        for chunk in chunked_iterable(yield_files_for_version(data_version), size=chunksize):
            print(".", end="", flush=True)
            responses = await asyncio.gather(*(post_async(ds, client) for ds in chunk))
            for response in responses:
                if response.status_code in (201, 204):
                    num += 1
                else:
                    # start a new line so the log does not run into the progress dots
                    print("")
                    log(response)
    # Report the tally so a partial ingestion is obvious without scrolling the log.
    print("")
    print(f"Ingested {num} of {n_datasets} datasets.")

In [None]:
# Top-level `await` is used here because this runs as a notebook cell.
# In a plain Python script, wrap the call instead:
#     asyncio.run(ingest_data_version(data_version, CHUNK_SIZE))
await ingest_data_version(data_version, CHUNK_SIZE)

Done! Provided you got all `201` or `204` responses (i.e. no errors), the data should now be in the database!

## Data permissions & access

The data is now in the database, but without access management, only administrators will be able to see the data by default.
To enable access by users depending on their roles, we need to define what scopes give access to which part of the data.
We use an "allow-list" style access management so we have to define each bit of data that will be accessible to users that aren't admins.
We do that using JSON. Each key is the name of a scope as defined in Auth0 (see also the API spec) and each value is a set of queries (as per API spec) that define a set of data.

In [16]:
# Allow-list of the data each Auth0 scope may read.
# NOTE(review): `duration` appears to be counted in monthly steps (the query
# cell multiplies it by 28 for daily indicators) - confirm against the API spec.
free_tier = {"start_date": "2020-01", "duration": 5}
premium_tier = {"start_date": "2020-01", "duration": 9999}
data_access = {
    "read:free_data": free_tier,
    "read:premium_data": premium_tier,
}

In [17]:
trid_lookup

{'months': 2, 'days': 4}

Now we need to get the metadata IDs of the specified data:

In [18]:
# `files` is keyed by file name, which equals the category id only for the real
# data files; index the resolutions by category so the synthetic files work too.
resolution_by_category = {meta["category_id"]: meta for meta in files.values()}

# Collect one (scope, mdid) pair for every dataset each scope is allowed to read.
scope_mappings = []
for scope, query in data_access.items():
    for indicator in indicators:
        resolution = resolution_by_category[indicator["category_id"]]
        query_to_sub = {
            **query,
            "mdids_only": True,
            "category_id": indicator["category_id"],
            "indicator_id": indicator["indicator_id"],
            "srid": resolution["srid"],
            "trid": resolution["trid"],
        }
        if resolution["trid"] == trid_lookup["days"]:
            # daily indicators: the configured duration is scaled by 28
            query_to_sub["duration"] = query["duration"] * 28
        response = httpx.post(
            url=f"{BASE_URL}/query",
            headers={
                "Content-Type": "application/json",
                "Content-Encoding": "gzip",
                "Authorization": f"Bearer {admin_token}",
            },
            # raw bytes belong in `content=`; httpx reserves `data=` for form fields
            content=gzip.compress(json.dumps(query_to_sub).encode("utf-8")),
        )
        try:
            mdids = json.loads(response.content)["mdids"]
        except (ValueError, KeyError, TypeError):
            # Show what the backend actually returned before re-raising.
            log(response)
            raise
        scope_mappings += [(scope, mdid) for mdid in mdids]
print(scope_mappings)

[('read:free_data', '64740'), ('read:free_data', '66692'), ('read:free_data', '62032'), ('read:free_data', '60468'), ('read:free_data', '63270'), ('read:free_data', '62677'), ('read:free_data', '58347'), ('read:free_data', '65389'), ('read:free_data', '66975'), ('read:free_data', '61872'), ('read:free_data', '58148'), ('read:free_data', '62585'), ('read:free_data', '66239'), ('read:free_data', '64062'), ('read:free_data', '59708'), ('read:free_data', '60469'), ('read:free_data', '61198'), ('read:free_data', '67626'), ('read:free_data', '64787'), ('read:free_data', '58986'), ('read:free_data', '21632'), ('read:free_data', '21632'), ('read:free_data', '21632'), ('read:free_data', '21637'), ('read:free_data', '21637'), ('read:free_data', '21637'), ('read:free_data', '58706'), ('read:free_data', '64986'), ('read:free_data', '60337'), ('read:free_data', '63455'), ('read:free_data', '66908'), ('read:free_data', '61926'), ('read:free_data', '65805'), ('read:free_data', '63056'), ('read:free_d

Next we can ingest the scope mappings using the `/scope_mapping` endpoint:

In [19]:
# Post the scope mappings one at a time.
# A single Client keeps the connection open between requests; calling
# httpx.post() in the loop would set up a new connection for every pair.
with httpx.Client() as client:
    for scope, mdid in scope_mappings:
        scope_mapping = {"scope": scope, "mdid": mdid}
        response = client.post(
            url=f"{BASE_URL}/scope_mapping",
            headers={
                "Content-Type": "application/json",
                "Content-Encoding": "gzip",
                "Authorization": f"Bearer {admin_token}",
            },
            # raw bytes belong in `content=`; httpx reserves `data=` for form fields
            content=gzip.compress(json.dumps(scope_mapping).encode("utf-8")),
        )
        # Only statuses outside 201/204/303 are logged.
        if response.status_code not in [201, 204, 303]:
            log(response)
print("Done.")

KeyboardInterrupt: 

In [None]:
# Post a single scope mapping: whatever `scope_mapping` currently holds
# (it is assigned inside the sequential loop cell).
response = httpx.post(
    url=f"{BASE_URL}/scope_mapping",
    headers={
        "Content-Type": "application/json",
        "Content-Encoding": "gzip",
        "Authorization": f"Bearer {admin_token}",
    },
    # raw bytes belong in `content=`; httpx reserves `data=` for form fields
    content=gzip.compress(json.dumps(scope_mapping).encode("utf-8")),
)

In [30]:
num = 0
async with httpx.AsyncClient() as client:
    for chunk in chunked_iterable(scope_mappings, size=25):
        print(".", end="", flush=True)
        responses = await asyncio.gather(*(client.post(
            url=f"{BASE_URL}/scope_mapping",
            headers={
                "Content-Type": "application/json",
                "Authorization": f"Bearer {admin_token}",
            },
            data=json.dumps({"scope": scope, "mdid": mdid}).encode("utf-8"),
        ) for scope, mdid in chunk))
        for response in responses:
            if response.status_code not in [201, 204]:
                pass
            else:
                num += 1

............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

If no status codes other than `201` or `204` come back, then the ingestion of scope mappings worked and the data is now tagged.

Note that to delete data, you need to use the `httpx.request()` method as in the example below.
The reason is that the `httpx.delete()` method does not support a body although the spec does not explicitly forbid it.

```python
response = httpx.request(
    url=f"{BASE_URL}/scope_mapping",
    method="DELETE",
    headers={
        "Content-Type": "application/json",
        "Content-Encoding": "gzip",
        "Authorization": f"Bearer {admin_token}",
    },
    data=gzip.compress(json.dumps(scope_mapping).encode("utf-8")),
)
```

In [None]:
# NOTE(review): this cell is unfinished and will not run - the `for d` line
# below is an incomplete statement. It also uses `query` and `scope`, which are
# not defined in this cell and would only exist as leftovers from earlier loops.
for indicator in indicators:
    for d
        query_to_sub = {
            "start_date": "2020-01",
            "duration": 1,
        }
        query_to_sub["mdids_only"] = True
        query_to_sub['category_id'] = indicator['category_id']
        query_to_sub['indicator_id'] = indicator['indicator_id']
        query_to_sub['srid'] = files[indicator['category_id']]['srid']
        query_to_sub['trid'] = files[indicator['category_id']]['trid']
        if files[indicator['category_id']]['trid'] == trid_lookup['days']:
            query_to_sub['duration'] = query['duration']*28
        response = httpx.post(
            url=f"{BASE_URL}/query",
            headers={
                "Content-Type": "application/json",
                "Content-Encoding": "gzip",
                "Authorization": f"Bearer {admin_token}",
            },
            data=gzip.compress(json.dumps(query_to_sub).encode("utf-8")),
        )
        scope_mappings += [(scope, mdid) for mdid in json.loads(response.content)["mdids"]]

In [None]:
# Clean out non 1.0.2 data

async with httpx.AsyncClient() as client:
    for chunk in chunked_iterable(scope_mappings, size=30):
        print(".", end="", flush=True)
        responses = await asyncio.gather(*(client.post(
            url=f"{BASE_URL}/scope_mapping",
            headers={
                "Content-Type": "application/json",
                "Authorization": f"Bearer {admin_token}",
            },
            data=json.dumps({"scope": scope, "mdid": mdid}).encode("utf-8"),
        ) for scope, mdid in chunk))
        for response in responses:
            if response.status_code not in [201, 204]:
                print("")
                log(response)
            else:
                num += 1