# Extract data from NOMIS

This notebook extracts data from the NOMIS API. Its purpose is to test the API interface and to demonstrate that data is missing when a request combines multiple dates with other parameters such as `cause_of_death`, `gender` and `age`.

When this notebook was created, no data was returned for the first date in the request.

In [7]:
from typing import List, Optional

import pandas as pd
from pyjstat import pyjstat
from tqdm.notebook import tqdm

In [8]:
# Lookup table of geographies to query. The download loop further down reads
# the GEOGRAPHY, GEOGRAPHY_NAME and GEOGRAPHY_CODE columns from each row.
nomis_la_codes = pd.read_csv("./A_Assumptions/nomis_la_codes.csv")

In [9]:
def nomis_url(
    table_name: str, geography: str, dates: Optional[List[str]] = None
) -> str:
    """Build the NOMIS JSON-stat API URL for one table and one geography.

    Args:
        table_name: NOMIS dataset id. Supported values are ``"NM_161_1"``
            (deaths) and ``"NM_31_1"`` (population).
        geography: Value for the ``geography`` query parameter; it is
            inserted into the URL as given.
        dates: Date selectors for the ``date`` query parameter, joined with
            commas. When omitted, ``"latest"`` followed by
            ``"latestMINUS1"`` .. ``"latestMINUS6"`` is requested.

    Returns:
        The complete request URL as a string.

    Raises:
        KeyError: If ``table_name`` is not one of the supported tables.
    """
    # Fixed, table-specific filter parameters appended after the date list.
    query_by_table = {
        "NM_161_1": "&cause_of_death=0&gender=0&age=0&measure=1&measures=20100",
        "NM_31_1": "&sex=5...7&age=0...19&measures=20100",
    }
    if table_name not in query_by_table:
        # Same exception type as a plain dict lookup, but with a usable message.
        raise KeyError(
            f"Unsupported NOMIS table {table_name!r}; "
            f"expected one of {sorted(query_by_table)}"
        )

    if dates is None:
        dates = ["latest"] + [f"latestMINUS{i}" for i in range(1, 7)]
    date_enc = ",".join(dates)

    return (
        f"https://www.nomisweb.co.uk/api/v01/dataset/{table_name}.jsonstat.json?"
        f"geography={geography}&date={date_enc}{query_by_table[table_name]}"
    )


In [10]:
def write_list(output_list: List, output_filename: str) -> None:
    """Write the elements of ``output_list`` to a text file, one per line.

    The file is created (or overwritten) at ``./X_Output/<output_filename>``.
    Each element is concatenated with a newline, so elements must be strings.
    """
    destination = f"./X_Output/{output_filename}"
    with open(destination, "w") as handle:
        handle.writelines(item + "\n" for item in output_list)

In [11]:
# Download the deaths table (NM_161_1) for each geography in nomis_la_codes,
# tagging every returned row with the geography it was requested for.
# Frames are collected in a list and concatenated once at the end, which avoids
# re-copying the accumulated data on every iteration.
geography_urls = []
frames = []
for _, geography in tqdm(nomis_la_codes.iterrows(), total=len(nomis_la_codes)):
    url = nomis_url("NM_161_1", str(geography["GEOGRAPHY"]))
    geography_urls.append(url)  # keep every requested URL for the output log
    dataset = pyjstat.Dataset.read(url)
    df: pd.DataFrame = dataset.write('dataframe')  # type: ignore
    df['GEOGRAPHY'] = geography["GEOGRAPHY"]
    df['GEOGRAPHY_NAME'] = geography["GEOGRAPHY_NAME"]
    df['GEOGRAPHY_CODE'] = geography["GEOGRAPHY_CODE"]
    frames.append(df)
    break  # NOTE: THIS WILL STOP THE LOOP after the first geography; remove to fetch all

# pd.concat raises on an empty list, so fall back to an empty frame.
deaths = pd.concat(frames, axis=0) if frames else pd.DataFrame()

0it [00:00, ?it/s]

In [12]:
# Persist the downloaded deaths data as CSV in ./X_Output.
deaths.to_csv("./X_Output/la_deaths_all_causes_only.csv")
# Save the request URLs alongside it, one per line, so each request can be re-run by hand.
write_list(geography_urls, "mortality_urls_all_causes_only.txt")