# Extract data from NOMIS

This notebook extracts data from the NOMIS API. The notebook is targeted at the following table:

+ NM_608_1 - Ethnicity by LSOA (Lower Super Output Area)

The notebook downloads data for each Lower Super Output Area listed in A_Assumptions/nomis_lsoa_codes_dorset.csv, for all ethnicity groups and the total across rural and urban areas.

**Note: The provided LSOA code file is for the ceremonial county of Dorset.**

**This makes around 453 calls to the NOMIS API (one call per LSOA). Completing this for all of England would make 32,844 individual calls.**

**For more complicated queries, it may be appropriate to batch up LSOAs into fewer queries.**

Data is exported to the folder "X_Output" which needs to be created.

In [17]:
from pathlib import Path
from typing import List

import pandas as pd
from pyjstat import pyjstat
from tqdm.notebook import tqdm

In [18]:
# Lookup table of the LSOAs to download; one row per area, including the
# NOMIS geography code used to build each API request.
lsoa_codes_path = "./A_Assumptions/nomis_lsoa_codes_dorset.csv"
nomis_lsoa_codes = pd.read_csv(lsoa_codes_path)

In [19]:
def nomis_url(table_name: str, geography: str) -> str:
    """Build the NOMIS API request URL for one table and one geography.

    Args:
        table_name: NOMIS dataset id. Only "NM_608_1" (Ethnicity, LSOA) has
            query parameters registered here.
        geography: NOMIS geography code, already converted to a string.

    Returns:
        The JSON-stat dataset URL with the geography, the date selection
        ("latest") and the table-specific query parameters appended.

    Raises:
        KeyError: If ``table_name`` has no registered query parameters.
    """
    # Table-specific tail of the query string, keyed by dataset id.
    table_queries = {
        "NM_608_1": "&rural_urban=0&cell=0...18&measures=20100",
    }
    table_query = table_queries[table_name]

    # Comma-separated list of requested dates; only "latest" is requested.
    date_selection = ",".join(("latest",))

    endpoint = f"https://www.nomisweb.co.uk/api/v01/dataset/{table_name}.jsonstat.json?"
    pieces = (
        endpoint,
        "geography=",
        geography,
        "&",
        "date=",
        date_selection,
        table_query,
    )
    return "".join(pieces)


In [20]:
def write_list(output_list: List, output_filename: str) -> None:
    """Write each string in ``output_list`` on its own line under ./X_Output.

    Args:
        output_list: Strings to write; a newline is appended to each one.
        output_filename: Name of the file to create inside ``./X_Output``.
            An existing file with the same name is overwritten.
    """
    # Create the output folder on demand so a missing folder does not abort
    # the write.
    Path("./X_Output").mkdir(parents=True, exist_ok=True)
    # Pin the encoding so the file content does not depend on the platform
    # default.
    with open(f"./X_Output/{output_filename}", "w", encoding="utf-8") as textfile:
        textfile.writelines(el + "\n" for el in output_list)

In [21]:
# Display the column labels of the LSOA lookup table loaded above.
nomis_lsoa_codes.columns

Index(['UTLA20CD', 'UTLA20NM', 'LSOA11CD', 'LSOA11NM', 'NOMIS Code'], dtype='object')

In [22]:
# Download the ethnicity table for every LSOA in the lookup (one API call per
# row) and stack the per-LSOA results into a single DataFrame.
population_urls = []  # every URL requested, in request order
lsoa_frames = []
for _, geography in tqdm(nomis_lsoa_codes.iterrows(), total=len(nomis_lsoa_codes)):
    url = nomis_url("NM_608_1", str(geography["NOMIS Code"]))
    population_urls.append(url)
    dataset = pyjstat.Dataset.read(url)

    df: pd.DataFrame = dataset.write('dataframe')  # type: ignore
    # Tag every returned row with the identifiers of the LSOA it was
    # requested for.
    df['NOMIS Code'] = geography["NOMIS Code"]
    df['LSOA11NM'] = geography['LSOA11NM']
    df['LSOA11CD'] = geography["LSOA11CD"]
    lsoa_frames.append(df)

# Concatenate once at the end: calling pd.concat inside the loop re-copies the
# accumulated rows on every iteration. An empty lookup still yields an empty
# DataFrame, because pd.concat raises on an empty list.
population = pd.concat(lsoa_frames, axis=0) if lsoa_frames else pd.DataFrame()

  0%|          | 0/452 [00:00<?, ?it/s]

In [23]:
# Create the output folder if it is missing, so the export does not fail
# after all the API calls have completed.
Path("./X_Output").mkdir(parents=True, exist_ok=True)
# index=False: the DataFrame index is not written to the file.
population.to_csv("./X_Output/lsoa_ethnicity_dorset.csv", index=False)