Download MSOA layer

In [None]:
import pathlib, requests, tqdm, geopandas as gpd, os

# Directory holding every artefact this notebook reads or writes.
DATA = pathlib.Path(".")
GJ   = DATA / "msoa_2021.geojson"        # plain, not gzipped

# ArcGIS FeatureServer query returning every feature (`where=1=1`) with all fields.
# NOTE(review): the request asks for `f=json` although the payload is stored with a
# `.geojson` suffix — confirm this is intended (vs. `f=geojson`).
# NOTE(review): feature services commonly cap the number of records returned per
# query, so a single request may be truncated — check the feature count printed
# at the end of this cell against the expected number of MSOAs.
URL  = ("https://services1.arcgis.com/ESMARspQHYMw9BZ9/arcgis/rest/services/Middle_layer_Super_Output_Areas_December_2021_Boundaries_EW_BGC_V3/FeatureServer/0/query?where=1%3D1&outFields=*&geometry=&geometryType=esriGeometryEnvelope&inSR=4326&spatialRel=esriSpatialRelIntersects&outSR=&f=json")   # ONS 20 m generalisation

# Seconds to wait for the server to connect / send data before giving up, so a
# stalled connection cannot hang the notebook forever.
REQUEST_TIMEOUT_S = 60

downloaded = not GJ.exists()
if downloaded:
    print("[+] Downloading MSOA boundaries …")
    # Stream into a sibling ".part" file and rename only on success; otherwise an
    # interrupted download would leave a truncated file that the `exists()` guard
    # above would treat as a valid cache on the next run.
    part = GJ.with_name(GJ.name + ".part")
    with requests.get(URL, stream=True, timeout=REQUEST_TIMEOUT_S) as r:
        r.raise_for_status()
        # A missing Content-Length header yields 0; pass None so tqdm shows a
        # plain byte counter instead of a bar with a bogus zero total.
        total = int(r.headers.get("content-length", 0)) or None
        with open(part, "wb") as f, tqdm.tqdm(
            total=total, unit="B", unit_scale=True, unit_divisor=1024
        ) as bar:
            for chunk in r.iter_content(chunk_size=1 << 20):
                f.write(chunk)
                bar.update(len(chunk))
    os.replace(part, GJ)

size_mb = os.path.getsize(GJ) / 1_048_576
# Only claim a download happened when one actually did.
status = "Download complete" if downloaded else "Using cached file"
print(f"[✓] {status} ({size_mb:.1f} MB)")

# Parse the file to confirm it is readable and report how many features it holds.
gdf = gpd.read_file(GJ)
print(f"[✓] GeoPandas read {len(gdf):,} MSOAs")


Compress MSOA

In [None]:
import gzip, shutil

# Gzipped copy of the boundary file, created once and then reused.
GZ = DATA / "msoa_2021.geojson.gz"
if not GZ.exists():
    print("[+] Gzipping to save disk space …")
    try:
        with open(GJ, "rb") as src, gzip.open(GZ, "wb") as dst:
            shutil.copyfileobj(src, dst)
    except BaseException:
        # Remove a half-written archive: if it were left behind, the `exists()`
        # guard above would skip compression on every later run and keep a
        # corrupt file around.
        if GZ.exists():
            GZ.unlink()
        raise
    print(f"[✓] Gzipped file is {GZ.stat().st_size/1_048_576:.1f} MB")


Join ONS statistics to MSOA boundaries and export GeoParquet

In [12]:
import geopandas as gpd
import pandas as pd

# Inputs, output and tuning knobs for this step.
BOUNDARIES_PATH = "./msoa_2021.geojson"    # your GeoJSON file
ATTRIBUTES_PATH = "./msoa_attributes.csv"  # your CSV with matching key
OUTPUT_PATH = "msoa_2021_data.parquet"
JOIN_KEY = "MSOA21CD"
# British National Grid. Its units are true ground metres over Great Britain,
# unlike Web Mercator (EPSG:3857), whose lengths are stretched by roughly
# 1.6–1.75x at these latitudes — there a "50 m" tolerance is only ~30 m on the
# ground.
METRIC_EPSG = 27700
SIMPLIFY_TOLERANCE_M = 50

# 1. Load inputs
gdf = gpd.read_file(BOUNDARIES_PATH)
df  = pd.read_csv(ATTRIBUTES_PATH)

# 2. Keep only the join key and geometry from the GeoJSON
gdf = gdf[[JOIN_KEY, 'geometry']]

# 3. Filter out any MSOA21CD starting with 'W' (presumably the Welsh areas —
#    confirm). Done before the expensive reprojection/simplification so no work
#    is spent on rows that are discarded anyway; `na=False` keeps rows with a
#    missing code instead of raising when the mask is inverted.
gdf = gdf[~gdf[JOIN_KEY].str.startswith('W', na=False)]

# 4. Remember original CRS
orig_crs = gdf.crs

# 5. Reproject to a metric CRS (units in metres) for simplification
projected = gdf.to_crs(epsg=METRIC_EPSG)

# 6. Simplify geometries to ~50 m tolerance.
#    NOTE(review): each polygon is simplified independently, so neighbouring
#    boundaries may no longer coincide exactly — acceptable for display, verify
#    for any analysis relying on shared edges.
projected['geometry'] = projected['geometry'].simplify(
    tolerance=SIMPLIFY_TOLERANCE_M,
    preserve_topology=True
)

# 7. Reproject back to original CRS
gdf = projected.set_geometry('geometry').to_crs(orig_crs)

# 8. Merge attributes from CSV on MSOA21CD (left join: every remaining boundary
#    is kept, with NaN attributes where the CSV has no matching code)
gdf = gdf.merge(df, on=JOIN_KEY, how='left')

# 9. Write to GeoParquet
gdf.to_parquet(
    OUTPUT_PATH,
    engine="pyarrow",
    index=False
)
