Filter and create GeoParquet file

In [12]:
import geopandas as gpd
import pandas as pd

# Build one GeoParquet file: simplified MSOA boundaries joined to the attribute
# CSV, keeping only rows whose MSOA21CD does not start with 'W' or 'S'.

# 1. Load inputs
gdf = gpd.read_file("./msoa_2021.geojson")    # boundary geometries (GeoJSON)
df  = pd.read_csv("./msoa_attributes.csv")    # attribute table with a matching MSOA21CD column

# 2. Keep only the join key and geometry from the GeoJSON
gdf = gdf[['MSOA21CD', 'geometry']]

# 3. Drop every MSOA21CD starting with 'W' or 'S' in a single pass.
#    Doing this before the simplification avoids simplifying geometries that
#    are thrown away afterwards; the written rows are the same either way.
#    na=False treats a missing code as "no match", so such rows are kept
#    instead of breaking the boolean negation.
gdf = gdf[~gdf['MSOA21CD'].str.startswith(('W', 'S'), na=False)]

# 4. Remember original CRS so the output is written in the CRS it was read in
orig_crs = gdf.crs

# 5. Reproject to EPSG:3857 for simplification
#    NOTE(review): Web Mercator units are only nominally metres - distances are
#    stretched by roughly 1/cos(latitude), so the ground tolerance is smaller
#    than the number below away from the equator. If a true metric tolerance
#    matters, a national grid CRS may be a better choice - confirm.
gdf = gdf.to_crs(epsg=3857)

# 6. Simplify geometries with a tolerance of 50 CRS units
gdf['geometry'] = gdf['geometry'].simplify(
    tolerance=50,
    preserve_topology=True
)

# 7. Reproject back to original CRS
gdf = gdf.set_geometry('geometry').to_crs(orig_crs)

# 8. Merge attributes from CSV on MSOA21CD (left join: every remaining
#    geometry is kept, even when the CSV has no row for it)
gdf = gdf.merge(df, on='MSOA21CD', how='left')

# 9. Write to GeoParquet (the row index is not stored)
gdf.to_parquet(
    "msoa_2021_data.parquet",
    engine="pyarrow",
    index=False
)


Create LOD (level-of-detail) files

In [17]:
import geopandas as gpd
import pandas as pd
from pathlib import Path

def _wkt_length(frame):
    """Return the total number of characters in the WKT text of all geometries."""
    # Summing per-geometry lengths avoids concatenating every WKT string into
    # one huge string, and also works for an empty frame or missing geometries.
    return sum(len(text) for text in frame.geometry.to_wkt() if isinstance(text, str))


def create_simplified_files(input_file, output_dir,
                            medium_tolerance=0.001, low_tolerance=0.005):
    """Create high/medium/low detail versions of a GeoParquet file.

    The input is written unchanged as ``high_detail.parquet``; the medium and
    low versions are produced by simplifying the ``geometry`` column and are
    written as ``medium_detail.parquet`` and ``low_detail.parquet``.

    Parameters
    ----------
    input_file : str or Path
        GeoParquet file to read.
    output_dir : str or Path
        Directory that receives the three files; created if missing.
    medium_tolerance, low_tolerance : float
        Tolerances handed to ``simplify``. They are expressed in the units of
        the file's CRS. NOTE(review): the defaults look like degree values, so
        they presumably expect geographic coordinates - confirm for other CRSs.

    Returns
    -------
    dict
        Maps ``"high"``, ``"medium"`` and ``"low"`` to the written file paths
        (as strings).
    """
    # Accept plain strings as well as Path objects for the output location
    output_dir = Path(output_dir)

    # Load the original data
    print(f"Loading {input_file}...")
    gdf = gpd.read_parquet(input_file)

    # Create output directory if it doesn't exist
    output_dir.mkdir(parents=True, exist_ok=True)

    # Save original (high detail)
    original_path = output_dir / "high_detail.parquet"
    print(f"Saving high detail to {original_path}...")
    gdf.to_parquet(original_path)

    paths = {"high": str(original_path)}
    sizes = {"high": _wkt_length(gdf)}

    # Medium and low detail differ only in their tolerance, so build them in a loop.
    for level, tolerance in (("medium", medium_tolerance), ("low", low_tolerance)):
        level_gdf = gdf.copy()
        print(f"Creating {level} detail...")
        level_gdf['geometry'] = level_gdf['geometry'].simplify(
            tolerance=tolerance, preserve_topology=True)
        level_path = output_dir / f"{level}_detail.parquet"
        print(f"Saving {level} detail to {level_path}...")
        level_gdf.to_parquet(level_path)

        paths[level] = str(level_path)
        # Measure now so the simplified frame can be released before the next level
        sizes[level] = _wkt_length(level_gdf)

    # Print statistics (sizes are WKT character counts, a proxy for geometry detail)
    orig_size, med_size, low_size = sizes["high"], sizes["medium"], sizes["low"]
    print(f"Original size: {orig_size:,}, Medium: {med_size:,}, Low: {low_size:,}")
    if orig_size:
        print(f"Reduction - Medium: {(1-med_size/orig_size)*100:.1f}%, Low: {(1-low_size/orig_size)*100:.1f}%")
    else:
        # Nothing to compare against; avoid dividing by zero
        print("Reduction - not available (input has no geometry text)")

    return paths

# Example usage: derive the LOD files from the merged GeoParquet and report them
if __name__ == "__main__":
    input_file = Path("./msoa_2021_data.parquet")
    output_dir = Path("./lod_versions")
    file_paths = create_simplified_files(input_file, output_dir)

    # Assemble the whole report first, then emit it in one go
    report_lines = ["\nGenerated files:"]
    report_lines.extend(
        f"  {detail_level}: {written_path}"
        for detail_level, written_path in file_paths.items()
    )
    report_lines.extend([
        "\nTo use these optimized files with the MSOA Explorer app:",
        "1. Make sure the files are in the 'lod_versions' subdirectory",
        "2. Launch the Streamlit app normally - it will detect and use the optimized files",
    ])
    print("\n".join(report_lines))

Loading msoa_2021_data.parquet...
Saving high detail to lod_versions\high_detail.parquet...
Creating medium detail...
Saving medium detail to lod_versions\medium_detail.parquet...
Creating low detail...
Saving low detail to lod_versions\low_detail.parquet...
Original size: 12,485,076, Medium: 5,797,155, Low: 1,757,769
Reduction - Medium: 53.6%, Low: 85.9%

Generated files:
  high: lod_versions\high_detail.parquet
  medium: lod_versions\medium_detail.parquet
  low: lod_versions\low_detail.parquet

To use these optimized files with the MSOA Explorer app:
1. Make sure the files are in the 'lod_versions' subdirectory
2. Launch the Streamlit app normally - it will detect and use the optimized files


In [18]:
import geopandas as gpd
import pandas as pd
from pathlib import Path

# Input and output locations used by the functions below
geojson_path = Path("./msoa_2021.geojson")
csv_dir = Path("./csv")
parquet_dir = Path("./parquet")
lod_dir = Path("./lod_versions")

# Make sure both output folders exist (no-op when they already do)
for _out_dir in (parquet_dir, lod_dir):
    _out_dir.mkdir(parents=True, exist_ok=True)

# Read the boundaries once and keep only the join key and the geometry
gdf_base = gpd.read_file(geojson_path)[['MSOA21CD', 'geometry']]
orig_crs = gdf_base.crs  # restored after simplification

# Simplify once, in EPSG:3857, with a tolerance of 50 CRS units.
# NOTE(review): EPSG:3857 units are only nominally metres (scale grows with
# latitude), so the ground tolerance is presumably below 50 m - confirm.
_projected = gdf_base.to_crs(epsg=3857)
_projected['geometry'] = _projected['geometry'].simplify(
    tolerance=50,
    preserve_topology=True
)

# Return the simplified geometries to the CRS they were read in
gdf_base = _projected.set_geometry('geometry').to_crs(orig_crs)

# Preprocess a single attribute CSV (the loop over csv_dir lives in the main workflow)
def preprocess_csv(csv_file: Path):
    """Join one attribute CSV to the base geometries and save it as GeoParquet.

    Reads ``csv_file``, left-joins it onto the module-level ``gdf_base`` on
    ``MSOA21CD`` (excluding codes that start with 'W' or 'S'), and writes the
    result to ``parquet_dir / "<csv stem>.parquet"``.

    Parameters
    ----------
    csv_file : Path or str
        CSV file containing an ``MSOA21CD`` column.

    Returns
    -------
    Path
        Location of the written parquet file.

    Raises
    ------
    KeyError
        If the CSV has no ``MSOA21CD`` column.
    """
    csv_file = Path(csv_file)  # tolerate plain string paths

    # Read attributes
    df = pd.read_csv(csv_file)
    if 'MSOA21CD' not in df.columns:
        # Fail with a message that names the offending file
        raise KeyError(f"{csv_file} has no 'MSOA21CD' column to join on")

    # Filter out unwanted MSOAs before merging so the join handles fewer rows.
    # na=False keeps rows with a missing code instead of breaking the negation.
    wanted = ~gdf_base['MSOA21CD'].str.startswith(('W', 'S'), na=False)

    # Merge with base geometries (left join keeps every remaining geometry)
    gdf = gdf_base[wanted].merge(df, on='MSOA21CD', how='left')

    # Define output path from the CSV name, e.g. 'msoa_attributes' -> 'msoa_attributes.parquet'
    stem = csv_file.stem
    parquet_path = parquet_dir / f"{stem}.parquet"

    # Save preprocessed parquet (row index is not stored)
    gdf.to_parquet(parquet_path, engine='pyarrow', index=False)
    print(f"Saved preprocessed: {parquet_path}")
    return parquet_path

# LOD creation function
def create_lod_files(input_parquet: Path, output_subdir: Path,
                     medium_tolerance: float = 0.001,
                     low_tolerance: float = 0.005):
    """Write high, medium and low detail copies of a GeoParquet file.

    ``high_detail.parquet`` is the input written back unchanged;
    ``medium_detail.parquet`` and ``low_detail.parquet`` have their
    ``geometry`` column simplified with the corresponding tolerance.

    Parameters
    ----------
    input_parquet : Path or str
        GeoParquet file to read.
    output_subdir : Path or str
        Directory receiving the three files; created if missing.
    medium_tolerance, low_tolerance : float
        Tolerances handed to ``simplify``, in the units of the file's CRS.
        NOTE(review): the defaults look like degree values - confirm the
        input is in a geographic CRS.

    Returns
    -------
    Path
        The output directory.
    """
    output_subdir = Path(output_subdir)  # tolerate plain string paths

    gdf = gpd.read_parquet(input_parquet)
    output_subdir.mkdir(parents=True, exist_ok=True)

    # High detail: the input as-is
    high_path = output_subdir / "high_detail.parquet"
    gdf.to_parquet(high_path, engine='pyarrow', index=False)

    # Medium and low detail differ only in tolerance and file name
    for level, tolerance in (("medium", medium_tolerance), ("low", low_tolerance)):
        simplified = gdf.copy()  # never touch the frame the next level starts from
        simplified['geometry'] = simplified['geometry'].simplify(
            tolerance=tolerance, preserve_topology=True)
        simplified.to_parquet(
            output_subdir / f"{level}_detail.parquet", engine='pyarrow', index=False)

    print(f"LOD created in: {output_subdir}")
    return output_subdir

# Main workflow: preprocess every CSV in csv_dir and build its LOD folder
if __name__ == "__main__":
    # glob() order depends on the file system; sort for a reproducible run order
    csv_files = sorted(csv_dir.glob("*.csv"))

    if not csv_files:
        # A missing or empty csv_dir would otherwise report success silently
        print(f"No CSV files found in {csv_dir} - nothing to do.")
    else:
        for csv_file in csv_files:
            print(f"Processing {csv_file.name}...")
            # Preprocess and get parquet
            parquet_path = preprocess_csv(csv_file)
            # Create LOD folder per file, named after the CSV
            subdir = lod_dir / csv_file.stem
            create_lod_files(parquet_path, subdir)

        print("All files processed.")


Processing distance-to-work.csv...
Saved preprocessed: parquet\distance-to-work.parquet
LOD created in: lod_versions\distance-to-work
Processing emp-prop-filtered.csv...
Saved preprocessed: parquet\emp-prop-filtered.parquet
LOD created in: lod_versions\emp-prop-filtered
Processing emp-prop.csv...
Saved preprocessed: parquet\emp-prop.parquet
LOD created in: lod_versions\emp-prop
Processing emp-totals.csv...
Saved preprocessed: parquet\emp-totals.parquet
LOD created in: lod_versions\emp-totals
Processing highest-quali.csv...
Saved preprocessed: parquet\highest-quali.parquet
LOD created in: lod_versions\highest-quali
Processing msoa_attributes.csv...
Saved preprocessed: parquet\msoa_attributes.parquet
LOD created in: lod_versions\msoa_attributes
Processing msoa_attributes_v2.csv...
Saved preprocessed: parquet\msoa_attributes_v2.parquet
LOD created in: lod_versions\msoa_attributes_v2
Processing unemp.csv...
Saved preprocessed: parquet\unemp.parquet
LOD created in: lod_versions\unemp
All fi