In [None]:
import geopandas as gpd
import pandas as pd
import os
from pathlib import Path
import fiona
import re
from dotenv import load_dotenv
import rasterio
from rasterstats import zonal_stats
import numpy as np
from pathlib import Path

In [None]:
# Local Climate Zone lookup table.
# Each tuple below is one class: (numeric LCZ value, display code, description, hex colour).
# The tuples are transposed into the column-oriented dict that pandas expects,
# so a class's attributes stay side by side in the source.
lcz_data = dict(zip(
    ('LCZ', 'lcz_code', 'description', 'color'),
    map(list, zip(
        (1, 'LCZ 1', 'Compact highrise', '#910613'),
        (2, 'LCZ 2', 'Compact midrise', '#D9081C'),
        (3, 'LCZ 3', 'Compact lowrise', '#FF0A22'),
        (4, 'LCZ 4', 'Open highrise', '#C54F1E'),
        (5, 'LCZ 5', 'Open midrise', '#FF6628'),
        (6, 'LCZ 6', 'Open lowrise', '#FF985E'),
        (7, 'LCZ 7', 'Lightweight low-rise', '#FDED3F'),
        (8, 'LCZ 8', 'Large lowrise', '#BBBBBB'),
        (9, 'LCZ 9', 'Sparsely built', '#FFCBAB'),
        (10, 'LCZ 10', 'Heavy Industry', '#565656'),
        (11, 'LCZ 11 (A)', 'Dense trees', '#006A18'),
        (12, 'LCZ 12 (B)', 'Scattered trees', '#00A926'),
        (13, 'LCZ 13 (C)', 'Bush, scrub', '#628432'),
        (14, 'LCZ 14 (D)', 'Low plants', '#B5DA7F'),
        (15, 'LCZ 15 (E)', 'Bare rock or paved', '#000000'),
        (16, 'LCZ 16 (F)', 'Bare soil or sand', '#FCF7B1'),
        (17, 'LCZ 17 (G)', 'Water', '#656BFA'),
    )),
))

# One row per LCZ class; later joined onto the per-cell classification via the 'LCZ' column.
lcz_label_df = pd.DataFrame(lcz_data)

print(lcz_label_df)

In [None]:
def process_grid_data_by_city(
        grid_path,
        csv_path,
        lcz_path,
        lst_files,
        output_dir,
        id_field='id',
        exclude_layers=None,
        lcz_label_df=lcz_label_df):
    """
    Split the GeoPackage layers by city and export one GeoJSON per city.

    Layers whose name contains 'geneva' or 'zurich' (case-insensitive) are
    grouped per city; layers matching neither name are ignored. The CSV
    attributes are enriched with the LCZ class and its labels, filtered to the
    city, and handed to process_city_layers together with that city's layers.

    Parameters:
    -----------
    grid_path : str
        Path to the GeoPackage file containing the grid layers
    csv_path : str
        Path to the CSV file with per-cell attributes; it is joined on
        id_field and 'City'
    lcz_path : str
        Path to the LCZ classification CSV; the id_field, 'City' and 'LCZ'
        columns are read from it
    lst_files : dict
        Mapping of lowercase city name to LST GeoTIFF path, e.g.
        {'zurich': 'path/to/zurich.tif', 'geneva': 'path/to/geneva.tif'}
    output_dir : str
        Directory receiving '<city>_grid_data.geojson'; created if missing
    id_field : str
        The field name used to join the datasets
    exclude_layers : list, optional
        Layer names to leave out of processing
    lcz_label_df : DataFrame
        Lookup table joined onto the LCZ classification via its 'LCZ' column.
        The default is the module-level table as it existed when this
        function was defined.

    Returns:
    --------
    tuple of (list, list)
        Names of the Geneva layers and of the Zurich layers that were selected
    """
    os.makedirs(output_dir, exist_ok=True)

    # Layers that survive the exclusion list
    excluded = [] if exclude_layers is None else exclude_layers
    candidate_layers = [name for name in fiona.listlayers(grid_path) if name not in excluded]

    # LCZ class per (id, City), decorated with code / description / colour
    lcz_table = pd.read_csv(lcz_path)[[id_field, 'City', 'LCZ']].copy()
    lcz_table = lcz_table.merge(lcz_label_df, left_on='LCZ', right_on='LCZ', how='left')

    # Attribute table, read once and shared by both cities
    attributes = pd.read_csv(csv_path).merge(lcz_table, on=[id_field, 'City'], how='left')

    # Assign each layer to the first city whose name appears in the layer name
    layers_by_city = {'geneva': [], 'zurich': []}
    for name in candidate_layers:
        lowered = name.lower()
        for city in layers_by_city:
            if city in lowered:
                layers_by_city[city].append(name)
                break

    # Geneva is handled first, then Zurich (dict insertion order)
    for city, city_layers in layers_by_city.items():
        if not city_layers:
            continue

        city_rows = attributes[attributes['City'].str.lower() == city].copy()
        print(f"Filtered {len(city_rows)} {city.capitalize()} records from CSV data")

        process_city_layers(
            grid_path,
            city_rows,
            lst_files.get(city),  # single raster path for this city
            city_layers,
            os.path.join(output_dir, f"{city}_grid_data.geojson"),
            id_field,
            city_suffix=city
        )

    return layers_by_city['geneva'], layers_by_city['zurich']

def clean_layer_name(layer_name, city_suffix):
    """
    Extract the base layer name without city suffixes.

    Every occurrence of "_<city_suffix>" and then of "-<city_suffix>" is
    removed, matching case-insensitively. The suffix is treated as literal
    text, and an empty suffix leaves the name untouched.

    Parameters:
    -----------
    layer_name : str
        Original layer name from GeoPackage
    city_suffix : str
        City suffix to remove (e.g., 'geneva', 'zurich'). An empty string or
        None means there is nothing to strip.

    Returns:
    --------
    str
        Cleaned layer name without city-specific parts
    """
    # Without this guard an empty suffix would yield the patterns "_" and "-",
    # deleting every underscore and hyphen from the layer name.
    if not city_suffix:
        return layer_name

    # Escape the suffix so regex metacharacters in it (e.g. '.') match literally.
    suffix_pattern = re.escape(city_suffix)

    cleaned_name = layer_name
    # Two sequential passes: first "_cityname", then "-cityname".
    for separator in ("_", "-"):
        cleaned_name = re.sub(f"{separator}{suffix_pattern}", "", cleaned_name, flags=re.IGNORECASE)

    return cleaned_name

def calculate_lst_averages(grid_gdf, lst_file_path):
    """
    Calculate the mean LST value of each grid cell from a single GeoTIFF file.

    The first raster band is averaged over every geometry of grid_gdf. The
    means are shifted by -273.15, i.e. the raster is assumed to store Kelvin
    and the result is expressed in degrees Celsius. Cells without a usable
    mean get NaN.

    This is a best-effort step: a missing path, a missing file or any error
    while reading the raster is reported with print() and the input is
    returned unchanged (without the new column).

    Parameters:
    -----------
    grid_gdf : GeoDataFrame
        GeoDataFrame containing grid cells
    lst_file_path : str, Path or None
        Path to the LST GeoTIFF file

    Returns:
    --------
    GeoDataFrame
        Copy of the input with a new 'LST_mean' column, or the input itself
        when no LST values could be computed
    """
    kelvin_offset = 273.15

    # A city without a configured raster arrives here as None (or ""):
    # treat it like a missing file instead of failing inside Path().
    if not lst_file_path:
        print("LST file not found: no path provided")
        return grid_gdf

    lst_file_path = Path(lst_file_path)

    if not lst_file_path.exists():
        print(f"LST file not found: {lst_file_path}")
        return grid_gdf

    print(f"Processing LST file: {lst_file_path.name}")

    try:
        with rasterio.open(lst_file_path) as src:
            # rasterstats does not reproject: the zones must be expressed in
            # the raster's CRS. Only the geometries used for the statistics
            # are reprojected; the returned frame keeps its original CRS.
            zones = grid_gdf.geometry
            grid_crs = getattr(grid_gdf, 'crs', None)
            if src.crs is not None and grid_crs is not None:
                zones = zones.to_crs(src.crs.to_wkt())

            # Zonal statistics over the first band, one result per grid cell
            stats = zonal_stats(
                zones,
                src.read(1),
                affine=src.transform,
                stats=['mean'],
                nodata=src.nodata
            )

        # Convert Kelvin to Celsius; missing statistics become NaN
        means = [
            float(stat['mean']) - kelvin_offset
            if stat is not None and stat.get('mean') is not None
            else np.nan
            for stat in stats
        ]

        # Work on a copy so the caller's frame is not modified
        result_gdf = grid_gdf.copy()
        result_gdf['LST_mean'] = means

        valid_count = np.sum(~np.isnan(means))
        print(f"  Found {valid_count} cells with valid LST values")

        return result_gdf

    except Exception as e:
        # Deliberately broad: LST enrichment must not abort the export.
        print(f"Error processing LST file: {e}")
        return grid_gdf

def process_city_layers(grid_path, data_df, lst_file, layers, output_path, id_field, city_suffix=""):
    """
    Process and merge multiple layers for a city into a single GeoJSON with one feature per grid cell.
    Dynamically uses the spatial join keys that are present in each layer.

    Steps: read every layer, outer-merge them on their shared key columns,
    tag the result with the city, left-join the CSV attributes, add the mean
    LST per cell when a raster is available, rename the solar columns, round
    float64 columns to 2 decimals and export to GeoJSON.

    Parameters:
    -----------
    grid_path : str
        Path to the GeoPackage file containing the grid layers
    data_df : DataFrame
        Tabular attributes joined onto the merged layers via id_field
    lst_file : str or None
        Path to the LST GeoTIFF for this city; None or a non-existent path
        skips the LST step
    layers : list
        Names of the GeoPackage layers to read and merge
    output_path : str
        Destination of the GeoJSON file
    id_field : str
        Column used to join data_df; also used as a merge key between layers
    city_suffix : str
        City name stripped from the layer names and written (lowercased) to
        the 'City' column

    Returns:
    --------
    GeoDataFrame or None
        The merged data that was exported, or None when no layer could be read
    """
    print(f"Processing layers for {os.path.basename(output_path)}:")

    # All potential spatial properties to use for merging layers
    all_spatial_join_keys = ['id', 'left', 'top', 'right', 'bottom', 'row_index', 'col_index', 'geometry', 'typology']
    # A custom id_field must be a merge key too; otherwise pandas would suffix
    # it (_x/_y) during the layer merge and the CSV join below would be skipped.
    if id_field not in all_spatial_join_keys:
        all_spatial_join_keys.insert(0, id_field)

    # Cleaned layer name -> layer data and the join keys it offers
    layer_gdfs = {}

    # CRS of the first layer that defines one; used to detect mismatching layers
    common_crs = None

    # First, read all layers
    for layer in layers:
        print(f"  - Reading layer: {layer}")
        try:
            layer_gdf = gpd.read_file(grid_path, layer=layer)

            if common_crs is None:
                common_crs = layer_gdf.crs
            elif layer_gdf.crs != common_crs:
                # Layers are merged on raw geometry values, which only match
                # when every layer uses the same coordinates.
                print(f"    Warning: layer {layer} has CRS {layer_gdf.crs}, expected {common_crs}")

            # Clean the layer name to remove city suffix
            clean_name = clean_layer_name(layer, city_suffix)

            layer_gdfs[clean_name] = {
                'gdf': layer_gdf,
                'available_keys': [key for key in all_spatial_join_keys if key in layer_gdf.columns]
            }

        except Exception as e:
            # A layer that cannot be read is reported and left out of the merge
            print(f"    Error processing layer {layer}: {str(e)}")

    # If no layers were read successfully, return
    if not layer_gdfs:
        print(f"No layers could be processed for {os.path.basename(output_path)}")
        return

    # Start with the first layer as our base
    layer_items = list(layer_gdfs.items())
    combined_gdf = layer_items[0][1]['gdf'].copy()

    # Merge remaining layers one by one
    for layer_name, layer_info in layer_items[1:]:
        print(f"  - Merging layer: {layer_name}")

        # Keys that exist in both the combined data and this layer
        merge_keys = [key for key in layer_info['available_keys'] if key in combined_gdf.columns]

        # Geometry is always part of the join
        if 'geometry' not in merge_keys:
            merge_keys.append('geometry')

        print(f"    Using merge keys: {merge_keys}")

        # Outer join keeps cells that appear in only one of the layers
        combined_gdf = combined_gdf.merge(
            layer_info['gdf'],
            on=merge_keys,
            how='outer'
        )

    # Add a column to track which city this is
    combined_gdf['City'] = city_suffix.lower()

    # Now join with the CSV data on id_field
    # NOTE(review): when data_df also has a 'City' column (the frames built in
    # process_grid_data_by_city do), this merge yields 'City_x' / 'City_y'
    # instead of a single 'City'. Left unchanged because consumers of the
    # GeoJSON may rely on those names - confirm before cleaning up.
    if id_field in combined_gdf.columns:
        print(f"  - Joining with CSV data on field: {id_field}")
        combined_gdf = combined_gdf.merge(data_df, on=id_field, how='left')
    else:
        print(f"  Warning: Combined data does not contain id_field '{id_field}', skipping CSV join")

    # Add LST data if available; lst_file is None when no raster is configured
    # for this city, and os.path.exists(None) would raise TypeError.
    if lst_file and os.path.exists(lst_file):
        print("  - Calculating LST averages")
        combined_gdf = calculate_lst_averages(combined_gdf, lst_file)
    else:
        print(f"  - No LST raster available ({lst_file}), skipping LST averages")

    # Harmonise the solar irradiance column names
    if 'solar_winter' in combined_gdf.columns:
        print("  - Renaming solar_winter to Irradiance_W")
        combined_gdf = combined_gdf.rename(columns={'solar_winter': 'Irradiance_W'})
    elif 'solar_winter_2' in combined_gdf.columns:
        print("  - Renaming solar_winter_2 to Irradiance_W")
        combined_gdf = combined_gdf.rename(columns={'solar_winter_2': 'Irradiance_W'})

    if 'solar_summer' in combined_gdf.columns:
        print("  - Renaming solar_summer to Irradiance_S")
        combined_gdf = combined_gdf.rename(columns={'solar_summer': 'Irradiance_S'})

    # Round float64 columns to 2 decimal places
    float_columns = combined_gdf.select_dtypes(include=['float64']).columns
    if len(float_columns) > 0:
        print(f"  - Rounding {len(float_columns)} float columns to 2 decimal places")
        combined_gdf[float_columns] = combined_gdf[float_columns].round(2)

    # Export to GeoJSON
    combined_gdf.to_file(output_path, driver="GeoJSON")
    print(f"Exported {len(combined_gdf)} unique grid cells to {output_path}")

    return combined_gdf

In [None]:

# Load variables from a .env file, if one is present, before reading the environment.
load_dotenv()

# Root data directory: CITYTHERM_DATA_FOLDER when set, otherwise ~/Data.
data_folder = os.environ.get('CITYTHERM_DATA_FOLDER', str(Path.home() / "Data"))

# Inputs
grid_path = os.path.join(data_folder, "open data multidomain neighbourhood types and environmental quality.gpkg")
csv_path = os.path.join(data_folder, "open_data_neighbourhood_parameters.csv")
lcz_path = os.path.join(data_folder, "citytherm.csv")
lst_folder = os.path.join(data_folder, "lst")

# Outputs
output_path = os.path.join(data_folder, "grid_data.geojson")
updated_lcz_path = os.path.join(data_folder, "citytherm_updated.csv")

# One LST raster per city, keyed by lowercase city name.
lst_files = {
    city: os.path.join(lst_folder, filename)
    for city, filename in (
        ('zurich', 'LST-zurich-19-7-24.tif'),
        ('geneva', 'LST-geneva-23-8-11.tif'),
    )
}

In [None]:
# Manual corrections to the LCZ classification, written to a separate CSV so
# the source file stays untouched.
# NOTE(review): cells are matched on 'id' alone, while the later join keys on
# (id, City) - if ids repeat across cities these overrides hit both; confirm.
lcz_data = pd.read_csv(lcz_path)

water_cells = [11, 311, 312, 330, 331, 332, 333, 334, 335, 336, 337, 362, 386, 435]
open_lowrise_cells = [39, 64, 89]

# (cell ids, corrected LCZ class), applied in this order
for cell_ids, lcz_class in (
    (water_cells, 17),
    ([165], 2),
    (open_lowrise_cells, 6),
):
    lcz_data.loc[lcz_data['id'].isin(cell_ids), 'LCZ'] = lcz_class

lcz_data.to_csv(updated_lcz_path, index=False)

In [None]:
# Show the rows whose id is listed in water_cells to check the override.
lcz_data[lcz_data['id'].isin(water_cells)]

In [None]:
# Destination folder for the per-city GeoJSON files
# (process_grid_data_by_city creates it when missing).
output_dir = os.path.join(data_folder, "processed")

# Process all layers by city, using the corrected LCZ classification
geneva_layers, zurich_layers = process_grid_data_by_city(
    grid_path=grid_path,
    csv_path=csv_path,
    lcz_path=updated_lcz_path,
    lst_files=lst_files,
    output_dir=output_dir,
    id_field='id',
    exclude_layers=["LST-Geneva"]  # Exclude problematic layer
)

# Report which layers were assigned to each city
for city_label, city_layers in (("Geneva", geneva_layers), ("Zurich", zurich_layers)):
    print(f"\nProcessed {city_label} layers:")
    for layer in city_layers:
        print(f"- {layer}")