In [1]:
import os

import pandas as pd
import geopandas as gpd
import regionmask
import xarray as xr
import dask
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from src.data_utils import *

# ERA5 Data Transformation and Regional Aggregation

This notebook processes raw ERA5 NetCDF files by:
1. Combining monthly variable files into unified datasets
2. Applying unit transformations (K→°C, Pa→hPa)
3. Computing derived variables (wind speed)
4. Aggregating to Arctic regions using spatial masks

## Load Sample Data

Load a single month of ERA5 data to explore the structure and variables.

In [2]:
# Open one dataset per variable for January 1979 so their structure can be inspected.
# Every file is opened with the same explicit engine that combine_era5_monthly_files uses.
msl = xr.open_dataset("../data/raw/era5/1979/era5_1979_01_meansealevelpressure.nc", engine="netcdf4")
uwind = xr.open_dataset("../data/raw/era5/1979/era5_1979_01_10mucompwind.nc", engine="netcdf4")
vwind = xr.open_dataset("../data/raw/era5/1979/era5_1979_01_10mvcompwind.nc", engine="netcdf4")
temp = xr.open_dataset("../data/raw/era5/1979/era5_1979_01_2mtemperature.nc", engine="netcdf4")

In [3]:
def combine_era5_monthly_files():
    """
    Goes through all raw ERA5 files, combines them into one monthly file,
    performs transformations, and saves it to the interim folder.

    This function will:
    - Iterate through years 1979 to 2023 and months 1 to 12.
    - Create a destination folder in ../data/interim/era5/ if it doesn't exist.
    - Skip the file creation if the monthly file already exists.
    - Combine mean sea level pressure, u-wind, v-wind, temperature, and precipitation data.
    - Replace missing or negative precipitation values with 0.
    - Transform units (Kelvin to Celsius, Pa to hPa).
    - Calculate wind speed.
    - Save the processed data as a NetCDF file.

    The output is first written to a temporary file in the destination folder and
    only renamed to its final name once the write has finished, so an interrupted
    run cannot leave a truncated file that later runs would skip as "already exists".
    Every raw dataset that was opened is closed again, including when one of the
    source files is missing.

    Returns:
    - None. Progress is reported with print().
    """
    years = range(1979, 2024)
    months = range(1, 13)
    # File-name suffixes of the raw per-variable files, in the order they are merged.
    raw_suffixes = (
        "meansealevelpressure",
        "10mucompwind",
        "10mvcompwind",
        "2mtemperature",
        "totalprecipitation",
    )

    for year in years:
        output_folder = f"../data/interim/era5/{year}"
        os.makedirs(output_folder, exist_ok=True)

        for month in months:
            month_str = f"{month:02d}"
            output_filepath = f"{output_folder}/{year}_{month_str}.nc"

            if os.path.exists(output_filepath):
                print(f"Skipping {year}-{month_str}: File already exists.")
                continue

            print(f"Processing {year}-{month_str}")

            # Different name from the final output, so a leftover partial file is never
            # mistaken for a finished month by the existence check above.
            partial_filepath = f"{output_folder}/{year}_{month_str}.partial.nc"
            opened = []
            try:
                for suffix in raw_suffixes:
                    opened.append(
                        xr.open_dataset(
                            f"../data/raw/era5/{year}/era5_{year}_{month_str}_{suffix}.nc",
                            engine="netcdf4",
                        )
                    )

                ds = xr.merge(opened)

                # Missing and negative precipitation values are both set to 0.
                ds["tp"] = ds["tp"].fillna(0)
                ds["tp"] = ds["tp"].where(ds["tp"] >= 0, 0)

                # Convert temperature from Kelvin to Celsius
                ds["t2m"] = ds["t2m"] - 273.15

                # Convert pressure from Pa to hPa
                ds["msl"] = ds["msl"] / 100.0

                # Calculate wind speed magnitude
                ds["wind_speed"] = np.sqrt(ds["u10"] ** 2 + ds["v10"] ** 2)

                # The source files must still be open here: the merged data is read
                # from them while it is being written.
                ds.to_netcdf(partial_filepath)
                os.replace(partial_filepath, output_filepath)
            except FileNotFoundError:
                print(f"Could not process {year}-{month_str}: One or more source files not found.")
            finally:
                for raw in opened:
                    raw.close()
                # Only present if the write failed before the rename.
                if os.path.exists(partial_filepath):
                    os.remove(partial_filepath)


## Combine Monthly Files

Merge individual variable files into unified monthly NetCDF files with unit conversions applied.

In [4]:
# Build the interim monthly files; months whose output file already exists are skipped.
combine_era5_monthly_files()

Skipping 1979-01: File already exists.
Skipping 1979-02: File already exists.
Skipping 1979-03: File already exists.
Skipping 1979-04: File already exists.
Skipping 1979-05: File already exists.
Skipping 1979-06: File already exists.
Skipping 1979-07: File already exists.
Skipping 1979-08: File already exists.
Skipping 1979-09: File already exists.
Skipping 1979-10: File already exists.
Skipping 1979-11: File already exists.
Skipping 1979-12: File already exists.
Skipping 1980-01: File already exists.
Skipping 1980-02: File already exists.
Skipping 1980-03: File already exists.
Skipping 1980-04: File already exists.
Skipping 1980-05: File already exists.
Skipping 1980-06: File already exists.
Skipping 1980-07: File already exists.
Skipping 1980-08: File already exists.
Skipping 1980-09: File already exists.
Skipping 1980-10: File already exists.
Skipping 1980-11: File already exists.
Skipping 1980-12: File already exists.
Skipping 1981-01: File already exists.
Skipping 1981-02: File al

## Define Arctic Regions

List the NSIDC-defined sea-ice regions used for spatial aggregation. `pan_arctic` is an additional combined region built from all of these regions except Bohai and Baltic.

In [5]:
# Region identifiers; during aggregation they are matched against the shapefile's 'Region' column.
regions = [
    'Central_Arctic', 'Beaufort', 'Chukchi-NA', 'Chukchi-Asia', 'E_Siberian', 'Laptev',
    'Kara', 'Barents', 'E_Greenland', 'Baffin', 'St_Lawr', 'Hudson',
    'Can_Arch', 'Bering-NA', 'Bering-Asia', 'Okhotsk', 'Bohai', 'Baltic',
]

# 'pan_arctic' is appended as an extra pseudo-region for the full Arctic aggregation.
all_regions = [*regions, 'pan_arctic']

## Regional Aggregation

Create a spatial mask for each region and compute daily statistics (mean, std, p15, p85) of every variable over the masked grid points. The output is saved as long-format Parquet files, one per year.

In [12]:
def aggregate_to_region_parquets():
    """
    Aggregates ERA5 data by Arctic regions and saves to Parquet files.

    This function:
    - Creates spatial masks for each Arctic region
    - Creates a combined pan-arctic mask from all Arctic regions
    - Processes ERA5 NetCDF files by year/month
    - Computes regional statistics (mean, std, percentiles) for each variable
    - Saves aggregated data in Parquet format with schema: date, region, variable, stat, value

    Notes:
    - Relies on the notebook-level ``regions`` list and on ``get_region_shapefiles()``.
    - The grid is taken from ../data/interim/era5/1979/1979_01.nc; the masks built on
      it are reused for every month (assumes all interim files share that grid --
      TODO confirm).
    - A month that raises while being processed is reported and skipped as a whole;
      none of its records are written, so a yearly file never contains a partially
      processed month.
    - Region/variable combinations with no valid grid point, and statistics that
      evaluate to NaN, are omitted from the output.

    Returns:
    - None. One file ../data/processed/parquet/era5_regional_<year>.parquet is written
      per year that produced at least one record; progress is reported with print().
    """
    years = range(1979, 2024)
    months = range(1, 13)
    variables = ['t2m', 'msl', 'u10', 'v10', 'tp', 'wind_speed']
    # Reducers keyed by the label stored in the 'stat' column (insertion order is kept).
    stat_reducers = {
        'mean': lambda field: field.mean(),
        'std': lambda field: field.std(),
        'p15': lambda field: field.quantile(0.15),
        'p85': lambda field: field.quantile(0.85),
    }

    # Define Arctic regions for pan-arctic aggregation (exclude non-Arctic seas)
    arctic_regions_for_pan_arctic = [
        'Central_Arctic', 'Beaufort', 'Chukchi-NA', 'Chukchi-Asia', 'E_Siberian',
        'Laptev', 'Kara', 'Barents', 'E_Greenland', 'Baffin', 'St_Lawr',
        'Hudson', 'Can_Arch', 'Bering-NA', 'Bering-Asia', 'Okhotsk'
    ]

    shapefiles = get_region_shapefiles()

    # Maps region name -> boolean DataArray that is True on grid points inside the region.
    # Computed once here instead of re-deriving it from the raw mask in the inner loops.
    region_masks = {}

    # The context manager closes the sample file even if mask creation fails.
    with xr.open_dataset('../data/interim/era5/1979/1979_01.nc') as sample_ds:
        print("Creating region masks...")
        for region in regions:
            region_shape = shapefiles[shapefiles['Region'] == region]['geometry']
            if region_shape.empty:
                print(f"Warning: No geometry found for region {region}")
                continue

            mask = regionmask.mask_geopandas(
                region_shape,
                sample_ds.longitude,
                sample_ds.latitude
            )
            inside = mask.notnull()
            region_masks[region] = inside

            # int() turns the 0-d DataArray into a plain number; interpolating the
            # DataArray itself prints its whole multi-line repr.
            print(f"  {region}: {int(inside.sum())} valid grid points")

        # Create combined pan-arctic mask from all Arctic regions
        print("Creating pan-arctic combined mask...")
        pan_arctic_shapes = shapefiles[shapefiles['Region'].isin(arctic_regions_for_pan_arctic)]['geometry']

        if not pan_arctic_shapes.empty:
            pan_arctic_mask = regionmask.mask_geopandas(
                pan_arctic_shapes,
                sample_ds.longitude,
                sample_ds.latitude,
                overlap=False
            )
            pan_arctic_inside = pan_arctic_mask.notnull()
            region_masks['pan_arctic'] = pan_arctic_inside

            print(f"  pan_arctic: {int(pan_arctic_inside.sum())} valid grid points (combined from {len(arctic_regions_for_pan_arctic)} Arctic regions)")
        else:
            print("  Warning: Could not create pan-arctic mask")

    output_dir = "../data/processed/parquet"
    os.makedirs(output_dir, exist_ok=True)

    # Process all regions including pan_arctic
    all_processing_regions = regions + ['pan_arctic']

    for year in years:
        print(f"Processing year {year}...")
        yearly_data = []

        for month in months:
            month_str = f"{month:02d}"
            nc_filepath = f"../data/interim/era5/{year}/{year}_{month_str}.nc"

            if not os.path.exists(nc_filepath):
                print(f"  Skipping {year}-{month_str}: File not found")
                continue

            print(f"  Processing {year}-{month_str}")

            # Buffered per month so a failure halfway through does not leave a
            # partial month in the yearly output.
            monthly_records = []

            try:
                # The context manager closes the file on success and on error.
                with xr.open_dataset(nc_filepath) as ds:
                    for time_idx, timestamp in enumerate(ds.valid_time.values):
                        date = pd.to_datetime(timestamp).date()
                        daily_data = ds.isel(valid_time=time_idx)

                        # Process all regions (regional + pan_arctic)
                        for region in all_processing_regions:
                            inside = region_masks.get(region)
                            if inside is None:
                                # No mask could be built for this region.
                                continue

                            for var in variables:
                                if var not in daily_data.data_vars:
                                    continue

                                masked_data = daily_data[var].where(inside)

                                if masked_data.isnull().all():
                                    continue

                                for stat, reducer in stat_reducers.items():
                                    value = float(reducer(masked_data).values)

                                    if pd.isna(value):
                                        continue

                                    monthly_records.append({
                                        'date': date,
                                        'region': region,
                                        'variable': var,
                                        'stat': stat,
                                        'value': value
                                    })

            except Exception as e:
                # Best effort: report the failing month and carry on with the next one.
                print(f"  Error processing {year}-{month_str}: {e}")
                continue

            yearly_data.extend(monthly_records)

        if yearly_data:
            df = pd.DataFrame(yearly_data)
            parquet_filepath = f"{output_dir}/era5_regional_{year}.parquet"
            df.to_parquet(parquet_filepath, index=False)
            print(f"  Saved {len(df)} records to {parquet_filepath}")
        else:
            print(f"  No data to save for year {year}")



def get_region_shapefiles():
    """
    Reads the NSIDC-0780 Northern Hemisphere sea ice regions shapefile.

    Returns:
    - The object produced by gpd.read_file for that shapefile (presumably a
      GeoDataFrame; callers in this notebook use its 'Region' and 'geometry' columns).
    """
    return gpd.read_file("../data/raw/shapefiles_regions/NSIDC-0780_SeaIceRegions_NH_v1.0.shp")

In [13]:
# Build the region masks and write one Parquet file of regional statistics per year.
aggregate_to_region_parquets()

Creating region masks...
  Central_Arctic: <xarray.DataArray 'mask' ()> Size: 8B
array(63300)
Coordinates:
    number   int64 8B 0 valid grid points
  Beaufort: <xarray.DataArray 'mask' ()> Size: 8B
array(4661)
Coordinates:
    number   int64 8B 0 valid grid points
  Chukchi-NA: <xarray.DataArray 'mask' ()> Size: 8B
array(3800)
Coordinates:
    number   int64 8B 0 valid grid points
  Chukchi-Asia: <xarray.DataArray 'mask' ()> Size: 8B
array(190)
Coordinates:
    number   int64 8B 0 valid grid points
  E_Siberian: <xarray.DataArray 'mask' ()> Size: 8B
array(5802)
Coordinates:
    number   int64 8B 0 valid grid points
  Laptev: <xarray.DataArray 'mask' ()> Size: 8B
array(6922)
Coordinates:
    number   int64 8B 0 valid grid points
  Kara: <xarray.DataArray 'mask' ()> Size: 8B
array(9749)
Coordinates:
    number   int64 8B 0 valid grid points
  Barents: <xarray.DataArray 'mask' ()> Size: 8B
array(11424)
Coordinates:
    number   int64 8B 0 valid grid points
  E_Greenland: <xarray.DataArra

  mask = regionmask.mask_geopandas(
  mask = regionmask.mask_geopandas(


  Bohai: <xarray.DataArray 'mask' ()> Size: 8B
array(0)
Coordinates:
    number   int64 8B 0 valid grid points
  Baltic: <xarray.DataArray 'mask' ()> Size: 8B
array(3169)
Coordinates:
    number   int64 8B 0 valid grid points
Creating pan-arctic combined mask...
  pan_arctic: <xarray.DataArray 'mask' ()> Size: 8B
array(159300)
Coordinates:
    number   int64 8B 0 valid grid points (combined from 16 Arctic regions)
Processing year 1979...
  Processing 1979-01
  Processing 1979-02
  Processing 1979-03
  Processing 1979-04
  Processing 1979-05
  Processing 1979-06
  Processing 1979-07
  Processing 1979-08
  Processing 1979-09
  Processing 1979-10
  Processing 1979-11
  Processing 1979-12
  Saved 148920 records to ../data/processed/parquet/era5_regional_1979.parquet
Processing year 1980...
  Processing 1980-01
  Processing 1980-02
  Processing 1980-03
  Processing 1980-04
  Processing 1980-05
  Processing 1980-06
  Processing 1980-07
  Processing 1980-08
  Processing 1980-09
  Processing 1

## Merge with Ice Extent Data

Load the regional atmospheric statistics and merge them with sea ice extent from the database. Region names are mapped from the NSIDC shapefile names used in the Parquet files to the names used by the database.

In [14]:
def load_data_for_year(year, region):
    """
    Builds one wide table of ERA5 regional statistics joined with sea ice extent
    for a single year and region.

    The yearly parquet file is in long format (date, region, variable, stat, value).
    Its rows are filtered to the requested region, pivoted so that every
    '<variable>_<stat>' pair becomes a column, and left-joined on ('date', 'region')
    with the frame returned by load_yearly_data_from_database(year, region).

    Parameters:
    - year (int): Year whose parquet file and ice extent data are loaded
    - region (str): Region name in the database naming convention, or 'pan_arctic'

    Returns:
    - pd.DataFrame: One row per date/region with the atmospheric statistic columns
      followed by the columns coming from the ice extent data

    Raises:
    - FileNotFoundError: If the parquet file for the year does not exist
    - ValueError: If the parquet file holds no rows for the requested region

    NOTE(review): several ERA5 regions can map to one database name (both Chukchi
    regions map to 'Chukchi', both Bering regions to 'Bering'). With aggfunc='first'
    the pivot keeps only the first value found per date and statistic, so the other
    sub-region is dropped -- confirm this is intended.
    """
    # ERA5 parquet region name -> database region name
    era5_to_db_region = {
        'Central_Arctic': 'Central',
        'Chukchi-NA': 'Chukchi',
        'Chukchi-Asia': 'Chukchi',
        'Bering-NA': 'Bering',
        'Bering-Asia': 'Bering',
        'Can_Arch': 'CanadianArchipelago',
        'E_Greenland': 'Greenland',
        'E_Siberian': 'East',
        'Baffin': 'Baffin',
        'Barents': 'Barents',
        'Beaufort': 'Beaufort',
        'Hudson': 'Hudson',
        'Kara': 'Kara',
        'Laptev': 'Laptev',
        'Okhotsk': 'Okhotsk',
        'pan_arctic': 'pan_arctic'  # Pan-arctic maps to itself
    }

    parquet_filepath = f"../data/processed/parquet/era5_regional_{year}.parquet"
    if not os.path.exists(parquet_filepath):
        raise FileNotFoundError(f"No data found for year {year} at {parquet_filepath}")

    era5_long = pd.read_parquet(parquet_filepath)
    wants_pan_arctic = region == 'pan_arctic'

    if wants_pan_arctic:
        # Pan-arctic rows are selected by their own name; no mapping is needed.
        selected = era5_long[era5_long['region'] == 'pan_arctic'].copy()
        selected['mapped_region'] = 'pan_arctic'
    else:
        # Translate the region names first, then filter on the translated name.
        era5_long['mapped_region'] = era5_long['region'].map(era5_to_db_region)
        selected = era5_long[era5_long['mapped_region'] == region].copy()

    if selected.empty:
        if wants_pan_arctic:
            available_regions = era5_long['region'].unique()
        else:
            available_regions = era5_long['mapped_region'].dropna().unique()
        raise ValueError(f"No data found for region '{region}'. Available regions: {sorted(available_regions)}")

    # Long -> wide: one column per '<variable>_<stat>' combination.
    selected['var_stat'] = selected['variable'] + '_' + selected['stat']
    pivoted = selected.pivot_table(
        index=['date', 'mapped_region'],
        columns='var_stat',
        values='value',
        aggfunc='first'
    )
    atmospheric_wide = pivoted.reset_index().rename(columns={'mapped_region': 'region'})
    # Drop the 'var_stat' label that the pivot leaves on the column index.
    atmospheric_wide.columns.name = None

    ice_extent = load_yearly_data_from_database(year, region)
    return pd.merge(atmospheric_wide, ice_extent, on=['date', 'region'], how='left')

In [17]:
# Smoke check: load the merged atmospheric + ice extent table for the pan-Arctic region in 2000.
test = load_data_for_year(2000, 'pan_arctic')


Merged dataset shape: (366, 27)
Columns: ['date', 'region', 'msl_mean', 'msl_p15', 'msl_p85', 'msl_std', 't2m_mean', 't2m_p15', 't2m_p85', 't2m_std', 'tp_mean', 'tp_p15', 'tp_p85', 'tp_std', 'u10_mean', 'u10_p15', 'u10_p85', 'u10_std', 'v10_mean', 'v10_p15', 'v10_p85', 'v10_std', 'wind_speed_mean', 'wind_speed_p15', 'wind_speed_p85', 'wind_speed_std', 'extent']

Sample merged data:
         date      region     msl_mean     msl_p15      msl_p85    msl_std  \
0  2000-01-01  pan_arctic  1008.869080  985.222260  1034.979431  21.064962   
1  2000-01-02  pan_arctic  1010.056396  989.708649  1034.662048  18.560568   
2  2000-01-03  pan_arctic  1012.788940  996.131491  1034.177856  17.217007   
3  2000-01-04  pan_arctic  1014.597534  998.398438  1036.304169  17.397156   
4  2000-01-05  pan_arctic  1016.404175  999.982178  1038.880463  17.708139   

    t2m_mean    t2m_p15   t2m_p85    t2m_std  ...   u10_std  v10_mean  \
0 -17.471605 -28.960815 -6.537964  10.528247  ...  5.381142  0.618569   
