# Hazard assessment for Infrastructures using Euro-Cordex datasets

## Calculation of the indicator "Percentiles of the temperature"

- See our [how to use risk workflows](https://handbook.climaax.eu/notebooks/workflows_how_to.html) page for information on how to run this notebook.

## Hazard assessment methodology
We utilized outputs from 14 models within the EURO-CORDEX framework to evaluate hazards affecting infrastructure. In this notebook we use percentiles of the daily maximum temperature as the indicator of the hazard. Our analysis included three Representative Concentration Pathways (RCPs): RCP2.6, RCP4.5, and RCP8.5. To structure the future projections, we divided the RCP scenarios into three distinct periods: 2021–2050, 2041–2070, and 2071–2100. Additionally, we used the historical period (1981–2010) as a baseline for comparison.

For each model and each period, we computed the 95th and 99.9th percentiles of the daily maximum temperature. We then calculated the ensemble average of these percentile fields across all 14 models, producing a single representative dataset for each timeframe, including both the historical and RCP scenarios.

Finally, we calculated the anomalies by subtracting the historical dataset values from each of the future scenario datasets to quantify changes. The results of these computations will be visualized in the notebook *05_cordexTempPercentiles_plots.ipynb* to assess potential future hazards.

## Preparation work

### Select area of interest
Before downloading the data, we will define the coordinates of the area of interest; for this workflow we selected Italy. Based on the shapefile of the country, we will be able to clip the datasets for further processing and display hazard and damage maps for this area.

### Load libraries

In [None]:
import os
import xarray as xr
import xclim
import re

from collections import defaultdict

### Create the directory structure

In [None]:
# Input root (one subfolder of NetCDF files per scenario) and output root for the indicators
nc_files = "/work/cmcc/dg07124/climax/data/cordex"
general_path = "/work/cmcc/dg07124/climax/indicators/cordex"

# Scenario subfolders expected under `nc_files`
subfolders = [
    'historical',
    'rcp26',
    'rcp45',
    'rcp85',
]

# Quantile levels (95th and 99.9th percentiles); kept as strings because they
# are also embedded verbatim in the output filenames
percentiles = ['0.95', '0.999']

# (start_year, end_year) windows: one baseline window for the historical run,
# three overlapping windows for the RCP projections
historical_time_range = [('1981', '2010')]
rcp_time_ranges = [
    ('2021', '2050'),
    ('2041', '2070'),
    ('2071', '2100'),
]

In [None]:
def process_file(file_path, percentile, save_path, start_year, end_year):
    """Compute a temporal percentile of daily maximum temperature and save it.

    The ``tasmax`` variable of the NetCDF file is restricted to the
    ``start_year``-``end_year`` window, shifted by -273.15 (Kelvin to degrees
    Celsius, assuming the input is in Kelvin), reduced to daily maxima and
    finally reduced along ``time`` with the requested quantile. The result is
    written to ``save_path`` as
    ``<input name>_p<percentile>_<start_year>-<end_year>.nc``.

    Parameters
    ----------
    file_path : str
        Path of the input NetCDF file; it must contain ``tasmax`` and ``time``.
    percentile : str or float
        Quantile level between 0 and 1 (e.g. ``'0.95'``). It is used verbatim
        in the output filename and converted to ``float`` for the computation.
    save_path : str
        Existing directory in which the result file is written.
    start_year, end_year : str
        Inclusive bounds of the time slice (e.g. ``'1981'``, ``'2010'``).

    Returns
    -------
    str
        Path of the NetCDF file that was written.
    """
    print("---------------------------------------------------")
    print(f"Processing {file_path} for time range {start_year}-{end_year} and {percentile}")

    # Build the output name up front: <input stem>_p<percentile>_<start>-<end>.nc
    file_name_no_ext = os.path.splitext(os.path.basename(file_path))[0]
    new_filename = f"{file_name_no_ext}_p{percentile}_{start_year}-{end_year}.nc"
    output_path = os.path.join(save_path, new_filename)

    # Open the input inside a context manager so the file handle is released
    # as soon as the result has been written (this function is called once per
    # file, percentile and time window).
    with xr.open_dataset(file_path) as ds:
        # Select daily max temperature for the given time range
        ds_sliced = ds.sel(time=slice(start_year, end_year))
        daily_max_temp = (ds_sliced['tasmax'] - 273.15).resample(time='D').max()
        daily_max_temp.attrs['units'] = 'C'

        # Report the value range of the input as a quick sanity check
        min_value = daily_max_temp.min(skipna=True).item()
        max_value = daily_max_temp.max(skipna=True).item()
        print(f"Temp min value: {min_value}")
        print(f"Temp max value: {max_value}")

        # Drop time steps that are NaN everywhere, then take the quantile
        # across all remaining time steps. The level is converted explicitly
        # so the computation does not rely on implicit string coercion.
        daily_max_temp_nonan = daily_max_temp.dropna(dim='time', how='all')
        calc_percentile = daily_max_temp_nonan.quantile(float(percentile), dim='time')

        # Save the result while the source file is still open
        calc_percentile.to_netcdf(output_path)

    # Report the value range of the computed percentile field
    min_value_indic = calc_percentile.min(skipna=True).item()
    max_value_indic = calc_percentile.max(skipna=True).item()
    print(f"Minimum percentile {percentile}: {min_value_indic}")
    print(f"Maximum percentile {percentile}: {max_value_indic}")

    print(f"Saved {new_filename} to {save_path}")

    return output_path  # Path of the processed file

In [None]:
# Compute the percentile files for every scenario subfolder
# (historical, rcp26, rcp45, rcp85)
for subfolder in subfolders:
    print(subfolder)
    folder_path = os.path.join(nc_files, subfolder)
    save_subfolder = os.path.join(general_path, 'tempPercentiles', subfolder)

    # Create the destination subfolder if it doesn't exist
    os.makedirs(save_subfolder, exist_ok=True)

    # The historical run has a single baseline window; the RCPs have three
    time_ranges = historical_time_range if subfolder == 'historical' else rcp_time_ranges

    # Output paths written for this subfolder, grouped by percentile
    processed_files_by_threshold = {percentile: [] for percentile in percentiles}

    # Sorted so the processing order (and the log) is reproducible;
    # os.listdir returns entries in arbitrary order
    for file in sorted(os.listdir(folder_path)):
        # Only NetCDF files are processed
        if not file.endswith('.nc'):
            continue
        file_path = os.path.join(folder_path, file)

        for percentile in percentiles:
            for start_year, end_year in time_ranges:
                print(f"Processing Percentile {percentile} for time range {start_year}-{end_year}")

                # Process the file and record where the result was saved
                processed_file_path = process_file(file_path, percentile, save_subfolder, start_year, end_year)
                processed_files_by_threshold[percentile].append(processed_file_path)

print("Percentile calculation complete!")

## Average the Cordex models

In [None]:
# Root folder that holds the per-model percentile files, one subfolder per scenario
base_dir = "/work/cmcc/dg07124/climax/indicators/cordex/tempPercentiles"

# Scenario subfolders to scan
subfolders = [
    'historical',
    'rcp26',
    'rcp45',
    'rcp85',
]

# Maps each subfolder to its files grouped by (threshold, time period);
# filled in by the grouping cell
all_file_groups = dict()

# Destination folder for the model-averaged results (created if missing)
averaged_results_dir = os.path.join(base_dir, 'avg_models')
os.makedirs(averaged_results_dir, exist_ok=True)

In [None]:
# Group the NetCDF files of every scenario subfolder by (threshold, time period)
for subfolder in subfolders:
    folder_path = os.path.join(base_dir, subfolder)

    # (threshold, time_period) -> list of file paths for this subfolder
    file_groups = defaultdict(list)

    for file_name in os.listdir(folder_path):
        if not file_name.endswith(".nc"):
            continue

        # Filenames end in "..._<threshold>_<period>.nc",
        # e.g. "..._p0.95_2021-2050.nc": the last two "_"-separated tokens
        # identify the group
        tokens = file_name.rsplit("_", 2)
        threshold = tokens[-2]
        time_period = tokens[-1].replace(".nc", "")

        file_groups[(threshold, time_period)].append(os.path.join(folder_path, file_name))

    all_file_groups[subfolder] = file_groups


# Print the grouping so it can be checked by eye before averaging
for subfolder, file_groups in all_file_groups.items():
    print(f"\nSubfolder: {subfolder}")
    for (threshold, time_period), file_paths in file_groups.items():
        print(f"  Group: Threshold = {threshold}, Time Period = {time_period}")
        for file_path in file_paths:
            print(f"    - {file_path}")

In [None]:
# Average the per-model percentile fields of every (threshold, time period)
# group in every scenario subfolder
for subfolder, file_groups in all_file_groups.items():
    for group_key, file_paths in file_groups.items():
        print("---------------------------------------")

        # Unpack the group key (threshold and time_period)
        threshold, time_period = group_key

        summed = None        # running sum of the valid 'tasmax' fields
        count = 0            # number of files that contributed to the sum
        invalid_files = []   # files whose 'tasmax' is NaN everywhere
        ref_lon = None       # lon/lat taken from the first valid file
        ref_lat = None

        for fp in file_paths:
            # Open one file at a time and close it right away, so that a
            # group never keeps all of its files open simultaneously
            with xr.open_dataset(fp) as ds:
                # Load into memory so the data stays usable after closing
                tasmax = ds['tasmax'].load()

                # Print min and max value for the current dataset
                min_value = tasmax.min().item()
                max_value = tasmax.max().item()
                print(f"File: {fp}")
                print(f"  Minimum value: {min_value}")
                print(f"  Maximum value: {max_value}")

                if bool(tasmax.notnull().any()):  # at least one valid value
                    if summed is None:
                        # Initialize the sum with the first valid dataset and
                        # remember its lon/lat for the output file
                        summed = tasmax.copy()
                        ref_lon = ds.lon.load()
                        ref_lat = ds.lat.load()
                    else:
                        summed += tasmax
                    count += 1
                else:
                    invalid_files.append(fp)

        print(f"Number of valid datasets {count}")

        # Print filenames that are fully NaN
        if invalid_files:
            print("Files with NaN values:")
            for invalid_file in invalid_files:
                print(f"  - {invalid_file}")

        # Compute the average across the datasets only if count > 0
        if count > 0:
            averaged = summed / count

            # Averaged results are stored per scenario subfolder
            subfolder_avg_dir = os.path.join(averaged_results_dir, subfolder)
            os.makedirs(subfolder_avg_dir, exist_ok=True)

            # Output name: <subfolder>_avg_<threshold>_<time_period>.nc
            output_filename = f"{subfolder}_avg_{threshold}_{time_period}.nc"
            output_path = os.path.join(subfolder_avg_dir, output_filename)

            averaged_ds = averaged.to_dataset(name='tasmax')

            # Print the value range of the averaged dataset
            min_value_avg = averaged_ds['tasmax'].min().item()
            max_value_avg = averaged_ds['tasmax'].max().item()
            print(f"  Averaged Dataset: Minimum value: {min_value_avg}")
            print(f"  Averaged Dataset: Maximum value: {max_value_avg}")

            # assign_coords returns a NEW dataset, so the result must be kept;
            # otherwise the lon/lat assignment has no effect on the saved file
            averaged_ds = averaged_ds.assign_coords({'lon': ref_lon, 'lat': ref_lat})
            averaged_ds.to_netcdf(output_path)  # Save the averaged dataset
            print(f"Averaged data saved to: {output_path}")
        else:
            print("No valid datasets found for this group; skipping averaging.")

## Subtract the historical dataset from the future scenarios

In [None]:
# Root folder of the temperature-percentile indicator
base_dir = "/work/cmcc/dg07124/climax/indicators/cordex/tempPercentiles"

# Model-averaged inputs: one subfolder per scenario, plus the historical baseline
scenario_dir = os.path.join(base_dir, 'avg_models')
historical_dir = os.path.join(base_dir, 'avg_models', 'historical')

# Destination of the difference files (created if missing)
output_folder = os.path.join(base_dir, 'subtracted_rcps')
os.makedirs(output_folder, exist_ok=True)

# Future scenarios to compare against the historical baseline
scenario_subfolders = [
    'rcp26',
    'rcp45',
    'rcp85',
]

In [None]:
       
# Compute the anomaly (scenario minus historical) for every averaged scenario file

# Index the historical files by their threshold tag once, instead of
# re-listing the directory for every scenario file. The tag is the
# second-to-last "_"-separated token, e.g. 'p0.95' in
# 'historical_avg_p0.95_1981-2010.nc'. setdefault keeps the first file seen
# per tag, matching a first-match search.
historical_by_threshold = {}
for hist_file in os.listdir(historical_dir):
    if hist_file.endswith(".nc"):
        historical_by_threshold.setdefault(hist_file.split("_")[-2], hist_file)

# Loop over each scenario subfolder (rcp26, rcp45, rcp85)
for scenario in scenario_subfolders:
    scenario_folder = os.path.join(scenario_dir, scenario)

    # Loop through each file in the scenario subfolder
    for scenario_file in os.listdir(scenario_folder):
        if not scenario_file.endswith(".nc"):
            continue

        # Extract the threshold tag from the scenario filename
        # (e.g. 'p0.95' from 'rcp26_avg_p0.95_2021-2050.nc')
        scenario_parts = scenario_file.split("_")
        scenario_threshold = scenario_parts[-2]

        # Find the historical file with the same threshold tag
        matching_historical_file = historical_by_threshold.get(scenario_threshold)
        if not matching_historical_file:
            print(f"Corresponding historical file not found for threshold: {scenario_threshold} in {scenario_file}")
            continue

        historical_file_path = os.path.join(historical_dir, matching_historical_file)

        print(f"Scenario file: {scenario_file}")
        print(f"Historical file: {matching_historical_file}")

        # Open both datasets in context managers so the file handles are
        # released after each subtraction
        with xr.open_dataset(os.path.join(scenario_folder, scenario_file)) as scenario_ds:
            with xr.open_dataset(historical_file_path) as historical_ds:
                # Both datasets must contain 'tasmax' to be comparable
                if 'tasmax' not in scenario_ds or 'tasmax' not in historical_ds:
                    print("Variable 'tasmax' not found in one of the datasets.")
                    continue

                # Subtract historical from scenario
                diff = scenario_ds['tasmax'] - historical_ds['tasmax']

                # Output name: diff_<scenario>_<tag without dots>_<time period>.nc
                scenario_time_period = scenario_parts[-1].replace(".nc", "")
                percentile_thresh = scenario_threshold.replace('.', '')
                diff_filename = f"diff_{scenario}_{percentile_thresh}_{scenario_time_period}.nc"
                diff_filepath = os.path.join(output_folder, diff_filename)

                # Attach the historical lon/lat to the difference field
                diff = diff.assign_coords({'lon': historical_ds.lon, 'lat': historical_ds.lat})

                # Write while the inputs are still open
                diff.to_dataset(name='tasmax').to_netcdf(diff_filepath)

                print(f"Difference saved to: {diff_filepath}")

## Contributors
- Giuseppe Giugliano (giuseppe.giugliano@cmcc.it)
- Carmela de Vivo (carmela.devivo@cmcc.it)
- Daniela Quintero (daniela.quintero@cmcc.it)