In [45]:
import pandas as pd
import numpy as np
import os
import re
import shutil
import xarray as xr
from scipy.spatial import cKDTree

### Hazard Extraction for UERRA Datasets

### Hazard Extraction for temperature extremes

In [2]:
# Step 1: Load the coordinates of the airports from the Excel file
csv_file = '/users_home/cmcc/dg07124/climax/airports_coordinates.csv'
airports_df = pd.read_csv(csv_file)

# List all the netCDF files of temperature in the 'uerra' subfolder
netcdf_files = [
    "NumbDaysAbove35.nc", "NumbDaysAbove40.nc", "NumbDaysAbove45.nc", 
    "Temp_P95.nc", "Temp_P999.nc"]

netcdf_folder = '/work/cmcc/dg07124/climax/indicators/uerra'
output_df = r'/work/cmcc/dg07124/climax/indicators/hazards_csv'

# Check if output_maps exists, if not, create it
if not os.path.exists(output_df):
    os.makedirs(output_df)
    
# Step 1: Create an empty DataFrame to store the results
# final_df = pd.DataFrame(index=airports_df['Airports'])  # Use airport names as the index

In [3]:
airports_df

Unnamed: 0,Airports,Lat,Lon
0,Milano Malpensa,45.63,8.73
1,Bergamo Orio al Serio,45.67,9.71
2,Milano Linate,45.45,9.28
3,Roma Fiumicino,41.8,12.25
4,Roma Ciampino,41.8,12.59
5,Napoli Capodichino,40.88,14.29
6,Palermo Punta Raisi,38.18,13.1
7,Catania Fontanarossa,37.47,15.07
8,Cagliari Elmas,39.25,9.06


In [5]:
# Step 1: Start with the airports DataFrame
Hazard_temp_df = airports_df.copy()  # Make a copy to keep the original columns
Hazard_temp_df = Hazard_temp_df.drop(columns=['Lat', 'Lon'])

# Step 2: Loop over each airport and extract values for each netCDF file
for index, row in airports_df.iterrows():
    airport_name = row['Airports']
    lat_airport = row['Lat']
    lon_airport = row['Lon']
    
    airport_coords = (lat_airport, lon_airport)
    
    # Loop through each netCDF file
    for netcdf_file in netcdf_files:
        # Path to the current netCDF file
        nc_file_path = os.path.join(netcdf_folder, netcdf_file)
        
        # Open the netCDF file using xarray
        ds = xr.open_dataset(nc_file_path)
        
        # Step 3: Extract the 2D latitudes and longitudes from the netCDF file using dimensions y, x
        lat = ds['latitude'].values  # Access the latitude variable
        lon = ds['longitude'].values  # Access the longitude variable

        # Flatten the 2D arrays to 1D for KDTree usage
        lat_flat = lat.flatten()
        lon_flat = lon.flatten()
        
        # Create a 2D array of lat/lon coordinates
        coords_flat = np.vstack((lat_flat, lon_flat)).T  # Shape: (n, 2)

        # Create a KDTree for fast nearest-neighbor search
        tree = cKDTree(coords_flat)
        
        # Step 4: Find the closest coordinates from the netCDF lat/lon grid for the current airport
        closest_idx = tree.query([airport_coords], k=1)[1]  # Get index of closest point
        
        # Step 5: Convert the flattened index to (y, x) coordinates
        closest_y, closest_x = np.unravel_index(closest_idx, lat.shape)
        
        # Step 6: Dynamically select and extract the relevant variable from the netCDF
        # print(f"Available variables in {netcdf_file}: {ds.variables}")
        
        # Example logic to handle different types of variables
        if 'tx_days_above' in ds.variables:
            variable_name = 'tx_days_above'
        elif 't2m' in ds.variables:
            variable_name = 't2m'
        else:
            raise ValueError(f"Unknown variable type in {netcdf_file}")
        
        # Step 7: Extract the value for the chosen variable at the closest lat/lon index
        extracted_value = ds[variable_name].isel(y=closest_y, x=closest_x).values
        
        # Step 8: Add the extracted value to the DataFrame for this airport
        column_name = f"{os.path.splitext(netcdf_file)[0]}"  # e.g., "NumbDaysAbove30"
        
        # Add the extracted value to the DataFrame
        Hazard_temp_df.loc[index, column_name] = extracted_value[0]  # [0] to get the scalar value, not an arra


In [6]:
# Calculate the average for each airport
Hazard_temp_df = Hazard_temp_df.set_index('Airports')
Hazard_temp_df['average'] = Hazard_temp_df.mean(axis=1)
Hazard_temp_df

Unnamed: 0_level_0,NumbDaysAbove35,NumbDaysAbove40,NumbDaysAbove45,Temp_P95,Temp_P999,average
Airports,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Milano Malpensa,0.0,0.0,0.0,29.125305,33.314549,12.487971
Bergamo Orio al Serio,0.433333,0.0,0.0,29.815527,35.204168,13.090606
Milano Linate,0.533333,0.0,0.0,30.817786,35.254721,13.321168
Roma Fiumicino,0.166667,0.0,0.0,28.725146,33.325572,12.443477
Roma Ciampino,3.4,0.0,0.0,31.914014,37.625475,14.587898
Napoli Capodichino,3.433333,0.0,0.0,32.37041,37.54934,14.670617
Palermo Punta Raisi,1.266667,0.1,0.0,29.616589,37.131341,13.622919
Catania Fontanarossa,8.933333,1.133333,0.2,33.401172,43.414801,17.416528
Cagliari Elmas,6.7,0.266667,0.0,32.359521,39.712394,15.807716


In [None]:
# Optionally: Save the resulting DataFrame to a CSV or Excel file
Hazard_temp_df.to_csv('/work/cmcc/dg07124/climax/indicators/hazard_uerra_temp.csv', index=True)

### Hazard Extraction for Extreme Precipitation

In [18]:
#  List all the netCDF files of precipitation in the 'uerra' subfolder
netcdf_files_precip = ["Precip_P99.nc", "Precip_P999.nc","return_levels_gumbel.nc"]
netcdf_folder = '/work/cmcc/dg07124/climax/indicators/uerra'

In [19]:
# Step 1: Start with the airports DataFrame
Hazard_prep_df = airports_df.copy()  # Make a copy to keep the original columns
Hazard_prep_df = Hazard_prep_df.drop(columns=['Lat', 'Lon'])

# Step 2: Loop over each airport and extract values for each netCDF file
for index, row in airports_df.iterrows():
    airport_name = row['Airports']
    lat_airport = row['Lat']
    lon_airport = row['Lon']
    
    airport_coords = (lat_airport, lon_airport)
    
    # Loop through each netCDF file
    for netcdf_file in netcdf_files_precip:
        # print(netcdf_file)
        # Path to the current netCDF file
        nc_file_path = os.path.join(netcdf_folder, netcdf_file)
        
        # Open the netCDF file using xarray
        ds = xr.open_dataset(nc_file_path)
        
        # Step 3: Extract the 2D latitudes and longitudes from the netCDF file using dimensions y, x
        lat = ds['latitude'].values  # Access the latitude variable
        lon = ds['longitude'].values  # Access the longitude variable

        # Flatten the 2D arrays to 1D for KDTree usage
        lat_flat = lat.flatten()
        lon_flat = lon.flatten()
        
        # Create a 2D array of lat/lon coordinates
        coords_flat = np.vstack((lat_flat, lon_flat)).T  # Shape: (n, 2)

        # Create a KDTree for fast nearest-neighbor search
        tree = cKDTree(coords_flat)
        
        # Step 4: Find the closest coordinates from the netCDF lat/lon grid for the current airport
        closest_idx = tree.query([airport_coords], k=1)[1]  # Get index of closest point
        
        # Step 5: Convert the flattened index to (y, x) coordinates
        closest_y, closest_x = np.unravel_index(closest_idx, lat.shape)
        
        # Handle the variable extraction
        if 'tp' in ds.variables:
            variable_name = 'tp'
            # Step 7: Extract the value for the 'tp' variable at the closest lat/lon index
            extracted_value = ds[variable_name].isel(y=closest_y, x=closest_x).values
            # print(f"Extracted value for 'tp' from {netcdf_file}: {extracted_value}")
            
            # Get the scalar value from the 2D array
            scalar_value = extracted_value.item()
            
            # Step 8: Add the extracted value to the DataFrame for this airport
            column_name = f"{os.path.splitext(netcdf_file)[0]}"  # e.g., ""
            # Hazard_prep_df.loc[index, column_name] = extracted_value[0]  # [0] to get the scalar value, not an array
            Hazard_prep_df.loc[index, column_name] = scalar_value
                
        elif 'return_levels_gumbel.nc'== netcdf_file:  # Special case for return levels
            # Loop over each return level and extract the values
            return_levels = ['return_period_10_y', 'return_period_20_y', 'return_period_30_y', 'return_period_50_y', 'return_period_100_y', 'return_period_150_y']
            for level in return_levels:
                # Step 7: Extract the value for each return level variable
                if level in ds.variables:
                    extracted_value = ds[level].isel(y=closest_y, x=closest_x).values
                    # print(f"Extracted value for {level} from {netcdf_file}: {extracted_value}")
                    
                    # Get the scalar value from the 2D array
                    scalar_value = extracted_value.item()  # Use .item() to get the scalar value
                    
                    # Step 8: Add the extracted value to the DataFrame for this airport
                    column_name = f"{level}"  # e.g., "10y_return_level_NumbDaysAbove30"
                    Hazard_prep_df.loc[index, column_name] = scalar_value
                else:
                    raise ValueError(f"Unknown variable type in {netcdf_file}")    

In [20]:
# Display the DataFrame
Hazard_prep_df = Hazard_prep_df.set_index('Airports')
Hazard_prep_df['average'] = Hazard_temp_df.mean(axis=1)
display(Hazard_prep_df)

Unnamed: 0_level_0,Precip_P99,Precip_P999,return_period_10_y,return_period_20_y,return_period_30_y,return_period_50_y,return_period_100_y,return_period_150_y,average
Airports,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Milano Malpensa,49.335938,96.395313,117.986531,131.458017,139.207831,148.895475,161.962396,169.582033,12.487971
Bergamo Orio al Serio,33.169922,51.742188,65.793917,72.556649,76.447082,81.310312,87.86995,91.695034,13.090606
Milano Linate,35.072266,64.171484,82.442278,91.711901,97.044486,103.710476,112.70172,117.944731,13.321168
Roma Fiumicino,26.423828,58.964844,78.221836,89.645754,96.217653,104.432845,115.513688,121.975196,12.443477
Roma Ciampino,25.951172,49.223047,63.375103,71.791038,76.632517,82.684602,90.847796,95.607951,14.587898
Napoli Capodichino,33.529297,82.900781,85.480144,97.357128,104.189663,112.730665,124.250968,130.968734,14.670617
Palermo Punta Raisi,39.536133,97.288281,105.600625,123.070083,133.119832,145.682506,162.627334,172.508272,13.622919
Catania Fontanarossa,25.761719,67.757031,87.909041,101.170601,108.799649,118.33633,131.199628,138.700528,17.416528
Cagliari Elmas,17.943359,36.934375,50.436781,57.035887,60.832191,65.577754,71.978681,75.711216,15.807716


In [21]:
# Save the DataFrame to a CSV file
Hazard_prep_df.to_csv('/work/cmcc/dg07124/climax/indicators/hazard_uerra_precip.csv', index=True)

## Hazard extraction for Cordex Datasets

## Extreme Temperature Hazards

In [4]:
# Temperature indicators folders for the extreme temperature hazards
temp_daysAbove_folder = '/work/cmcc/dg07124/climax/indicators/cordex2/tempdays/tempDaysAbove/averaged_ensembles'
temp_percentiles_folder = '/work/cmcc/dg07124/climax/indicators/cordex2/tempPercentiles/averaged_ensembles'

temp_avg_ensembles_folder = '/work/cmcc/dg07124/climax/indicators/cordex2/temp_avg_ensembles'

In [5]:
if not os.path.exists(temp_avg_ensembles_folder):
    os.makedirs(temp_avg_ensembles_folder)

# Step 2: Copy files from temp_daysAbove_folder and temp_percentiles_folder to temp_avg_ensembles_folder
def copy_files_to_new_folder(source_folder, destination_folder):
    for file_name in os.listdir(source_folder):
        source_file = os.path.join(source_folder, file_name)
        destination_file = os.path.join(destination_folder, file_name)
        
        if os.path.isfile(source_file):
            shutil.copy(source_file, destination_file)  # Copy file

# Copy files from both folders
copy_files_to_new_folder(temp_daysAbove_folder, temp_avg_ensembles_folder)
copy_files_to_new_folder(temp_percentiles_folder, temp_avg_ensembles_folder)

In [6]:
# Define time periods and RCP scenarios
time_periods = ['2021-2050', '2041-2070', '2071-2100']
rcp_scenarios = ['rcp26', 'rcp45', 'rcp85']

In [7]:
# Create the KDTree for fast nearest-neighbor search
def create_kd_tree(latitudes, longitudes):
    coords = np.vstack((latitudes, longitudes)).T  # Shape (n, 2)
    tree = cKDTree(coords)
    return tree

In [8]:
# Function to process the netCDF files for each time period and RCP scenario
def process_netCDF_files(netcdf_folder, time_period, rcp_scenario, airports_df, tree):
    # Initialize the list to hold the extracted values (one row for each airport)
    extracted_values = []

    # Define columns for the indicators (fixed set of columns)
    columns = ['Airports', 'p95', 'p999', 'tempabove35', 'tempabove40', 'tempabove45']

    # Initialize the dictionary to hold values for each airport
    for _, row in airports_df.iterrows():
        extracted_values_airports = {'Airports': row['Airports']}
        for col in columns[1:]:
            extracted_values_airports[col] = np.nan  # Initialize all indicators to NaN for this airport

        # Loop through each netCDF file and check if it belongs to the given time period and RCP scenario
        for file in os.listdir(netcdf_folder):
            if file.endswith('.nc') and time_period in file and rcp_scenario in file:
                # print(f"Processing file: {file}")

                # Open the netCDF file using xarray
                nc_file_path = os.path.join(netcdf_folder, file)
                ds = xr.open_dataset(nc_file_path)

                # Extract the lat/lon from the netCDF file
                lat = ds['lat'].values
                lon = ds['lon'].values

                # Flatten the 2D arrays to 1D for KDTree usage
                lat_flat = lat.flatten()
                lon_flat = lon.flatten()

                # Create a 2D array of lat/lon coordinates
                coords_flat = np.vstack((lat_flat, lon_flat)).T  # Shape: (n, 2)

                # Create a KDTree for fast nearest-neighbor search
                tree = cKDTree(coords_flat)

                # Extract the coordinates for each airport
                airport_coords = (row['Lat'], row['Lon'])

                # Find the closest coordinates from the netCDF lat/lon grid for the current airport
                closest_idx = tree.query([airport_coords], k=1)[1]  # Get index of closest point
                
                # Convert the flattened index to (y, x) coordinates
                closest_y, closest_x = np.unravel_index(closest_idx, lat.shape)

                # Extract the relevant variable value for each indicator
                extracted_value = ds['tasmax'].isel(y=closest_y, x=closest_x).values

                # Check the indicator based on the filename and update the corresponding column
                if 'tempabove35' in file:
                    extracted_values_airports['tempabove35'] = extracted_value.flatten()[0]
                elif 'tempabove40' in file:
                    extracted_values_airports['tempabove40'] = extracted_value.flatten()[0]
                elif 'tempabove45' in file:
                    extracted_values_airports['tempabove45'] = extracted_value.flatten()[0]
                elif 'p95' in file:
                    extracted_values_airports['p95'] = extracted_value.flatten()[0]
                elif 'p999' in file:
                    extracted_values_airports['p999'] = extracted_value.flatten()[0]
                    

        # Add the extracted values for this airport to the list
        extracted_values.append(extracted_values_airports)

    # Convert the extracted values to a DataFrame
    df = pd.DataFrame(extracted_values, columns=columns)
    return df

In [9]:
# Create a KDTree using airport coordinates
tree = create_kd_tree(airports_df['Lat'], airports_df['Lon'])

# Process the files for each time period and RCP scenario and create separate DataFrames
dfs = {}  # Dictionary to store DataFrames for each time period + RCP combination

for time_period in time_periods:
    for rcp_scenario in rcp_scenarios:
        # Process the files for the given time period and RCP scenario
        df = process_netCDF_files(temp_avg_ensembles_folder, time_period, rcp_scenario, airports_df, tree)
        if df is not None:
            dfs[f'{rcp_scenario}_{time_period}'] = df

# Display the DataFrames for each time period and RCP scenario
for key, df in dfs.items():
    print(f"Data for {key}:")
    df = df.set_index('Airports')
    # df['average'] = df.mean(axis=1)
    print(df)
    # save to CSV
    df.to_csv(f'/work/cmcc/dg07124/climax/indicators/cordex2/temp_avg_ensembles/temp_{key}_data.csv', index=True)

Data for rcp26_2021-2050:
                            p95      p999  tempabove35  tempabove40  \
Airports                                                              
Milano Malpensa        1.468656  1.341404     3.209444     0.360556   
Bergamo Orio al Serio  1.479139  1.303156     4.001111     0.138333   
Milano Linate          1.434630  1.373967     5.490556     0.450000   
Roma Fiumicino         1.403533  1.540112     5.499444     0.179444   
Roma Ciampino          1.445868  1.552179     6.056111     0.215556   
Napoli Capodichino     1.408447  1.337206     3.443333     0.222778   
Palermo Punta Raisi    1.382173  1.396151     2.182222     0.192222   
Catania Fontanarossa   1.396419  1.468293    10.577778     1.347222   
Cagliari Elmas         1.283076  1.732129     9.347778     0.867778   

                       tempabove45  
Airports                            
Milano Malpensa          -0.000556  
Bergamo Orio al Serio     0.000000  
Milano Linate             0.016111  
Roma Fi

## Example of the output for the hazard of extrem temperature for the scenario RCP 2.6 and time period 2021-2050

In [11]:
df = pd.read_csv('/work/cmcc/dg07124/climax/indicators/cordex2/temp_avg_ensembles/temp_rcp26_2021-2050_data.csv', index_col='Airports')
df

Unnamed: 0_level_0,p95,p999,tempabove35,tempabove40,tempabove45
Airports,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Milano Malpensa,1.468656,1.341404,3.209444,0.360556,-0.000556
Bergamo Orio al Serio,1.479139,1.303156,4.001111,0.138333,0.0
Milano Linate,1.43463,1.373967,5.490556,0.45,0.016111
Roma Fiumicino,1.403533,1.540112,5.499444,0.179444,0.0
Roma Ciampino,1.445868,1.552179,6.056111,0.215556,0.0
Napoli Capodichino,1.408447,1.337206,3.443333,0.222778,0.03
Palermo Punta Raisi,1.382173,1.396151,2.182222,0.192222,0.0
Catania Fontanarossa,1.396419,1.468293,10.577778,1.347222,0.063889
Cagliari Elmas,1.283076,1.732129,9.347778,0.867778,0.005556


In [15]:
# Define the folder path where your CSV files are stored
csv_folder_path = '/work/cmcc/dg07124/climax/indicators/cordex2/temp_avg_ensembles'

# List all the CSV files in the folder
csv_files = [f for f in os.listdir(csv_folder_path) if f.endswith('.csv')]
csv_files

['temp_rcp45_2071-2100_data.csv',
 'temp_rcp45_2021-2050_data.csv',
 'temp_rcp85_2021-2050_data.csv',
 'temp_rcp26_2041-2070_data.csv',
 'temp_rcp26_2021-2050_data.csv',
 'temp_rcp85_2071-2100_data.csv',
 'temp_rcp85_2041-2070_data.csv',
 'temp_rcp26_2071-2100_data.csv',
 'temp_rcp45_2041-2070_data.csv']

In [16]:
# Function to normalize a DataFrame
def normalize_dataframe(df):
    # Normalize the DataFrame using Min-Max scaling along the columns (axis=0)
    df_normalized = df.apply(lambda x: (x - x.min()) / (x.max() - x.min()), axis=0)
    
    # Calculate the average along the rows (Airports)
    df_normalized['average'] = df_normalized.mean(axis=1)
    
    return df_normalized

In [18]:
# Loop through each CSV file, process and normalize
for csv_file in csv_files:
    # Read the CSV file
    file_path = os.path.join(csv_folder_path, csv_file)
    df = pd.read_csv(file_path, index_col='Airports')  # Assuming 'Airports' is the index column

    # Print the original DataFrame
    print(f"Original Data for {csv_file}:")
    print(df)
    
    # Normalize the DataFrame
    df_normalized = normalize_dataframe(df)
    
    # Print the normalized DataFrame
    print(f"Normalized Data for {csv_file}:")
    print(df_normalized)
    
    # Optionally, save the normalized DataFrame to a new CSV file
    normalized_file_path = os.path.join(csv_folder_path, f"temp_normalized_{csv_file}")
    df_normalized.to_csv(normalized_file_path)
    
    print("-" * 50)  # Separator for clarity between files

Original Data for temp_rcp45_2071-2100_data.csv:
                            p95      p999  tempabove35  tempabove40  \
Airports                                                              
Milano Malpensa        2.824759  3.187580     8.071609     1.139291   
Bergamo Orio al Serio  2.832634  3.163505     9.351973     0.904042   
Milano Linate          2.784055  3.213331    11.884521     1.870307   
Roma Fiumicino         2.487909  2.370082    11.162663     0.629061   
Roma Ciampino          2.599896  2.611216    12.224598     0.840460   
Napoli Capodichino     2.489651  2.634704     8.489119     0.706973   
Palermo Punta Raisi    2.528859  2.706914     5.504732     0.614828   
Catania Fontanarossa   2.615941  3.063626    21.375766     4.254598   
Cagliari Elmas         2.561815  2.961145    19.176130     2.865192   

                       tempabove45  
Airports                            
Milano Malpensa           0.056916  
Bergamo Orio al Serio     0.011398  
Milano Linate        