![logo](./img/LogoLine_horizon_C3S.png)

<br>

# Merge Reanalysis (ERA5) and Hindcast (SEAS5) for modelling

### About


The notebook has the following outline:
* 1 - Join hindcast files (bias corrected): tp and t2m for each month, year and member
* 2 - Merge ERA5 with hindcast files (ERA5: warm up period=5 years)
* 3 - Combine datasets of subcatchments (C1 and C2) for modelling (PERSiST)

### Import packages

In [1]:
# Miscellaneous operating system interfaces
import os

# Libraries for working with multi-dimensional arrays
import numpy as np
import xarray as xr
import pandas as pd
import scipy

# Libraries for plotting and geospatial data visualisation
import matplotlib.path as mpath
import matplotlib.pyplot as plt

import cartopy.crs as ccrs
from cartopy.mpl.gridliner import LONGITUDE_FORMATTER, LATITUDE_FORMATTER
import cartopy.feature as cfeature

# To work with data labels in dictionary format
from collections import OrderedDict

# Date and time related libraries
from dateutil.relativedelta import relativedelta
from calendar import monthrange
import datetime

import glob

# 1 - Join precipitation and temperature hindcast files

In [11]:
# Subcatchment processed by section 1. The value is interpolated into the
# input/output folder names and into every hindcast file name below.
reach = 'C1' #reach1=C1, reach2=C2

In [12]:
# Folder holding the separate tp and t2m hindcast CSV files of the selected reach.
# NOTE: absolute, machine-specific path; adapt it before running elsewhere.
input_path = f'C:/Users/apedregal/Documents/inventWater_docs/Modelling/Seasonal forecasts/seasonal/hindcast_bc/hindcast_bc_{reach}'
# Folder that receives one joined (Precipitation + Temperature) CSV per member/year/month.
output_path = f'C:/Users/apedregal/Documents/inventWater_docs/Modelling/Seasonal forecasts/seasonal/hindcast_bc_joined/hindcast_joined_{reach}'

In [13]:
# Function to generate temperature filenames
def generate_temperature_filename(member, year, month, reach_id=None):
    """Return the file name of the t2m hindcast CSV for one member/year/month.

    Parameters
    ----------
    member, year, month : int
        Indices exactly as they appear in the file name.
    reach_id : str, optional
        Subcatchment tag to put in the name (e.g. 'C1'). When omitted, the
        notebook-level ``reach`` variable is used, as before.

    Returns
    -------
    str
        File name only (no directory).
    """
    if reach_id is None:
        # Fall back to the notebook-level setting; note it is looked up at
        # call time, so it reflects whatever ``reach`` currently holds.
        reach_id = reach
    return f"hind_{reach_id}_t2m_member{member}_year{year}_month{month}.csv"

# Function to generate precipitation filenames
def generate_precipitation_filename(member, year, month, reach_id=None):
    """Return the file name of the tp hindcast CSV for one member/year/month.

    Parameters
    ----------
    member, year, month : int
        Indices exactly as they appear in the file name.
    reach_id : str, optional
        Subcatchment tag to put in the name (e.g. 'C1'). When omitted, the
        notebook-level ``reach`` variable is used, as before.

    Returns
    -------
    str
        File name only (no directory).
    """
    if reach_id is None:
        # Fall back to the notebook-level setting; note it is looked up at
        # call time, so it reflects whatever ``reach`` currently holds.
        reach_id = reach
    return f"hind_{reach_id}_tp_member{member}_year{year}_month{month}.csv"

# Index ranges used in the hindcast file names
years = range(0, 24)   # year index 0..23 (24 years)
months = range(0, 12)  # month index 0..11 (January to December)

# DataFrame.to_csv does not create missing folders, so make sure the
# destination exists before the first file is written.
os.makedirs(output_path, exist_ok=True)

# Join the tp and t2m file of every member/year/month into one two-column file
for member in range(25):
    for year in years:
        for month in months:
            # File names and full paths of the two inputs
            temperature_filename = generate_temperature_filename(member, year, month)
            precipitation_filename = generate_precipitation_filename(member, year, month)
            temperature_filepath = os.path.join(input_path, temperature_filename)
            precipitation_filepath = os.path.join(input_path, precipitation_filename)

            # Both files are required; report and skip incomplete pairs
            if not (os.path.exists(temperature_filepath) and os.path.exists(precipitation_filepath)):
                print(f"Skipping non-existing file pair: {temperature_filename} and {precipitation_filename}")
                continue

            # The inputs are read as header-less, single-column files
            temperature_df = pd.read_csv(temperature_filepath, header=None, names=['Temperature'])
            precipitation_df = pd.read_csv(precipitation_filepath, header=None, names=['Precipitation'])

            # Side-by-side join on the row index: Precipitation first, then Temperature
            merged_df = pd.concat([precipitation_df, temperature_df], axis=1)

            # Written space-separated and WITH a header row ('Precipitation Temperature')
            output_filename = f"hind_{reach}_joined_member{member}_year{year}_month{month}.csv"
            merged_df.to_csv(os.path.join(output_path, output_filename), sep=' ', index=False)

# 2 - Merge ERA5 with hindcast files (ERA5 warm-up period: 1 or 5 years)

In [14]:
# Subcatchment processed by section 2. This overrides the value set in section 1;
# it selects the reanalysis file, the joined-hindcast folder and the output file names.
# NOTE(review): section 3 only combines files that exist for both C1 and C2, so this
# section presumably has to be run once per reach — confirm.
reach='C2' #Subcatchments, reach1=C1, reach2=C2

In [15]:
#Set path directories
# NOTE: absolute, machine-specific paths; adapt them before running elsewhere.
# Folder containing the daily reanalysis CSV (reanalysis_daily_{reach}_all_withDates.csv)
path_reanalysis = 'C:/Users/apedregal/Documents/inventWater_docs/Modelling/Seasonal forecasts/reanalysis'
# Folder containing the joined hindcast files of the selected reach
path_joined = f'C:/Users/apedregal/Documents/inventWater_docs/Modelling/Seasonal forecasts/seasonal/hindcast_bc_joined/hindcast_joined_{reach}'
# Folder that receives the merged (reanalysis warm-up + hindcast) files; shared by all reaches
path_merged = 'C:/Users/apedregal/Documents/inventWater_docs/Modelling/Seasonal forecasts/seasonal/hindcast_bc_merged'

In [16]:
def extract_date(year_numeric, month_numeric, base_year=1993):
    """Return the first day of the period covered by a 'joined' hindcast file.

    Parameters
    ----------
    year_numeric : int
        Zero-based year index from the file name; 0 corresponds to ``base_year``.
    month_numeric : int
        Zero-based month index from the file name (0 = January ... 11 = December).
    base_year : int, optional
        Calendar year that year index 0 stands for. Defaults to 1993.

    Returns
    -------
    pandas.Timestamp
        The first day of the requested month.

    Raises
    ------
    ValueError
        If ``month_numeric`` is outside 0..11.
    """
    # Reject bad month indices explicitly instead of handing a malformed
    # date string such as '1993-13-01' to the date parser.
    if not 0 <= month_numeric <= 11:
        raise ValueError(f"month_numeric must be in 0..11, got {month_numeric!r}")

    year = base_year + year_numeric
    # File names count months from 0, calendar months count from 1
    return pd.to_datetime(f'{year}-{month_numeric + 1}-01')

def get_reanalysis_data_for_period(reanalysis_df, start_date, warmup_years=5):
    """Return the tp/t2m reanalysis rows of the warm-up window preceding ``start_date``.

    The window runs from ``start_date - warmup_years`` (inclusive) to the day
    before ``start_date`` (inclusive).

    Parameters
    ----------
    reanalysis_df : pandas.DataFrame
        Must contain the columns 'time', 'tp' and 't2m'. 'time' may hold
        datetimes or anything ``pandas.to_datetime`` can parse. The frame is
        not modified.
    start_date : str or datetime-like
        First day of the hindcast period; the warm-up window ends the day before.
    warmup_years : int, optional
        Length of the warm-up window in years. Defaults to 5.

    Returns
    -------
    pandas.DataFrame
        The selected rows with only the 'tp' and 't2m' columns, keeping the
        original row index.
    """
    start_date = pd.to_datetime(start_date)
    window_start = start_date - pd.DateOffset(years=warmup_years)
    window_end = start_date - pd.DateOffset(days=1)

    # Parse into a local Series instead of assigning back to the column, so the
    # caller's DataFrame is left untouched even when this is called in a loop.
    times = pd.to_datetime(reanalysis_df['time'])

    in_window = (times >= window_start) & (times <= window_end)
    return reanalysis_df.loc[in_window, ['tp', 't2m']]  # Keep only tp and t2m columns

# Load reanalysis data (whitespace-separated file)
df_reanalysis = pd.read_csv(f'{path_reanalysis}/reanalysis_daily_{reach}_all_withDates.csv', sep=r'\s+')

# Convert time column to datetime format once, before the loop
df_reanalysis['time'] = pd.to_datetime(df_reanalysis['time'])

# Index ranges used in the joined hindcast file names
members = range(25)
years = range(24)
months = range(12)

# DataFrame.to_csv does not create missing folders, so make sure the
# destination exists before the first file is written.
os.makedirs(path_merged, exist_ok=True)

for member in members:
    for year in years:
        for month in months:
            # Joined hindcast file for this member/year/month
            csv_file = f"{path_joined}/hind_{reach}_joined_member{member}_year{year}_month{month}.csv"

            # Combinations without a joined file are silently skipped
            if not os.path.exists(csv_file):
                continue

            # First day of the hindcast period
            date = extract_date(year, month)

            # Reanalysis rows of the warm-up window that ends the day before `date`
            df_era5 = get_reanalysis_data_for_period(df_reanalysis, date)

            # Load hindcast data and ensure correct data types for relevant columns
            df_hind = pd.read_csv(csv_file, sep=r'\s+', dtype={'Precipitation': float, 'Temperature': float})

            if 'Precipitation Temperature' in df_hind.columns:
                # Both values ended up in one combined column: split it into tp and t2m
                df_hind[['tp', 't2m']] = df_hind['Precipitation Temperature'].str.split(expand=True)
                df_hind['tp'] = df_hind['tp'].astype(float)
                df_hind['t2m'] = df_hind['t2m'].astype(float)
                df_hind.drop(columns=['Precipitation Temperature'], inplace=True)
            elif 'Precipitation' in df_hind.columns and 'Temperature' in df_hind.columns:
                # Rename to the column names used by the reanalysis data
                df_hind = df_hind.rename(columns={'Precipitation': 'tp', 'Temperature': 't2m'})

            # Same two columns, in the same order, for both parts
            df_era5 = df_era5[['tp', 't2m']]
            df_hind = df_hind[['tp', 't2m']]

            # Warm-up rows first, hindcast rows after
            df_merged = pd.concat([df_era5, df_hind], axis=0, ignore_index=True)

            # Written space-separated with no header and no index
            output_file = os.path.join(path_merged, f"hind_merged_{reach}_member{member}_year{year}_month{month}.csv")
            df_merged.to_csv(output_file, index=False, header=False, sep=" ")

print("Processing complete.")

Processing complete.


# 3 - Combine datasets of subcatchments (C1 and C2) for modelling (PERSiST)

In [17]:
# Folder with the merged (warm-up + hindcast) files written by section 2.
# NOTE: these assignments replace the section 1 values of input_path / output_path,
# and the paths are absolute and machine-specific.
input_path = 'C:/Users/apedregal/Documents/inventWater_docs/Modelling/Seasonal forecasts/seasonal/hindcast_bc_merged'
# Folder that receives the combined C1 + C2 files
output_path = 'C:/Users/apedregal/Documents/inventWater_docs/Modelling/Seasonal forecasts/seasonal/hindcast_bc_persist_input'

In [18]:
# Find all CSV files in the input directory
csv_files = glob.glob(os.path.join(input_path, '*.csv'))

# Group the files by their name with the reach tag removed, so the C1 and C2
# file of the same member/year/month end up under the same key.
file_pairs = {}
for file in csv_files:
    filename = os.path.basename(file)
    for reach_id in ('C1', 'C2'):  # C1 is tested first, C2 only if C1 is absent
        if reach_id in filename:
            key = filename.replace(reach_id, '')
            file_pairs.setdefault(key, {})[reach_id] = file
            break

# DataFrame.to_csv does not create missing folders, so make sure the
# destination exists before the first file is written.
os.makedirs(output_path, exist_ok=True)

# Process each complete pair of files
for key, pair in file_pairs.items():
    # Skip keys for which only one of the two reaches is available
    if 'C1' not in pair or 'C2' not in pair:
        continue

    # The merged files are written space-separated (no header), so they must be
    # read with a whitespace separator; with the default comma separator each
    # line would end up as a single string in 'tp'.
    df1 = pd.read_csv(pair['C1'], header=None, names=['tp', 't2m'], sep=r'\s+')
    df2 = pd.read_csv(pair['C2'], header=None, names=['tp', 't2m'], sep=r'\s+')

    # Number of data rows of the C1 series
    num_rows_df1 = len(df1)

    # Header rows are stored as strings so they are written without decimals:
    # row count, the literal 2, then the name of the first reach
    new_row_df1 = pd.DataFrame({'tp': [str(num_rows_df1)], 't2m': [None]})
    new_row_2 = pd.DataFrame({'tp': [str(2)], 't2m': [None]})
    new_row_3 = pd.DataFrame({'tp': ['C1'], 't2m': [None]})

    # Header rows first, then the C1 data
    df1 = pd.concat([new_row_df1, new_row_2, new_row_3, df1]).reset_index(drop=True)

    # The C2 data is preceded by a row holding its name
    new_row_4 = pd.DataFrame({'tp': ['C2'], 't2m': [None]})
    df2 = pd.concat([new_row_4, df2], ignore_index=True)

    # C1 section followed by C2 section
    result = pd.concat([df1, df2], ignore_index=True)

    # Output file is named after the pair key (the file name without the reach tag);
    # written without column headers and with space separator
    output_filename = f'{key}'
    result.to_csv(os.path.join(output_path, output_filename), index=False, header=False, sep=" ")