In [7]:
import requests
from xml.etree import ElementTree
from datetime import datetime, date
from pathlib import Path
import xarray as xr
import numpy as np
import os
from django.core.cache import cache
from dateutil.relativedelta import relativedelta



In [8]:

# Template for the THREDDS catalog directory of one scenario/variable.
# Placeholders: {year}, {month} (zero-padded to two digits), {scenario}, {variable}.
# get_download_url() appends "catalog.xml" (and later "<version>/catalog.xml") to it.
# NOTE(review): this template contains "GCFS21" while the download examples below contain
# "GCFS22", and the catalog request made with it in this notebook returned 404 —
# confirm which model-version segment is currently valid on the server.
BASE_CATALOG_URL = "https://esgf-data.dwd.de/thredds/catalog/esgf3/data/climatepredictionsde/seasonal/output/public/DE-0075x005/DWD/GCFS21/svh2023{month:02}01/sfc{year}{month:02}01/{scenario}/DWD-EPISODES2022/v1-r1/day/{variable}/"

# fileServer prefix; not referenced by any code visible in this notebook.
BASE_DOWNLOAD = "https://esgf-data.dwd.de/thredds/fileServer/esgf3/data/climatepredictionsde/seasonal/output/public/DE-0075x005/DWD/GCFS22/"
# Two sample download URLs kept for reference; neither name is referenced by the visible code.
# NOTE(review): the first uses version folder "v2025506" where the second uses "v20250506",
# and the second has "r1i1p1" in the path but "r12i1p1" in the file name — presumably typos; confirm.
BBBBBBBBBBBBB = 'https://esgf-data.dwd.de/thredds/fileServer/esgf3/data/climatepredictionsde/seasonal/output/public/DE-0075x005/DWD/GCFS22/svh20230501/sfc20250501/r1i1p1/DWD-EPISODES2022/v1-r1/day/hurs/v2025506/hurs_day_GCFS22--DWD-EPISODES2022--DE-0075x005_sfc20250501_r1i1p1_20250501-20251130.nc'
CCCCCCCCCCCCC = 'https://esgf-data.dwd.de/thredds/fileServer/esgf3/data/climatepredictionsde/seasonal/output/public/DE-0075x005/DWD/GCFS22/svh20230501/sfc20250501/r1i1p1/DWD-EPISODES2022/v1-r1/day/pr/v20250506/pr_day_GCFS22--DWD-EPISODES2022--DE-0075x005_sfc20250501_r12i1p1_20250501-20251130.nc'
# Scenario folder names iterated by automated_thredds_download().
SCENARIOS = ['r1i1p1', 'r2i1p1', 'r3i1p1']
# Single-scenario alternative, kept disabled:
# SCENARIOS = ['r1i1p1']
# Variable folder names iterated by automated_thredds_download() for every scenario.
VARIABLES = ['hurs', 'pr', 'psl', 'rsds', 'sfcWind', 'tas', 'tasmax', 'tasmin']
# Prefix-to-URI map passed to ElementTree.findall() so "thredds:" paths resolve.
THREDDS_NAMESPACE = {"thredds": "http://www.unidata.ucar.edu/namespaces/thredds/InvCatalog/v1.0"}


In [10]:
# Current local date/time; its year and month select the forecast catalog queried below.
today = datetime.today()
year, month = today.year, today.month

# Same day-of-month six months ahead (clamped by relativedelta if that month is shorter).
future_date = today + relativedelta(months=6)

# Last day of future_date's month: jump to the 1st, add one month, step back one day.
# relativedelta applies the month shift before the day shift, so one object suffices.
last_day_of_month = future_date.replace(day=1) + relativedelta(months=1, days=-1)

In [12]:
# Single variable/scenario pair used by the exploratory catalog request in the next cells.
variable, scenario = 'tas', 'r1i1p1'

In [14]:
# Fill the catalog template for the selected month/scenario/variable and point at its catalog.xml.
catalog_url = BASE_CATALOG_URL.format(
    year=year,
    month=month,
    scenario=scenario,
    variable=variable,
) + "catalog.xml"

# Plain GET; the status code is not checked here, so inspect `catalog` before parsing it.
catalog = requests.get(catalog_url)


In [15]:
catalog

<Response [404]>

In [None]:
# Parse the body of the HTTP response held in `catalog` as XML.
catalog_tree = ElementTree.fromstring(catalog.content)

# From here on `catalog` no longer holds the response: it is rebound to the list of
# <catalogRef> elements found anywhere in the parsed document.
catalog = catalog_tree.findall(
    ".//thredds:catalogRef",
    namespaces=THREDDS_NAMESPACE,
)

# Empty container for version names; nothing in this cell fills it.
latest_versions = []

In [None]:
def get_download_url(scenario, variable):
    """Resolve the HTTPS download URL of the newest NetCDF file for one scenario/variable.

    The catalog directory is built from BASE_CATALOG_URL using the current local
    year and month. Its ``catalog.xml`` lists version folders; the greatest name
    (string comparison) is taken as the latest version, and that version's
    ``catalog.xml`` is read to find a dataset ``urlPath``.

    Args:
        scenario: Scenario folder name, e.g. ``'r1i1p1'``.
        variable: Variable folder name, e.g. ``'tas'``.

    Returns:
        dict: ``{'success': True, 'url': <download url>}`` on success, otherwise
        ``{'success': False, 'error': <message>}``. No exception escapes: HTTP
        errors, XML parse errors and empty catalogs are all reported through
        the failure dict.
    """
    try:
        today = datetime.today()
        variable_catalog_base = BASE_CATALOG_URL.format(
            year=today.year, month=today.month, scenario=scenario, variable=variable
        )

        # Step 1: list the version folders of this variable.
        catalog_url = f"{variable_catalog_base}catalog.xml"
        catalog_response = requests.get(catalog_url, timeout=60)
        # Fail with the HTTP status instead of trying to parse an error page as a catalog.
        catalog_response.raise_for_status()
        catalog_tree = ElementTree.fromstring(catalog_response.content)
        version_names = [
            catalog_ref.attrib['name']
            for catalog_ref in catalog_tree.findall(".//thredds:catalogRef", THREDDS_NAMESPACE)
            if catalog_ref.attrib.get('name')
        ]
        if not version_names:
            raise ValueError(f"No version folders listed in {catalog_url}")
        # String comparison: picks the newest folder as long as names share one format
        # (e.g. vYYYYMMDD).
        latest_version = max(version_names)

        # Step 2: read the catalog of the latest version to get the dataset's urlPath.
        latest_version_url = f"{variable_catalog_base}{latest_version}/catalog.xml"
        dataset_response = requests.get(latest_version_url, timeout=60)
        dataset_response.raise_for_status()
        dataset_tree = ElementTree.fromstring(dataset_response.content)
        url_paths = [
            dataset.attrib['urlPath']
            for dataset in dataset_tree.findall(".//thredds:dataset", THREDDS_NAMESPACE)
            if dataset.attrib.get('urlPath')
        ]
        if not url_paths:
            # Without this check a URL pointing at the bare fileServer root would be
            # reported as a success.
            raise ValueError(f"No dataset with a urlPath found in {latest_version_url}")
        # When several datasets carry a urlPath, the last one listed is used.
        dataset_path = url_paths[-1]

        https_download_url = f"https://esgf-data.dwd.de/thredds/fileServer/{dataset_path}"
        print('https_download_url: ', https_download_url)
        return {'success': True, 'url': https_download_url}

    except Exception as e:
        print(f"Error fetching download URL: {e}")
        return {'success': False, 'error': str(e)}
    


def get_local_path():
    """Get the base local path for storing NetCDF forecast files.

    Returns:
        pathlib.Path: ``<anchor>/../climate_netcdf_forecast`` where ``<anchor>`` is the
        directory containing this file. When ``__file__`` is not defined (as in a
        Jupyter kernel), the current working directory is used as the anchor instead.
    """
    try:
        anchor_dir = Path(__file__).resolve().parent
    except NameError:
        # Notebook kernels do not define __file__.
        # NOTE(review): assumes the notebook runs from the directory the module would
        # live in — confirm this matches the intended data location.
        anchor_dir = Path.cwd().resolve()
    return anchor_dir.parent / 'climate_netcdf_forecast'

def fetch_available_variables(catalog_url):
    """Return the ``name`` attribute of every ``catalogRef`` in a THREDDS catalog.

    The catalog XML is downloaded from ``catalog_url``; an HTTP error status raises
    via ``raise_for_status()``. Elements are looked up with the THREDDS namespace
    map, and an element without a ``name`` attribute contributes ``None``.
    """
    reply = requests.get(catalog_url)
    reply.raise_for_status()

    root = ElementTree.fromstring(reply.content)

    # Walk every namespaced <catalogRef> in document order and collect its name.
    names = []
    for catalog_ref in root.findall(".//thredds:catalogRef", THREDDS_NAMESPACE):
        names.append(catalog_ref.attrib.get("name"))
    return names
# variables = ['hurs', 'pr', 'psl', 'rsds', 'sfcWind', 'tas', 'tasmax', 'tasmin']

def get_last_valid_forecast_date():
    """Return the last time step of a downloaded forecast file of scenario ``r1i1p1``.

    The first ``.nc`` file (by sorted name) in ``<local path>/r1i1p1`` is opened and
    the final value of its ``time`` coordinate is read.

    Returns:
        The last time value truncated to day precision and cast via ``.astype(date)``.

    Raises:
        FileNotFoundError: If the scenario folder does not exist.
        IndexError: If the folder contains no ``.nc`` file.
    """
    nc_folder_path = os.path.join(get_local_path(), 'r1i1p1')
    # Sorted so the chosen file does not depend on the directory listing order.
    netcdf_paths = sorted(
        os.path.join(nc_folder_path, nc)
        for nc in os.listdir(nc_folder_path)
        if nc.endswith('.nc')
    )
    nc_path = netcdf_paths[0]
    # Context manager closes the file handle once the time values are loaded.
    with xr.open_dataset(nc_path) as ds:
        last_valid_date = ds.time.values[-1]
    print('last_valid_date: ', last_valid_date)
    return last_valid_date.astype('datetime64[D]').astype(date)

def get_last_valid_forecast_date_cached(update=False):
    """Return the last valid forecast date, served from the cache when possible.

    The value is read from the cache key ``'last_valid_forecast_date'``. It is
    recomputed with ``get_last_valid_forecast_date()`` and stored again for
    72 hours when the cache holds nothing or when ``update`` equals ``True``.
    """
    cached_date = cache.get('last_valid_forecast_date')

    # Fast path: a cached value exists and no refresh was requested.
    if cached_date is not None and not (update == True):
        return cached_date

    fresh_date = get_last_valid_forecast_date()
    cache.set('last_valid_forecast_date', fresh_date, timeout=72 * 60 * 60)  # 72 hours in seconds
    return fresh_date

def delete_old_files(folder_path, new_files):
    """Remove every ``.nc`` file in ``folder_path`` whose name is not in ``new_files``.

    ``new_files`` holds bare file names (not paths). Entries with another extension
    are left alone. This is best-effort: any exception (for example a missing
    folder) is printed and swallowed, which also stops the remaining deletions.
    """
    try:
        print('delete_old_files: ', folder_path, new_files)
        for entry in os.listdir(folder_path):
            print('delete_old_files: ', entry)
            is_stale_netcdf = entry.endswith('.nc') and entry not in new_files
            if not is_stale_netcdf:
                continue
            stale_path = os.path.join(folder_path, entry)
            print(f"Deleting old file: {stale_path}")
            os.remove(stale_path)
    except Exception as exc:
        print(f"Error deleting old files: {exc}")


def download_and_save_nc_file(nc_url, save_path):
    """Download a NetCDF file and store it under ``save_path`` using the URL's file name.

    The body is streamed to disk in chunks instead of being held in memory, and
    it is written to ``<filename>.part`` first and renamed afterwards, so an
    interrupted download never leaves a truncated ``.nc`` file at the final path.
    A leftover ``.part`` file from a failed run is overwritten by the next attempt.

    Args:
        nc_url: Full download URL; the text after the last ``/`` becomes the file name.
        save_path: Directory to store the file in (created if missing).

    Returns:
        str: The file name (not the full path) of the saved file.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeouts.
        OSError: If the file cannot be written.
    """
    filename = nc_url.split("/")[-1]
    target_path = Path(save_path) / filename
    partial_path = target_path.with_name(f"{filename}.part")

    # stream=True defers fetching the body; the context manager releases the connection.
    with requests.get(nc_url, stream=True, timeout=60) as response:
        response.raise_for_status()
        target_path.parent.mkdir(parents=True, exist_ok=True)
        with open(partial_path, "wb") as file:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                file.write(chunk)

    # Publish the file under its final name only once it is complete.
    partial_path.replace(target_path)

    print(f"Downloaded: {filename} to {target_path}")
    return filename


def automated_thredds_download():
    """Main function to automate downloads of variables across scenarios.

    Steps:
        1. For every scenario in SCENARIOS and variable in VARIABLES, resolve the
           download URL with get_download_url() and save the file into
           ``<local path>/<scenario>/``. If at least one file was downloaded for a
           scenario, all other ``.nc`` files in that folder are deleted.
        2. Merge each scenario's files into ``<local path>/forecast_<scenario>_<dates>.nc``
           unless a combined file of that name already exists.
        3. If anything new was combined, delete previously existing combined files
           that are no longer the current file of any scenario, then refresh the
           cached last valid forecast date.

    Exceptions raised while downloading propagate to the caller. Errors in steps
    2 and 3 are printed and swallowed.
    """
    local_path = get_local_path()

    # Step 1: Iterate through scenarios and variables
    for scenario in SCENARIOS:
        new_files = []
        folder_path = f"{local_path}/{scenario}/"
        for variable in VARIABLES:
            print(f"Processing variable '{variable}' for scenario '{scenario}'...")

            nc_file_url_message = get_download_url(scenario, variable)
            if nc_file_url_message['success']:
                downloaded_file = download_and_save_nc_file(nc_file_url_message['url'], folder_path)
                new_files.append(downloaded_file)
                print('new_files: ', new_files)
            else:
                print(f"Failed to download {variable} for scenario {scenario}: {nc_file_url_message['error']}")

        if new_files:
            print('new_files: ', new_files)
            print(f"Deleting old files for scenario '{scenario}'...")
            delete_old_files(folder_path, new_files)

    # Combined files that existed before this run. The base folder may not exist yet
    # when nothing has ever been downloaded.
    if os.path.isdir(local_path):
        old_combined_ncs = [f'{local_path}/{nc}' for nc in os.listdir(local_path) if nc.endswith('.nc')]
    else:
        old_combined_ncs = []
    new_combined_ncs = []
    # Combined file each scenario should have after this run, whether new or pre-existing.
    current_combined_ncs = set()

    # Step 2: Combine NetCDF files into a single file for each scenario
    try:
        for scenario in SCENARIOS:
            folder_path = f"{local_path}/{scenario}/"
            if not os.path.isdir(folder_path):
                print(f"No folder for scenario '{scenario}', skipping combine.")
                continue
            # Sorted so the file that provides the date range is chosen deterministically.
            nc_names = sorted(nc for nc in os.listdir(folder_path) if nc.endswith('.nc'))
            if not nc_names:
                # An empty folder must not abort combining for the remaining scenarios.
                print(f"No NetCDF files for scenario '{scenario}', skipping combine.")
                continue
            netcdf_paths = [os.path.join(folder_path, nc) for nc in nc_names]

            # File names end in "_<start>-<end>.nc"; reuse that range in the combined name.
            dates = nc_names[0].split('_')[-1].split('.')[0]
            filename = f'forecast_{scenario}_{dates}.nc'
            file_path = f"{local_path}/{filename}"
            current_combined_ncs.add(file_path)
            if file_path not in old_combined_ncs:
                # The context manager closes the source files even if writing fails.
                with xr.open_mfdataset(netcdf_paths, combine='by_coords', compat='override') as ds:
                    ds.to_netcdf(file_path)
                new_combined_ncs.append(file_path)

        print('old_ncs: ', old_combined_ncs)
        print('new_ncs: ', new_combined_ncs)

        # Step 3: remove outdated combined files. The previous check compared the return
        # values of list.sort(), which are always None, so nothing was ever removed.
        # Only files that are not the current combined file of some scenario are deleted.
        if new_combined_ncs:
            for old_nc in old_combined_ncs:
                if old_nc not in current_combined_ncs:
                    os.remove(old_nc)

        get_last_valid_forecast_date_cached(update=True)
    except Exception as e:
        print(f"Combining NetCDF files failed: {e}")


In [5]:
# Scratch check: collect the NetCDF files of one scenario folder relative to the
# notebook's working directory.
folder_path = "./r1i1p1"
netcdf_paths = [
    f'{folder_path}/{nc}'
    for nc in os.listdir(folder_path)
    if nc.endswith('.nc')
]

In [6]:
# Removes the last path from the list in place and displays it.
# Not idempotent: every re-run of this cell drops one more file.
netcdf_paths.pop(-1)

'./r1i1p1/psl_day_GCFS22--DWD-EPISODES2022--DE-0075x005_sfc20250501_r1i1p1_20250501-20251130.nc'

In [17]:
# Open the remaining per-variable files as one dataset, aligned on their coordinates.
ds = xr.open_mfdataset(
    netcdf_paths,
    combine='by_coords',
    compat='override',
)

In [18]:
netcdf_paths

['./r1i1p1/pr_day_GCFS21--DWD-EPISODES2022--DE-0075x005_sfc20250201_r1i1p1_20250201-20250731.nc',
 './r1i1p1/tasmax_day_GCFS21--DWD-EPISODES2022--DE-0075x005_sfc20250201_r1i1p1_20250201-20250731.nc',
 './r1i1p1/sfcWind_day_GCFS21--DWD-EPISODES2022--DE-0075x005_sfc20250201_r1i1p1_20250201-20250731.nc',
 './r1i1p1/tasmin_day_GCFS21--DWD-EPISODES2022--DE-0075x005_sfc20250201_r1i1p1_20250201-20250731.nc',
 './r1i1p1/rsds_day_GCFS21--DWD-EPISODES2022--DE-0075x005_sfc20250201_r1i1p1_20250201-20250731.nc']