In [32]:
import requests
import itertools
import os
import yaml
import xarray as xr
import numpy as np

In [2]:
CURRENT_PATH = os.getcwd()
all_variables = yaml.safe_load(open(os.path.join(CURRENT_PATH, 'variables.yaml')))

In [3]:
def get_files_per_variable(chunk_i, variables):
    files_per_var = {}
    base_url = "https://prod-actris-md.nilu.no/metadata/content"
    n_variables = len(variables)
    for var_i, var in enumerate(variables):
        print(f'[{chunk_i}] {var} ({var_i}/{n_variables})')
        if var not in files_per_var:
            files_per_var[var] = {}
        variable_files = []
        page = 0
        while True:
            # Set up URL with pagination
            url = f"{base_url}/{var}/page/{page}"
            response = requests.get(url)
            
            # Check if the response is valid and contains data
            if response.status_code != 200:
                print(f"Error fetching page {page}. Status code: {response.status_code}")
                break
            
            data = response.json()
            
            # Check if there's content in the data
            if not data:
                break
            
            # Loop through each entry in the data and print DOI and OPeNDAP URL
            for item in data:
                doi = item.get("md_identification", {}).get("identifier", {}).get("pid")
                opendap_urls = [protocol_dict['dataset_url'] for protocol_dict in item.get('md_distribution_information', []) if protocol_dict.get('protocol') == 'OPeNDAP']
                
                # Print DOI and OPeNDAP URL if both are present
                if doi and opendap_urls:
                    variable_files.append(opendap_urls)
                    
            # Go to the next page
            page += 1
        
        files_per_var[var]['files'] = list(itertools.chain.from_iterable(variable_files))
    
    return files_per_var

In [6]:
#combined_data = {}
#chunk_size = 100
#chunks = [list(all_variables.keys())[i:i + chunk_size] for i in range(0, len(all_variables), chunk_size)]
#for chunk_i, chunk in enumerate(chunks):
#    files_per_var = get_files_per_variable(chunk_i, chunk)
#    combined_data.update(files_per_var)

In [7]:
#with open("ebas_nrt_combined.yaml", "w") as file:
#    yaml.dump(combined_data, file, default_flow_style=False)

In [12]:
combined_data = yaml.safe_load(open(os.path.join(CURRENT_PATH, 'ebas_nrt_combined.yaml')))

In [13]:
cleaned_dict = {k: v for k, v in combined_data.items() if k.strip() and v}

In [35]:
file = cleaned_dict['aerosol particle dibenzo(ac,ah)anthracenes mass concentration']['files'][0]
ds = xr.open_dataset(file)
ds

In [33]:
# Define metadata keys, inspired in first dataset (TODO: Check all available metadata fields)
metadata_keys = ['ebas_station_code','ebas_station_name',
                 'ebas_station_wdca_id', 'ebas_station_gaw_id',
                 'ebas_station_gaw_name', 'ebas_station_other_ids',
                 'ebas_station_land_use', 'ebas_station_setting',
                 'ebas_station_gaw_type', 'ebas_station_wmo_region',
                 'ebas_station_latitude', 'ebas_station_longitude',
                 'ebas_station_altitude']

# open all files for certain variable, extract and join
resolution = 'hourly'
for var in list(all_variables.keys())[0:20]:
    print(var)
    if len(cleaned_dict[var]['files']) != 0:
        # combine datasets that have the same variable
        combined_ds_list = []
        metadata = {}
        for i, file in enumerate(cleaned_dict[var]['files']):
            # open file
            ds = xr.open_dataset(file)
    
            # save metadata
            for ebas_key in metadata_keys:
                if ebas_key not in metadata.keys():
                    metadata[ebas_key] = []
                if ebas_key not in ds.attrs.keys():
                    metadata[ebas_key].append(np.nan)
                else:
                    metadata[ebas_key].append(ds.attrs[ebas_key])
            
            # assign station code as dimension
            ds = ds.expand_dims(dim={'station': [i]})
    
            # select data for that variable only
            ds_var = ds[var]
    
            # append modified dataset to list
            combined_ds_list.append(ds_var)
    
        # combine and create new dataset
        combined_ds = xr.concat(combined_ds_list, 
                                dim='station', 
                                combine_attrs='drop_conflicts').to_dataset()
    
        # add metadata
        for key in metadata.keys():
            combined_ds[key] = xr.Variable(data=metadata[key], dims=('station'))
        
        # save data per year and month
        path = f'/home/avilanov/data/providentia/obs/nonghost/actris/nrt/{resolution}/{var}'
        if not os.path.isdir(path):
            os.makedirs(path, exist_ok=True)
        for year, ds_year in combined_ds.groupby('time.year'):
            for month, ds_month in ds_year.groupby('time.month'):
                filename = f"{path}/{var}_{year}{month:02d}.nc"
                combined_ds_yearmonth = combined_ds.sel(time=f"{year}-{month:02d}")
                combined_ds_yearmonth.to_netcdf(filename)
                print(f"Saved: {filename}")

aerosol particle PCB 130 and PCB 176 mass concentration
aerosol particle PCB 144 and PCB 135 mass concentration
aerosol particle PCB 16 and PCB 32 mass concentration
aerosol particle PCB 17 and PCB 18 mass concentration
aerosol particle PCB 172 and PCB 197 mass concentration
aerosol particle PCB 178 and PCB 129 mass concentration
aerosol particle PCB 196 and PCB 203 mass concentration
aerosol particle PCB 201 and PCB 157 mass concentration
aerosol particle PCB 24 and PCB 27 mass concentration
aerosol particle PCB 28 and PCB 31 mass concentration
aerosol particle PCB 4 and PCB 10 mass concentration
aerosol particle PCB 41 and PCB 71 mass concentration
aerosol particle PCB 56 and PCB 60 mass concentration
aerosol particle PCB 70 and PCB 60 mass concentration
aerosol particle PCB 70 and PCB 76 mass concentration
aerosol particle PCB 8 and PCB 5 mass concentration
aerosol particle PCB 84 and PCB 89 mass concentration
aerosol particle dibenzo(ac,ah)anthracenes mass concentration


KeyError: "No variable named 'aerosol particle dibenzo(ac,ah)anthracenes mass concentration'. Variables on the dataset include ['station', 'time', 'time_bnds', 'metadata_time', 'metadata_time_bnds', ..., 'phenanthrene_qc', 'phenanthrene_ebasmetadata', 'pyrene', 'pyrene_qc', 'pyrene_ebasmetadata']"