In [1]:
import requests
import itertools
import os
import yaml
import xarray as xr
import numpy as np

In [2]:
# Working directory of the kernel; the YAML inputs are read relative to it.
CURRENT_PATH = os.getcwd()

# Load variables.yaml. The context manager closes the file handle as soon as
# parsing is done instead of leaving it open for the lifetime of the kernel.
with open(os.path.join(CURRENT_PATH, 'variables.yaml')) as variables_file:
    all_variables = yaml.safe_load(variables_file)

In [3]:
def get_files_per_variable(chunk_i, variables, timeout=30):
    """Collect the OPeNDAP dataset URLs published for each given variable.

    For every variable the paginated endpoint ``{base_url}/{var}/page/{page}``
    is queried, starting at page 0, until a page comes back empty or a request
    fails. A record contributes its OPeNDAP URLs only when it also carries a
    PID (``md_identification.identifier.pid``).

    Failures (network error, non-200 status, non-JSON body) are reported with
    ``print`` and end the pagination for that variable; URLs gathered so far
    are kept and the remaining variables are still processed.

    Parameters
    ----------
    chunk_i : Any
        Label shown in the progress line; not used otherwise.
    variables : iterable of str
        Variable names to query.
    timeout : float, optional
        Seconds to wait for each HTTP request before giving up, so a stalled
        server cannot block the harvest forever. Defaults to 30.

    Returns
    -------
    dict
        ``{variable: {'files': [opendap_url, ...]}}``.
    """
    files_per_var = {}
    base_url = "https://prod-actris-md.nilu.no/metadata/content"
    variables = list(variables)  # allow any iterable while still reporting a total
    n_variables = len(variables)
    for var_i, var in enumerate(variables, start=1):
        # 1-based counter so the last variable reads n/n
        print(f'[{chunk_i}] {var} ({var_i}/{n_variables})')
        variable_files = []
        page = 0
        while True:
            # Set up URL with pagination
            url = f"{base_url}/{var}/page/{page}"
            try:
                response = requests.get(url, timeout=timeout)
            except requests.RequestException as error:
                print(f"Error fetching page {page}: {error}")
                break

            # Check if the response is valid and contains data
            if response.status_code != 200:
                print(f"Error fetching page {page}. Status code: {response.status_code}")
                break

            try:
                data = response.json()
            except ValueError as error:
                print(f"Error decoding page {page}: {error}")
                break

            # An empty page marks the end of the listing
            if not data:
                break

            for item in data:
                # `or {}` / `or []` also covers keys that are present but null
                identification = item.get("md_identification") or {}
                doi = (identification.get("identifier") or {}).get("pid")
                opendap_urls = [
                    distribution['dataset_url']
                    for distribution in item.get('md_distribution_information') or []
                    if distribution.get('protocol') == 'OPeNDAP'
                    and distribution.get('dataset_url')
                ]

                # Keep the URLs only when the record also has a DOI
                if doi and opendap_urls:
                    variable_files.extend(opendap_urls)

            # Go to the next page
            page += 1

        files_per_var[var] = {'files': variable_files}

    return files_per_var

In [4]:
# Try the harvester on a single variable; 100 is just the label shown in the progress line.
files_per_var = get_files_per_variable(
    chunk_i=100,
    variables=['ozone amount concentration'],
)

[100] ozone amount concentration (0/1)


In [5]:
# Show the harvested URLs; in the recorded run the 'files' list came back empty.
files_per_var

{'ozone amount concentration': {'files': []}}

In [6]:
#combined_data = {}
#chunk_size = 100
#chunks = [list(all_variables.keys())[i:i + chunk_size] for i in range(0, len(all_variables), chunk_size)]
#for chunk_i, chunk in enumerate(chunks):
#    files_per_var = get_files_per_variable(chunk_i, chunk)
#    combined_data.update(files_per_var)

In [7]:
#with open("ebas_doi_combined.yaml", "w") as file:
#    yaml.dump(combined_data, file, default_flow_style=False)

In [8]:
# Load the previously saved harvest result. The context manager closes the
# file handle once parsing is done instead of leaking it.
with open(os.path.join(CURRENT_PATH, 'ebas_doi_combined.yaml')) as combined_file:
    combined_data = yaml.safe_load(combined_file)

In [9]:
# Discard entries whose name is blank or whose content is empty.
combined_data = dict(
    (name, entry)
    for name, entry in combined_data.items()
    if name.strip() and entry
)

In [10]:
# Load the ACTRIS-name -> variable mapping. The context manager closes the
# file handle once parsing is done instead of leaking it.
with open(os.path.join(CURRENT_PATH, 'variable_mapping.yaml')) as mapping_file:
    variable_mapping = yaml.safe_load(mapping_file)

# Discard entries whose name is blank or whose content is empty.
variable_mapping = {k: v for k, v in variable_mapping.items() if k.strip() and v}

In [11]:
# Open the first file listed for one parameter and display it.
actris_parameter = 'nitrogen dioxide mass concentration'
# First URL in the list; raises IndexError if the parameter has no files.
file = combined_data[actris_parameter]['files'][0]
# NOTE(review): this dataset is never closed explicitly.
ds = xr.open_dataset(file)
ds

In [12]:
#xr.open_dataset('https://thredds.nilu.no/thredds/dodsC/ebas_doi/ZF/D6/9Q/ZFD6-9QQX.nc')

In [13]:
# Global attributes read from each file (keys) and the variable names they are
# stored under in the combined dataset (values). The list is modelled on the
# first dataset inspected.
# TODO: check all available metadata fields.
metadata_keys = {
    'ebas_station_code': 'station_reference',
    'ebas_station_name': 'station_name',
    'ebas_station_land_use': 'land_use',
    'ebas_station_wmo_region': 'WMO_region',
    'ebas_station_latitude': 'latitude',
    'ebas_station_longitude': 'longitude',
    'ebas_station_altitude': 'altitude',
}

# Variable name inside the source files -> variable name used in the output files.
parameters = {
    'nitrogen_dioxide': 'sconcno2',
}

In [14]:
# For each ACTRIS parameter: open its files, stack the selected variable along
# a new 'station' dimension, attach the station metadata and write one NetCDF
# file per year and month.
resolution = 'hourly'
# Number of files processed per parameter; set to None to process all of them.
max_files = 10


def _altitude_to_float(raw):
    """Convert an altitude attribute to float.

    Strings have the 'm' unit removed first. Anything else (an already numeric
    value, or the NaN placeholder stored when the attribute is missing) goes
    straight to float(); calling .replace on those would raise AttributeError.
    """
    if isinstance(raw, str):
        return float(raw.replace('m', '').strip())
    return float(raw)


#for actris_parameter in ['ozone mass concentration']:
for actris_parameter in ['nitrogen dioxide mass concentration']:
    var = variable_mapping[actris_parameter]['var']
    print('ACTRIS vocabulary:', actris_parameter, '- Variable:', var)
    files = combined_data[actris_parameter]['files']
    if not files:
        continue

    # combine datasets that have the same variable
    combined_ds_list = []
    metadata = {ebas_key: [] for ebas_key in metadata_keys}
    print('Total number of files:', len(files))
    for i, file in enumerate(files[:max_files]):
        print(i, '-', file)
        # The context manager closes the file once its data has been loaded,
        # so open handles do not pile up while looping over many files.
        with xr.open_dataset(file) as ds:
            # save metadata (NaN marks an attribute the file does not provide)
            for ebas_key in metadata_keys:
                metadata[ebas_key].append(ds.attrs.get(ebas_key, np.nan))

            # Tag the data with the positional file index as 'station', keep
            # only the requested variable and load it before the file closes.
            # NOTE(review): a disabled fallback in an earlier version looked the
            # variable up as f'{var}_{units}'; that case is not handled here and
            # raises KeyError.
            ds_var = ds.expand_dims(dim={'station': [i]})[var].load()

        # append modified dataset to list
        combined_ds_list.append(ds_var)

    # combine and create new dataset
    combined_ds = xr.concat(combined_ds_list,
                            dim='station',
                            combine_attrs='drop_conflicts').to_dataset()

    # rename variable to BSC standards
    out_name = parameters[var]
    combined_ds = combined_ds.rename({var: out_name})

    # add metadata as per-station variables
    for key, value in metadata.items():
        target = metadata_keys[key]
        if target in ['latitude', 'longitude']:
            value = [float(val) for val in value]
        elif target == 'altitude':
            value = [_altitude_to_float(val) for val in value]
        combined_ds[target] = xr.Variable(data=value, dims=('station',))

    # save data per year and month
    # NOTE(review): absolute, user-specific output location.
    path = f'/home/avilanov/data/providentia/obs/nonghost/actris/ebas_doi/{resolution}/{out_name}'
    os.makedirs(path, exist_ok=True)
    for year, ds_year in combined_ds.groupby('time.year'):
        for month, ds_month in ds_year.groupby('time.month'):
            filename = f"{path}/{out_name}_{year}{month:02d}.nc"
            # ds_month already is the subset for this year and month
            ds_month.to_netcdf(filename)
            print(f"Saved: {filename}")

ACTRIS vocabulary: nitrogen dioxide mass concentration - Variable: nitrogen_dioxide
Total number of files: 733
0 - https://thredds.nilu.no/thredds/dodsC/ebas_doi/KJ/5T/KC/KJ5T-KCAB.nc
1 - https://thredds.nilu.no/thredds/dodsC/ebas_doi/JK/E7/AA/JKE7-AABD.nc
2 - https://thredds.nilu.no/thredds/dodsC/ebas_doi/BW/GE/SH/BWGE-SHXN.nc
3 - https://thredds.nilu.no/thredds/dodsC/ebas_doi/JZ/S5/BN/JZS5-BNYJ.nc
4 - https://thredds.nilu.no/thredds/dodsC/ebas_doi/KM/SZ/DN/KMSZ-DN7B.nc
5 - https://thredds.nilu.no/thredds/dodsC/ebas_doi/R5/N5/TD/R5N5-TDVM.nc
6 - https://thredds.nilu.no/thredds/dodsC/ebas_doi/QX/TN/E8/QXTN-E8CB.nc
7 - https://thredds.nilu.no/thredds/dodsC/ebas_doi/2W/BJ/5E/2WBJ-5EAZ.nc
8 - https://thredds.nilu.no/thredds/dodsC/ebas_doi/S5/MW/MR/S5MW-MR53.nc
9 - https://thredds.nilu.no/thredds/dodsC/ebas_doi/2S/SQ/F2/2SSQ-F2FV.nc
Saved: /home/avilanov/data/providentia/obs/nonghost/actris/ebas_doi/hourly/sconcno2/sconcno2_197701.nc
Saved: /home/avilanov/data/providentia/obs/nonghost/actr

In [15]:
# Re-open one of the monthly output files (January 2008) and display it.
# NOTE(review): absolute, user-specific path; it has to match the directory
# the export cell writes to.
test_data = xr.open_dataset('/home/avilanov/data/providentia/obs/nonghost/actris/ebas_doi/hourly/sconcno2/sconcno2_200801.nc')
test_data

In [16]:
# Altitude values stored in the re-opened file (one entry per station).
test_data.altitude.values

array([7.200e+02, 4.040e+02, 7.400e+01, 1.205e+03, 5.800e+01, 1.200e+01,
       9.850e+02, 1.000e+01, 4.400e+01, 1.000e+00])