In [1]:
import requests
import itertools
import os
import yaml
import xarray as xr
import numpy as np
from variable_mapping import variable_mapping
from datetime import datetime

In [2]:
# EBAS attribute name -> name of the metadata variable written to the output dataset.
# Based on the attributes of the first dataset inspected
# (TODO: Check all available metadata fields)
metadata_keys = {
    'ebas_station_code': 'station_reference',
    'ebas_station_name': 'station_name',
    'ebas_station_land_use': 'land_use',
    'ebas_station_wmo_region': 'WMO_region',
    'ebas_station_latitude': 'latitude',
    'ebas_station_longitude': 'longitude',
    'ebas_station_altitude': 'altitude',
}

# ACTRIS vocabulary term -> variable name used in the output files and paths.
parameters = {
    'nitrogen dioxide mass concentration': 'sconcno2',
}

# Value of a file's `time_coverage_resolution` attribute -> resolution label.
coverages = {
    'P0000-00-00T01:00:00': 'hourly',
    'P0000-00-01T00:00:00': 'daily',
    'P0000-01-00T00:00:00': 'monthly',
}

# Get files per variable

In [3]:
# Directory the notebook is run from; the YAML inputs and caches are resolved against it.
CURRENT_PATH = os.getcwd()

# Load the variables table (its keys are the names queried from the metadata service).
# The context manager closes the file handle instead of leaking it.
with open(os.path.join(CURRENT_PATH, 'variables.yaml')) as variables_file:
    all_variables = yaml.safe_load(variables_file)

In [4]:
def get_files_per_variable(chunk_i, variables, timeout=60):
    """Collect the OPeNDAP URLs listed by the metadata service for each variable.

    For every variable, ``{base_url}/{var}/page/{page}`` is requested for
    page 0, 1, 2, ... until a page comes back empty, a non-200 status is
    returned, or the request itself fails. Whatever was collected up to that
    point is kept for the variable.

    Parameters
    ----------
    chunk_i
        Identifier of the chunk being processed; only used in progress messages.
    variables : iterable of str
        Variable names to query.
    timeout : float, optional
        Seconds to wait for each HTTP request before giving up, so a stalled
        connection cannot hang the whole crawl.

    Returns
    -------
    dict
        ``{var: {'files': [opendap_url, ...]}}`` with one entry per variable.
    """
    base_url = "https://prod-actris-md.nilu.no/metadata/content"
    variables = list(variables)
    n_variables = len(variables)
    files_per_var = {}

    # Count from 1 so the progress message ends at n/n rather than (n-1)/n.
    for var_i, var in enumerate(variables, start=1):
        print(f'[{chunk_i}] {var} ({var_i}/{n_variables})')
        variable_files = []
        page = 0
        while True:
            # Set up URL with pagination
            url = f"{base_url}/{var}/page/{page}"
            try:
                response = requests.get(url, timeout=timeout)
            except requests.RequestException as error:
                # Treat network failures like a bad status: report and stop paging this variable.
                print(f"Error fetching page {page}: {error}")
                break

            # Check if the response is valid and contains data
            if response.status_code != 200:
                print(f"Error fetching page {page}. Status code: {response.status_code}")
                break

            data = response.json()

            # An empty page marks the end of the listing
            if not data:
                break

            # Keep the OPeNDAP URLs of every entry that also carries a DOI
            for item in data:
                identification = item.get("md_identification") or {}
                doi = (identification.get("identifier") or {}).get("pid")
                opendap_urls = [
                    protocol_dict['dataset_url']
                    for protocol_dict in item.get('md_distribution_information', [])
                    # skip entries without a URL instead of raising KeyError
                    if protocol_dict.get('protocol') == 'OPeNDAP' and protocol_dict.get('dataset_url')
                ]
                if doi and opendap_urls:
                    variable_files.extend(opendap_urls)

            # Go to the next page
            page += 1

        files_per_var[var] = {'files': variable_files}

    return files_per_var

In [5]:
def create_ebas_doi_combined_file():
    """Query the file listing for every variable and dump it to ``ebas_doi_combined.yaml``.

    The keys of ``all_variables`` are processed in chunks of 100; the result of
    each chunk is merged into a single dictionary that is written as
    block-style YAML in the current working directory.
    """
    chunk_size = 100
    variable_names = list(all_variables.keys())

    combined_data = {}
    for chunk_i, offset in enumerate(range(0, len(variable_names), chunk_size)):
        chunk = variable_names[offset:offset + chunk_size]
        combined_data.update(get_files_per_variable(chunk_i, chunk))

    with open("ebas_doi_combined.yaml", "w") as file:
        yaml.dump(combined_data, file, default_flow_style=False)

In [6]:
#create_ebas_doi_combined_file()

In [7]:
# Load the cached variable -> files listing from ebas_doi_combined.yaml.
# The context manager closes the file handle instead of leaking it.
with open(os.path.join(CURRENT_PATH, 'ebas_doi_combined.yaml')) as combined_file:
    combined_data = yaml.safe_load(combined_file)

In [8]:
# Keep only entries with a non-blank name and a non-empty value.
combined_data = dict(
    (name, entry)
    for name, entry in combined_data.items()
    if name.strip() and entry
)

# Get ACTRIS variable mapping

In [9]:
def create_variable_mapping_file():
    """Write ``variable_mapping.yaml`` from the table in the ``variable_mapping`` module.

    Each entry of the source table becomes
    ``{preferred_term: {'var': key[2], 'units': key[0]}}``, with double quotes
    stripped from the preferred term. Entries sharing a preferred term
    overwrite each other (the last one wins).
    """
    # Import under a distinct local name: the notebook later rebinds the global
    # `variable_mapping` to the dict loaded from the YAML file, so relying on the
    # global would read the wrong object if this function runs after that cell.
    from variable_mapping import variable_mapping as source_mapping

    result = {
        # keys are indexed positionally: presumably (units, ..., variable name) tuples - TODO confirm
        value['preferred_term'].replace('"', ''): {'var': key[2], 'units': key[0]}
        for key, value in source_mapping.items()
    }

    with open('variable_mapping.yaml', 'w') as file:
        yaml.dump(result, file, default_flow_style=False)

In [10]:
#create_variable_mapping_file()

In [11]:
# Load the ACTRIS term -> {'var', 'units'} mapping from variable_mapping.yaml.
# NOTE: this rebinds `variable_mapping`, shadowing the object imported at the top
# of the notebook; the cells below rely on this YAML-based version.
with open(os.path.join(CURRENT_PATH, 'variable_mapping.yaml')) as mapping_file:
    variable_mapping = yaml.safe_load(mapping_file)

# Keep only entries with a non-blank name and a non-empty value.
variable_mapping = {k: v for k, v in variable_mapping.items() if k.strip() and v}

# Get information on files

In [12]:
# Open the second file listed for the NO2 parameter and display it to inspect its structure.
ds = xr.open_dataset(combined_data['nitrogen dioxide mass concentration']['files'][1])
ds

In [13]:
def create_files_info_file():
    """Record resolution, time coverage and variables of each file in ``files_information.yaml``.

    For every file listed in ``combined_data`` for the selected parameters, the
    dataset is opened and its ``time_coverage_resolution``,
    ``time_coverage_start`` and ``time_coverage_end`` attributes plus the names
    of its data variables are stored under
    ``files_info[actris_parameter][file]``. Files that cannot be opened, or
    whose resolution is not in ``coverages``, are reported and skipped.
    """
    #for var in all_variables:
    files_info = {}
    for actris_parameter in ['nitrogen dioxide mass concentration']:
        print('ACTRIS vocabulary:', actris_parameter)
        files = combined_data[actris_parameter]['files']
        parameter_info = files_info[actris_parameter] = {}
        print('Total number of files:', len(files))
        for i, file in enumerate(files):
            print(f'{i} - {file}')
            try:
                ds = xr.open_dataset(file)
            except Exception as error:
                # Skip the file: falling through would reuse the dataset of the
                # previous iteration and store its metadata under this URL.
                print('Error opening dataset:', error)
                continue

            # Close the dataset once its attributes have been read
            with ds:
                coverage = ds.time_coverage_resolution
                try:
                    resolution = coverages[coverage]
                except (KeyError, TypeError):
                    print('Error in resolution with coverage:', coverage)
                    continue
                parameter_info[file] = {
                    'resolution': resolution,
                    'start_date': ds.time_coverage_start,
                    'end_date': ds.time_coverage_end,
                    'variables': list(ds.data_vars.keys()),
                }

    with open('files_information.yaml', 'w') as file:
        yaml.dump(files_info, file, default_flow_style=False)

In [14]:
#create_files_info_file()

In [15]:
# Load the per-file information (resolution, dates, variables) from files_information.yaml.
# The context manager closes the file handle instead of leaking it.
with open(os.path.join(CURRENT_PATH, 'files_information.yaml')) as files_info_file:
    files_info = yaml.safe_load(files_info_file)

# Keep only entries with a non-blank name and a non-empty value.
files_info = {k: v for k, v in files_info.items() if k.strip() and v}

# Define GHOST mapping

In [16]:
def filter_files(actris_parameter, resolution, target_start_date, target_end_date):
    """Select the files of a parameter that match a resolution and overlap a period.

    Looks up ``files_info[actris_parameter]`` and returns, in stored order, the
    files whose ``resolution`` equals the requested one and whose
    ``start_date``/``end_date`` interval intersects
    ``[target_start_date, target_end_date]`` (bounds inclusive).
    """
    date_format = "%Y-%m-%dT%H:%M:%S UTC"

    def overlaps_target(attributes):
        # Parse both bounds first, then test that the two intervals intersect.
        covered_from = datetime.strptime(attributes["start_date"], date_format)
        covered_to = datetime.strptime(attributes["end_date"], date_format)
        return not (covered_from > target_end_date or covered_to < target_start_date)

    return [
        file
        for file, attributes in files_info[actris_parameter].items()
        if attributes["resolution"] == resolution and overlaps_target(attributes)
    ]

In [17]:
# Combine, per parameter, the files that match the target resolution and period
# into one dataset with a `station` dimension, attach the station metadata and
# write one netCDF file per year and month.
resolution = 'hourly'
target_start_date = datetime(2005, 1, 1)
target_end_date = datetime(2008, 12, 31)

# Units as written in the variable mapping -> suffix used in the file variable names.
units_suffixes = {'nmol/mol': 'nmol_per_mol', 'ug N/m3': 'ug_N_per_m3'}

for actris_parameter in ['nitrogen dioxide mass concentration']:
    var = variable_mapping[actris_parameter]['var']
    output_var = parameters[actris_parameter]
    files = filter_files(actris_parameter, resolution, target_start_date, target_end_date)
    print('ACTRIS vocabulary:', actris_parameter, '- Variable:', var)
    if not files:
        continue

    # Names under which the variable may appear in a file, in order of preference.
    # Unit-suffixed names are only tried when the units have a known suffix, so
    # an unmapped unit no longer leaves `units` undefined.
    possible_vars = [var, f'{var}_amean']
    units = units_suffixes.get(variable_mapping[actris_parameter]['units'])
    if units is not None:
        units_var = f'{var}_{units}'
        possible_vars += [units_var, f'{units_var}_amean']

    # combine datasets that have the same variable and resolution
    combined_ds_list = []
    metadata = {resolution: {ebas_key: [] for ebas_key in metadata_keys}}

    print('Total number of files:', len(files))
    for i, file in enumerate(files):
        print(i, '-', file)
        # open file (left open: the saving step below still reads from it)
        try:
            ds = xr.open_dataset(file)
        except Exception as error:
            print('Error opening file:', error)
            continue

        # Check the resolution against the target without overwriting `resolution`,
        # which is also used for the metadata key and the output path.
        file_resolution = coverages.get(ds.attrs.get('time_coverage_resolution'))
        if file_resolution != resolution:
            print('Skipping file with resolution', file_resolution, '- expected', resolution)
            continue

        # assign station code as dimension
        ds = ds.expand_dims(dim={'station': [i]})

        # select data for that variable only; skip the file when none of the
        # candidate names exists instead of reusing the previous file's data
        ds_var = next((ds[name] for name in possible_vars if name in ds), None)
        if ds_var is None:
            print('Variable not found, skipping file. Tried:', possible_vars)
            continue

        # save metadata (NaN marks an attribute the file does not provide)
        for ebas_key in metadata_keys:
            metadata[resolution][ebas_key].append(ds_var.attrs.get(ebas_key, np.nan))

        # remove all attributes except units
        ds_var.attrs = {key: value for key, value in ds_var.attrs.items() if key == 'units'}

        # Rename to BSC standards here: the matched name differs between files
        # (e.g. `<var>_amean`), so a single rename after concatenation would fail.
        combined_ds_list.append(ds_var.rename(output_var))

    if not combined_ds_list:
        print('No usable files, nothing to save')
        continue

    # combine and create new dataset
    combined_ds = xr.concat(combined_ds_list,
                            dim='station',
                            combine_attrs='drop_conflicts').to_dataset()

    # add metadata
    for key, value in metadata[resolution].items():
        output_key = metadata_keys[key]
        if output_key in ['latitude', 'longitude']:
            value = [float(val) for val in value]
        elif output_key == 'altitude':
            # Strings carry an 'm' unit; the NaN placeholder is a float with no .replace
            value = [float(val.replace('m', '').strip()) if isinstance(val, str) else float(val)
                     for val in value]
        combined_ds[output_key] = xr.Variable(data=value, dims=('station',))

    # save data per year and month
    path = f'/home/avilanov/data/providentia/obs/nonghost/actris/actris/{resolution}/{output_var}'
    os.makedirs(path, exist_ok=True)
    for year, ds_year in combined_ds.groupby('time.year'):
        for month, ds_month in ds_year.groupby('time.month'):
            # Only months whose first day lies inside the target period are written
            if not target_start_date <= datetime(year, month, 1) <= target_end_date:
                continue
            filename = f"{path}/{output_var}_{year}{month:02d}.nc"
            # ds_month already holds this year/month, no need to re-select it
            ds_month.to_netcdf(filename)
            print(f"Saved: {filename}")

ACTRIS vocabulary: nitrogen dioxide mass concentration - Variable: nitrogen_dioxide
Total number of files: 77
0 - https://thredds.nilu.no/thredds/dodsC/ebas_doi/2F/9X/PS/2F9X-PSBD.nc
1 - https://thredds.nilu.no/thredds/dodsC/ebas_doi/2W/BJ/5E/2WBJ-5EAZ.nc
2 - https://thredds.nilu.no/thredds/dodsC/ebas_doi/49/JQ/NK/49JQ-NKBV.nc
3 - https://thredds.nilu.no/thredds/dodsC/ebas_doi/4B/UN/7X/4BUN-7XAR.nc
4 - https://thredds.nilu.no/thredds/dodsC/ebas_doi/4G/72/GH/4G72-GHUB.nc
5 - https://thredds.nilu.no/thredds/dodsC/ebas_doi/5N/8A/TR/5N8A-TR7Z.nc
6 - https://thredds.nilu.no/thredds/dodsC/ebas_doi/5V/BE/DQ/5VBE-DQP5.nc
7 - https://thredds.nilu.no/thredds/dodsC/ebas_doi/6G/6M/HM/6G6M-HMRB.nc
8 - https://thredds.nilu.no/thredds/dodsC/ebas_doi/6H/5M/2P/6H5M-2PDR.nc
9 - https://thredds.nilu.no/thredds/dodsC/ebas_doi/6M/47/CD/6M47-CD82.nc
10 - https://thredds.nilu.no/thredds/dodsC/ebas_doi/6R/SS/9S/6RSS-9S4S.nc
11 - https://thredds.nilu.no/thredds/dodsC/ebas_doi/7B/K3/6C/7BK3-6C7Y.nc
12 - https:/

oc_open: server error retrieving url: code=2 message="The variable `NOx_nmol_per_mol_det' was not found in the dataset."oc_open: server error retrieving url: code=2 message="The variable `NOx_nmol_per_mol_det' was not found in the dataset."oc_open: server error retrieving url: code=2 message="The variable `NOx_nmol_per_mol_det' was not found in the dataset."oc_open: server error retrieving url: code=2 message="The variable `NOx_nmol_per_mol_det' was not found in the dataset."oc_open: server error retrieving url: code=2 message="The variable `NOx_nmol_per_mol_det' was not found in the dataset."oc_open: server error retrieving url: code=2 message="The variable `NOx_nmol_per_mol_det' was not found in the dataset."oc_open: server error retrieving url: code=2 message="The variable `NOx_nmol_per_mol_det' was not found in the dataset."

Error opening file
44 - https://thredds.nilu.no/thredds/dodsC/ebas_doi/MQ/RH/2R/MQRH-2RZW.nc
45 - https://thredds.nilu.no/thredds/dodsC/ebas_doi/MR/FM/WV/MRFM-WVDN.nc
46 - https://thredds.nilu.no/thredds/dodsC/ebas_doi/MY/DM/N7/MYDM-N7K6.nc
47 - https://thredds.nilu.no/thredds/dodsC/ebas_doi/N2/XT/HT/N2XT-HTD2.nc
48 - https://thredds.nilu.no/thredds/dodsC/ebas_doi/NQ/SD/EP/NQSD-EPJB.nc
49 - https://thredds.nilu.no/thredds/dodsC/ebas_doi/QW/83/7F/QW83-7F88.nc
50 - https://thredds.nilu.no/thredds/dodsC/ebas_doi/RA/AD/46/RAAD-466E.nc
51 - https://thredds.nilu.no/thredds/dodsC/ebas_doi/S7/5Z/BM/S75Z-BMMV.nc
52 - https://thredds.nilu.no/thredds/dodsC/ebas_doi/S9/8E/QF/S98E-QFYP.nc
53 - https://thredds.nilu.no/thredds/dodsC/ebas_doi/TE/5K/EK/TE5K-EKJ9.nc
54 - https://thredds.nilu.no/thredds/dodsC/ebas_doi/TK/3V/NA/TK3V-NAES.nc
55 - https://thredds.nilu.no/thredds/dodsC/ebas_doi/UM/8X/3F/UM8X-3FVM.nc
56 - https://thredds.nilu.no/thredds/dodsC/ebas_doi/UT/MV/PG/UTMV-PGGQ.nc
57 - https://thredd

In [21]:
# Open the January 2008 hourly sconcno2 file from the ACTRIS output directory and display it.
test_data = xr.open_dataset('/home/avilanov/data/providentia/obs/nonghost/actris/actris/hourly/sconcno2/sconcno2_200801.nc')
test_data

In [19]:
# Open an hourly sconcno2 file from the nonghost/eea/eionet directory and display it
# (presumably as a reference for the expected non-GHOST layout - TODO confirm).
nonghost_data = xr.open_dataset('/home/avilanov/data/providentia/obs/nonghost/eea/eionet/hourly/sconcno2/sconcno2_202406.nc')
nonghost_data

In [20]:
# Open an hourly sconcno2 file from the ghost/EEA_AQ_eReporting/1.5 directory and display it
# (presumably as a reference for the GHOST layout - TODO confirm).
ghost_data = xr.open_dataset('/home/avilanov/data/providentia/obs/ghost/EEA_AQ_eReporting/1.5/hourly/sconcno2/sconcno2_201805.nc')
ghost_data