In [1]:
import netCDF4 as nc
import xarray as xr
import numpy as np
import os
import requests
from bs4 import BeautifulSoup

# Get filenames

In [2]:
# THREDDS catalog page that lists the datasets under actris_nrt
url = 'https://thredds.nilu.no/thredds/catalog/actris_nrt/catalog.html'
# Without a timeout requests waits forever on a stalled server, which would hang the notebook
response = requests.get(url, timeout=60)
# Fail fast on an HTTP error instead of parsing an error page later on
response.raise_for_status()

In [3]:
soup = BeautifulSoup(response.content, 'html.parser')

# Dataset links look like '<dataset_prefix>/<file name>'; the bare prefix is the catalog root
dataset_prefix = 'catalog.html?dataset=ACTRIS_NRT'

files = []
for a_tag in soup.find_all('a'):
    link = a_tag.get('href')
    # Anchors without an href yield None; skip them so the `in` test cannot raise TypeError
    if link is None:
        continue
    if dataset_prefix in link and link != dataset_prefix:
        # Strip the catalog prefix so only the file name remains
        files.append(link.replace(f'{dataset_prefix}/', ''))
files

['RO0010R.20240716050000.20241016060828.filter_absorption_photometer...3mo.1h.RO03L_Magee_AE33_INO_NRT.RO03L_eBC_v1.lev3b.nc',
 'RO0010R.20240716050000.20241016060417.filter_absorption_photometer...3mo.1h.RO03L_Magee_AE33_INO_NRT.RO03L_AE33.lev1.5.nc',
 'RO0010R.20240716050000.20241016053408.aerosol_mass_spectrometer...3mo.1h.RO03L_Aerodyne_Q-ACSM_140-172_NRT.RO03L_Aerodyne_Q-ACSM_.lev1.5.nc',
 'NO0002R.20241002110000.20241004105104.cpc...47h.1h.NO01L_TSI_3750_BIR_NRT.NO01L_CPC_acquisition_lev0_0_0_1.lev1.5.nc',
 'NO0002R.20241002090000.20241009001220.filter_absorption_photometer...1w.1h.NO01L_Magee_AE33_BIR_dry_NRT.NO01L_abs_coef_AE33_v1.lev1.5.nc',
 'NO0002R.20240716050000.20241016053606.aerosol_mass_spectrometer...3mo.1h.NO01L_Aerodyne_Q-ACSM_140-144_NRT.NO01L_Aerodyne_Q-ACSM_BIR.lev1.5.nc',
 'IT0022C.20240716050000.20241016051207.filter_absorption_photometer...3mo.1h.IT06L_Magee_AE33_BO_NRT.IT06L_eBC_v1.lev3b.nc',
 'IT0022C.20240716050000.20241016050604.filter_absorption_photometer

## Check if files can be opened

In [4]:
wrong_files = []
correct_files = []
# Reason each failing file could not be opened, keyed by file name
open_errors = {}
actris_nrt_path = 'https://thredds.nilu.no/thredds/dodsC/actris_nrt'
for file in files:
    opendap_url = f'{actris_nrt_path}/{file}'
    try:
        # The context manager closes the remote handle as soon as the check is done
        with xr.open_dataset(opendap_url):
            pass
    except Exception as error:
        # Best-effort check: remember the failure and carry on with the next file
        wrong_files.append(file)
        open_errors[file] = repr(error)
    else:
        correct_files.append(file)

print(f'Total number of files: {len(files)}')
print(f'OK: {len(correct_files)}')
print(f'Error: {len(wrong_files)}')
# Show why each file failed so the problem can be diagnosed
for file, error in open_errors.items():
    print(f'  {file}: {error}')

Total number of files: 45
OK: 45
Error: 0


## Collect basic information per file

In [5]:
# Start from two independent empty dictionaries
data_dict, all_variables = {}, {}

In [6]:
actris_nrt_path = 'https://thredds.nilu.no/thredds/dodsC/actris_nrt'
for file in correct_files:
    opendap_url = f'{actris_nrt_path}/{file}'
    # Only attributes and the time bounds are needed, so release the remote handle right after
    with xr.open_dataset(opendap_url) as ds:
        # Keep every data variable that carries an 'ebas_component' attribute.
        # data_vars keys are unique within a dataset, so no duplicate check is required
        # (the component name must not be compared against variable names).
        variables = [var for var, data in ds.data_vars.items()
                     if 'ebas_component' in data.attrs]
        station_code = ds.attrs['ebas_station_code']
        start_date = ds.time[0].values
        end_date = ds.time[-1].values

    for var in variables:
        # setdefault registers a new variable name without wiping an existing entry
        all_variables.setdefault(var, {})

    data_dict[file] = {
        'variables': variables,
        'station_code': station_code,
        'start_date': start_date,
        'end_date': end_date,
    }

In [7]:
# Inspect the per-file summary: variables, station code and first/last timestamp
data_dict

{'RO0010R.20240716050000.20241016060828.filter_absorption_photometer...3mo.1h.RO03L_Magee_AE33_INO_NRT.RO03L_eBC_v1.lev3b.nc': {'variables': ['pressure',
   'temperature',
   'aerosol_absorption_angstrom_exponent',
   'aerosol_absorption_coefficient_amean',
   'aerosol_absorption_coefficient_prec1587',
   'aerosol_absorption_coefficient_perc8413',
   'equivalent_black_carbon_amean',
   'equivalent_black_carbon_prec1587',
   'equivalent_black_carbon_perc8413'],
  'station_code': 'RO0010R',
  'start_date': numpy.datetime64('2024-07-16T05:30:00.000000000'),
  'end_date': numpy.datetime64('2024-10-16T04:30:00.000000000')},
 'RO0010R.20240716050000.20241016060417.filter_absorption_photometer...3mo.1h.RO03L_Magee_AE33_INO_NRT.RO03L_AE33.lev1.5.nc': {'variables': ['pressure',
   'temperature',
   'aerosol_absorption_coefficient_amean',
   'aerosol_absorption_coefficient_prec1587',
   'aerosol_absorption_coefficient_perc8413'],
  'station_code': 'RO0010R',
  'start_date': numpy.datetime64('202

In [8]:
# Inspect the variable names registered so far (each maps to an empty dict at this point)
all_variables

{'pressure': {},
 'temperature': {},
 'aerosol_absorption_angstrom_exponent': {},
 'aerosol_absorption_coefficient_amean': {},
 'aerosol_absorption_coefficient_prec1587': {},
 'aerosol_absorption_coefficient_perc8413': {},
 'equivalent_black_carbon_amean': {},
 'equivalent_black_carbon_prec1587': {},
 'equivalent_black_carbon_perc8413': {},
 'ammonium_ug_N_per_m3_amean': {},
 'ammonium_ug_per_m3_amean': {},
 'ammonium_ug_N_per_m3_uncertainty': {},
 'ammonium_ug_per_m3_uncertainty': {},
 'chloride_amean': {},
 'chloride_uncertainty': {},
 'nitrate_ug_N_per_m3_amean': {},
 'nitrate_ug_per_m3_amean': {},
 'nitrate_ug_N_per_m3_uncertainty': {},
 'nitrate_ug_per_m3_uncertainty': {},
 'organic_mass_amean': {},
 'organic_mass_uncertainty': {},
 'sulphate_total_ug_S_per_m3_amean': {},
 'sulphate_total_ug_per_m3_amean': {},
 'sulphate_total_ug_S_per_m3_uncertainty': {},
 'sulphate_total_ug_per_m3_uncertainty': {},
 'nitrate_ug_N_per_m3_amean_Fraction': {},
 'nitrate_ug_per_m3_amean_Fraction': {

## Get files per variable

In [9]:
# For every registered variable, list the files whose summary includes it
for var_name, var_info in all_variables.items():
    var_info['files'] = [
        file_name
        for file_name, file_info in data_dict.items()
        if var_name in file_info['variables']
    ]

In [10]:
# Inspect the registry again, now with a 'files' list attached to each variable
all_variables

{'pressure': {'files': ['RO0010R.20240716050000.20241016060828.filter_absorption_photometer...3mo.1h.RO03L_Magee_AE33_INO_NRT.RO03L_eBC_v1.lev3b.nc',
   'RO0010R.20240716050000.20241016060417.filter_absorption_photometer...3mo.1h.RO03L_Magee_AE33_INO_NRT.RO03L_AE33.lev1.5.nc',
   'RO0010R.20240716050000.20241016053408.aerosol_mass_spectrometer...3mo.1h.RO03L_Aerodyne_Q-ACSM_140-172_NRT.RO03L_Aerodyne_Q-ACSM_.lev1.5.nc',
   'NO0002R.20241002110000.20241004105104.cpc...47h.1h.NO01L_TSI_3750_BIR_NRT.NO01L_CPC_acquisition_lev0_0_0_1.lev1.5.nc',
   'NO0002R.20241002090000.20241009001220.filter_absorption_photometer...1w.1h.NO01L_Magee_AE33_BIR_dry_NRT.NO01L_abs_coef_AE33_v1.lev1.5.nc',
   'NO0002R.20240716050000.20241016053606.aerosol_mass_spectrometer...3mo.1h.NO01L_Aerodyne_Q-ACSM_140-144_NRT.NO01L_Aerodyne_Q-ACSM_BIR.lev1.5.nc',
   'IT0022C.20240716050000.20241016051207.filter_absorption_photometer...3mo.1h.IT06L_Magee_AE33_BO_NRT.IT06L_eBC_v1.lev3b.nc',
   'IT0022C.20240716050000.202410

## Test combination

In [11]:
# Use the first two files that provide particle number concentration for the test
pnc_files = all_variables['particle_number_concentration_amean']['files']
file_1, file_2 = pnc_files[0], pnc_files[1]

In [12]:
# Open the first test file through its OPeNDAP URL and display it
url_1 = f'{actris_nrt_path}/{file_1}'
ds_1 = xr.open_dataset(url_1)
ds_1

In [13]:
# Add a length-1 'station' dimension (label 0) so the dataset can be concatenated along it.
# The guard keeps the cell re-runnable: expand_dims fails when 'station' already exists.
if 'station' not in ds_1.dims:
    ds_1 = ds_1.expand_dims(dim={"station": [0]})
ds_1

In [14]:
# Open the second test file through its OPeNDAP URL and display it
url_2 = f'{actris_nrt_path}/{file_2}'
ds_2 = xr.open_dataset(url_2)
ds_2

In [15]:
# Add a length-1 'station' dimension (label 1) so the dataset can be concatenated along it.
# The guard keeps the cell re-runnable: expand_dims fails when 'station' already exists.
if 'station' not in ds_2.dims:
    ds_2 = ds_2.expand_dims(dim={"station": [1]})
ds_2

In [16]:
# Take the same variable from both test datasets, keeping their order
test_var = 'particle_number_concentration_amean'
combined_ds_list = [source_ds[test_var] for source_ds in (ds_1, ds_2)]

In [17]:
# Stack both arrays along 'station'; attributes that differ between the inputs are dropped
combined_ds = xr.concat(combined_ds_list,
                        dim='station',
                        combine_attrs='drop_conflicts')
combined_ds = combined_ds.to_dataset()
# Read the station codes from the source datasets instead of hard-coding them, so the
# labels stay correct when the catalog (and therefore file_1/file_2) changes
station_codes = [source_ds.attrs['ebas_station_code'] for source_ds in (ds_1, ds_2)]
combined_ds['ebas_station_code'] = xr.Variable(data=station_codes, dims=('station',))
combined_ds

In [18]:
# Global attribute names of the first test dataset that start with the station prefix
station_prefix = 'ebas_station'
ebas_keys = [attr_name for attr_name in ds_1.attrs if attr_name.startswith(station_prefix)]
ebas_keys

['ebas_station_code',
 'ebas_station_name',
 'ebas_station_wdca_id',
 'ebas_station_gaw_id',
 'ebas_station_gaw_name',
 'ebas_station_other_ids',
 'ebas_station_land_use',
 'ebas_station_setting',
 'ebas_station_gaw_type',
 'ebas_station_wmo_region',
 'ebas_station_latitude',
 'ebas_station_longitude',
 'ebas_station_altitude']

## Preprocessing of files

In [19]:
# Define metadata keys, taken from the first dataset (TODO: Check all available metadata fields)
metadata_keys = ['ebas_station_code','ebas_station_name',
                 'ebas_station_wdca_id', 'ebas_station_gaw_id',
                 'ebas_station_gaw_name', 'ebas_station_other_ids',
                 'ebas_station_land_use', 'ebas_station_setting',
                 'ebas_station_gaw_type', 'ebas_station_wmo_region',
                 'ebas_station_latitude', 'ebas_station_longitude',
                 'ebas_station_altitude']

# Root folder of the output tree; kept in one place so it is easy to relocate
output_root = '/home/avilanov/data/providentia/obs/nonghost/actris/nrt'

# For each variable: open all files providing it, extract it and join along 'station'
resolution = 'hourly'
for var in all_variables:

    var_files = all_variables[var]['files']
    if not var_files:
        # xr.concat cannot combine an empty list; there is nothing to write for this variable
        print(f"Skipped (no files): {var}")
        continue

    # combine datasets that have the same variable
    combined_ds_list = []
    metadata = {ebas_key: [] for ebas_key in metadata_keys}
    for i, file in enumerate(var_files):
        opendap_url = f'{actris_nrt_path}/{file}'
        # The context manager closes the remote handle once the data is in memory
        with xr.open_dataset(opendap_url) as ds:

            # save metadata; NaN marks a key that is missing from this file
            # NOTE(review): mixing NaN with string values gives an object array that
            # to_netcdf may not be able to serialize - confirm a suitable fill value
            for ebas_key in metadata_keys:
                metadata[ebas_key].append(ds.attrs.get(ebas_key, np.nan))

            # add a 'station' dimension labelled with the file index (not the station code),
            # keep only this variable and load it so it stays usable after the file is closed
            ds_var = ds.expand_dims(dim={'station': [i]})[var].load()

        # append modified data to list
        combined_ds_list.append(ds_var)

    # combine and create new dataset
    combined_ds = xr.concat(combined_ds_list,
                            dim='station',
                            combine_attrs='drop_conflicts').to_dataset()

    # add metadata as one value per entry along 'station'
    for key, values in metadata.items():
        combined_ds[key] = xr.Variable(data=values, dims=('station',))

    # save data per year and month
    path = f'{output_root}/{resolution}/{var}'
    os.makedirs(path, exist_ok=True)
    for year, ds_year in combined_ds.groupby('time.year'):
        for month, ds_month in ds_year.groupby('time.month'):
            filename = f"{path}/{var}_{year}{month:02d}.nc"
            # ds_month already holds exactly this year/month, so no second selection is needed
            ds_month.to_netcdf(filename)
            print(f"Saved: {filename}")

Saved: /home/avilanov/data/providentia/obs/nonghost/actris/nrt/hourly/pressure/pressure_202404.nc
Saved: /home/avilanov/data/providentia/obs/nonghost/actris/nrt/hourly/pressure/pressure_202405.nc
Saved: /home/avilanov/data/providentia/obs/nonghost/actris/nrt/hourly/pressure/pressure_202406.nc
Saved: /home/avilanov/data/providentia/obs/nonghost/actris/nrt/hourly/pressure/pressure_202407.nc
Saved: /home/avilanov/data/providentia/obs/nonghost/actris/nrt/hourly/pressure/pressure_202408.nc
Saved: /home/avilanov/data/providentia/obs/nonghost/actris/nrt/hourly/pressure/pressure_202409.nc
Saved: /home/avilanov/data/providentia/obs/nonghost/actris/nrt/hourly/pressure/pressure_202410.nc
Saved: /home/avilanov/data/providentia/obs/nonghost/actris/nrt/hourly/temperature/temperature_202404.nc
Saved: /home/avilanov/data/providentia/obs/nonghost/actris/nrt/hourly/temperature/temperature_202405.nc
Saved: /home/avilanov/data/providentia/obs/nonghost/actris/nrt/hourly/temperature/temperature_202406.nc
Sa

## Check new files

In [2]:
import xarray as xr

# Open one monthly nitrate file from the output folder and display it
nitrate_check_path = '/home/avilanov/data/providentia/obs/nonghost/actris/nrt/hourly/nitrate_ug_N_per_m3_amean/nitrate_ug_N_per_m3_amean_202407.nc'
xr.open_dataset(nitrate_check_path)

In [20]:
# Open one monthly absorption Angstrom exponent file from the output folder and display it
angstrom_check_path = '/home/avilanov/data/providentia/obs/nonghost/actris/nrt/hourly/aerosol_absorption_angstrom_exponent/aerosol_absorption_angstrom_exponent_202407.nc'
xr.open_dataset(angstrom_check_path)