In [1]:
import requests
import itertools
import os
import yaml
import xarray as xr
import numpy as np
from variable_mapping import variable_mapping
import datetime
import csv
import sys
import time
import pandas as pd
import re

# Make version 1.5 of the GHOST standards importable.
# CURRENT_PATH is the working directory at the moment this cell runs, so the
# relative '../dependencies/...' location depends on where the kernel was started.
CURRENT_PATH = os.getcwd()
# Drop every sys.path entry that contains the GHOST_standards directory (e.g. one
# inserted by an earlier run of this cell) before registering the wanted version.
sys.path = [path for path in sys.path if '../dependencies/GHOST_standards/' not in path]            
sys.path.insert(1, os.path.join(CURRENT_PATH, '../dependencies/GHOST_standards/1.5'))
# Must come after the sys.path changes above, otherwise the module cannot be resolved from that directory.
from GHOST_standards import standard_parameters, get_standard_metadata

In [2]:
# Timestamp (seconds since the epoch) taken before any processing starts;
# presumably used further down to report the total run time.
start_time = time.time()

In [3]:
# Run configuration: which variables to process, at which temporal resolution,
# and over which period. Alternative configuration kept for reference:
#variables = ['sconcno2']
#resolution = 'daily'

variables = ['lsco525']
# must be one of the values of coverages_dict ('hourly', 'daily', 'monthly')
resolution = 'hourly'
# target period, both ends given to the hour
target_start_date = datetime.datetime(2018, 1, 1, 0)
target_end_date = datetime.datetime(2018, 12, 31, 23)

In [4]:
# Maps the 'time_coverage_resolution' attribute of a dataset (a duration string)
# to the resolution labels used in this notebook.
coverages_dict = {'P0000-00-00T01:00:00': 'hourly', 
                  'P0000-00-01T00:00:00': 'daily', 
                  'P0000-01-00T00:00:00': 'monthly'}

# Maps unit strings to the unit labels used in this notebook.
# NOTE(review): 'ug N/m3' maps to 'ug_per_m3' (no 'N') while 'ug S/m3' keeps the
# 'S', and 'pmol/mol' maps to 'pmol_mol' (no '_per_') unlike 'nmol_per_mol'.
# Confirm whether these differences are intentional.
units_dict = {'nmol/mol': 'nmol_per_mol',
              'ug/m3': 'ug_per_m3',
              'ug N/m3': 'ug_per_m3',
              'ug S/m3': 'ug_S_per_m3',
              'pmol/mol': 'pmol_mol',
              '1/Mm': ''
              } 

# Get ACTRIS variable mapping

In [5]:
def create_variable_mapping_file():
    """Write ``variable_mapping.yaml`` from the imported ``variable_mapping``.

    Every entry of ``variable_mapping`` is re-keyed by its ``preferred_term``
    (with double quotes stripped); the stored value keeps element 2 of the
    original key as ``var`` and element 0 as ``units``. When two entries share
    a preferred term, the later one wins.

    Note: this relies on ``variable_mapping`` still being the object imported
    from the ``variable_mapping`` module. A later cell rebinds that name to the
    content of the YAML file, so this function has to run before that cell.
    """
    mapping_by_term = {}
    for source_key, entry in variable_mapping.items():
        term = entry['preferred_term'].replace('"', '')
        mapping_by_term[term] = {'var': source_key[2], 'units': source_key[0]}

    with open('variable_mapping.yaml', 'w') as yaml_file:
        yaml.dump(mapping_by_term, yaml_file, default_flow_style=False)

In [6]:
#create_variable_mapping_file()

In [7]:
# Load the ACTRIS variable mapping previously written to variable_mapping.yaml.
# The file is opened in a `with` block so the handle is closed right away, and an
# empty YAML document (safe_load returns None) is treated as an empty mapping.
# Note: this rebinds the name imported from the `variable_mapping` module.
with open(os.path.join(CURRENT_PATH, 'variable_mapping.yaml')) as mapping_file:
    variable_mapping = yaml.safe_load(mapping_file) or {}
# discard entries with a blank name or without content
variable_mapping = {k: v for k, v in variable_mapping.items() if k.strip() and v}

# Get BSC-ACTRIS parameters dictionary

In [8]:
def create_actris_variables_file():
    """Write ``actris_variables.csv`` with one row per ``variable_mapping`` entry.

    Each row holds the mapping key followed by the ``var`` field of its value.
    """
    with open('actris_variables.csv', mode="w", newline="", encoding="utf-8") as csv_file:
        csv.writer(csv_file).writerows(
            [term, details['var']] for term, details in variable_mapping.items()
        )

In [9]:
#create_actris_variables_file()

In [10]:
def create_ghost_variables_file():
    """Write ``ghost_variables.csv`` with one row per GHOST standard parameter.

    Columns: long parameter name, BSC parameter name, and the EBAS parameter
    names joined with ``', '``.
    """
    with open('ghost_variables.csv', mode="w", newline="", encoding="utf-8") as csv_file:
        writer = csv.writer(csv_file)
        for parameter in standard_parameters.values():
            long_name = parameter['long_parameter_name']
            bsc_name = parameter['bsc_parameter_name']
            ebas_names = ', '.join(parameter['ebas_parameter_name'])
            writer.writerow([long_name, bsc_name, ebas_names])

In [11]:
#create_ghost_variables_file()

In [12]:
# Build the lookup from column 2 of ghost_actris_variables.csv (used as key) to
# column 7 (used as value). Per the original notes, column 7 was filled in by
# hand with the ACTRIS term, and the row holding 'Preferred' there (with 'BSC' in
# column 2) is presumably the header row, so it is skipped.
parameters_dict = {}
# newline='' is what the csv module asks for when reading from a file object
with open('ghost_actris_variables.csv', mode='r', newline='', encoding='utf-8') as file:
    for row in csv.reader(file):
        # blank lines come back as [] and truncated rows have no column 7;
        # skip them instead of failing with an IndexError
        if len(row) <= 6:
            continue
        actris_term = row[6]
        # keep the row only if an ACTRIS term was found (manual intervention)
        if actris_term and actris_term != 'Preferred':
            parameters_dict[row[1]] = actris_term
parameters_dict

{'sconco3': 'ozone mass concentration',
 'sconcno2': 'nitrogen dioxide mass concentration',
 'sconcso2': 'sulfur dioxide mass concentration',
 'sconcco': 'carbon monoxide amount fraction',
 'sconcch4': 'methane amount fraction',
 'sconcglyox': 'glyoxal mass concentration',
 'sconcc2h4': 'ethene amount fraction',
 'sconcald2': 'acetaldehyde mass concentration',
 'sconcc2h6': 'ethane amount fraction',
 'sconcetoh': 'ethanol mass concentration',
 'sconcc3h6': 'propene amount fraction',
 'sconcc3h8': 'propane amount fraction',
 'sconcc4h6': '1,3-butadiene amount fraction',
 'sconcc4h8': '1-butene amount fraction',
 'sconcisop': 'isoprene amount fraction',
 'sconcc5h12': 'n-pentane amount fraction',
 'sconcc6h6': 'benzene amount fraction',
 'sconcc6h14': 'n-hexane amount fraction',
 'sconcc7h8': 'toluene',
 'sconcmpxyl': 'm/p-xylenes amount fraction',
 'sconcmxyl': 'm-xylene amount fraction',
 'sconcoxyl': 'o-xylene amount fraction',
 'sconcc9h12': '1,2,4-trimethylbenzene amount fraction',


# Get BSC-ACTRIS metadata dictionary

In [13]:
# Ask the GHOST standards module for its metadata definition (called with an empty
# 'standard_units' entry) and keep the list of GHOST metadata field names.
standard_metadata = get_standard_metadata({'standard_units': ''})
ghost_metadata = list(standard_metadata.keys())

In [14]:
# Names of the global attributes considered on the ACTRIS side; the values of
# all_metadata_dict are taken from this list.
# NOTE(review): 'ebas_humidity_temperaure_control' looks misspelled ('temperaure');
# confirm against the attribute name actually present in the files before changing it.
actris_metadata = ['Conventions', 'featureType', 'title', 'keywords', 'id', 'naming_authority',
                   'project', 'acknowledgement', 'doi', 'license', 'citation', 'summary', 'source', 
                   'institution', 'processing_level', 'date_created', 'date_metadata_modified', 
                   'creator_name', 'creator_type', 'creator_email', 'creator_institution',
                   'contributor_name', 'contributor_role', 'publisher_type', 'publisher_name', 
                   'publisher_institution', 'publisher_email', 'publisher_url', 'geospatial_bounds', 
                   'geospatial_bounds_crs', 'geospatial_lat_min', 'geospatial_lat_max',
                   'geospatial_lon_min', 'geospatial_lon_max', 'geospatial_vertical_min', 
                   'geospatial_vertical_max', 'geospatial_vertical_positive', 'time_coverage_start',
                   'time_coverage_end', 'time_coverage_duration', 'time_coverage_resolution',
                   'timezone', 'ebas_data_definition', 'ebas_data_license', 'ebas_citation', 
                   'ebas_set_type_code', 'ebas_timezone', 'ebas_file_name', 'ebas_represents_doi',
                   'ebas_contains_doi', 'ebas_file_creation', 'ebas_export_state', 'ebas_export_filter',
                   'ebas_startdate', 'ebas_revision_date', 'ebas_data_level', 'ebas_period_code',
                   'ebas_resolution_code', 'ebas_sample_duration', 'ebas_orig_time_res',
                   'ebas_station_code', 'ebas_platform_code', 'ebas_station_name', 
                   'ebas_station_wdca_id', 'ebas_station_gaw_id', 'ebas_station_gaw_name',
                   'ebas_station_land_use', 'ebas_station_setting', 'ebas_station_gaw_type', 
                   'ebas_station_wmo_region', 'ebas_station_latitude', 'ebas_station_longitude', 
                   'ebas_station_altitude', 'ebas_measurement_height', 'ebas_regime', 'ebas_component',
                   'ebas_matrix', 'ebas_laboratory_code', 'ebas_instrument_type', 'ebas_instrument_name',
                   'ebas_instrument_manufacturer', 'ebas_instrument_model', 
                   'ebas_instrument_serial_number', 'ebas_method_ref', 'ebas_standard_method',
                   'ebas_inlet_type', 'ebas_inlet_description', 'ebas_humidity_temperaure_control',
                   'ebas_absorption_cross_section', 'ebas_organization', 'ebas_framework_acronym',
                   'ebas_framework_name', 'ebas_framework_description', 'ebas_framework_contact_name',
                   'ebas_framework_contact_email', 'ebas_originator', 'ebas_submitter', 
                   'ebas_acknowledgement', 'Metadata_Conventions', 'geospatial_lat_units', 
                   'geospatial_lon_units', 'comment', 'standard_name_vocabulary', 'history', 
                   'creator_url']

In [15]:
# Hand-written correspondence between metadata field names (keys; these appear to
# be the GHOST standard fields) and the ACTRIS/EBAS global attribute that provides
# the value. An empty string means no ACTRIS counterpart has been assigned; those
# entries are filtered out when metadata_dict is built.
# Several entries share one source attribute (e.g. 'altitude' and
# 'measurement_altitude', 'network' and 'associated_networks').
all_metadata_dict = {'station_reference': 'ebas_station_code',
                     'WIGOS_station_identifier': '',
                     'station_timezone': 'timezone',
                     'latitude': 'ebas_station_latitude',
                     'longitude': 'ebas_station_longitude',
                     'altitude': 'ebas_station_altitude',
                     'sampling_height': 'ebas_measurement_height',
                     'measurement_altitude': 'ebas_station_altitude', # no diff with altitude?
                     'ellipsoid': '',
                     'horizontal_datum': '',
                     'vertical_datum': '',
                     'projection': '',
                     'distance_to_building': '',
                     'distance_to_kerb': '',
                     'distance_to_junction': '',
                     'distance_to_source': '',
                     'street_width': '',
                     'street_type': '',
                     'daytime_traffic_speed': '',
                     'daily_passing_vehicles': '',
                     'data_level': 'ebas_data_level',
                     'climatology': '',
                     'station_name': 'ebas_station_name',
                     'city': '', # get from title
                     'country': '',
                     'administrative_country_division_1': '',
                     'administrative_country_division_2': '',
                     'population': '',
                     'representative_radius': '',
                     'network': 'naming_authority',
                     'associated_networks': 'naming_authority',
                     'area_classification': '',
                     'station_classification': '',
                     'main_emission_source': '',
                     'land_use': 'ebas_station_land_use',
                     'terrain': '',
                     'measurement_scale': '',
                     'ESDAC_Iwahashi_landform_classification': '',
                     'ESDAC_modal_Iwahashi_landform_classification_5km': '',
                     'ESDAC_modal_Iwahashi_landform_classification_25km': '',
                     'ESDAC_Meybeck_landform_classification': '',
                     'ESDAC_modal_Meybeck_landform_classification_5km': '',
                     'ESDAC_modal_Meybeck_landform_classification_25km': '',
                     'GHSL_settlement_model_classification': '',
                     'GHSL_modal_settlement_model_classification_5km': '',
                     'GHSL_modal_settlement_model_classification_25km': '',
                     'Joly-Peuch_classification_code': '',
                     'Koppen-Geiger_classification': '',
                     'Koppen-Geiger_modal_classification_5km': '',
                     'Koppen-Geiger_modal_classification_25km': '',
                     'MODIS_MCD12C1_v6_IGBP_land_use': '',
                     'MODIS_MCD12C1_v6_modal_IGBP_land_use_5km': '',
                     'MODIS_MCD12C1_v6_modal_IGBP_land_use_25km': '',
                     'MODIS_MCD12C1_v6_UMD_land_use': '',
                     'MODIS_MCD12C1_v6_modal_UMD_land_use_5km': '',
                     'MODIS_MCD12C1_v6_modal_UMD_land_use_25km': '',
                     'MODIS_MCD12C1_v6_LAI': '',
                     'MODIS_MCD12C1_v6_modal_LAI_5km': '',
                     'MODIS_MCD12C1_v6_modal_LAI_25km': '',
                     'WMO_region': 'ebas_station_wmo_region',
                     'WWF_TEOW_terrestrial_ecoregion': '',
                     'WWF_TEOW_biogeographical_realm': '',
                     'WWF_TEOW_biome': '',
                     'UMBC_anthrome_classification': '',
                     'UMBC_modal_anthrome_classification_5km': '',
                     'UMBC_modal_anthrome_classification_25km': '',
                     'EDGAR_v4.3.2_annual_average_BC_emissions': '',
                     'EDGAR_v4.3.2_annual_average_CO_emissions': '',
                     'EDGAR_v4.3.2_annual_average_NH3_emissions': '',
                     'EDGAR_v4.3.2_annual_average_NMVOC_emissions': '',
                     'EDGAR_v4.3.2_annual_average_NOx_emissions': '',
                     'EDGAR_v4.3.2_annual_average_OC_emissions': '',
                     'EDGAR_v4.3.2_annual_average_PM10_emissions': '',
                     'EDGAR_v4.3.2_annual_average_biogenic_PM2.5_emissions': '',
                     'EDGAR_v4.3.2_annual_average_fossilfuel_PM2.5_emissions': '',
                     'EDGAR_v4.3.2_annual_average_SO2_emissions': '',
                     'ASTER_v3_altitude': '',
                     'ETOPO1_altitude': '',
                     'ETOPO1_max_altitude_difference_5km': '',
                     'GHSL_built_up_area_density': '',
                     'GHSL_average_built_up_area_density_5km': '',
                     'GHSL_average_built_up_area_density_25km': '',
                     'GHSL_max_built_up_area_density_5km': '',
                     'GHSL_max_built_up_area_density_25km': '',
                     'GHSL_population_density': '',
                     'GHSL_average_population_density_5km': '',
                     'GHSL_average_population_density_25km': '',
                     'GHSL_max_population_density_5km': '',
                     'GHSL_max_population_density_25km': '',
                     'GPW_population_density': '',
                     'GPW_average_population_density_5km': '',
                     'GPW_average_population_density_25km': '',
                     'GPW_max_population_density_5km': '',
                     'GPW_max_population_density_25km': '',
                     'NOAA-DMSP-OLS_v4_nighttime_stable_lights': '',
                     'NOAA-DMSP-OLS_v4_average_nighttime_stable_lights_5km': '',
                     'NOAA-DMSP-OLS_v4_average_nighttime_stable_lights_25km': '',
                     'NOAA-DMSP-OLS_v4_max_nighttime_stable_lights_5km': '',
                     'NOAA-DMSP-OLS_v4_max_nighttime_stable_lights_25km': '',
                     'OMI_level3_column_annual_average_NO2': '',
                     'OMI_level3_column_cloud_screened_annual_average_NO2': '',
                     'OMI_level3_tropospheric_column_annual_average_NO2': '',
                     'OMI_level3_tropospheric_column_cloud_screened_annual_average_NO2': '',
                     'GSFC_coastline_proximity': '',
                     'primary_sampling_type': '',
                     'primary_sampling_instrument_name': '',
                     'primary_sampling_instrument_documented_flow_rate': '',
                     'primary_sampling_instrument_reported_flow_rate': '',
                     'primary_sampling_process_details': '',
                     'primary_sampling_instrument_manual_name': '',
                     'primary_sampling_further_details': '',
                     'sample_preparation_types': '',
                     'sample_preparation_techniques': '',
                     'sample_preparation_process_details': '',
                     'sample_preparation_further_details': '',
                     'measurement_methodology': 'ebas_method_ref',
                     'measuring_instrument_name': 'ebas_instrument_name',
                     'measuring_instrument_sampling_type': 'ebas_instrument_type',
                     'measuring_instrument_documented_flow_rate': '',
                     'measuring_instrument_reported_flow_rate': '',
                     'measuring_instrument_process_details': '',
                     'measuring_instrument_manual_name': '',
                     'measuring_instrument_further_details': '',
                     'measuring_instrument_reported_units': '',
                     'measuring_instrument_reported_lower_limit_of_detection': '',
                     'measuring_instrument_documented_lower_limit_of_detection': '',
                     'measuring_instrument_reported_upper_limit_of_detection': '',
                     'measuring_instrument_documented_upper_limit_of_detection': '',
                     'measuring_instrument_reported_uncertainty': '',
                     'measuring_instrument_documented_uncertainty': '',
                     'measuring_instrument_reported_accuracy': '',
                     'measuring_instrument_documented_accuracy': '',
                     'measuring_instrument_reported_precision': '',
                     'measuring_instrument_documented_precision': '',
                     'measuring_instrument_reported_zero_drift': '',
                     'measuring_instrument_documented_zero_drift': '',
                     'measuring_instrument_reported_span_drift': '',
                     'measuring_instrument_documented_span_drift': '',
                     'measuring_instrument_reported_zonal_drift': '',
                     'measuring_instrument_documented_zonal_drift': '',
                     'measuring_instrument_reported_measurement_resolution': '',
                     'measuring_instrument_documented_measurement_resolution': '',
                     'measuring_instrument_reported_absorption_cross_section': '',
                     'measuring_instrument_documented_absorption_cross_section': '',
                     'measuring_instrument_inlet_information': 'ebas_inlet_description',
                     'measuring_instrument_calibration_scale': '',
                     'network_provided_volume_standard_temperature': '',
                     'network_provided_volume_standard_pressure': '',
                     'retrieval_algorithm': '',
                     'principal_investigator_name': 'creator_name',
                     'principal_investigator_institution': 'creator_institution',
                     'principal_investigator_email_address': 'creator_email',
                     'contact_name': 'publisher_name', # TODO: in doubt
                     'contact_institution': 'publisher_institution', # TODO: in doubt
                     'contact_email_address': 'publisher_email', # TODO: in doubt
                     'meta_update_stamp': 'date_metadata_modified',
                     'data_download_stamp': '', # TODO: Do we put our download date here?
                     'data_revision_stamp': '',
                     'network_sampling_details': '',
                     'network_uncertainty_details': '',
                     'network_maintenance_details': '',
                     'network_qa_details': '',
                     'network_miscellaneous_details': '',
                     'data_licence': 'license',
                     'process_warnings': ''}

In [16]:
# Keep only the fields of all_metadata_dict that have a source attribute assigned
# (i.e. whose value is not the empty string), then display the result.
metadata_dict = {
    field_name: source_attribute
    for field_name, source_attribute in all_metadata_dict.items()
    if source_attribute != ''
}
metadata_dict

{'station_reference': 'ebas_station_code',
 'station_timezone': 'timezone',
 'latitude': 'ebas_station_latitude',
 'longitude': 'ebas_station_longitude',
 'altitude': 'ebas_station_altitude',
 'sampling_height': 'ebas_measurement_height',
 'measurement_altitude': 'ebas_station_altitude',
 'data_level': 'ebas_data_level',
 'station_name': 'ebas_station_name',
 'network': 'naming_authority',
 'associated_networks': 'naming_authority',
 'land_use': 'ebas_station_land_use',
 'WMO_region': 'ebas_station_wmo_region',
 'measurement_methodology': 'ebas_method_ref',
 'measuring_instrument_name': 'ebas_instrument_name',
 'measuring_instrument_sampling_type': 'ebas_instrument_type',
 'measuring_instrument_inlet_information': 'ebas_inlet_description',
 'principal_investigator_name': 'creator_name',
 'principal_investigator_institution': 'creator_institution',
 'principal_investigator_email_address': 'creator_email',
 'contact_name': 'publisher_name',
 'contact_institution': 'publisher_institution'

# Get files per variable

In [17]:
def get_files_per_variable(chunk_i, variables, timeout=60):
    """Collect the OPeNDAP dataset URLs published for each variable.

    For every variable, the ACTRIS metadata service is queried page by page
    (starting at page 0) using the term stored in ``parameters_dict`` for that
    variable. Paging stops at the first empty page or the first response whose
    status code is not 200.

    Parameters
    ----------
    chunk_i : int
        Index of the chunk being processed; only used in the progress output.
    variables : sequence of str
        Variable names; each must be a key of ``parameters_dict``.
    timeout : float, optional
        Seconds to wait for each HTTP request before ``requests`` raises a
        timeout error. Without it a stalled connection would block forever.

    Returns
    -------
    dict
        ``{variable: {'files': [url, ...]}}`` with the URLs in the order in
        which the service returned them.

    Raises
    ------
    KeyError
        If a variable is missing from ``parameters_dict``.
    requests.RequestException
        If a request fails at the connection level or times out.
    """
    files_per_var = {}
    base_url = "https://prod-actris-md.nilu.no/metadata/content"
    n_variables = len(variables)
    print('Variables:', variables)
    for var_i, var in enumerate(variables):
        print(f'[{chunk_i}] {var} ({var_i}/{n_variables})')
        variable_files = []
        page = 0
        while True:
            # Set up URL with pagination
            # NOTE(review): the term is inserted into the URL path as is; terms that
            # contain '/' (e.g. 'm/p-xylenes amount fraction') add a path segment.
            # Confirm how the service expects such terms to be written.
            url = f"{base_url}/{parameters_dict[var]}/page/{page}"
            response = requests.get(url, timeout=timeout)

            # Check if the response is valid and contains data
            # NOTE(review): stopping here returns the URLs gathered so far, so a
            # failure in the middle of the paging yields an incomplete list.
            if response.status_code != 200:
                print(f"Error fetching page {page}. Status code: {response.status_code}")
                break

            data = response.json()

            # An empty page marks the end of the results
            if not data:
                break

            # Keep the OPeNDAP URLs of every entry that also has a DOI
            for item in data:
                doi = item.get("md_identification", {}).get("identifier", {}).get("pid")
                # entries without a 'dataset_url' are ignored instead of raising KeyError
                opendap_urls = [
                    protocol_dict['dataset_url']
                    for protocol_dict in item.get('md_distribution_information', [])
                    if protocol_dict.get('protocol') == 'OPeNDAP' and protocol_dict.get('dataset_url')
                ]

                if doi and opendap_urls:
                    variable_files.extend(opendap_urls)

            # Go to the next page
            page += 1

        files_per_var.setdefault(var, {})['files'] = variable_files

    return files_per_var

In [18]:
def get_files_per_var_list(variables):
    """Gather the files of all variables, querying them in chunks of 100.

    The variables are split into consecutive chunks; each chunk is passed to
    ``get_files_per_variable`` together with its index and the results are
    merged into a single dictionary, which is returned.
    """
    chunk_size = 100
    variable_list = list(variables)
    merged_files = {}
    chunk_starts = range(0, len(variables), chunk_size)
    for chunk_i, first in enumerate(chunk_starts):
        chunk = variable_list[first:first + chunk_size]
        merged_files.update(get_files_per_variable(chunk_i, chunk))
    return merged_files

In [19]:
combined_data = get_files_per_var_list(variables)

Variables: ['lsco525']
[0] lsco525 (0/1)


# Get information on files

In [20]:
def get_files_info(var, files):
    """Read resolution, time coverage and variable names of each dataset.

    Every entry of ``files`` is opened with xarray. Datasets that cannot be
    opened, whose ``time_coverage_resolution`` is not a key of
    ``coverages_dict``, or that lack the time coverage start/end attributes are
    reported on standard output and left out of the result.

    Parameters
    ----------
    var : str
        Variable name; used as the single top-level key of the result.
    files : iterable of str
        Dataset locations passed to ``xr.open_dataset``.

    Returns
    -------
    dict
        ``{var: {file: {'resolution', 'start_date', 'end_date', 'variables'}}}``.
    """
    files_info = {var: {}}
    for i, file in enumerate(files):
        print(f'{i} - {file}')
        # `except Exception` (not a bare `except`) so that interrupting the kernel
        # during a long run is not mistaken for an unreadable dataset
        try:
            ds = xr.open_dataset(file)
        except Exception:
            print('Error opening dataset')
            continue

        # the context manager closes the dataset once the attributes are read
        with ds:
            coverage = ds.attrs.get('time_coverage_resolution')
            start_date = ds.attrs.get('time_coverage_start')
            end_date = ds.attrs.get('time_coverage_end')
            data_variables = list(ds.data_vars.keys())

        try:
            file_resolution = coverages_dict[coverage]
        except (KeyError, TypeError):
            # unknown, missing (None) or unhashable coverage value
            print('Error in resolution with coverage:', coverage)
            continue

        if start_date is None or end_date is None:
            print('Error: missing time coverage start or end date')
            continue

        files_info[var][file] = {
            'resolution': file_resolution,
            'start_date': start_date,
            'end_date': end_date,
            'variables': data_variables,
        }

    return files_info

In [21]:
def get_files_path(var):
    """Return the relative path of the YAML file index for ``var``.

    Variables whose alphabetic characters spell 'lsco', 'absco', 'lbsco' or
    'odaero' share one index per family (e.g. 'lsco525' -> 'files/lsco/files.yaml');
    any other variable gets its own folder named after the full variable name.
    """
    shared_families = ('lsco', 'absco', 'lbsco', 'odaero')
    letters_only = ''.join(filter(str.isalpha, var))
    folder = letters_only if letters_only in shared_families else var
    return f'files/{folder}/files.yaml'

In [22]:
%%time
# Create or refresh the YAML file index of every variable.
# - No index yet: read the information of all available files and write it.
# - Index present: add the files that became available, drop those that are no
#   longer listed, and rewrite the index only if something changed.
for var in variables:

    path = get_files_path(var)

    # if file does not exist
    if not os.path.isfile(path):

        print(f'File {path} does not exist, creating.')

        # get files information
        files = combined_data[var]['files']
        print('Total number of files:', len(files))
        print('Getting information...')
        files_info = get_files_info(var, files)

        # create file (only when at least one dataset could be read)
        datasets = dict(files_info[var])
        if datasets:
            os.makedirs(os.path.dirname(path), exist_ok=True)
            with open(path, 'w') as file:
                yaml.dump(datasets, file, default_flow_style=False)

    # if file exists
    else:
        print(f'File {path} already exists, checking if it needs an update.')

        # get currently available files
        current_files = combined_data[var]['files']

        # get previously saved files; an empty index file loads as None, so fall
        # back to an empty dictionary
        with open(os.path.join(CURRENT_PATH, path)) as file:
            file_info_to_update = yaml.safe_load(file) or {}
        previous_files = list(file_info_to_update.keys())

        # sets make the membership checks below independent of the list sizes
        current_lookup = set(current_files)
        previous_lookup = set(previous_files)

        # keep old files only if they are currently available
        files_to_remove = [file for file in previous_files if file not in current_lookup]

        # add new files
        files_to_add = [file for file in current_files if file not in previous_lookup]

        # get information from new files
        datasets = {}
        if files_to_add:
            print('- Files to add ({0}): {1}'.format(len(files_to_add), files_to_add))
            print('Getting information...')
            files_info = get_files_info(var, files_to_add)
            datasets = dict(files_info[var])

            # add new data to dictionary
            file_info_to_update.update(datasets)

        # remove unavailable data
        if files_to_remove:
            print('- Files to remove ({0}): {1}'.format(len(files_to_remove), files_to_remove))
            for file in files_to_remove:
                print(f'Removing file {file}')
                file_info_to_update.pop(file, None)

        # recreate file info
        if datasets or files_to_remove:
            print('Updating file...')
            # write to a temporary file and swap it in afterwards, so the existing
            # index survives if the dump fails half-way
            temporary_path = f'{path}.tmp'
            with open(temporary_path, 'w') as file:
                yaml.dump(file_info_to_update, file, default_flow_style=False)
            os.replace(temporary_path, path)
        else:
            print('No relevant changes were found.')

File files/lsco/files.yaml already exists, checking if it needs an update.
No relevant changes were found.
CPU times: user 357 ms, sys: 1.24 ms, total: 358 ms
Wall time: 360 ms


# Format data

In [23]:
def filter_files(var, resolution, target_start_date, target_end_date):
    """Select the indexed files of ``var`` that match a resolution and period.

    Parameters
    ----------
    var : str
        Variable name; its file index is located with ``get_files_path``.
    resolution : str
        Resolution label a file must have to be kept.
    target_start_date, target_end_date : datetime.datetime
        Period of interest. A file is kept when its time coverage overlaps it.

    Returns
    -------
    list of str
        Keys of the index (file locations) that satisfy both conditions, in
        index order. Empty when the index file has no content.

    Raises
    ------
    FileNotFoundError
        If the index file of the variable does not exist.
    ValueError
        If a stored date does not follow ``%Y-%m-%dT%H:%M:%S UTC``.
    """
    date_format = "%Y-%m-%dT%H:%M:%S UTC"
    path = get_files_path(var)

    # close the handle right after parsing; an empty index loads as None
    with open(os.path.join(CURRENT_PATH, path)) as index_file:
        files_info = yaml.safe_load(index_file) or {}

    files = []
    for file, attributes in files_info.items():
        # ignore entries with a blank name or without attributes
        if not (file.strip() and attributes):
            continue
        if attributes["resolution"] != resolution:
            continue
        start_date = datetime.datetime.strptime(attributes["start_date"], date_format)
        end_date = datetime.datetime.strptime(attributes["end_date"], date_format)
        # keep files whose coverage overlaps the target period
        if start_date <= target_end_date and end_date >= target_start_date:
            files.append(file)
    return files

In [24]:
def _weighted_value_at(date, valid_time, valid_data):
    """Estimate the value at ``date`` from the closest observations around it.

    ``valid_time`` and ``valid_data`` are non-empty arrays of the same length
    holding the observation times and their (non-NaN) values.
    """
    # signed distance (in nanoseconds) between each observation and the target date:
    # negative for earlier observations, positive for later ones
    time_diffs = (valid_time - date).astype('timedelta64[ns]').astype(float)

    # an observation taken exactly at the target date is used as is
    exact_matches = valid_data[time_diffs == 0]
    if len(exact_matches) > 0:
        return exact_matches[0]

    positive_diffs = time_diffs[time_diffs > 0]
    negative_diffs = time_diffs[time_diffs < 0]

    # when there are observations in one direction only, take the closest one
    if len(positive_diffs) == 0:
        return valid_data[time_diffs == negative_diffs.max()][-1]
    if len(negative_diffs) == 0:
        return valid_data[time_diffs == positive_diffs.min()][0]

    # closest observation after and before the target date
    closest_positive = positive_diffs.min()
    closest_negative = negative_diffs.max()
    closest_positive_value = valid_data[time_diffs == closest_positive][0]
    closest_negative_value = valid_data[time_diffs == closest_negative][-1]

    # inverse-distance weights: the nearer observation weighs more
    # (np.average normalises the weights itself)
    weights = 1 / np.abs([closest_negative, closest_positive])
    return np.average([closest_negative_value, closest_positive_value], weights=weights)


def temporally_average_data(combined_ds, year, month, var, resolution=None):
    """Resample one month of station data onto a regular time axis.

    For every station and every regular time step of the month, the value is
    the observation taken exactly at that time if there is one; otherwise it is
    the inverse-time-distance weighted average of the closest earlier and
    closest later non-NaN observations (or the single closest one when
    observations exist in one direction only). Stations without any valid
    observation are filled with NaN.

    Parameters
    ----------
    combined_ds : dataset-like
        Object with ``station`` and ``time`` coordinates holding ``var``
        (presumably an ``xarray.Dataset`` restricted to the given month).
    year, month : int
        Month for which the regular time axis is built.
    var : str
        Name of the variable to resample.
    resolution : {'hourly', 'daily', 'monthly'}, optional
        Target resolution. When omitted, the notebook-level ``resolution``
        variable is used.

    Returns
    -------
    dataset-like
        ``combined_ds`` without its original ``time`` dimension, with ``var``
        replaced by the resampled data on the regular time axis.

    Raises
    ------
    ValueError
        If the resolution is not one of the supported values.
    """
    # fall back to the notebook-level setting when no resolution is passed
    if resolution is None:
        resolution = globals().get('resolution')

    frequencies = {'hourly': 'h', 'daily': 'D', 'monthly': 'MS'}
    if resolution not in frequencies:
        raise ValueError(f'Unsupported resolution: {resolution!r}')
    frequency = frequencies[resolution]

    # regular time steps covering the whole month: from its first instant up to
    # (but excluding) the first instant of the next month, so that the last day
    # is fully covered at hourly resolution
    start_date = datetime.datetime(year, month, 1)
    if month == 12:
        first_day_next_month = datetime.datetime(year + 1, 1, 1)
    else:
        first_day_next_month = datetime.datetime(year, month + 1, 1)
    valid_dates = pd.date_range(start=start_date, end=first_day_next_month, freq=frequency)
    valid_dates = valid_dates[valid_dates < first_day_next_month].to_numpy(dtype='datetime64[ns]')

    observed_time = combined_ds.time.values
    stations = combined_ds.station.values

    # stations without valid observations keep this NaN fill
    averaged_data = np.full((len(stations), len(valid_dates)), np.nan)

    for station_i in range(len(stations)):
        # read data per station
        data = combined_ds[var].isel(station=station_i).values

        # ignore data (times and values) if the values are nan
        valid_idxs = ~np.isnan(data)
        valid_time = observed_time[valid_idxs]
        valid_data = data[valid_idxs]
        if len(valid_data) == 0:
            continue

        averaged_data[station_i, :] = [_weighted_value_at(date, valid_time, valid_data)
                                       for date in valid_dates]

    # create new variable with averaged data, keeping the units when available
    units = combined_ds[var].attrs.get('units')
    combined_averaged_ds = xr.DataArray(
        data=averaged_data,
        coords={'station': stations, 'time': valid_dates},
        dims=['station', 'time'],
        attrs={'units': units} if units is not None else {})

    # drop old variable and associated time
    combined_ds = combined_ds.drop_vars(var)
    combined_ds = combined_ds.drop_dims('time')

    # add new variable
    combined_ds[var] = combined_averaged_ds

    return combined_ds

In [25]:
# NOTE(review): leftover debugging cell - it dumps the whole kernel namespace
# (including the input history) into the saved output; consider removing it
globals()

{'__name__': '__main__',
 '__doc__': 'Automatically created module for IPython interactive environment',
 '__package__': None,
 '__loader__': None,
 '__spec__': None,
 '__builtin__': <module 'builtins' (built-in)>,
 '__builtins__': <module 'builtins' (built-in)>,
 '_ih': ['',
  "import requests\nimport itertools\nimport os\nimport yaml\nimport xarray as xr\nimport numpy as np\nfrom variable_mapping import variable_mapping\nimport datetime\nimport csv\nimport sys\nimport time\nimport pandas as pd\nimport re\n\nCURRENT_PATH = os.getcwd()\nsys.path = [path for path in sys.path if '../dependencies/GHOST_standards/' not in path]            \nsys.path.insert(1, os.path.join(CURRENT_PATH, '../dependencies/GHOST_standards/1.5'))\nfrom GHOST_standards import standard_parameters, get_standard_metadata",
  'start_time = time.time()',
  "#variables = ['sconcno2']\n#resolution = 'daily'\n\nvariables = ['lsco525']\nresolution = 'hourly'\ntarget_start_date = datetime.datetime(2018, 1, 1, 0)\ntarget_e

In [26]:
# For each variable: open its files, keep the data of the variable only, stack
# the files along a new "station" dimension, attach per-station metadata and
# save one NetCDF file per year and month on a regular time axis.
for var in variables:
    files = filter_files(var, resolution, target_start_date, target_end_date)
    if len(files) == 0:
        print(f'No files were found for {var} in {resolution} resolution between {target_start_date} and {target_end_date}')
        continue

    actris_parameter = parameters_dict[var]
    ebas_component = variable_mapping[actris_parameter]['var']

    print('Variable:', var, '- ACTRIS:', actris_parameter)

    # names under which the variable may be stored in a file; they only depend
    # on the variable, so they are resolved once instead of once per file
    unformatted_units = variable_mapping[actris_parameter]['units']
    if unformatted_units not in units_dict:
        print(f'Units {unformatted_units} were not found in dictionary')
        continue
    units_var = f'{ebas_component}_{units_dict[unformatted_units]}'
    possible_vars = [ebas_component,
                     f'{ebas_component}_amean',
                     units_var,
                     f'{units_var}_amean']

    # combine datasets that have the same variable and resolution
    combined_ds_list = []
    metadata = {ghost_key: [] for ghost_key in metadata_dict}

    # wavelength at which the kept files were subset (None if no subsetting happened)
    selected_wavelength = None

    print('Total number of files:', len(files))
    for i, file in enumerate(files):
        print(i, '-', file)
        # open file
        try:
            ds = xr.open_dataset(file)
        except Exception as error:
            print(f'Error opening file: {error}')
            continue

        # check the resolution of the file without touching the requested
        # resolution, which is still needed for the remaining files and variables
        coverage = ds.attrs.get('time_coverage_resolution')
        file_resolution = coverages_dict.get(coverage)
        if file_resolution != resolution:
            print(f'File resolution {file_resolution} ({coverage}) does not match {resolution}')
            continue

        # get lowest level if tower height is in coordinates
        if 'Tower_inlet_height' in list(ds.coords):
            ds = ds.sel(Tower_inlet_height=min(ds.Tower_inlet_height.values), drop=True)

        # get data at desired wavelength if wavelength is in coordinates
        file_wavelength = None
        if 'Wavelength' in list(ds.coords):
            wavelength = int(re.findall(r'\d+', var)[0])
            if wavelength in ds.Wavelength.values:
                ds = ds.sel(Wavelength=wavelength, drop=True)
                file_wavelength = wavelength
            else:
                print(f'Data at {wavelength}nm could not be found')
                continue

        # assign station code as dimension
        ds = ds.expand_dims(dim={'station': [i]})

        # select data for that variable only (first matching name wins)
        ds_var = next((ds[possible_var] for possible_var in possible_vars if possible_var in ds), None)

        # continue to next file if variable cannot be read
        if ds_var is None:
            print(f'No variable name matches for {possible_vars}. Existing keys: {list(ds.data_vars)}')
            continue

        # save metadata: variable attrs first, then dataset attrs, nan if not found
        for ghost_key, ebas_key in metadata_dict.items():
            if ebas_key in ds_var.attrs.keys():
                metadata[ghost_key].append(ds_var.attrs[ebas_key])
            elif ebas_key in ds.attrs.keys():
                metadata[ghost_key].append(ds.attrs[ebas_key])
            else:
                metadata[ghost_key].append(np.nan)

        # remove all attributes except units
        ds_var.attrs = {key: value for key, value in ds_var.attrs.items() if key == 'units'}

        # rename variable to BSC standards
        ds_var = ds_var.to_dataset(name=var)

        # append modified dataset to list
        combined_ds_list.append(ds_var)

        # remember the wavelength only for files that are actually kept, so the
        # title does not depend on which file happened to be visited last
        if file_wavelength is not None:
            selected_wavelength = file_wavelength

    # nothing to combine if every file was skipped
    if len(combined_ds_list) == 0:
        print('Error: Datasets could not be combined - no file could be read')
        continue

    # combine and create new dataset
    try:
        combined_ds = xr.concat(combined_ds_list,
                                dim='station',
                                combine_attrs='drop_conflicts')
    except Exception as error:
        print(f'Error: Datasets could not be combined - {error}')
        if 'time' in str(error):
            for item in combined_ds_list:
                # first (up to) two time steps; slicing avoids an IndexError
                # for files with a single time step
                print(*item.time.values[:2])
        continue

    # add metadata
    for key, value in metadata.items():
        if key in ['latitude', 'longitude']:
            value = [float(val) for val in value]
        elif key in ['altitude', 'measurement_altitude', 'sampling_height']:
            value = [float(val.replace('m', '').strip()) if isinstance(val, str) else val for val in value]
        combined_ds[key] = xr.Variable(data=value, dims=('station',))

    # add units for lat and lon
    # TODO: Check attrs geospatial_lat_units and geospatial_lon_units
    combined_ds.latitude.attrs['units'] = 'degrees_north'
    combined_ds.longitude.attrs['units'] = 'degrees_east'

    # add general attrs
    combined_ds.attrs['data_license'] = 'BSD-3-Clause. Copyright 2025 Alba Vilanova Cortezón'
    combined_ds.attrs['source'] = 'Observations'
    combined_ds.attrs['institution'] = 'Barcelona Supercomputing Center'
    combined_ds.attrs['creator_name'] = 'Alba Vilanova Cortezón'
    combined_ds.attrs['creator_email'] = 'alba.vilanova@bsc.es'
    combined_ds.attrs['application_area'] = 'Monitoring atmospheric composition'
    combined_ds.attrs['domain'] = 'Atmosphere'
    combined_ds.attrs['observed_layer'] = 'Land surface'

    # save data per year and month
    # NOTE(review): absolute, user-specific output path - consider moving it to the configuration cell
    path = f'/home/avilanov/data/providentia/obs/nonghost/actris/actris/{resolution}/{var}'
    os.makedirs(path, exist_ok=True)

    extra_info = f' at {selected_wavelength}nm' if selected_wavelength is not None else ''
    custom_order = ['title', 'institution', 'creator_name', 'creator_email',
                    'source', 'application_area', 'domain', 'observed_layer',
                    'data_license']

    saved_files = 0
    for year, ds_year in combined_ds.groupby('time.year'):
        for month, ds_month in ds_year.groupby('time.month'):
            # keep every month that overlaps the target period, including a
            # first month in which the target period starts after day 1
            month_start = datetime.datetime(year, month, 1)
            if month == 12:
                next_month_start = datetime.datetime(year + 1, 1, 1)
            else:
                next_month_start = datetime.datetime(year, month + 1, 1)
            if month_start > target_end_date or next_month_start <= target_start_date:
                continue

            filename = f"{path}/{var}_{year}{month:02d}.nc"
            combined_ds_yearmonth = combined_ds.sel(time=f"{year}-{month:02d}")
            combined_ds_yearmonth = temporally_average_data(combined_ds_yearmonth, year, month, var)

            # add title to attrs
            combined_ds_yearmonth.attrs['title'] = f'Surface {actris_parameter}{extra_info} in the ACTRIS network in {year}-{month:02d}.'

            # order attrs
            combined_ds_yearmonth.attrs = {key: combined_ds_yearmonth.attrs[key]
                                           for key in custom_order
                                           if key in combined_ds_yearmonth.attrs}

            # save file
            combined_ds_yearmonth.to_netcdf(filename)

            # change permissions (same mode as before, without spawning a shell)
            os.chmod(filename, 0o777)
            print(f"Saved: {filename}")
            saved_files += 1

    print(f'Total number of saved files: {saved_files}')

Variable: lsco525 - ACTRIS: aerosol particle light scattering coefficient
Total number of files: 82
0 - https://thredds.nilu.no/thredds/dodsC/ebas_doi/2F/GN/NP/2FGN-NP9C.nc
1 - https://thredds.nilu.no/thredds/dodsC/ebas_doi/2Y/PB/YP/2YPB-YP9F.nc
Data at 525nm could not be found
2 - https://thredds.nilu.no/thredds/dodsC/ebas_doi/3N/PD/AK/3NPD-AKFX.nc
Data at 525nm could not be found
3 - https://thredds.nilu.no/thredds/dodsC/ebas_doi/3Z/KP/QJ/3ZKP-QJE3.nc
Data at 525nm could not be found
4 - https://thredds.nilu.no/thredds/dodsC/ebas_doi/42/5M/N3/425M-N3DZ.nc
5 - https://thredds.nilu.no/thredds/dodsC/ebas_doi/4B/88/PB/4B88-PB4B.nc
Data at 525nm could not be found
6 - https://thredds.nilu.no/thredds/dodsC/ebas_doi/4Z/NQ/KK/4ZNQ-KK4W.nc
Data at 525nm could not be found
7 - https://thredds.nilu.no/thredds/dodsC/ebas_doi/52/US/FA/52US-FAV7.nc
Data at 525nm could not be found
8 - https://thredds.nilu.no/thredds/dodsC/ebas_doi/59/G8/BE/59G8-BEJT.nc
Data at 525nm could not be found
9 - https://

In [27]:
# wall-clock timestamp taken once the processing cells have run; the elapsed
# time is computed from it in the next cell
end_time = time.time()

In [28]:
# elapsed wall-clock time, in seconds, between the start_time and end_time timestamps
total_time = end_time - start_time
total_time

140.26642537117004

In [29]:
# format the elapsed seconds as HH:MM:SS.ffffff with integer arithmetic, so that
# tiny fractions (which str() renders in scientific notation) cannot corrupt the
# output and runs longer than 24 hours do not wrap around
elapsed_microseconds = int(total_time * 1_000_000)
elapsed_seconds, microseconds = divmod(elapsed_microseconds, 1_000_000)
elapsed_minutes, seconds = divmod(elapsed_seconds, 60)
hours, minutes = divmod(elapsed_minutes, 60)
f"{hours:02d}:{minutes:02d}:{seconds:02d}.{microseconds:06d}"

'00:02:20.266425'

In [30]:
# sanity check: open the file saved for the first target month of the last
# variable; the names are derived from the configuration rather than from the
# loop variable leaked by the processing cell and a hard-coded year-month
check_var = variables[-1]
check_month = target_start_date.strftime('%Y%m')
test_data = xr.open_dataset(f'/home/avilanov/data/providentia/obs/nonghost/actris/actris/{resolution}/{check_var}/{check_var}_{check_month}.nc')
test_data