# TROPOMI functions

In [None]:
def TROPOMI_L3_download(date, component_nom):

    """ Download the monthly TROPOMI L3 dataset with wget

        The compressed file is stored under
        /<first two components of the working directory>/adc-toolbox/data/tropomi/<component_nom>/L3/<year>-<month>.
        A missing or empty file after the wget call is reported as not available.

        Args:
            date (str): Query date ('YYYY-MM' or 'YYYY-MM-DD'), only year and month are used
            component_nom (str): Component chemical nomenclature
    """

    date_parts = date.split('-')
    year = date_parts[0]
    month = date_parts[1]

    TROPOMI_product_path = os.path.join('/', '/'.join(
                           os.getcwd().split('/')[1:3]), 'adc-toolbox', 
                           os.path.relpath('data/tropomi/' + component_nom + '/L3/' + year + '-' + month))
    os.makedirs(TROPOMI_product_path, exist_ok = True)

    # NOTE(review): the local name always says NO2, whatever component_nom is.
    # It is kept as is because TROPOMI_L3_read builds the same name to find the file.
    product_name = 'TROPOMI_L3_NO2_COLUMN_' + year + month + '.asc.gz'
    file_name = TROPOMI_product_path + '/' + product_name

    path = ('https://d1qb6yzwaaq4he.cloudfront.net/tropomi/' + component_nom.lower() + '/' + year +
            '/' + month + '/' + component_nom.lower() + '_' + year + month + '.asc.gz')
    subprocess.run(['wget', '-q', '-nc', path, '-O', file_name])

    # The wget call is not checked for errors, so the outcome is judged from the file:
    # it may be absent altogether or present but empty
    if os.path.isfile(file_name) and os.path.getsize(file_name) > 0:
        print(product_name, 'was downloaded.')

    else:
        if os.path.isfile(file_name):
            # Zero-size leftovers are removed so that a later read does not pick them up
            os.remove(file_name)
        print(product_name, 'is not available.')

In [None]:
def TROPOMI_L3_read(component_nom, dates, lat_res = 0.125, lon_res = 0.125):

    """ Parse the monthly TROPOMI L3 text files and return them as one xarray dataset

        Args:
            component_nom (str): Component chemical nomenclature
            dates (list): Available dates ('YYYY-MM...'), one file is read per entry
            lat_res (float): Latitude resolution handed to regrid
            lon_res (float): Longitude resolution handed to regrid

        Returns:
            sensor_ds (xarray): TROPOMI dataset in xarray format
    """

    first_lon = -179.9375
    lon_step = 0.125
    monthly_datasets = []

    for date in dates:

        date_parts = date.split('-')
        year = date_parts[0]
        month = date_parts[1]
        time = dt.datetime(int(year), int(month), 1)

        TROPOMI_product_path = os.path.join('/', '/'.join(
                            os.getcwd().split('/')[1:3]), 'adc-toolbox', 
                            os.path.relpath('data/tropomi/' + component_nom + '/L3/' + year + '-' + month))
        product_name = 'TROPOMI_L3_NO2_COLUMN_' + year + month + '.asc.gz'

        records = []

        # Rebuild the grid: 'lat' lines open a new latitude row, every other
        # line carries fixed-width values of 4 characters each
        with gzip.open(TROPOMI_product_path + '/' + product_name, 'rt', encoding='utf-8') as f:

            lon = first_lon

            for line_number, line in enumerate(f):

                # The first four lines of the file are skipped
                if line_number <= 3:
                    continue

                if 'lat' in line:
                    lat = float(line.replace('lat=  ', ''))
                    lon = first_lon
                    continue

                row = line.replace('\n', '')

                for start in range(0, len(row), 4):

                    token = row[start:start + 4]

                    if token == '-999':
                        # Fill value
                        column = np.nan
                    else:
                        # Stored values are scaled by 1e13 (labelled 'molec cm-2' below)
                        column = float(token.replace(' ', '')) * 10**13

                    records.append({'time': time, 
                                    'latitude': lat, 
                                    'longitude': lon, 
                                    'sensor_column': column})
                    lon += lon_step

        month_df = pd.DataFrame(records).set_index(['time', 'latitude', 'longitude'])
        month_df = month_df[~month_df.index.duplicated()]

        # Change resolution
        month_ds = regrid(month_df.to_xarray(), lat_res, lon_res)
        monthly_datasets.append(month_ds)

    sensor_ds = xr.concat(monthly_datasets, dim = 'time')
    sensor_ds['sensor_column'] = sensor_ds['sensor_column'].assign_attrs({'units': 'molec cm-2'})

    return sensor_ds

In [1]:
def TROPOMI_L2_download(input_type, bbox, date, product_type, component_nom):

    """ Query and download the TROPOMI L2 dataset from Sentinel API

        In 'Query' mode the near real time products are searched first and the
        offline archive is searched only if nothing was found. In 'Manual' mode
        the file and product names are asked interactively.

        Args:
            input_type (str): Search type ('Manual' or 'Query')
            bbox (arr): Query bounding box, used as ((lon_min, lat_min), (lon_max, lat_max))
            date (sequence): Query date range passed to the API. Its first element
                             ('YYYY-MM-DDT...') also names the download folder
            product_type (str): Query product type
            component_nom (str): Component chemical nomenclature

        Returns:
            product_name (str): Product name of TROPOMI product within s5phub

        Raises:
            ValueError: If input_type is neither 'Manual' nor 'Query'
            KeyboardInterrupt: If neither processing mode returns any product
    """

    # Fail early: any other value used to reach the prints below with
    # file_name and product_name undefined
    if input_type not in ('Manual', 'Query'):
        raise ValueError("input_type must be 'Manual' or 'Query', got " + repr(input_type))

    user = 's5pguest'
    password = 's5pguest'
    api = SentinelAPI(user, password, 'https://s5phub.copernicus.eu/dhus/')

    if input_type == 'Manual':

        file_name = input('Write file name: ')
        product_name = input('Write product name:')

    else:

        print('WARNING: The maximum number of items that can be shown is 5.')
        print('You can see all the results at https://s5phub.copernicus.eu/dhus/.')

        poly = geojson.Polygon([[(bbox[0][0], bbox[0][1]), (bbox[0][0], bbox[1][1]), 
                                 (bbox[1][0], bbox[1][1]), (bbox[1][0], bbox[0][1]), 
                                 (bbox[0][0], bbox[0][1])]])
        footprint = geojson_to_wkt(poly)

        items = []

        for processing_mode in ('Near real time', 'Offline'):

            products = api.query(area = footprint,
                                 area_relation = 'Contains',
                                 producttype = product_type,
                                 processinglevel = 'L2',
                                 platformname = 'Sentinel-5 Precursor',
                                 instrumentname = 'TROPOspheric Monitoring Instrument',
                                 processingmode = processing_mode,
                                 date = date,
                                 limit = 5)
            items = list(products.items())

            if items:
                break

            if processing_mode == 'Near real time':
                print('There are no results for the processing mode NRT. The search in the offline archives will start.')

        if not items:
            print('There are no results in the offline archives. The code will be interrupted.')
            raise KeyboardInterrupt

        for i, (_, metadata) in enumerate(items):
            print('Number ', i, '-', metadata['title'], sep = '')

        # An empty answer selects the first result
        file_int = input('Select number or press Enter if you want to select the first result: ') or 0
        selected = items[int(file_int)]
        file_name = selected[0]
        product_name = selected[1]['title'] + '.nc'

    print('SELECTED')
    print('File name:', file_name)
    print('Product name:', product_name)

    # Products are stored per component and per day of the start of the query range
    download_path = os.path.join('/', '/'.join(os.getcwd().split('/')[1:3]), 'adc-toolbox', 
                                 os.path.relpath('data/tropomi/' + component_nom + '/L2/' + date[0].split('T')[0]))

    if os.path.isfile(os.path.join(download_path, product_name)):
        print('The file exists, it will not be downloaded again.')

    else:
        print('The file does not exist, it will be downloaded.')
        print(f'Downloading {product_name}...')
        api.download(file_name, directory_path = download_path)

    return product_name

In [2]:
def TROPOMI_L2_read(component_nom, sensor_column, dates):

    """ Open every downloaded TROPOMI L2 product of the given dates and concatenate them along time

        Args:
            component_nom (str): Component chemical nomenclature
            sensor_column (str): Name of sensor column in downloaded dataset (renamed to 'sensor_column')
            dates (list): Available dates, the first element of each entry names the data folder

        Returns:
            sensor_ds (xarray): TROPOMI dataset in xarray format
            support_input_ds (xarray): TROPOMI dataset that contains support input data in xarray format
            support_details_ds (xarray): TROPOMI dataset that contains support details data in xarray format
    """

    shared_coords = ['ground_pixel', 'scanline', 'time']
    support_groups = ('PRODUCT/SUPPORT_DATA/INPUT_DATA', 'PRODUCT/SUPPORT_DATA/DETAILED_RESULTS')

    product_datasets = []
    input_datasets = []
    details_datasets = []

    for date in dates:

        path = os.path.join('/', '/'.join(
               os.getcwd().split('/')[1:3]), 'adc-toolbox', 
               os.path.relpath('data/tropomi/' + component_nom + '/L2/' + date[0].split('T')[0]))

        for product_name in os.listdir(path):

            if not product_name.endswith('.nc'):
                continue

            product_file = path + '/' + product_name

            sensor_ds = xr.open_dataset(product_file, group = 'PRODUCT')
            sensor_ds = sensor_ds.rename({sensor_column: 'sensor_column'})

            # The support groups get the coordinates of the PRODUCT group
            # attached so that they can be indexed the same way
            support = []

            for group in support_groups:

                group_ds = xr.open_dataset(product_file, group = group)

                for coord in shared_coords:
                    group_ds = group_ds.assign(**{coord: sensor_ds[coord]})

                support.append(group_ds.set_coords(shared_coords))

            support_input_ds, support_details_ds = support

            if component_nom == 'CO':

                # Transform heights into levels: the i-th layer value is
                # replaced by the i-th number of the sequence 50, 49, ..., 1
                height_to_layer = pd.DataFrame({'Layer': np.arange(1, 51)[::-1], 
                                                'Height': sensor_ds.layer})

                for row in range(50):

                    height = height_to_layer['Height'].iloc[row]
                    layer_number = int(height_to_layer['Layer'].iloc[row])
                    sensor_ds['layer'] = xr.where(sensor_ds.layer == height, 
                                                  layer_number, sensor_ds['layer'])

            product_datasets.append(sensor_ds)
            input_datasets.append(support_input_ds)
            details_datasets.append(support_details_ds)

    sensor_ds = xr.concat(product_datasets, dim = 'time')
    support_input_ds = xr.concat(input_datasets, dim = 'time')
    support_details_ds = xr.concat(details_datasets, dim = 'time')

    return sensor_ds, support_input_ds, support_details_ds

In [3]:
def TROPOMI_pressure(sensor_ds, component_nom, support_input_ds, support_details_ds):

    """ Add a 'pressure' variable with the level pressures to the TROPOMI dataset

        The pressures are calculated (NO2, SO2) or read from the support details (CO, O3).
        For any other component the dataset is returned unchanged.

        Args:
            sensor_ds (xarray): TROPOMI dataset in xarray format
            component_nom (str): Component chemical nomenclature
            support_input_ds (xarray): TROPOMI dataset that contains support input data in xarray format
            support_details_ds (xarray): TROPOMI dataset that contains support details data in xarray format
            
        Returns:
            sensor_ds (xarray): TROPOMI dataset in xarray format
    """

    if component_nom == 'NO2':

        print('The level pressures will be calculated.')

        def bound_pressure(vertex):
            # p = ap + b * ps (Units: ap(Pa) + b(none) * ps(Pa) -> To Pa)
            return (sensor_ds.tm5_constant_a.sel(vertices = vertex) + 
                    sensor_ds.tm5_constant_b.sel(vertices = vertex) * support_input_ds.surface_pressure)

        # Mean of upper (vertices = 1) and lower (vertices = 0) bound
        level_pressure = (bound_pressure(1) + bound_pressure(0)) / 2

    elif component_nom == 'CO':

        print('The level pressures will be retrieved.')

        # Pressure is at lower bound!
        level_pressure = support_details_ds.pressure_levels

    elif component_nom == 'SO2':

        print('The level pressures will be calculated.')

        # Unknown bound, half?
        level_pressure = (support_input_ds.tm5_constant_a + 
                          support_input_ds.tm5_constant_b * support_input_ds.surface_pressure)

    elif component_nom == 'O3':

        print('The level pressures will be retrieved.')

        # Unknown bound, half?
        level_pressure = support_details_ds.pressure_grid

    else:

        print('This dataset does not contain data to retrieve or calculate the level pressures.')
        return sensor_ds

    return sensor_ds.assign(pressure = level_pressure)

In [4]:
def TROPOMI_column_kernel(sensor_ds, component_nom, support_details_ds):

    """ Add a 'column_kernel' variable with the column averaging kernels to the TROPOMI dataset

        For components other than NO2, CO, SO2 and O3 the dataset is returned unchanged.

        Args:
            sensor_ds (xarray): TROPOMI dataset in xarray format
            component_nom (str): Component chemical nomenclature
            support_details_ds (xarray): TROPOMI dataset that contains support details data in xarray format

        Returns:
            sensor_ds (xarray): TROPOMI dataset in xarray format
    """

    if component_nom == 'NO2':

        print('The column kernels will be calculated.')

        # Kernel scaled by total / tropospheric air mass factor, set to zero
        # in the layers above the tropopause layer index
        amf_ratio = sensor_ds.air_mass_factor_total / sensor_ds.air_mass_factor_troposphere
        kernel = xr.where(sensor_ds.layer > sensor_ds.tm5_tropopause_layer_index, 0, 
                          sensor_ds.averaging_kernel * amf_ratio)

    elif component_nom in ('CO', 'SO2', 'O3'):

        print('The column kernels will be calculated.')

        # These products provide the kernel directly in the support details.
        # For SO2 a variant scaled by the 1, 7 or 15 km total air mass factor
        # was drafted but is disabled, the averaging kernel is used as it is.
        if component_nom == 'CO':
            kernel = support_details_ds.column_averaging_kernel
        else:
            kernel = support_details_ds.averaging_kernel

    else:

        print('The dataset does not contain data to retrieve or calculate the column averaging kernels.')
        return sensor_ds

    return sensor_ds.assign(column_kernel = kernel)

In [5]:
def TROPOMI_apriori_profile(sensor_ds, component_nom, component, support_details_ds):

    """ Copy the apriori profile from the support details into the dataset, if there is one.

        The variable looked up is '<component without underscores>_profile_apriori'
        ('sulfurdioxide_profile_apriori' for SO2). It is stored as 'apriori_profile'.

        Args:
            sensor_ds (xarray): TROPOMI dataset in xarray format
            component_nom (str): Component chemical nomenclature
            component (str): Component name
            support_details_ds (xarray): TROPOMI dataset that contains support details data in xarray format
        
        Returns:
            sensor_ds (xarray): TROPOMI dataset in xarray format
    """

    prefix = 'sulfurdioxide' if component_nom == 'SO2' else component.replace('_', '')
    profile_name = prefix + '_profile_apriori'

    if profile_name not in list(support_details_ds.keys()):
        print('The dataset does not contain any apriori profile.')
        return sensor_ds

    sensor_ds = sensor_ds.assign(apriori_profile = support_details_ds[profile_name])
    print('The apriori profiles will be retrieved.')

    return sensor_ds

In [6]:
def TROPOMI_lookup_table(sensor_ds, component_nom):

    """ Create table with the original corresponding coordinates to each scanline and ground pixel in TROPOMI dataset

        The table is the dataset flattened to a dataframe after dropping the
        dimensions and variables listed below for the given component.

        Args:
            sensor_ds (xarray): TROPOMI dataset in xarray format
            component_nom (str): Component chemical nomenclature (NO2, O3, CO, SO2 or HCHO)

        Returns:
            lookup_table (dataframe): TROPOMI coordinates equivalence table

        Raises:
            ValueError: If component_nom is not one of the supported components
    """

    # Dims and vars to drop, per component: (dims_to_drop, vars_to_drop)
    # NOTE(review): HCHO keeps 'time_utc', 'qa_value' and 'sensor_column', unlike
    # the other components. Confirm that this is intended.
    drop_spec = {
        'NO2': (['polynomial_exponents', 'vertices', 'intensity_offset_polynomial_exponents', 'corner', 'layer'],
                ['time_utc', 'qa_value', 'sensor_column', 'nitrogendioxide_tropospheric_column_precision', 
                 'nitrogendioxide_tropospheric_column_precision_kernel', 'air_mass_factor_troposphere', 
                 'air_mass_factor_total', 'tm5_tropopause_layer_index']),
        'O3': (['corner', 'layer', 'level'],
               ['time_utc', 'qa_value', 'sensor_column', 'ozone_total_vertical_column_precision']),
        'CO': (['corner', 'layer'],
               ['time_utc', 'qa_value', 'sensor_column', 'carbonmonoxide_total_column_precision',
                'carbonmonoxide_total_column_corrected']),
        'SO2': (['corner', 'layer'],
                ['time_utc', 'qa_value', 'sensor_column', 'sulfurdioxide_total_vertical_column_precision']),
        'HCHO': (['corner', 'layer'],
                 ['formaldehyde_tropospheric_vertical_column_precision']),
    }

    # An unsupported component used to fail further down with an unbound local variable
    if component_nom not in drop_spec:
        raise ValueError('There is no lookup table definition for component ' + repr(component_nom) + 
                         '. Supported components: ' + ', '.join(drop_spec))

    dims_to_drop, vars_to_drop = drop_spec[component_nom]

    # Create lookup table
    lookup_table = sensor_ds.drop_dims(dims_to_drop).drop_vars(vars_to_drop).to_dataframe().reset_index()

    return lookup_table

In [7]:
def TROPOMI_subset(sensor_ds_time, bbox, component_nom):

    """ Subset the TROPOMI dataset into the desired bounding box.

        The coordinates table of the dataset is filtered by the bounding box and the
        scanlines and ground pixels that remain are selected from the dataset.

        Args:
            sensor_ds_time (xarray): TROPOMI dataset in xarray format per time
            bbox (arr): Query bounding box, used as ((lon_min, lat_min), (lon_max, lat_max))
            component_nom (str): Component chemical nomenclature
    
        Returns:
            sensor_ds_time (xarray): TROPOMI dataset in xarray format

        Raises:
            KeyboardInterrupt: If no pixel of the dataset lies within the bounding box
    """

    coords_table = TROPOMI_lookup_table(sensor_ds_time, component_nom)

    lon_min = bbox[0][0]
    lat_min = bbox[0][1]
    lon_max = bbox[1][0]
    lat_max = bbox[1][1]

    # Keep the pixels inside the box (limits included)
    inside_lat = (coords_table['latitude'] >= lat_min) & (coords_table['latitude'] <= lat_max)
    inside_lon = (coords_table['longitude'] >= lon_min) & (coords_table['longitude'] <= lon_max)
    coords_table = coords_table[inside_lat & inside_lon]

    if coords_table.empty:
        print('ERROR: The subset could not be made. Try for another TROPOMI dataset. The code will be interrupted.')
        raise KeyboardInterrupt

    # Every scanline and ground pixel that has at least one pixel inside the box
    scanlines = np.unique(coords_table['scanline'].values).tolist()
    ground_pixels = np.unique(coords_table['ground_pixel'].values).tolist()

    return sensor_ds_time.sel(scanline = scanlines, ground_pixel = ground_pixels)

In [9]:
def TROPOMI_apply_kernels(match_df, model_ds_time, sensor_ds_time, component_nom):

    """ Apply averaging kernels: Find the nearest neighbours in the observation space 
        (in latitude and longitudes) and interpolate values in pressure

        Steps: rows for the model levels are added to every sensor pixel, filled with the
        model value and pressure of the nearest model grid point, the model values are
        interpolated onto the sensor rows, the model rows are dropped again and the
        result is differenced per layer and weighted with the column kernel.

        Relies on nearest_neighbour and TROPOMI_lookup_table, which are defined outside this function.

        Args:
            match_df (dataframe): Dataframe used to apply averaging kernels
            model_ds_time (xarray): Model levels dataset in xarray format per time
            sensor_ds_time (xarray): TROPOMI dataset in xarray format per time
            component_nom (str): Component chemical nomenclature
            
        Returns:
            match_df (dataframe): Dataframe used to apply averaging kernels, indexed by
                                  (time, ground_pixel, scanline, layer) and with a 'model_column' column
    """

    print('WARNING: The application of averaging kernels will take some time.')
    
    # Read new coordinates (after subset)
    lookup_table = TROPOMI_lookup_table(sensor_ds_time, component_nom)

    # Set the layer coordinate as index
    # (assumes match_df arrives indexed by scanline, ground_pixel and time, in that
    # order, as the level names given below imply - TODO confirm against caller)
    match_df = match_df.set_index('layer', append = True)

    # Create index that includes CAMS pressure levels for all the locations in TROPOMI
    # The model levels 1 to 136 are stored as 1000, 2000, ..., 136000 in the 'layer'
    # level, presumably to keep them apart from the sensor layer values. They are
    # divided by 1000 again when the model is queried and dropped after interpolation.
    new_array = np.concatenate([np.arange(1, 137) * 1000, sensor_ds_time.layer.values])
    new_index = pd.MultiIndex.from_product([match_df.index.levels[0], 
                                            match_df.index.levels[1],
                                            match_df.index.levels[2],
                                            new_array
                                            ],
                                            names = ['scanline', 'ground_pixel', 'time', 'layer'])
    
    # Append original and new indexes and reindex dataframe
    # (labels that did not exist before become rows filled with NaN)
    match_df = match_df[~match_df.index.duplicated()]
    match_df = match_df.reindex(match_df.index.append(new_index))
    
    # Sort and reset index
    match_df = match_df.sort_index()
    match_df = match_df.reset_index()

    # Find latitudes in CAMS rows with scanlines and ground pixels
    # Only rows without latitude (the added ones) are looked up. float() expects the
    # lookup to return exactly one row per scanline and ground pixel.
    match_df['latitude'] = match_df.apply(lambda row: float(lookup_table[
                                                           (lookup_table['scanline'] == row['scanline']) & 
                                                           (lookup_table['ground_pixel'] == row['ground_pixel'])]['latitude'])
                                                           if pd.isnull(row['latitude']) 
                                                           else row['latitude'], 
                                                           axis = 1)
                                                            
    # Find longitudes in CAMS rows with scanlines and ground pixels
    match_df['longitude'] = match_df.apply(lambda row: float(lookup_table[
                                                            (lookup_table['scanline'] == row['scanline']) & 
                                                            (lookup_table['ground_pixel'] == row['ground_pixel'])]['longitude'])
                                                            if pd.isnull(row['longitude']) 
                                                            else row['longitude'], 
                                                            axis = 1)

    # Get unique timestep
    # int() only succeeds if nearest_neighbour maps all the sensor times of the first
    # scanline to one single model step
    sensor_times = sensor_ds_time.delta_time.isel(scanline = 0).values
    model_times = model_ds_time.valid_time.values
    unique_step = int(np.unique(nearest_neighbour(model_times, sensor_times)))
    unique_time = model_ds_time.component.isel(step = unique_step).step.values.astype('timedelta64[h]')

    # Get CAMS model partial columns above each level at closest TROPOMI locations (nearest neighbours)
    # Rows without sensor_column are the added model rows. Sensor rows get NaN here and
    # are filled by the interpolation further down.
    match_df['model_partial_column_above'] = match_df.apply(lambda row: model_ds_time.component.sel(
                                                                        step = unique_time,
                                                                        hybrid = row['layer'] / 1000, 
                                                                        latitude = row['latitude'], 
                                                                        longitude = row['longitude'], 
                                                                        method = 'nearest').values 
                                                                        if pd.isnull(row['sensor_column']) 
                                                                        else math.nan,
                                                                        axis = 1)

    # Get CAMS model level pressures
    # Rows that already have a pressure (the sensor rows) keep it
    match_df['pressure'] = match_df.apply(lambda row: model_ds_time.pressure.sel(
                                                      step = unique_time,
                                                      hybrid = row['layer'] / 1000, 
                                                      latitude = row['latitude'], 
                                                      longitude = row['longitude'], 
                                                      method = 'nearest').values 
                                                      if pd.isnull(row['pressure']) 
                                                      else row['pressure'],
                                                      axis = 1)

    # Transform 1D-array data to float
    match_df['model_partial_column_above'] = match_df['model_partial_column_above'].apply(lambda x: float(x))
    match_df['pressure'] = match_df['pressure'].apply(lambda x: float(x))

    # Set multiindex again and sort for interpolation
    # NOTE(review): the index was already reset above, so this reset_index presumably
    # only adds an 'index' column - confirm whether it is needed
    match_df = match_df.reset_index()
    match_df = match_df.set_index(['time', 'ground_pixel', 'scanline', 'pressure'])
    match_df = match_df.sort_values(['time', 'ground_pixel','scanline', 'pressure'], 
                                    ascending = [True, True, True, False])

    # Interpolate partial columns onto the TM5 pressure levels.
    # NOTE(review): interpolate() is called with its defaults on the whole column, so it
    # works on row position rather than on the pressure values and is not restricted
    # to one pixel at a time - confirm that this is intended
    match_df = match_df[~match_df.index.duplicated()]
    match_df['model_partial_column_above'] = match_df['model_partial_column_above'].interpolate()

    # Drop unnecessary values
    # (the rows of the model levels, identified by their layer values 1000 ... 136000)
    match_df = match_df.reset_index()
    match_df = match_df.set_index(['time', 'ground_pixel', 'scanline', 'layer'])
    match_df = match_df.drop(np.arange(1, 137) * 1000, level = 'layer')     
    
    # Calculate CAMS partial columns for each TM5 layer (as difference of the interpolated values)
    # NOTE(review): shift(-1) takes the next row of the whole frame, also across pixel
    # boundaries. Layer 33 is hard-coded and takes the partial column above as it is -
    # presumably the top layer of the NO2 product, confirm for other components.
    match_df['model_column'] = match_df['model_partial_column_above'] - match_df['model_partial_column_above'].shift(-1)
    match_df = match_df.reset_index()
    match_df.loc[match_df['layer'] == 33, ['model_column']] = match_df['model_partial_column_above']
    match_df = match_df.set_index(['time', 'ground_pixel', 'scanline', 'layer'])

    # Calculate values to generate CAMS column to sum in the next step
    if 'apriori_profile' in match_df.columns:
        # With apriori profile: apriori + kernel * (model - apriori), written out term by term
        match_df['model_column'] = match_df.apply(lambda row: row['apriori_profile'] +
                                                              row['column_kernel'] * row['model_column']  -
                                                              row['column_kernel'] * row['apriori_profile'], 
                                                              axis = 1)
    
    else:
        # Without apriori profile: kernel * model
        match_df['model_column'] = match_df.apply(lambda row: row['model_column'] * 
                                                              row['column_kernel'], 
                                                              axis = 1)

    return match_df