In [1]:
"""
Created on Wed Jul 5 2023

@author: Laia Amorós

Version that Maija has modified
"""

import netCDF4 as nc
import numpy as np
import xarray as xr

import os
import sys


#sys.path.append('../src')

def regrid(data, lon, lat, factor1, factor2):
    """
    Regrids data to lower resolution by averaging over a window of size factor1 x factor2.
    """
    data_unmasked = data.filled(np.nan)
    data_shape = data.shape
    regridded_shape = (data_shape[0] // factor1, data_shape[1] // factor2)

    data_reg = np.full(regridded_shape, np.nan)
    lon_reg = np.zeros(regridded_shape)
    lat_reg = np.zeros(regridded_shape)

    for i in range(regridded_shape[0]):
        for j in range(regridded_shape[1]):
            window = data_unmasked[i*factor1:min((i+1)*factor1, data_shape[0]), 
                                                j*factor2:min((j+1)*factor2, data_shape[1])]
            lon_window = lon[i*factor1:min((i+1)*factor1, data_shape[0]), 
                                                j*factor2:min((j+1)*factor2, data_shape[1])]
            lat_window = lat[i*factor1:min((i+1)*factor1, data_shape[0]), 
                                                j*factor2:min((j+1)*factor2, data_shape[1])]

            data_reg[i, j] = np.nanmean(window)
            lon_reg[i, j] = np.nanmean(lon_window)
            lat_reg[i, j] = np.nanmean(lat_window)

    # Mask new arrays where np.isnan is True
    data_reg = np.ma.masked_where(np.isnan(data_reg), data_reg)

    return data_reg, lon_reg, lat_reg


def covariance_matrix(matrix):
    """
    Computes the covariance matrix of a matrix with NaN values.
    """
    matrix_flat = np.asarray(matrix).flatten()
    # Replace NaN values with mean
    matrix_flat[np.isnan(matrix_flat)] = np.nanmedian(matrix_flat)

    deviations = matrix_flat - np.nanmean(matrix_flat)
    outer_product = np.outer(deviations, deviations)
    
    covariance_matrix = outer_product / (len(matrix_flat) - 1)

    return covariance_matrix

  

def create_dataset(covariance, lon, lat):
    out_cov = xr.Dataset(
        data_vars={"covariance": (["nparams", "nparams"], covariance)},
        coords={"lon": (["nparams"], np.asarray(lon).flatten()),
                  "lat": (["nparams"], np.asarray(lat).flatten()),},
        attrs={'comment': "Spatial uncertainty covariance matrix for gridcells numbered in the variable nparams with coordinates of the centre of each gridcell"}
    )
    return out_cov

    


def main():
    NUMBER_OF_FILES = 2 # change to the number of covariance matrices you want to produce
    FACTOR1 = 15
    FACTOR2 = 2

    # Path to CO2M simulations data and output
    # in LUMI
 #   DATA_PATH = '/scratch/project_462000289/CO2M_obs/CO2M_simulations/2018/Orbits_Europe/CO2Meast/'
 #   OUTPUT_PATH = '/scratch/project_462000289/covariance_matrices'
    DATA_PATH = '/home/pietaril/Documents/data/CO2M_testdata/CO2M_simulations/'
    OUTPUT_PATH = '/home/pietaril/Documents/data/CO2M_testdata/unc_cov_matrices/'
    
    filenames = [os.path.join(DATA_PATH, f) for f in os.listdir(DATA_PATH) if f.endswith('.nc')]
        
    for file in filenames[:NUMBER_OF_FILES]:
        # Read data
        data_nc = nc.Dataset(file, 'r')
        xco2 = data_nc.groups['data']['observation_data']['xco2'][:]
        xco2_quality_flag = data_nc.groups['data']['observation_data']['xco2_quality_flag'][:]
        uncertainties = data_nc.groups['data']['observation_data']['xco2_precision'][:]
        lon = data_nc.groups['data']['geolocation_data_dem']['longitude'][:]
        lat = data_nc.groups['data']['geolocation_data_dem']['latitude'][:]

        # Regrid data to lower resolution. Factor1 and factor2 can be changed to any integer
        uncertainties_reg, lon_reg, lat_reg = regrid(uncertainties, lon, lat, FACTOR1, FACTOR2)

        #Compute covariance matrix
        covariance = covariance_matrix(uncertainties_reg)
    
        # create xarray dataset with covariance matrix and corresponding spatial coordinates 
        out_cov = create_dataset(covariance, lon_reg, lat_reg)
        


        # Save covariance matrix and coordinates as xarray.dataset to output folder
        date = os.path.basename(file)[21:29]
        output_filename = f'unc_cov_matrix{date}_nanmedian.nc'
        out_cov.to_netcdf(os.path.join(OUTPUT_PATH, output_filename))
        #np.save(os.path.join(OUTPUT_PATH, output_filename), covariance)


if __name__ == "__main__":
    main()



  data_reg[i, j] = np.nanmean(window)


Read the matrix just saved to check if everything ok

In [2]:
ds = xr.open_dataset('/home/pietaril/Documents/data/CO2M_testdata/unc_cov_matrices/unc_cov_matrix20250609_nanmedian.nc')
ds

In [3]:
cov = ds["covariance"].values
lon = ds["lon"].values
lat = ds["lat"].values



In [5]:
print(np.finfo(float).eps)

2.220446049250313e-16


In [17]:
np.count_nonzero(cov)

902500

Check if there are zeros on the diagonal

In [4]:
vars = np.diag(cov)
print(f"Number of entries on the diagonal: {len(vars)}")
print(f"Strictly nonzero values on the diagonal: {np.count_nonzero(vars)}")

print(vars[np.nonzero(vars)])


Number of entries on the diagonal: 33495
Strictly nonzero values on the diagonal: 950
[2.27034177e-06 1.19884811e-06 5.50971722e-05 5.36924853e-05
 5.26279703e-05 5.16722602e-05 3.27471685e-05 1.47873636e-06
 5.66313590e-05 5.84537630e-05 5.92704418e-05 4.05978263e-05
 1.85392688e-05 2.48700613e-05 3.43164816e-05 4.24936583e-05
 5.04689599e-05 4.66469213e-05 3.38236084e-05 5.42104465e-05
 4.43669483e-05 1.05737849e-05 3.55111249e-07 8.05025081e-07
 3.42284985e-06 3.85733934e-07 4.64377054e-11 3.62172349e-07
 4.81145571e-06 1.36585576e-05 1.32697345e-05 9.95650955e-06
 3.09606859e-06 5.35128999e-08 4.25774751e-07 2.84854941e-06
 7.31632199e-06 3.67700280e-08 8.02080038e-07 2.76736638e-06
 3.92604252e-07 1.18004655e-06 7.59748891e-07 3.78137990e-06
 1.70313403e-05 2.83296618e-05 5.02283441e-05 5.56175772e-05
 6.09208258e-05 4.79346359e-05 3.67418481e-05 2.81825251e-05
 1.66092852e-05 1.72947572e-06 2.86406336e-07 1.58064138e-06
 7.40217156e-06 1.03891586e-05 5.87303563e-06 8.67056486e-06

Tried what happens if I replace the zeros on the diagonal by small positive values. Still not PD

In [6]:
minpos = abs(cov[np.nonzero(cov)]).min()
n = len(vars)
newvars = vars.copy()
newvars[newvars == 0] += minpos
np.count_nonzero(newvars)
newcov = cov.copy()
newcov[np.diag_indices(n)] = newvars

posdiag_cov = create_dataset(newcov, lon, lat)
date = '20250609'
output_filename = f'unc_cov_matrix{date}_newdiag.nc'
#OUTPUT_PATH = '/home/pietaril/Documents/data/CO2M_testdata/unc_cov_matrices/'
#posdiag_cov.to_netcdf(os.path.join(OUTPUT_PATH, output_filename))


(array([    0,     1,     2, ..., 33492, 33493, 33494]),
 array([    0,     1,     2, ..., 33492, 33493, 33494]))

In [5]:
cov_flat = cov.flatten()

In [6]:
len(cov_flat)

1121915025

In [7]:
len(cov_flat[~np.isnan(cov_flat)])

1121915025

In [5]:
unc

masked_array(
  data=[[--, --, --, ..., --, --, --],
        [--, --, --, ..., --, --, --],
        [--, --, --, ..., --, --, --],
        ...,
        [--, --, --, ..., --, --, --],
        [--, --, --, ..., --, --, --],
        [--, --, --, ..., --, --, --]],
  mask=[[ True,  True,  True, ...,  True,  True,  True],
        [ True,  True,  True, ...,  True,  True,  True],
        [ True,  True,  True, ...,  True,  True,  True],
        ...,
        [ True,  True,  True, ...,  True,  True,  True],
        [ True,  True,  True, ...,  True,  True,  True],
        [ True,  True,  True, ...,  True,  True,  True]],
  fill_value=1e+20)

In [8]:
len(cov[abs(cov) > 0.])

34225

In [11]:
cov.shape

(33495, 33495)

For some reason lat & lon not given in 1D so needed to flatten

In [20]:
np.asarray(lon).flatten()

array([  19.08547592,   19.00862694,   18.93191719, ..., -148.00617981,
       -147.87005615, -147.73410034])

In [17]:
lat.flatten()

(609, 55)