In [12]:
import xarray as xr
import numpy as np
import glob
import os
import h5py
import matplotlib.pyplot as plt
#import cartopy.crs as ccrs
#import cartopy.feature as cfeature
import warnings
warnings.filterwarnings("ignore")

In [13]:
# Radiation constants used in the radiance -> brightness temperature conversion
C1 = 1.19104 * 10 ** (-5)  # in [mW m-2 sr-1 (cm-1)-4]
C2 = 1.43877  # in [K cm]

# Display label of every channel key ("channel_1" ... "channel_11")
CHANNEL_NAME = dict(
    channel_1="VIS 0.6",
    channel_2="VIS 0.8",
    channel_3="NIR 1.6",
    channel_4="IR 3.9",
    channel_5="WV 6.2",
    channel_6="WV 7.3",
    channel_7="IR 8.7",
    channel_8="IR 9.7 - O3",
    channel_9="IR 10.8",
    channel_10="IR 12.0",
    channel_11="IR 13.4 - CO2",
)

# Channel wavenumber in [cm-1], looked up as VC[satellite][channel]
VC = dict(
    MSG1=dict(channel_4=2567.330, channel_5=1598.103, channel_6=1362.081, channel_7=1149.069,
              channel_8=1034.343, channel_9=930.647, channel_10=839.660, channel_11=752.387),
    MSG2=dict(channel_4=2568.832, channel_5=1600.548, channel_6=1360.330, channel_7=1148.620,
              channel_8=1035.289, channel_9=931.700, channel_10=836.445, channel_11=751.792),
    MSG3=dict(channel_4=2547.771, channel_5=1595.621, channel_6=1360.377, channel_7=1148.130,
              channel_8=1034.715, channel_9=929.842, channel_10=838.659, channel_11=750.653),
    MSG4=dict(channel_4=2555.280, channel_5=1596.080, channel_6=1361.748, channel_7=1147.433,
              channel_8=1034.851, channel_9=931.122, channel_10=839.113, channel_11=748.585),
)

# Unitless coefficient, looked up as ALPHA[satellite][channel]
ALPHA = dict(
    MSG1=dict(channel_4=0.9956, channel_5=0.9962, channel_6=0.9991, channel_7=0.9996,
              channel_8=0.9999, channel_9=0.9983, channel_10=0.9988, channel_11=0.9981),
    MSG2=dict(channel_4=0.9954, channel_5=0.9963, channel_6=0.9991, channel_7=0.9996,
              channel_8=0.9999, channel_9=0.9983, channel_10=0.9988, channel_11=0.9981),
    MSG3=dict(channel_4=0.9915, channel_5=0.9960, channel_6=0.9991, channel_7=0.9996,
              channel_8=0.9999, channel_9=0.9983, channel_10=0.9988, channel_11=0.9982),
    MSG4=dict(channel_4=0.9916, channel_5=0.9959, channel_6=0.9990, channel_7=0.9996,
              channel_8=0.9998, channel_9=0.9983, channel_10=0.9988, channel_11=0.9981),
)
# in [K]
BETA = {
    # Offset coefficient, looked up as BETA[satellite][channel].
    # (A stray '' literal that used to sit in front of "channel_8" in the MSG1
    # row, and only worked through implicit string concatenation, is removed.)
    'MSG1': {"channel_4": 3.410, "channel_5": 2.218, "channel_6": 0.478, "channel_7": 0.179,
             "channel_8": 0.060, "channel_9": 0.625, "channel_10": 0.397, "channel_11": 0.578,
             },
    'MSG2': {"channel_4": 3.438, "channel_5": 2.185, "channel_6": 0.470, "channel_7": 0.179,
             "channel_8": 0.056, "channel_9": 0.640, "channel_10": 0.408, "channel_11": 0.561,
             },
    'MSG3': {"channel_4": 2.9002, "channel_5": 2.0337, "channel_6": 0.4340, "channel_7": 0.1714,
             "channel_8": 0.0527, "channel_9": 0.6084, "channel_10": 0.3882, "channel_11": 0.5390,
             },
    'MSG4': {"channel_4": 2.9438, "channel_5": 2.0780, "channel_6": 0.4929, "channel_7": 0.1731,
             "channel_8": 0.0597, "channel_9": 0.6256, "channel_10": 0.4002, "channel_11": 0.5635,
             },
}

# %%
#############
############# look up tables for calculating reflectances
#############
# constants taken from website: 
# https://eumetsatspace.atlassian.net/wiki/spaces/DSDT/pages/1537277953/MSG15+radiances+conversion+to+BT+and+Reflectances
# and from https://www-cdn.eumetsat.int/files/2020-04/pdf_msg_seviri_rad2refl.pdf

# Irradiance at 1 AU in [mW m-2 (cm-1)-1], looked up as IRRAD[satellite][channel]
IRRAD = dict(
    MSG1=dict(channel_1=65.2296, channel_2=73.0127, channel_3=62.3715),
    MSG2=dict(channel_1=65.2065, channel_2=73.1869, channel_3=61.9923),
    MSG3=dict(channel_1=65.5148, channel_2=73.1807, channel_3=62.0208),
    MSG4=dict(channel_1=65.2656, channel_2=73.1692, channel_3=61.9416),
)


# %%
class ir_channel:
    """
    Constants of a single infrared channel of one satellite, read from the
    module-level look up tables CHANNEL_NAME, VC, ALPHA and BETA.

    satellite: key of the satellite in the tables
    channel: key of the channel in the tables, e.g. "channel_9"
    """
    def __init__(self, satellite, channel):
        label = CHANNEL_NAME[channel]
        # Same look-up order as the attribute list below: VC, ALPHA, BETA
        wavenumber, alpha_coeff, beta_coeff = (
            table[satellite][channel] for table in (VC, ALPHA, BETA)
        )

        self.name = label
        self.vc = wavenumber  # wavenumber in [cm−1]
        self.alpha = alpha_coeff  # unitless
        self.beta = beta_coeff  # in [K]

class vis_nir_channel:
    """
    Constants of a single visible / near-infrared channel of one satellite,
    read from the module-level look up tables CHANNEL_NAME and IRRAD.
    """
    def __init__(self, satellite, channel):
        label = CHANNEL_NAME[channel]
        solar_irradiance = IRRAD[satellite][channel]

        self.name = label
        # irradiance at 1AU in [mW·m-2·(cm-1)-1]
        self.irrad = solar_irradiance

class MSG_satellite:
    """
    One satellite, identified by the key used in the module-level look up
    tables (the tables above define 'MSG1' ... 'MSG4').

    Offers the radiance -> brightness temperature conversion for channels
    4-11 and the (not yet complete) radiance -> reflectance conversion for
    channels 1-3.
    """
    def __init__(self, name):
        self.name = name

    def _get_channel(self, channel_number):
        # return vis/nir or ir channel depending on channel number
        if channel_number <= 3:
            return vis_nir_channel(satellite=self.name, channel=f"channel_{channel_number}")
        else:
            return ir_channel(satellite=self.name, channel=f"channel_{channel_number}")

    def rad_2_tb(self, channel_number, radiances):
        """
        Convert radiances of an infrared channel to brightness temperature [K].

        channel_number: integer channel number, must be in 4..11
        radiances: scalar or array of radiances
        Raises ValueError for any other channel number.
        """
        if not 4 <= channel_number <= 11:
            raise ValueError(
                f"rad_2_tb needs an infrared channel number in 4..11, got {channel_number}"
            )

        # get constants for given channel
        channel_consts = self._get_channel(channel_number)

        # converting radiance to brightness temperature [K] with simplified equation:
        # tb = (C2 * vc / ln(C1 * vc**3 / R + 1) - beta) / alpha
        numerator = C2 * channel_consts.vc
        fraction = C1 * channel_consts.vc**3 / radiances + 1
        denominator = channel_consts.alpha * (np.log(fraction))
        tb = numerator / denominator - channel_consts.beta / channel_consts.alpha  ## [K]
        return tb

    # The two helpers below are static so that both `self._d(t)` and
    # `MSG_satellite._d(t)` work; previously they were plain functions without
    # `self`, which made every call through an instance fail with TypeError.
    @staticmethod
    def _d(t):
        # Sun-Earth distance in AU at time t -- placeholder, no formula yet
        raise NotImplementedError("Sun-Earth distance is not implemented yet")

    @staticmethod
    def _solar_zenith_angle(t, lon, lat):
        # Solar Zenith Angle in Radians at time t and location x -- placeholder
        raise NotImplementedError("solar zenith angle is not implemented yet")

    def rad_2_refl(self, channel_number, radiances, t, lon, lat):
        """
        Convert radiances of a visible / near-infrared channel to reflectance:
        pi * R * d(t)**2 / (irrad * cos(solar zenith angle)).

        channel_number: integer channel number, must be in 1..3
        Raises ValueError for any other channel number and NotImplementedError
        as long as _d / _solar_zenith_angle are placeholders.
        """
        if not 1 <= channel_number <= 3:
            raise ValueError(
                f"rad_2_refl needs a visible/near-infrared channel number in 1..3, got {channel_number}"
            )

        # get constants for given channel
        channel_consts = self._get_channel(channel_number)

        numerator = np.pi * radiances * self._d(t)**2
        denominator = channel_consts.irrad * np.cos(self._solar_zenith_angle(t, lon, lat))
        # the original computed both terms but never returned their ratio
        return numerator / denominator

# %%
def radiances_2_brightnesstemp_and_reflectances(radiances, channel_number, satellite_name):
    """
    Convert radiances of one channel to brightness temperature.

    radiances: scalar or array of radiances in [mW m−2 sr−1 (cm−1)−1)]
    channel_number: channel number; only the infrared channels 4..11 are supported
    satellite_name: key of the satellite in the look up tables

    Returns the brightness temperature for channels 4..11.
    Raises NotImplementedError for channels 1..3 (visible / near-infrared) and
    ValueError for any other channel number. Both cases used to print a
    message and return None, which callers then stored as if it were data.
    """
    # Validate first so an unsupported channel fails before any table look-up
    if 1 <= channel_number <= 3:
        raise NotImplementedError("not implemented yet for visible and near-infrared")
    if not 4 <= channel_number <= 11:
        raise ValueError(f"This channel does not exist for satellite {satellite_name}")

    # access correct satellite and get brightness temp for given channel
    satellite = MSG_satellite(satellite_name)
    return satellite.rad_2_tb(channel_number, radiances)

In [15]:
# Directory with the monthly NetCDF inputs; all outputs of this cell go here too
nc_file_loc = '/p/scratch/exaww/chatterjee1/msg_netcdf/2023/'
nan_crop_file = nc_file_loc + "nan_files_log_juelich.txt"  # files that failed
log_file = nc_file_loc + "processed_files_log_juelich.txt"  # every file visited
output_file = nc_file_loc + "msgobs_108_juelich_crops.nc"  # resulting crops

# Month number -> sub-directory of nc_file_loc.
# Only April is enabled; May..September would be 5: '05/' ... 9: '09/'.
months = {4: '04/'}

# Location of Jülich and edge length of the square crop in pixels
Juelich_lat_coord = 50.9
Juelich_lon_coord = 6.3
crop_size = 128

# One entry per successfully processed file
juelich_crops, juelich_lats, juelich_lons, juelich_times = [], [], [], []

# Bookkeeping flags/counters used further down in this cell
first_write = True
sample_counter = 0

# Half crop width and file name pattern are identical for every file
half_crop = crop_size // 2
nc_filepattern = "HRSEVIRI_2023*_PC.nc"

# Iterate the month keys directly; the former `for _, key in enumerate(...)`
# never used the index and rebound `_`, which IPython uses for the last output
for key in months:
    loc = nc_file_loc + months[key]
    nc_files = sorted(glob.glob(loc + nc_filepattern))

    for file in nc_files:
        # Log the file being processed (done first, so the log also lists
        # files that fail below)
        with open(log_file, 'a') as log:
            log.write(f"{file}\n")

        try:
            # The context manager closes the file even if a step below fails;
            # previously every opened dataset stayed open for the whole run
            with xr.open_dataset(file) as data:
                # Locate Jülich indices
                idx_lat = np.argmin(np.abs(data.lat.values - Juelich_lat_coord))
                idx_lon = np.argmin(np.abs(data.lon.values - Juelich_lon_coord))

                # Crop boundaries, clipped to the grid (the crop is smaller
                # than crop_size if Jülich is closer than half_crop to an edge)
                lat_start = max(idx_lat - half_crop, 0)
                lat_end = min(idx_lat + half_crop, data.lat.shape[0])
                lon_start = max(idx_lon - half_crop, 0)
                lon_end = min(idx_lon + half_crop, data.lon.shape[0])

                # Extract crop; .values copies the data out of the file
                lat_crop = data.lat[lat_start:lat_end].values
                lon_crop = data.lon[lon_start:lon_end].values
                radiances = data["channel_9"][lat_start:lat_end, lon_start:lon_end].values

                # Satellite name and timestamp are parsed from the product name attribute
                satellite_name = data.EPCT_product_name.split('-')[0]
                timestamp = data.EPCT_product_name.split('A-')[1].split('.')[0]

            # Convert radiances to brightness temperature
            bt_crop = radiances_2_brightnesstemp_and_reflectances(radiances, 9, satellite_name)

            # Append results only after every step succeeded, so the four
            # lists always stay the same length
            juelich_crops.append(bt_crop)
            juelich_lats.append(lat_crop)
            juelich_lons.append(lon_crop)
            juelich_times.append(timestamp)

            # Increment the sample counter
            sample_counter += 1

        except Exception as e:
            # Record the failure and move on to the next file
            with open(nan_crop_file, 'a') as nan_log:
                nan_log.write(f"Failed to process {file}: {str(e)}\n")

# Stack the per-file lists into arrays with a leading sample axis
juelich_crops_np = np.asarray(juelich_crops)
juelich_lats_np = np.asarray(juelich_lats)
juelich_lons_np = np.asarray(juelich_lons)
juelich_times_np = np.asarray(juelich_times)

# One dataset holding all crops: data on (sample, y, x), with per-sample
# latitude, longitude and timestamp coordinates
ds = xr.Dataset(
    data_vars=dict(
        sample_data=(("sample", "y", "x"), juelich_crops_np),
    ),
    coords=dict(
        sample=(("sample",), np.arange(len(juelich_crops_np))),
        lat=(("sample", "y"), juelich_lats_np),
        lon=(("sample", "x"), juelich_lons_np),
        time=(("sample",), juelich_times_np),
    ),
)

# Write the NetCDF file (first_write is still True here, so this always runs)
if first_write:
    ds.to_netcdf(output_file, mode='w')
    first_write = False

In [16]:
# Re-open the crops NetCDF file from nc_file_loc and show its summary
# as the cell output
j_data = xr.open_dataset(nc_file_loc + "msgobs_108_juelich_crops.nc")
j_data

In [17]:
# Shape of the stored crops; the recorded output below is (4092, 128, 128)
j_data.sample_data.shape

(4092, 128, 128)

In [21]:
def convert_nc_to_hdf5(nc_file, hdf5_file):
    """
    Copy a NetCDF file into a flat HDF5 file.

    Every data variable and every coordinate becomes one HDF5 dataset of the
    same name at the root of the file; their attributes and the global
    attributes are copied as HDF5 attributes.

    nc_file: path of the NetCDF file to read
    hdf5_file: path of the HDF5 file to create (an existing file is overwritten,
        the file is opened with mode 'w')
    """
    def _copy_array(h5_parent, name, data_array):
        # Write one xarray variable/coordinate and its attributes to the HDF5 file
        values = data_array.values

        # Object arrays (e.g. the string time coordinate, dtype('O')) and numpy
        # unicode arrays cannot be stored by h5py: convert to fixed-length bytes
        if values.dtype.kind in ("O", "U"):
            values = values.astype('S')

        h5_dataset = h5_parent.create_dataset(
            name,
            data=values,
            dtype=values.dtype,
        )
        for attr_name, attr_value in data_array.attrs.items():
            h5_dataset.attrs[attr_name] = attr_value

    # Both files are closed by the context managers, also when copying fails
    # (the NetCDF dataset used to stay open after an exception)
    with xr.open_dataset(nc_file) as ds, h5py.File(hdf5_file, 'w') as hdf5_data:
        # Data variables first, then coordinates (same order as before)
        for var_name in ds.data_vars:
            _copy_array(hdf5_data, var_name, ds[var_name])

        for coord_name in ds.coords:
            _copy_array(hdf5_data, coord_name, ds[coord_name])

        # Copy global attributes
        for attr_name, attr_value in ds.attrs.items():
            hdf5_data.attrs[attr_name] = attr_value


# HDF5 target and NetCDF source, both located in nc_file_loc
hdf5_file = nc_file_loc + 'msgobs_108_juelich.h5'
nc_file = nc_file_loc + "msgobs_108_juelich_crops.nc"

# Run the conversion and report both paths
convert_nc_to_hdf5(nc_file, hdf5_file)
print(f"Converted {nc_file} to {hdf5_file}")

Converted /p/scratch/exaww/chatterjee1/msg_netcdf/2023/msgobs_108_juelich_crops.nc to /p/scratch/exaww/chatterjee1/msg_netcdf/2023/msgobs_108_juelich.h5


In [20]:
# Display the full path of the crops NetCDF file (no assignment is made)
nc_file_loc + "msgobs_108_juelich_crops.nc"

'/p/scratch/exaww/chatterjee1/msg_netcdf/2023/msgobs_108_juelich_crops.nc'

In [22]:
# Function to calculate mean and standard deviation using chunks
def compute_mean_std_chunked(data, chunk_size=1000):
    """
    Mean and population standard deviation over all elements of ``data``,
    accumulated in chunks along the first axis.

    data: array-like with ``.shape``, slicing along the first axis and
        ``.astype`` (numpy arrays do; any number of dimensions >= 1)
    chunk_size: number of entries along the first axis processed per step

    Returns (mean, std), computed as sum(x)/N and sqrt(sum(x**2)/N - mean**2).
    Raises ValueError if chunk_size < 1 or data has no elements.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    n_samples = data.shape[0]
    # Product of all axis lengths, so inputs need not be exactly 3-D
    n_elements = int(np.prod(data.shape))
    if n_elements == 0:
        raise ValueError("cannot compute mean and std of empty data")

    # Initialize mean and variance sums
    total_sum = 0.0
    total_square_sum = 0.0

    # Process in chunks
    for start in range(0, n_samples, chunk_size):
        # Accumulate in float64: sums in the input dtype lose precision for
        # float32 data and `chunk ** 2` overflows for integer data
        chunk = data[start:start + chunk_size].astype(np.float64)

        # Update sums
        total_sum += np.sum(chunk)
        total_square_sum += np.sum(chunk ** 2)

    # Calculate mean and variance
    mean = total_sum / n_elements
    # Rounding can push the difference slightly below zero for (nearly)
    # constant data; clamp so the square root cannot become NaN
    variance = np.maximum((total_square_sum / n_elements) - (mean ** 2), 0.0)
    std = np.sqrt(variance)

    return mean, std

# Open the crops file again; despite its name, small_sample_data is the
# complete sample_data variable (no subset is taken)
ds = xr.open_dataset(nc_file)
small_sample_data = ds["sample_data"]

# Chunked mean / standard deviation over all samples
(
    mean_sample_data_chunked,
    std_sample_data_chunked,
) = compute_mean_std_chunked(small_sample_data)

# Show both statistics as the cell output
(mean_sample_data_chunked, std_sample_data_chunked)

(<xarray.DataArray 'sample_data' ()>
 array(264.89034864),
 <xarray.DataArray 'sample_data' ()>
 array(13.43505868))

In [23]:
# Display the summary of ds as it was last assigned
ds

In [4]:
# Directory with the monthly NetCDF inputs; all outputs of this cell go here too
nc_file_loc = '/p/scratch/exaww/chatterjee1/msg_netcdf/2023/'
nan_crop_file = nc_file_loc + "nan_files_log_juelich.txt"
log_file = nc_file_loc + "processed_files_log_juelich.txt"
output_file = nc_file_loc + "msgobs_108_juelich_crops.nc"

# Month number -> sub-directory of nc_file_loc (April to September)
months = {4: '04/', 5: '05/', 6: '06/', 7: '07/', 8: '08/', 9: '09/'}

# Per-file results: crops, latitude coordinates, longitude coordinates, timestamps
juelich_crops, juelich_lats, juelich_lons, juelich_times = [], [], [], []

# Flag checked before writing the output file, and counter of processed samples
first_write = True
sample_counter = 0

# Jülich details and file name pattern (identical for every month and file)
Juelich_lat_coord, Juelich_lon_coord = 50.9, 6.3
nc_filepattern = "HRSEVIRI_2023*_PC.nc"

for key in months:
    loc = nc_file_loc + months[key]
    nc_files = sorted(glob.glob(loc + nc_filepattern))

    for file in nc_files:

        # Log the name of the current file
        with open(log_file, 'a') as log:
            log.write(f"{file}\n")

        # The context manager closes each file after its data has been copied out
        with xr.open_dataset(file) as data:
            # Grid indices closest to Jülich. They used to be computed before the
            # loop from a `data` object this cell had not opened yet (it only
            # worked with leftover kernel state). They are informational only:
            # the crop below uses a fixed index window.
            idx_lat = np.argmin(np.abs(data.lat.values - Juelich_lat_coord))
            idx_lon = np.argmin(np.abs(data.lon.values - Juelich_lon_coord))

            # Satellite name and timestamp are parsed from the product name attribute
            satellite_name = data.EPCT_product_name.split('-')[0]
            timestamp = data.EPCT_product_name.split('A-')[1].split('.')[0]

            # Fixed crop window: rows 465..610 and columns 252..822
            lat_crop = data.lat[465:611].values
            lon_crop = data.lon[252:823].values
            radiances = data["channel_9"][465:611, 252:823].values

        bt_crop = radiances_2_brightnesstemp_and_reflectances(radiances, 9, satellite_name)

        # Store the crop and the corresponding coordinates
        juelich_crops.append(bt_crop)
        juelich_lats.append(lat_crop)
        juelich_lons.append(lon_crop)
        juelich_times.append(timestamp)

        # Increment the sample counter
        sample_counter += 1

# Build the arrays once after all files are processed; doing this inside the
# loop re-copied every crop collected so far after each file
juelich_crops_np = np.array(juelich_crops)
juelich_lats_np = np.array(juelich_lats)
juelich_lons_np = np.array(juelich_lons)
juelich_times_np = np.array(juelich_times)

        # Create a dataset with the combined data
# Combine all crops into one dataset: data on (sample, y, x) with per-sample
# latitude, longitude and timestamp coordinates
ds = xr.Dataset(
    {
        "sample_data": (["sample", "y", "x"], juelich_crops_np)  # Data variable
    },
    coords={
        "sample": (["sample"], np.arange(len(juelich_crops_np))),  # Sample numbers
        "lat": (["sample", "y"], juelich_lats_np),
        # was `all_lons_np`, a name that is never defined (NameError)
        "lon": (["sample", "x"], juelich_lons_np),
        "time": (["sample"], juelich_times_np)
    }
)

# Write the NetCDF file (mode 'w' replaces an existing output_file)
if first_write:
    ds.to_netcdf(output_file, mode='w')
    first_write = False
  

In [8]:
# First entry of nc_files; that list is not built in this cell, it is left over
# from the glob in one of the processing loops
file = nc_files[0]

In [9]:
# Open the selected file for interactive inspection (it is not closed here)
data = xr.open_dataset(file)

In [10]:
# Coordinates of Jülich
Juelich_lat_coord = 50.9
Juelich_lon_coord = 6.3

# Index of the grid latitude / longitude with the smallest absolute distance
idx_lat = np.argmin(np.abs(data.lat.values - Juelich_lat_coord))
idx_lon = np.argmin(np.abs(data.lon.values - Juelich_lon_coord))

# Show both indices as the cell output
(idx_lat, idx_lon)

(555, 260)

In [None]:
# Scratch calculation: 555 is the idx_lat value shown in the output above and
# 64 is half of a 128-pixel crop (presumably the upper latitude bound of the
# crop window -- TODO confirm). The trailing comma makes the cell value a
# one-element tuple.
555+64, 