In [1]:
import xarray as xr
import numpy as np
import glob
import os
import h5py
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
import cartopy.feature as cfeature
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Root folder holding the per-month / per-day NetCDF files, and the output file name
nc_file_loc = '/p/project1/exaww/chatterjee1/dataset/iconeu/'
output_file = "msgobs_108_randcrops_icon.nc"

# Month number -> zero-padded sub-folder name (months 4 through 9)
months = {month: f"{month:02d}/" for month in range(4, 10)}

# Text logs written next to the data: every visited file, and every crop rejected for NaNs
log_file = f"{nc_file_loc}processed_files_log.txt"
nan_crop_file = f"{nc_file_loc}nan_files_log.txt"

In [3]:
# Open a single sample file (month folder 09, day folder 30) to inspect the dataset layout
sample_path = os.path.join(nc_file_loc, '09', '30', 'iefrf00000000.nc')
ds = xr.open_dataset(sample_path)
ds

In [4]:
# Latitude extent (min, max) of the sample dataset opened above
ds.lat.min().values, ds.lat.max().values

(array(29.5), array(70.5))

In [5]:
# Longitude extent (min, max) of the sample dataset opened above
ds.lon.min().values, ds.lon.max().values

(array(-23.5), array(62.5))

## Randomly cropping 78 x 78 patches

In [None]:
# Extract random CROP_SIZE x CROP_SIZE crops of the 10.8 um brightness-temperature
# field from every file under nc_file_loc/<month>/<day>/, restricted to the OBS
# domain, and write all NaN-free crops to `output_file` as one dataset.
#
# NOTE: no random seed is set in this cell, so the crop positions differ between runs.

CROP_SIZE = 78                    # Edge length (in grid points) of each square crop
CROPS_PER_FILE = 4                # One crop is drawn from each x-segment of the domain
BT_VAR = 'SYNMSG_BT_CL_IR10.8'    # Variable the crops are taken from
nc_filepattern = "iefrf*.nc"

# OBS domain bounds used to subset the field before cropping
lat_min = 30.018796992481203
lat_max = 54.9812030075188
lon_min = -3.4812108559498958
lon_max =  32.4812108559499

sample_counter = 0  # Number of crops kept so far

all_crops = []  # One (CROP_SIZE, CROP_SIZE) array per kept crop
all_lats = []   # Latitude values along y for each kept crop
all_lons = []   # Longitude values along x for each kept crop
all_times = []  # 'HH:MM:SS' string of the source file for each kept crop
all_dates = []  # 'YYYY-MM-DD' string of the source file for each kept crop

first_write = True  # Flag to check if it's the first time writing to the file

# Iterate over each month folder
for month_dir in months.values():
    month_loc = nc_file_loc + month_dir
    day_folders = sorted(glob.glob(month_loc + '*/'))  # All day subfolders within the month folder

    # Iterate over each day folder within the current month
    for day_folder in day_folders:
        nc_files = sorted(glob.glob(day_folder + nc_filepattern))

        for file in nc_files:

            # Log the name of the current file
            with open(log_file, 'a') as log:
                log.write(f"{file}\n")

            # Use a context manager so the file handle is released after each file;
            # .load() pulls the subset into memory before the dataset is closed.
            with xr.open_dataset(file) as data:
                date = data.time.dt.strftime('%Y-%m-%d').values[0]
                timestamp = data.time.dt.strftime('%H:%M:%S').values[0]

                # Subset the field to the OBS domain
                bt_data = data[BT_VAR].isel(time=0).sel(
                    lat=slice(lat_min, lat_max),
                    lon=slice(lon_min, lon_max)
                ).load()

            # Work on plain NumPy arrays from here on
            bt_values = bt_data.values
            lat_icon = bt_data['lat'].values
            lon_icon = bt_data['lon'].values

            y_dim, x_dim = bt_values.shape

            # np.random.randint needs low < high; skip fields too small to crop
            if y_dim <= CROP_SIZE or x_dim <= CROP_SIZE:
                print(f"Skipping {file}: subset shape {(y_dim, x_dim)} is too small for {CROP_SIZE} x {CROP_SIZE} crops")
                continue

            # Divide the valid x start positions into segments to spread the crops
            x_segments = np.linspace(0, x_dim - CROP_SIZE, CROPS_PER_FILE + 1, dtype=int)

            for j in range(CROPS_PER_FILE):
                x_lo, x_hi = x_segments[j], x_segments[j + 1]
                if x_hi <= x_lo:
                    # Empty segment (domain barely wider than one crop): nothing to draw
                    continue

                # Random start row anywhere, random start column inside segment j
                start_y = np.random.randint(0, y_dim - CROP_SIZE)
                start_x = np.random.randint(x_lo, x_hi)

                # Crop the data
                crop = bt_values[start_y:start_y + CROP_SIZE, start_x:start_x + CROP_SIZE]

                # Skip this crop if it contains any NaN values
                if np.isnan(crop).any():
                    with open(nan_crop_file, 'a') as log:
                        log.write(f"{file} and crop no {j}\n")
                    continue

                # Store the crop and the corresponding coordinates
                all_crops.append(crop)
                all_lats.append(lat_icon[start_y:start_y + CROP_SIZE])
                all_lons.append(lon_icon[start_x:start_x + CROP_SIZE])
                all_times.append(timestamp)
                all_dates.append(date)

                # Increment the sample counter
                sample_counter += 1

if not all_crops:
    raise RuntimeError(f"No valid crops were collected from {nc_file_loc}; nothing to write")

# Convert the accumulated lists once, after all files have been processed
all_crops_np = np.array(all_crops)
all_lats_np = np.array(all_lats)
all_lons_np = np.array(all_lons)
all_times_np = np.array(all_times)
all_dates_np = np.array(all_dates)

# Create a dataset with the combined data
ds = xr.Dataset(
    {
        "model_108": (["sample", "y", "x"], all_crops_np)  # Data variable
    },
    coords={
        "sample": (["sample"], np.arange(len(all_crops_np))),  # Sample numbers
        "lat": (["sample", "y"], all_lats_np),
        "lon": (["sample", "x"], all_lons_np),
        "time": (["sample"], all_times_np),
        "date": (["sample"], all_dates_np)  # Calendar date, so samples from different days stay distinguishable
    }
)

# Write the NetCDF file (first_write is always True at this point in the cell)
if first_write:
    ds.to_netcdf(output_file, mode='w')
    first_write = False
