In [None]:
# Folder locations for the half-hourly -> daily aggregation.
# Both are plain path strings; edit them to match the local data layout.
output_folder = r"D:/GPM/Final/Original/Daily"       # where the daily files are written
input_folder = r"D:/GPM/Final/Original/Half_Hourly"  # where the half-hourly files are read from

In [None]:
import numpy as np
import datetime
import netCDF4 as nc
import os
import re

# ---------------------------------------------------------------------------
# Aggregate half-hourly precipitation files into one netCDF file per day.
#
# Input files are expected to be named "YYYYMMDD-SHHMM.nc" and to contain the
# variables 'lat', 'lon' and 'precipitation'.  For every day in
# [start_date, end_date] all files whose timestamp falls in the window
# 03:00 (that day) <= t < 03:00 (next day) are summed and written to
# "<output_folder>/YYYYMMDD.nc" as variable 'precipitationCal'.
# NOTE(review): the 03:00 offset presumably aligns the UTC files with a local
# reporting day -- confirm.
# ---------------------------------------------------------------------------

# Specify the input and output folders
input_folder = r"D:/GPM/Final/Original/Half_Hourly"  # Replace with your input folder path
output_folder = r"D:/GPM/Final/Original/Daily"      # Replace with your output folder path

# Define the start and end dates for the data period (both days are processed)
start_date = datetime.datetime(2022, 6, 1)
end_date = datetime.datetime(2022, 6, 7)

# Number of half-hourly files that make up one complete 24 h window
EXPECTED_FILES_PER_DAY = 48

# One float32 constant for every "missing" marker, so that _FillValue and
# missing_value hold exactly the same value in the float32 output variable.
FILL_VALUE = np.float32(-9999.9)

# Create the destination folder up front so writing the first file cannot fail on it
os.makedirs(output_folder, exist_ok=True)

# List all files in the input folder
LF = os.listdir(input_folder)
print("Total files in LF:", len(LF))

# Parse the timestamp of every matching file once, instead of once per day.
# re.match anchors at the start of the name, so the first 14 characters of a
# matching name are always "YYYYMMDD-SHHMM".
file_name_pattern = re.compile(r"\d{8}-S\d{4}\.nc")
timed_files = []
for file_name in LF:
    if not file_name_pattern.match(file_name):
        continue
    try:
        file_datetime = datetime.datetime.strptime(file_name[:14], "%Y%m%d-S%H%M")
    except ValueError as e:
        # Digits in the right places but not a real date/time (e.g. month 13)
        print("Skipping file with invalid timestamp:", file_name)
        print(e)
        continue
    timed_files.append((file_datetime, file_name))

if not timed_files:
    raise FileNotFoundError(
        "No files named like 'YYYYMMDD-SHHMM.nc' found in " + input_folder
    )

# Read lat and lon from one of the input files (a real data file, not just
# whatever os.listdir happened to return first)
example_file_path = os.path.join(input_folder, timed_files[0][1])
with nc.Dataset(example_file_path) as example_dataset:
    latitudes = example_dataset.variables['lat'][:]
    longitudes = example_dataset.variables['lon'][:]

# Process each day
current_date = start_date
while current_date <= end_date:
    start_time = current_date.replace(hour=3, minute=0)
    end_time = (current_date + datetime.timedelta(days=1)).replace(hour=3, minute=0)
    print("Start date and time:", start_time, "\tEnd date and time:", end_time)

    # Format the date once; it is needed both for the output file name and
    # for the "no data" message below.
    date_str = current_date.strftime('%Y%m%d')

    # Filter files for the current day's range (start inclusive, end exclusive)
    LF2 = [name for stamp, name in timed_files if start_time <= stamp < end_time]

    # Process the selected files
    precip3D = []
    for file_name in LF2:
        file_path = os.path.join(input_folder, file_name)
        try:
            dataset = nc.Dataset(file_path)
        except OSError as e:
            print("Error opening file:", file_path)
            print(e)
            continue

        # The context manager closes the file even if reading the variable fails
        with dataset:
            precip3D.append(dataset.variables['precipitation'][:])

    # Aggregate data if available
    if precip3D:
        if len(precip3D) != EXPECTED_FILES_PER_DAY:
            print("Warning:", len(precip3D), "of", EXPECTED_FILES_PER_DAY,
                  "files available for", date_str, "- daily total may be incomplete")

        # netCDF4 returns masked arrays for variables with missing values.
        # np.concatenate drops the masks, which would add the fill values into
        # the sum; np.ma.concatenate keeps them (and also accepts plain arrays).
        precip_stack = np.ma.concatenate(precip3D, axis=0)

        # Each value is scaled by 0.5 before being summed into the daily total.
        # NOTE(review): presumably the input is a rate in mm/hr valid for
        # 30 minutes, so 0.5 h * rate = mm per file -- confirm input units.
        correction_factor = 0.5
        precip3D_sum = np.ma.sum(precip_stack, axis=0) * correction_factor

        # Ensure the array is correctly shaped (transpose if necessary)
        if precip3D_sum.shape != (len(latitudes), len(longitudes)):
            precip3D_sum = precip3D_sum.T  # Transpose the array

        output_file_name = date_str + ".nc"
        output_file_path = os.path.join(output_folder, output_file_name)

        # Create a new netCDF file and write the data
        with nc.Dataset(output_file_path, "w", format="NETCDF4") as dataset:
            # Create dimensions
            time_dim = dataset.createDimension("time", 1)
            lon_dim = dataset.createDimension("lon", len(longitudes))
            lat_dim = dataset.createDimension("lat", len(latitudes))

            # Create variables
            time_var = dataset.createVariable("time", np.float64, ("time",))
            lon_var = dataset.createVariable("lon", np.float32, ("lon",))
            lat_var = dataset.createVariable("lat", np.float32, ("lat",))
            precip_var = dataset.createVariable(
                "precipitationCal", np.float32, ("time", "lat", "lon"),
                fill_value=FILL_VALUE,
            )

            # Write data
            # NOTE(review): with the units set below, 0 means 1970-01-01 in
            # every output file; consider writing the real date of the window.
            time_var[:] = [0]  # Assuming a single time step
            lon_var[:] = longitudes
            lat_var[:] = latitudes
            precip_var[0, :, :] = precip3D_sum

            # Add attributes to the variables
            time_var.standard_name = "time"
            time_var.units = "seconds since 1970-01-01 00:00:00 UTC"
            time_var.calendar = "standard"
            time_var.axis = "T"

            lon_var.standard_name = "longitude"
            lon_var.long_name = "longitude"
            lon_var.units = "degrees_east"
            lon_var.axis = "X"

            lat_var.standard_name = "latitude"
            lat_var.long_name = "latitude"
            lat_var.units = "degrees_north"
            lat_var.axis = "Y"

            precip_var.units = "mm"
            precip_var.missing_value = FILL_VALUE
            # Matches the dimension order the variable was created with above
            precip_var.DimensionNames = "time,lat,lon"
            precip_var.CodeMissingValue = FILL_VALUE

    else:
        print("No data available for concatenation for", date_str)

    # Move to the next day
    current_date += datetime.timedelta(days=1)


Checking the data as a DataFrame

In [None]:
"""Daily"""
import xarray as xr

# Open one daily output file and flatten it into a table for a quick look.
daily_file_path = "D:/GPM/Final/Original/Daily/20220601.nc"
with xr.open_dataset(daily_file_path, engine='netcdf4') as file:
    df = file.to_dataframe()

# Display the first ten rows
df.head(10)

In [None]:
"""Half Hourly"""
import xarray as xr

# Open one half-hourly input file and flatten it into a table for a quick look.
half_hourly_file_path = "D:/GPM/Final/Original/Half_Hourly/20220601-S0000.nc"
with xr.open_dataset(half_hourly_file_path, engine='netcdf4') as file:
    df = file.to_dataframe()

# Display the first ten rows
df.head(10)