Author: Andrew Martin (eeasm) \
Creation date: 14/12/22

#### Known Issues:
+ Doesn't actually do anything yet

# Transferring Summit data

This notebook transfers Summit data from the JASMIN ncas gws to another location, subset to a given time frame.

For example, if I want the MMCR data for 5 hours surrounding 11/08/2022 15:00 then I can set up the parameters to look within the appropriate timeframe.

In [1]:
# initial imports
import os
import datetime
import helper as hh

In [2]:
# Destination for the transferred data.
# In my case, this lives in my area of the ncas_radar_gws_vol1.
endpoint_base = '/gws/nopw/j04/ncas_radar_vol1/eeasm'
# sub-directories, in order, leading from endpoint_base to the storage location
endpoint_path_steps = ['ICESAT', 'rgt_0749', 'cycle_10']
# join the steps into a relative path, then anchor it at endpoint_base
endpoint_path = os.path.join(endpoint_base, os.path.join(*endpoint_path_steps))

print(endpoint_path)

/gws/nopw/j04/ncas_radar_vol1/eeasm/ICESAT/rgt_0749/cycle_10


In [3]:
# Find the ICESAT ATL09 file(s) sitting in endpoint_path and work out the
# timestamp encoded in each filename. The time window either side of each
# timestamp is applied in a later cell.

ATL09_in_endpoint = []
for name in os.listdir(endpoint_path):
    if 'ATL09' in name:
        ATL09_in_endpoint.append(name)
print(f'{ATL09_in_endpoint=}')

# For each ATL09 file in the destination, parse the date from the leading part
# of the filename.

ATL09_timestamps = []
for atl09_name in ATL09_in_endpoint:
    # Files fetched with icepyx carry a 'processed_' prefix; anything else is
    # assumed to use the plain ATL09 naming.
    filename_format = (
        'processed_ATL09_%Y%m%d%H%M%S' if 'processed' in atl09_name
        else 'ATL09_%Y%m%d%H%M%S'
    )
    # '%Y' is two characters in the format string but four digits in the
    # filename, so the stamped prefix is two characters longer than the format.
    prefix_length = len(filename_format) + 2
    stamped_prefix = atl09_name[:prefix_length]

    ATL09_timestamps.append(datetime.datetime.strptime(stamped_prefix, filename_format))

# report each file alongside the timestamp parsed from its name
for atl09_name, timestamp in zip(ATL09_in_endpoint, ATL09_timestamps):
    print(atl09_name, timestamp.strftime('   datetime= %Y-%m-%d %H:%M:%S'))

ATL09_in_endpoint=['processed_ATL09_20210211004659_07491001_005_01.h5']
processed_ATL09_20210211004659_07491001_005_01.h5    datetime= 2021-02-11 00:46:59


In [4]:
# Source side of the copy: base directory of the ICECAPS archive and the
# sub-directories holding each instrument's files.
initial_base = '/gws/nopw/j04/ncas_radar_vol2/data/ICECAPSarchive'

mpl_path_steps = ['mpl', 'raw']
mmcr_path_steps = ['mmcr', 'mom']


mpl_path = os.path.join(initial_base, *mpl_path_steps)
mmcr_path = os.path.join(initial_base, *mmcr_path_steps)

# The filename patterns that the MPL and MMCR files use.
# IMPORTANT NOTE: if we want sub-daily partitioning, we MUST include %H in these formats
mpl_format = '%Y%m%d%H*.mpl.gz'
mmcr_format = '%Y%j%H*.nc.zip'  # %j is the day of the year (001-366)
depth = 'h'  # how the helper hh.move_data_files.find_files_in_date_range() will search for files
width = datetime.timedelta(hours=2)  # half-width of the time window around each timestamp; can be adjusted

extract_paths = [mpl_path, mmcr_path]
extract_formats = [mpl_format, mmcr_format]
# The loop variable is deliberately NOT called `format`: at notebook scope that
# would rebind the builtin format() for every later cell. The printed text is
# unchanged ("format=<repr>").
for path, fmt in zip(extract_paths, extract_formats):
    print(path, f'  format={fmt!r}')

/gws/nopw/j04/ncas_radar_vol2/data/ICECAPSarchive/mpl/raw   format='%Y%m%d%H*.mpl.gz'
/gws/nopw/j04/ncas_radar_vol2/data/ICECAPSarchive/mmcr/mom   format='%Y%j%H*.nc.zip'


In [5]:
# For each ATL09 timestamp, locate the mmcr and mpl files at their source
# locations that fall within the date range around it.
# Each file is then checked against the target location and copied across only
# if it is not already there.

for timestamp in ATL09_timestamps:
    # window of +/- width centred on the ATL09 timestamp
    daterange = [timestamp - width, timestamp + width]

    # for each type of data file we want to extract: mmcr, mpl, etc
    # (`fmt` rather than `format`, so the builtin format() is not shadowed)
    for initial, fmt in zip(extract_paths, extract_formats):
        # filenames of the data files inside the date range
        datafile_names = hh.move_data_files.find_files_in_date_range(initial, daterange, fmt, depth)
        # one flag per file: is it already present at the target location?
        exists_at_endpoint = [
            hh.move_data_files._check_file_exists(endpoint_path, dfname)
            for dfname in datafile_names
        ]
        print(exists_at_endpoint)

        for dfname, exists in zip(datafile_names, exists_at_endpoint):
            if exists:
                print(f'{dfname} already exists at {endpoint_path}')
                continue

            # copy_file's return value is treated as a success flag
            success = hh.move_data_files.copy_file(initial, dfname, endpoint_path)
            if not success:
                print(f'Trouble copying {dfname=} from {initial=} to {endpoint_path=}')

# NOTE: printed unconditionally, even if some copies reported trouble above
print('files copied')

[False, False, False]
[False, False, False]
