# PREPROCESSING DATA PIPELINE

#### IMPORTS

In [1]:
import matplotlib.pyplot as plt
import xarray as xr

from reproduce.ose_data_pipeline import download_copernicus_data_for_sat, filt_daily_ssh_data, grid_input

  from .autonotebook import tqdm as notebook_tqdm


## REAL DATA TEST

### Pipeline will:

- #### **Download**, **concatenate** and **grid** the *input* nadirs data
- #### **Download** and **concatenate** the *reference* nadir data

### VALUES TO BE DEFINED:

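(copy one of the following templates and run only that cell: every template assigns the same variables, so the last one executed is the one the pipeline cells use)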

- min/max **time**, **lat**, **lon** : of the data you will download and preprocess

- copernicus dataset id (found on the [CMEMS portal](https://data.marine.copernicus.eu/products))

- satellites (input) and ref satellites: found on CMEMS portal

- file names for downloaded/concatenated/gridded input and reference files

#### 2023 REPROCESSED

In [2]:
# Settings for the "2023 REPROCESSED" run.
# NOTE: every configuration cell assigns the same variable names, so only the
# most recently executed one is used by the pipeline cells.

# Time window of the data to download and preprocess
min_time, max_time = '2023-01-01', '2023-12-31'

# Spatial bounding box
min_lon, max_lon = -66, -54
min_lat, max_lat = 32, 44

# Copernicus dataset id; "{}" stands in for the satellite name
copernicus_dataset_id = "cmems_obs-sl_glo_phy-ssh_my_{}-l3-duacs_PT1S"

# Satellites to download: `satellites` feed the input, `ref_satellites` the reference
# (no input satellite is listed in this configuration)
satellites = []
ref_satellites = ['s6a-lr']

# Download directories; "{}" is replaced by the satellite name
download_sat_input_dir = '/DATASET/OCB_traces/dl/2023_reprocessed/input/{}'
download_sat_ref_dir = '/DATASET/OCB_traces/dl/2023_reprocessed/ref/{}'

# Output files of the concatenation step
concatenated_input_path = "/DATASET/OCB_traces/concat/concatenated_input_2023_reprocessed.nc"
concatenated_ref_path = "/DATASET/OCB_traces/concat/concatenated_ref_2023_reprocessed.nc"

# Output file of the gridding step
gridded_input_path = "/DATASET/OCB_traces/input_test_6sat_2023_reprocessed.nc"

#### 2023 GLOBAL NRT 4TH

In [3]:
# Settings for the "2023 GLOBAL NRT 4TH" run.
# NOTE: every configuration cell assigns the same variable names, so only the
# most recently executed one is used by the pipeline cells.

# Time window of the data to download and preprocess
min_time, max_time = '2023-01-01', '2023-12-31'

# Spatial bounding box
min_lon, max_lon = -180, 180
min_lat, max_lat = -80, 90

# Copernicus dataset id; "{}" stands in for the satellite name
copernicus_dataset_id = "cmems_obs-sl_glo_phy-ssh_nrt_{}-l3-duacs_PT1S"

# Satellites to download: `satellites` feed the input, `ref_satellites` the reference
satellites = ['al', 'c2n', 'h2b', 'j3n', 's3a', 's3b']
ref_satellites = ['s6a-hr', 'swon']

# Download directories; "{}" is replaced by the satellite name
download_sat_input_dir = '/DATASET/OCB_traces/dl/2023/input/{}'
download_sat_ref_dir = '/DATASET/OCB_traces/dl/2023/ref/{}'

# Output files of the concatenation step
concatenated_input_path = "/DATASET/OCB_traces/concat/concatenated_input_2023_global_4.nc"
concatenated_ref_path = "/DATASET/OCB_traces/concat/concatenated_ref_2023_global_4.nc"

# Output file of the gridding step
gridded_input_path = "/DATASET/OCB_traces/input_test_6sat_2023_global_4.nc"

#### 2023

In [2]:
# Settings for the "2023" run.
# NOTE: every configuration cell assigns the same variable names, so only the
# most recently executed one is used by the pipeline cells.

# Time window of the data to download and preprocess
min_time, max_time = '2023-01-01', '2023-12-31'

# Spatial bounding box
min_lon, max_lon = -66, -54
min_lat, max_lat = 32, 44

# Copernicus dataset id; "{}" stands in for the satellite name
copernicus_dataset_id = "cmems_obs-sl_glo_phy-ssh_nrt_{}-l3-duacs_PT1S"

# Satellites to download: `satellites` feed the input, `ref_satellites` the reference
satellites = ['al', 'c2n', 'h2b', 'j3n', 's3a', 's3b']
ref_satellites = ['s6a-hr', 'swon']

# Download directories; "{}" is replaced by the satellite name
download_sat_input_dir = '/DATASET/OCB_traces/dl/2023/input/{}'
download_sat_ref_dir = '/DATASET/OCB_traces/dl/2023/ref/{}'

# Output files of the concatenation step
concatenated_input_path = "/DATASET/OCB_traces/concat/concatenated_input_2023.nc"
concatenated_ref_path = "/DATASET/OCB_traces/concat/concatenated_ref_2023.nc"

# Output file of the gridding step
gridded_input_path = "/DATASET/OCB_traces/input_test_6sat_2023.nc"

#### 2022

In [None]:
# Settings for the "2022" run.
# NOTE: every configuration cell assigns the same variable names, so only the
# most recently executed one is used by the pipeline cells.

# Time window of the data to download and preprocess
min_time, max_time = '2022-01-01', '2022-12-31'

# Spatial bounding box
min_lon, max_lon = -66, -54
min_lat, max_lat = 32, 44

# Copernicus dataset id; "{}" stands in for the satellite name
copernicus_dataset_id = "cmems_obs-sl_glo_phy-ssh_nrt_{}-l3-duacs_PT1S"

# Satellites to download: `satellites` feed the input, `ref_satellites` the reference
satellites = ['al', 'c2n', 'h2b', 'j3n', 's3a', 's3b']
ref_satellites = ['s6a-hr', 'swon']

# Download directories; "{}" is replaced by the satellite name
download_sat_input_dir = '/DATASET/OCB_traces/dl/2022/input/{}'
download_sat_ref_dir = '/DATASET/OCB_traces/dl/2022/ref/{}'

# Output files of the concatenation step
concatenated_input_path = "/DATASET/OCB_traces/concat/concatenated_input_2022.nc"
concatenated_ref_path = "/DATASET/OCB_traces/concat/concatenated_ref_2022.nc"

# Output file of the gridding step
gridded_input_path = "/DATASET/OCB_traces/input_test_6sat_2022.nc"

#### 2022 GLOBAL

In [2]:
# Settings for the "2022 GLOBAL" run.
# NOTE: every configuration cell assigns the same variable names, so only the
# most recently executed one is used by the pipeline cells.

# Time window of the data to download and preprocess
min_time, max_time = '2022-01-01', '2022-12-31'

# Spatial bounding box
min_lon, max_lon = -180, 180
min_lat, max_lat = -80, 90

# Copernicus dataset id; "{}" stands in for the satellite name
copernicus_dataset_id = "cmems_obs-sl_glo_phy-ssh_nrt_{}-l3-duacs_PT1S"

# Satellites to download: `satellites` feed the input, `ref_satellites` the reference
satellites = ['al', 'c2n', 'h2b', 'j3n', 's3a', 's3b']
ref_satellites = ['s6a-hr', 'swon']

# Download directories; "{}" is replaced by the satellite name
download_sat_input_dir = '/DATASET/OCB_traces/dl/2022_global/input/{}'
download_sat_ref_dir = '/DATASET/OCB_traces/dl/2022_global/ref/{}'

# Output files of the concatenation step
concatenated_input_path = "/DATASET/OCB_traces/concat/concatenated_input_2022_global.nc"
concatenated_ref_path = "/DATASET/OCB_traces/concat/concatenated_ref_2022_global.nc"

# Output file of the gridding step
gridded_input_path = "/DATASET/OCB_traces/input_test_6sat_2022_global.nc"

#### 2022 GLOBAL REPROCESSED

In [2]:
# Settings for the "2022 GLOBAL REPROCESSED" run.
# NOTE: every configuration cell assigns the same variable names, so only the
# most recently executed one is used by the pipeline cells.

# Time window of the data to download and preprocess
min_time, max_time = '2022-01-01', '2022-12-31'

# Spatial bounding box
min_lon, max_lon = -180, 180
min_lat, max_lat = -80, 90

# Copernicus dataset id; "{}" stands in for the satellite name
copernicus_dataset_id = "cmems_obs-sl_glo_phy-ssh_my_{}-l3-duacs_PT1S"

# Satellites to download: `satellites` feed the input, `ref_satellites` the reference
# (no input satellite is listed in this configuration)
satellites = []
ref_satellites = ['s6a-lr']

# Download directories; "{}" is replaced by the satellite name
download_sat_input_dir = '/DATASET/OCB_traces/dl/2022_reprocessed/input/{}'
download_sat_ref_dir = '/DATASET/OCB_traces/dl/2022_reprocessed/ref/{}'

# Output files of the concatenation step
concatenated_input_path = "/DATASET/OCB_traces/concat/concatenated_input_2022_reprocessed.nc"
concatenated_ref_path = "/DATASET/OCB_traces/concat/concatenated_ref_2022_reprocessed.nc"

# Output file of the gridding step
gridded_input_path = "/DATASET/OCB_traces/input_test_6sat_2022_reprocessed.nc"

#### 2017

In [None]:
# Settings for the "2017" run.
# NOTE: every configuration cell assigns the same variable names, so only the
# most recently executed one is used by the pipeline cells.

# Time window of the data to download and preprocess
min_time, max_time = '2017-01-01', '2017-12-31'

# Spatial bounding box
min_lon, max_lon = -66, -54
min_lat, max_lat = 32, 44

# Copernicus dataset id, with "{}" in place of the satellite name
copernicus_dataset_id = "cmems_obs-sl_glo_phy-ssh_my_{}-l3-duacs_PT1S"

# Satellites to download: `satellites` feed the input, `ref_satellites` the reference
satellites = ['alg', 'h2ag', 'j2g', 'j2n', 'j3', 's3a']
ref_satellites = ['c2']

# Download directories, with "{}" in place of the satellite name
download_sat_input_dir = '/DATASET/OCB_traces/dl/2017/input/{}'
download_sat_ref_dir = '/DATASET/OCB_traces/dl/2017/ref/{}'

# Output files of the concatenation step
concatenated_input_path = "/DATASET/OCB_traces/concat/concatenated_input_2017.nc"
concatenated_ref_path = "/DATASET/OCB_traces/concat/concatenated_ref_2017.nc"

# Output file of the gridding step
gridded_input_path = "/DATASET/OCB_traces/input_test_6sat_2017.nc"

#### LOGIN

In [3]:
import os

# Log in with your free Copernicus account.
# The variables are set through os.environ so that they live in this kernel's
# own environment. The previous os.system('export ...') calls each ran in a
# separate, short-lived shell, so nothing they exported was visible to the
# download calls made from this notebook.
# Replace the <...> placeholders locally; do not commit real credentials.
os.environ['COPERNICUSMARINE_CACHE_DIRECTORY'] = '/tmp'
os.environ['COPERNICUSMARINE_SERVICE_USERNAME'] = '<your_username>'
os.environ['COPERNICUSMARINE_SERVICE_PASSWORD'] = '<your_password>'

0

## DOWNLOAD

In [3]:
# INPUT DATA
# One download per input satellite; the satellite name fills the "{}" slot of
# the input download directory template.
for sat in satellites:
    download_copernicus_data_for_sat(
        sat=sat,
        download_dir=download_sat_input_dir.format(sat),
        min_time=min_time,
        max_time=max_time,
        copernicus_dataset_id=copernicus_dataset_id,
    )

Starting


Fetching catalog: 100%|██████████| 3/3 [00:11<00:00,  3.69s/it]


INFO - 2024-07-18T14:34:04Z - Dataset version was not specified, the latest one was selected: "202311"
INFO - 2024-07-18T14:34:04Z - Dataset part was not specified, the first one was selected: "default"
INFO - 2024-07-18T14:34:04Z - Service was not specified, the default one was selected: "original-files"
INFO - 2024-07-18T14:34:04Z - Downloading using service original-files...


100%|██████████| 365/365 [02:51<00:00,  2.13it/s]


Starting output validation
Succesfully validated output
Starting
INFO - 2024-07-18T14:36:59Z - Dataset version was not specified, the latest one was selected: "202311"
INFO - 2024-07-18T14:36:59Z - Dataset part was not specified, the first one was selected: "default"
INFO - 2024-07-18T14:36:59Z - Service was not specified, the default one was selected: "original-files"
INFO - 2024-07-18T14:36:59Z - Downloading using service original-files...


100%|██████████| 365/365 [02:34<00:00,  2.36it/s]


Starting output validation
Succesfully validated output
Starting
INFO - 2024-07-18T14:39:37Z - Dataset version was not specified, the latest one was selected: "202311"
INFO - 2024-07-18T14:39:37Z - Dataset part was not specified, the first one was selected: "default"
INFO - 2024-07-18T14:39:37Z - Service was not specified, the default one was selected: "original-files"
INFO - 2024-07-18T14:39:37Z - Downloading using service original-files...


100%|██████████| 361/361 [02:23<00:00,  2.51it/s]


Starting output validation
Succesfully validated output
Starting
INFO - 2024-07-18T14:42:05Z - Dataset version was not specified, the latest one was selected: "202311"
INFO - 2024-07-18T14:42:05Z - Dataset part was not specified, the first one was selected: "default"
INFO - 2024-07-18T14:42:05Z - Service was not specified, the default one was selected: "original-files"
INFO - 2024-07-18T14:42:05Z - Downloading using service original-files...


100%|██████████| 243/243 [01:36<00:00,  2.52it/s]


Starting output validation
Succesfully validated output
Starting
INFO - 2024-07-18T14:43:46Z - Dataset version was not specified, the latest one was selected: "202311"
INFO - 2024-07-18T14:43:46Z - Dataset part was not specified, the first one was selected: "default"
INFO - 2024-07-18T14:43:46Z - Service was not specified, the default one was selected: "original-files"
INFO - 2024-07-18T14:43:46Z - Downloading using service original-files...


100%|██████████| 365/365 [02:18<00:00,  2.63it/s]


Starting output validation
Succesfully validated output
Starting
INFO - 2024-07-18T14:46:07Z - Dataset version was not specified, the latest one was selected: "202311"
INFO - 2024-07-18T14:46:07Z - Dataset part was not specified, the first one was selected: "default"
INFO - 2024-07-18T14:46:07Z - Service was not specified, the default one was selected: "original-files"
INFO - 2024-07-18T14:46:07Z - Downloading using service original-files...


100%|██████████| 365/365 [02:23<00:00,  2.54it/s]

Starting output validation
Succesfully validated output





In [4]:
# REFERENCE DATA
# One download per reference satellite; the satellite name fills the "{}" slot
# of the reference download directory template.
for sat in ref_satellites:
    download_copernicus_data_for_sat(
        sat=sat,
        download_dir=download_sat_ref_dir.format(sat),
        min_time=min_time,
        max_time=max_time,
        copernicus_dataset_id=copernicus_dataset_id,
        # NOTE(review): only the reference download passes this flag; confirm
        # whether the input download should use it as well.
        no_metadata_cache=True,
    )

Starting
username:password:

Fetching catalog: 100%|██████████| 3/3 [00:16<00:00,  5.61s/it]

INFO - 2024-09-02T20:08:05Z - Dataset version was not specified, the latest one was selected: "202207"
INFO - 2024-09-02T20:08:05Z - Dataset part was not specified, the first one was selected: "default"
INFO - 2024-09-02T20:08:05Z - Service was not specified, the default one was selected: "original-files"
INFO - 2024-09-02T20:08:05Z - Downloading using service original-files...



100%|██████████| 485/485 [02:09<00:00,  3.74it/s]

Starting output validation
Succesfully validated output





## CONCATENATE

In [4]:
# INPUT DATA
# Formatting the template with '' gives the directory that holds all the
# per-satellite input download folders.
filt_daily_ssh_data(
    input_dir=download_sat_input_dir.format(''),
    output_path=concatenated_input_path,
    min_time=min_time, max_time=max_time,
    min_lon=min_lon, max_lon=max_lon,
    min_lat=min_lat, max_lat=max_lat,
)

Starting
Starting input validation
Succesfully validated input
Starting output validation
Succesfully validated output


In [5]:
# REFERENCE DATA
# Formatting the template with '' gives the directory that holds all the
# per-satellite reference download folders.
filt_daily_ssh_data(
    input_dir=download_sat_ref_dir.format(''),
    output_path=concatenated_ref_path,
    min_time=min_time, max_time=max_time,
    min_lon=min_lon, max_lon=max_lon,
    min_lat=min_lat, max_lat=max_lat,
)

Starting
Starting input validation
Succesfully validated input
Starting output validation
Succesfully validated output


## GRIDDING

In [3]:
# INPUT DATA
# Grid the concatenated input file and write the result to gridded_input_path.
grid_input(
    input_path=concatenated_input_path,
    output_path=gridded_input_path,
    min_time=min_time, max_time=max_time,
    min_lon=min_lon, max_lon=max_lon,
    min_lat=min_lat, max_lat=max_lat,
    # alternative value left by the author: degrees=0.083
    degrees=0.25,
)

## TESTS

In [4]:
# Load both pipeline outputs and print the time span each one covers.
gridded_input_data = xr.load_dataset(gridded_input_path)
concat_ref_data = xr.load_dataset(concatenated_ref_path)

gridded_times = gridded_input_data.time.values
print('GRIDDED INPUT\nMIN TIME: {}\nMAX TIME: {}'.format(gridded_times.min(), gridded_times.max()))

ref_times = concat_ref_data.time.values
print('\n\nCONCAT REF\nMIN TIME: {}\nMAX TIME: {}'.format(ref_times.min(), ref_times.max()))



CONCAT REF
MIN TIME: 2023-01-01T00:02:36.999199232
MAX TIME: 2023-06-07T15:19:13.702756864
