# PREPROCESSING DATA PIPELINE

#### IMPORTS

In [1]:
import matplotlib.pyplot as plt
import xarray as xr

from src.data_pipeline.ose_data_pipeline import download_copernicus_data_for_sat, filt_daily_ssh_data, grid_input

  from .autonotebook import tqdm as notebook_tqdm


## REAL DATA TEST

### Pipeline must:

- #### **Download**, **concatenate** and **grid** the *input* nadir data
- #### **Download** and **concatenate** the *reference* nadir data

### VALUES TO BE DEFINED:

***RUN ONLY THE CELL CONTAINING THE DATA YOU ARE INTERESTED IN***

*If you want to include a new dataset to be processed, create a new cell and define all the necessary values*

#### 2023 REPROCESSED

In [2]:
# --- TIME WINDOW (date strings) ---
min_time, max_time = '2023-01-01', '2023-12-31'

# --- BOUNDING BOX (longitude / latitude limits) ---
min_lon, max_lon = -66, -54
min_lat, max_lat = 32, 44

# --- COPERNICUS DATASET ID ---
# The "{}" placeholder stands in for the satellite name.
copernicus_dataset_id = "cmems_obs-sl_glo_phy-ssh_my_{}-l3-duacs_PT1S"

# --- SATELLITES TO DOWNLOAD ---
# No input satellites are listed for this dataset, only a reference one.
ref_satellites = ['s6a-lr']
satellites = []

# --- DOWNLOAD DIRECTORIES ("{}" IS REPLACED BY THE SATELLITE NAME) ---
download_sat_ref_dir = 'data/dl/2023_reprocessed/ref/{}'
download_sat_input_dir = 'data/dl/2023_reprocessed/input/{}'

# --- CONCATENATED DATA FILES ---
concatenated_ref_path = "data/concat/concatenated_ref_2023_reprocessed.nc"
concatenated_input_path = "data/concat/concatenated_input_2023_reprocessed.nc"

# --- GRIDDED INPUT FILE ---
gridded_input_path = "data/gridded/input_test_6sat_2023_reprocessed.nc"

#### 2023 NRT

In [2]:
# --- TIME WINDOW (date strings) ---
min_time, max_time = '2023-01-01', '2023-12-31'

# --- BOUNDING BOX (longitude / latitude limits) ---
min_lon, max_lon = -66, -54
min_lat, max_lat = 32, 44

# --- COPERNICUS DATASET ID ---
# The "{}" placeholder stands in for the satellite name.
copernicus_dataset_id = "cmems_obs-sl_glo_phy-ssh_nrt_{}-l3-duacs_PT1S"

# --- SATELLITES TO DOWNLOAD ---
# Reference satellites are kept separate from the input ones.
ref_satellites = ['s6a-hr', 'swon']
satellites = ['al', 'c2n', 'h2b', 'j3n', 's3a', 's3b']

# --- DOWNLOAD DIRECTORIES ("{}" IS REPLACED BY THE SATELLITE NAME) ---
download_sat_ref_dir = 'data/dl/2023/ref/{}'
download_sat_input_dir = 'data/dl/2023/input/{}'

# --- CONCATENATED DATA FILES ---
concatenated_ref_path = "data/concat/concatenated_ref_2023.nc"
concatenated_input_path = "data/concat/concatenated_input_2023.nc"

# --- GRIDDED INPUT FILE ---
gridded_input_path = "data/gridded/input_test_6sat_2023.nc"

#### 2022 NRT

In [None]:
# --- TIME WINDOW (date strings) ---
min_time, max_time = '2022-01-01', '2022-12-31'

# --- BOUNDING BOX (longitude / latitude limits) ---
min_lon, max_lon = -66, -54
min_lat, max_lat = 32, 44

# --- COPERNICUS DATASET ID ---
# The "{}" placeholder stands in for the satellite name.
copernicus_dataset_id = "cmems_obs-sl_glo_phy-ssh_nrt_{}-l3-duacs_PT1S"

# --- SATELLITES TO DOWNLOAD ---
# Reference satellites are kept separate from the input ones.
ref_satellites = ['s6a-hr', 'swon']
satellites = ['al', 'c2n', 'h2b', 'j3n', 's3a', 's3b']

# --- DOWNLOAD DIRECTORIES ("{}" IS REPLACED BY THE SATELLITE NAME) ---
download_sat_ref_dir = 'data/dl/2022/ref/{}'
download_sat_input_dir = 'data/dl/2022/input/{}'

# --- CONCATENATED DATA FILES ---
concatenated_ref_path = "data/concat/concatenated_ref_2022.nc"
concatenated_input_path = "data/concat/concatenated_input_2022.nc"

# --- GRIDDED INPUT FILE ---
gridded_input_path = "data/gridded/input_test_6sat_2022.nc"

#### 2017 REPROCESSED

In [None]:
# --- TIME WINDOW (date strings) ---
min_time, max_time = '2017-01-01', '2017-12-31'

# --- BOUNDING BOX (longitude / latitude limits) ---
min_lon, max_lon = -66, -54
min_lat, max_lat = 32, 44

# --- COPERNICUS DATASET ID ---
# The "{}" placeholder stands in for the satellite name.
copernicus_dataset_id = "cmems_obs-sl_glo_phy-ssh_my_{}-l3-duacs_PT1S"

# --- SATELLITES TO DOWNLOAD ---
# Reference satellites are kept separate from the input ones.
ref_satellites = ['c2']
satellites = ['alg', 'h2ag', 'j2g', 'j2n', 'j3', 's3a']

# --- DOWNLOAD DIRECTORIES ("{}" IS REPLACED BY THE SATELLITE NAME) ---
download_sat_ref_dir = 'data/dl/2017/ref/{}'
download_sat_input_dir = 'data/dl/2017/input/{}'

# --- CONCATENATED DATA FILES ---
concatenated_ref_path = "data/concat/concatenated_ref_2017.nc"
concatenated_input_path = "data/concat/concatenated_input_2017.nc"

# --- GRIDDED INPUT FILE ---
gridded_input_path = "data/gridded/input_test_6sat_2017.nc"

## DOWNLOAD

In [6]:
# INPUT DATA
# One download call per input satellite; each satellite gets its own
# directory, built by filling the "{}" of the directory template with its name.
for sat in satellites:
    download_copernicus_data_for_sat(
        sat=sat,
        download_dir=download_sat_input_dir.format(sat),
        copernicus_dataset_id=copernicus_dataset_id,
        min_time=min_time,
        max_time=max_time,
    )

Starting


Fetching catalog: 100%|██████████| 3/3 [00:10<00:00,  3.47s/it]
INFO - 2024-07-18T10:59:21Z - Dataset version was not specified, the latest one was selected: "202311"
INFO - 2024-07-18T10:59:21Z - Dataset part was not specified, the first one was selected: "default"
INFO - 2024-07-18T10:59:21Z - Service was not specified, the default one was selected: "original-files"
INFO - 2024-07-18T10:59:21Z - Downloading using service original-files...
100%|██████████| 919/919 [06:33<00:00,  2.33it/s]


Starting output validation
Succesfully validated output
Starting


Fetching catalog: 100%|██████████| 3/3 [00:10<00:00,  3.59s/it]
INFO - 2024-07-18T11:06:08Z - Dataset version was not specified, the latest one was selected: "202311"
INFO - 2024-07-18T11:06:08Z - Dataset part was not specified, the first one was selected: "default"
INFO - 2024-07-18T11:06:08Z - Service was not specified, the default one was selected: "original-files"
INFO - 2024-07-18T11:06:08Z - Downloading using service original-files...
100%|██████████| 929/929 [06:36<00:00,  2.34it/s]


Starting output validation
Succesfully validated output
Starting


Fetching catalog: 100%|██████████| 3/3 [00:11<00:00,  3.94s/it]
INFO - 2024-07-18T11:12:59Z - Dataset version was not specified, the latest one was selected: "202311"
INFO - 2024-07-18T11:12:59Z - Dataset part was not specified, the first one was selected: "default"
INFO - 2024-07-18T11:12:59Z - Service was not specified, the default one was selected: "original-files"
INFO - 2024-07-18T11:12:59Z - Downloading using service original-files...
100%|██████████| 904/904 [06:57<00:00,  2.17it/s]


Starting output validation
Succesfully validated output
Starting


Fetching catalog: 100%|██████████| 3/3 [00:11<00:00,  3.88s/it]
INFO - 2024-07-18T11:20:10Z - Dataset version was not specified, the latest one was selected: "202311"
INFO - 2024-07-18T11:20:10Z - Dataset part was not specified, the first one was selected: "default"
INFO - 2024-07-18T11:20:10Z - Service was not specified, the default one was selected: "original-files"
INFO - 2024-07-18T11:20:10Z - Downloading using service original-files...
100%|██████████| 808/808 [06:23<00:00,  2.11it/s]


Starting output validation
Succesfully validated output
Starting


Fetching catalog: 100%|██████████| 3/3 [00:12<00:00,  4.21s/it]
INFO - 2024-07-18T11:26:48Z - Dataset version was not specified, the latest one was selected: "202311"
INFO - 2024-07-18T11:26:49Z - Dataset part was not specified, the first one was selected: "default"
INFO - 2024-07-18T11:26:49Z - Service was not specified, the default one was selected: "original-files"
INFO - 2024-07-18T11:26:49Z - Downloading using service original-files...
100%|██████████| 930/930 [07:08<00:00,  2.17it/s]


Starting output validation
Succesfully validated output
Starting


Fetching catalog: 100%|██████████| 3/3 [00:11<00:00,  3.82s/it]
INFO - 2024-07-18T11:34:11Z - Dataset version was not specified, the latest one was selected: "202311"
INFO - 2024-07-18T11:34:11Z - Dataset part was not specified, the first one was selected: "default"
INFO - 2024-07-18T11:34:11Z - Service was not specified, the default one was selected: "original-files"
INFO - 2024-07-18T11:34:11Z - Downloading using service original-files...
100%|██████████| 930/930 [07:18<00:00,  2.12it/s]


Starting output validation
Succesfully validated output


In [3]:
# REFERENCE DATA
# One download call per reference satellite; each satellite gets its own
# directory, built by filling the "{}" of the directory template with its name.
for sat in ref_satellites:
    download_copernicus_data_for_sat(
        sat=sat,
        download_dir=download_sat_ref_dir.format(sat),
        copernicus_dataset_id=copernicus_dataset_id,
        min_time=min_time,
        max_time=max_time,
    )

Starting


Fetching catalog: 100%|██████████| 3/3 [00:10<00:00,  3.45s/it]
INFO - 2024-07-18T10:54:49Z - Dataset version was not specified, the latest one was selected: "202207"
INFO - 2024-07-18T10:54:49Z - Dataset part was not specified, the first one was selected: "default"
INFO - 2024-07-18T10:54:49Z - Service was not specified, the default one was selected: "original-files"
INFO - 2024-07-18T10:54:49Z - Downloading using service original-files...
100%|██████████| 349/349 [02:56<00:00,  1.98it/s]

Starting output validation
Succesfully validated output





## CONCATENATE

In [7]:
# INPUT DATA
# Formatting the template with '' drops the satellite name, leaving the parent
# directory of the per-satellite download folders.
# NOTE(review): presumably filt_daily_ssh_data walks every sub-folder - confirm.
filt_daily_ssh_data(
    input_dir=download_sat_input_dir.format(''),
    output_path=concatenated_input_path,
    # time window
    min_time=min_time, max_time=max_time,
    # bounding box
    min_lon=min_lon, max_lon=max_lon,
    min_lat=min_lat, max_lat=max_lat,
)

Starting
Starting input validation
Succesfully validated input
Starting output validation
Succesfully validated output


In [4]:
# REFERENCE DATA
# Formatting the template with '' drops the satellite name, leaving the parent
# directory of the per-satellite download folders.
# NOTE(review): presumably filt_daily_ssh_data walks every sub-folder - confirm.
filt_daily_ssh_data(
    input_dir=download_sat_ref_dir.format(''),
    output_path=concatenated_ref_path,
    # time window
    min_time=min_time, max_time=max_time,
    # bounding box
    min_lon=min_lon, max_lon=max_lon,
    min_lat=min_lat, max_lat=max_lat,
)

Starting
Starting input validation
Succesfully validated input
Starting output validation
Succesfully validated output


## GRIDDING

In [3]:
# INPUT DATA
# Reads the concatenated input file and writes the gridded result to
# gridded_input_path, using the same time window and bounding box as above.
grid_input(
    input_path=concatenated_input_path,
    output_path=gridded_input_path,
    # time window
    min_time=min_time, max_time=max_time,
    # bounding box
    min_lon=min_lon, max_lon=max_lon,
    min_lat=min_lat, max_lat=max_lat,
    # NOTE(review): presumably the grid step in degrees (about 1/12) - confirm
    degrees=0.083,
)

## TESTS

In [None]:
# Load both pipeline outputs and print the time span each one covers, so it
# can be compared by eye with the configured min_time / max_time.
gridded_input_data = xr.load_dataset(gridded_input_path)
concat_ref_data = xr.load_dataset(concatenated_ref_path)

print(
    'GRIDDED INPUT\nMIN TIME: {}\nMAX TIME: {}'.format(
        gridded_input_data.time.values.min(),
        gridded_input_data.time.values.max(),
    )
)
print(
    '\n\nCONCAT REF\nMIN TIME: {}\nMAX TIME: {}'.format(
        concat_ref_data.time.values.min(),
        concat_ref_data.time.values.max(),
    )
)