# Data Preprocessing

This notebook walks the user through selecting a survey and preprocessing its raw data into the inputs needed to run GARPOS.

In [1]:
import os
import sys
from pathlib import Path

from es_sfgtools.processing.pipeline import DataHandler
from es_sfgtools.utils.archive_pull import (
    list_campaign_files_by_type
    )
from es_sfgtools.utils.loggers import BaseLogger

from sfg_metadata import META_DATA,GEOLAB_CATALOG

### Confirm required environment variables are set

In [2]:
# DYLD_LIBRARY_PATH must be set correctly for the GO executables to translate
# NovAtel files to RINEX.

# Linux: inspect the equivalent variable with
#!echo $LD_LIBRARY_PATH

# Mac: point at the lib directory of the environment the kernel is running in,
# instead of a user-specific absolute path, so the notebook works on any machine.
os.environ['DYLD_LIBRARY_PATH'] = os.path.join(sys.prefix, "lib")
# Display the value so the reader can confirm it.
os.getenv('DYLD_LIBRARY_PATH')

'/Users/gottlieb/miniconda3/envs/seafloor_geodesy_mac/lib'

In [3]:
# Confirm the PRIDE-PPPAR executable (pdp3) is on the PATH.
# The shell command prints the resolved location of pdp3 when it is found.
!which pdp3

/Users/gottlieb/.PRIDE_PPPAR_BIN/pdp3


## Step 1. Initial Setup


#### Browse available surveys from the community archive and select target
- Locate the survey of interest in https://gage-data.earthscope.org/archive/seafloor and note its `network`, `station`, and `survey` names; enter them in the cell below as `network`, `site`, and `campaign`. Leave `vessel_type` as `SV3` unless you know you are working with older SV2 data.
- To process a new survey with this notebook, its data must first be submitted to, and made available from, the community archive.

In [4]:
# Survey selection -- edit these to match the archive entry you want to process.
network = 'alaska-shumagins'
site = 'IVB1'
campaign = '2022_A_1049'  # alternate example: '2024_A_1126'
vessel_type = 'SV3'

# Local data directory handed to the DataHandler; created if it does not exist.
data_dir = Path(os.path.expanduser('~/data/sfg'))
data_dir.mkdir(parents=True, exist_ok=True)

# --- Use the defaults below unless you need to customize them ---
data_handler = DataHandler(directory=data_dir)
data_handler.change_working_station(network=network, station=site, campaign=campaign)
# Send log output to the directory the handler reports for this station.
BaseLogger.set_dir(data_handler.station_log_dir)

# Build the pipeline and its config for the selected vessel type.
if vessel_type not in ('SV3', 'SV2'):
    raise ValueError(f"Vessel type {vessel_type} not recognized")
pipeline, config = (
    data_handler.get_pipeline_sv3()
    if vessel_type == 'SV3'
    else data_handler.get_pipeline_sv2()
)



Building directory structure for alaska-shumagins IVB1 2022_A_1049
No date range set for alaska-shumagins, IVB1, 2022_A_1049
Building TileDB arrays for IVB1
Changed working station to alaska-shumagins IVB1


## Step 2. Inventory available data and its location

In [7]:
# Update the catalog with the remote paths of the data available in the
# archive for the selected network/station/campaign.
data_handler.update_catalog_from_archive()

Updating catalog with remote paths of available data for alaska-shumagins IVB1 2022_A_1049
Listing raw campaign files from url https://data.earthscope.org/archive/seafloor/alaska-shumagins/2022/IVB1/2022_A_1049/raw
Found under https://data.earthscope.org/archive/seafloor/alaska-shumagins/2022/IVB1/2022_A_1049/raw:
    120 NOV000 file(s)
    50 NOV770 file(s)
    8 DFOP00 file(s)
Listing metadata campaign files from url https://data.earthscope.org/archive/seafloor/alaska-shumagins/2022/IVB1/2022_A_1049/metadata
Found under https://data.earthscope.org/archive/seafloor/alaska-shumagins/2022/IVB1/2022_A_1049/metadata:
    1 master file(s)
    1 lever_arms file(s)
    2 ctd file(s)
31 files not recognized and skipped
181 files already exist in the catalog
Added 0 out of 181 files to the catalog


In [8]:
# Report how many entries of each data type the handler counts locally
data_type_counts = data_handler.get_dtype_counts()
print("Local data directory contains the following:")
for dtype_name, n_files in data_type_counts.items():
    print(f"    {dtype_name}: {n_files}")

Local data directory contains the following:
    ctd: 1
    svp: 1


## Step 3. Pull data from remote archive

#### Select file types for downloading
Observable file types depend on whether data was collected with an SV2 or SV3 waveglider.  

![Alt text](garpos_flow.jpg)

In [None]:
# Download files from the remote archive, one file type per call.
# Only the seabird download is active; uncomment other calls as needed.
# NOTE(review): some commented-out calls use `file_type=` / `show_details=`
# while the active call uses `file_types=` -- confirm the current
# download_data signature before re-enabling them.
# data_handler.download_data(file_type='sonardyne', show_details=False)
# data_handler.download_data(file_type='novatel', show_details=False)
#data_handler.download_data(file_types='master')
data_handler.download_data(file_types='seabird')
# data_handler.download_data(file_type='leverarm', show_details=False)

#data_handler.download_data(file_types='dfop00')
#data_handler.download_data(file_types='novatel770')

In [None]:
from es_sfgtools.processing.operations.site_ops import (ctd_to_soundvelocity, CTDfile_to_svp, seabird_to_soundvelocity)

# Build the CTD file path from data_dir (set in the setup cell) instead of a
# user-specific absolute path, so the cell also works on other machines.
# NOTE(review): this path points at station SEM1 while the working station set
# above is IVB1 -- confirm which station's CTD file is intended.
ctd_path = str(
    data_dir
    / "alaska-shumagins"
    / "SEM1"
    / "2022_A_1049"
    / "raw"
    / "skq201811s_ctd001svpavg.cnv"
)
# Run the seabird-to-sound-velocity conversion on the CTD file.
seabird_to_soundvelocity(ctd_path)

## Step 4. Parse/Process raw data to processing input schemas

- 4.1 Parse acoustic observations into AcousticDataFrames
- 4.2 Parse IMU observations into IMUDataFrames
- 4.3 Process GNSS observables to generate PositionDataFrames
    - Parse RANGE-A novatel messages, build RINEX files
    - Run PRIDE-PPP-AR on RINEX, generate Kin files
    - Parse Kin files into PositionDataFrames
- 4.4 Parse metadata files into SiteConfig

### 4.1 Process and read DFOP00 files 

In [None]:
# Optional: uncomment the lines below to change the DFOP00 settings and assign
# the modified config to the pipeline before processing.
# (presumably `override` forces reprocessing of existing results -- TODO confirm)
#config.dfop00_config.override=True
#config.dfop00_config.show_details=True
#pipeline.config = config
# Step 4.1: process the DFOP00 files.
pipeline.process_dfop00()

### 4.3 Take all GNSS parent files and generate GNSS dataframes

In [None]:
# Optional: uncomment to set the NovAtel override flag and assign the modified
# config to the pipeline before running.
#config.novatel_config.override=True
#pipeline.config = config
# Run the NovAtel pre-processing stage.
pipeline.pre_process_novatel()

In [None]:
# Optional: uncomment to set n_processes for the RINEX step
# (presumably the number of parallel workers -- TODO confirm).
# config.rinex_config.n_processes=2
# Set the RINEX override flag, then assign the updated config to the pipeline
# so the change is in place before the RINEX files are requested.
config.rinex_config.override=True
pipeline.config = config
pipeline.get_rinex_files()

In [None]:
# Set the RINEX override flag again (repeats the previous cell, so this cell
# can be run on its own) and assign the config to the pipeline before processing.
config.rinex_config.override=True
pipeline.config = config   
# Process the RINEX files (presumably the PRIDE-PPP-AR step that produces Kin
# files, per the Step 4.3 outline -- TODO confirm).
pipeline.process_rinex()

In [None]:
# Process the Kin files (presumably parsing them into PositionDataFrames, per
# the Step 4.3 outline -- TODO confirm).
pipeline.process_kin()

In [None]:
# Final step of this notebook: update the shot data.
# NOTE(review): presumably merges the processed results above into the
# shot data -- confirm against the pipeline documentation.
pipeline.update_shotdata()