# Data Preprocessing
This notebook walks the user through selecting a survey and preprocessing all of its raw data into the inputs necessary to run GARPOS.  

In [None]:
import os
from pathlib import Path
import logging
import pandas as pd
import json

from es_sfgtools.pipeline import DataHandler,DATA_TYPE,FILE_TYPE,DataCatalog
from es_sfgtools.utils.archive_pull import (
    list_survey_files
    )

## Step 1. Initial Setup


#### Browse available surveys from the community archive and select target
- Locate the survey of interest in https://gage-data.earthscope.org/archive/seafloor and note its `network`, `station`, and `survey` names; enter them in the cell below (the station name is assigned to the `site` variable).
- To process a new survey with this notebook, its data must first be submitted to, and made available from, the community archive.

In [None]:
# Survey selection: names as they appear in the community archive
network = 'aleutian'
site = 'IVB1'
survey = '2018_A_SFG1'

#### DEFAULTS BELOW -- CHANGE ONLY IF NEEDED ####

# Local working directory for this survey: <cwd>/data/<network>_<site>_<survey>.
# Joined with pathlib rather than string interpolation so the result stays
# well-formed even when the current directory is the filesystem root.
data_dir = Path.cwd() / "data" / f"{network}_{site}_{survey}"

# Configure root logging: INFO level, message text only.
# Alternative format with timestamp and level:
#   "{asctime} - {levelname} - {message}"
logging.basicConfig(
    level=logging.INFO,
    format="{message}",
    style="{",
    datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger()

# Initialize the data handler with the survey working directory
data_handler = DataHandler(working_dir=data_dir)

## Step 2. Inventory available data and its location

In [None]:
# Build the list of files available for this survey in the remote archive
# TODO: implement options for raw vs intermediate vs processed
remote_filepaths = list_survey_files(
    network=network,
    station=site,
    survey=survey,
    show_details=True,
)

In [None]:
# Report what the data handler already finds locally, one log line per entry
data_type_counts = data_handler.get_local_counts()
logger.info("Local data directory contains the following:")
for data_type, count in data_type_counts.items():
    logger.info(f"    {data_type}: {count}")

## Step 3. Pull data from remote archive

In [None]:
# Add the remote files found above to the local catalog. This only builds an
# inventory; the downloading itself happens in a later step.
# TODO: Detail counts of files local vs only remote
data_handler.add_campaign_data(
    network=network,
    station=site,
    survey=survey,
    remote_filepaths=remote_filepaths,
)

# Report the catalog contents, one log line per entry
data_type_counts = data_handler.get_dtype_counts()
logger.info("Catalog now contains the following:")
for data_type, count in data_type_counts.items():
    logger.info(f"    {data_type}: {count}")

#### Select file types for downloading
Observable file types depend on whether data was collected with an SV2 or SV3 waveglider.  

![GARPOS processing flow diagram](garpos_flow.jpg)

In [None]:
# Download the files by type.
# Each entry maps a file type to the show_details flag used for that request;
# only uncommented entries are downloaded, so enable others as needed.
download_requests = {
    'sonardyne': False,
    # 'novatel': True,
    # 'master': False,
    # 'svpavg': False,
    # 'leverarm': False,
}
for file_type, show_details in download_requests.items():
    data_handler.download_campaign_data(
        network=network,
        station=site,
        survey=survey,
        file_type=file_type,
        show_details=show_details,
    )

## Step 4. Parse/Process raw data to processing input schemas

- 4.1 Parse acoustic observations into AcousticDataFrames
- 4.2 Parse IMU observations into IMUDataFrames
- 4.3 Process GNSS observables to generate PositionDataFrames
    - Parse RANGE-A novatel messages, build RINEX files
    - Run PRIDE-PPP-AR on RINEX, generate Kin files
    - Parse Kin files into PositionDataFrames
- 4.4 Parse metadata files into SiteConfig

### 4.1 Take all acoustic parent files and generate acoustic DataFrames

In [None]:
# Step 4.1: generate the acoustic DataFrames from the acoustic parent files
data_handler.process_acoustic_data(
    network=network,
    station=site,
    survey=survey,
    override=False,
    show_details=False,
)

### 4.2 Take all IMU parent files and generate IMU DataFrames

In [None]:
# Step 4.2: generate the IMU DataFrames from the IMU parent files
data_handler.process_imu_data(
    network=network,
    station=site,
    survey=survey,
    override=False,
    show_details=False,
)

### 4.3 Take all GNSS parent files and generate GNSS DataFrames

In [None]:
# Step 4.3, first GNSS stage.
# NOTE(review): presumably the "build RINEX files" step of the 4.3 outline,
# judging by the method name -- confirm.
data_handler.process_rinex(
    network=network,
    station=site,
    survey=survey,
    override=False,
    show_details=False,
)

In [None]:
# Step 4.3, second GNSS stage.
# NOTE(review): presumably the step that produces Kin files from RINEX,
# judging by the method name -- confirm.
data_handler.process_gnss_data_kin(
    network=network,
    station=site,
    survey=survey,
    override=False,
    show_details=False,
)

In [None]:
# Step 4.3, third GNSS stage.
# NOTE(review): presumably parses the Kin files into PositionDataFrames,
# per the 4.3 outline -- confirm.
data_handler.process_gnss_data(
    network=network,
    station=site,
    survey=survey,
    override=False,
    show_details=False,
)

### 4.4 Take all site parent files and generate the site configuration data, which includes ATD offset, sound-velocity DF, and transponder info

In [None]:
# Step 4.4: generate the site configuration data from the site parent files
data_handler.process_siteconfig(
    network=network,
    station=site,
    survey=survey,
    show_details=False,
)

## Step 5. Build ShotDataFrame
- Combine the AcousticDataFrame, ImuDataFrame, and PositionDataFrame into a single ShotDataFrame, interpolating the IMU and Position data to match the 15s sample interval in the Acoustic Data.  
- Store this ShotDataFrame as csv or tab-delimited ascii.
- If a ShotDataFrame csv already exists, this step can be skipped and the ShotDataFrame loaded from that csv instead.