# Data Preprocessing
This notebook walks the user through selecting a survey and preprocessing all of its raw data into the inputs needed to run GARPOS.

In [None]:
import os
from pathlib import Path
import pandas as pd

from es_sfgtools.pipeline import DataHandler
from es_sfgtools.utils.archive_pull import (
    list_survey_files
    )

## Step 1. Initial Setup


#### Browse available surveys from the community archive and select target
- Locate the survey of interest in https://gage-data.earthscope.org/archive/seafloor, and note the `network`, `station`, and `survey` names, which will be input in the cell below.
- To process a new survey with this notebook, its data must first be submitted to, and made available from, the community archive.

## Step 2. Inventory available data and its location

In [None]:
# Survey selection: the network, station (site), and survey names as they
# appear in the community archive.
network = "aleutian"
site = "SPT1"
survey = "2020_A_UOH1"
# Alternative survey (uncomment to use it instead):
# site = "SEM1"
# survey = "2022_A_1049"

#### USE THE FOLLOWING DEFAULTS UNLESS OTHERWISE DESIRED ####

# Local data directory: a "data" folder under the current working directory.
# Joined with pathlib rather than string interpolation so no path separator
# is hard-coded.
data_dir = Path.cwd() / "data"

# Handler object that every later cell in this notebook operates on.
data_handler = DataHandler(network=network, station=site, survey=survey, data_dir=data_dir)

In [None]:
# Ask the remote archive which files it holds for the selected survey.
# TODO: implement options for raw vs intermediate vs processed
remote_filepaths = list_survey_files(
    network=network,
    station=site,
    survey=survey,
    show_details=True,
)

In [None]:
# Summarize what already exists locally: one line per data type with the
# count reported by the handler.
data_type_counts = data_handler.get_dtype_counts()
print("Local data directory contains the following:")
# Unpack each (type, count) pair instead of indexing into the tuple.
for dtype_name, dtype_count in data_type_counts.items():
    print(f"    {dtype_name}: {dtype_count}")

## Step 3. Pull data from remote archive

In [None]:
# Register the remote files found above in the local catalog. This only
# builds the inventory; the actual download happens in a later step.
# TODO: Detail counts of files local vs only remote
data_handler.add_data_remote(
    remote_filepaths=remote_filepaths
)


#### Select file types for downloading
Observable file types depend on whether data was collected with an SV2 or SV3 waveglider.  

![Alt text](garpos_flow.jpg)

In [None]:
# Download the files one type at a time, in the order listed here.
file_types_to_download = ("sonardyne", "novatel", "master", "svpavg", "leverarm")
for file_type in file_types_to_download:
    data_handler.download_data(file_type=file_type, show_details=False)

## Step 4. Parse/Process raw data into processing input schemas

- 4.1 Parse acoustic observations into AcousticDataFrames
- 4.2 Parse IMU observations into IMUDataFrames
- 4.3 Process GNSS observables to generate PositionDataFrames
    - Parse RANGE-A novatel messages, build RINEX files
    - Run PRIDE-PPP-AR on RINEX, generate Kin files
    - Parse Kin files into PositionDataFrames
- 4.4 Parse metadata files into SiteConfig

### 4.1 Take all acoustic parent files and generate acoustic df's

In [None]:
# Step 4.1: generate the acoustic dataframes, with override and
# show_details both disabled.
data_handler.process_acoustic_data(
    override=False,
    show_details=False,
)

### 4.2 Take all IMU parent files and generate IMU df's

In [None]:
# Step 4.2: generate the IMU dataframes, with override and
# show_details both disabled.
data_handler.process_imu_data(
    override=False,
    show_details=False,
)

### 4.3 Take all GNSS parent files and generate GNSS df's

In [None]:
# Step 4.3, first stage: RINEX processing.
# NOTE(review): presumably builds the RINEX files from the novatel
# RANGE-A messages described in the Step 4 outline -- confirm.
data_handler.process_rinex(
    override=False,
    show_details=False,
)

In [None]:
# Step 4.3, second stage: Kin-file processing.
# NOTE(review): presumably the PRIDE-PPP-AR run on the RINEX files that the
# Step 4 outline describes -- confirm.
data_handler.process_gnss_data_kin(
    override=False,
    show_details=False,
)

In [None]:
# Step 4.3, final stage: GNSS dataframe generation.
# NOTE(review): presumably parses the Kin files into position dataframes,
# per the Step 4 outline -- confirm.
data_handler.process_gnss_data(
    override=False,
    show_details=False,
)

### 4.4 Take all site parent files and generate the site configuration data, which includes ATD offset, sound-velocity DF, and transponder info

In [None]:
# Step 4.4: site configuration processing, with override and
# show_details both disabled.
data_handler.process_siteconfig(
    override=False,
    show_details=False,
)

In [None]:
# Step 4.4: ATD offset processing (the ATD offset is listed as part of the
# site configuration data in the heading above).
data_handler.process_atdoffset(
    override=False,
    show_details=False,
)

In [None]:
# Step 4.4: SVP processing.
# NOTE(review): presumably produces the sound-velocity dataframe mentioned
# in the heading above -- confirm.
data_handler.process_svp(
    override=False,
    show_details=False,
)

## Step 5. Build ShotDataFrame
- Combine the AcousticDataFrame, IMUDataFrame, and PositionDataFrame into a single ShotDataFrame, interpolating the IMU and Position data to match the 15s sample interval of the Acoustic data.
- Store this ShotDataFrame as csv or tab-delimited ascii.
- If a ShotDataFrame csv already exists, this step can be skipped and the ShotDataFrame loaded from the csv instead.

In [None]:
#merge position dataframes and adjust time windows based on waveglider position plots
# def merge_position_dataframes(data_handler):
#     paths = data_handler.catalog_data[data_handler.catalog_data['type']=="gnss"]["local_location"].dropna()
#     merged_positions = pd.concat([pd.read_csv(path) for path in paths])
#     merged_positions['seconds'] = merged_positions['modified_julian_date'] * 86400 + merged_positions['second_of_day']
#     return merged_positions.set_index('seconds')
# merged_positions = merge_position_dataframes(data_handler)
# merged_positions

In [None]:
#merged_positions['seconds'] = merged_positions['modified_julian_date'] * 86400 + merged_positions['second_of_day']
#merged_positions2 = merged_positions.set_index(['modified_julian_date','second_of_day'])

In [None]:
# from scipy import stats
# import numpy as np
# from matplotlib import pyplot as plt

# def remove_outliers(positions, threshold=3):
#     z_lat = np.abs(stats.zscore(positions['latitude']))
#     outliers_lat = positions[z_lat > threshold]
#     cleaned_positions = positions.drop(outliers_lat.index)
#     print(outliers_lat)
#     z_long = np.abs(stats.zscore(cleaned_positions['longitude']))
#     outliers_long = cleaned_positions[z_long > threshold]
#     print(outliers_long)
#     return cleaned_positions.drop(outliers_long.index)


# def plot_llh(positions):
    
#     cleaned_positions = remove_outliers(positions, threshold=1)
#     # Create a figure with a single set of axes
#     fig, ax = plt.subplots(1, 1, figsize=(10, 10))

#     # Scatter-plot the antenna positions
#     ax.scatter(
#         cleaned_positions["longitude"],
#         cleaned_positions["latitude"],
#         color="green",
#         marker="o",
#         linewidths=0.005,
#     )

#     # # Plot transponder positions
#     # for transponder in garpos_input.site.transponders:
#     #     ax_enu.scatter(
#     #         transponder.position_enu.east,
#     #         transponder.position_enu.north,
#     #         label=transponder.id,
#     #         marker="x",
#     #         color="red",
#     #         linewidths=5,
#     #     )

#     # # Plot site center enu
#     # ax_enu.scatter(
#     #     garpos_input.site.center_enu.east,
#     #     garpos_input.site.center_enu.north,
#     #     label="Center",
#     #     marker="x",
#     #     color="blue",
#     #     linewidths=5,
#     # )
#     ax.set_xlabel("Longitude")
#     ax.set_ylabel("Latitude")
#     ax.set_title("Transponder and Antenna Positions")
#     ax.grid(True)

# plot_llh(merged_positions2)