# Preparing metadata for archiving ApRES data from teh centerline of thwaites. 

In [23]:
import os
import pandas as pd
import numpy as np
import geopandas as gpd
import glob

## Define the folder where archived data are stored

In [5]:
archived_data_path = '../../../../../../data/thwaites_apres/archiving'

## Confirm that the waypoints in the metadata are the same as the directories containing the data
For the centerline attended surveys, the .dat files are each supposed to be in a directory corresponding to their waypoint name. Below we check that this these directory names are all included in the metadata. 

In [6]:
# List contents of the directory and filter for just directories
attended_22_23 = [ f.name for f in os.scandir(archived_data_path + '/attended/centerline/single/2022-2023') if f.is_dir() ]
attended_23_24 = [ f.name for f in os.scandir(archived_data_path + '/attended/centerline/single/2023-2024') if f.is_dir() ]
print(f"there are {len(attended_22_23)} waypoint 2022-2023 directories")
print(f"there are {len(attended_23_24)} waypoint 2023-2024 directories")

there are 232 waypoint 2022-2023 directories
there are 95 waypoint 2023-2024 directories


In [7]:
# Load metadata for each year and convert to strings
md_22_23 = pd.read_excel(archived_data_path + '/attended/centerline/single/2022-2023/metadata_notes_22-23.xlsx')
md_23_24 = pd.read_excel(archived_data_path + '/attended/centerline/single/2023-2024/metadata_notes_23-24.xlsx')
md_waypoints_22_23 = md_22_23.waypoint
md_waypoints_23_24 = md_23_24.waypoint
md_waypoints_22_23 = [str(x) for x in md_waypoints_22_23]
md_waypoints_23_24 = [str(x) for x in md_waypoints_23_24]

In [8]:
# Check that all the folder names are contained in the metadata. 
print(all([x in md_waypoints_22_23 for x in attended_22_23]))
print(all([x in md_waypoints_23_24 for x in attended_23_24]))


True
True


## Check that the centerline polarimetric folder names are in the metadata

In [9]:
attended_22_23_polar = [ f.name for f in os.scandir(archived_data_path + '/attended/centerline/polarimetric/2022-2023') if f.is_dir() ]
attended_23_24_polar = [ f.name for f in os.scandir(archived_data_path + '/attended/centerline/polarimetric/2023-2024') if f.is_dir() ]
print(f"there are {len(attended_22_23_polar)} waypoint 2022-2023 directories")
print(f"there are {len(attended_23_24_polar)} waypoint 2023-2024 directories")

there are 43 waypoint 2022-2023 directories
there are 4 waypoint 2023-2024 directories


In [10]:
print(all([x in md_waypoints_22_23 for x in attended_22_23_polar]))
print(all([x in md_waypoints_23_24 for x in attended_23_24_polar]))

True
True


In [11]:
for x, y in zip(([x in md_waypoints_22_23  for x in attended_22_23_polar]),attended_22_23_polar):
    print(f"{x}: {y}")


True: G6-29-05
True: G6-39-05
True: G6-19-05
True: G8-05-264
True: G6-09-05
True: G9-041
True: G10-09-219
True: G9-15-234
True: G8-15-254
True: G9-031
True: G5-39-05
True: G10-04-224
True: G5-29-05
True: G5-09-05
True: G5-19-05
True: G3-39-05
True: G7-28-05
True: G7-38-05
True: G9-05-244
True: G3-29-05
True: G3-09-05
True: G7-18-05
True: G7-08-05
True: G3-19-05
True: G8-10-259
True: G8-20-249
True: G9-10-239
True: G2-10-05
True: G9-20-229
True: G2-30-05
True: G4-40-05
True: G2-20-05
True: G4-10-05
True: G4-30-05
True: G4-20-05
True: G10-032
True: G2-40-05
True: G1-41-05
True: G9-061
True: G9-051
True: G10-042
True: G1-21-05
True: G1-31-05


## Note on naming conventions and changes made to metadata and file names. 
As described in a file supplied by Elizabeth Case in the original collection of files sent to Jonny Kingslake (/Users/jkingslake/Documents/data/thwaites_apres/original/2022-2023/Polarmetric/NamingConvention.md), the naming convection of some sites have waypoint number at at the end of their name, typically are 250. Within the waypoint name which include this number, there were difference in how they are referred to between the metadata, the folder names for the single measurements, and the folder names for the polarimetric measurements. 

- The metadata originally just had either the box number and the waypoint number,  e.g., G8-249 or just the  the waypoint number, e.g, 248. 
- The folder names for the single measurements originally had the box number, the waypoint number, and an additional number (described in NamingConvention.md), e.g., G9-01-248
- The folder names for the polarimetirc data had the box number and the the waypoint number,  e.g., G8-249.

In the archived versions I have changed the waypoint names in the metadata and in the folder names of the polarimetric data to be the same as the folder names from teh single data, e.g., G9-01-248 wherever possible. In cases where a polarimetric waypoint is not also in the single-measurement waypoints, I have changed the metadata to be the same as the polarimetric folder names. 

# Create a hybrid position record
There are three sources of waypoint positions:
1. The 22-23 metadata (source 1)
2. The 23-24 metadata, but it is noted that this is from the previous season (source 2)
3. The 23-24 metadata, recorded by the operators in 23-24 (source 3)
Note that in some cases waypoints are listed more than once with the same or different positions. 

The cell below combines these using the following logic:
For each waypoint we look first and source 1, then source 2, then finally source 3/ 
1. If any of the locations listed for a waypoint in source 1 has longitude, latitude and elevation present, we use them. If no we move onto step 2.
2. If source 2 has an entry for this waypoint we move on to step 3 (if not we give up and put in NaNs for the position).
3. If any of the locations listed for a waypoint in source 2 has longitude, latitude and elevation present, we use them. If not we move onto step 4.
4. If source 3 has an entry for this waypoint (which it will because isin the same dataframe as source 2), we move onto step 5
5. Is any of the locations listed in source 3 has a latitude present, we use the longitude, latitude and elevations from that entry. 

In [12]:
md_22_23.set_index('waypoint', inplace=True)
md_23_24.set_index('waypoint', inplace=True)

In [13]:
md_22_23.insert(md_22_23.shape[1], 'latitude (EPSG:4326 - WGS 84) combined', np.nan)
md_22_23.insert(md_22_23.shape[1], 'longitude (EPSG:4326 - WGS 84) combined', np.nan)
md_22_23.insert(md_22_23.shape[1], 'elevation (EPSG:4326 - WGS 84) combined', np.nan)
md_22_23.insert(md_22_23.shape[1], 'combined location source', np.nan)#

In [14]:
def remove_nan_from_list(input_list):
    """Remove NaN values from a list."""
    return [c for c in input_list.tolist() if  ~np.isnan(c)]

def remove_nans_from_positions(lat, lon, elev):
    if not isinstance(lat, float):
        lats_noNaNs = remove_nan_from_list(lat)
        lons_noNaNs = remove_nan_from_list(lon)
        elevs_noNaNs = remove_nan_from_list(elev)
    else:
        lats_noNaNs = [lat]
        lons_noNaNs = [lon]
        elevs_noNaNs = [elev]
    return lats_noNaNs, lons_noNaNs, elevs_noNaNs



# combine the latitude data into one list, using values from 2022-2023 where available, but replacing any nans with values from 2023-2024 metadata when needed
skip = False
for waypoint in md_waypoints_22_23:
    #print(type(waypoint))
    try:
        waypoint = int(waypoint)
    except ValueError: # continue with the rest of the iteration
        pass

    lat_22_23 = md_22_23.loc[waypoint, 'latitude (EPSG:4326 - WGS 84) 22-23']
    lon_22_23 = md_22_23.loc[waypoint, 'longitude (EPSG:4326 - WGS 84) 22-23']
    elev_22_23 = md_22_23.loc[waypoint, 'elevation (EPSG:4326 - WGS 84) 22-23']
    #print(lat_22_23)
    #len(lat_22_23)
    if  (~np.isnan(lat_22_23)).any() and (~np.isnan(lon_22_23)).any() and (~np.isnan(elev_22_23)).any():

        lats_noNaNs, lons_noNaNs, elevs_noNaNs = remove_nans_from_positions(lat_22_23, lon_22_23, elev_22_23)
        
        md_22_23.loc[waypoint, 'latitude (EPSG:4326 - WGS 84) combined'] = lats_noNaNs[0]
        md_22_23.loc[waypoint, 'longitude (EPSG:4326 - WGS 84) combined'] = lons_noNaNs[0]
        md_22_23.loc[waypoint, 'elevation (EPSG:4326 - WGS 84) combined'] = elevs_noNaNs[0]
        md_22_23.loc[waypoint, 'combined location source'] = '22-23 metadata'
        #print(f"using 22-23 metadata for {waypoint}")
    else:
        print(f"position data for {waypoint} in 2022-2023 metadata is nans")

        try:
            lat_23_24_from22_23 = md_23_24.loc[waypoint, 'latitude (EPSG:4326 - WGS 84) 22-23']
            lon_23_24_from22_23 = md_23_24.loc[waypoint, 'longitude (EPSG:4326 - WGS 84) 22-23']
            elev_23_24_from22_23 = md_23_24.loc[waypoint, 'elevation (EPSG:4326 - WGS 84) 22-23']
        except KeyError:
            skip_23_24 = True
            print(f"no entry for {waypoint} in 2023-2024 metadata")
            continue

        if   (~np.isnan(lat_23_24_from22_23)).any() and (~np.isnan(lon_23_24_from22_23)).any() and (~np.isnan(elev_23_24_from22_23)).any():
                
            lats_noNaNs, lons_noNaNs, elevs_noNaNs = remove_nans_from_positions(lat_23_24_from22_23, 
                                                                                lon_23_24_from22_23, 
                                                                                elev_23_24_from22_23)

            md_22_23.loc[waypoint, 'latitude (EPSG:4326 - WGS 84) combined'] = lats_noNaNs[0]
            md_22_23.loc[waypoint, 'longitude (EPSG:4326 - WGS 84) combined'] = lons_noNaNs[0]
            md_22_23.loc[waypoint, 'elevation (EPSG:4326 - WGS 84) combined'] = elevs_noNaNs[0]
            md_22_23.loc[waypoint, 'combined location source'] = '23-24 metadata, recorded in 22-23'
            print(f"using 23-24 metadata for {waypoint} recorded in 22-23 season")
        else:
            print(f"no position data for {waypoint} in 2023-2024 (recorded in 22-23), trying to find it in 2023-2024 (recorded in 23-24)")


            lat_23_24 = md_23_24.loc[waypoint, 'latitude (EPSG:4326 - WGS 84) 23-24']
            lon_23_24 = md_23_24.loc[waypoint, 'longitude (EPSG:4326 - WGS 84) 23-24']
            elev_23_24 = md_23_24.loc[waypoint, 'elevation (EPSG:4326 - WGS 84) 23-24']

            if  (~np.isnan(lat_23_24)).any():
                lats_noNaNs, lons_noNaNs, elevs_noNaNs = remove_nans_from_positions(lat_23_24,
                                                                                    lon_23_24,
                                                                                    elev_23_24)
                
                md_22_23.loc[waypoint, 'latitude (EPSG:4326 - WGS 84) combined'] = lats_noNaNs[0]
                md_22_23.loc[waypoint, 'longitude (EPSG:4326 - WGS 84) combined'] = lons_noNaNs[0]
                md_22_23.loc[waypoint, 'elevation (EPSG:4326 - WGS 84) combined'] = elevs_noNaNs[0]
                md_22_23.loc[waypoint, 'combined location source'] = '23-24 metadata recorded in 23-24 season'
                print(f"using 23-24 metadata for {waypoint} recorded in 23-24 season")
            
            else:
                print(f"no lat data found for {waypoint} ")



out = md_22_23.loc[:, ['latitude (EPSG:4326 - WGS 84) combined',
                      'longitude (EPSG:4326 - WGS 84) combined',
                      'elevation (EPSG:4326 - WGS 84) combined',
                      'combined location source']]
out.index.name = 'waypoint'

position data for G1-22-05 in 2022-2023 metadata is nans
no position data for G1-22-05 in 2023-2024 (recorded in 22-23), trying to find it in 2023-2024 (recorded in 23-24)
using 23-24 metadata for G1-22-05 recorded in 23-24 season
position data for G1-30-05 in 2022-2023 metadata is nans
no position data for G1-30-05 in 2023-2024 (recorded in 22-23), trying to find it in 2023-2024 (recorded in 23-24)
using 23-24 metadata for G1-30-05 recorded in 23-24 season
position data for G1-28-05 in 2022-2023 metadata is nans
no position data for G1-28-05 in 2023-2024 (recorded in 22-23), trying to find it in 2023-2024 (recorded in 23-24)
using 23-24 metadata for G1-28-05 recorded in 23-24 season
position data for G3-23-05 in 2022-2023 metadata is nans
no position data for G3-23-05 in 2023-2024 (recorded in 22-23), trying to find it in 2023-2024 (recorded in 23-24)
using 23-24 metadata for G3-23-05 recorded in 23-24 season
position data for G7-04-05 in 2022-2023 metadata is nans
no entry for G7-04-

  md_22_23.loc[waypoint, 'combined location source'] = '22-23 metadata'


### Convert to a geopandas dataframe

In [None]:
geom =gpd.points_from_xy(x=out['longitude (EPSG:4326 - WGS 84) combined'], y=out['latitude (EPSG:4326 - WGS 84) combined'])
out = gpd.GeoDataFrame(out, geometry = geom)
out = out.set_crs('EPSG:4326')
out.to_crs(3031, inplace=True) 

In [20]:
#with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
#    display(out)

### Shorten the names for writing to a shapefile

In [21]:
out_for_shape = out.copy()
out_for_shape.rename(columns={'latitude (EPSG:4326 - WGS 84) combined': 'lat-WGS84',
                             'longitude (EPSG:4326 - WGS 84) combined': 'lon-WGS84',
                             'elevation (EPSG:4326 - WGS 84) combined': 'z-WGS 84',
                             'combined location source': 'source'}, inplace=True)

In [22]:
out_for_shape.to_file(archived_data_path + '/attended/centerline/centerline_positions.shp', driver='ESRI Shapefile')
out_for_shape.to_csv(archived_data_path + '/attended/centerline/centerline_positions.csv')

## Combine position data from acrossline polarimetric csvs
The polarimetric data from the across line are stored in four directories in `/Users/jkingslake/Documents/data/thwaites_apres/archiving/attended/acrossline/polarimetric/GHOST24_Polarimetric_pRES_OZ`. In each one there is a csv containing the position of the measurement. 

Below we collate these into one csv and geopandas dataframe.

In [42]:
# use glob to find csvs in that dir
acrossline_polarimetric_csvs = glob.glob(archived_data_path + '/attended/acrossline/polarimetric//PpRES_*/*.csv')
polarimetric_acrossline_positions = pd.concat((pd.read_csv(f) for f in acrossline_polarimetric_csvs))

In [43]:
# convert to 
geom =gpd.points_from_xy(x=polarimetric_acrossline_positions['Longitude'], y=polarimetric_acrossline_positions['Latitude'])
polarimetric_acrossline_positions = gpd.GeoDataFrame(polarimetric_acrossline_positions, geometry = geom)
polarimetric_acrossline_positions = polarimetric_acrossline_positions.set_crs('EPSG:4326')
polarimetric_acrossline_positions.to_crs(3031, inplace=True) 
polarimetric_acrossline_positions

Unnamed: 0,Station,Latitude,Longitude,geometry
0,PpRES_20240551_001,-76.457764,-107.389709,POINT (-1410454.662 -441731.5)
0,PpRES_20240551_003,-76.472572,-104.822354,POINT (-1427249.346 -377691.319)
0,PpRES_20240551_004,-76.487418,-106.977237,POINT (-1410474.93 -430612.791)
0,PpRES_20240551_002,-76.467489,-106.578286,POINT (-1415542.4 -421407.444)


In [44]:
polarimetric_acrossline_positions = polarimetric_acrossline_positions.rename(columns={'Station': 'waypoint'})
polarimetric_acrossline_positions.set_index('waypoint', inplace=True)
polarimetric_acrossline_positions

Unnamed: 0_level_0,Latitude,Longitude,geometry
waypoint,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
PpRES_20240551_001,-76.457764,-107.389709,POINT (-1410454.662 -441731.5)
PpRES_20240551_003,-76.472572,-104.822354,POINT (-1427249.346 -377691.319)
PpRES_20240551_004,-76.487418,-106.977237,POINT (-1410474.93 -430612.791)
PpRES_20240551_002,-76.467489,-106.578286,POINT (-1415542.4 -421407.444)


In [45]:
polarimetric_acrossline_positions.to_csv(archived_data_path + '/attended/acrossline/acrossline_positions.csv')