# Preparing metadata for archiving ApRES data from teh centerline of thwaites. 

In [112]:
import os
import pandas as pd
import numpy as np
import geopandas as gpd
import glob

## Define the folder where archived data are stored

In [113]:
archived_data_path = '../../../../../data/thwaites_apres/archiving'

## Confirm that the waypoints in the metadata are the same as the directories containing the data
For the centerline attended surveys, the .dat files are each supposed to be in a directory corresponding to their waypoint name. Below we check that this these directory names are all included in the metadata. 

In [114]:
def filter_folders(folder_list, filter_string):
    return [x for x in folder_list if filter_string not in x]

In [115]:
# List contents of the directory and filter for just directories
attended_22_23 = [ f.name for f in os.scandir(archived_data_path + '/attended/centerline/single/2022-2023') if f.is_dir() ]
attended_22_23 = filter_folders(attended_22_23, 'netcdf')
attended_23_24 = [ f.name for f in os.scandir(archived_data_path + '/attended/centerline/single/2023-2024') if f.is_dir() ]
attended_23_24 = filter_folders(attended_23_24, 'netcdf')
print(f"there are {len(attended_22_23)} waypoint 2022-2023 directories")
print(f"there are {len(attended_23_24)} waypoint 2023-2024 directories")

there are 232 waypoint 2022-2023 directories
there are 95 waypoint 2023-2024 directories


In [116]:
# Load metadata for each year and convert to strings
md_22_23 = pd.read_excel(archived_data_path + '/attended/centerline/single/2022-2023/metadata_notes_22-23.xlsx')
md_23_24 = pd.read_excel(archived_data_path + '/attended/centerline/single/2023-2024/metadata_notes_23-24.xlsx')
md_waypoints_22_23 = md_22_23.waypoint
md_waypoints_23_24 = md_23_24.waypoint
md_waypoints_22_23 = [str(x) for x in md_waypoints_22_23]
md_waypoints_23_24 = [str(x) for x in md_waypoints_23_24]

In [117]:
# Check that all the folder names are contained in the metadata. 
print(all([x in md_waypoints_22_23 for x in attended_22_23]))
print(all([x in md_waypoints_23_24 for x in attended_23_24]))


True
True


## Check that the centerline polarimetric folder names are in the metadata

In [118]:
attended_22_23_polar = [ f.name for f in os.scandir(archived_data_path + '/attended/centerline/polarimetric/2022-2023') if f.is_dir() ]
attended_22_23_polar = filter_folders(attended_22_23_polar, 'netcdf')
attended_23_24_polar = [ f.name for f in os.scandir(archived_data_path + '/attended/centerline/polarimetric/2023-2024') if f.is_dir() ]
attended_23_24_polar = filter_folders(attended_23_24_polar, 'netcdf')
print(f"there are {len(attended_22_23_polar)} waypoint 2022-2023 directories")
print(f"there are {len(attended_23_24_polar)} waypoint 2023-2024 directories")

there are 43 waypoint 2022-2023 directories
there are 4 waypoint 2023-2024 directories


In [119]:
print(all([x in md_waypoints_22_23 for x in attended_22_23_polar]))
print(all([x in md_waypoints_23_24 for x in attended_23_24_polar]))

True
True


## Note on naming conventions and changes made to metadata and file names. 
As described in a file supplied by Elizabeth Case in the original collection of files sent to Jonny Kingslake (/Users/jkingslake/Documents/data/thwaites_apres/original/2022-2023/Polarmetric/NamingConvention.md), the naming convection of some sites have waypoint number at the end of their name, typically 250. Within the waypoint name which include this number, there were difference in how they are referred to between the metadata, the folder names for the single measurements, and the folder names for the polarimetric measurements. 

- The metadata originally just had either the box number and the waypoint number,  e.g., G8-249 or just the  the waypoint number, e.g, 248. 
- The folder names for the single measurements originally had the box number, the waypoint number, and an additional number (described in NamingConvention.md), e.g., G9-01-248
- The folder names for the polarimetirc data had the box number and the the waypoint number,  e.g., G8-249.

In the archived versions, I have changed the waypoint names in the metadata and in the folder names of the polarimetric data to be the same as the folder names from the single data, e.g., G9-01-248 wherever possible. In cases where a polarimetric waypoint is not also in the single-measurement waypoints, I have changed the metadata to be the same as the polarimetric folder names. 

## Create a hybrid position record
There are three sources of waypoint positions:
1. The 22-23 metadata (source 1)
2. The 23-24 metadata, but it is noted that this is from the previous season (source 2)
3. The 23-24 metadata, recorded by the operators in 23-24 (source 3)
Note that in some cases waypoints are listed more than once with the same or different positions. 

The cell below combines these using the following logic:
For each waypoint we look first and source 1, then source 2, then finally source 3/ 
1. If any of the locations listed for a waypoint in source 1 has longitude, latitude and elevation present, we use them. If no we move onto step 2.
2. If source 2 has an entry for this waypoint we move on to step 3 (if not we give up and put in NaNs for the position).
3. If any of the locations listed for a waypoint in source 2 has longitude, latitude and elevation present, we use them. If not we move onto step 4.
4. If source 3 has an entry for this waypoint (which it will because isin the same dataframe as source 2), we move onto step 5
5. Is any of the locations listed in source 3 has a latitude present, we use the longitude, latitude and elevations from that entry. 

In [120]:
md_22_23.set_index('waypoint', inplace=True)
md_23_24.set_index('waypoint', inplace=True)

In [121]:
md_22_23.insert(md_22_23.shape[1], 'latitude (EPSG:4326 - WGS 84) combined', np.nan)
md_22_23.insert(md_22_23.shape[1], 'longitude (EPSG:4326 - WGS 84) combined', np.nan)
md_22_23.insert(md_22_23.shape[1], 'elevation (EPSG:4326 - WGS 84) combined', np.nan)
md_22_23.insert(md_22_23.shape[1], 'combined location source', np.nan)#

In [None]:
def remove_nan_from_list(input_list):
    """Remove NaN values from a list."""
    return [c for c in input_list.tolist() if  ~np.isnan(c)]

def remove_nans_from_positions(lat, lon, elev):
    if not isinstance(lat, float):
        lats_noNaNs = remove_nan_from_list(lat)
        lons_noNaNs = remove_nan_from_list(lon)
        elevs_noNaNs = remove_nan_from_list(elev)
    else:
        lats_noNaNs = [lat]
        lons_noNaNs = [lon]
        elevs_noNaNs = [elev]
    return lats_noNaNs, lons_noNaNs, elevs_noNaNs



# combine the latitude data into one list, using values from 2022-2023 where available, but replacing any nans with values from 2023-2024 metadata when needed
skip = False
for waypoint in md_waypoints_22_23:
    #print(type(waypoint))
    try:
        waypoint = int(waypoint)
    except ValueError: # continue with the rest of the iteration
        pass

    lat_22_23 = md_22_23.loc[waypoint, 'latitude (EPSG:4326 - WGS 84) 22-23']
    lon_22_23 = md_22_23.loc[waypoint, 'longitude (EPSG:4326 - WGS 84) 22-23']
    elev_22_23 = md_22_23.loc[waypoint, 'elevation (EPSG:4326 - WGS 84) 22-23']
    #print(lat_22_23)
    #len(lat_22_23)
    if  (~np.isnan(lat_22_23)).any() and (~np.isnan(lon_22_23)).any() and (~np.isnan(elev_22_23)).any():

        lats_noNaNs, lons_noNaNs, elevs_noNaNs = remove_nans_from_positions(lat_22_23, lon_22_23, elev_22_23)
        
        md_22_23.loc[waypoint, 'latitude (EPSG:4326 - WGS 84) combined'] = lats_noNaNs[0]
        md_22_23.loc[waypoint, 'longitude (EPSG:4326 - WGS 84) combined'] = lons_noNaNs[0]
        md_22_23.loc[waypoint, 'elevation (EPSG:4326 - WGS 84) combined'] = elevs_noNaNs[0]
        md_22_23.loc[waypoint, 'combined location source'] = '22-23 metadata'
        #print(f"using 22-23 metadata for {waypoint}")
    else:
        print(f"position data for {waypoint} in 2022-2023 metadata is nans")

        try:
            lat_23_24_from22_23 = md_23_24.loc[waypoint, 'latitude (EPSG:4326 - WGS 84) 22-23']
            lon_23_24_from22_23 = md_23_24.loc[waypoint, 'longitude (EPSG:4326 - WGS 84) 22-23']
            elev_23_24_from22_23 = md_23_24.loc[waypoint, 'elevation (EPSG:4326 - WGS 84) 22-23']
        except KeyError:
            skip_23_24 = True
            print(f"no entry for {waypoint} in 2023-2024 metadata")
            continue

        if   (~np.isnan(lat_23_24_from22_23)).any() and (~np.isnan(lon_23_24_from22_23)).any() and (~np.isnan(elev_23_24_from22_23)).any():
                
            lats_noNaNs, lons_noNaNs, elevs_noNaNs = remove_nans_from_positions(lat_23_24_from22_23, 
                                                                                lon_23_24_from22_23, 
                                                                                elev_23_24_from22_23)

            md_22_23.loc[waypoint, 'latitude (EPSG:4326 - WGS 84) combined'] = lats_noNaNs[0]
            md_22_23.loc[waypoint, 'longitude (EPSG:4326 - WGS 84) combined'] = lons_noNaNs[0]
            md_22_23.loc[waypoint, 'elevation (EPSG:4326 - WGS 84) combined'] = elevs_noNaNs[0]
            md_22_23.loc[waypoint, 'combined location source'] = '23-24 metadata, recorded in 22-23'
            print(f"using 23-24 metadata for {waypoint} recorded in 22-23 season")
        else:
            print(f"no position data for {waypoint} in 2023-2024 (recorded in 22-23), trying to find it in 2023-2024 (recorded in 23-24)")


            lat_23_24 = md_23_24.loc[waypoint, 'latitude (EPSG:4326 - WGS 84) 23-24']
            lon_23_24 = md_23_24.loc[waypoint, 'longitude (EPSG:4326 - WGS 84) 23-24']
            elev_23_24 = md_23_24.loc[waypoint, 'elevation (EPSG:4326 - WGS 84) 23-24']

            if  (~np.isnan(lat_23_24)).any():
                lats_noNaNs, lons_noNaNs, elevs_noNaNs = remove_nans_from_positions(lat_23_24,
                                                                                    lon_23_24,
                                                                                    elev_23_24)
                
                md_22_23.loc[waypoint, 'latitude (EPSG:4326 - WGS 84) combined'] = lats_noNaNs[0]
                md_22_23.loc[waypoint, 'longitude (EPSG:4326 - WGS 84) combined'] = lons_noNaNs[0]
                md_22_23.loc[waypoint, 'elevation (EPSG:4326 - WGS 84) combined'] = elevs_noNaNs[0]
                md_22_23.loc[waypoint, 'combined location source'] = '23-24 metadata recorded in 23-24 season'
                print(f"using 23-24 metadata for {waypoint} recorded in 23-24 season")
            
            else:
                print(f"no lat data found for {waypoint} ")



out = md_22_23.loc[:, ['latitude (EPSG:4326 - WGS 84) combined',
                      'longitude (EPSG:4326 - WGS 84) combined',
                      'elevation (EPSG:4326 - WGS 84) combined',
                      'combined location source']]
out.index.name = 'waypoint'
out

position data for G1-22-05 in 2022-2023 metadata is nans
no position data for G1-22-05 in 2023-2024 (recorded in 22-23), trying to find it in 2023-2024 (recorded in 23-24)
using 23-24 metadata for G1-22-05 recorded in 23-24 season
position data for G1-30-05 in 2022-2023 metadata is nans
no position data for G1-30-05 in 2023-2024 (recorded in 22-23), trying to find it in 2023-2024 (recorded in 23-24)
using 23-24 metadata for G1-30-05 recorded in 23-24 season
position data for G1-28-05 in 2022-2023 metadata is nans
no position data for G1-28-05 in 2023-2024 (recorded in 22-23), trying to find it in 2023-2024 (recorded in 23-24)
using 23-24 metadata for G1-28-05 recorded in 23-24 season
position data for G3-23-05 in 2022-2023 metadata is nans
no position data for G3-23-05 in 2023-2024 (recorded in 22-23), trying to find it in 2023-2024 (recorded in 23-24)
using 23-24 metadata for G3-23-05 recorded in 23-24 season
position data for G7-04-05 in 2022-2023 metadata is nans
no entry for G7-04-

Unnamed: 0_level_0,latitude (EPSG:4326 - WGS 84) combined,longitude (EPSG:4326 - WGS 84) combined,elevation (EPSG:4326 - WGS 84) combined,combined location source
waypoint,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
G1-21-05,-76.002191,-107.571318,882.202698,22-23 metadata
G1-21-05,-76.002191,-107.571318,882.202698,22-23 metadata
G1-21-05,-76.002191,-107.571318,882.202698,22-23 metadata
G1-21-05,-76.002191,-107.571318,882.202698,22-23 metadata
G1-22-05,-76.001810,-107.573510,,23-24 metadata recorded in 23-24 season
...,...,...,...,...
G10-042,-77.595419,-109.004602,1320.478394,22-23 metadata
G10-042,-77.595419,-109.004602,1320.478394,22-23 metadata
G10-042,-77.595419,-109.004602,1320.478394,22-23 metadata
G10-10-218,-77.598552,-109.054292,1318.649536,22-23 metadata


## Create season-specific positions csvs

In [143]:
out_22_23 = md_22_23.loc[:, ['longitude (EPSG:4326 - WGS 84) 22-23',
                'latitude (EPSG:4326 - WGS 84) 22-23',
                'elevation (EPSG:4326 - WGS 84) 22-23']]
out_23_24 = md_23_24.loc[:, ['longitude (EPSG:4326 - WGS 84) 23-24',
                'latitude (EPSG:4326 - WGS 84) 23-24',
                'elevation (EPSG:4326 - WGS 84) 23-24']]
#with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
#    display(out_23_24)
# rename columns to long, lat, elev
out_22_23.columns = ['lon',
                     'lat',
                     'elevation']
out_23_24.columns = ['lon',
                     'lat',
                     'elevation']

# remove duplicates
out_22_23 = out_22_23.loc[~out_22_23.index.duplicated(keep="first"),:]
out_23_24 = out_23_24.loc[~out_23_24.index.duplicated(keep="first"),:]

# remove 'G3-12-05-2', 'G3-20-05-2' from 23-24 because they were visited twice and that is too complex to include int eh netcdf .
out_23_24 = out_23_24.drop(index=['G3-12-05-2', 'G3-20-05-2'])

# add a unique number for each waypoint (based on the first year because that has more waypoints).
out_22_23.insert(0, 'waypoint_number', range(1, 1 + len(out_22_23)))
out_23_24.insert(0, 'waypoint_number', out_22_23.loc[out_23_24.index,'waypoint_number'])

# sort by the waypopint number
out_22_23 = out_22_23.sort_values('waypoint_number')
out_23_24 = out_23_24.sort_values('waypoint_number')

out_23_24#.loc['G8-07-262',:]  

Unnamed: 0_level_0,waypoint_number,lon,lat,elevation
waypoint,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
G1-21-05,1,-107.57095,-75.99746,
G1-22-05,2,-107.57351,-76.00181,
G1-23-05,3,-107.57617,-76.00628,
G1-24-05,4,-107.57902,-76.01073,
G1-29-05,5,-107.59096,-76.03301,
...,...,...,...,...
G4-36-05,123,-107.92979,-76.60682,1122.0
G4-40-05,125,,,
G5-03-05,127,-107.95187,-76.64241,1134.0
G5-11-05,131,-107.97378,-76.67796,1133.0


## Write season-specific positions csvs

In [None]:
# convert to geodataframe, add crs
geom =gpd.points_from_xy(x=out_22_23['lon'], y=out_22_23['lat'])
out_22_23 = gpd.GeoDataFrame(out_22_23, geometry = geom)
out_22_23 = out_22_23.set_crs('EPSG:4326')

# save
out_22_23.to_csv(archived_data_path + '/attended/centerline/single/2022-2023/positions.csv')
out_22_23.to_csv(archived_data_path + '/attended/centerline/polarimetric/2022-2023/positions.csv')

In [None]:
# convert to geodataframe, add crs
geom =gpd.points_from_xy(x=out_23_24['lon'], y=out_23_24['lat'])
out_23_24 = gpd.GeoDataFrame(out_23_24, geometry = geom)
out_23_24 = out_23_24.set_crs('EPSG:4326')

# save
out_23_24.to_csv(archived_data_path + '/attended/centerline/single/2023-2024/positions.csv')
out_23_24.to_csv(archived_data_path + '/attended/centerline/polarimetric/2023-2024/positions.csv')

## Write hybrid position file
(created in a previous section above)

In [145]:
hybrid = md_22_23.loc[:, ['latitude (EPSG:4326 - WGS 84) combined',
                      'longitude (EPSG:4326 - WGS 84) combined',
                      'elevation (EPSG:4326 - WGS 84) combined',
                      'combined location source']]
hybrid.index.name = 'waypoint'
geom =gpd.points_from_xy(x=hybrid['longitude (EPSG:4326 - WGS 84) combined'], y=hybrid['latitude (EPSG:4326 - WGS 84) combined'])
hybrid = gpd.GeoDataFrame(hybrid, geometry = geom)
hybrid = hybrid.set_crs('EPSG:4326')  
hybrid = hybrid.loc[~out.index.duplicated(keep="first"),:]

# rename columns to long, lat, elev
hybrid.rename(columns={'latitude (EPSG:4326 - WGS 84) combined': 'lat',
                             'longitude (EPSG:4326 - WGS 84) combined': 'lon',
                             'elevation (EPSG:4326 - WGS 84) combined': 'elevation',
                             'combined location source': 'source'}, inplace=True)

# add a unique number for each waypoint (based on the first year because that has more waypoints).
hybrid.insert(0, 'waypoint_number', out_22_23.loc[hybrid.index,'waypoint_number'])

#out.to_file(archived_data_path + '/attended/centerline/positions_hybrid.shp', driver='ESRI Shapefile')
hybrid.to_csv(archived_data_path + '/attended/centerline/positions_hybrid.csv')
hybrid

Unnamed: 0_level_0,waypoint_number,lat,lon,elevation,source,geometry
waypoint,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
G1-21-05,1,-76.002191,-107.571318,882.202698,22-23 metadata,POINT (-107.57132 -76.00219)
G1-22-05,2,-76.001810,-107.573510,,23-24 metadata recorded in 23-24 season,POINT (-107.57351 -76.00181)
G1-23-05,3,-76.011062,-107.576420,898.020508,22-23 metadata,POINT (-107.57642 -76.01106)
G1-24-05,4,-76.015486,-107.579000,892.305969,22-23 metadata,POINT (-107.579 -76.01549)
G1-29-05,5,-76.037646,-107.591433,894.773804,22-23 metadata,POINT (-107.59143 -76.03765)
...,...,...,...,...,...,...
G10-08-220,242,-77.580705,-109.037602,1346.282715,22-23 metadata,POINT (-109.0376 -77.5807)
G10-09-219,243,-77.589632,-109.045977,1340.057617,22-23 metadata,POINT (-109.04598 -77.58963)
G10-042,244,-77.595419,-109.004602,1320.478394,22-23 metadata,POINT (-109.0046 -77.59542)
G10-10-218,245,-77.598552,-109.054292,1318.649536,22-23 metadata,POINT (-109.05429 -77.59855)


## Combine position data from acrossline polarimetric csvs
The polarimetric data from the across line are stored in four directories in `/Users/jkingslake/Documents/data/thwaites_apres/archiving/attended/acrossline/polarimetric/GHOST24_Polarimetric_pRES_OZ`. In each one there is a csv containing the position of the measurement. 

Below we collate these into one csv.

In [131]:
# use glob to find csvs in that dir
acrossline_polarimetric_csvs = glob.glob(archived_data_path + '/attended/acrossline/polarimetric//PpRES_*/*.csv')


In [132]:
polarimetric_acrossline_positions = pd.concat((pd.read_csv(f) for f in acrossline_polarimetric_csvs))

# add waypoint number
polarimetric_acrossline_positions.insert(0, 'waypoint_number', range(1, 1 + len(polarimetric_acrossline_positions)))

# rename latitude and longituse to lat and lon
polarimetric_acrossline_positions.rename(columns={'Latitude': 'lat',
                                            'Longitude': 'lon'}, inplace=True)

# insert a column of nans called elevation
polarimetric_acrossline_positions.insert(4, 'elevation', np.nan)

# rename Stations -- > waypoints and set as index
polarimetric_acrossline_positions = polarimetric_acrossline_positions.rename(columns={'Station': 'waypoint'})
polarimetric_acrossline_positions.set_index('waypoint', inplace=True)

# swap the positions of lat and lon
cols = polarimetric_acrossline_positions.columns.tolist()
cols = [cols[0], cols[2], cols[1]] + cols[3:]   
polarimetric_acrossline_positions = polarimetric_acrossline_positions[cols]

polarimetric_acrossline_positions

Unnamed: 0_level_0,waypoint_number,lon,lat,elevation
waypoint,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
PpRES_20240551_001,1,-107.389709,-76.457764,
PpRES_20240551_003,2,-104.822354,-76.472572,
PpRES_20240551_004,3,-106.977237,-76.487418,
PpRES_20240551_002,4,-106.578286,-76.467489,


In [133]:
# convert to geodataframe, add crs
geom =gpd.points_from_xy(x=polarimetric_acrossline_positions['lon'], y=polarimetric_acrossline_positions['lat'])
polarimetric_acrossline_positions = gpd.GeoDataFrame(polarimetric_acrossline_positions, geometry = geom)

polarimetric_acrossline_positions

Unnamed: 0_level_0,waypoint_number,lon,lat,elevation,geometry
waypoint,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
PpRES_20240551_001,1,-107.389709,-76.457764,,POINT (-107.38971 -76.45776)
PpRES_20240551_003,2,-104.822354,-76.472572,,POINT (-104.82235 -76.47257)
PpRES_20240551_004,3,-106.977237,-76.487418,,POINT (-106.97724 -76.48742)
PpRES_20240551_002,4,-106.578286,-76.467489,,POINT (-106.57829 -76.46749)


In [134]:
polarimetric_acrossline_positions.to_csv(archived_data_path + '/attended/acrossline/polarimetric/positions.csv')