Remember to 'pip install [package_name]' if these imports are not found

In [1]:
# Imports from selenium, request_futures, requests, bs4, pandas
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from requests_futures.sessions import FuturesSession
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import geopy.distance

# Native python imports
import re
import math
from collections import Counter
from concurrent.futures import as_completed

# Enter Lat /Lon coordinates and Day of Interest
These should be the only inputs needed before executing the rest of the script.

In [2]:
# Defining constants for our current usecase
day_of_interest = 51 # Day of Year (DOY) of February 20, which is our current examination
g_truth_lat, g_truth_lon = 75.92, -149.69 # Selecting a coordinate of interest (EX: Center of SAR image)

### Run this section to get references to the buoy data
The buoy data is listed as a table of ids that is nested within an iframe of the website. Because of this iframe, we use selenium to simulate a web-client until the data loads, then grab it using BeautifulSoup.

In [3]:
#####
base_url = f"https://iabp.apl.uw.edu/IABP_Table.html"

options = webdriver.ChromeOptions()
options.add_argument('--headless')
# executable_path param is not needed if you updated PATH
browser = webdriver.Chrome(options=options)

try:
    browser.get(base_url)
    timeout_in_seconds = 10
    WebDriverWait(browser, timeout_in_seconds).until(ec.frame_to_be_available_and_switch_to_it("myframe"))
    html = browser.page_source
    soup = BeautifulSoup(html, "html.parser")
    print("Got the site!")
except TimeoutException:
    print("I give up...")
finally:
    browser.quit()
    
table = soup.find('table')
tablebody = soup.find('tbody')
tablerow = tablebody.find_all('tr')

buoy_ids = []
failed = 0
for row in tablerow:
    buoy_id = row.find('td')
    try:
        buoy_ids.append(buoy_id.get_text())
    except:
        failed += 1
num_buoys = len(buoy_ids)
print(f"Buoys to search through: {num_buoys}")
print(f"Rows without buoy ids: {failed}")

Got the site!
Buoys to search through: 236
Rows without buoy ids: 2


### Run this next section to populate the buoy dictionary with distances
Get all of the data from the different buoys, filter to only those that have record of our time of interest, and then calculate their distances from our point of interest.

In [4]:
# Defining url getter
def get_url(url:str):
    return f"https://iabp.apl.uw.edu/WebData/{url}.dat"

regExStr = f'\d{{5,20}}\s*?2023(?:\s*?\d{{2}}){{2}}\s*?(?:{day_of_interest}|{day_of_interest + 1}).*'

# Declare our dictionary and counters
distances_dict = {}
num_completed = 0

# Open a session to speed up the time it takes to make and receive all of our requests
with FuturesSession() as session:
    
    # Get the url for each buoy id, and then get the data
    urls = [get_url(id_num) for id_num in buoy_ids]
    futures = [session.get(url) for url in urls]
    
    # Event listener to execute as the requests resolve
    for future in as_completed(futures):
        print(f"{num_completed + 1} / {num_buoys} requests resolved")
        
        # Convert from byte to string format
        response = future.result().content.decode('UTF-8')
        
        # Get the column headers (and removing whitespace) using the split method
        column_headers = response.partition('\n')[0].split()
        
        # Find all rows of buoy data using regex
        daily_data = re.findall(regExStr, response)
        daily_data_len = len(daily_data)
        
        # Iterate through the data per buoy to calculate distance
        if daily_data_len > 0:
            buoy_id = daily_data[0].split()[0]
            buoy_dist = 0
            print(f"Processing {num_completed+1}/{num_buoys} pages, with {daily_data_len} rows.")
            for row in daily_data:
                try:
                    # Calculating avg distance of buoy 
                    lat_index = column_headers.index("Lat")
                    lon_index = column_headers.index("Lon")
                    splits = row.split()
                    lat = splits[lat_index]
                    lon = splits[lon_index]
                    seg_dist = math.dist([float(lat), float(lon)], [g_truth_lat, g_truth_lon])
                    buoy_dist += seg_dist
                except:
                    print("Could not find lat/lon column for ", buoy_id)
                    
            # Set the distance in the dictionary
            avg_dist = buoy_dist / daily_data_len
            distances_dict[buoy_id] = avg_dist
            print(f"Finished processing {buoy_id}. Average Dist = {avg_dist}")        
        else:
            print(f"Request {num_completed + 1} has no matching data.", )
        num_completed += 1
        

1 / 236 requests resolved
Request 1 has no matching data.
2 / 236 requests resolved
Request 2 has no matching data.
3 / 236 requests resolved
Request 3 has no matching data.
4 / 236 requests resolved
Request 4 has no matching data.
5 / 236 requests resolved
Request 5 has no matching data.
6 / 236 requests resolved
Request 6 has no matching data.
7 / 236 requests resolved
Request 7 has no matching data.
8 / 236 requests resolved
Processing 8/236 pages, with 48 rows.
Finished processing 300234061162580. Average Dist = 289.36706386294003
9 / 236 requests resolved
Processing 9/236 pages, with 45 rows.
Finished processing 300234061164500. Average Dist = 301.09179657815343
10 / 236 requests resolved
Request 10 has no matching data.
11 / 236 requests resolved
Processing 11/236 pages, with 48 rows.
Finished processing 300234011751690. Average Dist = 43.464339734416534
12 / 236 requests resolved
Processing 12/236 pages, with 39 rows.
Finished processing 300234065495020. Average Dist = 10.962019

106 / 236 requests resolved
Request 106 has no matching data.
107 / 236 requests resolved
Request 107 has no matching data.
108 / 236 requests resolved
Request 108 has no matching data.
109 / 236 requests resolved
Processing 109/236 pages, with 48 rows.
Finished processing 300534060166870. Average Dist = 76.49499614250932
110 / 236 requests resolved
Request 110 has no matching data.
111 / 236 requests resolved
Request 111 has no matching data.
112 / 236 requests resolved
Processing 112/236 pages, with 48 rows.
Finished processing 300534060168940. Average Dist = 89.29780990345093
113 / 236 requests resolved
Processing 113/236 pages, with 48 rows.
Finished processing 300534060951200. Average Dist = 33.02224675311914
114 / 236 requests resolved
Processing 114/236 pages, with 48 rows.
Finished processing 300534060951600. Average Dist = 36.83065213081699
115 / 236 requests resolved
Processing 115/236 pages, with 4 rows.
Finished processing 300534061515080. Average Dist = 131.61750100433406


208 / 236 requests resolved
Request 208 has no matching data.
209 / 236 requests resolved
Request 209 has no matching data.
210 / 236 requests resolved
Request 210 has no matching data.
211 / 236 requests resolved
Request 211 has no matching data.
212 / 236 requests resolved
Request 212 has no matching data.
213 / 236 requests resolved
Request 213 has no matching data.
214 / 236 requests resolved
Request 214 has no matching data.
215 / 236 requests resolved
Request 215 has no matching data.
216 / 236 requests resolved
Processing 216/236 pages, with 96 rows.
Finished processing 900124. Average Dist = 31.453285444068314
217 / 236 requests resolved
Processing 217/236 pages, with 96 rows.
Finished processing 900115. Average Dist = 3.8439511057693836
218 / 236 requests resolved
Processing 218/236 pages, with 96 rows.
Finished processing 900121. Average Dist = 4.00223838183818
219 / 236 requests resolved
Processing 219/236 pages, with 95 rows.
Finished processing 900122. Average Dist = 13.92

### Run this section to extract the data from the nearest buoys

In [5]:
num_buoys = 5

df_cols = []
data_frame_data = []

closest_buoys = list(Counter(distances_dict).most_common()[-num_buoys:])
closest_buoys.reverse()
for b_id, dist in closest_buoys:

    response = get(get_url(b_id)).content.decode('UTF-8')
    daily_data = re.findall(regExStr, response)
    if len(daily_data) > 0:
        column_headers = response.partition('\n')[0].split()
        # Make sure we have all column headers
        for header in column_headers:
            if header not in df_cols:
                df_cols.append(header)
        # Then push in the data so we can later create the df
        for i, row in enumerate(daily_data):
            splits = row.split()
            data_frame_data.append(splits)

df = pd.DataFrame(data_frame_data, columns=column_headers)
print(df.head(5))
print(df.tail(5))

   BuoyID  Year Hour Min      DOY  POS_DOY       Lat         Lon    BP    Ts
0  902007  2023   00  00  51.0005  51.0005  77.75950  -146.60420  None  None
1  902007  2023   00  31  51.0216  51.0216  77.75920  -146.60110  None  None
2  902007  2023   01  01  51.0424  51.0424  77.75890  -146.59860  None  None
3  902007  2023   01  30  51.0631  51.0631  77.75850  -146.59560  None  None
4  902007  2023   02  01  51.0841  51.0841  77.75820  -146.59280  None  None
              BuoyID  Year Hour Min      DOY  POS_DOY       Lat         Lon  \
379  300534062025520  2023   19  00  52.7920  52.7920  80.94306  -149.77117   
380  300534062025520  2023   20  00  52.8336  52.8336  80.94473  -149.76025   
381  300534062025520  2023   21  00  52.8753  52.8753  80.94655  -149.74965   
382  300534062025520  2023   22  00  52.9173  52.9173  80.94888  -149.73568   
383  300534062025520  2023   23  00  52.9586  52.9586  80.95106  -149.72842   

          BP      Ts  
379  1030.70  -17.50  
380  1030.70  -17

### Print the distance that each buoy traveled
It is beneficial to know whether the buoy covered significant distance over the small time frame, as that will influence whether you should expect different data sets to be coincident.

In [10]:
dfg = df.groupby('BuoyID')

for k, v in dfg:
    lat_s = float(v['Lat'].iloc[0])
    lat_f = float(v['Lat'].iloc[-1])
    
    lon_s = float(v['Lon'].iloc[0])
    lon_f = float(v['Lon'].iloc[-1])
    trav_dist = geopy.distance.geodesic((lat_s, lon_s), (lat_f, lon_f)).m
    print(f"{k}: {trav_dist}")


300534061592170: 18512.296488320702
300534062025520: 18951.28387793538
900115: 16366.16271014724
900121: 12752.664877380028
902007: 12213.399609484431


### Run this section to write to csv

In [None]:
# Comment / Uncomment this last line to toggle whether the file gets written
fileName = "Nearest5Buoys.csv"

df.to_csv(fileName)