Remember to 'pip install [package_name]' if these imports are not found

In [1]:
# Imports from selenium, request_futures, requests, bs4, pandas
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from requests_futures.sessions import FuturesSession
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import geopy.distance

# Native python imports
import re
import math
from collections import Counter
from concurrent.futures import as_completed

# Enter Lat /Lon coordinates and Day of Interest
These should be the only inputs needed before executing the rest of the script.

In [2]:
# Defining constants for our current usecase
day_of_interest = 5 # Day of Year (DOY) of February 20, which is our current examination
g_truth_lat, g_truth_lon = 73.3948, -141.6623 # Selecting a coordinate of interest (EX: Center of SAR image)
year = 2024 # Which year we're interested in
num_closest_buoys = None  # How many buoy coordinates you care to have. NONE = ALL found bouys

### Run this section to get references to the buoy data
The buoy data is listed as a table of ids that is nested within an iframe of the website. Because of this iframe, we use selenium to simulate a web-client until the data loads, then grab it using BeautifulSoup.

In [3]:
#####
base_url = f"https://iabp.apl.uw.edu/IABP_Table.html"

options = webdriver.ChromeOptions()
options.add_argument('--headless')
# executable_path param is not needed if you updated PATH
browser = webdriver.Chrome(options=options)

try:
    browser.get(base_url)
    timeout_in_seconds = 10
    WebDriverWait(browser, timeout_in_seconds).until(ec.frame_to_be_available_and_switch_to_it("myframe"))
    html = browser.page_source
    soup = BeautifulSoup(html, "html.parser")
    print("Got the site!")
except TimeoutException:
    print("I give up...")
finally:
    browser.quit()
    
table = soup.find('table')
tablebody = soup.find('tbody')
tablerow = tablebody.find_all('tr')

buoy_ids = []
failed = 0
for row in tablerow:
    buoy_id = row.find('td')
    try:
        buoy_ids.append(buoy_id.get_text())
    except:
        failed += 1
num_buoys = len(buoy_ids)
print(f"Buoys to search through: {num_buoys}")
print(f"Rows without buoy ids: {failed}")

Got the site!
Buoys to search through: 212
Rows without buoy ids: 2


### Run this next section to populate the buoy dictionary with distances
Get all of the data from the different buoys, filter to only those that have record of our time of interest, and then calculate their distances from our point of interest.

In [4]:
# Defining url getter
def get_url(url:str):
    return f"https://iabp.apl.uw.edu/WebData/{url}.dat"

regExStr = f'\d{{5,20}}\s*?{year}(?:\s*?\d{{2}}){{2}}\s*?(?:{day_of_interest}|{day_of_interest + 1}).*'

# Declare our dictionary and counters
distances_dict = {}
num_completed = 0

# Open a session to speed up the time it takes to make and receive all of our requests
with FuturesSession() as session:
    
    # Get the url for each buoy id, and then get the data
    urls = [get_url(id_num) for id_num in buoy_ids]
    futures = [session.get(url) for url in urls]
    
    # Event listener to execute as the requests resolve
    for future in as_completed(futures):
        print(f"{num_completed + 1} / {num_buoys} requests resolved")
        
        # Convert from byte to string format
        response = future.result().content.decode('UTF-8')
        
        # Get the column headers (and removing whitespace) using the split method
        column_headers = response.partition('\n')[0].split()
        
        # Find all rows of buoy data using regex
        daily_data = re.findall(regExStr, response)
        daily_data_len = len(daily_data)
        
        # Iterate through the data per buoy to calculate distance
        if daily_data_len > 0:
            buoy_id = daily_data[0].split()[0]
            buoy_dist = 0
            print(f"Processing {num_completed+1}/{num_buoys} pages, with {daily_data_len} rows.")
            for row in daily_data:
                try:
                    # Calculating avg distance of buoy 
                    lat_index = column_headers.index("Lat")
                    lon_index = column_headers.index("Lon")
                    splits = row.split()
                    lat = splits[lat_index]
                    lon = splits[lon_index]
                    seg_dist = math.dist([float(lat), float(lon)], [g_truth_lat, g_truth_lon])
                    buoy_dist += seg_dist
                except:
                    print("Could not find lat/lon column for ", buoy_id)
                    
            # Set the distance in the dictionary
            avg_dist = buoy_dist / daily_data_len
            distances_dict[buoy_id] = avg_dist
            print(f"Finished processing {buoy_id}. Average Dist = {avg_dist}")        
        else:
            print(f"Request {num_completed + 1} has no matching data.", )
        num_completed += 1
        

1 / 212 requests resolved
Request 1 has no matching data.
2 / 212 requests resolved
Request 2 has no matching data.
3 / 212 requests resolved
Request 3 has no matching data.
4 / 212 requests resolved
Request 4 has no matching data.
5 / 212 requests resolved
Processing 5/212 pages, with 39 rows.
Finished processing 300025010734900. Average Dist = 7.553004714723523
6 / 212 requests resolved
Request 6 has no matching data.
7 / 212 requests resolved
Processing 7/212 pages, with 52 rows.
Finished processing 300125061832760. Average Dist = 7.477765619470949
8 / 212 requests resolved
Request 8 has no matching data.
9 / 212 requests resolved
Request 9 has no matching data.
10 / 212 requests resolved
Processing 10/212 pages, with 17 rows.
Finished processing 300234065171790. Average Dist = 244.6074030368834
11 / 212 requests resolved
Processing 11/212 pages, with 40 rows.
Finished processing 300234063516460. Average Dist = 37.313870902493
12 / 212 requests resolved
Processing 12/212 pages, with

### Run this section to extract the data from the nearest buoys

In [10]:
num_buoys = len(distances_dict.keys()) if num_closest_buoys is None else num_closest_buoys

df_cols = []
data_frame_data = []

closest_buoys = list(Counter(distances_dict).most_common()[-num_buoys:])
closest_buoys.reverse()
for b_id, dist in closest_buoys:
    print(b_id)
    response = get(get_url(b_id)).content.decode('UTF-8')
    daily_data = re.findall(regExStr, response)
    if len(daily_data) > 0:
        column_headers = response.partition('\n')[0].split()
        # Make sure we have all column headers
        for header in column_headers:
            if header not in df_cols:
                df_cols.append(header)
        # Then push in the data so we can later create the df
        for i, row in enumerate(daily_data):
            splits = row.split()
            data_frame_data.append(splits)

df = pd.DataFrame(data_frame_data, columns=df_cols)
print(df.head(5))
print(df.tail(5))

162
162
300234065495020
300534062720770
300234066991420
300534061808320
300125061832760


### Print the distance that each buoy traveled
It is beneficial to know whether the buoy covered significant distance over the small time frame, as that will influence whether you should expect different data sets to be coincident.

In [None]:
dfg = df.groupby('BuoyID')

for k, v in dfg:
    lat_s = float(v['Lat'].iloc[0])
    lat_f = float(v['Lat'].iloc[-1])
    
    lon_s = float(v['Lon'].iloc[0])
    lon_f = float(v['Lon'].iloc[-1])
    trav_dist = geopy.distance.geodesic((lat_s, lon_s), (lat_f, lon_f)).m
    print(f"{k}: {trav_dist}")


### Run this section to write to csv

In [None]:
# Comment / Uncomment this last line to toggle whether the file gets written
fileName = "AllBuoys-1-5-24.csv"

df.to_csv(fileName)