In [1]:
!pip install beautifulsoup4 requests PyYAML


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.10 -m pip install --upgrade pip[0m


In [24]:
import os
import requests
import yaml
import re
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from datetime import datetime

def load_existing_data(filename):
    """Load previously saved records from a YAML file.

    Args:
        filename: Path of the YAML file to read.

    Returns:
        The parsed YAML content. An empty list is returned when the file does
        not exist, or when it parses to an empty/falsy value (for example a
        zero-byte file).
    """
    if not os.path.exists(filename):
        return []

    with open(filename, 'r') as f:
        # yaml.safe_load() yields None for an empty document; callers iterate
        # over the result, so hand back the same empty list a missing file gives.
        return yaml.safe_load(f) or []

def _normalize_date(raw_date):
    """Return ``raw_date`` with a space before each digit run and no extra spaces.

    Inserts a space between a non-digit and a following digit (so a missing
    space between month and day is restored), then collapses whitespace runs.
    """
    spaced = re.sub(r'(\D)(\d)', r'\1 \2', raw_date.strip())
    return ' '.join(spaced.split())


def _fetch_content_length(file_url, timeout):
    """Return the Content-Length reported for ``file_url`` as an int, or None.

    None is returned when the HEAD response is not successful, or when the
    header is missing or not a plain number.
    """
    # requests.head() does not follow redirects unless asked to; without this
    # the header could describe the redirect response instead of the file.
    head = requests.head(file_url, allow_redirects=True, timeout=timeout)
    reported = head.headers.get('content-length')
    if head.ok and reported and reported.strip().isdigit():
        return int(reported)
    return None


def download_and_parse(url, existing_data, timeout=30):
    """Scrape the tables on ``url`` and merge new entries with ``existing_data``.

    Each table is expected to carry a ``<th>`` in its first row (used as the
    ``season`` value) followed by rows whose first ``<td>`` holds the date and
    whose second ``<td>`` holds a link to the file. Tables or rows that do not
    match that layout are skipped.

    Args:
        url: Address of the page to scrape.
        existing_data: Previously saved records (dicts with at least ``url``
            and ``date`` keys), or None. The dicts are copied, not modified.
        timeout: Seconds to wait for each HTTP request.

    Returns:
        A list of record dicts (``title``, ``date``, ``length``, ``url``,
        ``season``, ``episode``) sorted newest first. ``episode`` is the
        1-based position in that order, so the newest record is episode 1 and
        numbers shift whenever newer records are added.

    Raises:
        requests.HTTPError: If the page request returns an error status.
        ValueError: If a record's date does not match ``'%B %d, %Y'``.
    """
    # Shallow-copy the existing records so renumbering episodes below does not
    # mutate the caller's objects; tolerate None for "nothing saved yet".
    existing = [dict(item) for item in (existing_data or [])]
    known_urls = {item['url'] for item in existing}

    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # do not parse an error page as if it were data
    soup = BeautifulSoup(response.text, 'html.parser')

    # Get the scheme and network location from the URL
    parsed_url = urlparse(url)
    base_url = parsed_url.scheme + '://' + parsed_url.netloc

    data = []
    for table in soup.find_all('table'):
        rows = table.find_all('tr')
        header = rows[0].find('th') if rows else None
        if header is None:
            continue  # no header cell in the first row: not a table we can read
        year = header.text.strip()

        for row in rows[1:]:  # skip the header row
            cols = row.find_all('td')
            link = cols[1].find('a') if len(cols) >= 2 else None
            href = link.get('href') if link is not None else None
            if not href:
                continue  # spacer row or row without a file link

            date = _normalize_date(cols[0].text)
            mp3_link = urljoin(base_url, href)

            # Skip if this item already exists in the data (or earlier on the page)
            if mp3_link in known_urls:
                print(f"Skipping {mp3_link}...")
                continue
            known_urls.add(mp3_link)

            print(f"Getting file length for {mp3_link}...")
            data.append({
                'title': date,
                'date': date,
                'length': _fetch_content_length(mp3_link, timeout),
                'url': mp3_link,
                'season': year,
            })

    # Combine with existing data
    data.extend(existing)

    # Sort by date, newest first
    data.sort(key=lambda d: datetime.strptime(d['date'], '%B %d, %Y'), reverse=True)

    # Calculate episode numbers
    for i, item in enumerate(data, start=1):
        item['episode'] = i

    return data

def save_as_yaml(data, filename):
    """Write ``data`` to ``filename`` as YAML, overwriting any existing file.

    Args:
        data: The object to serialise.
        filename: Path of the file to (re)create.
    """
    with open(filename, mode='w') as yaml_file:
        yaml.dump(data, stream=yaml_file)

In [25]:
# YAML file holding the archive records; read first, then overwritten below.
filename = '../_data/arrl_15_wpm_code_archive.yaml'
# Page whose tables are scraped for dated MP3 links.
url = 'https://www.arrl.org/15-wpm-code-archive'
# Records saved by a previous run (empty list if the file does not exist yet).
existing_data = load_existing_data(filename)
# Scrape the page, skip URLs already present in existing_data, merge, sort
# newest first and renumber episodes.
data = download_and_parse(url, existing_data)
# Persist the merged list back to the same file.
save_as_yaml(data, filename)

Getting file length for https://www.arrl.org/files/file/Morse/Archive/15%20WPM/240110_15WPM.mp3...
Getting file length for https://www.arrl.org/files/file/Morse/Archive/15%20WPM/240124_15WPM.mp3...
Getting file length for https://www.arrl.org/files/file/Morse/Archive/15%20WPM/240207_15WPM.mp3...
Getting file length for https://www.arrl.org/files/file/Morse/Archive/15%20WPM/240221_15WPM.mp3...
Getting file length for https://www.arrl.org/files/file/Morse/Archive/15%20WPM/240306_15WPM.mp3...
Getting file length for https://www.arrl.org/files/file/Morse/Archive/15%20WPM/240320_15WPM.mp3...
Getting file length for https://www.arrl.org/files/file/Morse/Archive/15%20WPM/230111_15WPM.mp3...
Getting file length for https://www.arrl.org/files/file/Morse/Archive/15%20WPM/230125_15WPM.mp3...
Getting file length for https://www.arrl.org/files/file/Morse/Archive/15%20WPM/230208_15WPM.mp3...
Getting file length for https://www.arrl.org/files/file/Morse/Archive/15%20WPM/230222_15WPM.mp3...
Getting fi