In [1]:
import os
import requests
from bs4 import BeautifulSoup
import re
import csv
import glob as gl
import tqdm
import pandas as pd
import numpy as np

In [2]:
def prevailing_wind_direction(wind_dir_series):
    """Return the circular (vector) mean of wind directions, in degrees.

    Each direction is treated as a unit vector; the sine and cosine
    components are averaged separately (ignoring NaNs) and converted back to
    an angle, so that e.g. 350 and 10 degrees average to north rather than
    to 180 degrees. Wind speed is not used as a weight.

    Parameters
    ----------
    wind_dir_series : array-like of float
        Wind directions in degrees. NaN entries are ignored.

    Returns
    -------
    float
        Mean direction in degrees, wrapped with ``% 360``, or NaN when the
        input is empty or contains only NaNs.
    """
    rad = np.deg2rad(np.asarray(wind_dir_series, dtype=float))
    # Guard the no-data case explicitly: np.nanmean would still return NaN,
    # but only after emitting a "Mean of empty slice" RuntimeWarning for
    # every empty resampling bin.
    if np.isnan(rad).all():
        return np.nan
    sin_mean = np.nanmean(np.sin(rad))
    cos_mean = np.nanmean(np.cos(rad))
    mean_dir_rad = np.arctan2(sin_mean, cos_mean)
    # arctan2 yields (-180, 180]; the modulo maps it onto compass bearings.
    mean_dir_deg = np.rad2deg(mean_dir_rad) % 360
    return mean_dir_deg

In [3]:
def download_file(file_url, download_dir, timeout=60):
    """Download the file from the specified URL.

    The response is streamed to ``<name>.part`` inside ``download_dir`` and
    renamed to its final name only after the whole body has been written, so
    an interrupted transfer never leaves a truncated file under the final
    name.

    Parameters
    ----------
    file_url : str
        URL of the file; its basename is used as the local file name.
    download_dir : str
        Existing directory the file is written into.
    timeout : float or tuple, optional
        Passed to ``requests.get``; without it a stalled server would block
        the call indefinitely.

    Returns
    -------
    str
        The basename of ``file_url`` (not the full local path).

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    filename = os.path.basename(file_url)
    local_filename = os.path.join(download_dir, filename)
    partial_filename = local_filename + '.part'
    try:
        with requests.get(file_url, stream=True, timeout=timeout) as r:
            r.raise_for_status()
            with open(partial_filename, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        # Atomic on the same filesystem; overwrites a previous download.
        os.replace(partial_filename, local_filename)
    except BaseException:
        # Also covers KeyboardInterrupt: remove the incomplete file, then re-raise.
        if os.path.exists(partial_filename):
            os.remove(partial_filename)
        raise
    print(f"Downloaded: {local_filename}")
    return filename

def scrape_and_download(url, file_pattern, download_dir):
    """Scrape the website and download files matching the pattern.

    Parameters
    ----------
    url : str
        Page whose ``<a href=...>`` links are inspected.
    file_pattern : re.Pattern
        Compiled pattern; a link is downloaded when ``file_pattern.match``
        succeeds on its raw ``href`` value (i.e. anchored at the start).
    download_dir : str
        Directory passed through to ``download_file``.

    Raises
    ------
    requests.HTTPError
        If the listing page (or any file) returns an error status.
    """
    from urllib.parse import urljoin

    response = requests.get(url, timeout=60)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all links on the webpage and download the matching files
    for link in soup.find_all('a', href=True):
        href = link['href']
        if file_pattern.match(href):
            # Resolve the link against the final (post-redirect) page URL.
            # urljoin leaves absolute links untouched and handles relative
            # and root-relative ones, also when the base URL has no
            # trailing slash.
            file_url = urljoin(response.url, href)
            download_file(file_url, download_dir)
            
def download_vito(year, download_dir='../../Data/capebird'):
    """Download one year of ``cbd*q1h.txt`` files from the AMRC q1h listing.

    Parameters
    ----------
    year : int or str
        Year sub-directory of the remote ``q1h`` listing to scrape.
    download_dir : str, optional
        Directory where the files are saved; created if it does not exist.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(download_dir, exist_ok=True)

    # Pattern to match the file names (applied to each link's href)
    file_pattern = re.compile(r'cbd.*q1h\.txt')

    # Directory listing for the requested year
    url = f'https://amrc.ssec.wisc.edu/data/ftp/pub/aws/q1h/{year}/'

    scrape_and_download(url, file_pattern, download_dir)

In [4]:
# Fetch every year from 2001 through 2024 exactly once (the end of range() is exclusive).
for year in range(2001, 2025):
    download_vito(year)

Downloaded: ../../Data/capebird\cbd200109q1h.txt
Downloaded: ../../Data/capebird\cbd200110q1h.txt
Downloaded: ../../Data/capebird\cbd200111q1h.txt
Downloaded: ../../Data/capebird\cbd200112q1h.txt
Downloaded: ../../Data/capebird\cbd200201q1h.txt
Downloaded: ../../Data/capebird\cbd200202q1h.txt
Downloaded: ../../Data/capebird\cbd200203q1h.txt
Downloaded: ../../Data/capebird\cbd200204q1h.txt
Downloaded: ../../Data/capebird\cbd200205q1h.txt
Downloaded: ../../Data/capebird\cbd200206q1h.txt
Downloaded: ../../Data/capebird\cbd200207q1h.txt
Downloaded: ../../Data/capebird\cbd200208q1h.txt
Downloaded: ../../Data/capebird\cbd200209q1h.txt
Downloaded: ../../Data/capebird\cbd200210q1h.txt
Downloaded: ../../Data/capebird\cbd200211q1h.txt
Downloaded: ../../Data/capebird\cbd200212q1h.txt
Downloaded: ../../Data/capebird\cbd200301q1h.txt
Downloaded: ../../Data/capebird\cbd200302q1h.txt
Downloaded: ../../Data/capebird\cbd200303q1h.txt
Downloaded: ../../Data/capebird\cbd200304q1h.txt
Downloaded: ../../Da

In [5]:
path = '../../Data/capebird/'
# Create the list of file names; sorted because glob order depends on the
# filesystem and would otherwise make the concatenation order non-deterministic.
aws_files = sorted(gl.glob(os.path.join(path, "cbd*.txt")))
if not aws_files:
    raise FileNotFoundError(f"No cbd*.txt files found in {path!r}; run the download cell first.")

# Column names applied to every file. The last column is kept under its
# placeholder name '?' so the exported CSV header stays unchanged.
columns = ['year', 'doy', 'month', 'day', 'hour', 'temperature', 'pressure', 'wind_speed', 'wind_direction', 'relative_humidity', '?']
content = []
for filename in tqdm.tqdm(aws_files, total=len(aws_files)):
    # Raw string: '\s' inside a normal string literal is an invalid escape sequence.
    df = pd.read_csv(filename, names=columns, sep=r'\s+')
    # Discard the first two parsed rows of each file
    # (presumably the station header lines - TODO confirm against a raw file).
    content.append(df.drop(index=df.index[:2]))
vito = pd.concat(content)

# Create the datetime index from the date parts and the HHMM-formatted 'hour' field.
vito['DateTime'] = pd.to_datetime(
    vito['year'].astype(str) + '-' + vito['month'].astype(str) + '-'
    + vito['day'].astype(str) + ' ' + vito['hour'].astype(str),
    format='%Y-%m-%d %H%M',
)
vito = vito.set_index('DateTime')

# Make data into floats, then replace fill values (444.0) with NaNs.
vito = vito.astype(float)
vito = vito.replace(444.0, np.nan)

# Resample to daily resolution: arithmetic mean for every column except
# wind_direction, which needs a circular mean.
vito = vito.drop(columns='hour')
agg_dict = dict.fromkeys([col for col in vito.columns if col != 'wind_direction'], 'mean')
agg_dict['wind_direction'] = prevailing_wind_direction
vito = vito.resample('1D').agg(agg_dict)

vito

100%|████████████████████████████████████████████████████████████████████████████████| 276/276 [00:08<00:00, 31.89it/s]
  sin_mean = np.nanmean(np.sin(rad))
  cos_mean = np.nanmean(np.cos(rad))


Unnamed: 0_level_0,year,doy,month,day,temperature,pressure,wind_speed,relative_humidity,?,wind_direction
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2001-09-01,2001.0,244.0,9.0,1.0,-29.100000,996.429167,1.233333,68.633333,,61.538269
2001-09-02,2001.0,245.0,9.0,2.0,-32.122727,1002.050000,1.772727,71.581818,,198.594071
2001-09-03,2001.0,246.0,9.0,3.0,-28.838889,996.100000,1.633333,62.533333,,165.214198
2001-09-04,2001.0,247.0,9.0,4.0,-23.875000,979.960000,2.090000,46.780000,,39.954682
2001-09-05,2001.0,248.0,9.0,5.0,-27.855000,976.675000,3.605000,54.060000,,213.536965
...,...,...,...,...,...,...,...,...,...,...
2024-12-27,2024.0,362.0,12.0,27.0,-2.845833,983.720833,4.004167,95.089474,,272.974598
2024-12-28,2024.0,363.0,12.0,28.0,-3.654167,986.000000,4.033333,91.877273,,25.381716
2024-12-29,2024.0,364.0,12.0,29.0,-3.666667,987.208333,2.995833,94.460870,,7.521921
2024-12-30,2024.0,365.0,12.0,30.0,-2.212500,980.312500,6.433333,74.279167,,233.514647


In [6]:
# Write the daily-resampled frame (DateTime index included) to disk.
output_csv = '../../Data/capebird/capebird.csv'
vito.to_csv(output_csv)