In [1]:
import os
import requests
from bs4 import BeautifulSoup
import re
import csv
import glob as gl
import tqdm
import pandas as pd
import numpy as np

In [2]:
def prevailing_wind_direction(wind_dir_series):
    """Return the circular (vector) mean of a set of wind directions.

    Every direction is treated as a unit vector: the sine and cosine
    components are averaged separately (NaN entries are ignored) and the
    angle of the resulting mean vector is wrapped with ``% 360``.

    Parameters
    ----------
    wind_dir_series : array-like of float
        Wind directions in degrees (e.g. one resample bin of a Series).

    Returns
    -------
    float
        Mean direction in degrees, or NaN when the input is empty or holds
        only NaN values.
    """
    rad = np.deg2rad(np.asarray(wind_dir_series, dtype=float))
    # Bail out before np.nanmean sees an empty/all-NaN slice: the result is
    # NaN either way, but nanmean would emit a "Mean of empty slice"
    # RuntimeWarning for every such day.
    if rad.size == 0 or np.isnan(rad).all():
        return np.nan
    sin_mean = np.nanmean(np.sin(rad))
    cos_mean = np.nanmean(np.cos(rad))
    # arctan2 yields an angle in (-180, 180] degrees; the modulo wraps
    # negative angles back onto the compass scale.
    mean_dir_rad = np.arctan2(sin_mean, cos_mean)
    mean_dir_deg = (np.rad2deg(mean_dir_rad)) % 360
    return mean_dir_deg

In [3]:
def download_file(file_url, download_dir, timeout=60):
    """Download the file from the specified URL into ``download_dir``.

    The response is streamed to ``<name>.part`` and only renamed to its final
    name once the transfer has completed, so an interrupted download never
    leaves a truncated file under the final name.

    Parameters
    ----------
    file_url : str
        URL of the file; its basename is used as the local file name.
    download_dir : str
        Existing directory the file is written to.
    timeout : float or tuple, optional
        Timeout in seconds handed to ``requests.get``. Without one a stalled
        server would block the call indefinitely.

    Returns
    -------
    str
        The basename of the downloaded file.

    Raises
    ------
    requests.exceptions.RequestException
        On connection problems, timeouts or HTTP error statuses.
    OSError
        If the file cannot be written.
    """
    file_name = os.path.basename(file_url)
    local_filename = os.path.join(download_dir, file_name)
    partial_filename = local_filename + '.part'
    try:
        with requests.get(file_url, stream=True, timeout=timeout) as r:
            r.raise_for_status()
            with open(partial_filename, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        # Only a fully written file is moved to its final name.
        os.replace(partial_filename, local_filename)
    finally:
        # After a successful os.replace the partial file is gone; anything
        # still present here is the remainder of a failed transfer.
        if os.path.exists(partial_filename):
            os.remove(partial_filename)
    print(f"Downloaded: {local_filename}")
    return file_name

def scrape_and_download(url, file_pattern, download_dir):
    """Scrape the website and download files matching the pattern.

    Parameters
    ----------
    url : str
        Address of the index page whose links are inspected. It should end
        with ``/`` so that relative links resolve inside that directory.
    file_pattern : re.Pattern
        Compiled pattern; a link is downloaded when ``file_pattern.match``
        succeeds on its raw ``href`` value.
    download_dir : str
        Directory handed on to ``download_file``.

    Raises
    ------
    requests.exceptions.RequestException
        If the index page (or any matching file) cannot be retrieved.
    """
    from urllib.parse import urljoin

    # A timeout keeps a stalled server from blocking the notebook forever.
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all links on the webpage and download the matching files
    for link in soup.find_all('a', href=True):
        href = link['href']
        if not file_pattern.match(href):
            continue
        # urljoin leaves absolute URLs untouched and resolves relative and
        # root-relative hrefs correctly, unlike plain string concatenation.
        download_file(urljoin(url, href), download_dir)
            
def download_vito(year, download_dir='../../Data/ferrell'):
    """Download one year of hourly ``fer*q1h.txt`` files from the AMRC server.

    Parameters
    ----------
    year : int or str
        Year whose directory listing is scraped.
    download_dir : str, optional
        Directory the files are saved to; created if it does not exist.
        Defaults to the location the rest of the notebook reads from.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(download_dir, exist_ok=True)

    # Pattern to match the file names (applied to the start of each link)
    file_pattern = re.compile(r'fer.*q1h\.txt')

    # Directory listing for the requested year
    url = f'https://amrc.ssec.wisc.edu/data/ftp/pub/aws/q1h/{year}/'

    scrape_and_download(url, file_pattern, download_dir)

In [None]:
# Download every year from 2001 through 2024 exactly once (the previous
# hand-written list contained 2008 twice, so that year was fetched twice).
for year in range(2001, 2025):
    download_vito(year)

In [4]:
path = '../../Data/ferrell/'
# create list of file names (sorted so the load order is reproducible)
aws_files = sorted(gl.glob(os.path.join(path, "fer*.txt")))
if not aws_files:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError(f"No 'fer*.txt' files found in {path!r}; run the download cell first.")
# create data frame
columns = ['year', 'doy', 'month', 'day', 'hour', 'temperature', 'pressure', 'wind_speed', 'wind_direction', 'relative_humidity', '?']
content = []
for filename in tqdm.tqdm(aws_files, total=len(aws_files)):
    # Raw string: '\s' inside a plain string literal is an invalid escape sequence.
    df = pd.read_csv(filename, names=columns, sep=r'\s+')
    # The first two parsed rows are discarded, leaving only data rows.
    df = df.drop(index=df.index[:2])
    content.append(df)
# The per-file row labels carry no meaning (the index is replaced below), so
# they are discarded instead of producing duplicate labels.
vito = pd.concat(content, ignore_index=True)
# create datetime column and set as index:
timestamp_text = (
    vito['year'].astype(str) + '-' + vito['month'].astype(str) + '-'
    + vito['day'].astype(str) + ' ' + vito['hour'].astype(str)
)
vito['DateTime'] = pd.to_datetime(timestamp_text, format='%Y-%m-%d %H%M')
vito = vito.set_index('DateTime')
# make data into floats
vito = vito.astype(float)
# replace fill values with nans:
vito = vito.replace(444.0, np.nan)
# resample to daily resolution: arithmetic mean for every column except the
# wind direction, which needs a circular mean
vito = vito.drop(columns='hour')
agg_dict = dict.fromkeys([col for col in vito.columns if col != 'wind_direction'], 'mean')
agg_dict['wind_direction'] = prevailing_wind_direction
vito = vito.resample('1D').agg(agg_dict)

vito

100%|████████████████████████████████████████████████████████████████████████████████| 276/276 [00:08<00:00, 31.17it/s]
  sin_mean = np.nanmean(np.sin(rad))
  cos_mean = np.nanmean(np.cos(rad))


Unnamed: 0_level_0,year,doy,month,day,temperature,pressure,wind_speed,relative_humidity,?,wind_direction
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2001-09-01,2001.0,244.0,9.0,1.0,-41.854167,993.595833,1.262500,68.500000,,276.908327
2001-09-02,2001.0,245.0,9.0,2.0,-41.956522,999.739130,0.343478,67.891304,,346.576150
2001-09-03,2001.0,246.0,9.0,3.0,-38.147368,991.431579,3.400000,72.673684,,276.852108
2001-09-04,2001.0,247.0,9.0,4.0,-37.495000,977.000000,2.600000,73.610000,,265.567838
2001-09-05,2001.0,248.0,9.0,5.0,-37.520833,974.154167,4.975000,73.475000,,207.800389
...,...,...,...,...,...,...,...,...,...,...
2024-12-27,2024.0,362.0,12.0,27.0,-5.263636,982.941667,,88.550000,,
2024-12-28,2024.0,363.0,12.0,28.0,-6.208333,985.737500,,87.783333,,
2024-12-29,2024.0,364.0,12.0,29.0,-5.016667,986.441667,,87.991667,,
2024-12-30,2024.0,365.0,12.0,30.0,-4.329167,980.454167,,86.850000,,


In [5]:
# Bring in the three-hourly record published by Wang et al., 2022
t_column_name = 'Temperature(¡æ)'
p_column_name = 'Pressure(hPa)'
ws_column_name = 'Wind Speed(m/s)'
wd_column_name = 'Wind Direction'

# Source column -> name used by the hourly data frame
renamed_columns = {
    t_column_name: 'temperature',
    p_column_name: 'pressure',
    ws_column_name: 'wind_speed',
    wd_column_name: 'wind_direction',
}

TP = pd.read_csv('../../Data/ferrell/Ferrell_3h.csv', header=0, encoding='latin1')
# Assemble one timestamp per row from the separate date/time columns.
timestamps = pd.to_datetime({
    'year': TP['Year'],
    'month': TP['Month'],
    'day': TP['Day'],
    'hour': TP['Three-hourly observation time(UTC)'],
})
# Index by time, keep only the four measurement columns and rename them.
TP = (
    TP.assign(DateTime=timestamps)
      .set_index('DateTime')[list(renamed_columns)]
      .rename(columns=renamed_columns)
)
# Daily aggregation: plain mean everywhere except the wind direction,
# which is averaged as a direction.
agg_dict = {name: 'mean' for name in TP.columns if name != 'wind_direction'}
agg_dict['wind_direction'] = prevailing_wind_direction
TP = TP.resample('1D').agg(agg_dict)

TP

  sin_mean = np.nanmean(np.sin(rad))
  cos_mean = np.nanmean(np.cos(rad))


Unnamed: 0_level_0,temperature,pressure,wind_speed,wind_direction
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1980-12-01,,,,
1980-12-02,,,,
1980-12-03,,,,
1980-12-04,,,,
1980-12-05,,,,
...,...,...,...,...
2021-12-27,-7.942857,971.7875,5.750,183.794445
2021-12-28,-6.875000,971.3625,7.725,202.469713
2021-12-29,-6.675000,973.5125,8.075,179.447430
2021-12-30,-7.575000,980.6875,7.375,205.165312


In [6]:
# Merge the two daily records. Values from `vito` take precedence; wherever
# `vito` holds NaN (e.g. the placeholder rows that resample('1D') creates for
# days without any hourly observation) the value from `TP` is used instead.
# Dropping duplicated dates with keep='first' would let those NaN rows
# shadow valid `TP` data for the same day.
merged_columns = list(vito.columns) + [col for col in TP.columns if col not in vito.columns]
df = vito.combine_first(TP).reindex(columns=merged_columns)
df = df.sort_index()

df

Unnamed: 0_level_0,year,doy,month,day,temperature,pressure,wind_speed,relative_humidity,?,wind_direction
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1980-12-01,,,,,,,,,,
1980-12-02,,,,,,,,,,
1980-12-03,,,,,,,,,,
1980-12-04,,,,,,,,,,
1980-12-05,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...
2024-12-27,2024.0,362.0,12.0,27.0,-5.263636,982.941667,,88.550000,,
2024-12-28,2024.0,363.0,12.0,28.0,-6.208333,985.737500,,87.783333,,
2024-12-29,2024.0,364.0,12.0,29.0,-5.016667,986.441667,,87.991667,,
2024-12-30,2024.0,365.0,12.0,30.0,-4.329167,980.454167,,86.850000,,


In [7]:
# Write the combined daily record to disk next to the input files.
output_csv = '../../Data/ferrell/ferrell.csv'
df.to_csv(output_csv)