In [1]:
import os
import requests
from bs4 import BeautifulSoup
import re
import csv
import glob as gl
import tqdm
import pandas as pd
import numpy as np

In [2]:
def prevailing_wind_direction(wind_dir_series):
    rad = np.deg2rad(wind_dir_series)
    sin_mean = np.nanmean(np.sin(rad))
    cos_mean = np.nanmean(np.cos(rad))
    mean_dir_rad = np.arctan2(sin_mean, cos_mean)
    mean_dir_deg = (np.rad2deg(mean_dir_rad)) % 360
    return mean_dir_deg

In [3]:
def download_file(file_url, download_dir):
    """Download the file from the specified URL."""
    local_filename = os.path.join(download_dir, os.path.basename(file_url))
    with requests.get(file_url, stream=True) as r:
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192): 
                f.write(chunk)
    print(f"Downloaded: {local_filename}")
    return os.path.basename(file_url)

def scrape_and_download(url, file_pattern, download_dir):
    """Scrape the website and download files matching the pattern."""
    response = requests.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all links on the webpage and download the matching files
    for link in soup.find_all('a', href=True):
        file_url = link['href']
        if file_pattern.match(file_url):
            # Prepend the base URL if necessary
            if not file_url.startswith('http'):
                file_url = url + file_url
            # Download the file
            download_file(file_url, download_dir)
            
def download_vito(year):
    # Directory where you want to save the downloaded files
    download_dir = '../../Data/margaret'
    if not os.path.exists(download_dir):
        os.makedirs(download_dir)
    
    # Pattern to match the file names
    file_pattern = re.compile(r'mgt.*q1h\.txt')

    # Find data online
    url = 'https://amrc.ssec.wisc.edu/data/ftp/pub/aws/q1h/' + str(year) + '/'

    scrape_and_download(url, file_pattern, download_dir)

In [4]:
for year in [2008, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]:
    download_vito(year)

Downloaded: ../../Data/margaret\mgt200811q1h.txt
Downloaded: ../../Data/margaret\mgt200812q1h.txt
Downloaded: ../../Data/margaret\mgt200811q1h.txt
Downloaded: ../../Data/margaret\mgt200812q1h.txt
Downloaded: ../../Data/margaret\mgt200901q1h.txt
Downloaded: ../../Data/margaret\mgt200902q1h.txt
Downloaded: ../../Data/margaret\mgt200903q1h.txt
Downloaded: ../../Data/margaret\mgt200904q1h.txt
Downloaded: ../../Data/margaret\mgt200905q1h.txt
Downloaded: ../../Data/margaret\mgt200906q1h.txt
Downloaded: ../../Data/margaret\mgt200907q1h.txt
Downloaded: ../../Data/margaret\mgt200908q1h.txt
Downloaded: ../../Data/margaret\mgt200909q1h.txt
Downloaded: ../../Data/margaret\mgt200910q1h.txt
Downloaded: ../../Data/margaret\mgt200911q1h.txt
Downloaded: ../../Data/margaret\mgt200912q1h.txt
Downloaded: ../../Data/margaret\mgt201001q1h.txt
Downloaded: ../../Data/margaret\mgt201002q1h.txt
Downloaded: ../../Data/margaret\mgt201003q1h.txt
Downloaded: ../../Data/margaret\mgt201004q1h.txt
Downloaded: ../../Da

In [5]:
path = '../../Data/margaret/'
# create list of file names
aws_files = gl.glob(path + "/mgt*.txt")
# create data frame
columns = ['year', 'doy', 'month', 'day', 'hour', 'temperature', 'pressure', 'wind_speed', 'wind_direction', 'relative_humidity', '?']
content = []
for filename in tqdm.tqdm(aws_files, total=len(aws_files)):
    df = pd.read_csv(filename, names=columns, sep='\s+')
    df.drop(index=df.index[:2], axis=0, inplace=True)
    content.append(df)
vito = pd.concat(content)
# create datetime column and set as index:
vito['DateTime'] = vito['year'].astype(str) + '-' + vito['month'].astype(str) + '-' + vito['day'].astype(str) + ' ' + vito['hour'].astype(str)
vito['DateTime'] = pd.to_datetime(vito['DateTime'],format='%Y-%m-%d %H%M')
vito.index = vito['DateTime']
del vito['DateTime']
# make data into floats
for i in vito.columns:
    vito[i] = vito[i].astype(float)
# replace fill values with nans:
vito.replace(444.0, np.nan, inplace=True)
# resample to daily resolution
del vito['hour']
agg_dict = dict.fromkeys([col for col in vito.columns if col != 'wind_direction'], 'mean')
agg_dict['wind_direction'] = prevailing_wind_direction
vito = vito.resample('1D').agg(agg_dict)

vito

100%|████████████████████████████████████████████████████████████████████████████████| 194/194 [00:05<00:00, 35.97it/s]
  sin_mean = np.nanmean(np.sin(rad))
  cos_mean = np.nanmean(np.cos(rad))


Unnamed: 0_level_0,year,doy,month,day,temperature,pressure,wind_speed,relative_humidity,?,wind_direction
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2008-11-01,2008.0,306.0,11.0,1.0,,,,,,
2008-11-02,2008.0,307.0,11.0,2.0,,,,,,
2008-11-03,2008.0,308.0,11.0,3.0,,,,,,
2008-11-04,2008.0,309.0,11.0,4.0,,,,,,
2008-11-05,2008.0,310.0,11.0,5.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...
2024-12-27,2024.0,362.0,12.0,27.0,-6.939130,982.700000,2.204167,71.583333,,30.863387
2024-12-28,2024.0,363.0,12.0,28.0,-8.225000,982.191667,2.950000,68.083333,,123.146850
2024-12-29,2024.0,364.0,12.0,29.0,-6.395833,978.879167,5.379167,73.166667,,193.031668
2024-12-30,2024.0,365.0,12.0,30.0,-7.337500,974.650000,4.691667,73.458333,,226.385910


In [6]:
vito.to_csv('../../Data/margaret/margaret.csv')