In [1]:
import os
import requests
from bs4 import BeautifulSoup
import re
import csv
import glob as gl
import tqdm
import pandas as pd
import numpy as np

In [2]:
def prevailing_wind_direction(wind_dir_series):
    rad = np.deg2rad(wind_dir_series)
    sin_mean = np.nanmean(np.sin(rad))
    cos_mean = np.nanmean(np.cos(rad))
    mean_dir_rad = np.arctan2(sin_mean, cos_mean)
    mean_dir_deg = (np.rad2deg(mean_dir_rad)) % 360
    return mean_dir_deg

In [3]:
def download_file(file_url, download_dir):
    """Download the file from the specified URL."""
    local_filename = os.path.join(download_dir, os.path.basename(file_url))
    with requests.get(file_url, stream=True) as r:
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192): 
                f.write(chunk)
    print(f"Downloaded: {local_filename}")
    return os.path.basename(file_url)

def scrape_and_download(url, file_pattern, download_dir):
    """Scrape the website and download files matching the pattern."""
    response = requests.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all links on the webpage and download the matching files
    for link in soup.find_all('a', href=True):
        file_url = link['href']
        if file_pattern.match(file_url):
            # Prepend the base URL if necessary
            if not file_url.startswith('http'):
                file_url = url + file_url
            # Download the file
            download_file(file_url, download_dir)
            
def download_vito(year):
    # Directory where you want to save the downloaded files
    download_dir = '../../Data/manuela'
    if not os.path.exists(download_dir):
        os.makedirs(download_dir)
    
    # Pattern to match the file names
    file_pattern = re.compile(r'mla.*q1h\.txt')

    # Find data online
    url = 'https://amrc.ssec.wisc.edu/data/ftp/pub/aws/q1h/' + str(year) + '/'

    scrape_and_download(url, file_pattern, download_dir)

In [None]:
for year in [i for i in range(2001, 2025)]:
    download_vito(year)

In [4]:
path = '../../Data/manuela/'
# create list of file names
aws_files = gl.glob(path + "/mla*.txt")
# create data frame
columns = ['year', 'doy', 'month', 'day', 'hour', 'temperature', 'pressure', 'wind_speed', 'wind_direction', 'relative_humidity', '?']
content = []
for filename in tqdm.tqdm(aws_files, total=len(aws_files)):
    df = pd.read_csv(filename, names=columns, sep='\s+')
    df.drop(index=df.index[:2], axis=0, inplace=True)
    content.append(df)
vito = pd.concat(content)
# create datetime column and set as index:
vito['DateTime'] = vito['year'].astype(str) + '-' + vito['month'].astype(str) + '-' + vito['day'].astype(str) + ' ' + vito['hour'].astype(str)
vito['DateTime'] = pd.to_datetime(vito['DateTime'],format='%Y-%m-%d %H%M')
vito.index = vito['DateTime']
del vito['DateTime']
# make data into floats
for i in vito.columns:
    vito[i] = vito[i].astype(float)
# replace fill values with nans:
vito.replace(444.0, np.nan, inplace=True)
# resample to daily resolution
del vito['hour']
agg_dict = dict.fromkeys([col for col in vito.columns if col != 'wind_direction'], 'mean')
agg_dict['wind_direction'] = prevailing_wind_direction
vito = vito.resample('1D').agg(agg_dict)

vito

100%|████████████████████████████████████████████████████████████████████████████████| 277/277 [00:08<00:00, 32.38it/s]
  sin_mean = np.nanmean(np.sin(rad))
  cos_mean = np.nanmean(np.cos(rad))


Unnamed: 0_level_0,year,doy,month,day,temperature,pressure,wind_speed,relative_humidity,?,wind_direction
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2001-09-01,2001.0,244.0,9.0,1.0,-22.934783,985.934783,,29.456522,-0.082609,
2001-09-02,2001.0,245.0,9.0,2.0,-23.343478,992.247826,,32.843478,-0.108696,
2001-09-03,2001.0,246.0,9.0,3.0,-27.255000,990.355000,,42.305000,-0.110000,
2001-09-04,2001.0,247.0,9.0,4.0,-21.315000,972.595000,,31.640000,-0.130000,
2001-09-05,2001.0,248.0,9.0,5.0,-25.691304,969.965217,,33.517391,-0.117391,
...,...,...,...,...,...,...,...,...,...,...
2024-12-27,2024.0,362.0,12.0,27.0,-0.408696,979.004167,6.725000,54.437500,,275.874536
2024-12-28,2024.0,363.0,12.0,28.0,-0.808333,979.704167,4.416667,64.000000,,171.327619
2024-12-29,2024.0,364.0,12.0,29.0,-0.450000,981.862500,4.079167,82.750000,,27.179111
2024-12-30,2024.0,365.0,12.0,30.0,-0.737500,975.108333,22.404167,42.945833,,261.221025


In [5]:
# Add years from Wang et al., 2022
t_column_name = 'Temperature(¡æ)'
p_column_name = 'Pressure(hPa)'
ws_column_name = 'Wind Speed(m/s)'
wd_column_name = 'Wind Direction'

TP = pd.read_csv('../../Data/manuela/Manuela_3h.csv', header=0, encoding='latin1')
TP['DateTime'] = pd.to_datetime(dict(year=TP['Year'], month=TP['Month'], day=TP['Day'], hour=TP['Three-hourly observation time(UTC)']))
TP.index = TP['DateTime']
TP = TP[[t_column_name, p_column_name, ws_column_name, wd_column_name]]
TP.rename(columns={t_column_name: 'temperature', p_column_name: 'pressure', ws_column_name: 'wind_speed', wd_column_name: 'wind_direction'}, inplace=True)
agg_dict = dict.fromkeys([col for col in TP.columns if col != 'wind_direction'], 'mean')
agg_dict['wind_direction'] = prevailing_wind_direction
TP = TP.resample('1D').agg(agg_dict)

TP

  sin_mean = np.nanmean(np.sin(rad))
  cos_mean = np.nanmean(np.cos(rad))


Unnamed: 0_level_0,temperature,pressure,wind_speed,wind_direction
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1984-01-01,,,,
1984-01-02,,,,
1984-01-03,,,,
1984-01-04,,,,
1984-01-05,,997.4000,,
...,...,...,...,...
2021-12-27,-1.275000,965.8750,5.125000,231.678026
2021-12-28,-2.133333,966.7750,3.075000,34.130793
2021-12-29,-4.057143,968.9625,3.762500,155.967164
2021-12-30,-5.014286,977.5625,4.700000,243.325751


In [6]:
df = pd.concat([vito, TP], ignore_index=False)
df = df[~df.index.duplicated(keep='first')]
df = df.sort_index()

df

Unnamed: 0_level_0,year,doy,month,day,temperature,pressure,wind_speed,relative_humidity,?,wind_direction
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1984-01-01,,,,,,,,,,
1984-01-02,,,,,,,,,,
1984-01-03,,,,,,,,,,
1984-01-04,,,,,,,,,,
1984-01-05,,,,,,997.400000,,,,
...,...,...,...,...,...,...,...,...,...,...
2024-12-27,2024.0,362.0,12.0,27.0,-0.408696,979.004167,6.725000,54.437500,,275.874536
2024-12-28,2024.0,363.0,12.0,28.0,-0.808333,979.704167,4.416667,64.000000,,171.327619
2024-12-29,2024.0,364.0,12.0,29.0,-0.450000,981.862500,4.079167,82.750000,,27.179111
2024-12-30,2024.0,365.0,12.0,30.0,-0.737500,975.108333,22.404167,42.945833,,261.221025


In [7]:
df.to_csv('../../Data/manuela/manuela.csv')