# MADIS Download

In [2]:
from dateutil import rrule
from datetime import datetime
import multiprocessing
from pathlib import Path
import xarray as xr
import warnings
import pandas as pd
import tqdm
from IPython.display import clear_output

In [3]:
# Root of the project data tree; raw MADIS downloads are stored in <root>/madis/raw.
dataPath = Path('/Users/jonathangiezendanner/Documents/MIT/Projects/WindDataNE-US')
targetFolder = dataPath.joinpath('madis', 'raw')
# Create the download folder (and any missing parents) up front; no-op if it already exists.
targetFolder.mkdir(exist_ok=True, parents=True)

In [5]:
def mesonetUrl(year, month, day, hour):
    """Return the archive URL of the hourly LDAD mesonet netCDF file.

    ``month``, ``day`` and ``hour`` are zero-padded to two digits in the URL;
    the file name has the form ``<year><month><day>_<hour>00.gz``.
    """
    url = f'https://madis-data.ncep.noaa.gov/madisPublic1/data/archive/{year}/{month:02d}/{day:02d}/LDAD/mesonet/netCDF/{year}{month:02d}{day:02d}_{hour:02d}00.gz'
    return url

In [6]:
# Download window and step size.
startDate = '2019-01-01'
endDate = '2024-01-01'
dt = rrule.HOURLY

# rrule includes the `until` timestamp itself, so the final element is dropped
# to obtain the half-open range [startDate, endDate) in hourly steps.
dates = list(
    rrule.rrule(
        dt,
        dtstart=datetime.strptime(startDate, '%Y-%m-%d'),
        until=datetime.strptime(endDate, '%Y-%m-%d'),
    )
)
dates = dates[:-1]

In [8]:
def getDataForDate(date):
    """Download the hourly MADIS mesonet archive file for ``date``.

    The file is fetched with ``wget`` from ``mesonetUrl(...)`` and stored as
    ``targetFolder/<year>/<month>/mesonet/<YYYYMMDD>_<HH>00.gz``. A date whose
    file already exists and is non-empty is skipped. Download failures are
    printed and swallowed (never raised), so a single bad hour does not abort
    a pool-wide ``map`` over many dates.

    Parameters
    ----------
    date : datetime.datetime
        Hour to download; only ``year``, ``month``, ``day`` and ``hour`` are read.

    Returns
    -------
    None
    """
    # Local import keeps this cell self-contained regardless of cell execution order.
    import subprocess

    year = date.year
    month = date.month
    day = date.day
    hour = date.hour
    url = mesonetUrl(year, month, day, hour)
    outputFile = targetFolder/f'{year}/{month}/mesonet/{year}{month:02d}{day:02d}_{hour:02d}00.gz'
    # `wget -O` creates the output file even when the download fails, so an
    # empty file left by an earlier run is treated as "not downloaded yet".
    if outputFile.exists() and outputFile.stat().st_size > 0:
        return

    outputFile.parent.mkdir(parents=True, exist_ok=True)
    # Download under a temporary name and rename on success, so an interrupted
    # or failed transfer is never mistaken for a finished file.
    partFile = outputFile.with_name(outputFile.name + '.part')

    try:
        # List-form arguments (no shell); check=True turns a non-zero wget
        # exit status into CalledProcessError.
        subprocess.run(
            ['wget', url, '-O', partFile.as_posix(), '--no-check-certificate'],
            check=True,
        )
        partFile.replace(outputFile)
    except (subprocess.CalledProcessError, OSError) as err:
        # OSError also covers a missing `wget` executable.
        if partFile.exists():
            partFile.unlink()
        print('ERRRORR')
        print(date)
        print(err)
        print('ERRRORR')
        return

In [9]:
# Fan the hourly downloads out over 128 worker processes. getDataForDate
# returns nothing useful, so the list produced by map() is discarded.
with multiprocessing.Pool(processes=128) as pool:
    pool.map(getDataForDate, dates)

In [None]:
# for date in dates:
#     getDataForDate(date)

# Concatenate MADIS data by Month

In [3]:
# Variables kept from each hourly mesonet file, in output column order.
# NOTE(review): the '*DD' names look like per-variable QC descriptor fields — confirm.
var_list = [
    'stationId',
    'reportTime',
    'latitude',
    'longitude',
    'elevation',
    'dewpoint',
    'dewpointDD',
    'temperature',
    'temperatureDD',
    'windSpeed',
    'windSpeedDD',
    'windDir',
    'windDirDD',
    'relHumidity',
    'relHumidityDD',
    'solarRadiation',
]

In [None]:
def inputPath(year, month):
    """Return the folder holding the raw hourly mesonet files of one month.

    Uses the same layout the download step writes to:
    ``targetFolder/<year>/<month>/mesonet/`` (``month`` is not zero-padded).
    Reading from ``dataPath`` directly would look in a folder the download
    never populates.
    """
    return targetFolder / f'{year}/{month}/mesonet/'

In [5]:
def targetPath(year, month):
    """Return the output folder for the monthly files of ``year``.

    ``month`` is accepted for symmetry with the input-path helper but is not
    used: all months of a year share one folder.
    """
    # NOTE(review): this resolves next to dataPath (dataPath.parent), i.e. outside
    # the folder dataPath points at — confirm this is the intended location.
    monthlyRoot = dataPath.parent / 'raw_monthly' / 'mesonet'
    return monthlyRoot / str(year)

In [6]:
def processOnOpening(d):
    """Reduce a freshly opened dataset to the variables named in ``var_list``.

    Each selected variable is rebuilt as a plain ``xr.Variable`` with its
    original dimensions and a ``'chunks'`` encoding equal to its full shape.
    The returned dataset is constructed from these variables only; no
    coordinates or attributes are passed along to it.
    """
    selected = d[var_list]
    rebuilt = {}
    for name in var_list:
        source = selected.data_vars[name]
        rebuilt[name] = xr.Variable(
            dims=source.dims,
            data=source,
            encoding={'chunks': source.shape},
        )
    return xr.Dataset(data_vars=rebuilt)

In [7]:
def processFile(file):
    """Load one hourly mesonet file into a flat pandas DataFrame.

    The file is opened through ``processOnOpening`` (which keeps only the
    variables in ``var_list``), converted to a DataFrame with a fresh
    0..n-1 index, and ``reportTime`` is replaced by separate ``year``,
    ``month``, ``day``, ``hour`` and ``minute`` columns placed first.

    Parameters
    ----------
    file : path-like
        Path of a single downloaded ``.gz`` netCDF file.

    Returns
    -------
    pandas.DataFrame
        One row per report, without the ``reportTime`` column.
    """
    with warnings.catch_warnings():
        # All warnings raised while opening/converting are silenced on purpose.
        warnings.simplefilter("ignore")
        # The context manager closes the underlying file handle(s) as soon as
        # the data has been materialised in pandas; without it every processed
        # file stays open for the lifetime of the worker process.
        with xr.open_mfdataset(file, preprocess=processOnOpening) as dataset:
            data = dataset.to_pandas().reset_index(drop=True)
        # insert(0, ...) prepends, so inserting minute -> year leaves the
        # columns ordered year, month, day, hour, minute.
        data.insert(0, 'minute', data.reportTime.dt.minute.values)
        data.insert(0, 'hour', data.reportTime.dt.hour.values)
        data.insert(0, 'day', data.reportTime.dt.day.values)
        data.insert(0, 'month', data.reportTime.dt.month.values)
        data.insert(0, 'year', data.reportTime.dt.year.values)
        data = data.drop(columns='reportTime')
    return data

In [None]:
# Merge the hourly files of every month into one netCDF file per month.
for year in range(2019, 2024):
    for month in range(1, 13):

        outPath = targetPath(year, month)/(f'{month}.nc')
        # Months that were already written are not recomputed.
        if outPath.exists():
            continue

        datafiles = list(inputPath(year, month).glob('*.gz'))
        if not datafiles:
            # pd.concat raises ValueError on an empty list; a month without
            # downloaded files is skipped instead of aborting the whole run.
            print(f'No input files found for {year}-{month:02d}, skipping')
            continue

        # Convert the hourly files in parallel; completion order is arbitrary
        # (imap_unordered), tqdm only reports progress.
        with multiprocessing.Pool(32) as pool:
            frames = list(tqdm.tqdm(pool.imap_unordered(processFile, datafiles), total=len(datafiles)))
        out = pd.concat(frames).to_xarray()

        outPath.parent.mkdir(parents=True, exist_ok=True)
        out.to_netcdf(outPath)

    # Drop the accumulated progress bars once a full year is done.
    clear_output(wait=False)