In [1]:
import pandas as pd
import numpy as np
import awswrangler as wr

In [23]:
# this cell tagged as 'parameters'
min_year = 1995
max_year = 2024
station_id = 42039
write_location = ''

In [29]:
columns = [
    'year',
    'month',
    'day',
    'hour',
    'minute',
    'wind_dir',
    'wind_spd',
    'gust',
    'wave_height',
    'dominant_period',
    'avg_period',
    'wave_direction',
    'pressure',
    'air_temp',
    'water_temp',
    'dew_point',
    'visibility',
    'tide'
]

columns_no_minutes = [
    'year',
    'month',
    'day',
    'hour',
    'wind_dir',
    'wind_spd',
    'gust',
    'wave_height',
    'dominant_period',
    'avg_period',
    'wave_direction',
    'pressure',
    'air_temp',
    'water_temp',
    'dew_point',
    'visibility',
    'tide'
]

col_nans = {
    'wind_dir': 999,
    'wind_spd': 99.0,
    'gust': 99.0,
    'wave_height': 99.0,
    'dominant_period': 99.0,
    'avg_period': 99.0,
    'wave_direction': 999,
    'pressure': 9999.0,
    'air_temp': 999.0,
    'water_temp': 999.0,
    'dew_point': 99.0,
    'visibility': 99.0,
    'tide': 99.0
}

In [30]:
def nan_cols(col: pd.Series):
    if col.name not in col_nans:
        return col
    return col.replace(col_nans[col.name], np.nan)


In [58]:
hist_waves = list()
from urllib.error import HTTPError
import io
from urllib.request import urlopen


for year in range(min_year, max_year):
    try:
        file_uri = f'https://www.ndbc.noaa.gov/view_text_file.php?filename={station_id}h{year}.txt.gz&dir=data/historical/stdmet/'
        with urlopen(file_uri) as response:
            file_bytes = io.BytesIO(response.read())


        file_line_one = file_bytes.readline()
        print(file_line_one[:4])
        has_minutes = not file_line_one[:3] == 'YY '
        skip_two = file_line_one[:3] == '#YY'
        df_args = {
           'skiprows': 2 if skip_two else 1,
            'names': columns if has_minutes else columns_no_minutes,
            'infer_nrows': 500
        }

        
        # if has_minutes:
        #     df_args = {
        #        'skiprows': 2,
        #         'names': columns,
        #         'infer_nrows': 500
        #     }
        # elif 
        # else:
        #     df_args = {
        #         'skiprows': 1,
        #         'names': columns_no_minutes,
        #         'infer_nrows': 500
        #     }
            # print('new_file')
        print(file_bytes.readline())
        file_bytes.seek(0)
    
        df = pd.read_fwf(file_bytes, **df_args).apply(nan_cols)
        if not has_minutes:
            df.loc[:, ('minute',)] = np.nan
            df = df[columns]
        df['source'] = file_uri
        hist_waves.append(df)
        print(f'Found {year} at {file_uri}')
    except HTTPError as e:
        if e.code == 404:
            print(f'404: {file_uri}')
            continue
        raise e

b'YY M'
b'95 12 12 18 088 06.9 09.7 01.00 05.60 04.60 999 1024.9  18.3  22.2 999.0 99.0\r\n'
Found 1995 at https://www.ndbc.noaa.gov/view_text_file.php?filename=42039h1995.txt.gz&dir=data/historical/stdmet/
b'YY M'
b'96 01 01 00 999 99.0 99.0 99.00 99.00 99.00 999 9999.0 999.0 999.0 999.0 99.0\n'
Found 1996 at https://www.ndbc.noaa.gov/view_text_file.php?filename=42039h1996.txt.gz&dir=data/historical/stdmet/
b'YY M'
b'97 01 01 00 114  4.3  5.1   .42  5.00  3.95 999 1021.0  20.6  22.2 999.0 99.0\n'
Found 1997 at https://www.ndbc.noaa.gov/view_text_file.php?filename=42039h1997.txt.gz&dir=data/historical/stdmet/
b'YY M'
b'98 01 01 00 999 99.0 99.0 99.00 99.00 99.00 999 9999.0 999.0 999.0 999.0 99.0\r\n'
Found 1998 at https://www.ndbc.noaa.gov/view_text_file.php?filename=42039h1998.txt.gz&dir=data/historical/stdmet/
b'YYYY'
b'1999 01 01 00  44  2.9  4.4   .22  3.23  3.60 999 1021.6  18.3  22.4   9.7 99.0\r\n'
Found 1999 at https://www.ndbc.noaa.gov/view_text_file.php?filename=42039h1999.tx

In [59]:
hw_df = pd.concat(hist_waves)

In [60]:
hw_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 413664 entries, 0 to 45761
Data columns (total 19 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   year             413664 non-null  object 
 1   month            413664 non-null  object 
 2   day              413664 non-null  object 
 3   hour             413664 non-null  object 
 4   minute           413664 non-null  object 
 5   wind_dir         411457 non-null  object 
 6   wind_spd         410140 non-null  object 
 7   gust             409962 non-null  object 
 8   wave_height      410994 non-null  object 
 9   dominant_period  411211 non-null  object 
 10  avg_period       412522 non-null  object 
 11  wave_direction   412706 non-null  object 
 12  pressure         413602 non-null  object 
 13  air_temp         412279 non-null  object 
 14  water_temp       397553 non-null  object 
 15  dew_point        336614 non-null  object 
 16  visibility       17500 non-null   object 
 1

In [61]:
hw_df.sample(10)

Unnamed: 0,year,month,day,hour,minute,wind_dir,wind_spd,gust,wave_height,dominant_period,avg_period,wave_direction,pressure,air_temp,water_temp,dew_point,visibility,tide,source
13654,2022,4,5,21,20,194 6.2,7.5,99.0,99.0,99.0,999 1009.1,24.9,23.4,23.5,99.0,99.0,,,https://www.ndbc.noaa.gov/view_text_file.php?f...
426,2021,1,3,22,50,336 4.7,6.6,99.0,99.0,99.0,999 1019.4,14.5,21.7,9.8,99.0,99.0,,,https://www.ndbc.noaa.gov/view_text_file.php?f...
39047,2022,9,29,8,50,19 12.7,16.0,99.0,99.0,99.0,999 1013.2,23.0,999.0,14.9,99.0,99.0,,,https://www.ndbc.noaa.gov/view_text_file.php?f...
1980,2013,3,24,20,50,283 10.3,12.6,2.16,9.09,5.2,999 1004.8,22.8,22.6,18.9,99.0,99.0,,,https://www.ndbc.noaa.gov/view_text_file.php?f...
14115,2021,5,24,2,10,108 3.3,4.5,99.0,99.0,99.0,999 1022.1,23.6,25.0,19.3,99.0,99.0,,,https://www.ndbc.noaa.gov/view_text_file.php?f...
15987,2021,6,6,2,10,130 5.2,6.3,99.0,99.0,99.0,999 1016.9,26.7,26.2,25.5,99.0,99.0,,,https://www.ndbc.noaa.gov/view_text_file.php?f...
29198,2023,7,22,21,30,253 4.4,5.2,99.0,99.0,99.0,999 1014.3,30.3,999.0,26.2,99.0,99.0,,,https://www.ndbc.noaa.gov/view_text_file.php?f...
2393,2001,4,10,17,112,3.1,4.8,0.64,5.88,5.35,182.0,1018.3,22.7,23.7,19.2,,,,https://www.ndbc.noaa.gov/view_text_file.php?f...
1173,2002,2,18,21,88,7.6,9.3,1.48,6.67,4.89,73.0,1022.8,16.0,20.6,7.8,,,,https://www.ndbc.noaa.gov/view_text_file.php?f...
7012,2011,11,3,12,50,209 3.2,4.2,1.03,5.56,4.57,999 1019.9,23.5,24.6,999.0,99.0,99.0,,,https://www.ndbc.noaa.gov/view_text_file.php?f...


In [62]:
if write_location:
    wr.s3.to_parquet(hw_df, write_location)

In [63]:
hw_df.year.value_counts()

year
2022    52529
2020    52273
2021    46051
2023    45761
2019    26901
96       8952
2004     8774
2008     8768
2016     8767
2002     8760
97       8760
2003     8760
2009     8757
2010     8731
2006     8707
2013     8701
2007     8691
2001     8582
2014     8530
2000     8346
2011     8340
1999     8088
2015     8025
2017     7982
98       7536
2005     7295
2018     6724
2012     5095
95        461
#yr        17
Name: count, dtype: int64

In [64]:
hw_df[hw_df.year == 5].source.iloc[0]

IndexError: single positional indexer is out-of-bounds