In [34]:
import xarray as xr
import pandas as pd
import numpy as np
import requests
from datetime import datetime, timedelta

import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import time

import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
import numpy as np

In [35]:
class OceanDataGenerator:
    def __init__(self):
        self.buoy_id = "201" # SCRIPPS NEARSHORE, CA
        self.buoy_features = [
            'waveHs', 'waveTp', 'waveTa', 'waveDp', 
            'wavePeakPSD', 'sstSeaSurfaceTemperature'
        ]
        self.noaa_wind_station = "9410230" # Scripps Pier
        self.owm_api_key = "21bf8f260bb281800b97a8b5bc71ef51"
        self.lat, self.lon = 32.8328, -117.2713
        self.headers = {'User-Agent': 'Mozilla/5.0'}
        
        # Classification Config
        self.viz_bins = [0, 10, 15, 25, 200]
        self.viz_labels = [0, 1, 2, 3] # 0: Poor, 1: Fair, 2: Good, 3: Excellent

    def _apply_circular_transform(self, df, column):
        rads = np.deg2rad(df[column])
        df[f'{column}_sine'] = np.sin(rads)
        df[f'{column}_cos'] = np.cos(rads)
        return df

    def fetch_buoy_data(self, days=650):
        """Fetches CDIP Archive & Realtime using pandas timing for stability."""
        base_url = "http://thredds.cdip.ucsd.edu/thredds/dodsC/cdip"
        urls = [
            f"{base_url}/archive/{self.buoy_id}p1/{self.buoy_id}p1_historic.nc",
            f"{base_url}/realtime/{self.buoy_id}p1_rt.nc"
        ]
        
        # Using pd.Timestamp.now() avoids the 'datetime' attribute errors
        start_date = (pd.Timestamp.now() - pd.Timedelta(days=days)).strftime('%Y-%m-%d')
        all_wave_aggs, all_sst_aggs = [], []

        for url in urls:
            try:
                ds = xr.open_dataset(url, engine='pydap')
                ds_subset = ds[self.buoy_features].sel(waveTime=slice(start_date, None))
                
                df_wave = ds_subset.drop_dims('sstTime').to_dataframe()
                df_wave = self._apply_circular_transform(df_wave, 'waveDp')
                
                # Add Steepness and Energy pre-aggregation
                df_wave['wave_steepness'] = df_wave['waveHs'] / df_wave['waveTp']
                df_wave['swell_energy'] = (df_wave['waveHs']**2) * df_wave['waveTp']
                
                wave_agg = df_wave.resample('D').agg({
                    'waveHs': ['max', 'mean'], 
                    'waveTp': 'mean',
                    'wave_steepness': 'mean', # New feature
                    'swell_energy': 'mean',   # New feature
                    'waveDp_sine': 'mean', 
                    'waveDp_cos': 'mean', 
                    'wavePeakPSD': 'max'
                })
                wave_agg.columns = ['_'.join(col).strip() if isinstance(col, tuple) else col for col in wave_agg.columns.values]
                all_wave_aggs.append(wave_agg)

                try:
                    df_sst = ds_subset.drop_dims('waveTime').to_dataframe()
                    all_sst_aggs.append(df_sst.resample('D').agg({'sstSeaSurfaceTemperature': 'mean'}))
                except: pass
            except Exception as e:
                print(f"Skipping {url}: {e}")

        final_wave = pd.concat(all_wave_aggs).sort_index()
        final_wave = final_wave[~final_wave.index.duplicated(keep='last')]
        final_sst = pd.concat(all_sst_aggs).sort_index()
        final_sst = final_sst[~final_sst.index.duplicated(keep='last')]

        return final_wave.join(final_sst, how='inner')

    def fetch_wind_data(self, days=650):
        """Wind fetcher with fixed pd.Timedelta logic."""
        end_date = pd.Timestamp.now().normalize()
        start_date = end_date - pd.Timedelta(days=days)
        all_chunks, curr = [], start_date
        
        while curr < end_date:
            curr_end = min(curr + pd.Timedelta(days=30), end_date)
            url = (f"https://api.tidesandcurrents.noaa.gov/api/prod/datagetter?"
                   f"begin_date={curr.strftime('%Y%m%d')}&end_date={curr_end.strftime('%Y%m%d')}&"
                   f"station={self.noaa_wind_station}&product=wind&units=metric&time_zone=lst_ldt&format=json")
            try:
                res = requests.get(url, headers=self.headers, timeout=15).json()
                if 'data' in res: all_chunks.append(pd.DataFrame(res['data']))
            except: pass
            curr = curr_end + pd.Timedelta(minutes=6)

        if not all_chunks: return pd.DataFrame()
        df = pd.concat(all_chunks).drop_duplicates('t')
        df['t'] = pd.to_datetime(df['t'])
        df.set_index('t', inplace=True)
        for col in ['s', 'g', 'd']: df[col] = pd.to_numeric(df[col], errors='coerce')
        
        df['wind_x'], df['wind_y'] = np.cos(np.radians(df['d'])), np.sin(np.radians(df['d']))
        daily = df.resample('D').agg({'s': 'mean', 'g': 'max', 'wind_x': 'mean', 'wind_y': 'mean'}).rename(columns={'s': 'wind_speed', 'g': 'wind_gust'})
        daily['wind_dir_mean'] = np.degrees(np.arctan2(daily['wind_y'], daily['wind_x'])) % 360
        return daily.drop(columns=['wind_x', 'wind_y'])
    
    def fetch_tide_data(self, days=650):
        """Fetches daily tidal max and delta (flux) from NOAA."""
        end_date = pd.Timestamp.now().normalize()
        start_date = end_date - pd.Timedelta(days=days)
        all_chunks, curr = [], start_date
        
        while curr < end_date:
            curr_end = min(curr + pd.Timedelta(days=30), end_date)
            url = (f"https://api.tidesandcurrents.noaa.gov/api/prod/datagetter?"
                   f"begin_date={curr.strftime('%Y%m%d')}&end_date={curr_end.strftime('%Y%m%d')}&"
                   f"station={self.noaa_wind_station}&product=water_level&datum=mllw&units=metric&time_zone=lst_ldt&format=json")
            try:
                res = requests.get(url, headers=self.headers, timeout=15).json()
                if 'data' in res: all_chunks.append(pd.DataFrame(res['data']))
            except: pass
            curr = curr_end + pd.Timedelta(minutes=6)

        if not all_chunks: return pd.DataFrame()
        df = pd.concat(all_chunks)
        df['t'] = pd.to_datetime(df['t'])
        df.set_index('t', inplace=True)
        df['v'] = pd.to_numeric(df['v'], errors='coerce')
        
        # Calculate daily max height and the max hourly 'flux' (tide speed)
        daily = df.resample('D').agg({'v': ['max', 'mean']})
        daily.columns = ['tide_max', 'tide_mean']
        return daily

    def fetch_rain_data(self, days=365):
        """Rain fetcher using pandas timing objects."""
        rain_data = []
        end_date = pd.Timestamp.now().normalize()
        curr = end_date - pd.Timedelta(days=days)
        
        while curr <= end_date:
            url = f"https://api.openweathermap.org/data/3.0/onecall/day_summary?lat={self.lat}&lon={self.lon}&date={curr.strftime('%Y-%m-%d')}&appid={self.owm_api_key}&units=metric"
            try:
                r = requests.get(url).json()
                rain_data.append({'time': curr, 'rain_mm': r.get('precipitation', {}).get('total', 0)})
            except: pass
            curr += pd.Timedelta(days=1)
        
        df = pd.DataFrame(rain_data).set_index('time')
        df.index = pd.to_datetime(df.index)
        df['rain_72h_weighted_mm'] = (df['rain_mm'] + (df['rain_mm'].shift(1) * 0.6) + (df['rain_mm'].shift(2) * 0.3)).fillna(0)
        return df

    def scrape_visibility_labels(self, total_pages=27):
        base_url = "https://justgetwet.com/blogs/dive-reports-and-conditions?page="
        all_reports = []
        for page_num in range(1, total_pages + 1):
            try:
                res = requests.get(f"{base_url}{page_num}", headers=self.headers, timeout=10)
                soup = BeautifulSoup(res.text, 'html.parser')
                for art in soup.find_all('div', class_='article__grid-meta'):
                    date_tag, excerpt = art.find('time'), art.find('div', class_='article__excerpt')
                    if not date_tag or not excerpt: continue
                    viz_raw = re.search(r'(?:Viz|Vis|Visibility):\s*([\d\-\+m\s\']+)', excerpt.get_text(), re.IGNORECASE)
                    if viz_raw:
                        nums = re.findall(r'(\d+)', viz_raw.group(1).lower())
                        if not nums: continue
                        viz_ft = float(nums[0])
                        if 'm' in viz_raw.group(1).lower(): viz_ft *= 3.28
                        all_reports.append({'date': pd.to_datetime(date_tag.get_text().strip()), 'visibility_ft': viz_ft})
                time.sleep(0.5)
            except: pass
        df = pd.DataFrame(all_reports).drop_duplicates('date')
        df['date'] = pd.to_datetime(df['date']).dt.normalize()
        return df.set_index('date').sort_index()

    def run(self, days=650, scrape_pages=27):
        """Unified run with normalization, lags 1-3, and trend."""
        df_buoy = self.fetch_buoy_data(days=days)
        df_wind = self.fetch_wind_data(days=days)
        df_rain = self.fetch_rain_data(days=days)
        df_tide = self.fetch_tide_data(days=days) # New fetcher
        df_labels = self.scrape_visibility_labels(total_pages=scrape_pages)

        for d in [df_buoy, df_wind, df_rain,df_tide, df_labels]:
            if not d.empty:
                d.index = pd.to_datetime(d.index).normalize().tz_localize(None)

        #Saving intermediate states to class variables
        self.df_buoy = df_buoy.copy() 
        self.df_wind = df_wind.copy()
        self.df_rain = df_rain.copy()
        self.df_tide = df_tide.copy()
        self.df_labels = df_labels.copy()
        audit_dict = {'df_buoy':self.df_buoy,
                      'df_wind':self.df_wind,
                      'df_rain':self.df_rain,
                      'df_labels':self.df_labels,
                      'df_tide':self.df_tide
                      }
        for k,v in audit_dict.items():
            print(f"{k} min: {v.index.min()} {k} max: {v.index.max()}")
        # 2. Add SST and Tide Change Features
        if 'sstSeaSurfaceTemperature' in df_buoy.columns:
            df_buoy['sst_diff_24h'] = df_buoy['sstSeaSurfaceTemperature'].diff(1)
            df_buoy['sst_diff_48h'] = df_buoy['sstSeaSurfaceTemperature'].diff(2)

        # 3. Join the new tide data
        final_df = df_buoy.join(df_wind, how='inner').join(df_rain, how='inner')\
                          .join(df_tide, how='inner').join(df_labels, how='inner')


        # Seasonality
        day_of_year = pd.to_datetime(final_df.index).dayofyear
        final_df['season_sine'] = np.sin(2 * np.pi * day_of_year / 365.25)
        final_df['season_cos'] = np.cos(2 * np.pi * day_of_year / 365.25)

        # Lags 1-3
        cols_to_lag = ['waveHs_mean', 'waveHs_max', 'wind_speed', 'wavePeakPSD_max', 'rain_72h_weighted_mm']
        for col in cols_to_lag:
            if col in final_df.columns:
                final_df[f'{col}_lag1'] = final_df[col].shift(1)
                final_df[f'{col}_lag2'] = final_df[col].shift(2)
                final_df[f'{col}_lag3'] = final_df[col].shift(3)

        # Swell Trend
        if 'waveHs_mean' in final_df.columns:
            final_df['swell_trend_3d'] = final_df['waveHs_mean'].diff(periods=3)
        print(f"final_df Pre NA Drop {final_df.shape} Post NA Drop: {final_df.dropna().shape}")
        self.data = final_df.dropna()
        return self.data

    def save_data(self, df, path="training_data.parquet", as_classification=False):
        export_df = df.copy()
        if as_classification:
            export_df['target'] = pd.cut(export_df['visibility_ft'], bins=self.viz_bins, labels=self.viz_labels,include_lowest=True).astype(int)
            export_df.drop(columns=['visibility_ft'], inplace=True)
        else:
            export_df.rename(columns={'visibility_ft': 'target'}, inplace=True)
        export_df.to_parquet(path, engine="fastparquet")
        print(f"Success. Parquet saved to {path} with {len(export_df)} records.")

In [36]:
gen = OceanDataGenerator()
df = gen.run(697,27)

Consider replacing `http` in your `url` with either `dap2` or `dap4` to specify the DAP protocol (e.g. `dap2://<data_url>` or `dap4://<data_url>`).  For more 
information, go to https://www.opendap.org/faq-page.
Consider replacing `http` in your `url` with either `dap2` or `dap4` to specify the DAP protocol (e.g. `dap2://<data_url>` or `dap4://<data_url>`).  For more 
information, go to https://www.opendap.org/faq-page.


df_buoy min: 2024-03-16 00:00:00 df_buoy max: 2026-02-12 00:00:00
df_wind min: 2024-03-16 00:00:00 df_wind max: 2026-02-11 00:00:00
df_rain min: 2024-03-16 00:00:00 df_rain max: 2026-02-11 00:00:00
df_labels min: 2024-03-16 00:00:00 df_labels max: 2026-02-10 00:00:00
df_tide min: 2024-03-16 00:00:00 df_tide max: 2026-02-11 00:00:00
final_df Pre NA Drop (522, 37) Post NA Drop: (519, 37)


In [39]:
df

Unnamed: 0,waveHs_max,waveHs_mean,waveTp_mean,wave_steepness_mean,swell_energy_mean,waveDp_sine_mean,waveDp_cos_mean,wavePeakPSD_max,sstSeaSurfaceTemperature,sst_diff_24h,...,wind_speed_lag1,wind_speed_lag2,wind_speed_lag3,wavePeakPSD_max_lag1,wavePeakPSD_max_lag2,wavePeakPSD_max_lag3,rain_72h_weighted_mm_lag1,rain_72h_weighted_mm_lag2,rain_72h_weighted_mm_lag3,swell_trend_3d
2024-03-22,0.57,0.475833,15.583219,0.030705,3.560712,-0.955971,0.281128,0.581115,17.285418,0.309376,...,1.711250,1.417500,3.240417,0.487821,1.118729,0.757443,0.0,0.0,0.0,-0.143125
2024-03-23,1.40,0.570000,14.369962,0.051740,4.340137,-0.934940,0.338181,1.611545,17.132292,-0.153126,...,1.699583,1.711250,1.417500,0.581115,0.487821,1.118729,0.0,0.0,0.0,-0.047292
2024-03-25,2.74,1.969375,8.158076,0.245759,32.531830,-0.969695,0.238400,8.518099,15.966667,-0.191666,...,5.215417,1.699583,1.711250,1.611545,0.581115,0.487821,0.0,0.0,0.0,1.509167
2024-03-28,0.81,0.681667,13.435426,0.057479,6.186221,-0.906081,0.403287,0.746166,15.750000,0.206250,...,4.163333,5.215417,1.699583,8.518099,1.611545,0.581115,0.0,0.0,0.0,0.205833
2024-03-29,0.98,0.780625,11.366722,0.085616,7.105659,-0.930865,0.342202,1.274041,15.382291,-0.367709,...,3.214167,4.163333,5.215417,0.746166,8.518099,1.611545,0.0,0.0,0.0,0.210625
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2026-02-05,0.54,0.464583,12.316020,0.038797,2.647382,-0.971789,0.221565,0.480558,17.266666,-0.146875,...,1.424583,1.037083,1.202083,1.326035,1.141329,1.807949,0.0,0.0,0.0,-0.258125
2026-02-06,0.78,0.568333,12.871403,0.044477,4.194223,-0.961240,0.265666,1.436477,17.360416,0.093750,...,2.257083,1.424583,1.037083,0.480558,1.326035,1.141329,0.0,0.0,0.0,0.024167
2026-02-07,0.98,0.853750,13.965505,0.061576,10.318934,-0.944326,0.319713,2.333089,17.576042,0.215626,...,2.422500,2.257083,1.424583,1.436477,0.480558,1.326035,0.0,0.0,0.0,0.189167
2026-02-08,1.03,0.863333,12.833111,0.067706,9.709422,-0.933765,0.347968,1.754516,17.660418,0.084375,...,2.288703,2.422500,2.257083,2.333089,1.436477,0.480558,0.0,0.0,0.0,0.398750


In [37]:
gen.save_data(df,path=r"C:/Users\Andrew Shade/Documents/Visibility Code/visibility_data_class_xl.parquet",as_classification=True)

Success. Parquet saved to C:/Users\Andrew Shade/Documents/Visibility Code/visibility_data_class_xl.parquet with 519 records.


In [38]:
gen.save_data(df,path=r"C:/Users\Andrew Shade/Documents/Visibility Code/visibility_data_reg_xl.parquet",as_classification=False)

Success. Parquet saved to C:/Users\Andrew Shade/Documents/Visibility Code/visibility_data_reg_xl.parquet with 519 records.
