# 1_CO_OPS_Data_Scraper
The code is organized in 2 steps:
1. Filter out stations and data inventories that are out of scope, and collect station metadata;
2. Collect weather station data from the CO-OPS API.

## Station Filtering and Station Metadata

In [None]:
import pandas as pd
from noaa_coops import Station, get_stations_from_bbox
from tqdm import tqdm

# Build the station table: one row per CO-OPS station inside the bounding box,
# holding its coordinates and one flag per data product telling whether that
# product's inventory overlaps the study period.
stations = get_stations_from_bbox(lat_coords=[0, 80], lon_coords=[-175, -25])

start_date = "2015-01-01"
end_date = "2023-12-31"

# Key in Station.data_inventory -> column name in the output table.
inventory_columns = {
    'Air Temperature': 'Air_Temperature',
    'Verified Hourly Height Water Level': 'Water_Level',
    'Wind': 'Wind',
    'Visibility': 'Visibility',
    'Barometric Pressure': 'Barometric_Pressure',
}

station_rows = []

for station_id in tqdm(stations, desc="Processing Stations"):

    try:
        station = Station(id=station_id)

        lon = station.lat_lon['lon']
        lat = station.lat_lon['lat']

        if not hasattr(station, 'data_inventory'):
            print(f"Skipping station {station_id} due to missing data_inventory")
            continue

        data_inventory = station.data_inventory

        row = {'id': station_id, 'lon': lon, 'lat': lat}
        for inventory_name, column in inventory_columns.items():
            available = False
            if inventory_name in data_inventory:
                entry = data_inventory[inventory_name]
                # Interval-overlap test: the record must begin before the study
                # period ends and finish after it begins. The earlier
                # `start >= start_date or end <= end_date` form rejected
                # stations whose record covers the whole period.
                # NOTE(review): dates are compared as strings, as before; this
                # assumes ISO-like 'YYYY-MM-DD ...' inventory dates - confirm.
                available = (
                    entry['start_date'] <= end_date
                    and entry['end_date'] >= start_date
                )
            row[column] = available

        station_rows.append(row)

    except Exception as e:
        # Best effort: report the failing station and keep going.
        print(f"Error processing station {station_id}: {e}")
        continue

# Build the frame once; concatenating inside the loop re-copies it per station.
df = pd.DataFrame(
    station_rows,
    columns=['id', 'lon', 'lat', 'Air_Temperature', 'Water_Level', 'Wind', 'Visibility', 'Barometric_Pressure'],
)

print(df)

In [None]:
# Write the station table to CSV, leaving out the index column.
df.to_csv(path_or_buf='..//2_Data//4_Weather_Station//station_info.csv', index=False)

## Data Scraper

In [9]:
import noaa_coops as nc
from noaa_coops import Station
from tqdm import tqdm

In [10]:
# Functions
def get_daily_high_low(df_hour, data_type, station_id):
    """Reduce a time-indexed series to one row per day with its max and min.

    Args:
        df_hour: series indexed by timestamps; it is resampled to daily bins.
        data_type: label stored (as a string) in the 'Data_type' column.
        station_id: value stored in every row of the 'station_id' column.

    Returns:
        DataFrame with columns 'station_id', 'Date', 'Daily_high',
        'Daily_low' and 'Data_type'.
    """
    by_day = df_hour.resample('D')
    highs = by_day.max()
    lows = by_day.min()

    columns = {
        'station_id': station_id,
        'Date': highs.index.date,
        'Daily_high': highs.values,
        'Daily_low': lows.values,
        # Stored as text whatever type the caller passed in.
        'Data_type': str(data_type),
    }
    return pd.DataFrame(columns)

def get_daily_high_wind(df_s, df_g, data_type, station_id):
    """Reduce two time-indexed series to daily maxima, one row per day.

    Args:
        df_s: series indexed by timestamps, used for 'Daily_high' (the
            original comment describes it as wind speed).
        df_g: series indexed by timestamps, used for 'Daily_gust_high' (the
            original comment describes it as gust).
        data_type: label stored (as a string) in the 'Data_type' column.
        station_id: value stored in every row of the 'station_id' column.

    Returns:
        DataFrame with columns 'station_id', 'Date', 'Daily_high',
        'Daily_gust_high' and 'Data_type'. Dates come from df_s.
    """
    speed_max = df_s.resample('D').max()
    gust_max = df_g.resample('D').max()

    columns = {
        'station_id': station_id,
        'Date': speed_max.index.date,
        'Daily_high': speed_max.values,
        'Daily_gust_high': gust_max.values,
        # Stored as text whatever type the caller passed in.
        'Data_type': str(data_type),
    }
    return pd.DataFrame(columns)
    
def process_station_data_air_temperature(df, start_year=2015, end_year=2023):
    """Collect daily high/low air temperature for every flagged station.

    NOTE: a later definition with the same name in this file replaces this
    one once both have been executed.

    Args:
        df: station table; the 'id' and 'Air_Temperature' columns are read.
        start_year: first calendar year to request (inclusive).
        end_year: last calendar year to request (inclusive).

    Returns:
        The get_daily_high_low() results of every successful request,
        concatenated with a fresh index; an empty DataFrame when no request
        succeeded.
    """
    daily_frames = []

    for _, row in tqdm(df.iterrows(), total=df.shape[0], desc="Processing Stations"):
        # Skip unflagged stations before building a Station object for them.
        if not row['Air_Temperature']:
            continue

        station_id = row['id']
        try:
            station_info = Station(id=station_id)
        except Exception as e:
            # One bad station must not abort the whole run.
            print(f"Error processing station {station_id}: {e}")
            continue

        # end_year + 1 so the final year is included (range() is half-open).
        for year in range(start_year, end_year + 1):
            try:
                hourly = station_info.get_data(
                    begin_date=f"{year}0101",
                    end_date=f"{year}1231",
                    product="air_temperature",
                    interval='h',
                    datum="MLLW",
                    units="metric",
                    time_zone="gmt"
                )

                # Assumes the 'v' column holds the air temperature values.
                daily_frames.append(
                    get_daily_high_low(hourly['v'], "air_temperature", station_id)
                )

            except Exception as e:
                # Report the failed year and move on to the next one.
                print(f"Error for station {station_id} in year {year}: {e}")

    if not daily_frames:
        return pd.DataFrame()
    # Concatenate once; growing a frame inside the loop re-copies it each time.
    return pd.concat(daily_frames, ignore_index=True)

def process_station_data_water_level(df, start_year=2015, end_year=2023):
    """Collect daily high/low water level for every flagged station.

    Args:
        df: station table; the 'id' and 'Water_Level' columns are read.
        start_year: first calendar year to request (inclusive).
        end_year: last calendar year to request (inclusive).

    Returns:
        The get_daily_high_low() results of every successful request,
        concatenated with a fresh index; an empty DataFrame when no request
        succeeded.
    """
    daily_frames = []

    for _, row in tqdm(df.iterrows(), total=df.shape[0], desc="Processing Stations"):
        # Skip unflagged stations before building a Station object for them.
        if not row['Water_Level']:
            continue

        station_id = row['id']
        try:
            station_info = Station(id=station_id)
        except Exception as e:
            # One bad station must not abort the whole run.
            print(f"Error processing station {station_id}: {e}")
            continue

        # end_year + 1 so the final year is included (range() is half-open).
        for year in range(start_year, end_year + 1):
            try:
                water_level = station_info.get_data(
                    begin_date=f"{year}0101",
                    end_date=f"{year}1231",
                    product="hourly_height",
                    datum="MLLW",
                    units="metric",
                    time_zone="gmt"
                )

                # Assumes the 'v' column holds the water level values.
                daily_frames.append(
                    get_daily_high_low(water_level['v'], "water_level", station_id)
                )

            except Exception as e:
                # Report the failed year and move on to the next one.
                print(f"Error for station {station_id} in year {year}: {e}")

    if not daily_frames:
        return pd.DataFrame()
    # Concatenate once; growing a frame inside the loop re-copies it each time.
    return pd.concat(daily_frames, ignore_index=True)


def process_station_data_air_temperature(df, start_year=2015, end_year=2023):
    """Collect daily high/low air temperature for every flagged station.

    Args:
        df: station table; the 'id' and 'Air_Temperature' columns are read.
        start_year: first calendar year to request (inclusive).
        end_year: last calendar year to request (inclusive).

    Returns:
        The get_daily_high_low() results of every request that returned a
        'v' column, concatenated with a fresh index; an empty DataFrame when
        there were none.
    """
    # Imported locally so the handler below can resolve the name.
    from json import JSONDecodeError

    daily_frames = []

    for _, row in tqdm(df.iterrows(), total=df.shape[0], desc="Processing Stations"):
        # Skip unflagged stations before building a Station object for them.
        if not row['Air_Temperature']:
            continue

        station_id = row['id']
        try:
            station_info = Station(id=station_id)
        except Exception as e:
            # One bad station must not abort the whole run.
            print(f"Error processing station {station_id}: {e}")
            continue

        # end_year + 1 so the final year is included (range() is half-open).
        for year in range(start_year, end_year + 1):
            try:
                hourly = station_info.get_data(
                    begin_date=f"{year}0101",
                    end_date=f"{year}1231",
                    product="air_temperature",
                    interval='h',
                    datum="MLLW",
                    units="metric",
                    time_zone="gmt"
                )

                # Only use responses that actually carry a 'v' column.
                if hourly is not None and 'v' in hourly:
                    # Assumes the 'v' column holds the air temperature values.
                    daily_frames.append(
                        get_daily_high_low(hourly['v'], "air_temperature", station_id)
                    )
                else:
                    print(f"No valid data for station {station_id} in year {year}")

            except JSONDecodeError as e:
                print(f"JSONDecodeError for station {station_id} in year {year}: {e}")

            except Exception as e:
                # Also covers request failures. The earlier dedicated clause
                # named `requests`, which is not imported in the visible part
                # of this file, so reaching it raised NameError.
                print(f"Error for station {station_id} in year {year}: {e}")

    if not daily_frames:
        return pd.DataFrame()
    # Concatenate once; growing a frame inside the loop re-copies it each time.
    return pd.concat(daily_frames, ignore_index=True)

def process_station_barometric_pressure(df, start_year=2015, end_year=2023):
    """Collect daily high/low barometric pressure for every flagged station.

    Args:
        df: station table; the 'id' and 'Barometric_Pressure' columns are read.
        start_year: first calendar year to request (inclusive).
        end_year: last calendar year to request (inclusive).

    Returns:
        The get_daily_high_low() results of every successful request,
        concatenated with a fresh index; an empty DataFrame when no request
        succeeded.
    """
    daily_frames = []

    for _, row in tqdm(df.iterrows(), total=df.shape[0], desc="Processing Stations"):
        # Skip unflagged stations before doing any station lookups for them.
        if not row['Barometric_Pressure']:
            continue

        station_id = row['id']
        try:
            station_info = Station(id=station_id)
            # Explicit metadata fetch kept from the original code.
            station_info.get_metadata()
        except KeyError as ke:
            print(f"Metadata retrieval failed for station {station_id} due to missing key: {ke}")
            continue
        except Exception as e:
            # One bad station must not abort the whole run.
            print(f"Error processing station {station_id}: {e}")
            continue

        # end_year + 1 so the final year is included (range() is half-open).
        for year in range(start_year, end_year + 1):
            try:
                pressure = station_info.get_data(
                    begin_date=f"{year}0101",
                    end_date=f"{year}1231",
                    product="air_pressure",
                    datum="MLLW",
                    units="metric",
                    time_zone="gmt"
                )

                # Assumes the 'v' column holds the air pressure values.
                daily_frames.append(
                    get_daily_high_low(pressure['v'], "air_pressure", station_id)
                )

            except Exception as e:
                # Report the failed year and move on to the next one.
                print(f"Error for station {station_id} in year {year}: {e}")

    if not daily_frames:
        return pd.DataFrame()
    # Concatenate once; growing a frame inside the loop re-copies it each time.
    return pd.concat(daily_frames, ignore_index=True)

In [None]:
# Run each scraper on the station table `df` and write its result to a CSV
# (index column omitted). The steps run in order, so a failure in one step
# stops the steps after it.
df_air_temperature = process_station_data_air_temperature(df)
df_air_temperature.to_csv('..//2_Data//4_Weather_Station//air_temperature.csv',index = False)
df_water_level = process_station_data_water_level(df)
df_water_level.to_csv('..//2_Data//4_Weather_Station//water_level.csv',index = False)
# NOTE(review): process_station_data_wind is not defined in the visible part
# of this file (only the get_daily_high_wind helper is) - confirm it exists
# before running this cell, otherwise this line raises NameError.
df_wind = process_station_data_wind(df)
df_wind.to_csv('..//2_Data//4_Weather_Station//wind.csv',index = False)
df_barometric_pressure = process_station_barometric_pressure(df)
df_barometric_pressure.to_csv('..//2_Data//4_Weather_Station//barometric_pressure.csv',index = False)