# Processing data for wave height forecast
In this notebook, we process the data that we use to train and fine tune our models. 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from collections import defaultdict

plt.style.use('seaborn-v0_8-darkgrid')

## 1. Wave data

In [94]:
# Read raw data
# ==============================================================================
# Directory containing the CSV files
directory = '../../Data/queensland/raw'

# List all CSV files in the directory
csv_files = [f for f in os.listdir(directory) if f.endswith('.csv')]

# Dictionary to map possible column names to standard names
column_mapping = {
    'Date/Time': 'datetime',
    'Date/Time (AEST)': 'datetime',
    'Peak Direction': 'wave_direction',
    'Peak Direction (degrees)': 'wave_direction',
    'Tz': 'wave_period',
    'Tz (s)': 'wave_period',
    'Hmax': 'wave_height',
    'Hmax (m)': 'wave_height'
}

# Dictionary to hold lists of file names grouped by site
site_files = defaultdict(list)

# Group files by site
for file_name in csv_files:
    # Extract the site name (part before the first underscore)
    site_name = file_name.split('_')[0]
    # Add the file to the corresponding site group
    site_files[site_name].append(file_name)

# List to hold the final DataFrames, one per site
df_list = []

# Process each site group
for site, files in site_files.items():
    site_dfs = []
    
    for file in files:
        file_path = os.path.join(directory, file)
        # Read the CSV file
        df = pd.read_csv(file_path)
        
        # Rename columns to a consistent format using the mapping dictionary
        df = df.rename(columns=column_mapping)
        
        # Select only the standardized columns we care about
        df = df[['datetime', 'wave_direction', 'wave_period', 'wave_height']]

        # Set right format for datetime variable
        df['datetime'] = pd.to_datetime(df['datetime'])
        df['datetime'] = df['datetime'].dt.strftime('%Y-%m-%d %H:%M:%S')
        
        # Append the processed DataFrame to the list for this site
        site_dfs.append(df)
    
    # Concatenate all DataFrames for this site
    site_df = pd.concat(site_dfs, ignore_index=True)
    
    # Sort by the datetime column
    site_df = site_df.sort_values('datetime')
    site_df.set_index(keys = 'datetime', inplace=True)
    # site_df = df.asfreq('30min')

    # Keep site name in df
    site_df['site'] = site
    
    # Append the concatenated and sorted DataFrame to the list
    df_list.append(site_df)

# df_list now contains one DataFrame per site, each sorted by datetime

In [95]:
df_list[0]

Unnamed: 0_level_0,wave_direction,wave_period,wave_height,site
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-06-01 00:00:00,78.51,-99.900,-99.900,bilinga
2021-06-01 00:30:00,77.89,-99.900,-99.900,bilinga
2021-06-01 01:00:00,79.12,-99.900,-99.900,bilinga
2021-06-01 01:30:00,78.86,-99.900,-99.900,bilinga
2021-06-01 02:00:00,78.51,-99.900,-99.900,bilinga
...,...,...,...,...
2023-12-31 21:30:00,67.60,3.611,1.826,bilinga
2023-12-31 22:00:00,74.99,3.559,1.933,bilinga
2023-12-31 22:30:00,66.11,3.838,2.096,bilinga
2023-12-31 23:00:00,67.60,3.779,2.016,bilinga


In [97]:
# Create dataframes separating wave height, period and direction
df_list_height = []
df_list_period = []
df_list_direction = []

for _df in df_list:
    
    # Height
    _df_h = _df[['wave_height', 'site']].copy()
    df_list_height.append(_df_h)

    # Period
    _df_p = _df[['wave_period', 'site']].copy() 
    df_list_period.append(_df_p)    

    # Direction
    _df_d = _df[['wave_direction', 'site']].copy() 
    df_list_direction.append(_df_d) 

In [107]:
# Create one single dataframe for wave heights
# Initialize an empty DataFrame for the merge operation
df_height = pd.DataFrame()

# Iterate over each DataFrame in the list
for df in df_list_height:
    # Extract the site name from the first observation of the 'site' column
    site_name = df['site'].iloc[0]
    
    # Rename the 'wave_height' column to include the site name
    df_renamed = df[['wave_height']].rename(columns={'wave_height': f'wave_height_{site_name}'})
    
    # Merge with the existing merged_df on the 'datetime' index
    if df_height.empty:
        # If merged_df is empty, initialize it with the first DataFrame
        df_height = df_renamed
    else:
        df_height = df_height.merge(df_renamed, left_index=True, right_index=True, how='outer')

# Create one single dataframe for wave direction
# Initialize an empty DataFrame for the merge operation
df_direction = pd.DataFrame()

# Iterate over each DataFrame in the list
for df in df_list_direction:
    # Extract the site name from the first observation of the 'site' column
    site_name = df['site'].iloc[0]
    
    # Rename the 'wave_height' column to include the site name
    df_renamed = df[['wave_direction']].rename(columns={'wave_direction': f'wave_direction_{site_name}'})
    
    # Merge with the existing merged_df on the 'datetime' index
    if df_direction.empty:
        # If merged_df is empty, initialize it with the first DataFrame
        df_direction = df_renamed
    else:
        df_direction = df_direction.merge(df_renamed, left_index=True, right_index=True, how='outer')

# Create one single dataframe for wave period
# Initialize an empty DataFrame for the merge operation
df_period = pd.DataFrame()

# Iterate over each DataFrame in the list
for df in df_list_period:
    # Extract the site name from the first observation of the 'site' column
    site_name = df['site'].iloc[0]
    
    # Rename the 'wave_height' column to include the site name
    df_renamed = df[['wave_period']].rename(columns={'wave_period': f'wave_period_{site_name}'})
    
    # Merge with the existing merged_df on the 'datetime' index
    if df_period.empty:
        # If merged_df is empty, initialize it with the first DataFrame
        df_period = df_renamed
    else:
        df_period = df_period.merge(df_renamed, left_index=True, right_index=True, how='outer')


In [110]:
df_direction

Unnamed: 0_level_0,wave_direction_bilinga,wave_direction_tweed-heads-mk4,wave_direction_palm-beach-mk4,wave_direction_caloundra-pob,wave_direction_gladstone-gpa,wave_direction_brisbane-mk4,wave_direction_bundaberg,wave_direction_townsville,wave_direction_mooloolaba,wave_direction_cairns,wave_direction_mackay-mk4
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2021-01-01 00:00:00,,92.48,87.74,85.0,-99.90,129.05,-99.90,-99.90,63.0,-99.90,119.65
2021-01-01 00:30:00,,77.80,70.24,91.0,-99.90,134.15,-99.90,-99.90,73.0,-99.90,119.21
2021-01-01 01:00:00,,78.07,72.70,75.0,79.62,127.56,10.09,95.43,61.0,145.61,122.73
2021-01-01 01:30:00,,83.60,72.79,75.0,73.62,125.54,10.09,90.43,73.0,136.61,119.47
2021-01-01 02:00:00,,77.27,78.51,74.0,86.62,135.65,10.09,87.43,70.0,118.61,118.15
...,...,...,...,...,...,...,...,...,...,...,...
2024-07-31 21:30:00,,,,,,137.68,,,,,
2024-07-31 22:00:00,,,,,,130.03,,,,,
2024-07-31 22:30:00,,,,,,134.42,,,,,
2024-07-31 23:00:00,,,,,,131.00,,,,,


In [113]:
df_direction['wave_direction_bilinga'].isna().value_counts()

wave_direction_bilinga
False    45312
True     17472
Name: count, dtype: int64

### Dealing with null values. 


Null value is -99.900. We transform -99.9 into NaN, as it can be processed by XGBoost.

In [119]:
for df in [df_direction, df_height, df_period]:
    df = df.replace(-99.9, np.nan)

## Save

In [116]:
dir = '../../Data/queensland/processed/'
df_direction.to_csv(dir + 'qld_direction.csv')
df_height.to_csv(dir + 'qld_height.csv')
df_period.to_csv(dir + 'qld_period.csv')