In [1]:
# imports
from csv import DictReader, DictWriter
from tqdm import tqdm
import pandas as pd
import numpy as np

In [4]:
# Field names handed to the DictReader for ../data/raw_data.csv, in order.
original_column_names = [
    'station_id', 'longitude', 'latitude', 'time',
    'AtmospherePressure', 'WindDirection', 'WindSpeed', 'Gust',
    'WaveHeight', 'WavePeriod', 'MeanWaveDirection', 'Hmax',
    'AirTemperature', 'DewPoint', 'SeaTemperature', 'RelativeHumidity',
]

# Subset of the raw columns carried into the interim file, in output order.
new_column_names = [
    'AtmospherePressure', 'WindDirection', 'WindSpeed', 'Gust',
    'AirTemperature', 'SeaTemperature', 'RelativeHumidity',
    'WaveHeight', 'WavePeriod',
]

In [None]:
# Stream the raw CSV row by row, keep only the columns in new_column_names,
# and drop every row in which any of those columns holds the literal 'NaN'.
# Both files are opened with newline='' as the csv module requires: the
# reader/writer then handle line endings themselves, so quoted fields that
# contain newlines are parsed correctly and the writer's '\n' terminator is
# not translated into '\r\n' on Windows.
with open('../data/raw_data.csv', 'r', newline='') as f1, \
        open('../data/interim_data.csv', 'w', newline='') as f2:
    reader = DictReader(f1, fieldnames=original_column_names)
    writer = DictWriter(f2, fieldnames=new_column_names, lineterminator='\n')

    # fieldnames is given explicitly, so DictReader does not consume a header
    # itself; skip the first two rows of the file by hand.
    # NOTE(review): presumably a header row and a units row -- confirm against the raw file.
    next(reader)
    next(reader)

    writer.writeheader()

    for line in tqdm(reader):
        # Skip the row as soon as one of the kept columns is missing ('NaN').
        if any(line[column_name] == 'NaN' for column_name in new_column_names):
            continue
        writer.writerow({column_name: line[column_name] for column_name in new_column_names})

In [13]:
# Load the interim CSV (rows without 'NaN' in the kept columns) into a DataFrame.
df = pd.read_csv('../data/interim_data.csv')
# Trailing expression: display the (rows, columns) shape as the cell output.
df.shape

(385474, 9)

In [14]:
def _iqr_inlier_mask(values):
    """Return a boolean mask of the entries inside the 1.5 * IQR fences of ``values``.

    The fences are Q1 - 1.5*IQR and Q3 + 1.5*IQR, both inclusive, where Q1 and
    Q3 are the 0.25 and 0.75 quantiles of ``values``.
    """
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile
    lower_fence = first_quartile - 1.5*spread
    upper_fence = third_quartile + 1.5*spread
    return (values >= lower_fence) & (values <= upper_fence)


# Remove outliers one column at a time using the interquartile range.
# The frame is narrowed after every column, so the quartiles of each later
# column are computed on the rows that survived the earlier filters.
for column_name in new_column_names:
    df = df[_iqr_inlier_mask(df[column_name])]

In [15]:
# Normalization and feature engineering, then export of the processed data.
#
# Statistics for z-scoring the two wave columns. np.std uses ddof=0
# (population standard deviation), unlike Series.std() which defaults to ddof=1.
wave_height_mean = np.mean(df['WaveHeight'])
wave_height_std  = np.std(df['WaveHeight'])

wave_period_mean = np.mean(df['WavePeriod'])
wave_period_std  = np.std(df['WavePeriod'])

# WindDirection is treated as an angle in degrees and converted to radians so
# it can be encoded as a (sin, cos) pair.
azimuth_rad = np.deg2rad(df['WindDirection'])

# Build every derived column on a fresh frame with assign() instead of
# assigning into `df` in place: `df` is the result of boolean filtering, and
# in-place column assignment on such a frame can raise SettingWithCopyWarning
# on pandas versions without copy-on-write. The trailing column selection
# fixes the output order and discards WindDirection, AirTemperature and
# SeaTemperature, which are replaced by the derived features.
df = df.assign(
    WaveHeight=(df['WaveHeight'] - wave_height_mean) / wave_height_std,
    WavePeriod=(df['WavePeriod'] - wave_period_mean) / wave_period_std,
    WindDirection_sin=np.sin(azimuth_rad),
    WindDirection_cos=np.cos(azimuth_rad),
    TempDiff=df['AirTemperature'] - df['SeaTemperature'],
)[['AtmospherePressure', 'WindSpeed', 'Gust', 'RelativeHumidity', 'WindDirection_sin', 'WindDirection_cos', 'TempDiff', 'WaveHeight', 'WavePeriod']]

df.to_csv('../data/processed_data.csv', index=False)