In [28]:
# imports
from csv import DictReader, DictWriter
from tqdm import tqdm

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [30]:
# Column layout of the raw export, in file order. Passed to DictReader as
# explicit fieldnames, so the file's own header line is read as a data row.
og_column_names = [
    'station_id', 'longitude', 'latitude', 'time',
    'AtmospherePressure', 'WindDirection', 'WindSpeed', 'Gust',
    'WaveHeight', 'WavePeriod', 'MeanWaveDirection', 'Hmax',
    'AirTemperature', 'DewPoint', 'SeaTemperature',
]

# Columns kept in the cleaned file, in output order.
column_names = [
    'AtmospherePressure', 'WindDirection', 'WindSpeed', 'Gust',
    'AirTemperature', 'SeaTemperature', 'WaveHeight', 'WavePeriod',
]

# newline='' is required by the csv module for any file object handed to a
# reader or writer: it lets the reader handle newlines embedded in quoted
# fields and stops text mode from rewriting the writer's '\n' terminator
# (e.g. into '\r\n' on Windows).
with open('../data/raw_data.csv', 'r', newline='') as f1, \
        open('../data/clean_data.csv', 'w', newline='') as f2:
    reader = DictReader(f1, fieldnames=og_column_names)
    # extrasaction='ignore' lets the full raw row be passed to writerow();
    # only the keys listed in column_names are written.
    writer = DictWriter(f2, fieldnames=column_names, lineterminator='\n',
                        extrasaction='ignore')

    # Skip the first two lines of the raw file: the header line and one more
    # non-data line (presumably a units row -- TODO confirm against the file).
    next(reader)
    next(reader)

    writer.writeheader()

    for line in tqdm(reader):
        # Drop incomplete observations: a literal 'NaN', an empty field, or a
        # field that is absent because the row was short (DictReader fills
        # those with None).
        if any(line[column_name] in ('NaN', '', None) for column_name in column_names):
            continue
        writer.writerow(line)


613392it [00:02, 275236.47it/s]


In [31]:
# Read the cleaned CSV into a DataFrame and show per-column summary
# statistics as the cell output.
clean_csv_path = '../data/clean_data.csv'
df = pd.read_csv(clean_csv_path)
df.describe()

Unnamed: 0,AtmospherePressure,WindDirection,WindSpeed,Gust,AirTemperature,SeaTemperature,WaveHeight,WavePeriod
count,417645.0,417645.0,417645.0,417645.0,417645.0,417645.0,417645.0,417645.0
mean,1013.734783,206.22709,15.065121,21.330294,11.625455,12.446026,2.316566,6.131247
std,12.241317,89.490168,6.882371,9.552581,2.882423,2.302647,1.537627,1.734126
min,910.3,0.0,0.0,0.0,0.264,0.2,0.1,2.0
25%,1006.4,150.0,10.0,14.231,9.5,10.7,1.2,5.0
50%,1015.0,220.0,15.0,20.0,11.6,12.3,2.0,6.0
75%,1022.2,271.0,19.639,27.0,14.0,14.355,3.0,7.0
max,1046.4,360.0,76.506,200.0,22.104,24.2,15.742,18.0


In [32]:
# Outlier removal with the 1.5 * IQR rule (box-plot whiskers).
# Columns are processed one after another on the already-filtered frame, so
# each column's quartiles are computed from the rows that survived the
# previous columns' filters.
for column_name in column_names:
    # Both quartiles from a single quantile() call.
    Q1, Q3 = df[column_name].quantile([0.25, 0.75])
    IQR = Q3 - Q1
    whisker_reach = 1.5*IQR

    lower_whisker = Q1 - whisker_reach
    upper_whisker = Q3 + whisker_reach

    # between() is inclusive on both ends by default, i.e. >= lower and <= upper.
    inside_whiskers = df[column_name].between(lower_whisker, upper_whisker)
    df = df[inside_whiskers]

df.describe()

Unnamed: 0,AtmospherePressure,WindDirection,WindSpeed,Gust,AirTemperature,SeaTemperature,WaveHeight,WavePeriod
count,391691.0,391691.0,391691.0,391691.0,391691.0,391691.0,391691.0,391691.0
mean,1014.709762,204.217605,14.461454,20.359107,11.734562,12.515255,2.090284,5.939886
std,11.267965,90.336266,6.303322,8.473279,2.881194,2.331295,1.192423,1.55046
min,982.7,0.0,0.0,0.0,2.861,5.9,0.1,2.0
25%,1007.458,150.0,10.0,14.0,9.6,10.7,1.2,5.0
50%,1015.6,215.0,14.0,20.0,11.8,12.5,1.875,6.0
75%,1022.6,270.0,19.0,25.473,14.106,14.424,2.8,7.0
max,1045.8,360.0,32.788,46.45,20.8,19.2,5.6,10.0


In [37]:
# The last two entries of column_names are the labels; every column before
# them is used as a feature.
features, labels = column_names[:-2], column_names[-2:]

# Feature matrix: one column per entry in `features`.
X = df[features].to_numpy()
# Label matrix: two columns, one per entry in `labels`.
Y = df[labels].to_numpy()

[[1.4   7.   ]
 [1.3   7.   ]
 [1.4   7.   ]
 ...
 [3.4   7.   ]
 [1.719 4.57 ]
 [3.1   7.   ]]


In [40]:
# Split the dataset 75% / 10% / 15% into training, validation and test sets,
# following the commonly used ratio:
# V. Roshan Joseph. Optimal ratio for data splitting. Statistical Analysis and
# Data Mining: The ASA Data Science Journal, 15(4):531–538, 2022.

# Step 1: hold out 25% of the rows; the remaining 75% is the training set.
X_train, X_holdout, Y_train, Y_holdout = train_test_split(
    X, Y, test_size=0.25, random_state=42
)

# Step 2: divide the hold-out 40/60 -> 10% validation and 15% test overall.
X_val, X_test, Y_val, Y_test = train_test_split(
    X_holdout, Y_holdout, test_size=0.6, random_state=42
)

# y1: WaveHeight (label column 0)
y1_train, y1_val, y1_test = (part[:, 0] for part in (Y_train, Y_val, Y_test))

# y2: WavePeriod (label column 1)
y2_train, y2_val, y2_test = (part[:, 1] for part in (Y_train, Y_val, Y_test))

(293768, 6)
(293768,)
