In [16]:
# imports
from csv import DictReader, DictWriter
from tqdm import tqdm
import pandas as pd
import numpy as np

In [17]:
# Names assigned positionally to the columns of the raw CSV
# (handed to DictReader as its fieldnames).
original_column_names = [
    'station_id', 'longitude', 'latitude', 'time',
    'AtmospherePressure', 'WindDirection', 'WindSpeed', 'Gust',
    'WaveHeight', 'WavePeriod', 'MeanWaveDirection', 'Hmax',
    'AirTemperature', 'DewPoint', 'SeaTemperature', 'RelativeHumidity',
]

# Subset of the raw columns that is carried over into the interim file,
# in the order they are written.
new_column_names = [
    'AtmospherePressure',
    'WindDirection',
    'WindSpeed',
    'Gust',
    'AirTemperature',
    'RelativeHumidity',
    'WaveHeight',
]

In [18]:
# Stream the raw CSV row by row, keep only the columns in new_column_names,
# and skip every row in which one of those columns has no usable value.
#
# newline='' on both files is what the csv module documentation requires:
# without it the '\n' terminator chosen below is rewritten to '\r\n' on
# Windows, and newlines embedded in quoted fields are misread.
with open('../data/raw_data.csv', 'r', newline='') as f1, \
        open('../data/interim_data.csv', 'w', newline='') as f2:
    reader = DictReader(f1, fieldnames=original_column_names)
    writer = DictWriter(f2, fieldnames=new_column_names, lineterminator='\n')

    # fieldnames are supplied explicitly, so the reader treats every line as
    # data; the first two lines are discarded by hand.
    # NOTE(review): presumably a header row followed by a units row - confirm
    # against the raw file.
    next(reader)
    next(reader)

    writer.writeheader()

    for line in tqdm(reader):
        kept = {column_name: line[column_name] for column_name in new_column_names}
        # Skip the row when a kept column is the literal 'NaN', is blank, or is
        # None (DictReader fills None in when a row has fewer fields than
        # fieldnames); otherwise an empty cell would reach the interim file.
        if any(value is None or value.strip() in ('', 'NaN') for value in kept.values()):
            continue
        writer.writerow(kept)

613392it [00:02, 284366.26it/s]


In [19]:
# Load the rows written to the interim file and show per-column summary
# statistics as the cell output.
interim_path = '../data/interim_data.csv'
df = pd.read_csv(interim_path)
df.describe()

Unnamed: 0,AtmospherePressure,WindDirection,WindSpeed,Gust,AirTemperature,RelativeHumidity,WaveHeight
count,403845.0,403845.0,403845.0,403845.0,403845.0,403845.0,403845.0
mean,1013.774914,206.758608,15.104633,21.405105,11.61408,82.416231,2.289677
std,12.234193,89.428016,6.875784,9.52244,2.87607,9.714247,1.523549
min,909.9,0.0,0.0,0.0,0.264,0.391,0.078
25%,1006.4,150.0,10.0,14.658,9.5,75.391,1.2
50%,1015.0,220.0,15.0,20.0,11.6,83.0,1.953
75%,1022.29,271.0,19.781,27.0,14.0,90.0,3.0
max,1046.4,360.0,76.506,200.0,26.172,100.0,14.1


In [20]:
# Remove outliers one column at a time: a row survives a column when its value
# lies within 1.5 * IQR of that column's first and third quartiles.
# NOTE(review): df is narrowed after every column, so the quartiles of each
# later column are computed on the rows that survived the earlier columns;
# the result therefore depends on the order of new_column_names.
for feature in new_column_names:
    values = df[feature]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    whisker_reach = 1.5 * (third_quartile - first_quartile)
    # between() includes both bounds by default, matching >= / <=.
    df = df[values.between(first_quartile - whisker_reach, third_quartile + whisker_reach)]

df.describe()

Unnamed: 0,AtmospherePressure,WindDirection,WindSpeed,Gust,AirTemperature,RelativeHumidity,WaveHeight
count,378551.0,378551.0,378551.0,378551.0,378551.0,378551.0,378551.0
mean,1014.768286,204.638287,14.465392,20.374661,11.724571,82.731325,2.057696
std,11.28586,90.320138,6.284241,8.417713,2.874487,9.571322,1.164179
min,982.593,0.0,0.0,0.0,2.896,53.516,0.078
25%,1007.6,150.0,10.0,14.0,9.6,76.0,1.2
50%,1015.6,216.0,14.0,20.0,11.8,83.594,1.8
75%,1022.62,270.0,19.0,25.189,14.1,90.234,2.734
max,1045.8,360.0,33.016,46.251,20.771,100.0,5.43


In [21]:
# Feature engineering
# Replace WindDirection (converted from degrees to radians) by its sine and
# cosine, so that directions of 0 and 360 map to the same pair of values.
wind_dir_rad = np.deg2rad(df['WindDirection'])
df = df.assign(
    WindDirectionSin=np.sin(wind_dir_rad),
    WindDirectionCos=np.cos(wind_dir_rad),
)

# Selecting the final columns fixes their order (WaveHeight last) and also
# leaves the raw WindDirection column behind.
final_columns = [
    'AtmospherePressure',
    'WindSpeed',
    'Gust',
    'RelativeHumidity',
    'WindDirectionSin',
    'WindDirectionCos',
    'AirTemperature',
    'WaveHeight',
]
df = df[final_columns]

In [22]:
# Write the engineered frame to disk; index=False keeps the row index out of
# the file.
processed_path = '../data/processed_data.csv'
df.to_csv(processed_path, index=False)