# Prepare inputs for neural network

In [1]:
import numpy as np
import pandas as pd
import netCDF4
import glob
from scipy import interpolate
import tqdm
from scipy.spatial.distance import cdist
import datetime

In [2]:
import os
# Every relative path below is resolved against the project data directory.
# The location can be overridden with the EEM20_DATA_DIR environment variable;
# when it is unset the original external-drive path is used.
os.chdir(os.environ.get('EEM20_DATA_DIR', '/Volumes/My Passport/eem20/'))

In [3]:
# turbine features
turbines = pd.read_csv('data_turbines/windturbines_fillna_shiftlon.csv')
# presumably leftover index columns from earlier CSV round-trips -- not used below
turbines = turbines.drop(columns=['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1'])

# construct weather grid; the coordinate arrays are read into memory inside the
# `with` block so the file handle is released as soon as they are loaded
with netCDF4.Dataset('data_weather/20000915T00Z.nc', 'r') as weather:
    lats = weather.variables['latitude'][:,:]
    lons = weather.variables['longitude'][:,:]
grid = np.column_stack([lons.flatten(), lats.flatten()])

# shape of the 2-D coordinate grid, taken from the data rather than hard-coded,
# so a flat index into `grid` can be mapped back to (x, y) grid indices
grid_shape = lats.shape

grid_x = []
grid_y = []

# (longitude, latitude) of every turbine, extracted once instead of
# boolean-masking the whole frame for each turbine
turbine_lonlat = turbines[['Longitude', 'Latitude']].values

# pull out latlon of turbine
for i in tqdm.tqdm(range(len(turbines))):

    # position of turbine, kept 2-D (1, 2) as cdist requires
    lonlat = turbine_lonlat[i:i+1]

    # calculate distances to weather grid positions
    # NOTE(review): cdist's default metric is Euclidean, applied here to raw
    # longitude/latitude values -- not a geodesic distance.
    distance = cdist(grid, lonlat).ravel()

    # index of closest gridpoint; argmin returns the first minimum in
    # row-major order, the same element the former np.where(...)[k][0] picked
    nearest_x, nearest_y = np.unravel_index(np.argmin(distance), grid_shape)
    grid_x.append(int(nearest_x))
    grid_y.append(int(nearest_y))

# add to dataframe
turbines.insert(len(turbines.columns), "grid x", grid_x, True)
turbines.insert(len(turbines.columns), "grid y", grid_y, True)

100%|██████████| 4004/4004 [00:21<00:00, 188.24it/s]


In [4]:
# sector power
sector_power = pd.read_csv('data_turbines/windpower_task1.csv')

# cut out the missing dates: May 14 2000, September 26 2000
#
# The time stamps are the strings in the unnamed first CSV column. Rows are
# matched on individual characters of that string: position 3, positions 6 and
# 8-9.
# NOTE(review): this presumably targets a 'YYYY-MM-DD ...' layout (last year
# digit, last month digit, day) -- confirm against the CSV. Only one year digit
# is tested, so any year ending in 0 would match, not just 2000.
stamps = sector_power['Unnamed: 0'].astype(str)


def _stamp_has(position, char):
    """Boolean Series: True where the stamp has `char` at index `position`.

    Stamps that are too short compare as False instead of raising.
    """
    return stamps.str[position] == char


is_may_14 = _stamp_has(3, '0') & _stamp_has(6, '5') & _stamp_has(8, '1') & _stamp_has(9, '4')
is_sep_26 = _stamp_has(3, '0') & _stamp_has(6, '9') & _stamp_has(8, '2') & _stamp_has(9, '6')

# row labels of the stamps falling on either missing date
drop_indices = sector_power.index[is_may_14 | is_sep_26].tolist()

sector_power = sector_power.drop(index=drop_indices)

In [5]:
# One frame of turbines per price region (SE1..SE4); rows keep their original
# index labels and all columns, including the "grid x"/"grid y" lookups.
price_region = turbines['Price region']
turbines_SE1 = turbines.loc[price_region == 'SE1']
turbines_SE2 = turbines.loc[price_region == 'SE2']
turbines_SE3 = turbines.loc[price_region == 'SE3']
turbines_SE4 = turbines.loc[price_region == 'SE4']

In [9]:
# sort out the global time stamp list for the weather files

# glob returns paths in no guaranteed order; sorting makes the file list (and
# therefore the processing order downstream) reproducible between runs
listoffiles = sorted(glob.glob("round_3/Comp_data/*.nc"))
time_stamps = []
for file in listoffiles:
    # the context manager closes each file once its time axis has been read,
    # instead of leaving one open handle per weather file
    with netCDF4.Dataset(file, 'r') as weather:
        time_variable = weather.variables['time']
        units = time_variable.units
        # convert the numeric time axis to date objects using the file's own units
        Timestamp = netCDF4.num2date(time_variable[:], units=units)
    time_stamps += Timestamp.tolist()

# sort into time-ordered array
time_stamps.sort()

In [11]:
# loop over weather data
#
# For every time stamp and every turbine, store the mean and standard deviation
# (taken over the first axis of each per-time-step field) of each weather
# feature at the turbine's nearest grid point.
# NOTE(review): the first axis is presumably the ensemble-member dimension --
# confirm against the netCDF files.
#
# This cell needs turbines_SE1..SE4 to still be the per-sector DataFrames with
# the "grid x"/"grid y" columns; the later cell that trims them to feature
# arrays reuses those names, so re-running this cell afterwards will fail.

# weather features to process
weather_features = ['Temperature', 'RelativeHumidity', 'WindSpeed', 'Pressure', 'CloudCover', 'WindGustSpeed']

# number of time stamps
n_times = len(time_stamps)

# two channels per feature: channel 2*f is the mean, channel 2*f+1 the std-dev
n_channels = 2*len(weather_features)

weather_at_turbine_locations_SE1 = np.zeros(shape=(n_times, len(turbines_SE1.index), n_channels))
weather_at_turbine_locations_SE2 = np.zeros(shape=(n_times, len(turbines_SE2.index), n_channels))
weather_at_turbine_locations_SE3 = np.zeros(shape=(n_times, len(turbines_SE3.index), n_channels))
weather_at_turbine_locations_SE4 = np.zeros(shape=(n_times, len(turbines_SE4.index), n_channels))

# pair each sector's output array with its turbines' grid indices, extracted
# once here rather than inside the innermost loop
sector_targets = [
    (weather_at_turbine_locations_SE1, turbines_SE1['grid x'].values, turbines_SE1['grid y'].values),
    (weather_at_turbine_locations_SE2, turbines_SE2['grid x'].values, turbines_SE2['grid y'].values),
    (weather_at_turbine_locations_SE3, turbines_SE3['grid x'].values, turbines_SE3['grid y'].values),
    (weather_at_turbine_locations_SE4, turbines_SE4['grid x'].values, turbines_SE4['grid y'].values),
]

# loop over netCDF weather files
for file_index in tqdm.tqdm(range(len(listoffiles))):

    # the context manager closes the file when its time steps are done
    with netCDF4.Dataset(listoffiles[file_index], 'r') as ds:
        units = ds.variables['time'].units
        times = netCDF4.num2date(ds.variables['time'][:], units=units)

        # loop over every time step actually present in this file (not a fixed
        # 24), matching how the global time stamp list was assembled
        for t in range(len(times)):

            # index where this time stamp will go
            i = time_stamps.index(times[t])

            # loop over weather features
            for f, feature in enumerate(weather_features):

                # for wind speed, combine U and V, else just pull out the feature
                if feature == 'WindSpeed':
                    data = np.sqrt(ds.variables['Wind_U'][t,:,:,:]**2 + ds.variables['Wind_V'][t,:,:,:]**2)
                else:
                    data = ds.variables[feature][t,:,:,:]

                # mean and std-dev
                mean = data.mean(axis=0)
                sd = data.std(axis=0)

                # sample both statistics at each sector's turbine grid points
                for target, gx, gy in sector_targets:
                    target[i,:,2*f] = mean[gx, gy]
                    target[i,:,2*f+1] = sd[gx, gy]

100%|██████████| 61/61 [02:55<00:00,  2.88s/it]


In [12]:
# Reduce each sector's turbine table to a plain (n_turbines, 4) array holding
# only the static features listed below, in this column order.
# Note: this rebinds turbines_SE1..SE4 from DataFrames to numpy arrays.
turbine_features = ['Terrain height [m]', 'Nacelle height [m]', 'Rotor diameter [m]', 'Max power [MW]']

price_region = turbines['Price region']

turbines_SE1 = np.column_stack([turbines.loc[price_region == 'SE1', name].values for name in turbine_features])
turbines_SE2 = np.column_stack([turbines.loc[price_region == 'SE2', name].values for name in turbine_features])
turbines_SE3 = np.column_stack([turbines.loc[price_region == 'SE3', name].values for name in turbine_features])
turbines_SE4 = np.column_stack([turbines.loc[price_region == 'SE4', name].values for name in turbine_features])

In [13]:
# Installation dates per sector, and a 0/1 "already installed" flag for every
# (time stamp, turbine) pair.


def _parse_installation_date(raw):
    """Turn one 'Installation date' string into a datetime.

    The literal value '1881-12-31' is read as year-month-day; every other
    value is read as day/month/two-digit-year. Two-digit years that strptime
    places after 2020 are moved back one century (e.g. 2064 -> 1964).
    """
    date_format = '%Y-%m-%d' if raw == '1881-12-31' else '%d/%m/%y'
    parsed = datetime.datetime.strptime(raw, date_format)
    if parsed.year > 2020:
        parsed = parsed.replace(year=parsed.year - 100)
    return parsed


# one list of parsed dates per sector, turbines in their table order
installation_dates = []
for sector in ['SE1', 'SE2', 'SE3', 'SE4']:
    raw_dates = turbines.loc[turbines['Price region'] == sector, 'Installation date']
    installation_dates.append([_parse_installation_date(raw) for raw in raw_dates])

# one (n_times, n_turbines, 1) array per sector: 1 where the turbine's
# installation date lies strictly before the weather time stamp, else 0
installed = []
for sector_dates in installation_dates:

    flags = np.zeros((len(time_stamps), len(sector_dates), 1))

    for row, stamp in enumerate(time_stamps):
        for col, commissioned in enumerate(sector_dates):
            flags[row, col, 0] = int(commissioned < stamp)

    installed.append(flags)

In [14]:
# Write every prepared array to disk as .npy.
# The table maps each output path to the array stored there; entries are
# written in the order listed.
prepared_outputs = {
    # weather statistics at the turbine locations
    'round_3/data_prepared/weather_at_turbine_locations_SE1.npy': weather_at_turbine_locations_SE1,
    'round_3/data_prepared/weather_at_turbine_locations_SE2.npy': weather_at_turbine_locations_SE2,
    'round_3/data_prepared/weather_at_turbine_locations_SE3.npy': weather_at_turbine_locations_SE3,
    'round_3/data_prepared/weather_at_turbine_locations_SE4.npy': weather_at_turbine_locations_SE4,

    # static turbine features
    'round_3/data_prepared/turbines_SE1.npy': turbines_SE1,
    'round_3/data_prepared/turbines_SE2.npy': turbines_SE2,
    'round_3/data_prepared/turbines_SE3.npy': turbines_SE3,
    'round_3/data_prepared/turbines_SE4.npy': turbines_SE4,

    # sector power, stored as a column vector of shape (n, 1)
    'round_3/data_prepared/sector_power_SE1.npy': np.atleast_2d(sector_power['SE1'].values).T,
    'round_3/data_prepared/sector_power_SE2.npy': np.atleast_2d(sector_power['SE2'].values).T,
    'round_3/data_prepared/sector_power_SE3.npy': np.atleast_2d(sector_power['SE3'].values).T,
    'round_3/data_prepared/sector_power_SE4.npy': np.atleast_2d(sector_power['SE4'].values).T,

    # installed flags, one array per sector in SE1..SE4 order
    'round_3/data_prepared/installed_SE1.npy': installed[0],
    'round_3/data_prepared/installed_SE2.npy': installed[1],
    'round_3/data_prepared/installed_SE3.npy': installed[2],
    'round_3/data_prepared/installed_SE4.npy': installed[3],
}

for output_path, array in prepared_outputs.items():
    np.save(output_path, array)

In [11]:
# calculate shifts and scales for the turbine feature and weather inputs
#
# shift = per-feature median, scale = per-feature range (max - min), each taken
# over all four sectors together.

# stack the sectors' turbine features once: (total turbines, n_features)
all_turbine_features = np.vstack([turbines_SE1, turbines_SE2, turbines_SE3, turbines_SE4])

turbine_features_shift = np.median(all_turbine_features, axis=0)
turbine_features_scale = np.max(all_turbine_features, axis=0) - np.min(all_turbine_features, axis=0)

# join the sectors along the turbine axis once, instead of rebuilding this
# large array for each of the three reductions
all_weather = np.concatenate([weather_at_turbine_locations_SE1,
                              weather_at_turbine_locations_SE2,
                              weather_at_turbine_locations_SE3,
                              weather_at_turbine_locations_SE4], axis=1)

# reduce over time stamps and turbines, leaving one value per weather channel
weather_features_shift = np.median(all_weather, axis=(0,1))
weather_features_scale = np.max(all_weather, axis=(0,1)) - np.min(all_weather, axis=(0,1))

# the combined copy is only needed for the reductions above
del all_weather

# NOTE(review): these go to 'data_prepared/', whereas the arrays are saved under
# 'round_3/data_prepared/' -- confirm that overwriting the shared shift/scale
# files with statistics from this data set is intended.
np.save('data_prepared/turbine_features_shift.npy', turbine_features_shift)
np.save('data_prepared/turbine_features_scale.npy', turbine_features_scale)
np.save('data_prepared/weather_features_shift.npy', weather_features_shift)
np.save('data_prepared/weather_features_scale.npy', weather_features_scale)