In [None]:
import argparse
import enum
import hashlib
import json
import math
import os
import time
from copy import deepcopy
from datetime import datetime, timedelta
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from keras.layers import Input, LSTM, Dense, RepeatVector
from keras.models import Model
from keras.models import Sequential
from keras.optimizers import RMSprop
from matplotlib import pyplot
from matplotlib.dates import DateFormatter
from scipy.stats import pearsonr
from skimage.measure import compare_nrmse

### loader.py

In [None]:
def get_working_path():
    """Return the parent of the current working directory as a ``Path``."""
    return Path.cwd().parent


def get_child_dirs(path):
    """Return the paths of the entries directly under ``path`` that are directories."""
    directories = []
    for entry in os.scandir(path):
        if entry.is_dir():
            directories.append(entry.path)
    return directories


def get_child_files(path):
    """Return the paths of the entries directly under ``path`` that are regular files."""
    files = []
    for entry in os.scandir(path):
        if entry.is_file():
            files.append(entry.path)
    return files


def get_power_file(plant_number, date):
    """Build the path of the daily power JSON file for one plant.

    The path is ``<working path>/data/pow_24/UR00000<plant_number>/<YYYYMMDD>.json``.

    Args:
        plant_number: Integer plant identifier, formatted with ``%d``.
        date: Object with ``strftime`` giving the day of the file.

    Returns:
        The joined path as a string.
    """
    plant_dir = "UR00000%d" % plant_number
    day_file = date.strftime("%Y%m%d") + ".json"
    return os.path.join(get_working_path(), "data", "pow_24", plant_dir, day_file)


def get_weather_file(spot_index, date):
    """Build the path of the yearly hourly-weather CSV for one observation spot.

    The file name is ``SURFACE_ASOS_<spot>_HR_<year>_<year>_<year + 1>.csv`` and
    lives under ``<working path>/data/weather``.

    Args:
        spot_index: Integer identifier of the observation spot.
        date: ``date``/``datetime`` whose year selects the file.

    Returns:
        The joined path as a string.
    """
    year = date.year
    file_name = "SURFACE_ASOS_%d_HR_%d_%d_%d.csv" % (spot_index, year, year, year + 1)
    return os.path.join(get_working_path(), "data", "weather", file_name)


def insert_json_row(idx, df, df_insert):
    """Return a new frame with ``df_insert`` spliced into ``df`` before position ``idx``.

    The result is renumbered with a fresh 0..n-1 index.
    """
    pieces = [df.iloc[:idx], df_insert, df.iloc[idx:]]
    return pd.concat(pieces, ignore_index=True)


def check_json_time_series(json_data):
    """Return a frame with exactly one ``result`` record per hour 0..23.

    Each row's ``result`` cell is a dict carrying at least ``logHr`` (the hour,
    convertible with ``int``) and ``hrPow``. Rows are re-ordered by hour. Any hour
    that is missing from the input is filled in:

    * hours 0-22 get ``hrPow = NaN`` so they can be interpolated later;
    * hour 23 gets ``hrPow = 0``.

    Unlike the previous position-based version, this also handles a missing
    hour 22, inputs that stop early (which used to raise ``KeyError``) and
    inputs that already contain hour 23 (which used to end up with 25 rows).
    If an hour occurs more than once the last occurrence wins; hours outside
    0..23 are dropped.

    Args:
        json_data: DataFrame with a ``result`` column of dicts.

    Returns:
        A new 24-row DataFrame with a 0..23 index and the input's columns.
    """
    # Index the existing records by their hour so gaps can be detected anywhere.
    by_hour = {}
    for record in json_data.to_dict('records'):
        by_hour[int(record['result'].get('logHr'))] = record

    rows = []
    for hour in range(24):
        if hour in by_hour:
            rows.append(by_hour[hour])
        else:
            filler = 0 if hour == 23 else np.nan
            rows.append({'result': {'hrPow': filler, 'logHr': "%02d" % hour}})

    # Keep the input's column layout; make sure 'result' exists even for an empty frame.
    columns = list(json_data.columns)
    if 'result' not in columns:
        columns.append('result')
    return pd.DataFrame(rows, columns=columns)


def read_json(plant_number, date):
    """Load one day of hourly power records for a plant as a flat DataFrame.

    When the day's file is missing or empty, a 24-row placeholder with
    ``hrPow = 0`` for every hour is used instead. Otherwise the file is read
    with ``pd.read_json`` and passed through ``check_json_time_series``.

    Returns:
        ``pd.json_normalize`` applied to the ``result`` column.
    """
    path = get_power_file(plant_number, date)

    has_data = os.path.isfile(path) and os.stat(path).st_size != 0
    if has_data:
        json_data = check_json_time_series(pd.read_json(path))
    else:
        placeholder = [{'hrPow': 0, 'logHr': '%02d' % hour} for hour in range(24)]
        json_data = pd.DataFrame.from_dict({"result": placeholder})

    return pd.json_normalize(json_data['result'])


def read_csv(spot_index, date, duration=1):
    """Load the hourly weather rows for ``duration`` consecutive days.

    One CSV file exists per year (see ``get_weather_file``). The requested days
    are grouped by year, each yearly file is read once, and the rows whose
    ``일시`` timestamp contains the day's ``YYYY-MM-DD`` string are collected.

    The previous version overwrote its result on every year, so a range that
    crossed New Year returned only the last year's rows; all years are now
    concatenated in chronological order.

    Args:
        spot_index: Integer identifier of the observation spot.
        date: ``datetime`` of the first day.
        duration: Number of consecutive days to load (at least 1).

    Returns:
        DataFrame with the matching rows, keeping the CSV's own index values.

    Raises:
        ValueError: If ``duration`` is smaller than 1.
    """
    if duration < 1:
        raise ValueError("duration must be at least 1 day, got %r" % (duration,))

    # Group the requested days by calendar year, keeping chronological order.
    days_by_year = {}
    for offset in range(duration):
        day = date + timedelta(days=offset)
        days_by_year.setdefault(day.year, []).append(day.strftime("%Y-%m-%d"))

    frames = []
    for year in sorted(days_by_year):
        str_dates = days_by_year[year]
        path = get_weather_file(spot_index, datetime.strptime(str_dates[0], "%Y-%m-%d"))
        csv_data = pd.read_csv(path, encoding='cp949')
        timestamps = csv_data['일시'].str
        for str_date in str_dates:
            # Literal match; rows with a missing timestamp are treated as non-matching.
            frames.append(csv_data[timestamps.contains(str_date, regex=False, na=False)])

    return pd.concat(frames)

### util.py

In [None]:
def get_power_data(plant_number, date, duration=1):
    """Return the normalised hourly power series for ``duration`` consecutive days.

    The ``hrPow`` column of each day (from ``read_json``) is concatenated,
    missing values are linearly interpolated and the result is scaled with
    ``normalize``.

    ``Series.append`` was removed in pandas 2.0, so the daily series are
    collected first and joined with a single ``pd.concat`` call.

    Args:
        plant_number: Integer plant identifier.
        date: ``datetime`` of the first day.
        duration: Number of consecutive days; the first day is always read.

    Returns:
        1-D numpy array of normalised power values.
    """
    # The first day is read unconditionally, matching the original behaviour
    # for duration values below 1.
    day_count = max(duration, 1)
    daily = [read_json(plant_number, date + timedelta(days=offset))['hrPow']
             for offset in range(day_count)]
    json_data = pd.concat(daily, ignore_index=True)

    power_data = json_data.interpolate(method='linear')
    power_data = normalize(power_data.to_numpy())
    return power_data


def get_weather_data(spot_index, date, feature_type, duration=1):
    """Return one normalised hourly weather feature for ``duration`` days.

    The rows come from ``read_csv``; gaps are filled by ``interpolate_weather``
    over the full ``24 * duration`` hour grid, the series is scaled with
    ``normalize`` and any remaining NaN is replaced by 0.

    Args:
        spot_index: Integer identifier of the observation spot.
        date: ``datetime`` of the first hour of the range.
        feature_type: Name of the CSV column to extract.
        duration: Number of consecutive days.

    Returns:
        1-D numpy array of normalised values.
    """
    csv_data = read_csv(spot_index, date, duration=duration)
    weather_data = interpolate_weather(date, csv_data, feature_type, duration=duration)
    weather_data = normalize(weather_data.to_numpy())
    weather_data = np.nan_to_num(weather_data)

    return weather_data


def insert_row(idx, df, df_insert):
    """Return ``df`` with a timestamp-only row spliced in before position ``idx``.

    ``df_insert`` is a dict whose values become the ``일시`` cells of the new
    row(s); every other column of the new row(s) is NaN. The index is not
    renumbered.
    """
    begin = df.iloc[:idx, ]
    end = df.iloc[idx:, ]
    df_insert = pd.DataFrame(list(df_insert.values()), columns=['일시'])
    return pd.concat([begin, df_insert, end])


def interpolate_weather(date, csv_data, feature_type, duration=None):
    """Align one weather column to a complete hourly grid and fill the gaps.

    The expected timestamps are ``date + h hours`` for ``h`` in
    ``range(24 * duration)``, formatted as ``%Y-%m-%d %H:%M``. The column is
    re-indexed onto that grid (missing hours become NaN) and then linearly
    interpolated.

    The previous version only compared the first 23 rows, so gaps in later
    hours or later days were never filled, and it raised ``IndexError`` when
    the data ran out before row 23.

    Args:
        date: ``datetime`` of the first expected hour.
        csv_data: DataFrame with an ``일시`` timestamp column and ``feature_type``.
        feature_type: Name of the column to extract.
        duration: Number of days on the grid. When ``None`` it is inferred from
            the latest parseable timestamp in ``csv_data`` (at least one day).

    Returns:
        Series of length ``24 * duration`` with a 0..n-1 index.

    Raises:
        ValueError: If the column holds values but none of its timestamps
            matches the expected grid (i.e. the timestamp format differs).
    """
    if duration is None:
        # Cover every day up to the last timestamp present in the data.
        last_seen = pd.to_datetime(csv_data['일시'], errors='coerce').max()
        if pd.isna(last_seen):
            duration = 1
        else:
            span = (last_seen.normalize() - pd.Timestamp(date).normalize()).days + 1
            duration = max(span, 1)

    expected = [(date + timedelta(hours=offset)).strftime("%Y-%m-%d %H:%M")
                for offset in range(24 * duration)]

    # reindex needs unique labels; keep the first row of any repeated timestamp.
    observed = csv_data.drop_duplicates(subset='일시').set_index('일시')[feature_type]
    aligned = observed.reindex(expected)

    if observed.notna().any() and not aligned.notna().any():
        raise ValueError(
            "no '일시' value matches the expected '%Y-%m-%d %H:%M' hourly grid")

    weather_data = aligned.reset_index(drop=True).interpolate(method='linear')
    return weather_data


def normalize(data):
    """Min-max scale ``data`` into the range [0, 1].

    NaN entries are ignored when finding the minimum and maximum and stay NaN
    in the output; the builtin ``min``/``max`` used previously could return NaN
    as soon as one was present, turning the whole result into NaN. A constant
    series (maximum equal to minimum) maps to zeros instead of dividing 0 by 0.

    Args:
        data: Numeric numpy array (or pandas Series).

    Returns:
        The scaled values, same shape as ``data``.
    """
    low = np.nanmin(data)
    span = np.nanmax(data) - low
    if span == 0:
        # All finite values are identical: subtracting the minimum yields zeros.
        return data - low
    normalized = (data - low) / span
    return normalized


def get_input_data(plant_number, spot_index, starting_date, features, duration):
    """Assemble the power series and weather features into one 2-D array.

    Row 0 holds the power data; row ``k + 1`` holds the weather column named by
    ``features[k].value``. Every row has ``24 * duration`` entries.

    Returns:
        Array of shape ``(len(features) + 1, 24 * duration)``.
    """
    rows = np.empty([len(features) + 1, 24 * duration])

    rows[0] = get_power_data(plant_number, starting_date, duration)
    for row, feature in enumerate(features, start=1):
        rows[row] = get_weather_data(spot_index, starting_date, feature.value, duration)

    return rows

### Enum Classes

In [None]:
class FeatureType(enum.Enum):
    """Weather features; each value is the column label used to index the weather CSV.

    The labels of STEAM_PRESSURE and ATMOSPHERIC_PRESSURE were swapped:
    "증기압" is vapour (steam) pressure and "현지기압" is the local station
    (atmospheric) pressure.
    """
    TEMPERATURE = "기온(°C)"
    PRECIPITATION = "강수량(mm)"
    WIND_SPEED = "풍속(m/s)"
    WIND_DIRECTION = "풍향(16방위)"
    HUMIDITY = "습도(%)"
    DEW_POINT_TEMPERATURE = "이슬점온도(°C)"
    STEAM_PRESSURE = "증기압(hPa)"  # vapour pressure
    SUNSHINE = "일조(hr)"
    VISIBILITY = "시정(10m)"
    GROUND_TEMPERATURE = "지면온도(°C)"
    ATMOSPHERIC_PRESSURE = "현지기압(hPa)"  # local station pressure
    
@enum.unique
class FileType(enum.Enum):
    """Kind of artefact an experiment writes to disk."""

    MODEL = 0   # trained model file
    RESULT = 1  # metrics / settings file

### Manage Dataset

In [None]:
class Dataset:
    """Hourly power/weather data for a date range, split into sliding-window samples.

    ``self.data`` is the array built by ``get_input_data``: row 0 is the power
    series and the following rows are the weather features, 24 values per day.
    A sample uses ``x_frames`` days of every weather feature as input and the
    next ``y_frames`` days of power as target.
    """

    def __init__(self, args):
        """Copy the experiment settings from ``args`` and load the data."""
        self.start = datetime.strptime(args.start_date, "%Y%m%d")
        self.end = datetime.strptime(args.end_date, "%Y%m%d")
        self.plant_number = args.plant
        self.spot_index = args.spot
        self.feature_types = args.features
        self.x_frames = args.x_frames
        self.y_frames = args.y_frames
        self.initialize()

    def initialize(self):
        """Compute the inclusive number of days and load the full data array."""
        self.duration = (self.end - self.start).days + 1
        self.data = get_input_data(
            self.plant_number,
            self.spot_index,
            self.start,
            self.feature_types,
            self.duration,
        )

    def get_durations(self):
        """Return ``(train, val, test)`` day counts: floor of 75% / 12.5% / 12.5%."""
        eighth = math.floor(self.duration * 0.125)
        return math.floor(self.duration * 0.75), eighth, eighth

    def get_dates(self):
        """Return three ``[first_day, last_day]`` pairs for train, val and test.

        The partitions are consecutive: each one starts the day after the
        previous one ends.
        """
        spans = []
        cursor = self.start
        for length in self.get_durations():
            last_day = cursor + timedelta(days=length - 1)
            spans.append([cursor, last_day])
            cursor = last_day + timedelta(days=1)
        return tuple(spans)

    def get_item(self, start):
        """Return the ``(X, y)`` sample whose input window begins on day ``start``.

        ``X`` has one row per weather feature with ``x_frames * 24`` values;
        ``y`` is the power row for the following ``y_frames * 24`` hours.
        """
        offset = (start - self.start).days * 24
        split = offset + (self.x_frames * 24)
        stop = offset + ((self.x_frames + self.y_frames) * 24)

        feature_rows = range(1, len(self.feature_types) + 1)
        X = np.asarray([self.data[row][offset:split] for row in feature_rows])
        y = self.data[0][split:stop]
        return X, y

    def get_items(self, start, end):
        """Return stacked ``(X, y)`` arrays for every window fitting in ``[start, end]``."""
        day_count = (end - start).days + 1
        window_count = day_count - (self.x_frames + self.y_frames) + 1

        samples = [self.get_item(start + timedelta(days=shift))
                   for shift in range(window_count)]
        X = [sample[0] for sample in samples]
        y = [sample[1] for sample in samples]
        return np.asarray(X), np.asarray(y)

    def get_dataset(self):
        """Return ``[X, y]`` lists for the train, validation and test partitions."""
        partitions = []
        for first_day, last_day in self.get_dates():
            X, y = self.get_items(first_day, last_day)
            partitions.append([X, y])
        return tuple(partitions)

In [None]:
def load_dataset(args):
    """Build a ``Dataset`` from ``args`` and return its splits keyed by partition name.

    Returns:
        Dict with keys ``'train'``, ``'val'`` and ``'test'``, each holding the
        ``[X, y]`` pair produced by ``Dataset.get_dataset``.
    """
    splits = Dataset(args).get_dataset()
    return dict(zip(('train', 'val', 'test'), splits))

### Train, Validate, Test

In [None]:
def generate_model(args):
    """Build and compile the LSTM encoder / decoder regression model.

    Architecture: ``Input -> LSTM (encoder) -> RepeatVector -> LSTM (decoder,
    initialised with the encoder states) -> Dense(1)``, compiled with RMSprop
    and mean squared error.

    The input width and the number of output steps are derived from
    ``args.x_frames`` and ``args.y_frames`` (24 hourly values per frame), so the
    model matches the arrays produced by ``Dataset.get_item``. When those
    attributes are absent the previous hard-coded sizes (3 and 1 frames, i.e.
    72 and 24) are used.

    Args:
        args: Namespace providing ``features``, ``hid_dim``, ``activation`` and
            ``lr``; optionally ``x_frames`` and ``y_frames``.

    Returns:
        The compiled Keras ``Model``.
    """
    input_width = getattr(args, 'x_frames', 3) * 24
    output_steps = getattr(args, 'y_frames', 1) * 24

    with tf.device('/GPU:0'):
        encoder_inputs = Input(shape=(len(args.features), input_width))

        encoder = LSTM(args.hid_dim, return_state=True)
        encoder_outputs, state_h, state_c = encoder(encoder_inputs)
        encoder_states = [state_h, state_c]

        # Feed the encoder summary to the decoder once per predicted hour.
        repeat_vector = RepeatVector(output_steps)
        repeat_vector_outputs = repeat_vector(encoder_outputs)

        decoder_lstm = LSTM(args.hid_dim, return_sequences=True, return_state=True)
        decoder_outputs, _, _ = decoder_lstm(repeat_vector_outputs, initial_state=encoder_states)

        decoder_dense = Dense(1, activation=args.activation)
        decoder_outputs = decoder_dense(decoder_outputs)

        # `lr` is the deprecated spelling of this argument.
        rmsprop = RMSprop(learning_rate=args.lr)

        model = Model(encoder_inputs, decoder_outputs)
        model.compile(optimizer=rmsprop, loss=tf.keras.losses.MeanSquaredError(), metrics=['accuracy'])

    return model

In [None]:
def train(partition, args):
    """Fit a freshly generated model on the training split.

    Args:
        partition: Dict with ``'train'`` and ``'val'`` entries, each ``[X, y]``.
        args: Namespace providing ``batch_size``, ``epochs``, ``early_stop`` and
            whatever ``generate_model`` reads.

    Returns:
        ``(model, history)`` where ``history`` is the object returned by ``fit``.
    """
    X_train, y_train = partition['train']
    X_val, y_val = partition['val']

    # NOTE(review): early stopping watches the training loss ('loss'), not
    # 'val_loss', even though validation data is supplied - confirm intended.
    fit_options = dict(
        batch_size=args.batch_size,
        epochs=args.epochs,
        callbacks=[tf.keras.callbacks.EarlyStopping(monitor='loss', patience=args.early_stop)],
    )

    model = generate_model(args)
    history = model.fit(X_train, y_train, validation_data=(X_val, y_val), **fit_options)
    model.summary()

    return model, history

In [None]:
def test(partition, model, args):
    """Score ``model`` on the test split and return the NRMSE.

    The predictions are reshaped to the 2-D shape of ``y_test`` before being
    compared with ``compare_nrmse``. ``args`` is accepted but not used.
    """
    X_test, y_test = partition['test']

    rows, cols = y_test.shape[0], y_test.shape[1]
    predictions = model.predict(X_test).reshape(rows, cols)

    return compare_nrmse(y_test, predictions)

In [None]:
def experiment(args):
    """Run one experiment: load the data, train, test and collect the metrics.

    Prints the wall-clock training time.

    Returns:
        ``(settings, model, result)`` where ``settings`` is ``vars(args)`` and
        ``result`` holds the per-epoch loss/accuracy histories, their means and
        the test NRMSE.
    """
    partition = load_dataset(args)

    started = time.time()
    model, history = train(partition, args)
    finished = time.time()

    print('Took {:2.2f} sec for training the model'.format(finished - started))

    test_acc = test(partition, model, args)

    # ======= Add Result to Dictionary ======= #
    logs = history.history
    result = {
        'train_losses': logs['loss'],
        'val_losses': logs['val_loss'],
        'train_accs': logs['accuracy'],
        'val_accs': logs['val_accuracy'],
        'train_acc': np.mean(logs['accuracy']),
        'val_acc': np.mean(logs['val_accuracy']),
        'test_nrmse': test_acc,
    }
    return vars(args), model, result

### Manage Experiment

In [None]:
def get_filepath(setting, filetype):
    """Return the output path for an experiment artefact.

    The file name is ``<exp_name>-<hash>.<ext>`` where ``hash`` is the first six
    hex digits of the SHA-1 of ``str(setting)``. Models go to
    ``<working path>/models`` with extension ``h5``; results go to
    ``<working path>/results`` with extension ``json``.

    Two defects are fixed: the name was built with ``'[]-[].[]'.format(...)``,
    which has no placeholders and always yielded the literal ``[]-[].[]``, and
    the path joined the literal string ``'directory'`` instead of the chosen
    directory.

    Args:
        setting: Dict of experiment settings; must contain ``'exp_name'``.
        filetype: ``FileType.MODEL`` or ``FileType.RESULT``.

    Returns:
        The joined path as a string.

    Raises:
        ValueError: If ``filetype`` is neither of the two supported members.
    """
    exp_name = setting['exp_name']
    hash_key = hashlib.sha1(str(setting).encode()).hexdigest()[:6]

    if filetype == FileType.MODEL:
        file_format = 'h5'
        directory = 'models'
    elif filetype == FileType.RESULT:
        file_format = 'json'
        directory = 'results'
    else:
        raise ValueError("unsupported file type: {!r}".format(filetype))

    filename = '{}-{}.{}'.format(exp_name, hash_key, file_format)
    filepath = os.path.join(get_working_path(), directory, filename)
    return filepath
        
def save_exp_model(setting, model):
    """Save ``model`` to the path ``get_filepath`` assigns to this setting.

    The target directory is created first so that saving does not fail when it
    does not exist yet.
    """
    filepath = get_filepath(setting, FileType.MODEL)
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    model.save(filepath)

def _json_default(value):
    """Convert values ``json`` cannot encode natively (enums, numpy scalars/arrays)."""
    if isinstance(value, enum.Enum):
        return value.name
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    raise TypeError("Object of type {} is not JSON serializable".format(type(value).__name__))


def save_exp_result(setting, result):
    """Write the experiment result, merged with its settings, as JSON.

    ``result`` is updated in place with the entries of ``setting`` (as before).
    The settings contain enum members (the feature list), which ``json.dump``
    cannot encode on its own; they are written by member name, and numpy
    scalars/arrays are converted to plain Python values. The target directory
    is created if it does not exist.

    Args:
        setting: Dict of experiment settings.
        result: Dict of metrics; mutated to include the settings.
    """
    filepath = get_filepath(setting, FileType.RESULT)
    os.makedirs(os.path.dirname(filepath), exist_ok=True)

    result.update(setting)
    with open(filepath, 'w') as f:
        json.dump(result, f, default=_json_default)
        
def load_exp_result(exp_name):
    """Load every saved result whose file name contains ``exp_name``.

    Looks at the regular ``.json`` files directly inside
    ``<working path>/results``, in sorted name order. The previous version
    called the undefined names ``listdeir``, ``isfile`` and ``join``; the
    ``os`` / ``os.path`` equivalents are used instead.

    Args:
        exp_name: Substring that selects the result files.

    Returns:
        DataFrame with one row per loaded result file (empty if the results
        directory does not exist or nothing matches).
    """
    dir_path = os.path.join(get_working_path(), "results")
    if not os.path.isdir(dir_path):
        return pd.DataFrame([])

    list_result = []
    for filename in sorted(os.listdir(dir_path)):
        file_path = os.path.join(dir_path, filename)
        if not os.path.isfile(file_path):
            continue
        if not filename.endswith('.json') or exp_name not in filename:
            continue
        with open(file_path, 'r') as infile:
            list_result.append(json.load(infile))

    df = pd.DataFrame(list_result)
    return df

In [None]:
# ====== Random Seed Initialization ====== #
# Only TensorFlow's global seed is set here.
seed = 1234
tf.random.set_seed(seed)

# Parsing an empty argument string yields an empty Namespace that is filled in
# by hand below. The order of these assignments matters: get_filepath() hashes
# str(setting), and setting is vars(args), so reordering changes the file names.
parser = argparse.ArgumentParser()
args = parser.parse_args("")
args.exp_name = "solar power prediction using weather features"

# ====== Data Loading ====== #
args.plant = 126
args.spot = 174
args.start_date = "20190820"  # parsed with "%Y%m%d" in Dataset.__init__
args.end_date = "20191231"
args.batch_size = 64
args.x_frames = 3  # days of weather used as input per sample
args.y_frames = 1  # days of power predicted per sample

# ====== Model Capacity ===== #
args.hid_dim = 256

# ====== Optimizer & Training ====== #
# optim, lss and evaluation are not read by the functions in this notebook;
# they are only stored alongside the other settings.
args.optim = 'RMSprop'
args.activation = 'relu'
args.lss = 'MSE'
args.lr = 0.001
args.epochs = 256
args.early_stop = 30  # patience passed to EarlyStopping in train()
args.evaluation = 'NRMSE'

# ====== Experiment Variable ====== #
# Features are added one at a time in this order (see the loop below).
name_var1 = 'features'
list_var1 = [FeatureType.GROUND_TEMPERATURE,
            FeatureType.VISIBILITY,
            FeatureType.WIND_SPEED,
            FeatureType.WIND_DIRECTION,
            FeatureType.TEMPERATURE,
            FeatureType.STEAM_PRESSURE,
            FeatureType.ATMOSPHERIC_PRESSURE,
            FeatureType.HUMIDITY,
            FeatureType.DEW_POINT_TEMPERATURE]

# Run one experiment per cumulative feature subset: the first feature only,
# then the first two, and so on up to the whole list.
for i in range(len(list_var1)):
    sub_list = list_var1[:i+1]
    setattr(args, name_var1, sub_list)
    print(args)
    
    # A deep copy is passed so the returned settings are independent of the
    # next iteration's setattr on the shared args object.
    setting, model, result = experiment(deepcopy(args))
    save_exp_model(setting, model)
    save_exp_result(setting, result)