In [1]:
import sys
import os
import numpy as np
from pathlib import Path
root = Path(os.getcwd()).parent.parent
sys.path.append(str(Path(os.getcwd()).parent))
np.set_printoptions(threshold=sys.maxsize, suppress=True)

In [2]:
# set korean for matplotlib
from matplotlib import font_manager, rc
font_name = font_manager.FontProperties(fname="c:/Windows/Fonts/malgun.ttf").get_name()
rc('font', family=font_name)

In [3]:
from loader import DataType, Dataset, Power, Weather, Loader
from constant import FeatureType
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import pandas as pd
import argparse

## Setting

In [4]:
parser = argparse.ArgumentParser()
args = parser.parse_args("")

# ====== Data ====== #
args.years = [2017, 2018, 2019]
args.region = "Jindo"
args.station = 192

# Load Dataset

## Power

In [5]:
def check_missing_dates(year, power_data):
    date_checker = datetime.strptime("%d.01.01" % year, "%Y.%m.%d")
    dates = power_data['년월일']
    count = 0

    for index, value in dates.items():
        new_date = datetime.strptime(value, "%Y.%m.%d")
        if date_checker != new_date:
            print("standard: %s, file: %s" % (str(date_checker, str(new_date))))
            date_checker = new_date
            count += 1
        date_checker = date_checker + timedelta(days=1)
        
    print("%d missing dates" % count)
    
def check_midnight_values(power_data):
    midnights = power_data['24']
    for index, value in midnights.items():
        count = 0
        if value != 0:
            print("index: %d, value: %d" % (index, value))
            count += 0
    print("%d value(s) are not zero" % count)
    
def convert_df_to_list(power_data, extra=1, zfill=1):
    power_data_list = []
    for index, row in power_data.iterrows():
        for i in range(1, 24):
            value = row[str(i).zfill(zfill)] * extra
            power_data_list.append(int(value))
        if index == 0: print(power_data_list)
        power_data_list.append(0)
        
    return power_data_list

In [6]:
power_path = os.path.join(root, "data", "pow")
extras = [1, 1000, 1]
zfills = [1, 1, 2]

power_data_list = []
for i, year in enumerate(args.years):
    power_file_path = os.path.join(power_path, "%s_%d_%d.csv" % (args.region, args.station, year))
    power_data = pd.read_csv(power_file_path, encoding='euc-kr')
    
    check_missing_dates(year, power_data)
    check_midnight_values(power_data)
    
    power_data_yearly = convert_df_to_list(power_data, extra=extras[i], zfill=zfills[i])
    power_data_list.append(power_data_yearly)
power_data = sum(power_data_list, [])

0 missing dates
0 value(s) are not zero
[0, 0, 0, 0, 0, 0, 4, 28, 59, 125, 174, 223, 220, 188, 122, 33, 5, 0, 0, 0, 0, 0, 0]
0 missing dates
0 value(s) are not zero
[0, 0, 0, 0, 0, 0, 3, 21, 63, 123, 180, 233, 243, 198, 131, 31, 3, 0, 0, 0, 0, 0, 0]
0 missing dates
0 value(s) are not zero
[0, 0, 0, 0, 0, 0, 5, 22, 57, 117, 159, 0, 211, 226, 126, 92, 20, 4, 0, 0, 0, 0, 0]


## Weather

In [7]:
weather_path = os.path.join(root, "data", "weather")

features = [FeatureType.SUNSHINE,
                 FeatureType.HUMIDITY,
                 FeatureType.WIND_SPEED,
                 FeatureType.VISIBILITY,
                 FeatureType.GROUND_TEMPERATURE,
                 FeatureType.WIND_DIRECTION,
                 FeatureType.STEAM_PRESSURE,
                 FeatureType.TEMPERATURE,
                 FeatureType.PRECIPITATION,
                 FeatureType.DEW_POINT_TEMPERATURE,
                 FeatureType.ATMOSPHERIC_PRESSURE]

In [8]:
def check_missing_dates(year, weather_data):
    weather_data['일시'] = pd.to_datetime(weather_data['일시'], format='%Y-%m-%d %H:%M')
    full_idx = pd.date_range(start=weather_data['일시'].min(), end=weather_data['일시'].max(), freq='60T')

    missing_hour_filled_weather = weather_data.set_index('일시').reindex(full_idx).rename_axis('일시').reset_index()

    start_date = datetime.strptime("%d-01-01 00:00" % year, '%Y-%m-%d %H:%M')
    missing_dates = missing_hour_filled_weather['일시'].isin(weather_data['일시'])
    missing_dates = [(start_date + timedelta(hours=i)).strftime('%Y-%m-%d %H:%M') for i, val in enumerate(missing_dates) if not val] 

    print("missing dates:", missing_dates)
    return missing_hour_filled_weather

def interpolate_weather(weather_data, features):
    weather_df = pd.DataFrame()
    weather_df['일시'] = weather_data['일시']
    for feature in features:
        weather_raw = weather_data[feature.value]
        if feature == FeatureType.PRECIPITATION or feature == FeatureType.SUNSHINE:
            weather_feature = weather_data[feature.value].fillna(0)
        else:
            weather_feature = weather_data[feature.value]
        weather_feature = weather_feature.interpolate(mathoed='linear')
        weather_df[feature.value] = weather_feature
    return weather_df

In [9]:
weather_data_list = []
for i, year in enumerate(args.years):
    filename = "SURFACE_ASOS_%d_HR_%d_%d_%d.csv" % (args.station, year, year, year + 1)
    weather_file_path = os.path.join(weather_path, filename)
    weather_data = pd.read_csv(weather_file_path, encoding='euc-kr')
    weather_data = check_missing_dates(year, weather_data)
    weather_data_yearly = interpolate_weather(weather_data, features)
    weather_data_yearly = weather_data_yearly.set_index('일시')
    weather_data_list.append(weather_data_yearly)
    
weather_data = pd.concat(weather_data_list)

missing dates: []
missing dates: ['2018-01-13 09:00', '2018-01-13 10:00', '2018-01-13 11:00', '2018-01-13 12:00', '2018-01-13 13:00']
missing dates: []


In [10]:
weather_data.index

DatetimeIndex(['2017-01-01 00:00:00', '2017-01-01 01:00:00',
               '2017-01-01 02:00:00', '2017-01-01 03:00:00',
               '2017-01-01 04:00:00', '2017-01-01 05:00:00',
               '2017-01-01 06:00:00', '2017-01-01 07:00:00',
               '2017-01-01 08:00:00', '2017-01-01 09:00:00',
               ...
               '2019-12-31 14:00:00', '2019-12-31 15:00:00',
               '2019-12-31 16:00:00', '2019-12-31 17:00:00',
               '2019-12-31 18:00:00', '2019-12-31 19:00:00',
               '2019-12-31 20:00:00', '2019-12-31 21:00:00',
               '2019-12-31 22:00:00', '2019-12-31 23:00:00'],
              dtype='datetime64[ns]', name='일시', length=26280, freq=None)

In [11]:
weather_data

Unnamed: 0_level_0,일조(hr),습도(%),풍속(m/s),시정(10m),지면온도(°C),풍향(16방위),현지기압(hPa),기온(°C),강수량(mm),이슬점온도(°C),증기압(hPa)
일시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2017-01-01 00:00:00,0.0,85.0,0.0,720.0,-1.3,0.0,1026.5,-2.4,0.0,-4.5,4.4
2017-01-01 01:00:00,0.0,86.0,0.1,700.0,-1.5,0.0,1026.2,-3.0,0.0,-5.0,4.2
2017-01-01 02:00:00,0.0,87.0,0.0,637.0,-1.7,0.0,1026.6,-3.1,0.0,-4.9,4.2
2017-01-01 03:00:00,0.0,88.0,0.1,626.0,-1.8,0.0,1026.8,-3.5,0.0,-5.2,4.2
2017-01-01 04:00:00,0.0,88.0,0.3,573.0,-1.9,0.0,1026.2,-3.6,0.0,-5.3,4.1
...,...,...,...,...,...,...,...,...,...,...,...
2019-12-31 19:00:00,0.0,29.0,1.6,2000.0,-1.1,20.0,1028.5,-0.7,0.0,-16.5,1.7
2019-12-31 20:00:00,0.0,31.0,1.0,2000.0,-1.7,50.0,1028.3,-1.5,0.0,-16.4,1.7
2019-12-31 21:00:00,0.0,31.0,0.8,2000.0,-2.0,50.0,1028.4,-2.4,0.0,-17.2,1.6
2019-12-31 22:00:00,0.0,39.0,0.4,2000.0,-2.5,0.0,1028.6,-3.7,0.0,-15.6,1.8


# Scale Dataset

In [12]:
from sklearn.preprocessing import MinMaxScaler

In [13]:
power_data = np.asarray(power_data)

power_scaler = MinMaxScaler()
power_scaled = power_scaler.fit_transform(power_data.reshape(-1, 1))
power_scaled

array([[0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.00203459],
       [0.01424212],
       [0.03001017],
       [0.06358087],
       [0.08850458],
       [0.11342828],
       [0.11190234],
       [0.09562564],
       [0.06205493],
       [0.01678535],
       [0.00254323],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.00101729],
       [0.01220753],
       [0.03051882],
       [0.06154629],
       [0.08290946],
       [0.1093591 ],
       [0.11190234],
       [0.08901322],
       [0.03204476],
       [0.01068159],
       [0.00152594],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.   

In [14]:
str_features = [feature.value for feature in features]

weather_scaler = MinMaxScaler()
weather_data[str_features] = weather_scaler.fit_transform(weather_data[str_features])
weather_data

Unnamed: 0_level_0,일조(hr),습도(%),풍속(m/s),시정(10m),지면온도(°C),풍향(16방위),현지기압(hPa),기온(°C),강수량(mm),이슬점온도(°C),증기압(hPa)
일시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2017-01-01 00:00:00,0.0,0.838710,0.000000,0.346899,0.132561,0.000000,0.861423,0.225296,0.0,0.425688,0.109827
2017-01-01 01:00:00,0.0,0.849462,0.013333,0.337209,0.129987,0.000000,0.855805,0.213439,0.0,0.416514,0.104046
2017-01-01 02:00:00,0.0,0.860215,0.000000,0.306686,0.127413,0.000000,0.863296,0.211462,0.0,0.418349,0.104046
2017-01-01 03:00:00,0.0,0.870968,0.013333,0.301357,0.126126,0.000000,0.867041,0.203557,0.0,0.412844,0.104046
2017-01-01 04:00:00,0.0,0.870968,0.040000,0.275678,0.124839,0.000000,0.855805,0.201581,0.0,0.411009,0.101156
...,...,...,...,...,...,...,...,...,...,...,...
2019-12-31 19:00:00,0.0,0.236559,0.213333,0.967054,0.135135,0.055556,0.898876,0.258893,0.0,0.205505,0.031792
2019-12-31 20:00:00,0.0,0.258065,0.133333,0.967054,0.127413,0.138889,0.895131,0.243083,0.0,0.207339,0.031792
2019-12-31 21:00:00,0.0,0.258065,0.106667,0.967054,0.123552,0.138889,0.897004,0.225296,0.0,0.192661,0.028902
2019-12-31 22:00:00,0.0,0.344086,0.053333,0.967054,0.117117,0.000000,0.900749,0.199605,0.0,0.222018,0.034682
