In [1]:
import requests
import numpy as np
import pandas as pd
import time
import ephem

In [2]:
def get_coordinates(city):
    """Look up the primary coordinates of an English Wikipedia article.

    Parameters
    ----------
    city : str
        Title of the Wikipedia article to query (e.g. ``'Oldenburg_(city)'``).

    Returns
    -------
    tuple of (str, str)
        Latitude and longitude of the first page in the API response, each
        converted with ``str``.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.
    KeyError
        If the response contains no page, or the page has no coordinates.
    """
    # Let requests build the query string: titles containing umlauts, commas
    # or parentheses are percent-encoded correctly, and ``format`` is sent as
    # a clean parameter (the hand-built URL sent it as ``%20format``, i.e.
    # with a stray leading space in the parameter name).
    response = requests.get(
        'https://en.wikipedia.org/w/api.php',
        params={
            'action': 'query',
            'prop': 'coordinates',
            'titles': city,
            'format': 'json',
        },
        timeout=30,  # never hang the notebook on a stalled connection
    )
    response.raise_for_status()

    pages = response.json()['query']['pages']

    # Only a single title is requested, so the first page is the answer.
    for page in pages.values():
        coordinates = page.get('coordinates')
        if not coordinates:
            raise KeyError(f'No coordinates found on Wikipedia for {city!r}')
        primary = coordinates[0]
        return str(primary['lat']), str(primary['lon'])

    # Previously an empty result fell through and returned None, which only
    # failed later when the caller tried to unpack it.
    raise KeyError(f'Wikipedia returned no page for {city!r}')

In [3]:
def get_elevation(lat, long):
    """Query the Open Topo Data ``eudem25m`` dataset for a point's elevation.

    Parameters
    ----------
    lat, long
        Latitude and longitude of the location; they are interpolated into
        the request URL as ``{lat},{long}``.

    Returns
    -------
    The ``elevation`` field of the first entry in the response's ``results``
    list.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status. Previously such a
        response surfaced as a ``KeyError`` on ``'results'``.
    """
    response = requests.get(
        f'https://api.opentopodata.org/v1/eudem25m?locations={lat},{long}',
        timeout=30,  # never hang the notebook on a stalled connection
    )
    response.raise_for_status()
    return response.json()['results'][0]['elevation']

In [4]:
def add_geo_features(df):
    """Add latitude, longitude and elevation columns for each row's city.

    For every known city code the coordinates are fetched with
    ``get_coordinates`` and the elevation with ``get_elevation`` (both perform
    HTTP requests), pausing one second between cities. The results are then
    looked up per row from ``df['city']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``city`` column holding the short city codes listed
        below. It is modified in place; nothing is returned.

    Notes
    -----
    * The new columns are ``latitute`` (sic), ``longitude`` and ``elevation``.
      The misspelt name is kept because ``add_daylight_features`` reads it.
    * Latitude and longitude stay the strings returned by ``get_coordinates``.
      NOTE(review): ephem interprets string angles as degrees but floats as
      radians, so converting them to float here would change the daylight
      features -- confirm before changing.
    * City codes that are not in the table below become ``NaN`` in the new
      columns (previously the code itself was copied through unchanged).
    """
    # City code used in the data -> English Wikipedia article title.
    # Keeping code and title side by side avoids pairing two parallel lists.
    cities_wiki = {
        'h': 'Hanover',              # Hannover
        'bs': 'Braunschweig',
        'ol': 'Oldenburg_(city)',
        'os': 'Osnabrück',
        'wob': 'Wolfsburg',
        'go': 'Göttingen',
        'sz': 'Salzgitter',
        'hi': 'Hildesheim',
        'del': 'Delmenhorst',
        'lg': 'Lüneburg',
        'whv': 'Wilhelmshaven',
        'ce': 'Celle',
        'hm': 'Hamelin',             # Hameln
        'el': 'Lingen,_Germany',     # Lingen (Ems)
    }

    lat_dict = {}
    long_dict = {}
    elevation_dict = {}

    for code, title in cities_wiki.items():
        lat, long = get_coordinates(title)
        lat_dict[code] = lat
        long_dict[code] = long
        elevation_dict[code] = get_elevation(lat, long)
        # Throttle the requests to the public APIs.
        time.sleep(1)

    # ``map`` is a plain per-value lookup; ``replace`` with a dict goes through
    # value-replacement machinery (including dtype downcasting) that is not
    # needed here.
    df['latitute'] = df['city'].map(lat_dict)
    df['longitude'] = df['city'].map(long_dict)
    df['elevation'] = df['city'].map(elevation_dict)

In [5]:
def preprocess(df):
    """Normalise the raw column names and parse the timestamps, in place.

    Renames ``'Load [MWh]'`` -> ``load``, ``'Time [s]'`` -> ``time`` and
    ``'City'`` -> ``city``, then converts the ``time`` column with
    ``pd.to_datetime``. The frame is mutated; nothing is returned.
    """
    column_names = {
        'Load [MWh]': 'load',
        'Time [s]': 'time',
        'City': 'city',
    }
    df.rename(columns=column_names, inplace=True)
    df['time'] = pd.to_datetime(df['time'])

In [6]:
def encode(df, col, max_val):
    """Add sine and cosine encodings of a periodic column, in place.

    The values of ``df[col]`` are mapped onto a circle with period
    ``max_val``; the results are stored in ``<col>_sin`` and ``<col>_cos``.
    Nothing is returned.
    """
    angle = 2 * np.pi * df[col] / max_val
    for suffix, transform in (('_sin', np.sin), ('_cos', np.cos)):
        df[col + suffix] = transform(angle)

In [7]:
def add_time_features(df, drop=True):
    """Derive calendar features and their cyclical encodings from ``df.time``.

    Adds the columns ``hour``, ``day_name``, ``weekday``, ``day``, ``month``,
    ``year`` and ``dayofyear``, plus ``<name>_sin`` / ``<name>_cos`` encodings
    (via ``encode``) for ``hour``, ``day``, ``month`` and ``dayofyear``. The
    frame is modified in place; nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime ``time`` column.
    drop : bool, default True
        If true, remove the ``time`` column once the features are extracted.
        Pass ``False`` when later steps still need the timestamps.
    """
    df['hour'] = df.time.dt.hour
    df['day_name'] = df.time.dt.day_name()
    df['weekday'] = df.time.dt.weekday
    df['day'] = df.time.dt.day
    df['month'] = df.time.dt.month
    df['year'] = df.time.dt.year
    df['dayofyear'] = df.time.dt.dayofyear

    # Hours run 0..23, i.e. 24 distinct values, so the period is 24. With a
    # period of 23, hour 0 and hour 23 were mapped to the same sin/cos point.
    encode(df, 'hour', 24)
    encode(df, 'day', 31)
    encode(df, 'month', 12)
    # In leap years day 366 wraps onto the same point as day 1.
    encode(df, 'dayofyear', 365)

    if drop:
        df.drop(columns=['time'], inplace=True)

In [12]:
def get_day(time, lat, lon, elevation):
    """Return 1 unless the sun is more than 6 degrees below the horizon.

    Configures the notebook-global ``observer`` with the given time and
    location, recomputes the notebook-global ``sun`` for it, converts the
    sun's altitude from radians to degrees and returns 0 when that altitude
    is below -6, otherwise 1.

    Note: the ``time`` parameter shadows the ``time`` module inside this
    function.
    """
    observer.date, observer.lat, observer.lon, observer.elevation = (
        time, lat, lon, elevation
    )
    sun.compute(observer)

    altitude_deg = sun.alt * 180 / np.pi
    return 0 if altitude_deg < -6 else 1

In [13]:
def get_dusk(time, lat, lon, elevation):
    """Return 1 if the timestamp's clock time falls in the sunset-hour window.

    Configures the notebook-global ``observer`` with the given time and
    location and asks it for the next sunset after ``time``. That sunset is
    truncated to the full hour to form the start of a one-hour window; the
    result is 1 when the time of day of ``time`` lies inside the window (both
    ends inclusive) and 0 otherwise. Only clock times are compared, not dates.

    Note: the ``time`` parameter shadows the ``time`` module inside this
    function.
    """
    observer.date, observer.lat, observer.lon, observer.elevation = (
        time, lat, lon, elevation
    )
    sun.compute(observer)

    sunset = observer.next_setting(sun).datetime()
    window_start = pd.to_datetime(sunset.replace(minute=0, second=0, microsecond=0))
    window_end = pd.to_datetime(window_start + pd.Timedelta(hours=1))

    clock = time.time()
    return int(window_start.time() <= clock <= window_end.time())

In [14]:
def get_dawn(time, lat, lon, elevation):
    """Return 1 if the timestamp's clock time falls in the sunrise-hour window.

    Configures the notebook-global ``observer`` with the given time and
    location and asks it for the most recent sunrise before ``time``. That
    sunrise is truncated to the full hour to form the start of a one-hour
    window; the result is 1 when the time of day of ``time`` lies inside the
    window (both ends inclusive) and 0 otherwise. Only clock times are
    compared, not dates.

    Note: the ``time`` parameter shadows the ``time`` module inside this
    function.
    """
    observer.date, observer.lat, observer.lon, observer.elevation = (
        time, lat, lon, elevation
    )
    sun.compute(observer)

    sunrise = observer.previous_rising(sun).datetime()
    window_start = pd.to_datetime(sunrise.replace(minute=0, second=0, microsecond=0))
    window_end = pd.to_datetime(window_start + pd.Timedelta(hours=1))

    clock = time.time()
    return int(window_start.time() <= clock <= window_end.time())

In [15]:
def get_sun_position(time, lat, lon, elevation):
    """Return the sun's altitude in degrees for the given time and location.

    Configures the notebook-global ``observer``, recomputes the
    notebook-global ``sun`` for it and converts the resulting altitude from
    radians to degrees.

    Note: the ``time`` parameter shadows the ``time`` module inside this
    function.
    """
    observer.date, observer.lat, observer.lon, observer.elevation = (
        time, lat, lon, elevation
    )
    sun.compute(observer)

    return sun.alt * 180 / np.pi

In [22]:
def add_daylight_features(df):
    """Add the sun-based columns ``is_day``, ``is_dusk``, ``is_dawn`` and
    ``sun_position`` to ``df`` in place.

    Creates the notebook-global ``sun`` and ``observer`` objects that the
    ``get_*`` helpers rely on, then evaluates each helper row by row using
    the ``time``, ``latitute``, ``longitude`` and ``elevation`` columns.
    Nothing is returned.
    """
    global sun, observer
    sun = ephem.Sun()
    observer = ephem.Observer()

    # Target column -> per-row helper, in the order the columns are created.
    feature_helpers = (
        ('is_day', get_day),
        ('is_dusk', get_dusk),
        ('is_dawn', get_dawn),
        ('sun_position', get_sun_position),
    )
    for column, helper in feature_helpers:
        df[column] = df.apply(
            lambda row, helper=helper: helper(
                row.time, row.latitute, row.longitude, row.elevation
            ),
            axis=1,
        )

In [8]:
# Load the raw training split. The location is a hard-coded absolute path;
# adjust it when running in a different environment.
df_train = pd.read_csv('/hkfs/work/workspace/scratch/bh6321-energy_challenge/data/train.csv')

In [9]:
# Load the raw validation split. The location is a hard-coded absolute path;
# adjust it when running in a different environment.
df_val = pd.read_csv('/hkfs/work/workspace/scratch/bh6321-energy_challenge/data/valid.csv')

In [10]:
# Preview the raw training data with its original column names.
df_train.head()

Unnamed: 0,Load [MWh],Time [s],City
0,263.365956,2015-01-01 00:00:00,bs
1,259.073621,2015-01-01 01:00:00,bs
2,246.612481,2015-01-01 02:00:00,bs
3,238.313719,2015-01-01 03:00:00,bs
4,237.11439,2015-01-01 04:00:00,bs


In [11]:
# Feature pipeline for the training split; every step mutates df_train in place.
# Order matters: add_daylight_features reads the time, latitute, longitude and
# elevation columns, so the timestamps are kept (drop=False) and the geo
# features are added first. add_geo_features performs HTTP requests and sleeps
# one second per city.
preprocess(df_train)
add_time_features(df_train, drop = False)
add_geo_features(df_train)
add_daylight_features(df_train)

In [24]:
# Preview the training data after feature engineering.
df_train.head()

Unnamed: 0,load,time,city,hour,day_name,weekday,day,month,year,dayofyear,...,month_cos,dayofyear_sin,dayofyear_cos,latitute,longitude,elevation,is_day,is_dusk,is_dawn,sun_position
0,263.365956,2015-01-01 00:00:00,bs,0,Thursday,3,1,1,2015,1,...,0.866025,0.017213,0.999852,52.26666667,10.51666667,76.70401,0,0,0,-59.840175
1,259.073621,2015-01-01 01:00:00,bs,1,Thursday,3,1,1,2015,1,...,0.866025,0.017213,0.999852,52.26666667,10.51666667,76.70401,0,0,0,-55.19509
2,246.612481,2015-01-01 02:00:00,bs,2,Thursday,3,1,1,2015,1,...,0.866025,0.017213,0.999852,52.26666667,10.51666667,76.70401,0,0,0,-47.964556
3,238.313719,2015-01-01 03:00:00,bs,3,Thursday,3,1,1,2015,1,...,0.866025,0.017213,0.999852,52.26666667,10.51666667,76.70401,0,0,0,-39.408925
4,237.11439,2015-01-01 04:00:00,bs,4,Thursday,3,1,1,2015,1,...,0.866025,0.017213,0.999852,52.26666667,10.51666667,76.70401,0,0,0,-30.318257


In [27]:
# Same feature pipeline for the validation split; every step mutates df_val in
# place. Order matters: add_daylight_features reads the time, latitute,
# longitude and elevation columns produced by the earlier steps.
# add_geo_features queries the web APIs again for this call.
preprocess(df_val)
add_time_features(df_val, drop = False)
add_geo_features(df_val)
add_daylight_features(df_val)

In [29]:
# Preview the validation data after feature engineering.
df_val.head()

Unnamed: 0,load,time,city,hour,day_name,weekday,day,month,year,dayofyear,...,month_cos,dayofyear_sin,dayofyear_cos,latitute,longitude,elevation,is_day,is_dusk,is_dawn,sun_position
0,208.066865,2018-01-01 00:00:00,bs,0,Monday,0,1,1,2018,1,...,0.866025,0.017213,0.999852,52.26666667,10.51666667,76.70401,0,0,0,-59.825798
1,202.341276,2018-01-01 01:00:00,bs,1,Monday,0,1,1,2018,1,...,0.866025,0.017213,0.999852,52.26666667,10.51666667,76.70401,0,0,0,-55.190329
2,191.499046,2018-01-01 02:00:00,bs,2,Monday,0,1,1,2018,1,...,0.866025,0.017213,0.999852,52.26666667,10.51666667,76.70401,0,0,0,-47.965888
3,183.315389,2018-01-01 03:00:00,bs,3,Monday,0,1,1,2018,1,...,0.866025,0.017213,0.999852,52.26666667,10.51666667,76.70401,0,0,0,-39.413265
4,181.941545,2018-01-01 04:00:00,bs,4,Monday,0,1,1,2018,1,...,0.866025,0.017213,0.999852,52.26666667,10.51666667,76.70401,0,0,0,-30.323468


In [30]:
# Write the engineered training features without the row index. The path is
# relative to the working directory; assumes data/processed/ already exists.
df_train.to_csv('data/processed/train.csv', index=False)

In [31]:
# Write the engineered validation features without the row index. The path is
# relative to the working directory; assumes data/processed/ already exists.
df_val.to_csv('data/processed/valid.csv', index=False)