In [1]:
import pandas as pd
import numpy as np
from datetime import timedelta
from electricity_price_predictor.data import get_shifted_load, get_shifted_price

## Function to retrieve weather data

In [5]:
def get_weather(path='../raw_data/weather_2015_2020.csv',
                start='2015-01-01', end='2020-11-24'):
    """Load per-city weather records and return one population-weighted row per hour.

    The CSV at ``path`` is expected to hold one row per (timestamp, city) with
    a ``dt`` column parseable by ``pd.to_datetime``, a ``city_name`` column and
    the numeric weather columns listed in ``numeric_cols`` below. Any other
    columns are ignored.

    Parameters
    ----------
    path : str or file-like
        Anything ``pd.read_csv`` accepts.
    start, end : str or datetime-like
        Inclusive bounds of the hourly grid the result must cover. Hours in
        this range that are absent from the file are imputed.

    Returns
    -------
    pd.DataFrame
        Indexed by ``dt`` in chronological order, with columns ``temp``,
        ``feels_like``, ``humidity``, ``clouds_all``, ``wind_speed`` and
        ``wind_deg``. Each value is the average over cities weighted by city
        population. Missing hours that lie between two observed hours are
        filled by linear interpolation in time (for a single missing hour this
        is the mean of the preceding and following hour). Missing hours at the
        very start or end of the data are left as NaN rather than extrapolated.

    Raises
    ------
    KeyError
        If a required column is absent or a city has no population figure.
    """
    df = pd.read_csv(path)

    # population of each city in the df
    pop = {'Aarhus': 349_983,
           'Odense': 204_895,
           'Aalborg': 217_075,
           'Esbjerg': 115_748,
           'Vejle': 111_743,
           'Randers': 96_559,
           'Viborg': 93_819,
           'Kolding': 89_412,
           'Silkeborg': 89_328,
           'Herning': 86_348,
           'Horsens': 83_598}

    # numeric weather values as affects demand or supply
    numeric_cols = ['temp', 'feels_like', 'humidity', 'clouds_all', 'wind_speed', 'wind_deg']

    # keep only what the aggregation needs; everything else in the file is unused
    df = df[['dt', 'city_name'] + numeric_cols]

    unknown_cities = set(df['city_name']) - set(pop)
    if unknown_cities:
        raise KeyError(
            f"no population figure for: {sorted(map(str, unknown_cities))}")

    df = df.assign(dt=pd.to_datetime(df['dt']),
                   population=df['city_name'].map(pop))

    def _weighted_mean(group):
        # population-weighted average of every numeric column for one timestamp
        return pd.Series(
            np.average(group[numeric_cols].to_numpy(dtype=float), axis=0,
                       weights=group['population'].to_numpy()),
            index=numeric_cols)

    # one pass over the groups instead of one groupby/apply per column
    weather_df = (df.groupby('dt')[numeric_cols + ['population']]
                    .apply(_weighted_mean))

    # Full hourly grid the result must cover. A Timedelta is used for the
    # frequency so this does not depend on the 'H'/'h' alias spelling.
    expected_idx = pd.date_range(start=start, end=end, freq=pd.Timedelta(hours=1))
    full_idx = weather_df.index.union(expected_idx)

    # Reindexing inserts the missing hours in chronological position (as NaN);
    # time-based interpolation then fills each interior gap. For a one-hour gap
    # this is exactly (previous + next) / 2.
    # NOTE(review): wind_deg is averaged/interpolated arithmetically, as before;
    # a circular mean may be more appropriate for a direction - confirm.
    weather_df = (weather_df.reindex(full_idx)
                            .interpolate(method='time', limit_area='inside')
                            .rename_axis('dt'))

    return weather_df

In [6]:
# Build the population-weighted hourly weather frame from the default CSV path.
df = get_weather()

In [7]:
# Preview the first rows of the aggregated frame (one row per timestamp).
df.head()

Unnamed: 0_level_0,temp,feels_like,humidity,clouds_all,wind_speed,wind_deg
dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015-01-01 00:00:00,5.432478,-0.154827,86.7587,83.913979,5.937703,240.418412
2015-01-01 01:00:00,5.488014,0.354168,87.139723,91.950524,5.322343,234.663941
2015-01-01 02:00:00,5.599633,0.307204,87.786899,89.471946,5.605773,237.303816
2015-01-01 03:00:00,6.23438,0.99153,87.241639,88.896068,5.681109,237.16409
2015-01-01 04:00:00,6.305504,1.011242,88.078906,92.107249,5.809829,236.33376


## Test after packaging: import `get_weather` from the installed package

In [1]:
from electricity_price_predictor.data import get_weather

In [2]:
# Call the packaged get_weather with its default path; the bare expression
# displays the returned frame.
# NOTE(review): the trailing rows of the output below are out of chronological
# order and show humidity > 100 and wind_deg > 360, which suggests the imputed
# rows are computed as prev + next / 2 rather than (prev + next) / 2 - verify
# against electricity_price_predictor.data.
get_weather()

Unnamed: 0_level_0,temp,feels_like,humidity,clouds_all,wind_speed,wind_deg
dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015-01-01 00:00:00,5.432478,-0.154827,86.758700,83.913979,5.937703,240.418412
2015-01-01 01:00:00,5.488014,0.354168,87.139723,91.950524,5.322343,234.663941
2015-01-01 02:00:00,5.599633,0.307204,87.786899,89.471946,5.605773,237.303816
2015-01-01 03:00:00,6.234380,0.991530,87.241639,88.896068,5.681109,237.164090
2015-01-01 04:00:00,6.305504,1.011242,88.078906,92.107249,5.809829,236.333760
...,...,...,...,...,...,...
2016-03-27 02:00:00,11.127135,2.719387,120.440943,99.097881,9.280806,240.234604
2017-03-26 02:00:00,7.332196,3.531517,144.181985,116.542063,2.741674,441.864719
2018-03-25 02:00:00,4.905578,0.409993,147.320293,129.658418,3.210394,389.227976
2019-03-31 02:00:00,7.544891,-0.003615,127.079972,88.261550,7.448825,413.987643
