In [1]:
import pandasql as ps
import pandas as pd

In [2]:
train = pd.read_csv('./data/clean/train_set.csv', parse_dates=True, index_col='timestamp')
test = pd.read_csv('./data/clean/test_set.csv', parse_dates=True, index_col='timestamp')

Concatenate the train and test data to apply the following analysis to both of them.

In [3]:
full = pd.concat([train,test])

------

# 3. Data Reframe to a Supervised Learning Schema

### Adding Hour, Day, Week and Month as Categorical Variables

In [4]:
def weekday_categorical_int(data, initial_day = 1):
    '''
    Create a categorical value for the number of the day of the week. 1 = monday, 
    2 = tuesday and so on. 
    '''
    day = [initial_day]
    
    for i in range(data.shape[0] - 1):
        if day[i] == 7:
            num_day = 0
        else:
            num_day = day[i]
        day.append(num_day + 1)
    return day

In [5]:
full['month'] = [d.month for d in full.index]
full['day'] = [d.day for d in full.index]
full['weekday'] = weekday_categorical_int(full, 5)
full['hour'] = [d.hour for d in full.index]

### Changing Wind Direction to Categorical

In [6]:
def wind_dir_categorical_int(data):
    wind_dir = data['wind_direction'].astype('float32').tolist()
    for i in range(len(wind_dir)):
        x = wind_dir[i]
        if 22.5 <= x < 67.5:
            wind_dir[i] = 1
        elif 67.5 <= x < 112.5:
            wind_dir[i] = 2
        elif 112.5 <= x < 157.5:
            wind_dir[i] = 3
        elif 157.5 <= x < 202.5:
            wind_dir[i] = 4
        elif 202.5 <= x < 247.5:
            wind_dir[i] = 5
        elif 247.5 <= x < 292.5:
            wind_dir[i] = 6
        elif 292.5 <= x < 337.5:
            wind_dir[i] = 7
        else:
            wind_dir[i] = 0
    return wind_dir

In [7]:
full['wind_direction'] = wind_dir_categorical_int(full)

In [8]:
elec = pd.DataFrame(full.pop('electricity'))
# now full doesn't contain "electricity" anymore. Order of join will move it to left or right:
full = elec.join(full) # has mean as first column

In [9]:
full

Unnamed: 0_level_0,electricity,air_temperature,dew_temperature,sea_level_pressure,wind_direction,wind_speed,month,day,weekday,hour
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2016-01-01 00:00:00,17.7,3.8,2.4,1020.9,5,3.1,1,1,5,0
2016-01-01 01:00:00,37.1,3.7,2.4,1021.6,5,2.6,1,1,6,1
2016-01-01 02:00:00,37.8,2.6,1.9,1021.9,0,0.0,1,1,7,2
2016-01-01 03:00:00,35.1,2.0,1.2,1022.3,4,1.5,1,1,1,3
2016-01-01 04:00:00,27.5,2.3,1.8,1022.7,2,1.5,1,1,2,4
...,...,...,...,...,...,...,...,...,...,...
2018-12-31 19:00:00,0.0,9.0,5.3,1035.3,6,4.1,12,31,5,19
2018-12-31 20:00:00,0.0,8.9,5.1,1035.2,6,4.1,12,31,6,20
2018-12-31 21:00:00,0.0,9.1,5.1,1035.3,6,3.6,12,31,7,21
2018-12-31 22:00:00,0.0,9.0,5.0,1035.0,6,3.6,12,31,1,22


In [10]:
full.to_csv('./data/clean/full_data.csv')

### Adding a (time) lag between the input values and the target variable (electricity)

In [11]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
 
# convert series to supervised learning
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    n_vars = 1 if type(data) is list else data.shape[1]
    #df = pd.DataFrame(data)
    df = data
    cols, names = list(), list()
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(data.shift(i))
        names += [('{}(t-{})'.format(j, i)) for j in data.columns]
    # forecast sequence (t, t+1, ... t+n)
    for i in range(0, n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += [('{}(t)'.format(j)) for j in data.columns]
        else:
            names += [('{}(t+{})'.format(j, i)) for j in data.columns]
    # put it all together
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg.dropna(inplace=True)
    return agg

In [12]:
# load data
values = full.values
# check data is float
values = values.astype('float32')
# normalizing
scaler = MinMaxScaler(feature_range=(0,1))
scaled = scaler.fit_transform(values)

df_scaled = pd.DataFrame(scaled, columns=full.columns)
# frame as supervised learning
reframed = series_to_supervised(df_scaled, 4, 1)

We drop the columns which have the variables that we do not want to predict.

In [13]:
reframed.drop(reframed.columns[[40,41,42,43,44,46,47,48,49]], axis=1, inplace=True)
print(reframed.head())

   electricity(t-4)  air_temperature(t-4)  dew_temperature(t-4)  \
4          0.117219              0.223350              0.396040   
5          0.245695              0.220812              0.396040   
6          0.250331              0.192893              0.379538   
7          0.232450              0.177665              0.356436   
8          0.182119              0.185279              0.376238   

   sea_level_pressure(t-4)  wind_direction(t-4)  wind_speed(t-4)  month(t-4)  \
4                 0.665307             0.714286         0.512683         0.0   
5                 0.674829             0.714286         0.498410         0.0   
6                 0.678912             0.000000         0.424189         0.0   
7                 0.684354             0.571429         0.467009         0.0   
8                 0.689796             0.285714         0.467009         0.0   

   day(t-4)  weekday(t-4)  hour(t-4)  ...  air_temperature(t-1)  \
4       0.0      0.666667   0.000000  ...        

In [14]:
reframed.to_csv('./data/clean/full_data_supLearn.csv')

In [15]:
reframed.head()

Unnamed: 0,electricity(t-4),air_temperature(t-4),dew_temperature(t-4),sea_level_pressure(t-4),wind_direction(t-4),wind_speed(t-4),month(t-4),day(t-4),weekday(t-4),hour(t-4),...,air_temperature(t-1),dew_temperature(t-1),sea_level_pressure(t-1),wind_direction(t-1),wind_speed(t-1),month(t-1),day(t-1),weekday(t-1),hour(t-1),wind_speed(t)
4,0.117219,0.22335,0.39604,0.665307,0.714286,0.512683,0.0,0.0,0.666667,0.0,...,0.177665,0.356436,0.684354,0.571429,0.467009,0.0,0.0,0.0,0.130435,0.467009
5,0.245695,0.220812,0.39604,0.674829,0.714286,0.49841,0.0,0.0,0.833333,0.043478,...,0.185279,0.376238,0.689796,0.285714,0.467009,0.0,0.0,0.166667,0.173913,0.452735
6,0.250331,0.192893,0.379538,0.678912,0.0,0.424189,0.0,0.0,1.0,0.086957,...,0.187817,0.382838,0.693878,0.285714,0.452735,0.0,0.0,0.333333,0.217391,0.467009
7,0.23245,0.177665,0.356436,0.684354,0.571429,0.467009,0.0,0.0,0.0,0.130435,...,0.185279,0.376238,0.693878,0.285714,0.467009,0.0,0.0,0.5,0.26087,0.452735
8,0.182119,0.185279,0.376238,0.689796,0.285714,0.467009,0.0,0.0,0.166667,0.173913,...,0.192893,0.379538,0.687075,0.285714,0.452735,0.0,0.0,0.666667,0.304348,0.452735


In [16]:
reframed.values

array([[0.11721855, 0.22335026, 0.39603963, ..., 0.        , 0.13043478,
        0.46700853],
       [0.24569535, 0.22081217, 0.39603963, ..., 0.16666667, 0.17391305,
        0.45273528],
       [0.25033113, 0.19289339, 0.37953797, ..., 0.3333333 , 0.21739131,
        0.46700853],
       ...,
       [0.        , 0.37055838, 0.48844886, ..., 0.8333333 , 0.86956525,
        0.52695614],
       [0.        , 0.36040607, 0.49504954, ..., 1.0000001 , 0.9130435 ,
        0.52695614],
       [0.        , 0.35532993, 0.49174923, ..., 0.        , 0.95652175,
        0.52695614]], dtype=float32)