# 3. Framing del problema y separación del dataset

In [1]:
import pandasql as ps
import pandas as pd
import numpy as np

#### Lectura de los datos

In [2]:
# Load the ordered dataset and discard the 'Unnamed: 0' column that comes with the CSV
# (presumably a row index written by an earlier to_csv call -- TODO confirm).
data = pd.read_csv('./data_ordered.csv').drop(columns=['Unnamed: 0'])
data

Unnamed: 0,building_id,timestamp,electricity,site_id,square_feet,air_temperature,dew_temperature,sea_level_pressure,wind_direction,wind_speed,pressure_meter,month,day,weekday,hour
0,1447,2016-01-01 01:00:00,156.650,15,29775,0.0,-2.000000,1019.4,7,4.100000,1.0,1,1,5,1
1,1447,2016-01-01 02:00:00,157.575,15,29775,0.0,-2.000000,1019.4,7,2.600000,1.0,1,1,6,2
2,1447,2016-01-01 03:00:00,154.925,15,29775,0.0,-2.000000,1019.4,0,2.100000,1.0,1,1,7,3
3,1447,2016-01-01 04:00:00,156.075,15,29775,0.0,-2.111111,1019.4,7,2.155556,1.0,1,1,1,4
4,1447,2016-01-01 05:00:00,154.500,15,29775,0.0,-2.222222,1019.4,7,2.211111,1.0,1,1,2,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1229003,6,2016-12-31 19:00:00,264.970,0,27926,22.8,10.000000,1021.7,3,5.700000,1.0,12,31,6,19
1229004,6,2016-12-31 20:00:00,265.379,0,27926,23.3,8.900000,1021.0,4,4.100000,1.0,12,31,7,20
1229005,6,2016-12-31 21:00:00,263.741,0,27926,23.3,10.000000,1021.1,3,4.100000,1.0,12,31,1,21
1229006,6,2016-12-31 22:00:00,258.827,0,27926,22.8,10.000000,1021.1,4,3.100000,1.0,12,31,2,22


Para este punto el timestamp ya no es necesario tenerlo, pues los datos ya están ordenados y los atributos del dataframe contienen la información del tiempo al igual que el id del sitio. Por tanto dejamos atrás dicha información.

In [3]:
# 'timestamp' and 'site_id' are not used from this point on, so they are removed.
data = data.drop(['timestamp', 'site_id'], axis=1)

Se extrae la lista de los edificios que contiene el dataset

In [4]:
# One row per distinct building_id found in `data` (the GROUP BY collapses repeated ids).
buildings_id = ps.sqldf("select building_id from data group by building_id")
buildings_id

Unnamed: 0,building_id
0,6
1,12
2,27
3,33
4,34
...,...
140,1422
141,1423
142,1424
143,1440


Contamos con 145 edificios diferentes de uso residencial. Como se analizó en los notebooks anteriores, están distribuidos en distintas locaciones, por lo que cada uno tiene asociadas condiciones climáticas diferentes.

## 3.1 Framing del problema

En esta parte es importante tener claro el problema que queremos que nuestra red aprenda a resolver, pues la forma que se les dé a los datos a continuación debe permitir que dicho objetivo sea alcanzable.

El problema para el cual entrenaremos a nuestro sistema es predecir el consumo eléctrico promedio de la hora siguiente, dada la información de consumo de las 6 horas anteriores, así como los datos climáticos y de tiempo (mes, día, día de la semana y hora) de esas mismas 6 horas previas.

Para dicho propósito, reformaremos la tabla que presentamos arriba de manera que agreguemos para cada edificio las horas anteriores en forma de columnas para los valores del clima y del consumo. Y hasta la derecha, quedará como variable objetivo el consumo al momento presente.

In [5]:
 # convert series to supervised learning
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """Frame a time series as a supervised-learning table of shifted columns.

    Each column of `data` is repeated once per lag (t-n_in ... t-1) and once per
    forecast step (t, t+1, ... t+n_out-1). Output column names follow the pattern
    '<column>(t-k)', '<column>(t)' and '<column>(t+k)'.

    Parameters
    ----------
    data : pandas.DataFrame, or anything `pd.DataFrame(...)` accepts
        Observations ordered in time, one row per step. Lists, arrays and Series
        are wrapped in a DataFrame first, so their columns are named by pandas
        (0, 1, ... or the Series name).
    n_in : int, default 1
        Number of lagged steps to include as inputs.
    n_out : int, default 1
        Number of steps to include from time t onwards. With 0, only the lagged
        columns are produced.
    dropnan : bool, default True
        If True, rows containing any NaN are removed. Shifting always leaves NaN
        in the first `n_in` rows and the last `n_out - 1` rows.

    Returns
    -------
    pandas.DataFrame
        The shifted columns side by side, keeping the index of the input rows.

    Raises
    ------
    ValueError
        If `n_in` and `n_out` together request no columns at all.
    """
    # Wrap non-DataFrame input so that .shift() and .columns are always available.
    df = data if isinstance(data, pd.DataFrame) else pd.DataFrame(data)

    cols, names = [], []
    # input sequence (t-n, ... t-1)
    for lag in range(n_in, 0, -1):
        cols.append(df.shift(lag))
        names += ['{}(t-{})'.format(col, lag) for col in df.columns]
    # forecast sequence (t, t+1, ... t+n)
    for step in range(n_out):
        cols.append(df.shift(-step))
        if step == 0:
            names += ['{}(t)'.format(col) for col in df.columns]
        else:
            names += ['{}(t+{})'.format(col, step) for col in df.columns]

    if not cols:
        # pd.concat would fail on an empty list with a much less helpful message.
        raise ValueError('n_in and n_out must request at least one column')

    # put it all together
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg = agg.dropna()
    return agg

def weather_supervised(dataframe, n_in, n_out):
    """Build the shifted table for every column of `dataframe` except 'electricity'.

    The caller's frame is not modified. `n_in` and `n_out` are forwarded unchanged
    to `series_to_supervised`, and rows containing NaN are always dropped.
    """
    without_electricity = dataframe.drop(columns=['electricity'])
    return series_to_supervised(without_electricity, n_in, n_out, True)

def electric_supervised(dataframe, n_in, n_out):
    """Build the shifted table for the 'electricity' column alone.

    `n_in` and `n_out` are forwarded unchanged to `series_to_supervised`, and rows
    containing NaN are always dropped.
    """
    return series_to_supervised(
        pd.DataFrame(dataframe['electricity']),
        n_in,
        n_out,
        True,
    )

In [33]:
# Frame each building on its own so that lagged values never come from another
# building's rows, then stack the per-building tables into `framed_data`.
n_lags = 6  # hours of history used as inputs (columns t-6 ... t-1)

# Group the table once instead of querying the whole of `data` for every building.
rows_by_building = data.groupby('building_id', sort=False)

framed_parts = []
for id_ in buildings_id['building_id'].values:
    print('Framing data from builing id:', id_)
    # Fresh 0..n-1 index per building, as the per-building SQL query used to return.
    building_data = rows_by_building.get_group(id_).reset_index(drop=True)
    id_df = building_data[['building_id']]
    building_data = building_data.drop(columns=['building_id'])
    # Electricity: n_lags past values plus the value at t (the prediction target).
    building_electricity = electric_supervised(building_data, n_lags, 1)
    # Remaining columns: past values only (n_out=0 yields no column at time t).
    building_weather = weather_supervised(building_data, n_lags, 0)
    building_data = id_df.join(building_weather.join(building_electricity))
    # The left join leaves NaN in the first n_lags rows (no complete history); drop them.
    framed_parts.append(building_data.dropna())

# Concatenate once at the end: growing a DataFrame inside the loop copies all the
# previously accumulated rows on every iteration.
framed_data = pd.concat(framed_parts) if framed_parts else pd.DataFrame()

Framing data from builing id: 6
Framing data from builing id: 12
Framing data from builing id: 27
Framing data from builing id: 33
Framing data from builing id: 34
Framing data from builing id: 35
Framing data from builing id: 36
Framing data from builing id: 37
Framing data from builing id: 49
Framing data from builing id: 56
Framing data from builing id: 57
Framing data from builing id: 58
Framing data from builing id: 61
Framing data from builing id: 62
Framing data from builing id: 63
Framing data from builing id: 64
Framing data from builing id: 65
Framing data from builing id: 66
Framing data from builing id: 67
Framing data from builing id: 77
Framing data from builing id: 85
Framing data from builing id: 90
Framing data from builing id: 95
Framing data from builing id: 96
Framing data from builing id: 97
Framing data from builing id: 98
Framing data from builing id: 100
Framing data from builing id: 127
Framing data from builing id: 128
Framing data from builing id: 129
Framing

Se obtiene el dataframe con los datos ya formateados. Sin embargo se notó posteriormente que una cantidad considerable de edificios tenía más del 40% de los valores de la electricidad en 0, por lo que a continuación se remueven del dataframe obtenido previamente.

In [37]:
# Preview of the framed table (the display shows only the first and last rows).
framed_data

Unnamed: 0,building_id,square_feet(t-6),air_temperature(t-6),dew_temperature(t-6),sea_level_pressure(t-6),wind_direction(t-6),wind_speed(t-6),pressure_meter(t-6),month(t-6),day(t-6),...,day(t-1),weekday(t-1),hour(t-1),electricity(t-6),electricity(t-5),electricity(t-4),electricity(t-3),electricity(t-2),electricity(t-1),electricity(t)
6,6,27926.0,25.0,20.0,1019.7,0.0,0.0,1.0,1.0,1.0,...,1.0,3.0,5.0,0.000,0.000,0.000,0.000,0.000,0.000,0.000
7,6,27926.0,24.4,21.1,1020.2,2.0,1.5,1.0,1.0,1.0,...,1.0,4.0,6.0,0.000,0.000,0.000,0.000,0.000,0.000,0.000
8,6,27926.0,22.8,21.1,1020.2,0.0,0.0,1.0,1.0,1.0,...,1.0,5.0,7.0,0.000,0.000,0.000,0.000,0.000,0.000,0.000
9,6,27926.0,21.1,20.6,1020.1,0.0,0.0,1.0,1.0,1.0,...,1.0,6.0,8.0,0.000,0.000,0.000,0.000,0.000,0.000,0.000
10,6,27926.0,20.0,20.0,1020.0,6.0,2.6,1.0,1.0,1.0,...,1.0,7.0,9.0,0.000,0.000,0.000,0.000,0.000,0.000,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7465,1447,29775.0,-3.3,-7.8,1014.4,4.0,6.7,1.0,12.0,31.0,...,31.0,4.0,18.0,145.750,146.050,146.525,152.725,167.725,170.225,167.800
7466,1447,29775.0,-2.8,-7.8,1013.4,4.0,5.7,1.0,12.0,31.0,...,31.0,5.0,19.0,146.050,146.525,152.725,167.725,170.225,167.800,168.150
7467,1447,29775.0,-1.7,-7.2,1012.3,4.0,6.7,1.0,12.0,31.0,...,31.0,6.0,20.0,146.525,152.725,167.725,170.225,167.800,168.150,164.075
7468,1447,29775.0,-0.6,-6.7,1011.1,4.0,7.2,1.0,12.0,31.0,...,31.0,7.0,21.0,152.725,167.725,170.225,167.800,168.150,164.075,158.800


In [40]:
# IDs of the buildings to exclude because too many of their electricity readings are 0.
# NOTE(review): this comment originally said "more than 10%", while the notebook text
# above says "more than 40%" -- confirm which threshold produced this list.
dirty_ids = [
    90, 64, 37, 36, 98, 34, 49, 62, 27,
    85, 77, 6, 67, 66, 56, 57, 58, 61,
    100, 65, 35, 33, 63, 95, 97, 12, 96,
]
dirty = pd.DataFrame({'building_id': dirty_ids})

In [41]:
# Keep only the rows of `framed_data` whose building_id is not listed in `dirty`.
framed_data_clean = ps.sqldf("select * from framed_data where building_id not in (select building_id from dirty) ")

Se obtiene finalmente el dataframe con los datos a utilizar. A continuación sigue dividir los datos en los conjuntos de entrenamiento, validación y prueba.

In [42]:
# Preview of the framed table after removing the excluded buildings.
framed_data_clean

Unnamed: 0,building_id,square_feet(t-6),air_temperature(t-6),dew_temperature(t-6),sea_level_pressure(t-6),wind_direction(t-6),wind_speed(t-6),pressure_meter(t-6),month(t-6),day(t-6),...,day(t-1),weekday(t-1),hour(t-1),electricity(t-6),electricity(t-5),electricity(t-4),electricity(t-3),electricity(t-2),electricity(t-1),electricity(t)
0,127,27071.0,3.8,2.4,1020.9,5.0,3.1,1.0,1.0,1.0,...,1.0,2.0,5.0,8.6786,17.3571,17.3571,17.3571,17.3571,17.3571,17.3571
1,127,27071.0,3.7,2.4,1021.6,5.0,2.6,1.0,1.0,1.0,...,1.0,3.0,6.0,17.3571,17.3571,17.3571,17.3571,17.3571,17.3571,17.3571
2,127,27071.0,2.6,1.9,1021.9,0.0,0.0,1.0,1.0,1.0,...,1.0,4.0,7.0,17.3571,17.3571,17.3571,17.3571,17.3571,17.3571,17.3571
3,127,27071.0,2.0,1.2,1022.3,4.0,1.5,1.0,1.0,1.0,...,1.0,5.0,8.0,17.3571,17.3571,17.3571,17.3571,17.3571,17.3571,17.3571
4,127,27071.0,2.3,1.8,1022.7,2.0,1.5,1.0,1.0,1.0,...,1.0,6.0,9.0,17.3571,17.3571,17.3571,17.3571,17.3571,17.3571,17.3571
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
991127,1447,29775.0,-3.3,-7.8,1014.4,4.0,6.7,1.0,12.0,31.0,...,31.0,4.0,18.0,145.7500,146.0500,146.5250,152.7250,167.7250,170.2250,167.8000
991128,1447,29775.0,-2.8,-7.8,1013.4,4.0,5.7,1.0,12.0,31.0,...,31.0,5.0,19.0,146.0500,146.5250,152.7250,167.7250,170.2250,167.8000,168.1500
991129,1447,29775.0,-1.7,-7.2,1012.3,4.0,6.7,1.0,12.0,31.0,...,31.0,6.0,20.0,146.5250,152.7250,167.7250,170.2250,167.8000,168.1500,164.0750
991130,1447,29775.0,-0.6,-6.7,1011.1,4.0,7.2,1.0,12.0,31.0,...,31.0,7.0,21.0,152.7250,167.7250,170.2250,167.8000,168.1500,164.0750,158.8000


In [43]:
# Save the cleaned, framed table to disk.
# NOTE(review): to_csv writes the row index by default, so the file gets an extra
# unnamed first column; pass index=False only if no reader of this file expects it.
framed_data_clean.to_csv('./framed_data.csv')

## 3.2 Separación del conjunto de datos

Dado que se tiene la información para diversos edificios, la separación se hace por edificio: el conjunto de entrenamiento contiene la información del 75% de los edificios, el 20% se toma para validación y el resto (aproximadamente 5%) para el conjunto de prueba.

In [44]:
# Get the list of distinct building ids still present in framed_data_clean.
id_s = ps.sqldf("select building_id from framed_data_clean group by building_id")

In [48]:
# Split the building ids into training, validation and test groups.
# The split is positional on the sorted ids (lowest ids go to training); it is not random.
building_ids = id_s['building_id'].values
n_buildings = len(building_ids)

n_train = int(0.75 * n_buildings)
n_val = int(0.20 * n_buildings)
# The test set takes every remaining building, so n_test is the remainder rather than
# int(0.05 * n_buildings); truncating all three fractions would leave ids unassigned.
n_test = n_buildings - n_train - n_val

id_s_sorted = np.sort(building_ids)
train_id_s = id_s_sorted[:n_train]
val_id_s = id_s_sorted[n_train:n_train + n_val]
test_id_s = id_s_sorted[n_train + n_val:]

# Every building must land in exactly one of the three groups.
assert len(train_id_s) + len(val_id_s) + len(test_id_s) == n_buildings
assert len(test_id_s) == n_test

print(train_id_s.shape, val_id_s.shape, test_id_s.shape)

(88,) (23,) (7,)


De tal forma, la separación resulta en 88 edificios para el entrenamiento, 23 para validación y 7 para prueba.

In [50]:
# Wrap each id array in a one-column DataFrame so it can be referenced from SQL queries.
train_id_df, val_id_df, test_id_df = (
    pd.DataFrame({'building_id': ids})
    for ids in (train_id_s, val_id_s, test_id_s)
)

In [51]:
# Split the complete framed table into one DataFrame per set, selecting rows by building id.
#
# The selection uses the id arrays (train_id_s / val_id_s / test_id_s) instead of the
# *_id_df frames, because two of those names are overwritten by the results below.
# Filtering in pandas also avoids copying the whole framed table into SQLite three times.
#
# NOTE: after this cell val_id_df and test_id_df hold the full validation / test DATA,
# not just the ids. The names are kept because later cells refer to them.
building_col = framed_data_clean['building_id']

train_data = framed_data_clean[building_col.isin(train_id_s)].reset_index(drop=True)
val_id_df = framed_data_clean[building_col.isin(val_id_s)].reset_index(drop=True)
test_id_df = framed_data_clean[building_col.isin(test_id_s)].reset_index(drop=True)

In [57]:
# Preview of the training set.
train_data

Unnamed: 0,building_id,square_feet(t-6),air_temperature(t-6),dew_temperature(t-6),sea_level_pressure(t-6),wind_direction(t-6),wind_speed(t-6),pressure_meter(t-6),month(t-6),day(t-6),...,day(t-1),weekday(t-1),hour(t-1),electricity(t-6),electricity(t-5),electricity(t-4),electricity(t-3),electricity(t-2),electricity(t-1),electricity(t)
0,127,27071.0,3.8,2.4,1020.9,5.0,3.1,1.0,1.0,1.0,...,1.0,2.0,5.0,8.6786,17.3571,17.3571,17.3571,17.3571,17.3571,17.3571
1,127,27071.0,3.7,2.4,1021.6,5.0,2.6,1.0,1.0,1.0,...,1.0,3.0,6.0,17.3571,17.3571,17.3571,17.3571,17.3571,17.3571,17.3571
2,127,27071.0,2.6,1.9,1021.9,0.0,0.0,1.0,1.0,1.0,...,1.0,4.0,7.0,17.3571,17.3571,17.3571,17.3571,17.3571,17.3571,17.3571
3,127,27071.0,2.0,1.2,1022.3,4.0,1.5,1.0,1.0,1.0,...,1.0,5.0,8.0,17.3571,17.3571,17.3571,17.3571,17.3571,17.3571,17.3571
4,127,27071.0,2.3,1.8,1022.7,2.0,1.5,1.0,1.0,1.0,...,1.0,6.0,9.0,17.3571,17.3571,17.3571,17.3571,17.3571,17.3571,17.3571
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
765834,1310,57274.0,-2.8,-7.8,1020.4,4.0,1.5,1.0,12.0,31.0,...,31.0,6.0,18.0,10.3900,11.6100,9.6600,8.8900,11.1000,9.1400,9.5800
765835,1310,57274.0,-1.1,-6.1,1020.5,4.0,2.6,1.0,12.0,31.0,...,31.0,7.0,19.0,11.6100,9.6600,8.8900,11.1000,9.1400,9.5800,8.8400
765836,1310,57274.0,0.6,-5.6,1020.3,4.0,2.6,1.0,12.0,31.0,...,31.0,1.0,20.0,9.6600,8.8900,11.1000,9.1400,9.5800,8.8400,11.0300
765837,1310,57274.0,2.8,-6.1,1019.4,4.0,5.1,1.0,12.0,31.0,...,31.0,2.0,21.0,8.8900,11.1000,9.1400,9.5800,8.8400,11.0300,9.7700


In [58]:
# Preview of the validation set (despite its name, val_id_df holds the framed rows here).
val_id_df

Unnamed: 0,building_id,square_feet(t-6),air_temperature(t-6),dew_temperature(t-6),sea_level_pressure(t-6),wind_direction(t-6),wind_speed(t-6),pressure_meter(t-6),month(t-6),day(t-6),...,day(t-1),weekday(t-1),hour(t-1),electricity(t-6),electricity(t-5),electricity(t-4),electricity(t-3),electricity(t-2),electricity(t-1),electricity(t)
0,1311,66794.0,5.6,-0.6,1019.3,7.0,2.6,1.0,1.0,1.0,...,1.0,5.0,6.0,26.5716,24.5416,25.4905,26.9095,26.4995,25.3895,25.8695
1,1311,66794.0,5.6,-0.6,1019.4,7.0,2.6,1.0,1.0,1.0,...,1.0,6.0,7.0,24.5416,25.4905,26.9095,26.4995,25.3895,25.8695,27.2795
2,1311,66794.0,5.6,-1.1,1019.4,7.0,1.5,1.0,1.0,1.0,...,1.0,7.0,8.0,25.4905,26.9095,26.4995,25.3895,25.8695,27.2795,25.5795
3,1311,66794.0,5.0,-2.2,1019.2,6.0,3.1,1.0,1.0,1.0,...,1.0,1.0,9.0,26.9095,26.4995,25.3895,25.8695,27.2795,25.5795,25.8295
4,1311,66794.0,4.4,-2.2,1018.9,7.0,4.1,1.0,1.0,1.0,...,1.0,2.0,10.0,26.4995,25.3895,25.8695,27.2795,25.5795,25.8295,27.6895
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173077,1417,40451.0,-3.3,-7.8,1014.4,4.0,6.7,1.0,12.0,31.0,...,31.0,4.0,18.0,92.3000,84.1250,79.9750,79.6750,95.8250,93.8750,88.5000
173078,1417,40451.0,-2.8,-7.8,1013.4,4.0,5.7,1.0,12.0,31.0,...,31.0,5.0,19.0,84.1250,79.9750,79.6750,95.8250,93.8750,88.5000,85.8750
173079,1417,40451.0,-1.7,-7.2,1012.3,4.0,6.7,1.0,12.0,31.0,...,31.0,6.0,20.0,79.9750,79.6750,95.8250,93.8750,88.5000,85.8750,75.3500
173080,1417,40451.0,-0.6,-6.7,1011.1,4.0,7.2,1.0,12.0,31.0,...,31.0,7.0,21.0,79.6750,95.8250,93.8750,88.5000,85.8750,75.3500,85.0000


In [59]:
# Preview of the test set (despite its name, test_id_df holds the framed rows here).
test_id_df

Unnamed: 0,building_id,square_feet(t-6),air_temperature(t-6),dew_temperature(t-6),sea_level_pressure(t-6),wind_direction(t-6),wind_speed(t-6),pressure_meter(t-6),month(t-6),day(t-6),...,day(t-1),weekday(t-1),hour(t-1),electricity(t-6),electricity(t-5),electricity(t-4),electricity(t-3),electricity(t-2),electricity(t-1),electricity(t)
0,1418,41302.0,0.0,-2.000000,1019.4,7.0,4.100000,1.0,1.0,1.0,...,1.0,3.0,6.0,12.425,12.300,12.750,12.925,12.950,13.000,12.950
1,1418,41302.0,0.0,-2.000000,1019.4,7.0,2.600000,1.0,1.0,1.0,...,1.0,4.0,7.0,12.300,12.750,12.925,12.950,13.000,12.950,12.625
2,1418,41302.0,0.0,-2.000000,1019.4,0.0,2.100000,1.0,1.0,1.0,...,1.0,5.0,8.0,12.750,12.925,12.950,13.000,12.950,12.625,12.675
3,1418,41302.0,0.0,-2.111111,1019.4,7.0,2.155556,1.0,1.0,1.0,...,1.0,6.0,9.0,12.925,12.950,13.000,12.950,12.625,12.675,12.575
4,1418,41302.0,0.0,-2.222222,1019.4,7.0,2.211111,1.0,1.0,1.0,...,1.0,7.0,10.0,12.950,13.000,12.950,12.625,12.675,12.575,12.975
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52206,1447,29775.0,-3.3,-7.800000,1014.4,4.0,6.700000,1.0,12.0,31.0,...,31.0,4.0,18.0,145.750,146.050,146.525,152.725,167.725,170.225,167.800
52207,1447,29775.0,-2.8,-7.800000,1013.4,4.0,5.700000,1.0,12.0,31.0,...,31.0,5.0,19.0,146.050,146.525,152.725,167.725,170.225,167.800,168.150
52208,1447,29775.0,-1.7,-7.200000,1012.3,4.0,6.700000,1.0,12.0,31.0,...,31.0,6.0,20.0,146.525,152.725,167.725,170.225,167.800,168.150,164.075
52209,1447,29775.0,-0.6,-6.700000,1011.1,4.0,7.200000,1.0,12.0,31.0,...,31.0,7.0,21.0,152.725,167.725,170.225,167.800,168.150,164.075,158.800


In [55]:
# Write each of the three sets to its own CSV file.
for split_frame, csv_path in (
    (train_data, './train_data.csv'),
    (val_id_df, './validation_data.csv'),
    (test_id_df, './test_data.csv'),
):
    split_frame.to_csv(csv_path)