# Neural network

In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline


In [8]:
df = pd.read_csv("consumption.csv")
weather_avg = pd.read_csv('weather-avg.csv')
weather_min = pd.read_csv('weather-min.csv')
weather_max = pd.read_csv('weather-max.csv')
info = pd.read_csv('addinfo.csv')

In [9]:
weather_avg = weather_avg.set_index("meter_id")
weather_min = weather_min.set_index("meter_id")
weather_max = weather_max.set_index("meter_id")
info = info.set_index("meter_id")
weather_avg_sorted = weather_avg.reindex(df.iloc[:,0])
weather_min_sorted = weather_min.reindex(df.iloc[:,0])
weather_max_sorted = weather_max.reindex(df.iloc[:,0])
info_sorted = info.reindex(df.iloc[:,0])

In [11]:
# Filling the mean value for the NaN values in num_bedrooms column:
brinfo=info_sorted['num_bedrooms']
values = {'num_bedrooms' : brinfo.mean()}
info_filled_br = info_sorted.fillna(value = values)
brinfo_filled = info_filled_br['num_bedrooms']

In [12]:
def get_monthi(n):
    begin=48*31*(n-1)+1
    end=48*31*n
    if n==1:
        begin=1
    if n>1:
        end-=3*48
    if n>2:
        begin-=3*48
    if n>3:
        end-=48
    if n>4:
        begin-=48
    if n>5:
        end-=48
    if n>6:
        begin-=48
    if n>8:
        end-=48
    if n>9:
        begin-=48
    if n>10:
        end-=48
    if n>11:
        begin-=48
    return begin,end

def get_mean_temp(row,month):
    """
    row: is the row (meter_id) we would like to get the average temperature for.
    month: which month (columns) we would get the average temperature for.
    returns: the average temperature for a specific meter_id for a specific month.
    """
    if month==1:
        return row.loc[:,"2017-01-01 00:00:00":"2017-01-31 00:00:00"].mean(1)
    elif month==2:
        return row.loc[:,"2017-02-01 00:00:00":"2017-02-28 00:00:00"].mean(1)
    elif month==3:
        return row.loc[:,"2017-03-01 00:00:00":"2017-03-31 00:00:00"].mean(1)
    elif month==4:
        return row.loc[:,"2017-04-01 00:00:00":"2017-04-30 00:00:00"].mean(1)
    elif month==5:
        return row.loc[:,"2017-05-01 00:00:00":"2017-05-31 00:00:00"].mean(1)
    elif month==6:
        return row.loc[:,"2017-06-01 00:00:00":"2017-06-30 00:00:00"].mean(1)
    elif month==7:
        return row.loc[:,"2017-07-01 00:00:00":"2017-07-31 00:00:00"].mean(1)
    elif month==8:
        return row.loc[:,"2017-08-01 00:00:00":"2017-08-31 00:00:00"].mean(1)
    elif month==9:
        return row.loc[:,"2017-09-01 00:00:00":"2017-09-30 00:00:00"].mean(1)
    elif month==10:
        return row.loc[:,"2017-10-01 00:00:00":"2017-10-31 00:00:00"].mean(1)
    elif month==11:
        return row.loc[:,"2017-11-01 00:00:00":"2017-11-30 00:00:00"].mean(1)
    elif month==12:
        return row.loc[:,"2017-12-01 00:00:00":"2017-12-31 00:00:00"].mean(1)
    else:
        print("Error: this is not a valid input for month")

In [13]:
temps=[]
temps_min=[]
temps_max=[]
En_con=[]
month_arr=[]
br_arr=[]
l_En_con=[]
NaN_t=1200

for i in range(df.shape[0]): # loop over all users
    
    meter=df.iloc[i] # = row number i  
    
    fmf=False;        # first month found
    
    
    for m in range (1,13): # loop over all months
        begin_index, end_index =get_monthi(m)               #get index of beginning and end of month
        month=meter[begin_index:end_index] #data for the month m for the row user (row) i
        row = brinfo_filled[i:i+1]

        # Check if months have numeric values otherwise discard the month for this user.       
        n_NaN=month.isnull().sum() #Number of NaN's
        if n_NaN<NaN_t:
            if fmf:
                #current month
                temps.append(get_mean_temp(weather_avg_sorted.iloc[i:i+1],m))
                temps_min.append(get_mean_temp(weather_min_sorted.iloc[i:i+1],m))
                temps_max.append(get_mean_temp(weather_max_sorted.iloc[i:i+1],m))
                En_con.append(month.mean())
                month_arr.append(m) # = number of data points = the months that passed threshold for each user.
                br_arr.append(row[0])
                #last month
                l_begin_index,l_end_index=get_monthi(m-1)
                last_month=meter[l_begin_index:l_end_index] #load last month
                l_En_con.append(last_month.mean())
            else:
                fmf=True

In [14]:
# converting list to numpy arrays: 
nptemps = np.zeros(len(temps))
nptemps_min = np.zeros(len(temps_min))
nptemps_max = np.zeros(len(temps_max))
npEn_con = np.zeros(len(En_con))
npmonth_arr = np.zeros(len(month_arr))
npl_En_con= np.zeros(len(l_En_con))
npbr_arr = np.zeros(len(br_arr)) #converting it into numpy array.

for i in range(len(temps)):
     nptemps[i] = temps[i]

for i in range(len(temps_min)):
     nptemps_min[i] = temps_min[i]

for i in range(len(temps_max)):
     nptemps_max[i] = temps_max[i]

for i in range(len(En_con)):
     npEn_con[i] = En_con[i]

for i in range(len(month_arr)):
     npmonth_arr[i] = month_arr[i]
        
for i in range(len(En_con)):
     npl_En_con[i] = l_En_con[i]

for i in range(len(br_arr)):
    npbr_arr[i] = br_arr[i]  

In [30]:
np_features= np.zeros(shape = (len(temps), 6))

for i in range(len(temps)):
    np_features[i, 0]  = l_En_con[i]
    np_features[i, 1] = temps[i]
    np_features[i, 2] = temps_min[i]
    np_features[i, 3] = temps_max[i]
    np_features[i, 4] = npbr_arr[i]
    np_features[i, 5] = npmonth_arr[i]

In [31]:
splitnum=5347
# Split the data into training/testing sets
X_train_comb = np_features[:-splitnum]
X_test_comb = np_features[-splitnum:]

# Split the targets into training/testing sets
y_train = npEn_con[:-splitnum]
y_test = npEn_con[-splitnum:]

In [38]:
# construct the model
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(256, input_dim=6, activation='relu'),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(1)
])
print(model.summary())

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_28 (Dense)             (None, 256)               1792      
_________________________________________________________________
dense_29 (Dense)             (None, 256)               65792     
_________________________________________________________________
dense_30 (Dense)             (None, 2566)              659462    
_________________________________________________________________
dense_31 (Dense)             (None, 1)                 2567      
Total params: 729,613
Trainable params: 729,613
Non-trainable params: 0
_________________________________________________________________
None


In [41]:
model.compile(loss=tf.keras.losses.MeanSquaredError(),
              optimizer=tf.keras.optimizers.Adam(),
              metrics=[tf.keras.metrics.MeanSquaredError()])
print("Start training")
# train the model
model.fit(X_train_comb, y_train, validation_data=(X_test_comb, y_test), epochs=50)

# evaluate the model
mse = model.evaluate(X_test_comb, y_test)[1]
print(f'MSE: {mse}')

Start training
Train on 12479 samples, validate on 5347 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
MSE: 0.002361157676205039


In [42]:
y_pred_comb=model.predict(X_test_comb)
print('Mean squared error: %.8f'
      % mean_squared_error(y_test, y_pred_comb))
# The coefficient of determination: 1 is perfect prediction
print('Coefficient of determination: %.4f'
      % r2_score(y_test, y_pred_comb))

Mean squared error: 0.00236116
Coefficient of determination: 0.8876


epoch=10 <br>
Mean squared error: 0.00253821 <br>
Coefficient of determination: 0.8792 <br>
This is a slight improvement on the polynomial model,although there is probably some variation on this <br>
epoch=50 <br>
Mean squared error: 0.00236116 <br>
Coefficient of determination: 0.8876

idea: Januari (or other month) specific test set on a model. This can be used to see if model actually know how to deal with these months if which there is very little training data.