# Kfold encoding and past data

In this notebook we combine the result from 'more_past_data.ipynb' and 'KFold_Encoding.ipynb'. 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

In [2]:
#setup
df = pd.read_csv("consumption.csv")
weather_avg = pd.read_csv('weather-avg.csv')
weather_min = pd.read_csv('weather-min.csv')
weather_max = pd.read_csv('weather-max.csv')
info = pd.read_csv('addinfo.csv')
weather_avg = weather_avg.set_index("meter_id")
weather_min = weather_min.set_index("meter_id")
weather_max = weather_max.set_index("meter_id")
info = info.set_index("meter_id")
weather_avg_sorted = weather_avg.reindex(df.iloc[:,0])
weather_min_sorted = weather_min.reindex(df.iloc[:,0])
weather_max_sorted = weather_max.reindex(df.iloc[:,0])
info_sorted = info.reindex(df.iloc[:,0])
brinfo=info_sorted['num_bedrooms']
values = {'num_bedrooms' : brinfo.mean()}
info_filled_br = info_sorted.fillna(value = values)
brinfo_filled = info_filled_br['num_bedrooms']
#functions
def get_monthi(n):
    begin=48*31*(n-1)+1
    end=48*31*n
    if n==1:
        begin=1
    if n>1:
        end-=3*48
    if n>2:
        begin-=3*48
    if n>3:
        end-=48
    if n>4:
        begin-=48
    if n>5:
        end-=48
    if n>6:
        begin-=48
    if n>8:
        end-=48
    if n>9:
        begin-=48
    if n>10:
        end-=48
    if n>11:
        begin-=48
    return begin,end

def get_mean_temp(row,month):
    """
    row: is the row (meter_id) we would like to get the average temperature for.
    month: which month (columns) we would get the average temperature for.
    returns: the average temperature for a specific meter_id for a specific month.
    """
    if month==1:
        return row.loc[:,"2017-01-01 00:00:00":"2017-01-31 00:00:00"].mean(1)
    elif month==2:
        return row.loc[:,"2017-02-01 00:00:00":"2017-02-28 00:00:00"].mean(1)
    elif month==3:
        return row.loc[:,"2017-03-01 00:00:00":"2017-03-31 00:00:00"].mean(1)
    elif month==4:
        return row.loc[:,"2017-04-01 00:00:00":"2017-04-30 00:00:00"].mean(1)
    elif month==5:
        return row.loc[:,"2017-05-01 00:00:00":"2017-05-31 00:00:00"].mean(1)
    elif month==6:
        return row.loc[:,"2017-06-01 00:00:00":"2017-06-30 00:00:00"].mean(1)
    elif month==7:
        return row.loc[:,"2017-07-01 00:00:00":"2017-07-31 00:00:00"].mean(1)
    elif month==8:
        return row.loc[:,"2017-08-01 00:00:00":"2017-08-31 00:00:00"].mean(1)
    elif month==9:
        return row.loc[:,"2017-09-01 00:00:00":"2017-09-30 00:00:00"].mean(1)
    elif month==10:
        return row.loc[:,"2017-10-01 00:00:00":"2017-10-31 00:00:00"].mean(1)
    elif month==11:
        return row.loc[:,"2017-11-01 00:00:00":"2017-11-30 00:00:00"].mean(1)
    elif month==12:
        return row.loc[:,"2017-12-01 00:00:00":"2017-12-31 00:00:00"].mean(1)
    else:
        print("Error: this is not a valid input for month")

In [3]:

#Filling missing values with a value outside the range i.e. -1 as a dummy variable:
# Filling the mean value for the NaN values in num_bedrooms column:
values = {'num_bedrooms' : -1}
info_filled_br_dummy = info_sorted.fillna(value = values)
brinfo_filled_dummy = info_filled_br_dummy['num_bedrooms']
brinfo_filled_dummy

meter_id
0xa62b9f23553ff183f61e2bf943aab3d5983d02d7    2.0
0x459c834d1f6cfb5b734b82aa9f5410fa97fb70da   -1.0
0x4a1ed36825360a058cec2bdd409fc2459e1ce54f   -1.0
0x5b76d3c0e0aefc6e0a8d1d031f96388a23263407   -1.0
0x943ebe39ef2be6ef807c42c5a647e27112ca5b0f   -1.0
                                             ... 
0x7dd7a7b8ee1bec7c44b24f738c752482f6161065   -1.0
0xfdaf9f857621ec06f2cf801f42a020a322835090   -1.0
0xd28f2f001e0cd4d6c121a3cb2e1427207e170e18   -1.0
0x47218b46abb2fcaade487a211911406dc6e13730   -1.0
0xcd19e6fe3d887bc5dcac7ca18d46199695463fdd   -1.0
Name: num_bedrooms, Length: 3248, dtype: float64

new function to load all data

In [4]:
def sort_data(nim):
    NaN_t=1200
    En_con=[]
    #number of input months (if nim=1 we should get the same as before)
    #this is the amount of months we want in our input space to predict the next monthly consumption     
    prev_con=[] #previous consumtion
    #extra info
    temps=[]
    temps_min=[]
    temps_max=[]
    month_arr=[]
    br_arr=[]
    br_arr_dummy=[]
    dwelling_type_bungalow=[]
    dwelling_type_detached_house=[]
    dwelling_type_flat=[]
    dwelling_type_semi_detached_house=[]
    dwelling_type_terraced_house=[]
    y = pd.get_dummies(info_sorted.dwelling_type, prefix='dwelling_type')
    
    
    for i in range(df.shape[0]): # loop over all users
        meter=df.iloc[i] # = row number i  
    
        #loop over all month from januari until month 12-nim
        #if the month is higher then this we don't have enough data for the nim input variables and the 1 output variable
        for m in range (1,13-nim): 
            #load current month
            bi, ei =get_monthi(m)               #bi (begin index) ei(end index)
            month=meter[bi:ei]  #data for the month m for the row user (row) i
            #load bedroom info
            row = brinfo_filled[i:i+1]
            row_dummy = brinfo_filled_dummy[i:i+1]
            #Load dwelling type
            dew_type_row = y[i:i+1]
            
            # Count the amount of NaN's in current      
            n_NaN=month.isnull().sum() 

            # Check if months have numeric values otherwise discard the month for this user. 
            if n_NaN<NaN_t:
                for j in range(m,13-nim):
                    #save input for month j,j+1,..,j+nim-1
                    input_months=np.zeros(nim)
                    for k in range(nim):
                        #load month for input j+k
                        bik,eik=get_monthi(j+k)
                        input_months[k]=meter[bik:eik].mean()
                    #debug if statement checks for NaN in input month
                    if np.isnan(np.sum(input_months)):
                        print("months=%i index=%i" %(j,i))
                        print(input_months)
                    prev_con.append(input_months)
                    #save output for month j+nim
                    bio,eio=get_monthi(j+nim)
                    En_con.append(meter[bio:eio].mean())
                    #save extra information 
                    temps.append(get_mean_temp(weather_avg_sorted.iloc[i:i+1],j+nim))
                    temps_min.append(get_mean_temp(weather_min_sorted.iloc[i:i+1],j+nim))
                    temps_max.append(get_mean_temp(weather_max_sorted.iloc[i:i+1],j+nim))
                    month_arr.append(j+nim)
                    br_arr.append(row[0])
                    br_arr_dummy.append(row_dummy[0])
                    dwelling_type_bungalow.append(dew_type_row.iloc[0, 0])
                    dwelling_type_detached_house.append(dew_type_row.iloc[0, 1])
                    dwelling_type_flat.append(dew_type_row.iloc[0, 2])
                    dwelling_type_semi_detached_house.append(dew_type_row.iloc[0, 3])
                    dwelling_type_terraced_house.append(dew_type_row.iloc[0, 4])

                    
                
                
                #break out of outer month loops since the inner month loop takes care of all months
                break
    # converting list to numpy arrays: 
    nptemps = np.zeros(len(temps))
    nptemps_min = np.zeros(len(temps_min))
    nptemps_max = np.zeros(len(temps_max))
    npEn_con = np.zeros(len(En_con))
    npmonth_arr = np.zeros(len(month_arr))
    npbr_arr = np.zeros(len(br_arr)) #converting it into numpy array.
    npbr_arr_dummy = np.zeros(len(br_arr_dummy)) #converting it into numpy array.

    for i in range(len(temps)):
         nptemps[i] = temps[i]

    for i in range(len(temps_min)):
         nptemps_min[i] = temps_min[i]

    for i in range(len(temps_max)):
         nptemps_max[i] = temps_max[i]

    for i in range(len(En_con)):
         npEn_con[i] = En_con[i]

    for i in range(len(month_arr)):
         npmonth_arr[i] = month_arr[i]

    for i in range(len(br_arr)):
        npbr_arr[i] = br_arr[i]        
    
    for i in range(len(br_arr_dummy)):
        npbr_arr_dummy[i] = br_arr_dummy[i]
        
    npprev_con=np.array(prev_con)
    dw_t_b = np.array(dwelling_type_bungalow)
    dw_t_d = np.array(dwelling_type_detached_house)
    dw_t_f = np.array(dwelling_type_flat)
    dw_t_s = np.array(dwelling_type_semi_detached_house)
    dw_t_t = np.array(dwelling_type_terraced_house)
    
    #This should probably be done differently 
    return npEn_con,npprev_con,nptemps,nptemps_min,nptemps_max,npmonth_arr,npbr_arr,npbr_arr_dummy,dw_t_b,dw_t_d,dw_t_f,dw_t_s,dw_t_t
    

In [5]:
#nim=1
npEn_con1,npprev_con1,nptemps1,nptemps_min1,nptemps_max1,npmonth_arr1,npbr_arr1,npbr_arr_dummy1,dw_t_b1,dw_t_d1,dw_t_f1,dw_t_s1,dw_t_t1=sort_data(1)
#nim=2
npEn_con2,npprev_con2,nptemps2,nptemps_min2,nptemps_max2,npmonth_arr2,npbr_arr2,npbr_arr_dummy2,dw_t_b2,dw_t_d2,dw_t_f2,dw_t_s2,dw_t_t2=sort_data(2)
#nim=3
npEn_con3,npprev_con3,nptemps3,nptemps_min3,nptemps_max3,npmonth_arr3,npbr_arr3,npbr_arr_dummy3,dw_t_b3,dw_t_d3,dw_t_f3,dw_t_s3,dw_t_t3=sort_data(3)

## Reëvalutate previous setups from more_past_data with Kfold encoding

In this section the test from the notebook more_past_data will be recreate to measure the error with the Kfold crossvalidatation and random splits. Later on the extra variable like dwelling type will also be added


### Random split

In [89]:
def Eval_random_Split(features,npEn_con,split,model,print_res=True):
    X_train, X_test, y_train, y_test = train_test_split(
    features , npEn_con.reshape(-1,1), test_size=split, random_state=0)
    #print(X_train.shape, y_train.shape)
    #print(X_test.shape, y_test.shape)
    model.fit(X_train,y_train)
    if print_res:
        y_pred=model.predict(X_test)
        print('Coefficients: \n', model.coef_)
        print('Intercept: {}'.format(model.intercept_))
        # The mean squared error
        print('Mean squared error: %.8f' % mean_squared_error(y_test, y_pred))
        # The coefficient of determination: 1 is perfect prediction
        print('Coefficient of determination: %.8f' % r2_score(y_test, y_pred))
    return model

In [90]:
#1 month of previous data
Eval_random_Split(npprev_con1,npEn_con1,0.3,LinearRegression())

Coefficients: 
 [[1.01008592]]
Intercept: [0.00831724]
Mean squared error: 0.00323402
Coefficient of determination: 0.83505367


LinearRegression()

The R2 and MSE value is better then before(0.8186 and 0.00381149) this could be dought to the better randomized training or the easier test set.

In [91]:
print("2 months")
Eval_random_Split(npprev_con2,npEn_con2,0.3,LinearRegression())
print("3 months")
Eval_random_Split(npprev_con3,npEn_con3,0.3,LinearRegression())

2 months
Coefficients: 
 [[-0.32214394  1.31486463]]
Intercept: [0.01136229]
Mean squared error: 0.00282637
Coefficient of determination: 0.85470846
3 months
Coefficients: 
 [[-0.16042959 -0.06958152  1.23612595]]
Intercept: [0.01144688]
Mean squared error: 0.00330198
Coefficient of determination: 0.83436158


LinearRegression()

2 months result in a similar story to the 1 month case.<br>
For 3 months there is a decrease in the accuracy. I have no explenation for this. 

In [33]:
def Check_1D(features,npEn_con,split,model,print_res=True):
    #extract only the last month from the features
    features_lm=features[:,(features.shape[-1]-1)]
    features_lm=features_lm.reshape(-1,1)
    #use previously made function
    return Eval_random_Split(features_lm,npEn_con,split,model,print_res)
    

In [92]:
print("2 months 1D check")
Check_1D(npprev_con2,npEn_con2,0.3,LinearRegression())
print("\n3 months 1D check")
Check_1D(npprev_con3,npEn_con3,0.3,LinearRegression())

2 months 1D check
Coefficients: 
 [[1.04247136]]
Intercept: [0.00448163]
Mean squared error: 0.00297290
Coefficient of determination: 0.84717579

3 months 1D check
Coefficients: 
 [[1.05911266]]
Intercept: [0.00383822]
Mean squared error: 0.00353348
Coefficient of determination: 0.82274906


LinearRegression()

It seems like most of the improvements are due to the smaller data set but there is a small improvment due to the extra information. Because the result for 3 months is very poor incomparison to the "more past data" notebook, we'll only repeat the test for 2 months.

In [93]:
#nim=2
#np_features= np.zeros(shape = (len(nptemps2), nim+4))

#for i in range(len(nptemps2)):
#    np_features[i, 0:nim]  = npprev_con2[i,:]
#    np_features[i, nim] = nptemps2[i]
#    np_features[i, nim+1] = nptemps_min2[i]
#    np_features[i, nim+2] = nptemps_max2[i]
#    np_features[i, nim+3] = npbr_arr2[i]

#This does the above in 1 line
np_features=np.column_stack(((npprev_con2,nptemps2,nptemps_min2, nptemps_max2, npbr_arr2)))

In [94]:
Eval_random_Split(np_features,npEn_con2,0.3,LinearRegression())

Coefficients: 
 [[-0.26161656  1.24849317  0.0034595   0.00189202 -0.00656957  0.00287161]]
Intercept: [0.04612932]
Mean squared error: 0.00271353
Coefficient of determination: 0.86050883


LinearRegression()

slight improvement after adding the extra features. Slightly worse then non random result but this might depend on the seed. <br>
We'll now use normalisation

In [84]:
#z-normalization on data:
def normalise_features(np_features):
    #np_features_norm= np.zeros(shape = np_features.shape)
    np_features_norm=np_features
    if np_features.ndim==1:
        np_features_norm=(np_features-np.mean(np_features))/np.std(np_features)
    else: 
        for i in range(np_features.shape[1]):
            np_features_norm[:,i] = (np_features[:,i]-np.mean(np_features[:,i]))/np.std(np_features[:,i])
    return np_features_norm

In [95]:
np_features_norm=normalise_features(np_features)
npEn_con2_norm=normalise_features(npEn_con2) #perhaps normalise_features is not the best name
Eval_random_Split(np_features_norm,npEn_con2_norm,0.3,LinearRegression())

Coefficients: 
 [[-0.2166253   1.09890608  0.10741244  0.05241113 -0.23258458  0.0109377 ]]
Intercept: [-0.00156118]
Mean squared error: 0.13526234
Coefficient of determination: 0.86050883


LinearRegression()

MSE changes but R2 is the same

We'll now add dwelling type information

In [96]:
np_features=np.column_stack(((npprev_con2,nptemps2,nptemps_min2, nptemps_max2, npbr_arr2,dw_t_b2,dw_t_d2,dw_t_f2,dw_t_s2,dw_t_t2)))

In [98]:
Eval_random_Split(np_features,npEn_con2,0.3,LinearRegression())
print("\n Normalised")
np_features_norm=normalise_features(np_features)
npEn_con2_norm=normalise_features(npEn_con2) #perhaps normalise_features is not the best name
Eval_random_Split(np_features_norm,npEn_con2_norm,0.3,LinearRegression())

Coefficients: 
 [[-2.61096489e-01  1.24861183e+00  3.13197473e-03  1.98467949e-03
  -6.35750080e-03  3.06418248e-03 -4.63528659e-04 -2.63050358e-03
  -2.01450897e-03 -3.27842103e-03 -9.65673950e-04]]
Intercept: [0.04642805]
Mean squared error: 0.00271466
Coefficient of determination: 0.86045083

 Normalised
Coefficients: 
 [[-2.16194670e-01  1.09901053e+00  9.72431387e-02  5.49779826e-02
  -2.25076576e-01  1.16712138e-02 -9.42181784e-04 -5.82424552e-03
  -2.08285255e-03 -9.41635362e-03 -2.04366111e-03]]
Intercept: [-0.00161761]
Mean squared error: 0.13531858
Coefficient of determination: 0.86045083


LinearRegression()

The results are slightly worse with the extra infomation. strange <br>
The coefficients of these components are alsmost zero(1e-3)so maybe addition of these components resulted in extra noise

### K fold crossvalidation

In [7]:
def Eval_Kfold_Split(features,npEn_con,n_splits,model,print_res=True):
    kf = KFold(n_splits, shuffle=True)
    models_arr=[]
    MSE_arr=[]
    R2_arr=[]
    for train_index, test_index in kf.split(features): 
        cmodel=model
        cmodel.fit(features[train_index,],npEn_con[train_index])
        models_arr.append(cmodel)
        y_pred = model.predict(features[test_index,])
        MSE_arr.append(mean_squared_error(npEn_con[test_index,], y_pred));
        R2_arr.append(r2_score(npEn_con[test_index,], y_pred))
    if print_res:
        # The mean squared error
        print('Mean squared error:')
        print(np.mean(MSE_arr))
        # The coefficient of determination: 1 is perfect prediction
        print('Coefficient of determination: ' )
        print(np.mean(R2_arr))
    return models_arr

In [8]:
#1 month
print("1 months")
np_features=np.column_stack(((npprev_con1,nptemps1,nptemps_min1, nptemps_max1, npbr_arr1)))
models_arr=Eval_Kfold_Split(np_features,npEn_con1,10,LinearRegression())
print("\n")
#2 months
print("2 months")
np_features=np.column_stack(((npprev_con2,nptemps2,nptemps_min2, nptemps_max2, npbr_arr2)))
models_arr=Eval_Kfold_Split(np_features,npEn_con2,10,LinearRegression())
#2 months
print("\n 3 months")
np_features=np.column_stack(((npprev_con3,nptemps3,nptemps_min3, nptemps_max3, npbr_arr3)))
models_arr=Eval_Kfold_Split(np_features,npEn_con3,10,LinearRegression())

1 months
Mean squared error:
0.003137588527636117
Coefficient of determination: 
0.8442307822092066


2 months
Mean squared error:
0.0029432519127855924
Coefficient of determination: 
0.8520938825111755
3 months
Mean squared error:
0.002989671454428639
Coefficient of determination: 
0.852745625807929


In [119]:
#the same but normalised
print("1 months")
np_features=np.column_stack(((npprev_con1,nptemps1,nptemps_min1, nptemps_max1, npbr_arr1)))
np_features_norm=normalise_features(np_features)
npEn_con1_norm=normalise_features(npEn_con1)
models_arr=Eval_Kfold_Split(np_features_norm,npEn_con1_norm,10,LinearRegression())
print("\n")
print("2 months")
np_features=np.column_stack(((npprev_con2,nptemps2,nptemps_min2, nptemps_max2, npbr_arr2)))
np_features_norm=normalise_features(np_features)
npEn_con2_norm=normalise_features(npEn_con2)
models_arr=Eval_Kfold_Split(np_features_norm,npEn_con2_norm,10,LinearRegression())

1 months
Mean squared error:
0.15580694251186425
Coefficient of determination: 
0.8440378834414213


2 months
Mean squared error:
0.14661010890770895
Coefficient of determination: 
0.8529708901177872


In [120]:
#extra features
np_features=np.column_stack(((npprev_con1,nptemps1,nptemps_min1, nptemps_max1, npbr_arr1,dw_t_b1,dw_t_d1,dw_t_f1,dw_t_s1,dw_t_t1)))
print("1 months")
np_features=np.column_stack(((npprev_con1,nptemps1,nptemps_min1, nptemps_max1, npbr_arr1)))
models_arr=Eval_Kfold_Split(np_features,npEn_con1,10,LinearRegression())
print("normalised")
np_features_norm=normalise_features(np_features)
npEn_con1_norm=normalise_features(npEn_con1)
models_arr=Eval_Kfold_Split(np_features_norm,npEn_con1_norm,10,LinearRegression())
print("\n")
print("2 months")
np_features=np.column_stack(((npprev_con2,nptemps2,nptemps_min2, nptemps_max2, npbr_arr2,dw_t_b2,dw_t_d2,dw_t_f2,dw_t_s2,dw_t_t2)))
models_arr=Eval_Kfold_Split(np_features,npEn_con2,10,LinearRegression())
print("normalised")
np_features_norm=normalise_features(np_features)
npEn_con2_norm=normalise_features(npEn_con2)
models_arr=Eval_Kfold_Split(np_features_norm,npEn_con2_norm,10,LinearRegression())

1 months
Mean squared error:
0.003137344698948375
Coefficient of determination: 
0.8445424751389631
normalised
Mean squared error:
0.15577994596890393
Coefficient of determination: 
0.8443358607467972


2 months
Mean squared error:
0.0029432062113013977
Coefficient of determination: 
0.8526624010669828
normalised
Mean squared error:
0.14673926913578075
Coefficient of determination: 
0.8539250214343251
