# 3 different models on NEM data


Loading the components (libraries) used throughout this notebook.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

sns.set_style('whitegrid')


%config InlineBackend.figure_format = 'retina'

from keras.models import Sequential
from keras.layers import Dense,TimeDistributed,Flatten,Dropout,Conv1D,MaxPooling1D
from keras.layers import LSTM
from sklearn.preprocessing import MinMaxScaler
from keras.callbacks import ReduceLROnPlateau,EarlyStopping
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split 

In [None]:
# Explicitly register pandas' date/time converters with matplotlib.
pd.plotting.register_matplotlib_converters()

# FB Prophet alters the converter registry in Matplotlib and a proper solution has not been found. This workaround, however, works.


Loading the data. This is a csv produced by the downloading notebook. It has been split off.

In [None]:
# Read the half-hourly CSV, using its first column as the index.
data = pd.read_csv('./HalfHourly_Data.csv', index_col=0)

# Convert the index to datetimes; the resample('D') calls further down operate on this index.
data.index = pd.to_datetime(data.index)

## Creating a daily mean dataframe

The downloaded data is half hourly. This is simply too much data to proceed with at the moment. So instead a daily figure table is created.

In [None]:
# Daily aggregates of the half-hourly VIC price series.
vic_price = data["VIC_PRICE"]
vic_demand = data["VIC_DEMAND"]
price_by_day = vic_price.resample('D')

# Demand-weighted mean: sum(price * demand) / sum(demand) within each day.
demand_weighted_mean = (vic_price * vic_demand).resample('D').sum() / vic_demand.resample('D').sum()

daily_mean = pd.DataFrame({
    "VIC_PRICE": price_by_day.mean(),
    "VIC_PRICE_median": price_by_day.median(),
    'VIC_PRICE_wt_mean': demand_weighted_mean,
    # Gaps in the daily standard deviation are filled by order-1 polynomial interpolation.
    "VIC_PRICE_std": price_by_day.std().interpolate(method='polynomial', order=1),
})
# Day-over-day change of the daily mean; the first row is NaN by construction.
daily_mean["VIC_PRICE_diff"] = daily_mean["VIC_PRICE"].diff()

# Display only: the result is not assigned, so daily_mean itself keeps its NaN rows.
daily_mean.dropna()


## Scaler choosing

Due to a few large spikes, scaling really helps. 

In [None]:
from sklearn.preprocessing import MinMaxScaler,RobustScaler,QuantileTransformer,StandardScaler
from scipy.stats.mstats import winsorize
from sklearn_pandas import DataFrameMapper,gen_features

# Stand-alone scaler instances (kept available for ad-hoc experiments).
minmaxscale, robustscale, ss = MinMaxScaler(), RobustScaler(), StandardScaler()
qt = QuantileTransformer(n_quantiles=200)

# Scaler classes to compare; each one yields its own scaled copy of VIC_PRICE.
transformers = [StandardScaler,MinMaxScaler,QuantileTransformer,RobustScaler]


def _aliased_price_feature(transformer):
    """Return a DataFrameMapper feature scaling VIC_PRICE with `transformer`.

    The feature is aliased with the transformer's class name so the mapper
    output can be selected by that name.
    """
    generated = gen_features([['VIC_PRICE']], classes=[transformer])
    return (generated[0][0], generated[0][1], {'alias': transformer.__name__})


features = [_aliased_price_feature(transformer) for transformer in transformers]

# df_out/input_df: the mapper is given a DataFrame and returns a DataFrame.
mapper = DataFrameMapper(features,df_out=True,input_df=True)

# Enter the Lag 

In [None]:
# Ask interactively how many previous observations each model input should contain.
lag = input("What lag should be used?")

# int() raises ValueError when the answer is not an integer.
lag = int(lag)

# The models below are built with input_shape=(None, lag) and the lag windows are
# created with range(0, lag); a lag below 1 would give them no inputs, so fail here
# with a clear message instead of deep inside the training cells.
if lag < 1:
    raise ValueError("The lag must be a positive integer, got {}".format(lag))

print("")
print("The lag is set at", lag)

<a id="reshape-the-data-to-work-with-the-lstm"></a>
## Long short-term memory (LSTM) - Recurrent Neural Network (RNN)


In [None]:
def create_data(timeseries, lag=1, as_array=True):
    """Turn a univariate series into supervised (X, y) pairs for the LSTM.

    Every target value is paired with the ``lag`` observations immediately
    preceding it. Rows whose window or target contains a missing value are
    dropped from X and y together, so both always have the same length.

    Parameters
    ----------
    timeseries : pd.Series or array-like
        Observations in time order. Anything that is not a Series is
        flattened and wrapped in a Series with a default integer index.
    lag : int, default 1
        Number of preceding observations per sample; must be at least 1.
    as_array : bool, default True
        Accepted for backward compatibility; it is not used.

    Returns
    -------
    X : numpy.ndarray of shape (n_samples, 1, lag)
        Lag windows, oldest observation first.
    y : pd.Series
        Target values, keeping the index labels of ``timeseries``.

    Raises
    ------
    ValueError
        If ``lag`` is smaller than 1.
    """
    if lag < 1:
        raise ValueError("lag must be at least 1, got {}".format(lag))
    if not isinstance(timeseries, pd.Series):
        timeseries = pd.Series(np.asarray(timeseries).ravel())

    # Column 'lagK' holds the value observed K steps before the row's own
    # position, so each row is already lined up with its target value.
    windows = pd.DataFrame(
        {'lag' + str(k): timeseries.shift(k) for k in range(lag, 0, -1)}
    )

    # Keep a row only when the whole window AND the target are present; this
    # also removes the first `lag` rows, which have no complete history.
    complete = windows.notna().all(axis=1) & timeseries.notna()
    kept = windows[complete]

    X = kept.values.reshape(kept.shape[0], 1, kept.shape[1])
    y = timeseries[complete]

    return X, y

In [None]:


from sklearn.model_selection import TimeSeriesSplit

# Small univariate LSTM: `lag` past values in, one value out.
# The model is built once, outside the loop, so every fold continues training
# the weights left behind by the previous fold.
model = Sequential()
model.add(LSTM(2, input_shape=(None, lag)))
model.add(Dense(1))

model.compile(loss='mean_squared_error', optimizer='adam')

# Name of the mapper output column (scaler alias) the model is trained on.
scaler = 'RobustScaler'

tscv = TimeSeriesSplit(n_splits=5)
print(tscv)
X = daily_mean

# Per-fold series are collected here and combined once after the loop.
# Assigning them one by one into an empty DataFrame pins the frame's index to
# the first series assigned, which turns every series with a different index
# (all train columns and all later folds) into NaN.
fold_columns = {}

for i, (train_index, test_index) in enumerate(tscv.split(X)):

    X_train, X_test = X.iloc[train_index], X.iloc[test_index]

    # Fit the scalers on the training fold only and reuse them for the test
    # fold, so train and test are on the same scale and no test-fold
    # statistics leak into the preprocessing.
    train_transform = mapper.fit_transform(X_train)
    test_transform = mapper.transform(X_test)

    trainX, trainY = create_data(train_transform[scaler], lag)
    testX, testY = create_data(test_transform[scaler], lag)

    model.fit(trainX, trainY, epochs=20, batch_size=1, verbose=1)

    # Predict once per fold and reuse the result for both storage and plotting.
    trainPredict = pd.Series(model.predict(trainX).ravel(), index=trainY.index)
    testPredict = pd.Series(model.predict(testX).ravel(), index=testY.index)

    fold_columns['testPredict_'+str(i)] = testPredict
    fold_columns['trainPredict_'+str(i)] = trainPredict
    fold_columns['trainY_'+str(i)] = trainY
    fold_columns['testY_'+str(i)] = testY

    # Actual vs predicted (scaled) test values for this fold.
    testY.plot(figsize=(14,7))
    testPredict.plot(figsize=(14,7))

# Outer join on the index: each column keeps its own dates, NaN elsewhere.
lstm_results = pd.concat(fold_columns, axis=1)
  



# Now let's add multiple variables

In [None]:
def create_multi_data(timeseries, nfeatures, target = 'VIC_PRICE',lag=1):
    """Build lagged multi-feature LSTM inputs and the matching target series.

    Every column of ``timeseries`` is passed through its own
    QuantileTransformer (100 quantiles, normal output distribution) that is
    fitted on the frame given to this call.

    NOTE(review): because the transformers are fitted inside the function,
    calling it separately for a train and a test frame scales each frame with
    its own fit - confirm this is intended.

    Parameters
    ----------
    timeseries : pd.DataFrame
        Input features, one column per variable; must contain ``target``.
    nfeatures : int
        Number of leading columns used as model inputs.
    target : str, default 'VIC_PRICE'
        Column whose scaled values are the prediction target.
    lag : int, default 1
        Number of consecutive observations per feature in each sample.

    Returns
    -------
    X : numpy.ndarray
        3-D array with a singleton middle axis; the last axis holds the lagged
        columns of the selected features side by side.
    y : pd.Series
        Scaled target values with the first ``lag`` rows removed.
    """
    quantile_spec = {'class':QuantileTransformer,'n_quantiles':100,'output_distribution':'normal'}
    per_column = gen_features([[name] for name in timeseries.columns], classes=[quantile_spec])
    scaled = DataFrameMapper(per_column, df_out=True, input_df=True).fit_transform(timeseries)

    y = scaled[target][lag:]

    def _lag_block(series):
        # One column per step of the window; incomplete trailing rows and the
        # final row (which has no target after it) are removed.
        shifted = {'lag'+str(lag-step): series.shift(-step) for step in range(0, lag)}
        return pd.DataFrame(shifted).dropna().iloc[:-1, :]

    blocks = [_lag_block(scaled[name]) for name in scaled.columns[0:nfeatures]]
    stacked = pd.concat(blocks, axis=1)

    X = stacked.values.reshape(stacked.shape[0], 1, stacked.shape[1])
    return X, y

In [None]:
nfeatures = 3

# LSTM with 50 units fed `lag` values for each of `nfeatures` variables, followed
# by a single linear output unit.
# NOTE(review): the cross-validation cell further down assigns a new `model` on
# every fold, so this instance does not appear to be trained in this notebook.
model = Sequential([
    LSTM(50, input_shape=(None, lag*nfeatures)),
    Dense(1),
])

model.compile(loss='mean_squared_error', optimizer='adam')

In [None]:
from sklearn.preprocessing import robust_scale

In [None]:
# Training callbacks, both watching the training loss.
# min_lr is the floor for the reduced learning rate. It was 0.001, which is the
# default learning rate Keras uses for optimizers selected by name ('adam',
# 'RMSprop'), so the rate was already at the floor and could never be lowered.
# A floor of 1e-5 lets the factor-0.2 reductions actually take effect.
reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.2,
                              patience=5, min_lr=1e-5)
# Stop when the loss has not improved by at least 0.1 for 10 consecutive epochs.
earlystop=EarlyStopping(monitor='loss', min_delta=0.1, patience=10, verbose=1, mode='auto', baseline=None, restore_best_weights=False)

In [None]:

from sklearn.model_selection import TimeSeriesSplit

nfeatures = 5


def _build_multivariate_lstm(n_inputs):
    """Return a freshly compiled LSTM(10) -> Dropout -> Dense(10) -> Dropout -> Dense(1) network."""
    network = Sequential([
        LSTM(10, input_shape=(None, n_inputs)),
        Dropout(0.5),
        Dense(10),
        Dropout(0.5),
        Dense(1),
    ])
    network.compile(loss='mean_squared_error', optimizer='RMSprop')
    return network


tscv = TimeSeriesSplit(n_splits=3)
print(tscv)
# Rows with any missing value are removed before splitting.
X = daily_mean.dropna()
xCol = daily_mean.columns

for train_index, test_index in tscv.split(X):
    X_train = X.iloc[train_index]
    X_test = X.iloc[test_index]

    trainX, trainY = create_multi_data(X_train, nfeatures, lag=lag)
    testX, testY = create_multi_data(X_test, nfeatures, lag=lag)

    # A new, untrained network for every fold.
    model = _build_multivariate_lstm(lag*nfeatures)
    model.fit(trainX, trainY, epochs=80, batch_size=64, verbose=1, callbacks=[reduce_lr,earlystop])

    # Only the last fold's predictions survive the loop; the next cell plots them.
    trainPredict = pd.DataFrame(model.predict(trainX), index=trainY.index)
    testPredict = pd.DataFrame(model.predict(testX), index=testY.index)



In [None]:
# Predicted vs actual (scaled) values from the last cross-validation fold.
# Each prediction frame opens its own figure; the actual series is drawn on top.
train_axes = trainPredict.plot(ylim = (-2,2))
trainY.plot(ax=train_axes, ylim = (-2,2))

test_axes = testPredict.plot(ylim = (-2,2))
testY.plot(ax=test_axes, ylim = (-2,2))

In [None]:
# Difference between the first two training timestamps. The value is neither
# stored nor displayed, because this is not the last expression of the cell.
X_train.index[0]-X_train.index[1]
# Sampling interval for the spectral plots below, set by hand.
# presumably 1 means one day, since X_train comes from the daily table - TODO confirm
dt = 1

In [None]:
# Parameters for the spectral-density and spectrogram cells below.
NFFT = 1024*2*2  # the length of the windowing segments
Fs = (1.0 / dt)  # the sampling frequency

# Display the sampling frequency.
Fs

In [None]:
# Power spectral density of the VIC_PRICE column of X_train (missing values dropped);
# the two return values are discarded, only the plot is wanted.
_,_=plt.psd(X_train['VIC_PRICE'].dropna().values, NFFT=NFFT, Fs=Fs)

In [None]:
# Spectrogram of the same series, with 90 points of overlap between segments.
# NOTE(review): Fs is given as the literal 1 here rather than the Fs variable defined above
# (both are 1 while dt = 1) - keep them in sync if dt changes.
plt.specgram(X_train['VIC_PRICE'].dropna().values, NFFT=NFFT, Fs=1 ,noverlap=90)

In [None]:
# Number of non-missing VIC_PRICE values in X_train, for comparison with NFFT.
X_train['VIC_PRICE'].dropna().values.shape

## Gradient Boosting Regressor()

In [None]:
# Gradient boosting model with default hyper-parameters apart from warm_start=True.
gbr = GradientBoostingRegressor(warm_start=True)

In [None]:
# 50/50 split: feature is the daily mean price, target is its day-over-day change.
# random_state pins the shuffle so a kernel restart reproduces the same split
# (and therefore the same fit and plots).
# NOTE(review): the split is shuffled, so train and test days are interleaved in time;
# the TimeSeriesSplit cell further down evaluates in chronological order instead.
Xtrain, Xtest, ytrain, ytest = train_test_split(daily_mean[["VIC_PRICE"]],daily_mean[["VIC_PRICE_diff"]], test_size = 0.5, random_state=42)

In [None]:
# Replace missing training targets with 0, in place (VIC_PRICE_diff comes from .diff(),
# whose first row is NaN). Only ytrain is filled; ytest is left as it is.
ytrain.fillna(0, inplace = True)

In [None]:

# Fit on the 1-D target selected by name. Passing the whole ytrain frame hands
# sklearn a column vector, and breaks outright when this cell is re-run, because
# ytrain has by then gained the prediction column added below.
gbr.fit(Xtrain.values.reshape(-1,1), ytrain['VIC_PRICE_diff'].values)

yPredict_train = gbr.predict(Xtrain.values.reshape(-1,1))
yPredict_test = gbr.predict(Xtest.values.reshape(-1,1))

# Store the predictions next to the actual values. The arrays are in the same row
# order as Xtrain/Xtest, which share their row order with ytrain/ytest.
ytrain['yPredictTrain'] = yPredict_train
ytest['yPredictTest'] = yPredict_test

# ytrain.plot()
# yPredict.shape,ytrain.shape

In [None]:
# Feature (daily mean price) and target (its day-over-day change), without rows containing NaN.
boostdata = daily_mean[["VIC_PRICE","VIC_PRICE_diff"]].dropna()

In [None]:
# Constructs a regressor with non-default leaf/split sizes; the object is not assigned,
# so it is only displayed as the cell output and is not used by the cells below.
GradientBoostingRegressor(min_samples_split=5,min_samples_leaf=4)

In [None]:
# With warm_start=False each fit() below starts from scratch.
gbr = GradientBoostingRegressor(warm_start=False)

# Expanding-window cross-validation: every fold trains on the past and tests on the
# block of days that follows it.
tscv = TimeSeriesSplit(n_splits=5)

for train_index, test_index in tscv.split(boostdata):
    train_fold = boostdata.iloc[train_index]
    test_fold = boostdata.iloc[test_index]

    xtrain, xtest = train_fold[["VIC_PRICE"]], test_fold[["VIC_PRICE"]]
    # Copies, so that adding the prediction columns below writes into independent
    # frames instead of slices of boostdata (avoids SettingWithCopyWarning).
    ytrain = train_fold[["VIC_PRICE_diff"]].copy()
    ytest = test_fold[["VIC_PRICE_diff"]].copy()

    # Fit and predict on plain arrays throughout. Fitting on a DataFrame and then
    # predicting on .values makes sklearn complain about missing feature names.
    gbr.fit(xtrain.values, ytrain["VIC_PRICE_diff"].values)

    yPredict_train = gbr.predict(xtrain.values)
    yPredict_test = gbr.predict(xtest.values)

    # Predictions are in the same row order as the folds they were computed from.
    ytrain['yPredictTrain'] = yPredict_train
    ytest['yPredictTest'] = yPredict_test

    # One figure each for the train and test part of this fold: actual vs predicted.
    ytrain.plot()
    ytest.plot()

In [None]:
# yPredict.reset_index().sort_values('index').set_index('index').plot()
ytrain.plot()
# .plot('VIC_PRICE_diff')
ytest.plot()


In [None]:
# Same training-fold plot, but against row position instead of the date index.
ytrain.reset_index(drop=True).plot()

## Facebook Prophet 


In [None]:
from fbprophet import Prophet
import math

# Prophet is given a frame with the timestamps in 'ds' and the values in 'y';
# here 'y' is the daily mean VIC price and 'ds' is a copy of the date index.
FBdata = (
    data["VIC_PRICE"].resample('D').mean()
    .to_frame('y')
    .assign(ds=lambda frame: frame.index)
    [['ds', 'y']]
)

# Fit on the full daily history (no hold-out split is used).
m = Prophet()
m.fit(FBdata)

# Extend the history by 50 periods and predict over history plus extension.
future = m.make_future_dataframe(periods=50)
forecast = m.predict(future)

fig1 = m.plot(forecast)

In [None]:
# Component plots of the daily forecast, produced by Prophet's plot_components.
fig2 = m.plot_components(forecast)

In [None]:
# Now to try on the half-hourly data.
m = Prophet()

# Model the log of the price. np.log gives -inf for a price of exactly zero and
# NaN for a negative price, so non-positive prices are masked to NaN first.
# This replaces the process-wide pandas option 'use_inf_as_na', which is
# deprecated in recent pandas and changed NaN handling for the whole kernel.
half_hourly_price = data["VIC_PRICE"]

FBdata  = pd.DataFrame()
FBdata['y'] = np.log(half_hourly_price.where(half_hourly_price > 0))
FBdata['ds'] = FBdata.index

m.fit(FBdata)

# NOTE(review): make_future_dataframe is called without `freq`, so the 50 future
# periods use its default frequency rather than 30-minute steps - confirm intended.
future = m.make_future_dataframe(periods=50)
forecast = m.predict(future)

fig1 = m.plot(forecast)


In [None]:
# Component plots for the half-hourly (log-price) forecast.
m.plot_components(forecast)