In [9]:
import pandas as pd
from pandas_profiling import ProfileReport
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import GridSearchCV
import os,sys
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [10]:
df = pd.read_csv("Data_RH.csv", header=0, sep=';')

In [11]:
df.shape

(38658, 6)

# Some statistics

In [8]:
prof=ProfileReport(df)
prof.to_file(output_file='Profile_RH')

# Data split

In [12]:
Multivariate Time Series Forecasting using RNN(LSTM)
I was trying to forecast the future values of a variable where it not only depends on the previous values of itself but it also depends on the previous/current values of the other variables. In that case we consider that as multivariate time series forecasting problem.

EXAMPLE:

Let us consider a shop which is trying to sell two different Indian snacks i.e. Samosa and Kachori. He wants to forecast the number of samosas he must prepare next day to fulfill the demands of the customers. In this case let me give you a realistic example.

Samosa(available-yes),kachori(available-yes):

Consider a customer who has come with an intention to buy 10 samosas but as kachoris were also available in the shop. He ended up ordering 5 samosa and 5 kachori . The sales of samosa dipped down because of the kachoris availability.

Samosa(available-yes),kachori(available-no)

Consider the same customer has come to the same shop with the intention of buying 5 samosas and 5 kachoris but because of the unavailability he ended up buying 10 samosas . The sales of samosas increased because of the unavailability of kachoris. The same could happen vice-versa.

In the case above the sales of samosa is not only dependent on its previous sales but also dependent on the current and past sales of kachori. Hence, it becomes multi-variate time series problem. Hope, it sounds clean and clear now.
# set aside 20% of train and test data for evaluation
X_train, X_test, y_train, y_test = train_test_split(x, y,
    test_size=0.2, shuffle = True, random_state = 8)

# Use the same function above for the validation set
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, 
    test_size=0.25, random_state= 8) # 0.25 x 0.8 = 0.2


print("X_train shape: {}".format(X_train.shape))
print("X_test shape: {}".format(X_test.shape))
print("y_train shape: {}".format(y_train.shape))
print("y_test shape: {}".format(y_test.shape))
print("X_val shape: {}".format(X_val.shape))
print("y_val shape: {}".format(y_val.shape))

X_train shape: (23194, 4)
X_test shape: (7732, 4)
y_train shape: (23194, 1)
y_test shape: (7732, 1)
X_val shape: (7732, 4)
y_val shape: (7732, 1)


# LinearRegression

In [13]:
model=LinearRegression()
model.fit(X_train,y_train)

LinearRegression()

In [14]:
prediction = model.predict(X_val)

In [15]:
print('Variance score: %.2f' % model.score(X_val, y_val))
print("Mean absolute error: %.2f" % np.mean(np.absolute(prediction - y_val)))
print("Residual sum of squares (MSE): %.2f" % np.mean((prediction - y_val) ** 2))
print("R2-score: %.2f" % r2_score(y_val,prediction ) )

Variance score: 0.94
Mean absolute error: 2.96
Residual sum of squares (MSE): 31.60
R2-score: 0.94


  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


# DecisionTree

In [16]:
regressor=DecisionTreeRegressor(random_state=0)
regressor.fit(X_train,y_train)

DecisionTreeRegressor(random_state=0)

In [18]:
prediction=regressor.predict(X_val)

In [21]:

prediction=prediction.reshape(7732,1)


In [23]:
print('Variance score: %.2f' % regressor.score(X_val, y_val))
print("Mean absolute error: %.2f" % np.mean(np.absolute(prediction - y_val)))
print("Residual sum of squares (MSE): %.2f" % np.mean((prediction - y_val) ** 2))
print("R2-score: %.2f" % r2_score(y_val,prediction ) )

Variance score: 0.90
Mean absolute error: 3.39
Residual sum of squares (MSE): 51.76
R2-score: 0.90


  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


# RandomForest

In [24]:
regr=RandomForestRegressor(max_depth=90,random_state=0,n_estimators=100)
regr.fit(X_train,y_train)

  regr.fit(X_train,y_train)


RandomForestRegressor(max_depth=90, random_state=0)

In [25]:
prediction=regr.predict(X_val)

In [26]:
prediction=prediction.reshape(7732,1)


In [27]:
print('Variance score: %.2f' % regr.score(X_val, y_val))
print("Mean absolute error: %.2f" % np.mean(np.absolute(prediction - y_val)))
print("Residual sum of squares (MSE): %.2f" % np.mean((prediction - y_val) ** 2))
print("R2-score: %.2f" % r2_score(y_val,prediction ) )

Variance score: 0.95
Mean absolute error: 2.50
Residual sum of squares (MSE): 24.68
R2-score: 0.95


  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


# SVR

In [28]:
reg = SVR(kernel = 'rbf')
reg.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)


SVR()

In [29]:
prediction=regr.predict(X_val)

In [30]:
prediction=prediction.reshape(7732,1)


In [31]:
print('Variance score: %.2f' % reg.score(X_val, y_val))
print("Mean absolute error: %.2f" % np.mean(np.absolute(prediction - y_val)))
print("Residual sum of squares (MSE): %.2f" % np.mean((prediction - y_val) ** 2))
print("R2-score: %.2f" % r2_score(y_val,prediction ) )

Variance score: 0.94
Mean absolute error: 2.50
Residual sum of squares (MSE): 24.68
R2-score: 0.95


  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


# Normalization

In [32]:
scaler = MinMaxScaler()
train_scaled = scaler.fit_transform(X_train)
val_scaled = scaler.transform(X_val)
test_scaled = scaler.transform(X_test)

train_scaled, y_train = np.array(train_scaled), np.array(y_train)
test_scaled,  y_test  = np.array(test_scaled),  np.array(y_test)
val_scaled,  y_val  = np.array(val_scaled),  np.array(y_val)

# ANN

In [174]:
def DNN(shape):
  
    model = keras.models.Sequential()
    model.add(keras.layers.Input(shape, name="InputLayer"))
    model.add(keras.layers.Dense(289, activation='relu', name='Dense_n1'))
    model.add(keras.layers.Dense(486, activation='relu', name='Dense_n2'))
    model.add(keras.layers.Dense(245, activation='relu', name='Dense_n3'))
    model.add(keras.layers.Dense(1, name='Output'))
  
    model.compile(optimizer = keras.optimizers.Adam(learning_rate = 0.001),
                loss      = 'mse',
                metrics   = ['mae', 'mse'] )
    return model

In [264]:
model=DNN( (4,) )

model.summary()

Model: "sequential_41"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Dense_n1 (Dense)            (None, 215)               1075      
                                                                 
 Dense_n2 (Dense)            (None, 484)               104544    
                                                                 
 Dense_n3 (Dense)            (None, 245)               118825    
                                                                 
 Dense_n4 (Dense)            (None, 255)               62730     
                                                                 
 Output (Dense)              (None, 1)                 256       
                                                                 
Total params: 287,430
Trainable params: 287,430
Non-trainable params: 0
_________________________________________________________________


In [None]:
history = model.fit(train_scaled,
                    y_train,
                    epochs          = 30,
                    validation_data = (X_val, y_val))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30

In [None]:
#df=pd.DataFrame(data=history.history)
#display(df) 

In [None]:
prediction=model.predict(val_scaled)

In [None]:
print("Mean absolute error: %.2f" % np.mean(np.absolute(prediction - y_val)))
print("Residual sum of squares (MSE): %.2f" % np.mean((prediction - y_val) ** 2))
print("R2-score: %.2f" % r2_score(y_val,prediction ) )