In [2]:
import numpy as np
import pandas as pd

In [10]:
#IMPORTANT : import the DataPreprocessing class
from DataPreprocessing import DataPreprocessing

In [6]:
# Load the station file. header=None means no row is treated as a header,
# so the six column names are supplied explicitly.
df = pd.read_csv(
    "WorkstationsData/1.csv",
    header=None,
    names=[
        "StationIndex",
        "StationName",
        "Year",
        "Month",
        "Day",
        "Rainfall",
    ],
)

In [7]:
# Preview only the first rows instead of displaying the whole frame.
df.head(10)

Unnamed: 0,StationIndex,StationName,Year,Month,Dat,Rainfall
0,1,Dhaka,1970,1,1,0
1,1,Dhaka,1970,1,2,0
2,1,Dhaka,1970,1,3,0
3,1,Dhaka,1970,1,4,0
4,1,Dhaka,1970,1,5,0
5,1,Dhaka,1970,1,6,0
6,1,Dhaka,1970,1,7,0
7,1,Dhaka,1970,1,8,0
8,1,Dhaka,1970,1,9,0
9,1,Dhaka,1970,1,10,0


In [20]:
# The first step is to instantiate the DataPreprocessing class as follows
DP = DataPreprocessing()

In [25]:
# Only the "Rainfall" column is used; selecting it with a list keeps the
# result a one-column DataFrame rather than a Series.
time_series_df = df.loc[:, ["Rainfall"]]
time_series_df

Unnamed: 0,Rainfall
0,0
1,0
2,0
3,0
4,0
5,0
6,0
7,0
8,0
9,0


In [29]:
# Build X and y: name the column that holds the time series ("Rainfall")
# and give the time horizon (40).
X, y = DP.create_dataset_in_time_series_form(
    entire_dataframe=time_series_df,
    time_series_column="Rainfall",
    time_horizon=40,
)

In [30]:
# There are 40 columns because we want to predict $Y_{t+1}$ based on the 40 previous values of $Y_{t}$.
# Take a look at rows 10 and 11: $Y_{t+1}$ for row 10 is "8", and this same 8 becomes the 40th feature of row 11.
pd.DataFrame(X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39
0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,2,12,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,2,12,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,2,12,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,2,12,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,2,12,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,2,12,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,1,0,0,0,0,0,0,0,2,12,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,1,0,0,0,0,0,0,0,2,12,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,1,0,0,0,0,0,0,0,2,12,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,0,0,0,1,0,0,0,0,0,0,0,2,12,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [31]:
# Take a look at row 10 (counting from 0, of course): $Y_{t+1}$ is 8 in this case.
pd.DataFrame(y)

Unnamed: 0,Y_t_plus_1
0,0
0,0
0,0
0,0
0,0
0,0
0,0
0,0
0,0
0,0


# How to use Scikit-learn with this form of dataset

In [33]:
from sklearn.model_selection import train_test_split

In [37]:
# Hold out a test set using the default split fraction. random_state pins the
# shuffle so the scores below are reproducible across "Restart & Run All".
# NOTE(review): consecutive rows of X are windows shifted by one step, so a
# shuffled split puts overlapping windows on both sides and can flatter the
# test score — consider shuffle=False for a chronological hold-out.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [45]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [54]:
# Linear regression is the model trained in the next cells.
model = LinearRegression()
# Alternative estimator (uncomment to try it instead):
#model = RandomForestRegressor()

In [55]:
# Train on the training split only; the test split is kept for the evaluation below.
model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [56]:
# Score the fitted model on the held-out split.
# NOTE(review): for scikit-learn regressors score() is documented as R^2 — confirm for the installed version.
model.score(X_test, y_test)

0.12858419880917027

In [57]:
# Predictions on the held-out split, used for the RMSE computed below.
y_pred = model.predict(X_test)

In [58]:
from sklearn.metrics import mean_squared_error

In [59]:
# Root mean squared error on the test split: the square root of the MSE.
test_mse = mean_squared_error(y_test, y_pred)
np.sqrt(test_mse)

15.307048434251877

# How to train a Keras model with this form of dataset

In [52]:
import tensorflow.keras as keras

Pass output_form="keras" to DP.create_dataset_in_time_series_form: it then returns the dataset in the shape expected by Keras RNN layers, that is (batch_size, time_horizon, features). Again, the time horizon here is 40.

In [53]:
# Same call as for scikit-learn, plus output_form="keras" to get the
# Keras-shaped dataset.
X, y = DP.create_dataset_in_time_series_form(
    entire_dataframe=time_series_df,
    time_series_column="Rainfall",
    time_horizon=40,
    output_form="keras",
)

In [62]:
# Hold out a test set using the default split fraction. random_state pins the
# shuffle so the split is the same on every run.
# NOTE(review): the windows overlap in time, so a shuffled split can leak
# near-duplicate sequences into the test set — consider shuffle=False for a
# chronological hold-out.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [63]:
# Training split shape, laid out as (samples, time_horizon, features).
X_train.shape

(12804, 40, 1)

In [60]:
# Full Keras-form dataset shape, laid out as (samples, time_horizon, features).
X.shape

(17072, 40, 1)

In [61]:
# Target array shape: one row per sample in X.
y.shape

(17072, 1)

We will create an LSTM with 128 hidden units. In our case, the LSTM will receive sequences of length 40. Each sequence element will have length 1 (since we've only used the "Rainfall" column). Therefore, the input_shape parameter will be (40, 1).

In [67]:
# LSTM regressor: 128 hidden units followed by a 1-unit dense output.
# The input shape is read from the training data — (time_horizon, features),
# i.e. (40, 1) here — so it stays in sync if the time horizon or the number
# of feature columns changes.
model = keras.Sequential()
model.add(keras.layers.LSTM(128, input_shape=X_train.shape[1:]))
model.add(keras.layers.Dense(units=1))

# The loss is MSE and no extra metrics are requested.
model.compile(optimizer='rmsprop',
              loss='mse')

model.fit(X_train, y_train,
          epochs=20,
          batch_size=128)

# Evaluate on the held-out split; the result is inspected in the next cells.
score = model.evaluate(X_test, y_test, batch_size=128)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [68]:
# Value returned by model.evaluate on the test split (the model was compiled with loss='mse').
score

197.75426010517953

In [69]:
# Square root of the test loss (an MSE), i.e. an RMSE in the same units as the target.
np.sqrt(score)

14.062512581511866