In [1]:
# Import all relevant packages
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Bidirectional


# Starting out
## I followed a very detailed guide to predicting weather phenomena
The link to this guide:
https://towardsdatascience.com/weather-forecasting-a-deep-learning-approach-7ecddff0fa71

It explained each step in great detail.
The basic idea was to build RNN (LSTMs which we covered on okctober 27. lecture and had a link to https://colah.github.io/posts/2015-08-Understanding-LSTMs/ which is a great read)

For my data i used https://www.ncdc.noaa.gov/cdo-web/datasets where i could reach a dataset for Budapests average temperature dating back to the year 2000.
I enquire the data, which then they sent by mail in a csv file, which was really helpful.

Additional resources: https://www.hindawi.com/journals/complexity/2020/3536572/

Following the guide, i downloaded their file from which we will work. Note that this works on rainfall and not temperature but the idea is basically the same. So After this I will modify it to my needs 

In [70]:
# Load data from the csv file
DATA_DIR = 'data/rainfall.csv'
dataset = pd.read_csv(DATA_DIR)
# We only need the rainfall column
rainfall_df = dataset[['rainfall']]
rainfall_df

Unnamed: 0,rainfall
0,2.9
1,10.2
2,0.9
3,48.6
4,9.6
...,...
3769,0.0
3770,8.4
3771,5.4
3772,3.2


In [None]:
# Split the Data into a training set and a test set
train_split= 0.9
split_idx = int(len(rainfall_df) * 0.9)
training_set = rainfall_df[:split_idx].values
test_set = rainfall_df[split_idx:].values

This guide created a network which predicts 5 days into the future baes on the last 30 days.

In [None]:
# 5-day prediction using 30 days data
x_train = []
y_train = []
# Next 5 days rainfall forecast
n_future = 5 
# Past 30 days
n_past = 30 
# Fill x_train and y_train (inputs and corresponding outputs)
for i in range(0, len(training_set) - n_past - n_future + 1):
    x_train.append(training_set[i : i + n_past, 0])
    y_train.append(training_set[i + n_past : i + n_past + n_future, 0])

In [None]:
# Reshape the data so it fits into the input of our network
x_train , y_train = np.array(x_train), np.array(y_train)
x_train = np.reshape(x_train, (x_train.shape[0] , x_train.shape[1], 1))

I left all this on default as the guide suggested. the training happened over 300 epochs, with a batch size of 32. 
The model has LSTM(Long short term memories) so all of the previous data can be taken into consideration.
The network also has dropouts as to guard against overfitting.

In [None]:
EPOCHS = 300
BATCH_SIZE = 32
regressor = Sequential()
regressor.add(Bidirectional(LSTM(units=30, return_sequences=True, input_shape = (x_train.shape[1], 1))))
regressor.add(Dropout(0.2))
regressor.add(LSTM(units= 30, return_sequences=True))
regressor.add(Dropout(0.2))
regressor.add(LSTM(units= 30, return_sequences=True))
regressor.add(Dropout(0.2))
regressor.add(LSTM(units= 30))
regressor.add(Dropout(0.2))
regressor.add(Dense(units = n_future, activation='relu'))

In [None]:
# Compiling and fitting the network
regressor.compile(optimizer='adam',loss='mean_squared_error', metrics=['acc'])

regressor.fit(x_train,y_train,epochs=EPOCHS,batch_size=BATCH_SIZE)

After the training we can see the accuracy was around 40%
I took the output out by accident, and didn't have time for another 30 min session.

# Start of Own data training

I took my own data and folloed the steps from the guide, leaving everything as is.

In [71]:
TEMP_DATA_DIR = 'data/average_temperatures_budapest_2000_2020.csv'
dataset = pd.read_csv(TEMP_DATA_DIR)
# this is what it looks like
print(dataset[['date','average_temperature']])

# This is what we are going to use 
temp_df = dataset[['average_temperature']]
temp_df

            date  average_temperature
0       1/1/2000                266.4
1       1/2/2000                268.4
2       1/3/2000                271.2
3       1/4/2000                271.6
4       1/5/2000                271.7
...          ...                  ...
7562  10/16/2020                282.4
7563  10/17/2020                281.0
7564  10/18/2020                281.8
7565  10/19/2020                280.9
7566  10/20/2020                282.1

[7567 rows x 2 columns]


Unnamed: 0,average_temperature
0,266.4
1,268.4
2,271.2
3,271.6
4,271.7
...,...
7562,282.4
7563,281.0
7564,281.8
7565,280.9


In [None]:
# Everything is the same here
train_split= 0.9
split_idx = int(len(temp_df) * 0.9)
training_set = temp_df[:split_idx].values
test_set = temp_df[split_idx:].values

In [None]:
# 5-day prediction using 30 days data
x_train = []
y_train = []
n_future = 5 #Next 5 days rainfall forecast
n_past = 30 #Past 30 days
for i in range(0, len(training_set) - n_past - n_future + 1):
    x_train.append(training_set[i : i + n_past, 0])
    y_train.append(training_set[i + n_past : i + n_past + n_future, 0])

In [None]:
x_train , y_train = np.array(x_train), np.array(y_train)
x_train = np.reshape(x_train, (x_train.shape[0] , x_train.shape[1], 1))

In [None]:
# Build model/Neural network
model = Sequential()
model.add(Bidirectional(LSTM(units=30, return_sequences=True, input_shape = (x_train.shape[1], 1))))
model.add(Dropout(0.2))
model.add(LSTM(units= 30, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(units= 30, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(units= 30))
model.add(Dropout(0.2))
model.add(Dense(units = n_future, activation='relu'))

In [None]:
# Compile and train
EPOCHS = 300
BATCH_SIZE = 32
model.compile(optimizer='adam',loss='mean_squared_error', metrics=['acc'])

model.fit(x_train,y_train,epochs=EPOCHS,batch_size=BATCH_SIZE)

Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x7f2140333470>

In [None]:
x_test = test_set[: n_past, 0]
y_test = test_set[n_past : n_past + n_future, 0]

x_test, y_test = np.array(x_test), np.array(y_test)
x_test = np.reshape(x_test, (1, x_test.shape[0], 1))

predicted_temperature = model.predict(x_test)

print('Predicted temperature {}'.format(predicted_temperature-273.2))
print('Real temperature {}'.format(y_test-273.2))

Predicted temperature [[10.00531  10.00235  10.010254 10.009827 10.017639]]
Real temperature [15.5 13.4 13.8  9.8 12.4]


As we can see the accuracy on this initial train wasn't that great. As the data was in kelvin, i next tried the same thing on the same database that was in Celcius

Try on Celcius


In [10]:
# Load celcius csv
TEMP_DATA_DIR = 'data/average_temperatures_budapest_2000_2020_C.csv'
dataset = pd.read_csv(TEMP_DATA_DIR)
# this is what it looks like
print(dataset[['DATE','TAVG']])

# This is what we are going to use 
temp_df = dataset[['TAVG']]
temp_df

            DATE  TAVG
0       1/1/2000  -6.8
1       1/2/2000  -4.8
2       1/3/2000  -2.0
3       1/4/2000  -1.6
4       1/5/2000  -1.5
...          ...   ...
7562  10/16/2020   9.2
7563  10/17/2020   7.8
7564  10/18/2020   8.6
7565  10/19/2020   7.7
7566  10/20/2020   8.9

[7567 rows x 2 columns]


Unnamed: 0,TAVG
0,-6.8
1,-4.8
2,-2.0
3,-1.6
4,-1.5
...,...
7562,9.2
7563,7.8
7564,8.6
7565,7.7


In [39]:
# everything else stayes the same
train_split= 0.9
split_idx = int(len(temp_df) * 0.9)
training_set = temp_df[:split_idx].values
test_set = temp_df[split_idx:].values

In [55]:
# 3 days predicted (tomorrow, one week later, one month later) prediction using 30 days data
x_train = []
y_train = []
no_of_predicted_days = 3
n_future = 28 + 1 # Furthest date + 1 day.
n_past = 30 #Past 30 days
for i in range(0, len(training_set) - n_past - n_future + 1):
    x_train.append(training_set[i : i + n_past, 0])

    single_y = []
    single_y.append(training_set[i + n_past + 1, 0]) # tomorrow
    single_y.append(training_set[i + n_past + 7, 0]) # one week later
    single_y.append(training_set[i + n_past + 28, 0]) # one week later
    y_train.append(single_y)

In [56]:
# Checking the output
print(y_train)

[[8.0, 3.1, 3.9], [4.5, 5.4, 5.3], [6.8, 6.9, 7.1], [3.3, 4.8, 5.1], [6.6, 2.5, 4.4], [6.7, 4.7, 5.1], [3.1, 5.7, 1.6], [5.4, 3.6, 2.5], [6.9, 0.9, 5.8], [4.8, 2.4, 10.4], [2.5, 2.3, 11.9], [4.7, 1.8, 6.5], [5.7, 1.2, 1.7], [3.6, 2.6, 6.4], [0.9, 0.9, 3.9], [2.4, -1.1, 6.6], [2.3, -1.5, 5.6], [1.8, 0.7, 0.9], [1.2, 5.0, 0.5], [2.6, 4.7, 1.8], [0.9, 4.3, 1.3], [-1.1, 3.9, 3.1], [-1.5, 5.3, 4.8], [0.7, 7.1, 8.1], [5.0, 5.1, 9.3], [4.7, 4.4, 9.5], [4.3, 5.1, 8.0], [3.9, 1.6, 9.1], [5.3, 2.5, 9.8], [7.1, 5.8, 10.1], [5.1, 10.4, 11.0], [4.4, 11.9, 9.6], [5.1, 6.5, 8.0], [1.6, 1.7, 9.4], [2.5, 6.4, 10.1], [5.8, 3.9, 10.9], [10.4, 6.6, 13.2], [11.9, 5.6, 11.4], [6.5, 0.9, 6.7], [1.7, 0.5, 6.3], [6.4, 1.8, 6.5], [3.9, 1.3, 5.7], [6.6, 3.1, 8.1], [5.6, 4.8, 10.4], [0.9, 8.1, 11.3], [0.5, 9.3, 12.8], [1.8, 9.5, 14.7], [1.3, 8.0, 16.5], [3.1, 9.1, 14.0], [4.8, 9.8, 16.6], [8.1, 10.1, 19.1], [9.3, 11.0, 19.9], [9.5, 9.6, 20.4], [8.0, 8.0, 21.3], [9.1, 9.4, 20.4], [9.8, 10.1, 20.4], [10.1, 10.9, 19

In [57]:
x_train , y_train = np.array(x_train), np.array(y_train)
x_train = np.reshape(x_train, (x_train.shape[0] , x_train.shape[1], 1))

I wanted a prediction which takes the previous 180 days(this is around half a year) into account, at first I only did this on the LSTMs (units=180). The final network will have the inputs set to 180 aswell

In [58]:
# Tried with the units parameter at 180 
# Build model/Neural network
model_C = Sequential()
model_C.add(Bidirectional(LSTM(units=180, return_sequences=True, input_shape = (x_train.shape[1], 1))))
model_C.add(Dropout(0.2))
model_C.add(LSTM(units= 180, return_sequences=True))
model_C.add(Dropout(0.2))
model_C.add(LSTM(units= 180, return_sequences=True))
model_C.add(Dropout(0.2))
model_C.add(LSTM(units= 180))
model_C.add(Dropout(0.2))
model_C.add(Dense(units = no_of_predicted_days, activation='relu'))

In [59]:
# Compile and train
EPOCHS = 300
BATCH_SIZE = 32
model_C.compile(optimizer='adam',loss='mean_squared_error', metrics=['acc'])

model_C.fit(x_train,y_train,epochs=EPOCHS,batch_size=BATCH_SIZE)

Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x7f6ca43a9e10>

As we can see the accuracy seems to be pretty high, this is a good sign for our final network. 

My final attempt, using the last 180 days to predict a day, a week and a month into the future.

In [63]:
train_split= 0.9
split_idx = int(len(temp_df) * 0.9)
training_set = temp_df[:split_idx].values
test_set = temp_df[split_idx:].values

In [64]:
# 5-day prediction using 180 days data
x_train = []
y_train = []
no_of_predicted_days = 3
n_future = 28 + 1 
n_past = 180 #Past 30 days
for i in range(0, len(training_set) - n_past - n_future + 1):
    x_train.append(training_set[i : i + n_past, 0])

    single_y = []
    single_y.append(training_set[i + n_past + 1, 0]) # tomorrow
    single_y.append(training_set[i + n_past + 7, 0]) # one week later
    single_y.append(training_set[i + n_past + 28, 0]) # one month later
    y_train.append(single_y)

In [65]:

x_train , y_train = np.array(x_train), np.array(y_train)
x_train = np.reshape(x_train, (x_train.shape[0] , x_train.shape[1], 1))

In [68]:
# Build model/Neural network
model = Sequential()
model.add(Bidirectional(LSTM(units=180, return_sequences=True, input_shape = (x_train.shape[1], 1))))
model.add(Dropout(0.2))
model.add(LSTM(units= 180, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(units= 180, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(units= 180))
model.add(Dropout(0.2))
model.add(Dense(units = no_of_predicted_days, activation='relu'))

In [69]:
# Compile and train
EPOCHS = 300
BATCH_SIZE = 32
model.compile(optimizer='adam',loss='mean_squared_error', metrics=['acc'])

model.fit(x_train,y_train,epochs=EPOCHS,batch_size=BATCH_SIZE)

Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x7f6ca7fe5ba8>

This accuracy seems promising, but since we only had data till the 20th we can only make predictions based on that. Data from this: https://www.wunderground.com/history/daily/hu/budapest/LHBP/date/2020-10-21 site showed the average temperature on the 21st was 50.06 F which is 10.03 C. pretty close im happy with it. From our data we can predict the following temperatures, and when does dates pass we can check how accurate they actually were!.

In [76]:
x_test = test_set[-n_past:, 0]

x_test, y_test = np.array(x_test), np.array(y_test)
x_test = np.reshape(x_test, (1, x_test.shape[0], 1))

predicted_temperature = model_C.predict(x_test)

print('Predicted temperature for the 21st, 28th, and november the 17th:{} '.format(predicted_temperature))


Predicted temperature for the 21st, 28th, and november the 17th:[[11.429388 10.344137  4.295057]] 


End of the 3rd small homework.