# Requirements

In [1]:
!pip install tensorflow keras



In [2]:
import pandas as pd

In [5]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Input, LSTM, Dense

## Introduction
In this laboratory assignment, the focus is on time series forecasting, specifically targeting the prediction of the current **close price** for Bitcoin. To accomplish this, you will use data from the preceding 7 days, and past statistics. 


## The Amazon Stock Price Dataset

The dataset comprises the following columns:
- date - the date of the recorded price
- symbol - the resource for prediction
- open - the open price of BTC
- high - the high price of BTC
- low - the low price of BTC
- volumeBTC - the volume of trades BTC
- volumeUSD - the volume of trades USD
- close - the close price of BTC

Target:
close

Load the dataset into a `pandas` data frame.

In [51]:
df = pd.read_csv('BTC-Daily.csv')
df.sample(5)

Unnamed: 0,date,symbol,open,high,low,close,Volume BTC,Volume USD
846,11/6/2019 0:00,BTC/USD,9319.1,9448.19,9254.68,9344.78,4670.235,43642320.0
2108,5/23/2016 0:00,BTC/USD,438.72,442.91,436.4,442.29,1126275.0,2555.94
1215,11/2/2018 0:00,BTC/USD,6343.85,6381.25,6328.33,6350.43,2678.069,17006890.0
282,5/23/2021 0:00,BTC/USD,37474.34,38311.74,31107.46,34706.79,9376.292,325421000.0
763,1/28/2020 0:00,BTC/USD,8894.57,9413.24,8876.0,9400.0,9565.559,89916260.0


In [53]:
df = df.drop(columns=['symbol'])

Explore the dataset using visualizations of your choice.

# Feauture Extraction
Select the relevant features for prediction and apply a lag of up to 7 days to each chosen feature

Hint: Use `df['column_name'].shift(period)`. Check the documentation at https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.shift.html.

In [15]:
data=df
data['Date'] = pd.to_datetime(data['date'])
data = data.set_index('date')

In [17]:
data = data.sort_values(by='Date', ascending=False)
data.head()

Unnamed: 0_level_0,symbol,open,high,low,close,Volume BTC,Volume USD,Date
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
3/1/2022 0:00,BTC/USD,43221.71,43626.49,43185.48,43185.48,49.006289,2116360.0,2022-03-01
2/28/2022 0:00,BTC/USD,37717.1,44256.08,37468.99,43178.98,3160.61807,136472300.0,2022-02-28
2/27/2022 0:00,BTC/USD,39146.66,39886.92,37015.74,37712.68,1701.817043,64180080.0,2022-02-27
2/26/2022 0:00,BTC/USD,39242.64,40330.99,38600.0,39146.66,912.724087,35730100.0,2022-02-26
2/25/2022 0:00,BTC/USD,38360.93,39727.97,38027.61,39231.64,2202.851827,86421490.0,2022-02-25


In [27]:
data = data[["close"]].copy()
lag = 7
periods = range(lag, 0, -1)
data.shift(periods=periods)

Unnamed: 0_level_0,close_7,close_6,close_5,close_4,close_3,close_2,close_1
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3/1/2022 0:00,,,,,,,
2/28/2022 0:00,,,,,,,43185.48
2/27/2022 0:00,,,,,,43185.48,43178.98
2/26/2022 0:00,,,,,43185.48,43178.98,37712.68
2/25/2022 0:00,,,,43185.48,43178.98,37712.68,39146.66
...,...,...,...,...,...,...,...
12/2/2014 0:00,350.49,364.61,376.87,375.07,374.95,365.20,376.67
12/1/2014 0:00,364.61,376.87,375.07,374.95,365.20,376.67,379.25
11/30/2014 0:00,376.87,375.07,374.95,365.20,376.67,379.25,378.39
11/29/2014 0:00,375.07,374.95,365.20,376.67,379.25,378.39,373.34


In [29]:
data = pd.concat([data, data.shift(periods=periods)], axis=1)
data.head()

Unnamed: 0_level_0,close,close_7,close_6,close_5,close_4,close_3,close_2,close_1
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
3/1/2022 0:00,43185.48,,,,,,,
2/28/2022 0:00,43178.98,,,,,,,43185.48
2/27/2022 0:00,37712.68,,,,,,43185.48,43178.98
2/26/2022 0:00,39146.66,,,,,43185.48,43178.98,37712.68
2/25/2022 0:00,39231.64,,,,43185.48,43178.98,37712.68,39146.66


In [31]:
data.dropna(axis=0, inplace=True)
data.head()

Unnamed: 0_level_0,close,close_7,close_6,close_5,close_4,close_3,close_2,close_1
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2/22/2022 0:00,38269.94,43185.48,43178.98,37712.68,39146.66,39231.64,38376.88,37274.18
2/21/2022 0:00,37076.6,43178.98,37712.68,39146.66,39231.64,38376.88,37274.18,38269.94
2/20/2022 0:00,38373.9,37712.68,39146.66,39231.64,38376.88,37274.18,38269.94,37076.6
2/19/2022 0:00,40109.02,39146.66,39231.64,38376.88,37274.18,38269.94,37076.6,38373.9
2/18/2022 0:00,39996.99,39231.64,38376.88,37274.18,38269.94,37076.6,38373.9,40109.02


## Dataset Splitting
Partition the dataset into training and testing sets with an 80:20 ratio.

**WARNING: DO NOT SHUFFLE THE DATASET.**



In [39]:
X, y = data.drop(columns=["close"]), data["close"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

In [65]:
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [69]:
X_train.shape[0]

2115

In [79]:
(X_train.shape[0], lag, (X_train.shape[1] // lag))

(2115, 7, 1)

In [75]:
X_train = X_train.reshape(X_train.shape[0], lag, (X_train.shape[1] // lag))
X_test = X_test.reshape(X_test.shape[0], lag, (X_test.shape[1] // lag))

In [81]:
X_train

array([[[6.36872729e-01],
        [6.36775890e-01],
        [5.55336582e-01],
        ...,
        [5.77966706e-01],
        [5.65232121e-01],
        [5.48803619e-01]],

       [[6.36775890e-01],
        [5.55336582e-01],
        [5.76700638e-01],
        ...,
        [5.65232121e-01],
        [5.48803619e-01],
        [5.63638883e-01]],

       [[5.55336582e-01],
        [5.76700638e-01],
        [5.77966706e-01],
        ...,
        [5.48803619e-01],
        [5.63638883e-01],
        [5.45859986e-01]],

       ...,

       [[5.63160792e-05],
        [2.24817364e-04],
        [2.25860254e-04],
        ...,
        [2.92456253e-04],
        [2.61169542e-04],
        [2.71002508e-04]],

       [[2.24817364e-04],
        [2.25860254e-04],
        [2.63851260e-04],
        ...,
        [2.61169542e-04],
        [2.71002508e-04],
        [2.54316262e-04]],

       [[2.25860254e-04],
        [2.63851260e-04],
        [2.92456253e-04],
        ...,
        [2.71002508e-04],
        [2.5431

## Neural Networks

Create an LSTM model and train it using the `train` function.

In [85]:
model = Sequential([
    Input((lag, (X_train.shape[1] // lag))), # (timesteps, features)
    LSTM(64, activation="relu", return_sequences=True),
    LSTM(32, activation="relu"),
    Dense(1, activation="linear")
])

In [87]:
model.compile(
    loss="mean_squared_error",
    optimizer="adam",
    metrics=["mean_squared_error"],
)

In [91]:
history = model.fit(X_train, y_train, validation_split=0.2, epochs=64, batch_size=8)

Epoch 1/64
[1m212/212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 15ms/step - loss: 486808704.0000 - mean_squared_error: 486808704.0000 - val_loss: 32073290.0000 - val_mean_squared_error: 32073290.0000
Epoch 2/64
[1m212/212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - loss: 12359125.0000 - mean_squared_error: 12359125.0000 - val_loss: 16223629.0000 - val_mean_squared_error: 16223629.0000
Epoch 3/64
[1m212/212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step - loss: 10216777.0000 - mean_squared_error: 10216777.0000 - val_loss: 17940128.0000 - val_mean_squared_error: 17940128.0000
Epoch 4/64
[1m212/212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - loss: 11739503.0000 - mean_squared_error: 11739503.0000 - val_loss: 12787282.0000 - val_mean_squared_error: 12787282.0000
Epoch 5/64
[1m212/212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 12ms/step - loss: 10533970.0000 - mean_squared_error: 10533970.0000 - val_lo

Use the trained model to make predictions for the test set.

In [103]:
pred_y = model.predict(X_test)

[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step


Assess the performance of the model by using different metrics provided by the `scikit-learn` library.

In [105]:
r2_score(y_test, pred_y)

-14.084100588668914

# Additional Bonus Task

Group the data by month. You can use [pandas.Grouper](https://pandas.pydata.org/docs/reference/api/pandas.Grouper.html) function.

Create an LSTM model to predict the 'close' price on a montly frequency.