### Import necessary libraries

In [1]:
import json
import os

import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sb
# from fbprophet import Prophet
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import MinMaxScaler
plt.style.use('fivethirtyeight') # For plots
sb.set()

### Extracting Data

Here, we extract the data we want to explore. <br>
1. **Bitcoin** <br>
2. **Dogecoin** <br>

In [2]:
# Prefer supplying the key through the ALPHAVANTAGE_API_KEY environment variable.
# The literal fallback only keeps existing runs working; it is already public in this
# notebook and should be rotated and removed.
api_key = os.environ.get('ALPHAVANTAGE_API_KEY', '925GOXIZ940U7PA6')
function = 'DIGITAL_CURRENCY_DAILY'
symbol = 'DOGE'
market = 'SGD'
url = f'https://www.alphavantage.co/query?function={function}&symbol={symbol}&market={market}&apikey={api_key}'

# A timeout stops the cell from hanging forever; raise_for_status surfaces HTTP errors here
# instead of as a confusing JSON/KeyError further down.
response_Doge = requests.get(url, timeout=30)
response_Doge.raise_for_status()
response_Doge = response_Doge.json()

# The cleaning cell indexes this key directly, so fail early with the payload in the message
# if the service answered with something else (e.g. a rate-limit or error notice).
if 'Time Series (Digital Currency Daily)' not in response_Doge:
    raise RuntimeError(f'No daily time series returned for {symbol}/{market}: {response_Doge}')

In [3]:
# Prefer supplying the key through the ALPHAVANTAGE_API_KEY environment variable.
# The literal fallback only keeps existing runs working; it is already public in this
# notebook and should be rotated and removed.
api_key = os.environ.get('ALPHAVANTAGE_API_KEY', '925GOXIZ940U7PA6')
function = 'DIGITAL_CURRENCY_DAILY'
symbol = 'BTC'
market = 'SGD'
url = f'https://www.alphavantage.co/query?function={function}&symbol={symbol}&market={market}&apikey={api_key}'

# A timeout stops the cell from hanging forever; raise_for_status surfaces HTTP errors here
# instead of as a confusing JSON/KeyError further down.
response_BTC = requests.get(url, timeout=30)
response_BTC.raise_for_status()
response_BTC = response_BTC.json()

# The cleaning cell indexes this key directly, so fail early with the payload in the message
# if the service answered with something else (e.g. a rate-limit or error notice).
if 'Time Series (Digital Currency Daily)' not in response_BTC:
    raise RuntimeError(f'No daily time series returned for {symbol}/{market}: {response_BTC}')

### Data Cleaning on Bitcoin & Dogecoin

In [4]:
# Build one row per day: the payload is keyed by date, so transpose it, move the
# date index into a regular column and name that column 'Date'.
BTC_resp = (
    pd.DataFrame(response_BTC['Time Series (Digital Currency Daily)'])
    .T
    .reset_index(level=0)
    .rename(columns={'index': 'Date'})
)

# Same reshaping for Dogecoin.
Doge_resp = (
    pd.DataFrame(response_Doge['Time Series (Digital Currency Daily)'])
    .T
    .reset_index(level=0)
    .rename(columns={'index': 'Date'})
)

In [5]:
BTC_resp.head()

Unnamed: 0,Date,1a. open (SGD),1b. open (USD),2a. high (SGD),2b. high (USD),3a. low (SGD),3b. low (USD),4a. close (SGD),4b. close (USD),5. volume,6. market cap (USD)
0,2021-07-05,47442.7734972,35288.13,47450.3695832,35293.78,45710.96,34000.0,46096.8949464,34287.06,27861.88816,27861.88816
1,2021-07-04,46610.5516928,34669.12,48356.616254,35967.85,46191.126746,34357.15,47440.5955044,35286.51,43703.475789,43703.475789
2,2021-07-03,45423.9758376,33786.54,46982.2759084,34945.61,44792.3444812,33316.73,46610.5651372,34669.13,43044.578641,43044.578641
3,2021-07-02,45041.8725452,33502.33,45680.0916576,33977.04,43961.84356,32699.0,45423.989282,33786.55,56172.181378,56172.181378
4,2021-07-01,47115.8998,35045.0,47132.7994108,35057.57,43977.97684,32711.0,45045.0454236,33504.69,71708.266112,71708.266112


In [6]:
Doge_resp.head()

Unnamed: 0,Date,1a. open (SGD),1b. open (USD),2a. high (SGD),2b. high (USD),3a. low (SGD),3b. low (USD),4a. close (SGD),4b. close (USD),5. volume,6. market cap (USD)
0,2021-07-05,0.33121624,0.24636,0.33476556,0.249,0.32247738,0.23986,0.32660481,0.24293,424332505.4,424332505.4
1,2021-07-04,0.33118935,0.24634,0.34014332,0.253,0.32702159,0.24324,0.33131035,0.24643,823044174.7,823044174.7
2,2021-07-03,0.32936091,0.24498,0.33663433,0.25039,0.32535448,0.242,0.33121624,0.24636,759251070.1,759251070.1
3,2021-07-02,0.32777447,0.2438,0.33498067,0.24916,0.32050105,0.23839,0.32945502,0.24505,1081847381.6,1081847381.6
4,2021-07-01,0.34135332,0.2539,0.35143662,0.2614,0.32020527,0.23817,0.32778792,0.24381,2253058516.9,2253058516.9


We are not interested in the `Volume` and `Market Cap`. Therefore we drop them. <br>
Also, we drop all `USD` values as we want to base on `SGD` values.

In [7]:
# Volume, market cap and every USD-denominated price are removed; only the SGD prices remain.
unwanted_cols = [
    '5. volume',
    '6. market cap (USD)',
    '1b. open (USD)',
    '2b. high (USD)',
    '3b. low (USD)',
    '4b. close (USD)',
]
BTC_resp = BTC_resp.drop(columns=unwanted_cols)
Doge_resp = Doge_resp.drop(columns=unwanted_cols)

In [8]:
BTC_resp.head()

Unnamed: 0,Date,1a. open (SGD),2a. high (SGD),3a. low (SGD),4a. close (SGD)
0,2021-07-05,47442.7734972,47450.3695832,45710.96,46096.8949464
1,2021-07-04,46610.5516928,48356.616254,46191.126746,47440.5955044
2,2021-07-03,45423.9758376,46982.2759084,44792.3444812,46610.5651372
3,2021-07-02,45041.8725452,45680.0916576,43961.84356,45423.989282
4,2021-07-01,47115.8998,47132.7994108,43977.97684,45045.0454236


In [9]:
Doge_resp.head()

Unnamed: 0,Date,1a. open (SGD),2a. high (SGD),3a. low (SGD),4a. close (SGD)
0,2021-07-05,0.33121624,0.33476556,0.32247738,0.32660481
1,2021-07-04,0.33118935,0.34014332,0.32702159,0.33131035
2,2021-07-03,0.32936091,0.33663433,0.32535448,0.33121624
3,2021-07-02,0.32777447,0.33498067,0.32050105,0.32945502
4,2021-07-01,0.34135332,0.35143662,0.32020527,0.32778792


We convert the Date column to a `datetime` format, and convert the rest of the columns to `float` as they are currently typed as an object.

In [10]:
for i in BTC_resp.columns[1:]:
    print(i)

1a. open (SGD)
2a. high (SGD)
3a. low (SGD)
4a. close (SGD)


In [None]:
# The dates arrive as 'YYYY-MM-DD' strings (e.g. '2021-07-05'), so the format must use
# dashes. The previous '%Y/%m/%d' does not describe the data and is rejected by pandas
# versions that enforce `format` strictly.
BTC_resp['Date'] = pd.to_datetime(BTC_resp['Date'], format='%Y-%m-%d')
# Every column after 'Date' holds a price stored as a string -> cast them all to float.
BTC_resp = BTC_resp.astype({col: float for col in BTC_resp.columns[1:]})

Doge_resp['Date'] = pd.to_datetime(Doge_resp['Date'], format='%Y-%m-%d')
Doge_resp = Doge_resp.astype({col: float for col in Doge_resp.columns[1:]})

In [None]:
BTC_resp.info()

In [None]:
Doge_resp.info()

Now that we have the data prepared in the formats we want, we can perform some EDA on it.

In [None]:
def showPlot(df):
    """Draw the open/high/low/close SGD prices of ``df`` against ``df['Date']`` on one figure.

    ``df`` must contain a 'Date' column plus the four '<n>a. ... (SGD)' price columns.
    The figure is shown immediately; nothing is returned.
    """
    plt.figure(figsize=(24,12))

    # (column to plot, keyword arguments for the line); 'Open' keeps seaborn's default colour.
    series_specs = (
        ('1a. open (SGD)', {'label': 'Open'}),
        ('2a. high (SGD)', {'label': 'High', 'color': 'g'}),
        ('3a. low (SGD)', {'label': 'Low', 'color': 'orange'}),
        ('4a. close (SGD)', {'label': 'Close', 'color': 'r'}),
    )
    for column, line_kwargs in series_specs:
        sb.lineplot(data=df, x=df['Date'], y=df[column], **line_kwargs)

    plt.xlabel("Date")
    plt.ylabel("Price")

    # One tick every second month, formatted as day-month-year.
    date_axis = plt.gca().xaxis
    date_axis.set_major_locator(mdates.MonthLocator(interval=2))
    date_axis.set_major_formatter(mdates.DateFormatter("%d-%m-%y"))
    plt.gcf().autofmt_xdate()
    plt.show()

In [None]:
showPlot(BTC_resp)

In [None]:
showPlot(Doge_resp)

In [None]:
BTC_resp.describe()

In [None]:
Doge_resp.describe()

We want to see the correlation between Bitcoin and Dogecoin... <br>
We decided to stick with one variable which was the opening price. <br>
Hence we dropped all columns except for the Date and Open columns.

In [None]:
def _open_price_frame(api_response, column_name):
    """Return a (Date, <column_name>) frame of daily SGD opening prices.

    api_response: the decoded JSON payload (must contain 'Time Series (Digital Currency Daily)')
    column_name:  name given to the opening-price column in the result
    """
    daily = pd.DataFrame(api_response['Time Series (Digital Currency Daily)']).T
    # Select the one column we need rather than dropping every other column by name.
    opens = daily[['1a. open (SGD)']].astype(float).reset_index()
    opens.columns = ['Date', column_name]
    # Dates arrive as 'YYYY-MM-DD', so the format must use dashes.
    opens['Date'] = pd.to_datetime(opens['Date'], format='%Y-%m-%d')
    return opens


# The raw responses are deliberately NOT overwritten here, so this cell (and the earlier
# cleaning cell that reads the same responses) can be re-run in any order.
df_Doge = _open_price_frame(response_Doge, 'Open (Doge)')
df_BTC = _open_price_frame(response_BTC, 'Open (BTC)')

In [None]:
df_BTC

In [None]:
df_Doge

### Let's Concat BTC & Doge 

In [None]:
# Join both of them into one df, matching rows on their calendar date.
# Pasting the frames side by side by row position only pairs the right prices while
# both series share exactly the same dates row for row; merging on 'Date' does not
# rely on that. Days present in only one series get NaN in the other column.
joint_df = (
    pd.merge(df_BTC, df_Doge, on='Date', how='outer')
    .sort_values('Date', ascending=False)  # keep the newest-first order of the inputs
    .reset_index(drop=True)
)
joint_df.info()

In [None]:
joint_df.isna().sum()

In [None]:
# Keep only the rows in which no column is missing, then confirm nothing is left to drop
joint_df = joint_df.dropna(axis=0, how='any')
joint_df.isna().sum()

In [None]:
joint_df.head()

In [None]:
# Put the rows in chronological order (oldest first) and renumber them.
# Sorting on 'Date' gives the same frame however many times the cell is run,
# whereas reversing with iloc[::-1] flips the order back on every second run.
joint_df = joint_df.sort_values('Date').reset_index(drop=True)

In [None]:
joint_df

### Let's do some EDA!

In [None]:
joint_df.describe()

Here, we can see the central tendencies of the variables we are interested in!

#### We shall plot these variables onto a line plot to better visualize the data!

In [None]:
def timeSeriesPlot(var, color=None, dates=None):
    '''
    Plot one series against time on a wide figure and show it.

    var: accept col of df -> y axis of timeseries
    color: accept str (matplotlib colour); None leaves the colour to matplotlib
    dates: x axis values, same length as var; defaults to joint_df['Date']
    '''
    if dates is None:
        # Backward-compatible default: the notebook-level joined frame.
        dates = joint_df['Date']

    plt.figure(figsize=(24,12))
    plt.plot(dates, var, color=color)

    # One tick per month, formatted as day-month-year.
    ax = plt.gca()
    ax.xaxis.set_major_locator(mdates.MonthLocator(interval=1))
    ax.xaxis.set_major_formatter(mdates.DateFormatter("%d-%m-%y"))
    plt.gcf().autofmt_xdate()
    plt.show()

In [None]:
timeSeriesPlot(joint_df['Open (BTC)'])

In [None]:
timeSeriesPlot(joint_df['Open (Doge)'], 'r')

For now, we can only plot them separately as the differences in **Bitcoin** and **Dogecoin** are `EXTREMELY LARGE`... <br>
In order to visualize both of them together, we first need to `normalize` them.

### Let's Normalize the Data!

We will normalize the data using the `MinMaxScaler` class from `sklearn`!

In [None]:
joint_df2 = joint_df.drop(['Date'], axis=1)
joint_df2 = joint_df2.reset_index(drop=True)
joint_df2.head()

In [None]:
# Data Prep: pull each coin's values out as a 2-D array by dropping the other coin's column
btc_values = joint_df2.drop(columns=['Open (Doge)']).values
doge_values = joint_df2.drop(columns=['Open (BTC)']).values

# Normalizing: the scaler is re-fitted for each coin, so each is mapped to [0, 1]
# using its own minimum and maximum
scaler = MinMaxScaler(feature_range=(0,1))
BTC_T = pd.DataFrame(scaler.fit_transform(btc_values), columns=['BTC_Norm'])
Doge_T = pd.DataFrame(scaler.fit_transform(doge_values), columns=['Doge_Norm'])

In [None]:
joint_df2 = pd.concat([joint_df, BTC_T, Doge_T], axis=1)

In [None]:
joint_df2

Now that the data is normalized, we can plot them on the same graph!

In [None]:
# Both normalised series share one figure and one y-scale.
plt.figure(figsize=(24,12))
for norm_col, line_colour, legend_name in (
    ('BTC_Norm', 'green', 'BTC'),
    ('Doge_Norm', 'orange', 'Doge'),
):
    sb.lineplot(x=joint_df2['Date'], y=joint_df2[norm_col], color=line_colour, label=legend_name)

# Monthly ticks formatted as day-month-year
ax = plt.gca()
ax.xaxis.set_major_locator(mdates.MonthLocator(interval=1))
ax.xaxis.set_major_formatter(mdates.DateFormatter("%d-%m-%y"))
plt.gcf().autofmt_xdate()
plt.show()

In [None]:
joint_df2.describe()

### We will use LSTM to predict for our model

We will try to predict Bitcoin first...

In [None]:
BTC = joint_df.drop(['Date', 'Open (Doge)'], axis=1)

In [None]:
# Learn the min/max from the training portion only, then apply that scaling to the
# whole series. Fitting on the full series would let the test period's price range
# leak into the features the model is trained on.
# NOTE: scaled values in the test period can therefore fall outside [0, 1].
scaler = MinMaxScaler(feature_range=(0,1))
BTC_values = BTC.values
n_fit_rows = int(len(BTC_values) * 0.8)  # must stay equal to the 80/20 train/test split
scaler.fit(BTC_values[:n_fit_rows])
BTC_T = scaler.transform(BTC_values)

### Train/Test Split

We split our data into 80% train data and 20% test data, keeping the rows in time order.

In [None]:
# 80 - 20 split, in time order: the first 80% of rows train, the remainder tests
n_rows = len(BTC_T)
train_size = int(n_rows * 0.8)
test_size = int(n_rows - train_size)
train = BTC_T[:train_size, :]
test = BTC_T[train_size:n_rows, :]

In [None]:
def create_features(data, look_back):
    """Turn a series into sliding-window samples for one-step-ahead prediction.

    Only column 0 of ``data`` is used. Sample ``i`` pairs the ``look_back`` consecutive
    values starting at row ``i`` with the value that immediately follows them.

    data:      2-D array-like, one row per time step
    look_back: number of past values in each input window

    Returns ``(X, Y)`` where ``X`` has shape ``(n_samples, look_back)`` and ``Y`` has shape
    ``(n_samples,)`` with ``n_samples = max(len(data) - look_back, 0)``.
    """
    series = np.asarray(data)[:, 0]
    # Every start index whose target row (i + look_back) still exists is a valid sample;
    # stopping one earlier would silently discard the most recent observation.
    n_samples = max(len(series) - look_back, 0)
    windows = [series[start:start + look_back] for start in range(n_samples)]
    # The explicit reshape keeps X two-dimensional even when there are no samples.
    X = np.array(windows).reshape(n_samples, look_back)
    Y = np.array(series[look_back:look_back + n_samples])
    return X, Y

Here, we reshape the train & test sets into the format that is accepted by the LSTM function.

In [None]:
look_back = 1
X_train, Y_train = create_features(train, look_back)
X_test, Y_test = create_features(test, look_back)

# Insert a length-1 middle axis: (samples, look_back) -> (samples, 1, look_back)
X_train = X_train[:, np.newaxis, :]
X_test = X_test[:, np.newaxis, :]

In [None]:
# Import Essential Libs
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras.callbacks import ModelCheckpoint

In [None]:
# Set seed
tf.random.set_seed(11)
np.random.seed(11)

We set the parameters for the model. The dropout is used to reduce overfitting. The batch size is set to 24, so each weight update uses 24 consecutive daily samples (the data is not shuffled).

In [None]:
# Build Model: a single LSTM layer, dropout, then one linear output unit
recurrent_layer = LSTM(units = 2000, activation = 'relu',
                       input_shape = (X_train.shape[1], look_back))

model = Sequential([
    recurrent_layer,
    Dropout(0.2),                      # drops 20% of the LSTM outputs during training
    Dense(1, activation = 'linear'),   # single regression output
])

model.compile(optimizer = 'adam', loss = 'mean_squared_error')

In [None]:
filepath = 'saved_models_btc/model_epoch_{epoch:02d}.hdf5'

# Write a checkpoint only for epochs whose validation loss beats every earlier epoch.
checkpoint_options = dict(
    filepath = filepath,
    monitor = 'val_loss',
    mode ='min',
    save_best_only = True,
    verbose = 1,
)
checkpoint = ModelCheckpoint(**checkpoint_options)

# shuffle = False keeps the samples in time order within and across batches.
fit_options = dict(
    epochs = 100,
    batch_size = 24,
    validation_data = (X_test, Y_test),
    callbacks = [checkpoint],
    verbose = 1,
    shuffle = False,
)
history = model.fit(X_train, Y_train, **fit_options)

model.summary()

In [None]:
from keras.models import load_model

# Load the checkpoint of the epoch with the lowest validation loss instead of a
# hard-coded epoch number. With save_best_only=True that epoch always has a file on disk.
# Epoch numbers in the file name start at 1, hence the +1. Re-using `filepath` also keeps
# the forward-slash path used when saving (the old backslash path was Windows-only and
# contained an invalid '\m' escape).
best_epoch = int(np.argmin(history.history['val_loss'])) + 1
best_model = load_model(filepath.format(epoch = best_epoch))

# Predicting and inverse transforming the predictions
train_predict = best_model.predict(X_train)
Y_hat_train = scaler.inverse_transform(train_predict)

test_predict = best_model.predict(X_test)
Y_hat_test = scaler.inverse_transform(test_predict)

# Inverse transforming the actual values, to return them to their original values.
# The scaler was fitted on a single column, so the 1-D targets are passed as
# (n_samples, 1) columns rather than as one (1, n_samples) row.
# NOTE: this overwrites Y_test / Y_train, so run this cell only once per training run.
Y_test = scaler.inverse_transform(Y_test.reshape(-1, 1))
Y_train = scaler.inverse_transform(Y_train.reshape(-1, 1))

In [None]:
# Give the actual values the same shape as the predictions so they can be compared
# element by element. The predictions are left as they are: reshaping an array to its
# own shape changes nothing. The ndarray.reshape method is used because the `newshape=`
# keyword of np.reshape is deprecated in recent NumPy releases.
Y_train = Y_train.reshape(Y_hat_train.shape)
Y_test = Y_test.reshape(Y_hat_test.shape)

The RMSE of the train & test sets can be expected to be high, as cryptocurrencies are extremely volatile and unpredictable...

In [None]:
from sklearn.metrics import mean_squared_error

# RMSE = square root of the mean squared error, computed the same way for both sets
train_RMSE, test_RMSE = (
    np.sqrt(mean_squared_error(actual, predicted))
    for actual, predicted in ((Y_train, Y_hat_train), (Y_test, Y_hat_test))
)

print('Train RMSE is: ')
print(train_RMSE, '\n')
print('Test RMSE is: ')
print(test_RMSE)

In [None]:
# Flatten each piece and join train followed by test into one 1-D array
Y = np.concatenate((np.ravel(Y_train), np.ravel(Y_test)))
Y_hat = np.concatenate((np.ravel(Y_hat_train), np.ravel(Y_hat_test)))

In [None]:
# Actual and predicted prices side by side, train rows first and test rows after
result_df = pd.DataFrame({'Actual_Y': Y, 'Predicted_Y': Y_hat})

result_df

In [None]:
# Actual vs. predicted price, plotted against the sample number (0, 1, 2, ...)
# rather than a calendar date, so no date formatting is applied to the x axis.
plt.figure(figsize=(24,12))
sample_number = np.arange(len(result_df))
sb.lineplot(x=sample_number, y=result_df['Actual_Y'], color='blue', label='Actual_Y')
sb.lineplot(x=sample_number, y=result_df['Predicted_Y'], color='orange', label='Predicted_Y')

From the results, we can see that the predicted values trend follows the actual values. However, at certain points in time, the model is overestimating/underestimating the intrinsic value of BTC.