In [None]:
import datetime

import numpy as np
import pandas as pd
import tensorflow as tf
import yfinance as yf
from sklearn.metrics import mean_squared_error
from tensorflow.keras import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.optimizers import Adam

## Import data

In [None]:
# Handle for the BTC-USD ticker on Yahoo Finance.
btc_info = yf.Ticker("BTC-USD")

# Bounds of the requested price history.
start_date, end_date = (
    datetime.datetime(2018, 1, 1),
    datetime.datetime(2024, 1, 1),
)

# Hand the bounds to yfinance as the `start` / `end` arguments.
df = btc_info.history(start=start_date, end=end_date)

In [None]:
# Preview the first rows of the downloaded frame.
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2018-01-01 00:00:00+00:00,14112.200195,14112.200195,13154.700195,13657.200195,10291200000,0.0,0.0
2018-01-02 00:00:00+00:00,13625.0,15444.599609,13163.599609,14982.099609,16846600192,0.0,0.0
2018-01-03 00:00:00+00:00,14978.200195,15572.799805,14844.5,15201.0,16871900160,0.0,0.0
2018-01-04 00:00:00+00:00,15270.700195,15739.700195,14522.200195,15599.200195,21783199744,0.0,0.0
2018-01-05 00:00:00+00:00,15477.200195,17705.199219,15202.799805,17429.5,23840899072,0.0,0.0


In [None]:
# Discard the corporate-action columns; they are not part of the feature set
# built later in the notebook.
# errors="ignore" keeps the cell re-runnable: a second execution finds the
# columns already gone and becomes a no-op instead of raising KeyError.
df = df.drop(columns=['Dividends', 'Stock Splits'], errors="ignore")

In [None]:
# Rename by label rather than by position, so a change in the provider's
# column order or column count cannot silently mislabel the data.
# Labels that were already renamed are left untouched, so re-running is safe.
df = df.rename(columns={
    'Open': 'open',
    'High': 'high',
    'Low': 'low',
    'Close': 'close',
    'Volume': 'vol',
})

In [None]:
# Check column dtypes and non-null counts of the cleaned frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2191 entries, 2018-01-01 00:00:00+00:00 to 2023-12-31 00:00:00+00:00
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   open    2191 non-null   float64
 1   high    2191 non-null   float64
 2   low     2191 non-null   float64
 3   close   2191 non-null   float64
 4   vol     2191 non-null   int64  
dtypes: float64(4), int64(1)
memory usage: 102.7 KB


In [None]:
# Prepare the volume and price differences, normalize volume
BTC_vol = df["vol"].values

# First differences of every column. The first row has no predecessor and is
# dropped, as is any other row whose difference is NaN.
df_diff = df.diff().dropna()

# Replace the differenced volume with log(1 + volume) of the PREVIOUS day.
# The value is looked up by date instead of being assigned positionally, so
# the cell does not fail with a length mismatch when dropna() removes more
# than the first row.
# NOTE(review): the one-day lag reproduces the original BTC_vol[:-1]
# alignment — confirm it is intended rather than same-day volume.
log_vol_prev = np.log(1 + df["vol"].shift(1))
df_diff["vol"] = log_vol_prev.loc[df_diff.index]

In [None]:
# Undifferenced rows restricted to the dates that are present in df_diff.
df_aligned = df.loc[df_diff.index]

In [None]:
# Training window: 2018-01-01 (inclusive) up to, but excluding, 2023-01-01.
train_dates = df_diff.index
mask_train = (train_dates < "2023-01-01") & (train_dates >= "2018-01-01")

# Undifferenced closes for the same dates.
train_close = df_aligned.loc[mask_train, "close"].values

# Differenced features plus each close expressed as a multiple of the first
# training-day close.
df_train = df_diff.loc[mask_train].assign(
    Relative_Close=train_close / train_close[0]
)

In [None]:
# Hold-out window: 2023-01-01 (inclusive) up to, but excluding, 2024-01-01.
test_dates = df_diff.index
mask_test = (test_dates < "2024-01-01") & (test_dates >= "2023-01-01")

# Undifferenced closes for the same dates.
test_close = df_aligned.loc[mask_test, "close"].values

# Scale by the first TRAINING close (not the first test close) so both splits
# share the same reference price.
df_test = df_diff.loc[mask_test].assign(
    Relative_Close=test_close / train_close[0]
)


In [None]:
# Preview the first rows of the training features.
df_train.head()

Unnamed: 0_level_0,open,high,low,close,vol,Relative_Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-01-02 00:00:00+00:00,-487.200195,1332.399414,8.899414,1324.899414,23.054555,1.0
2018-01-03 00:00:00+00:00,1353.200195,128.200195,1680.900391,218.900391,23.547415,1.014611
2018-01-04 00:00:00+00:00,292.5,166.900391,-322.299805,398.200195,23.548915,1.041189
2018-01-05 00:00:00+00:00,206.5,1965.499023,680.599609,1830.299805,23.804405,1.163355
2018-01-06 00:00:00+00:00,1984.899414,7.201172,1561.799805,97.5,23.894668,1.169863


In [None]:
# Preview the first rows of the test features.
df_test.head()

Unnamed: 0_level_0,open,high,low,close,vol,Relative_Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-01-01 00:00:00+00:00,-55.759766,1.453125,3.714844,77.583984,23.142672,1.109663
2023-01-02 00:00:00+00:00,77.595703,128.904297,50.994141,63.390625,22.94728,1.113894
2023-01-03 00:00:00+00:00,63.337891,1.103516,50.142578,-8.613281,23.216287,1.113319
2023-01-04 00:00:00+00:00,-8.642578,204.138672,45.392578,183.380859,23.355376,1.125559
2023-01-05 00:00:00+00:00,183.267578,-80.564453,122.519531,-26.501953,23.636798,1.12379


In [None]:
# Generate dataset function
def generate_dataset(df, seq_len, target_col="close"):
    """Build sliding-window samples for one-step-ahead prediction.

    Sample ``i`` pairs the ``seq_len`` consecutive rows ``i .. i+seq_len-1``
    (all columns, including the target column) with the value of
    ``target_col`` in the row that immediately follows the window.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame, one row per time step, in chronological order.
    seq_len : int
        Number of rows in each input window; must not be negative.
    target_col : str, default "close"
        Column whose next-row value is the prediction target.

    Returns
    -------
    X : numpy.ndarray, shape (n_samples, seq_len, n_columns)
    y : numpy.ndarray, shape (n_samples,)
        ``n_samples`` is ``max(len(df) - seq_len, 0)``. When it is zero the
        arrays are empty but keep the ranks given above, so ``X.shape[1]`` and
        ``X.shape[2]`` remain valid for downstream code.

    Raises
    ------
    ValueError
        If ``seq_len`` is negative.
    """
    if seq_len < 0:
        raise ValueError(f"seq_len must be non-negative, got {seq_len}")

    # Convert once up front; slicing a NumPy array per window is much cheaper
    # than calling DataFrame.iloc inside the loop.
    features = df.to_numpy()
    targets = df[target_col].to_numpy()

    n_samples = max(len(df) - seq_len, 0)
    if n_samples == 0:
        empty_X = np.empty((0, seq_len, features.shape[1]), dtype=features.dtype)
        empty_y = np.empty((0,), dtype=targets.dtype)
        return empty_X, empty_y

    X = np.stack([features[start:start + seq_len] for start in range(n_samples)])
    # Copy so the returned targets never alias the frame's underlying data.
    y = targets[seq_len:].copy()
    return X, y

In [None]:
# Number of consecutive rows (days) in each input window passed to generate_dataset.
LAG = 15

In [None]:
# Windowed samples for training.
X_train, y_train = generate_dataset(df_train, LAG)

# Prefix the test frame with the last LAG training rows so that the very first
# test day already has a full history window in front of it.
test_with_history = pd.concat([df_train.iloc[-LAG:], df_test])
X_test, y_test = generate_dataset(test_with_history, LAG)

In [None]:
# Model architecture: a single 50-unit LSTM followed by a one-value linear head.
# Seed first so that weight initialisation is repeatable.
tf.keras.utils.set_random_seed(4002)

# Per-sample input shape, i.e. every axis of X_train after the sample axis.
window_shape = X_train.shape[1:]

model = Sequential()
model.add(LSTM(50, activation='relu', input_shape=window_shape))
model.add(Dense(1))

model.compile(loss='mse', optimizer='adam')

In [None]:
# Train the model
# 20 epochs, with 20% of the training samples set aside for validation;
# the returned History object is the cell's displayed output.
model.fit(X_train, y_train, epochs=20, validation_split=0.2, verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x7caa890ac490>

In [None]:
# Predict
# One model output per test window; the target is the differenced close.
predictions = model.predict(X_test)



In [None]:
# Evaluate
# Mean squared error between the actual and predicted differenced close on the
# test windows. mean_squared_error is imported with the other dependencies in
# the first cell rather than here, so all imports stay in one place.
mse = mean_squared_error(y_test, predictions)
print(f'Test MSE: {mse}')

Test MSE: 431626.19896269374
