In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.optimizers import Adam
import datetime

## Import data

In [None]:
# Load the raw BTC OHLCV export (one row per minute, per the output shown further down).
# NOTE(review): the absolute '/content/...' path looks like a Colab upload location — confirm,
# or make the data directory configurable so the notebook runs elsewhere.
df = pd.read_csv('/content/2021-2022_bit.csv')

In [None]:
# Drop the first column by position and keep the rest (Date, Open, High, Low, Close,
# Volume BTC according to the df.info() output below); .copy() yields an independent frame.
# NOTE(review): the dropped column is not visible in this notebook — presumably an id or
# raw-timestamp column; confirm against the CSV.
df = df.iloc[:,1:].copy()

In [None]:
# Reverse the row order. The frame displayed below then runs from 2021-09-25 to 2022-01-04
# while the original integer index counts down, i.e. the file is stored newest-first and
# this step makes it chronological.
df = df[::-1]

In [None]:
# Inspect column dtypes and non-null counts of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146879 entries, 146878 to 0
Data columns (total 6 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Date        146879 non-null  object 
 1   Open        146879 non-null  float64
 2   High        146879 non-null  float64
 3   Low         146879 non-null  float64
 4   Close       146879 non-null  float64
 5   Volume BTC  146879 non-null  float64
dtypes: float64(5), object(1)
memory usage: 6.7+ MB


In [None]:
# Display the frame (pandas truncates the middle rows); dates should now be ascending.
df

Unnamed: 0,Date,Open,High,Low,Close,Volume BTC
146878,2021-09-25 00:01:00,42833.68,42851.83,42824.43,42846.39,1.164127
146877,2021-09-25 00:02:00,42846.39,42880.67,42820.33,42847.35,5.216199
146876,2021-09-25 00:03:00,42847.35,42850.91,42788.63,42803.92,3.004288
146875,2021-09-25 00:04:00,42803.92,42803.92,42727.38,42738.56,2.083598
146874,2021-09-25 00:05:00,42738.56,42738.56,42626.59,42639.53,0.658538
...,...,...,...,...,...,...
4,2022-01-04 23:55:00,45874.46,45893.53,45874.46,45876.19,0.135848
3,2022-01-04 23:56:00,45876.19,45883.47,45850.00,45850.00,0.004880
2,2022-01-04 23:57:00,45850.00,45873.72,45844.93,45859.15,0.499837
1,2022-01-04 23:58:00,45859.15,45870.66,45859.15,45870.20,0.006786


In [None]:
# Rename the six columns positionally to short lowercase names.
# This relies on the column order Date, Open, High, Low, Close, Volume BTC shown above.
df.columns = ['date', 'open', 'high', 'low', 'close', 'vol']

In [None]:
# Parse the date strings into pandas datetimes so the column can be used as a time index.
df['date'] = pd.to_datetime(df['date'])

In [None]:
# Index the frame by timestamp; the date-string comparisons used for the train/test
# masks further down operate on this DatetimeIndex.
df = df.set_index('date')

In [None]:
# Re-check the frame: it should now have a DatetimeIndex and five float columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 146879 entries, 2021-09-25 00:01:00 to 2022-01-04 23:59:00
Data columns (total 5 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   open    146879 non-null  float64
 1   high    146879 non-null  float64
 2   low     146879 non-null  float64
 3   close   146879 non-null  float64
 4   vol     146879 non-null  float64
dtypes: float64(5)
memory usage: 6.7 MB


In [None]:
# Prepare the volume and price differences, normalize volume
# BTC_vol keeps the raw volumes; df.diff() turns every column into a one-step change and
# dropna() removes the first row, which has no predecessor.
BTC_vol = df["vol"].values
df_diff = df.diff().dropna()
# Replace the differenced volume with log(1 + raw volume).
# BTC_vol[:-1] pairs each differenced row t with the raw volume of row t-1 (one-step lag).
# NOTE(review): BTC_vol[1:] would give the same-row volume instead — confirm the lag is intended.
# The positional assignment also assumes dropna() removed exactly one row.
df_diff["vol"] = np.log(1 + BTC_vol[:-1])

In [None]:
# Raw (undifferenced) rows restricted to the timestamps present in df_diff, so boolean
# masks built from df_diff.index can also be applied to the original prices.
df_aligned = df.loc[df_diff.index]

In [None]:
# Train data
# Period : 2021-10-01 (inclusive) to 2021-12-31 (exclusive)
mask_train = (df_diff.index >= "2021-10-01") & (df_diff.index < "2021-12-31")
df_train = df_diff.loc[mask_train].copy()
# Raw close prices over the same window (the mask applies because df_aligned shares df_diff's index).
train_close = df_aligned.loc[mask_train, "close"].values
# Price level relative to the first training close, so the series starts at 1.0.
df_train["Relative_Close"] = train_close / train_close[0]

In [None]:
# Test data
# Period : the single day 2021-12-31
mask_test = (df_diff.index >= "2021-12-31") & (df_diff.index < "2022-01-01")  # 2021-12-31 00:00 up to, not including, 2022-01-01
df_test = df_diff.loc[mask_test].copy()
test_close = df_aligned.loc[mask_test, "close"].values
# Normalised by the first *training* close so train and test share the same base.
df_test["Relative_Close"] = test_close / train_close[0]


In [None]:
# Preview the first training rows: differenced OHLC, log-volume and Relative_Close.
df_train.head()

Unnamed: 0_level_0,open,high,low,close,vol,Relative_Close
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-10-01 00:00:00,49.39,0.0,35.28,0.0,0.172721,1.0
2021-10-01 00:01:00,0.0,43.23,14.11,41.79,0.138222,1.000953
2021-10-01 00:02:00,41.79,85.12,-27.83,86.56,1.004677,1.002928
2021-10-01 00:03:00,86.56,16.26,127.39,14.35,2.271787,1.003256
2021-10-01 00:04:00,14.35,12.35,24.08,14.26,1.217967,1.003581


In [None]:
# Preview the first test rows; Relative_Close is expressed against the first training close.
df_test.head()

Unnamed: 0_level_0,open,high,low,close,vol,Relative_Close
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-12-31 00:00:00,-8.94,-23.58,-30.65,-30.65,0.108346,1.074458
2021-12-31 00:01:00,-30.65,67.03,0.0,95.45,0.035732,1.076636
2021-12-31 00:02:00,95.45,76.63,88.21,72.93,0.090812,1.0783
2021-12-31 00:03:00,72.93,19.19,27.12,-53.05,0.887851,1.07709
2021-12-31 00:04:00,-53.05,-58.71,-2.89,19.46,0.722145,1.077534


In [None]:
# Generate dataset function
def generate_dataset(df, seq_len):
    """Build sliding-window samples for next-step prediction of ``close``.

    Each sample consists of ``seq_len`` consecutive rows of ``df`` (all
    columns); its target is the ``close`` value of the row that immediately
    follows the window.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame in the order the windows should be taken; must contain
        a ``close`` column.
    seq_len : int
        Number of consecutive rows per input window (must be >= 0).

    Returns
    -------
    X : numpy.ndarray
        Shape ``(len(df) - seq_len, seq_len, df.shape[1])``.
    y : numpy.ndarray
        Shape ``(len(df) - seq_len,)``.
        If ``df`` has no more than ``seq_len`` rows, both arrays are empty
        but keep their rank, i.e. ``(0, seq_len, n_cols)`` and ``(0,)``.

    Raises
    ------
    ValueError
        If ``seq_len`` is negative.
    """
    if seq_len < 0:
        raise ValueError(f"seq_len must be non-negative, got {seq_len}")

    # Convert to NumPy once; slicing the frame with .iloc on every iteration is
    # loop-invariant work and dominates the runtime on long series.
    values = df.values
    targets = df["close"].values

    n_samples = max(len(df) - seq_len, 0)
    if n_samples == 0:
        # Too short for even one window: return correctly ranked empty arrays.
        empty_X = np.empty((0, seq_len, values.shape[1]), dtype=values.dtype)
        empty_y = np.empty((0,), dtype=targets.dtype)
        return empty_X, empty_y

    X = np.stack([values[i:i + seq_len] for i in range(n_samples)])
    # Target for window i is row i + seq_len, i.e. everything from seq_len onwards.
    y = targets[seq_len:].copy()
    return X, y

In [None]:
# Window length: number of consecutive rows fed to the model per sample
# (the data displayed above is in one-minute bars, so 30 rows = 30 minutes).
LAG = 30

In [None]:
# Prepare training and test datasets
X_train, y_train = generate_dataset(df_train, LAG)
# Prepend the last LAG training rows to the test frame so the first window ends right
# before the first test row, i.e. every test row gets a prediction target.
X_test, y_test = generate_dataset(pd.concat((df_train.iloc[-LAG:], df_test)), LAG)

In [None]:
# Model architecture
# Seed the random number generators so repeated runs are comparable.
tf.keras.utils.set_random_seed(4002)

# A single LSTM layer (50 units, ReLU activation) over windows shaped
# (timesteps, features) as taken from X_train, followed by a one-unit linear output.
model = Sequential([
    LSTM(50, activation='relu', input_shape=(X_train.shape[1], X_train.shape[2])),
    Dense(1)
])

# Regression setup: Adam optimizer (default settings) with mean-squared-error loss.
model.compile(optimizer='adam', loss='mse')

In [None]:
# Train the model
# 20 epochs; validation_split=0.2 holds out a fraction of the training samples for validation
# (per the Keras docs the split is taken from the end of the arrays, before shuffling).
model.fit(X_train, y_train, epochs=20, validation_split=0.2, verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x7e0da1554fa0>

In [None]:
# Predict
# One forecast per test window.
# NOTE(review): with the Dense(1) head the result is presumably shaped (n_samples, 1) — confirm.
predictions = model.predict(X_test)



In [None]:
# Evaluate
# NOTE(review): this import sits mid-notebook; the other imports live in the first cell.
from sklearn.metrics import mean_squared_error
# Mean squared error between the true and predicted close-price changes on the test
# windows (the targets come from the differenced "close" column).
mse = mean_squared_error(y_test, predictions)
print(f'Test MSE: {mse}')

Test MSE: 2216.583482370543


In [None]:
# 0.04