In [None]:
import datetime

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.optimizers import Adam

## Import data

In [None]:
# Load the raw price CSV.
# NOTE(review): this is a hard-coded absolute path (looks like a Colab upload
# location — confirm); it must be adjusted when running elsewhere.
df = pd.read_csv('/content/2021-2022_bit.csv')

In [None]:
# Inspect the raw frame: column names, dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 264959 entries, 0 to 264958
Data columns (total 7 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Unnamed: 0  264959 non-null  int64  
 1   Date        264959 non-null  object 
 2   Open        264959 non-null  float64
 3   High        264959 non-null  float64
 4   Low         264959 non-null  float64
 5   Close       264959 non-null  float64
 6   Volume BTC  264959 non-null  float64
dtypes: float64(5), int64(1), object(1)
memory usage: 14.2+ MB


In [None]:
# Drop the first column (listed as 'Unnamed: 0' by df.info() above) and keep
# the remaining six. Re-running this cell would drop another column.
df = df.iloc[:,1:].copy()

In [None]:
# Reverse the row order. After this step the index runs from 2021-08-01 to
# 2022-01-31 (see the df.info() output further down), i.e. oldest row first,
# which the rolling means and diff() below rely on.
df = df[::-1]

In [None]:
# Positional rename of the six remaining columns
# (Date, Open, High, Low, Close, Volume BTC) to short lowercase names.
df.columns = ['date', 'open', 'high', 'low', 'close', 'vol']

In [None]:
# Parse the date strings (dtype object in the raw file) into datetime values.
df['date'] = pd.to_datetime(df['date'])

In [None]:
# Use the timestamp as the index so later cells can filter rows by date string.
df = df.set_index('date')

In [None]:
# Re-check the frame after cleanup: expect a DatetimeIndex and five float columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 264959 entries, 2021-08-01 00:01:00 to 2022-01-31 23:59:00
Data columns (total 5 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   open    264959 non-null  float64
 1   high    264959 non-null  float64
 2   low     264959 non-null  float64
 3   close   264959 non-null  float64
 4   vol     264959 non-null  float64
dtypes: float64(5)
memory usage: 12.1 MB


In [None]:
# Display the cleaned frame (pandas shows only the first and last rows).
df

Unnamed: 0_level_0,open,high,low,close,vol
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021-08-01 00:01:00,41469.05,41469.05,41332.91,41348.38,3.513764
2021-08-01 00:02:00,41348.38,41348.38,41165.96,41196.18,8.040649
2021-08-01 00:03:00,41196.18,41339.38,41183.56,41327.59,3.081120
2021-08-01 00:04:00,41327.59,41387.71,41327.59,41379.84,0.044790
2021-08-01 00:05:00,41379.84,41550.50,41379.84,41550.50,0.196676
...,...,...,...,...,...
2022-01-31 23:55:00,38479.35,38510.17,38479.35,38510.17,0.008319
2022-01-31 23:56:00,38510.17,38561.36,38506.33,38519.38,4.325445
2022-01-31 23:57:00,38519.38,38541.20,38519.38,38533.44,0.026744
2022-01-31 23:58:00,38533.44,38533.44,38526.84,38531.01,0.046275


In [None]:
# Count missing values per column.
df.isnull().sum()

open     0
high     0
low      0
close    0
vol      0
dtype: int64

### Feature engineering

In [None]:
# Rolling-mean helper
def moving_average(data, period):
    """Return the rolling mean of ``data`` over a window of ``period`` rows.

    Positions that do not yet have ``period`` observations behind them are
    NaN, which is pandas' default behaviour for a fixed-size rolling window.
    """
    windowed = data.rolling(window=period)
    return windowed.mean()

In [None]:
# Add 10-row and 30-row moving averages of the close price.
# Rows are one-minute bars, so these are 10-minute and 30-minute averages
# (not 10-day / 30-day).
df['MA10'] = moving_average(df['close'], 10)
df['MA30'] = moving_average(df['close'], 30)

In [None]:
# Display the frame with the new MA columns; the leading rows are NaN because
# the rolling windows have not filled yet.
df

Unnamed: 0_level_0,open,high,low,close,vol,MA10,MA30
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2021-08-01 00:01:00,41469.05,41469.05,41332.91,41348.38,3.513764,,
2021-08-01 00:02:00,41348.38,41348.38,41165.96,41196.18,8.040649,,
2021-08-01 00:03:00,41196.18,41339.38,41183.56,41327.59,3.081120,,
2021-08-01 00:04:00,41327.59,41387.71,41327.59,41379.84,0.044790,,
2021-08-01 00:05:00,41379.84,41550.50,41379.84,41550.50,0.196676,,
...,...,...,...,...,...,...,...
2022-01-31 23:55:00,38479.35,38510.17,38479.35,38510.17,0.008319,38442.754,38449.889333
2022-01-31 23:56:00,38510.17,38561.36,38506.33,38519.38,4.325445,38455.803,38452.091667
2022-01-31 23:57:00,38519.38,38541.20,38519.38,38533.44,0.026744,38470.501,38454.850000
2022-01-31 23:58:00,38533.44,38533.44,38526.84,38531.01,0.046275,38484.882,38457.103333


In [None]:
# Keep only rows after 2021-09-25 00:00. This also discards the first rows of
# the file, where MA10/MA30 are NaN.
df = df[df.index > '2021-09-25']

In [None]:
# Build first differences of every column, then overwrite the differenced
# volume with a log-transformed raw volume.
BTC_vol = df["vol"].values  # raw volume, captured before differencing
# diff() leaves NaN in the first row; dropna() removes rows containing NaN.
df_diff = df.diff().dropna()
# log(1 + volume). BTC_vol[:-1] is one element shorter than df and starts at
# df's first row, while df_diff starts at df's second row, so each row at
# time t receives the volume of the previous row (t-1). The assignment only
# works when dropna() removed exactly one row.
# NOTE(review): confirm the one-row lag is intended; BTC_vol[1:] would give
# the same-row volume instead.
df_diff["vol"] = np.log(1 + BTC_vol[:-1])

In [None]:
# Undifferenced rows at the same timestamps as df_diff; used below to read
# actual close prices.
df_aligned = df.loc[df_diff.index]

In [None]:
# Train data
# Period: 2021-10-01 (inclusive) up to, but not including, 2021-12-31
mask_train = (df_diff.index >= "2021-10-01") & (df_diff.index < "2021-12-31")
df_train = df_diff.loc[mask_train].copy()
# Undifferenced close prices for the same rows
train_close = df_aligned.loc[mask_train, "close"].values
# Close price relative to the first training close, so the column starts at 1.0
df_train["Relative_Close"] = train_close / train_close[0]

In [None]:
# Test data
# Period: the single day 2021-12-31 (>= 2021-12-31 and < 2022-01-01)
mask_test = (df_diff.index >= "2021-12-31") & (df_diff.index < "2022-01-01")  # one day, starting where the training window ends
df_test = df_diff.loc[mask_test].copy()
# Undifferenced close prices for the same rows
test_close = df_aligned.loc[mask_test, "close"].values
# Divided by the first TRAINING close (not the first test close), so train
# and test share one reference price.
df_test["Relative_Close"] = test_close / train_close[0]


In [None]:
# Preview the training frame (differenced columns plus Relative_Close).
df_train.head()

Unnamed: 0_level_0,open,high,low,close,vol,MA10,MA30,Relative_Close
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2021-10-01 00:00:00,49.39,0.0,35.28,0.0,0.172721,1.982,1.288,1.0
2021-10-01 00:01:00,0.0,43.23,14.11,41.79,0.138222,6.593,1.023,1.000953
2021-10-01 00:02:00,41.79,85.12,-27.83,86.56,1.004677,11.712,3.451333,1.002928
2021-10-01 00:03:00,86.56,16.26,127.39,14.35,2.271787,18.638,2.855333,1.003256
2021-10-01 00:04:00,14.35,12.35,24.08,14.26,1.217967,18.893,4.195667,1.003581


In [None]:
# Preview the test frame; Relative_Close does not start at 1.0 because it is
# measured against the first training close.
df_test.head()

Unnamed: 0_level_0,open,high,low,close,vol,MA10,MA30,Relative_Close
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2021-12-31 00:00:00,-8.94,-23.58,-30.65,-30.65,0.108346,-1.422,-1.886,1.074458
2021-12-31 00:01:00,-30.65,67.03,0.0,95.45,0.035732,5.754,0.532333,1.076636
2021-12-31 00:02:00,95.45,76.63,88.21,72.93,0.090812,11.848,5.238333,1.0783
2021-12-31 00:03:00,72.93,19.19,27.12,-53.05,0.887851,5.516,2.523,1.07709
2021-12-31 00:04:00,-53.05,-58.71,-2.89,19.46,0.722145,5.572,3.719667,1.077534


In [None]:
# Generate dataset function
def generate_dataset(df, seq_len):
    """Turn a frame into sliding-window inputs and next-row targets.

    Sample ``i`` uses rows ``i .. i + seq_len - 1`` (all columns) as input
    and the ``"close"`` value of row ``i + seq_len`` as target.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``"close"`` column; rows are taken in their stored order.
    seq_len : int
        Number of consecutive rows per input window.

    Returns
    -------
    X : numpy.ndarray, shape (n_samples, seq_len, n_columns)
    y : numpy.ndarray, shape (n_samples,)
        ``n_samples`` is ``len(df) - seq_len``. When the frame is too short
        to form a single window, both arrays are empty but X keeps its three
        dimensions so callers can still unpack ``X.shape``.

    Raises
    ------
    KeyError
        If ``df`` has no ``"close"`` column.
    """
    # Convert to NumPy once instead of once per window.
    features = df.values
    targets = df["close"].values
    n_samples = len(df) - seq_len

    if n_samples <= 0:
        # Not enough rows for even one (window, target) pair.
        empty_X = np.empty((0, seq_len, features.shape[1]), dtype=features.dtype)
        empty_y = np.empty((0,), dtype=targets.dtype)
        return empty_X, empty_y

    X = np.stack([features[i:i + seq_len] for i in range(n_samples)])
    # The target of window i is row i + seq_len, i.e. everything from seq_len on.
    y = targets[seq_len:].copy()
    return X, y

In [None]:
# Number of consecutive past rows (one-minute bars) in each input window.
LAG = 5

In [None]:
# Prepare training and test datasets
X_train, y_train = generate_dataset(df_train, LAG)
# Prepend the last LAG training rows so the first test window ends right
# before the first test row; y_test then has one target per row of df_test.
X_test, y_test = generate_dataset(pd.concat((df_train.iloc[-LAG:], df_test)), LAG)

In [None]:
# Scale the training inputs to [0, 1], fitting the scaler on training data only.
num_samples, num_timesteps, num_features = X_train.shape
# Flatten (samples, timesteps, features) to 2-D so each feature gets one
# min/max across all windows and timesteps.
X_train_reshaped = X_train.reshape(-1, num_features)
scaler = MinMaxScaler(feature_range=(0, 1))
X_train_scaled = scaler.fit_transform(X_train_reshaped)
# Restore the 3-D window layout used as the LSTM input.
X_train_scaled = X_train_scaled.reshape(num_samples, num_timesteps, num_features)

In [None]:
# Scale the test inputs with the scaler fitted on the training data
# (transform only, no refit). Only the inputs are scaled here; the targets
# are left as they are.
num_samples_test, num_timesteps, num_features = X_test.shape
X_test_reshaped = X_test.reshape(-1, num_features)
X_test_scaled = scaler.transform(X_test_reshaped)
# Restore the 3-D window layout.
X_test_scaled = X_test_scaled.reshape(num_samples_test, num_timesteps, num_features)

In [None]:
# Model architecture
# Fix the random seeds so weight initialisation is repeatable between runs.
tf.keras.utils.set_random_seed(4002)

# One LSTM layer (50 units, ReLU activation) reading windows of shape
# (timesteps, features), followed by a single output unit with no activation.
model = Sequential([
    LSTM(50, activation='relu', input_shape=(X_train_scaled.shape[1], X_train_scaled.shape[2])),
    Dense(1)
])

# Optimizer selected by name (default settings); mean-squared-error loss.
model.compile(optimizer='adam', loss='mse')

In [None]:
# Train the model
# 20 epochs; validation_split=0.2 holds out 20% of the training arrays for
# validation (per the Keras docs, taken from the end of the arrays before
# shuffling, i.e. the most recent windows here).
model.fit(X_train_scaled, y_train, epochs=20, validation_split=0.2, verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x7eae3ef8a740>

In [None]:
# Predict
# One predicted value per test window: the differenced close of the row
# that follows the window.
predictions = model.predict(X_test_scaled)



In [None]:
# Evaluate
# y_test holds differenced close prices, so this MSE is in squared
# price-change units, not squared price levels.
# mean_squared_error is imported in the notebook's import cell.
mse = mean_squared_error(y_test, predictions)
print(f'Test MSE: {mse}')

Test MSE: 1442.3190667249723
