In [None]:
pip install keras-tuner

Collecting keras-tuner
  Downloading keras_tuner-1.4.7-py3-none-any.whl (129 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.1/129.1 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Collecting kt-legacy (from keras-tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras-tuner
Successfully installed keras-tuner-1.4.7 kt-legacy-1.0.5


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.optimizers import Adam
import datetime
from sklearn.preprocessing import MinMaxScaler

## Import data

In [None]:
# Load the raw minute-level BTC OHLCV export.
# NOTE(review): absolute path under /content (looks like a Colab mount) —
# adjust when running elsewhere.
df = pd.read_csv('/content/2021-2022_bit.csv')

In [None]:
# Inspect column names, dtypes and non-null counts of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 264959 entries, 0 to 264958
Data columns (total 7 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Unnamed: 0  264959 non-null  int64  
 1   Date        264959 non-null  object 
 2   Open        264959 non-null  float64
 3   High        264959 non-null  float64
 4   Low         264959 non-null  float64
 5   Close       264959 non-null  float64
 6   Volume BTC  264959 non-null  float64
dtypes: float64(5), int64(1), object(1)
memory usage: 14.2+ MB


In [None]:
# Drop the first column (the saved row index, 'Unnamed: 0') and keep the
# remaining columns as an independent copy.
df = df.iloc[:,1:].copy()

In [None]:
# Flip the row order so the frame runs from the oldest bar to the newest.
# Not idempotent: re-running this cell flips the order back.
df = df.iloc[::-1]

In [None]:
# Replace the original headers (Date, Open, High, Low, Close, Volume BTC)
# with short lower-case names, in the same positional order.
df.columns = ['date', 'open', 'high', 'low', 'close', 'vol']

In [None]:
# Parse the timestamp strings (read as dtype object) into datetime64 values.
df['date'] = pd.to_datetime(df['date'])

In [None]:
# Use the timestamp as the index so later cells can filter rows by date.
df = df.set_index('date')

In [None]:
# Confirm the DatetimeIndex and the five float columns after the clean-up.
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 264959 entries, 2021-08-01 00:01:00 to 2022-01-31 23:59:00
Data columns (total 5 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   open    264959 non-null  float64
 1   high    264959 non-null  float64
 2   low     264959 non-null  float64
 3   close   264959 non-null  float64
 4   vol     264959 non-null  float64
dtypes: float64(5)
memory usage: 12.1 MB


In [None]:
# Display the cleaned frame (pandas shows only the first and last rows).
df

Unnamed: 0_level_0,open,high,low,close,vol
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021-08-01 00:01:00,41469.05,41469.05,41332.91,41348.38,3.513764
2021-08-01 00:02:00,41348.38,41348.38,41165.96,41196.18,8.040649
2021-08-01 00:03:00,41196.18,41339.38,41183.56,41327.59,3.081120
2021-08-01 00:04:00,41327.59,41387.71,41327.59,41379.84,0.044790
2021-08-01 00:05:00,41379.84,41550.50,41379.84,41550.50,0.196676
...,...,...,...,...,...
2022-01-31 23:55:00,38479.35,38510.17,38479.35,38510.17,0.008319
2022-01-31 23:56:00,38510.17,38561.36,38506.33,38519.38,4.325445
2022-01-31 23:57:00,38519.38,38541.20,38519.38,38533.44,0.026744
2022-01-31 23:58:00,38533.44,38533.44,38526.84,38531.01,0.046275


In [None]:
# Count missing values per column.
df.isnull().sum()

open     0
high     0
low      0
close    0
vol      0
dtype: int64

### Feature engineering

In [None]:
# Simple moving average helper
def moving_average(data, period):
    """Return the simple moving average of ``data`` over ``period`` rows.

    Parameters
    ----------
    data : pandas Series or DataFrame
        Values to smooth.
    period : int
        Number of consecutive observations in each window.

    Returns
    -------
    Same type as ``data``; the first ``period - 1`` entries are NaN because
    their window is not yet full.
    """
    windows = data.rolling(window=period)
    return windows.mean()

In [None]:
# Add 10-row and 30-row rolling means of the close price. The index is
# minute-level, so these are 10-minute and 30-minute moving averages.
# The first 9 / 29 rows are NaN because their window is not yet full.
df['MA10'] = moving_average(df['close'], 10)
df['MA30'] = moving_average(df['close'], 30)

In [None]:
# Display the frame with the new MA10 / MA30 columns.
df

Unnamed: 0_level_0,open,high,low,close,vol,MA10,MA30
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2021-08-01 00:01:00,41469.05,41469.05,41332.91,41348.38,3.513764,,
2021-08-01 00:02:00,41348.38,41348.38,41165.96,41196.18,8.040649,,
2021-08-01 00:03:00,41196.18,41339.38,41183.56,41327.59,3.081120,,
2021-08-01 00:04:00,41327.59,41387.71,41327.59,41379.84,0.044790,,
2021-08-01 00:05:00,41379.84,41550.50,41379.84,41550.50,0.196676,,
...,...,...,...,...,...,...,...
2022-01-31 23:55:00,38479.35,38510.17,38479.35,38510.17,0.008319,38442.754,38449.889333
2022-01-31 23:56:00,38510.17,38561.36,38506.33,38519.38,4.325445,38455.803,38452.091667
2022-01-31 23:57:00,38519.38,38541.20,38519.38,38533.44,0.026744,38470.501,38454.850000
2022-01-31 23:58:00,38533.44,38533.44,38526.84,38531.01,0.046275,38484.882,38457.103333


In [None]:
# Keep only rows strictly after 2021-09-25 00:00. This also discards the
# NaN warm-up rows of MA10 / MA30 at the start of the data.
df = df[df.index > '2021-09-25']

In [None]:
# Prepare the first differences of every column and a log-scaled volume.
BTC_vol = df["vol"].values

# First differences; the first row (and any row adjacent to a gap) is NaN.
diffs = df.diff()
complete_rows = diffs.notna().all(axis=1).to_numpy()
df_diff = diffs.loc[complete_rows].copy()

# Replace the differenced volume with log(1 + volume of the previous row).
# The same row mask is applied to the shifted volume, so the assignment stays
# aligned even when more than the first row is dropped; the previous
# positional form BTC_vol[:-1] raised a length mismatch in that case.
# NOTE(review): row t carries the volume of row t-1, exactly as before —
# confirm this one-step lag is intended.
prev_vol = df["vol"].shift(1).to_numpy()
df_diff["vol"] = np.log(1 + prev_vol[complete_rows])

In [None]:
# Undifferenced rows at the same timestamps as df_diff (used below to read
# the actual close prices).
df_aligned = df.loc[df_diff.index]

In [None]:
# Train data
# Period: 2021-10-01 (inclusive) up to, but not including, 2021-12-31
train_index = df_diff.index
mask_train = (train_index >= "2021-10-01") & (train_index < "2021-12-31")
df_train = df_diff.loc[mask_train].copy()

# Actual close prices for the same rows, expressed relative to the first
# training close (so the first training row is exactly 1.0).
train_close = df_aligned.loc[mask_train, "close"].to_numpy()
df_train["Relative_Close"] = train_close / train_close[0]

In [None]:
# Test data
# Period: the single day 2021-12-31 (from 00:00 up to, not including, 2022-01-01)
test_index = df_diff.index
mask_test = (test_index >= "2021-12-31") & (test_index < "2022-01-01")
df_test = df_diff.loc[mask_test].copy()

# Relative close uses the first *training* close as its base, so train and
# test values share one scale.
test_close = df_aligned.loc[mask_test, "close"].to_numpy()
df_test["Relative_Close"] = test_close / train_close[0]


In [None]:
# Preview the first training rows (differenced features + Relative_Close).
df_train.head()

Unnamed: 0_level_0,open,high,low,close,vol,MA10,MA30,Relative_Close
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2021-10-01 00:00:00,49.39,0.0,35.28,0.0,0.172721,1.982,1.288,1.0
2021-10-01 00:01:00,0.0,43.23,14.11,41.79,0.138222,6.593,1.023,1.000953
2021-10-01 00:02:00,41.79,85.12,-27.83,86.56,1.004677,11.712,3.451333,1.002928
2021-10-01 00:03:00,86.56,16.26,127.39,14.35,2.271787,18.638,2.855333,1.003256
2021-10-01 00:04:00,14.35,12.35,24.08,14.26,1.217967,18.893,4.195667,1.003581


In [None]:
# Preview the first test rows (differenced features + Relative_Close).
df_test.head()

Unnamed: 0_level_0,open,high,low,close,vol,MA10,MA30,Relative_Close
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2021-12-31 00:00:00,-8.94,-23.58,-30.65,-30.65,0.108346,-1.422,-1.886,1.074458
2021-12-31 00:01:00,-30.65,67.03,0.0,95.45,0.035732,5.754,0.532333,1.076636
2021-12-31 00:02:00,95.45,76.63,88.21,72.93,0.090812,11.848,5.238333,1.0783
2021-12-31 00:03:00,72.93,19.19,27.12,-53.05,0.887851,5.516,2.523,1.07709
2021-12-31 00:04:00,-53.05,-58.71,-2.89,19.46,0.722145,5.572,3.719667,1.077534


In [None]:
# Generate dataset function
def generate_dataset(df, seq_len, target_col="close"):
    """Turn a time-ordered frame into sliding-window samples and next-step targets.

    Sample ``i`` contains rows ``i .. i + seq_len - 1`` of every column of
    ``df``; its target is the value of ``target_col`` at row ``i + seq_len``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows ordered in time; all columns are used as features.
    seq_len : int
        Number of consecutive rows per sample (must be >= 0).
    target_col : str, default "close"
        Column whose next value is the prediction target.

    Returns
    -------
    X : numpy.ndarray, shape (n_samples, seq_len, n_features)
    y : numpy.ndarray, shape (n_samples,)
        ``n_samples`` is ``max(len(df) - seq_len, 0)``. When it is zero, ``X``
        still has three dimensions so callers can unpack ``X.shape``.

    Raises
    ------
    ValueError
        If ``seq_len`` is negative.
    """
    if seq_len < 0:
        raise ValueError(f"seq_len must be non-negative, got {seq_len}")

    # Convert once to NumPy: slicing an array per window is far cheaper than
    # calling DataFrame.iloc for each of the (potentially >100k) samples.
    values = df.to_numpy()
    targets = df[target_col].to_numpy()
    n_samples = max(len(df) - seq_len, 0)

    if n_samples == 0:
        empty_X = np.empty((0, seq_len, values.shape[1]), dtype=values.dtype)
        empty_y = np.empty((0,), dtype=targets.dtype)
        return empty_X, empty_y

    X = np.stack([values[start:start + seq_len] for start in range(n_samples)])
    # Copy so the targets do not alias the frame's underlying buffer.
    y = targets[seq_len:seq_len + n_samples].copy()
    return X, y

In [None]:
# Window length: number of past rows passed to generate_dataset per sample.
LAG = 5

In [None]:
# Chronological hold-out: the last 20% of the training rows become validation.
validation_size = 0.2
n_validation = int(len(df_train) * validation_size)

df_train_reduced, df_val = df_train.iloc[:-n_validation], df_train.iloc[-n_validation:]

# Prepend the last LAG rows of the preceding segment so the first validation /
# test target already has a full input window.
val_frame = pd.concat((df_train_reduced.iloc[-LAG:], df_val))
test_frame = pd.concat((df_train.iloc[-LAG:], df_test))

X_train, y_train = generate_dataset(df_train_reduced, LAG)
X_val, y_val = generate_dataset(val_frame, LAG)
X_test, y_test = generate_dataset(test_frame, LAG)

In [None]:
# Fit the scaler on the training windows only. MinMaxScaler expects a 2-D
# array, so the (samples, timesteps, features) windows are flattened to
# (samples * timesteps, features) and restored to 3-D afterwards.
num_samples, num_timesteps, num_features = X_train.shape
scaler = MinMaxScaler(feature_range=(0, 1))
X_train_reshaped = X_train.reshape(-1, num_features)
X_train_scaled = scaler.fit_transform(X_train_reshaped).reshape(
    num_samples, num_timesteps, num_features
)

In [None]:
# Apply the scaler fitted on the training windows to the validation windows
# (transform only, no refit), flattening to 2-D and restoring the 3-D shape.
num_samples_val, num_timesteps, num_features = X_val.shape
X_val_reshaped = X_val.reshape(-1, num_features)
X_val_scaled = scaler.transform(X_val_reshaped).reshape(
    num_samples_val, num_timesteps, num_features
)

In [None]:
# Apply the scaler fitted on the training windows to the test windows
# (transform only, no refit), flattening to 2-D and restoring the 3-D shape.
num_samples_test, num_timesteps, num_features = X_test.shape
X_test_reshaped = X_test.reshape(-1, num_features)
X_test_scaled = scaler.transform(X_test_reshaped).reshape(
    num_samples_test, num_timesteps, num_features
)

### Hyperparameter Tuning

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
# Import from the `keras_tuner` package installed above; the legacy
# `kerastuner` alias still works but emits a deprecation warning.
from keras_tuner import RandomSearch

  from kerastuner.tuners import RandomSearch


In [None]:
def build_model(hp):
    """Build and compile one candidate LSTM regressor for a KerasTuner trial.

    Architecture: LSTM -> Dropout -> LSTM -> Dropout -> Dense(relu) -> Dense(1),
    compiled with Adam and mean-squared-error loss. Layer widths, dropout
    rates and the learning rate are drawn from ``hp``.

    Parameters
    ----------
    hp : KerasTuner hyperparameter container
        Provides ``hp.Int`` / ``hp.Float`` used to declare and sample values.

    Returns
    -------
    The compiled Keras ``Sequential`` model.

    Notes
    -----
    The input shape (timesteps, features) is read from the notebook-level
    ``X_train`` array, so that variable must exist before this is called.
    """
    window_shape = (X_train.shape[1], X_train.shape[2])

    # First recurrent layer returns the whole sequence so that the second
    # LSTM receives one vector per time step.
    first_lstm = LSTM(
        units=hp.Int('units1', min_value=32, max_value=256, step=32),
        return_sequences=True,
        input_shape=window_shape,
        recurrent_dropout=hp.Float('recurrent_dropout1', min_value=0.0, max_value=0.5, step=0.1),
    )
    first_dropout = Dropout(rate=hp.Float('dropout1', min_value=0.0, max_value=0.5, step=0.1))

    second_lstm = LSTM(
        units=hp.Int('units2', min_value=32, max_value=256, step=32),
        recurrent_dropout=hp.Float('recurrent_dropout2', min_value=0.0, max_value=0.5, step=0.1),
    )
    second_dropout = Dropout(rate=hp.Float('dropout2', min_value=0.0, max_value=0.5, step=0.1))

    hidden_dense = Dense(
        units=hp.Int('dense_units', min_value=16, max_value=128, step=16),
        activation='relu',
    )
    # Single linear unit: one regression output per sample.
    output_dense = Dense(1)

    model = Sequential([
        first_lstm,
        first_dropout,
        second_lstm,
        second_dropout,
        hidden_dense,
        output_dense,
    ])

    optimizer = Adam(
        learning_rate=hp.Float('learning_rate', min_value=1e-4, max_value=1e-2, sampling='LOG')
    )
    model.compile(optimizer=optimizer, loss='mse')

    return model

In [None]:
# Random search over the space declared in build_model, ranked by validation loss.
tuner = RandomSearch(
    build_model,
    objective='val_loss',
    max_trials=10,  # Number of different configurations to try
    executions_per_trial=1,  # Number of models to train for each trial
    # Fixed seed so the same hyperparameter configurations are sampled on
    # every run. Weight initialisation is not covered by this seed.
    seed=42,
    directory='my_dir',  # Directory to save logs and models
    project_name='lstm_tuning'
)

In [None]:
# Display search space summary: one entry per hyperparameter declared in
# build_model, with its range, step and sampling mode.
tuner.search_space_summary()

Search space summary
Default search space size: 8
units1 (Int)
{'default': None, 'conditions': [], 'min_value': 32, 'max_value': 256, 'step': 32, 'sampling': 'linear'}
recurrent_dropout1 (Float)
{'default': 0.0, 'conditions': [], 'min_value': 0.0, 'max_value': 0.5, 'step': 0.1, 'sampling': 'linear'}
dropout1 (Float)
{'default': 0.0, 'conditions': [], 'min_value': 0.0, 'max_value': 0.5, 'step': 0.1, 'sampling': 'linear'}
units2 (Int)
{'default': None, 'conditions': [], 'min_value': 32, 'max_value': 256, 'step': 32, 'sampling': 'linear'}
recurrent_dropout2 (Float)
{'default': 0.0, 'conditions': [], 'min_value': 0.0, 'max_value': 0.5, 'step': 0.1, 'sampling': 'linear'}
dropout2 (Float)
{'default': 0.0, 'conditions': [], 'min_value': 0.0, 'max_value': 0.5, 'step': 0.1, 'sampling': 'linear'}
dense_units (Int)
{'default': None, 'conditions': [], 'min_value': 16, 'max_value': 128, 'step': 16, 'sampling': 'linear'}
learning_rate (Float)
{'default': 0.0001, 'conditions': [], 'min_value': 0.0001

In [None]:
# Run the hyperparameter search: every trial trains for at most 5 epochs and
# stops early once val_loss has not improved for 3 consecutive epochs.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)

tuner.search(
    X_train_scaled,
    y_train,
    epochs=5,
    validation_data=(X_val_scaled, y_val),
    callbacks=[early_stopping],
)

Trial 10 Complete [00h 06m 06s]
val_loss: 1554.7833251953125

Best val_loss So Far: 1554.7626953125
Total elapsed time: 01h 40m 35s


In [None]:
# Keep only the trials that ran to completion
completed_trials = (
    (trial_id, trial)
    for trial_id, trial in tuner.oracle.trials.items()
    if trial.status == "COMPLETED"
)

# One record per trial: its id, its best validation loss (reported as MSE,
# since the model is compiled with loss='mse'), then every hyperparameter value
trial_data = [
    {
        'Trial ID': trial_id,
        'MSE': trial.metrics.get_best_value('val_loss'),
        **trial.hyperparameters.values,
    }
    for trial_id, trial in completed_trials
]

# Tabulate the records and display them
df_trials = pd.DataFrame(trial_data)
df_trials

Unnamed: 0,Trial ID,MSE,units1,recurrent_dropout1,dropout1,units2,recurrent_dropout2,dropout2,dense_units,learning_rate
0,0,1554.773315,128,0.1,0.3,192,0.0,0.2,112,0.001735
1,1,1554.785767,224,0.2,0.1,128,0.2,0.0,16,0.000513
2,2,1554.772095,192,0.1,0.3,224,0.2,0.0,112,0.005689
3,3,1554.773071,96,0.0,0.4,128,0.0,0.3,64,0.007101
4,4,1554.778564,256,0.2,0.3,96,0.2,0.2,32,0.007836
5,5,1554.791992,96,0.1,0.0,256,0.4,0.0,64,0.001413
6,6,1554.806519,96,0.4,0.4,192,0.3,0.3,16,0.00358
7,7,1554.762695,96,0.1,0.0,96,0.4,0.3,128,0.000246
8,8,1554.769043,96,0.4,0.2,32,0.2,0.3,128,0.000739
9,9,1554.783325,32,0.3,0.0,160,0.4,0.1,48,0.003748


In [None]:
# Get the optimal hyperparameters
# num_trials=1 requests a single result; [0] takes it out of the returned list.
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

# Report a subset of the winning values (units, dropout rates, learning rate).
print(f"""
The hyperparameter search is complete. The optimal number of units in the first LSTM layer is {best_hps.get('units1')},
the optimal dropout rates are {best_hps.get('dropout1')} for the first dropout layer and {best_hps.get('dropout2')} for the second dropout layer,
the optimal number of units in the second LSTM layer is {best_hps.get('units2')}, and the optimal learning rate for the optimizer
is {best_hps.get('learning_rate')}.
""")


The hyperparameter search is complete. The optimal number of units in the first LSTM layer is 96,
the optimal dropout rates are 0.0 for the first dropout layer and 0.30000000000000004 for the second dropout layer,
the optimal number of units in the second LSTM layer is 96, and the optimal learning rate for the optimizer
is 0.0002461390263263757.



In [None]:
# Rebuild a fresh (untrained) model from the best hyperparameters and train it
# for the full 50 epochs, tracking the loss on the validation windows.
model = tuner.hypermodel.build(best_hps)
history = model.fit(
    X_train_scaled,
    y_train,
    epochs=50,
    validation_data=(X_val_scaled, y_val),
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [None]:
# Predict on the scaled test windows. The model ends in Dense(1), so the
# result has one column per sample, i.e. shape (n_samples, 1).
predicted_prices = model.predict(X_test_scaled)



In [None]:
# predicted_prices has shape (n_samples, 1) while y_test has shape (n_samples,).
# Subtracting them directly broadcasts to an (n_samples, n_samples) matrix and
# averages every prediction against every target, which is not the MSE.
# Flatten both to 1-D so the errors are computed element by element.
test_errors = np.ravel(predicted_prices) - np.ravel(y_test)
print("Test MSE:", np.mean(test_errors ** 2))