#### Referenced the kaggle notebook : https://www.kaggle.com/code/xreina8/bitcoin-price-prophet-and-lstm-models-comparison

- adding the moving average feature
- dividing by the date month year

# Original code from the class
- Divides the data into a training set (last quarter of 2018) and a test set (first day of 2019) based on date ranges.

## 1. Data exploration

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.optimizers import Adam
from keras.callbacks import EarlyStopping
from sklearn.preprocessing import MinMaxScaler

In [None]:
df = pd.read_csv('/content/Bitcoin Historical Data3 2 (1).csv')

In [None]:
df.head()

Unnamed: 0,Date,Price,Open,High,Low,Vol.,Change %
0,03/10/2024,69437.1,68360.7,69676.4,68250.3,37.43K,1.57%
1,03/09/2024,68366.5,68178.5,68576.9,67923.9,30.71K,0.29%
2,03/08/2024,68172.0,66854.4,69904.0,66170.7,112.67K,1.97%
3,03/07/2024,66855.3,66074.6,67985.5,65602.6,77.47K,1.17%
4,03/06/2024,66080.4,63794.7,67604.9,62848.7,117.91K,3.59%


In [None]:
df.columns = ['date', 'close', 'open', 'high', 'low', 'vol', 'change']

In [None]:
# 6 NA value is found in the volatility
df.isnull().sum()

date      0
close     0
open      0
high      0
low       0
vol       6
change    0
dtype: int64

## 2. Data preprocessing

### 2-1. Convert object to numberic value

In [None]:
def percentage_to_decimal(s):
    return float(s.strip('%')) / 100

In [None]:
def convert_volume(volume):

    volume_str = str(volume)

    if 'K' in volume_str:
        return float(volume_str.replace('K', '')) * 1e3
    elif 'M' in volume_str:
        return float(volume_str.replace('M', '')) * 1e6
    elif 'B' in volume_str:
        return float(volume_str.replace('B', '')) * 1e9
    else:
        return float(volume_str)

In [None]:
df['vol'] = df['vol'].apply(convert_volume)
df['close'] = df['close'].replace(',', '', regex=True).astype(float)
df['open'] = df['open'].replace(',', '', regex=True).astype(float)
df['high'] = df['high'].replace(',', '', regex=True).astype(float)
df['low'] = df['low'].replace(',', '', regex=True).astype(float)
df['change'] = df['change'].apply(percentage_to_decimal)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4985 entries, 0 to 4984
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   date    4985 non-null   object 
 1   close   4985 non-null   float64
 2   open    4985 non-null   float64
 3   high    4985 non-null   float64
 4   low     4985 non-null   float64
 5   vol     4979 non-null   float64
 6   change  4985 non-null   float64
dtypes: float64(6), object(1)
memory usage: 272.7+ KB


### 2-2. Convert date column

In [None]:
df['date'] = pd.to_datetime(df['date'])

In [None]:
df.head()

Unnamed: 0,date,close,open,high,low,vol,change
0,2024-03-10,69437.1,68360.7,69676.4,68250.3,37430.0,0.0157
1,2024-03-09,68366.5,68178.5,68576.9,67923.9,30710.0,0.0029
2,2024-03-08,68172.0,66854.4,69904.0,66170.7,112670.0,0.0197
3,2024-03-07,66855.3,66074.6,67985.5,65602.6,77470.0,0.0117
4,2024-03-06,66080.4,63794.7,67604.9,62848.7,117910.0,0.0359


### 2.3 Handling missing value

In [None]:
# Row with missing value
df[df['vol'].isnull()]

Unnamed: 0,date,close,open,high,low,vol,change
4642,2011-06-25,17.5,17.5,17.5,17.5,,0.0
4643,2011-06-24,17.5,17.5,17.5,17.5,,0.0
4644,2011-06-23,17.5,17.5,17.5,17.5,,0.0
4645,2011-06-22,17.5,17.5,17.5,17.5,,0.0
4646,2011-06-21,17.5,17.5,17.5,17.5,,0.0
4647,2011-06-20,17.5,17.5,17.5,17.5,,0.0


In [None]:
df[df['date'].between('2011-01-01', '2011-12-31')]['vol'].mean()

37676.685236768804

In [None]:
# Fill missing values with the median
vol_numeric_median = df['vol'].median()
df['vol'].fillna(vol_numeric_median, inplace=True)

In [None]:
# Reverse the dataset order so it starts with the earliest date
df = df.iloc[::-1]

### 2-4. Feature engineering

In [None]:
# # Extract year, month, and day from 'Date'
# df['Year'] = df['date'].dt.year
# df['Month'] = df['date'].dt.month
# df['Day'] = df['date'].dt.day

# # Calculate moving averages
# df['MA7'] = df['close'].rolling(window=7).mean()
# df['MA30'] = df['close'].rolling(window=30).mean()

# # Calculate previous day price and price change
# df['Prev_Day_Price'] = df['close'].shift(1)
# df['Price_Change'] = df['close'] - df['Prev_Day_Price']

In [None]:
df.index = df['date']

In [None]:
df = df.drop(columns=['date'])

In [None]:
df.head()

Unnamed: 0_level_0,close,open,high,low,vol,change
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-07-18,0.1,0.0,0.1,0.1,80.0,0.0
2010-07-19,0.1,0.1,0.1,0.1,570.0,0.0
2010-07-20,0.1,0.1,0.1,0.1,260.0,0.0
2010-07-21,0.1,0.1,0.1,0.1,580.0,0.0
2010-07-22,0.1,0.1,0.1,0.1,2160.0,0.0


#### 2-4-1. Minmax scaling

In [None]:
scaler = MinMaxScaler()

columns_to_scale = ['close', 'open', 'high', 'low', 'vol', 'change']

scaled_data = scaler.fit_transform(df[columns_to_scale])

scaled_df = pd.DataFrame(scaled_data, columns=columns_to_scale, index=df.index)

In [None]:
scaled_df

Unnamed: 0_level_0,close,open,high,low,vol,change
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-07-18,0.000000,0.000000,0.000000,0.000001,0.000000e+00,0.145185
2010-07-19,0.000000,0.000001,0.000000,0.000001,1.096197e-07,0.145185
2010-07-20,0.000000,0.000001,0.000000,0.000001,4.026846e-08,0.145185
2010-07-21,0.000000,0.000001,0.000000,0.000001,1.118568e-07,0.145185
2010-07-22,0.000000,0.000001,0.000000,0.000001,4.653244e-07,0.145185
...,...,...,...,...,...,...
2024-03-06,0.951658,0.933207,0.967111,0.920856,2.636018e-05,0.154295
2024-03-07,0.962818,0.966558,0.972555,0.961206,1.731320e-05,0.148154
2024-03-08,0.981781,0.977965,1.000000,0.969530,2.518792e-05,0.150184
2024-03-09,0.984582,0.997335,0.981015,0.995218,6.852349e-06,0.145921


In [None]:
df = scaled_df.copy()

In [None]:
 # Prepare the volume and price differences, normalize volume
BTC_vol = df["vol"].values
df_diff = df.diff().dropna()
df_diff["vol"] = np.log(1 + BTC_vol[:-1])  # Shifted by 1 to align with diff

In [None]:
# Assuming df_diff has been created by df.diff().dropna() or similar
# First, align the index of df to match df_diff after dropping NaN values
df_aligned = df.loc[df_diff.index]

# Now, proceed with mask creation and selection
mask_train = df_diff.index < "2018-12-01"
df_train = df_diff.loc[mask_train].copy()

# Use df_aligned to ensure the indexes match
# train_close = df_aligned.loc[mask_train, "close"].values

# Since indexes are aligned, this operation should now work without error
# df_train["Relative_Close"] = train_close / train_close[0]

In [None]:
mask_test = (df_diff.index  >= "2018-12-01") & (df_diff.index < "2019-01-01")  # December 2018 for testing
df_test = df_diff.loc[mask_test].copy()
# test_close = df_aligned.loc[mask_test, "close"].values
# df_test["Relative_Close"] = test_close / train_close[0]

In [None]:
# Generate dataset function
def generate_dataset(df, seq_len):
    X_list, y_list = [], []
    for i in range(len(df) - seq_len):
        X_list.append(df.iloc[i:(i+seq_len), :].values)
        y_list.append(df["close"].iloc[i + seq_len])
    return np.array(X_list), np.array(y_list)

In [None]:
LAG = 60 # Example: Use the past 60 days to predict the next day

In [None]:
# Prepare training and test datasets
X_train, y_train = generate_dataset(df_train, LAG)
X_test, y_test = generate_dataset(pd.concat((df_train.iloc[-LAG:], df_test)), LAG)

In [None]:
X_train.shape[1]

60

In [None]:
X_train.shape[2]

6

In [None]:
# Model architecture (unchanged from the minute-by-minute example)
tf.keras.utils.set_random_seed(4002)
model = Sequential([
    LSTM(50, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])),
    LSTM(50, dropout=0.2),
    Dense(25),
    Dense(1)
])

model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')

In [None]:
# Define early stopping criteria
early_stopping = EarlyStopping(monitor='loss', patience=5, verbose=1, mode='min', restore_best_weights=True)

In [None]:
# Train the model
model.fit(X_train, y_train, batch_size=32, epochs=100, shuffle=True, callbacks=[early_stopping])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 9: early stopping


<keras.src.callbacks.History at 0x7a70eba6da50>

In [None]:
predicted_prices = model.predict(X_test)



In [None]:
# Reshape predicted_prices to match the expected shape
predicted_prices_reshaped = np.zeros((predicted_prices.shape[0], df[columns_to_scale].shape[1]))
predicted_prices_reshaped[:, 0] = predicted_prices[:, 0]  # Assuming the first column is the predicted closing price

# Apply inverse transformation
predicted_prices_original_scale = scaler.inverse_transform(predicted_prices_reshaped)


In [None]:
predicted_closing_prices = predicted_prices_original_scale[:, 0]

In [None]:
print("Test MSE:", np.mean((predicted_closing_prices- y_test)**2))

Test MSE: 13.245655014072495
