### IMPORT REQUIREMENTS

In [184]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ta
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import InputLayer, LSTM, Dense
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras.optimizers import Adam

In [33]:
XAU = pd.read_csv('./XAU_USD Historical Data.csv')
BTC = pd.read_csv('./Bitcoin Historical Data.csv')

In [34]:
print("----- GOLD -------------\n", XAU.head())
print("\n----- BITCOIN ----------\n", BTC.head())

----- GOLD -------------
          Date     Price      Open      High       Low  Vol. Change %
0  12/06/2023  2,028.76  2,018.97  2,035.93  2,018.82   NaN    0.46%
1  12/05/2023  2,019.42  2,030.15  2,041.33  2,010.02   NaN   -0.51%
2  12/04/2023  2,029.74  2,071.25  2,135.90  2,020.34   NaN   -1.99%
3  12/01/2023  2,070.90  2,034.49  2,075.34  2,033.75   NaN    1.73%
4  11/30/2023  2,035.75  2,044.80  2,047.59  2,031.84   NaN   -0.43%

----- BITCOIN ----------
          Date     Price      Open      High       Low     Vol. Change %
0  12/06/2023  43,734.6  44,076.2  44,144.5  43,466.7  101.41K   -0.78%
1  12/05/2023  44,076.2  41,989.6  44,424.1  41,424.9   96.84K    4.97%
2  12/04/2023  41,987.8  39,968.6  42,394.4  39,968.6  104.21K    5.05%
3  12/03/2023  39,970.2  39,456.8  40,178.9  39,280.3   35.27K    1.30%
4  12/02/2023  39,458.4  38,688.2  39,673.4  38,646.5   37.09K    1.99%


### DATA PREPROCESSING

* Data Cleaning

In [35]:
def df_CleanTransform(df):

    # change 'Date' to datetime format & sort by 'Date'
    df['Date'] = pd.to_datetime(df['Date'])
    df.sort_values(by='Date', inplace=True)

    # change data type to float
    df['High'] = df['High'].str.replace(',', '').astype(float)
    df['Low'] = df['Low'].str.replace(',', '').astype(float)
    df['Open'] = df['Open'].str.replace(',', '').astype(float)
    df['Price'] = df['Price'].str.replace(',', '').astype(float)
    
    df['Change %'] = df['Change %'].str.rstrip('%').astype('float') / 100.0

    df.replace(np.nan, '12345.12345', inplace=True)
    df[df['Vol.'] == '12345.12345']
    df['Volume'] = df['Vol.'].str[:-1].astype(float)
    df['m'] = df['Vol.'].str[-1]
    change_value = {'K': 1000, 'M': 1000000, 'B': 1000000000, '5': 1}
    df['m'] = df['m'].map(change_value)
    df['Vol.'] = df['Volume']*df['m']
    df.replace(12345.1234, np.nan, inplace=True)
    df.drop(['Volume', 'm'], axis=1, inplace=True)

    return df

XAU = df_CleanTransform(XAU)
BTC = df_CleanTransform(BTC)

In [36]:
print("----- nulls in the GOLD dataset -------------\n", XAU.isna().sum())
print("\n----- nulls in the BITCOIN dataset ----------\n", BTC.isna().sum())

----- nulls in the GOLD dataset -------------
 Date           0
Price          0
Open           0
High           0
Low            0
Vol.        3889
Change %       0
dtype: int64

----- nulls in the BITCOIN dataset ----------
 Date        0
Price       0
Open        0
High        0
Low         0
Vol.        6
Change %    0
dtype: int64


In [37]:
XAU.drop('Vol.', axis=1, inplace=True)
BTC.dropna(subset=['Vol.'], inplace=True)

* Technical Feature Extraction

In [38]:
def technical_indicators(df):

    # Rate Of Change
    df['ROC_26'] = ta.momentum.ROCIndicator(close=df['Price'], window=26).roc()
    df['ROC_34'] = ta.momentum.ROCIndicator(close=df['Price'], window=34).roc()
    df['ROC_52'] = ta.momentum.ROCIndicator(close=df['Price'], window=52).roc()
    df['ROC_68'] = ta.momentum.ROCIndicator(close=df['Price'], window=68).roc()    
    df['ROC_78'] = ta.momentum.ROCIndicator(close=df['Price'], window=78).roc()
    df['ROC_102'] = ta.momentum.ROCIndicator(close=df['Price'], window=102).roc()

    # Exponential Moving Average
    df['EMA_14'] = ta.trend.EMAIndicator(close=df['Price'], window=14).ema_indicator() # short-term
    df['EMA_50'] = ta.trend.EMAIndicator(close=df['Price'], window=50).ema_indicator() # mid-term
    df['EMA_120'] = ta.trend.EMAIndicator(close=df['Price'], window=120).ema_indicator() # long-term

    # Relative Strength Index
    df['RSI_28'] = ta.momentum.RSIIndicator(close=df['Price'], window=28).rsi() #mid-term
    df['RSI_28_SMA14'] = ta.trend.SMAIndicator(close=df['RSI_28'], window=14).sma_indicator() # simple moving average of RSI

    # Moving Average Convergence Divergence - LongTerm
    df['MACD_long_macd'] = ta.trend.MACD(close=df['Price'], window_slow=120, window_fast=52, window_sign=20).macd()
    df['MACD_long_diff'] = ta.trend.MACD(close=df['Price'], window_slow=120, window_fast=52, window_sign=20).macd_diff()
    df['MACD_long_signal'] = ta.trend.MACD(close=df['Price'], window_slow=120, window_fast=52, window_sign=20).macd_signal()

    # Moving Average Convergence Divergence - MidTerm
    df['MACD_mid_macd'] = ta.trend.MACD(close=df['Price'], window_slow=52, window_fast=26, window_sign=10).macd()
    df['MACD_mid_diff'] = ta.trend.MACD(close=df['Price'], window_slow=52, window_fast=26, window_sign=10).macd_diff()
    df['MACD_mid_signal'] = ta.trend.MACD(close=df['Price'], window_slow=52, window_fast=26, window_sign=10).macd_signal()

    # Bollinger Bands - LongTerm
    df['BB_50_high'] = ta.volatility.BollingerBands(close=df['Price'], window=50, window_dev=2).bollinger_hband()
    df['BB_50_low'] = ta.volatility.BollingerBands(close=df['Price'], window=50, window_dev=2).bollinger_lband()
    df['BB_50_width'] = ta.volatility.BollingerBands(close=df['Price'], window=50, window_dev=2).bollinger_wband()
    df['BB_50_percentage'] = ta.volatility.BollingerBands(close=df['Price'], window=50, window_dev=2).bollinger_pband()

    # Bollinger Bands - MidTerm
    df['BB_20_high'] = ta.volatility.BollingerBands(close=df['Price'], window=20, window_dev=1.5).bollinger_hband()
    df['BB_20_low'] = ta.volatility.BollingerBands(close=df['Price'], window=20, window_dev=1.5).bollinger_lband()
    df['BB_20_width'] = ta.volatility.BollingerBands(close=df['Price'], window=20, window_dev=1.5).bollinger_wband()
    df['BB_20_percentage'] = ta.volatility.BollingerBands(close=df['Price'], window=20, window_dev=1.5).bollinger_pband()

    # Ichimoku Cloud
    df['IchiCloud_SpanA'] = ta.trend.IchimokuIndicator(high=df['High'], low=df['Low'], visual=True).ichimoku_a()
    df['IchiCloud_SpanB'] = ta.trend.IchimokuIndicator(high=df['High'], low=df['Low'], visual=True).ichimoku_b()
    df['IchiCloud_KijunSen'] = ta.trend.IchimokuIndicator(high=df['High'], low=df['Low'], visual=True).ichimoku_base_line()
    df['IchiCloud_TenkanSen'] = ta.trend.IchimokuIndicator(high=df['High'], low=df['Low'], visual=True).ichimoku_conversion_line()

    return df

XAU = technical_indicators(XAU)
BTC = technical_indicators(BTC)

In [None]:
fig, axes = plt.subplots(nrows=len(XAU.columns[1:]), ncols=2, figsize=(18, 6 * len(XAU.columns)))

for i, column in enumerate(XAU.columns[1:]):
    # Distribution plot
    sns.histplot(XAU[column], kde=True, ax=axes[i, 0], color='skyblue', stat='density')
    axes[i, 0].set_title(f'Distribution of {column}')

    # Raw data over time
    XAU[column].plot(ax=axes[i, 1], color='orange')
    axes[i, 1].set_title(f'Raw Data Over Time - {column}')
    axes[i, 1].invert_xaxis() 

In [87]:
xau_test = XAU.copy()

In [88]:
xau_test['target_change26'] = xau_test['ROC_26'].shift(-26)
xau_test['target_change34'] = xau_test['ROC_34'].shift(-34)
xau_test['target_change52'] = xau_test['ROC_52'].shift(-52)
xau_test['target_change68'] = xau_test['ROC_68'].shift(-68)
xau_test['target_change78'] = xau_test['ROC_78'].shift(-78)
xau_test['target_change102'] = xau_test['ROC_102'].shift(-102)

xau_test.drop(['Date', 'Open', 'High', 'Low'], axis=1, inplace=True)

xau_test.dropna(subset=xau_test.columns, inplace=True)

xau_test

Unnamed: 0,Price,Change %,ROC_26,ROC_34,ROC_52,ROC_68,ROC_78,ROC_102,EMA_14,EMA_50,...,IchiCloud_SpanA,IchiCloud_SpanB,IchiCloud_KijunSen,IchiCloud_TenkanSen,target_change26,target_change34,target_change52,target_change68,target_change78,target_change102
3750,925.60,0.0050,-2.675990,-2.465753,4.316466,5.253582,-0.904662,-6.815665,925.097736,930.216893,...,952.2250,927.9,936.125,924.025,1.777226,2.760372,7.470830,13.791054,12.888937,30.834054
3749,939.45,0.0150,-1.602514,-2.120233,3.996236,6.555890,1.776718,-5.416562,927.011371,930.578975,...,955.3000,927.9,936.125,924.475,0.149023,1.905370,5.567087,13.225824,12.752142,23.657459
3748,937.35,-0.0022,-1.812182,-4.352041,4.579940,4.960529,2.185763,-2.779650,928.389855,930.844506,...,955.3000,927.9,934.350,924.475,1.792287,4.326025,5.718248,12.551342,15.671841,23.571771
3747,937.70,0.0004,-1.811518,-3.845365,2.942145,5.377311,2.129282,-1.320705,929.631208,931.113349,...,957.7750,927.9,932.475,924.475,0.554548,5.748107,5.812093,12.973232,16.519143,20.512957
3746,949.35,0.0124,1.113005,-3.310078,4.318444,6.477120,2.366832,0.407192,932.260380,931.828511,...,956.0000,927.9,930.950,930.950,-0.452941,4.745352,6.146311,11.697477,14.915468,18.891873
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,1931.99,0.0036,-1.501950,-1.278481,-2.897997,-4.373500,-1.908529,5.145746,1924.951719,1943.819569,...,1982.5450,2006.4,1933.050,1915.850,-2.082309,-0.638202,-0.642860,-0.015010,3.819378,5.370628
105,1957.09,0.0130,-0.293451,0.863251,-1.259302,-2.536329,-1.826436,6.271754,1929.236823,1944.339978,...,1982.1000,2006.4,1933.050,1930.190,-3.484766,-1.020393,-1.640190,-1.923775,1.982024,5.815267
104,1960.19,0.0016,1.059996,0.712109,-2.781856,-1.480662,-0.861310,6.466103,1933.363913,1944.961547,...,1982.1000,2006.4,1933.050,1933.185,-3.673113,-0.915728,-2.271719,-1.924813,1.148358,3.548125
103,1954.93,-0.0027,-0.652010,0.622285,-4.689168,-2.406721,-0.083309,6.575189,1936.239391,1945.352467,...,1980.2425,2006.4,1933.050,1933.245,-3.119805,-0.777010,-2.784754,-0.370346,1.392377,3.298839


In [161]:
def df_to_XY(df, window_size, target_len = 6):

    X_df = df.iloc[:,:-target_len].to_numpy()
    Y_df = df.iloc[:,-target_len:].to_numpy()

    X = []
    Y = []
    
    for i in range(len(X_df) - window_size):
        inputs = [row for row in X_df[i:i+window_size]]
        X.append(inputs)
        outputs = Y_df[i+window_size] # [[y] for y in Y_df[i+window_size]]  
        Y.append(outputs)

    return np.array(X), np.array(Y)

In [162]:
def split_dataframe_tvt(np_array, test_split, validation_split):

    test_split_index = int(test_split * len(np_array))

    train_split = np_array[:test_split_index]
    test_set = np_array[test_split_index:]

    validation_split_index = int(validation_split * len(train_split))
    train_set = train_split[:validation_split_index]
    validation_set = train_split[validation_split_index:]

    sets = {'Train': train_set, 'Validation': validation_set, 'Test': test_set}

    return sets

In [172]:
def output_normalizer(train, validation, test):

    i = 0 
    while i < train.shape[-1]:
      
        min_, max_ = np.min(train[:, i]), np.max(train[:, i])

        train[:, i] = (train[:, i] - min_) / (max_ - min_)
        validation[:, i] = (validation[:, i] - min_) / (max_ - min_)
        test[:, i] = (test[:, i] - min_) / (max_ - min_)

        i += 1

    return train, validation, test

def input_normalizer(train, validation, test):

    i = 0 
    while i < train.shape[-1]:
      
        min_, max_ = np.min(train[:, :, i]), np.max(train[:, :, i])

        train[:, :, i] = (train[:, :, i] - min_) / (max_ - min_)
        validation[:, :, i] = (validation[:, :, i] - min_) / (max_ - min_)
        test[:, :, i] = (test[:, :, i] - min_) / (max_ - min_)

        i += 1

    return train, validation, test


In [185]:
X, Y = df_to_XY(xau_test, 50) 
X_train, X_validation, X_test = split_dataframe_tvt(X, 0.8, 0.7)['Train'], split_dataframe_tvt(X, 0.8, 0.7)['Validation'], split_dataframe_tvt(X, 0.8, 0.7)['Test']
Y_train, Y_validation, Y_test = split_dataframe_tvt(Y, 0.8, 0.7)['Train'], split_dataframe_tvt(Y, 0.8, 0.7)['Validation'], split_dataframe_tvt(Y, 0.8, 0.7)['Test']

X_train, X_validation, X_test = input_normalizer(X_train, X_validation, X_test)
Y_train, Y_validation, Y_test = output_normalizer(Y_train, Y_validation, Y_test)

In [186]:
model_LSTM_test = Sequential()
model_LSTM_test.add(InputLayer((50, 31)))
model_LSTM_test.add(LSTM(128))
model_LSTM_test.add(Dense(8, 'relu'))
model_LSTM_test.add(Dense(2, 'linear'))

model_LSTM_test.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_3 (LSTM)               (None, 128)               81920     
                                                                 
 dense_2 (Dense)             (None, 8)                 1032      
                                                                 
 dense_3 (Dense)             (None, 2)                 18        
                                                                 
Total params: 82970 (324.10 KB)
Trainable params: 82970 (324.10 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [175]:
# CP_LSTM = ModelCheckpoint('LSTM_TEST/', save_best_only=True)
model_LSTM_test.compile(loss=MeanSquaredError(), optimizer=Adam(learning_rate=0.0001), metrics=[RootMeanSquaredError()])
model_LSTM_test.fit(X_train, Y_train, validation_data=(X_validation, Y_validation), epochs=50) #callbacks=[CP_LSTM]

(2015, 50, 31)