In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import talib

In [2]:
#Load data

btc_data = pd.read_csv(r'../Data/Input/btc_data.csv')
btc_data['Date'] = pd.to_datetime(btc_data['Date'])
btc_data.set_index('Date', inplace=True)


In [3]:
#Moving averages EMA
btc_data['EMA_7'] = talib.EMA(btc_data['Closing Price (USD)'].values, timeperiod=7)
btc_data['EMA_20'] = talib.EMA(btc_data['Closing Price (USD)'].values, timeperiod=20)
btc_data['EMA_50'] = talib.EMA(btc_data['Closing Price (USD)'].values, timeperiod=50)
btc_data['EMA_100'] = talib.EMA(btc_data['Closing Price (USD)'].values, timeperiod=100)

#Moving averages WMA

btc_data['WMA_7'] = talib.WMA(btc_data['Closing Price (USD)'].values, timeperiod=7)
btc_data['WMA_20'] = talib.WMA(btc_data['Closing Price (USD)'].values, timeperiod=20)
btc_data['WMA_50'] = talib.WMA(btc_data['Closing Price (USD)'].values, timeperiod=50)
btc_data['WMA_100'] = talib.WMA(btc_data['Closing Price (USD)'].values, timeperiod=100)

# Interaction between short-term and long-term moving averages
btc_data['WMA7_WMA20'] = btc_data['WMA_7'] * btc_data['WMA_20']
btc_data['EMA20_EMA50'] = btc_data['EMA_20'] * btc_data['EMA_50']


In [4]:
#Volatility
btc_data['7_day_volatility'] = btc_data['Closing Price (USD)'].rolling(window=7).std()
btc_data['30_day_volatility'] = btc_data['Closing Price (USD)'].rolling(window=30).std()


#Bollinger Bands: A volatility indicator
upperband, middleband, lowerband = talib.BBANDS(btc_data['Closing Price (USD)'], timeperiod=20, nbdevup=2, nbdevdn=2, matype=0)
btc_data['BB_upper'] = upperband
btc_data['BB_middle'] = middleband
btc_data['BB_lower'] = lowerband

In [5]:
# Additional technical indicators

#RSI
btc_data['RSI'] = talib.RSI(btc_data['Closing Price (USD)'].values, timeperiod=14)

#MACD
btc_data['MACD'], btc_data['MACD_signal'], _ = talib.MACD(btc_data['Closing Price (USD)'].values, fastperiod=12, slowperiod=26, signalperiod=9)

#ATR
btc_data['ATR'] = talib.ATR(btc_data['24h High (USD)'].values, btc_data['24h Low (USD)'].values, btc_data['Closing Price (USD)'].values, timeperiod=14)

#Stochastic
btc_data['Stoch_k'], btc_data['Stoch_d'] = talib.STOCH(btc_data['24h High (USD)'], btc_data['24h Low (USD)'], btc_data['Closing Price (USD)'], fastk_period=14, slowk_period=3, slowk_matype=0, slowd_period=3, slowd_matype=0)

#Ichimoku Cloud
high_prices = btc_data['24h High (USD)']
close_prices = btc_data['Closing Price (USD)']
low_prices = btc_data['24h Low (USD)']

nine_period_high = high_prices.rolling(window=9).max()
nine_period_low = low_prices.rolling(window=9).min()
btc_data['Ichimoku_Span_A'] = (nine_period_high + nine_period_low) / 2

twenty_six_period_high = high_prices.rolling(window=26).max()
twenty_six_period_low = low_prices.rolling(window=26).min()
btc_data['Ichimoku_Span_B'] = (twenty_six_period_high + twenty_six_period_low) / 2



In [6]:
#logarithmic return
btc_data['log_return'] = np.log(btc_data['Closing Price (USD)'] / btc_data['Closing Price (USD)'].shift(1))

# Rolling mean and standard deviation for a 14-day window
btc_data['14_day_rolling_mean'] = btc_data['Closing Price (USD)'].rolling(window=14).mean()
btc_data['14_day_rolling_std'] = btc_data['Closing Price (USD)'].rolling(window=14).std()

#Price Rate of Change
btc_data['ROC'] = btc_data['Closing Price (USD)'].pct_change(periods=14)

In [7]:
# Creating lag features
for lag in range(1, 4):
    btc_data[f'lag_{lag}'] = btc_data['Closing Price (USD)'].shift(lag)



In [8]:
btc_data.head(20)

Unnamed: 0_level_0,Closing Price (USD),24h Open (USD),24h High (USD),24h Low (USD),Closing_Price_Diff,EMA_7,EMA_20,EMA_50,EMA_100,WMA_7,...,Stoch_d,Ichimoku_Span_A,Ichimoku_Span_B,log_return,14_day_rolling_mean,14_day_rolling_std,ROC,lag_1,lag_2,lag_3
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-01-01,663.034,674.008,674.852,660.693,,,,,,,...,,,,,,,,,,
2017-01-02,667.747,663.034,668.197,663.034,4.713,,,,,,...,,,,0.007083,,,,663.034,,
2017-01-03,655.725,667.747,668.058,646.154,-12.022,,,,,,...,,,,-0.018168,,,,667.747,663.034,
2017-01-04,655.895,655.725,657.438,649.758,0.17,,,,,,...,,,,0.000259,,,,655.725,667.747,663.034
2017-01-05,661.284,655.895,664.918,653.583,5.389,,,,,,...,,,,0.008183,,,,655.895,655.725,667.747
2017-01-06,656.418,661.284,662.306,654.12799,-4.866,,,,,,...,,,,-0.007386,,,,661.284,655.895,655.725
2017-01-07,651.82899,656.418,657.72799,645.65,-4.58901,658.847427,,,,657.036212,...,,,,-0.007016,,,,656.418,661.284,655.895
2017-01-08,657.91599,651.82899,660.51,648.92499,6.087,658.614568,,,,656.803352,...,,,,0.009295,,,,651.82899,656.418,661.284
2017-01-09,656.427,657.91599,659.631,655.16099,-1.48899,658.067676,,,,656.381032,...,,660.251,,-0.002266,,,,657.91599,651.82899,656.418
2017-01-10,657.681,656.427,659.25299,655.385,1.254,657.971007,,,,656.676497,...,,656.9235,,0.001909,,,,656.427,657.91599,651.82899


In [9]:
# Cyclical encoding for day and month

btc_data['day_sin'] = np.sin(btc_data.index.day * (2. * np.pi / 31))
btc_data['day_cos'] = np.cos(btc_data.index.day * (2. * np.pi / 31))
btc_data['month_sin'] = np.sin((btc_data.index.month - 1) * (2. * np.pi / 12))
btc_data['month_cos'] = np.cos((btc_data.index.month - 1) * (2. * np.pi / 12))




In [10]:
# Handling NaN values
btc_data.fillna(method='ffill', inplace=True)
btc_data.fillna(0, inplace=True)


In [11]:

# Splitting data into training, testing, and validation sets based on dates
train_data = btc_data[btc_data.index.year < 2020]
test_data = btc_data[(btc_data.index.year == 2020) & (btc_data.index.month < 6)]
validation_data = btc_data[(btc_data.index.year == 2020) & (btc_data.index.month >= 6)]


In [12]:
# Scaling features
scaler = StandardScaler()
features_to_scale = [
    'log_return', 'WMA_7', 'WMA_20', 'WMA_50', 'WMA_100',
    'EMA_7', 'EMA_20', 'EMA_50', 'EMA_100', '7_day_volatility',
    '30_day_volatility', 'RSI', 'MACD', 'MACD_signal', 'ATR',
    'Stoch_k', 'Stoch_d', 'day_sin', 'day_cos', 'month_sin', 'month_cos',
    'WMA7_WMA20', 'EMA20_EMA50', '14_day_rolling_mean', '14_day_rolling_std', 'ROC'
]

btc_data[features_to_scale] = scaler.fit_transform(btc_data[features_to_scale])

In [13]:
# Fit the scaler on the training data only
scaler.fit(train_data[features_to_scale])

# Scale the training data
train_data[features_to_scale] = scaler.transform(train_data[features_to_scale])

# Scale the testing data
test_data[features_to_scale] = scaler.transform(test_data[features_to_scale])

# Scale the out-of-sample data
validation_data[features_to_scale] = scaler.transform(validation_data[features_to_scale])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data[features_to_scale] = scaler.transform(train_data[features_to_scale])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data[features_to_scale] = scaler.transform(test_data[features_to_scale])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  validation_data[features_to_scale] = scaler.tra

In [14]:
# Saving testing and training data

train_data.to_csv(r'..\Data\output\train_data.csv', index=True)
test_data.to_csv(r'..\Data\output\test_data.csv', index=True)
validation_data.to_csv(r'..\Data\output\validation_data.csv', index=True)