## Trading Algorithm XGBoost

*Import essential libraries.*

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Data Preprocessing

*Import dataset of historical prices.*

In [4]:
trade_data = pd.read_csv('ETHBULL-USD-historical.csv')
trade_data.describe()

Unnamed: 0,close,high,low,open,time,volume,avg
count,66634.0,66634.0,66634.0,66634.0,66634.0,66634.0,66634.0
mean,2164.67516,2172.15718,2157.222371,2164.81404,1575084000000.0,246.630589,2164.7446
std,1632.501927,1637.489664,1627.691797,1632.735182,5772850000.0,4306.713785,1632.592313
min,37.7,43.81,34.98,38.68,1565086000000.0,0.0,41.145
25%,927.5,930.25,924.5,927.5,1570083000000.0,0.0,927.25
50%,2030.5,2039.0,2024.0,2030.5,1575086000000.0,0.0,2031.0
75%,2873.75,2887.0,2860.0,2873.0,1580083000000.0,0.0,2874.5
max,10047.5,10077.5,9937.5,10047.5,1585081000000.0,608227.8244,10000.0


*Make max_trade_data, a dataframe that contains the maximum value of the next 20 closing prices for each row. Also, the 'change' columns is initialized.*

In [5]:
max_trade_data = trade_data[['close','high','low','open']].rolling(20).max()
max_trade_data = max_trade_data.shift(periods=-20)
max_trade_data['change']=0

*Delete last 20 rows as the rolling makes them NaN.*

In [6]:
trade_data = trade_data[:-20]
max_trade_data = max_trade_data[:-20]

*Assign 1 in the 'change' column for rows that have a max_trade_data 'close' value larger than the trade_data 'close', and viceversa for -1. This is done in such a way that the value of 'change' is 1 if the closing value increased in the next 20 rows, it's -1 if the closing value decreased and 0 if it reached the same value.*

In [7]:
max_trade_data.loc[max_trade_data['close'] > trade_data['close'],'change']=1
max_trade_data.loc[max_trade_data['close'] < trade_data['close'],'change']=-1

In [8]:
trade_data['change']=max_trade_data['change']

*Make indicators*

In [9]:
trade_data['close_slope'] = trade_data['close'].diff()

trade_data['ema_26']=pd.Series.ewm(trade_data['close'],span=26).mean()
trade_data['ema_26_slope'] = trade_data['ema_26'].diff()
trade_data['std_26']=trade_data['close'].rolling(26).std()
trade_data['std_26_slope'] = trade_data['std_26'].diff()

trade_data['ema_9']=pd.Series.ewm(trade_data['close'],span=9).mean()
trade_data['ema_9_slope'] = trade_data['ema_9'].diff()
trade_data['std_9']=trade_data['close'].rolling(9).std()
trade_data['std_9_slope'] = trade_data['std_9'].diff()

trade_data['ema_20']=pd.Series.ewm(trade_data['close'],span=9).mean()
trade_data['ema_20_slope'] = trade_data['ema_20'].diff()
trade_data['std_20']=trade_data['close'].rolling(20).std()
trade_data['std_20_slope'] = trade_data['std_20'].diff()

Delete the first 26 rows as they are NaN.

In [10]:
trade_data = trade_data[26:]

*Divide into target and features*

In [11]:
trade_data_target = trade_data['change']
trade_data_features = trade_data.drop(['change'],axis=1)

In [12]:
trade_data_features.head()

Unnamed: 0,close,high,low,open,startTime,time,volume,avg,close_slope,ema_26,...,std_26,std_26_slope,ema_9,ema_9_slope,std_9,std_9_slope,ema_20,ema_20_slope,std_20,std_20_slope
26,8867.5,8875.0,8785.0,8832.5,2019-08-06T12:10:00+00:00,1565093000000.0,0.0,8850.0,35.0,9048.948352,...,378.682128,-15.45954,8923.205288,-13.968539,78.708102,-7.614332,8923.205288,-13.968539,113.863015,4.044085
27,8912.5,8937.5,8860.0,8867.5,2019-08-06T12:15:00+00:00,1565094000000.0,0.0,8890.0,45.0,9037.515888,...,347.827765,-30.854363,8921.060081,-2.145207,57.605579,-21.102522,8921.060081,-2.145207,99.98059,-13.882425
28,8917.5,8940.0,8910.0,8912.5,2019-08-06T12:20:00+00:00,1565094000000.0,0.0,8915.0,5.0,9027.556954,...,297.213469,-50.614296,8920.346961,-0.71312,41.570723,-16.034857,8920.346961,-0.71312,86.10963,-13.87096
29,8877.5,8937.5,8875.0,8917.5,2019-08-06T12:25:00+00:00,1565094000000.0,0.0,8897.5,-40.0,9015.215126,...,251.483292,-45.730177,8911.766948,-8.580014,37.578621,-3.992102,8911.766948,-8.580014,87.04383,0.934199
30,8837.5,8920.0,8837.5,8877.5,2019-08-06T12:30:00+00:00,1565095000000.0,0.0,8857.5,-40.0,9000.716981,...,199.246103,-52.237189,8896.898833,-14.868114,39.747467,2.168845,8896.898833,-14.868114,87.777661,0.733831


*Drop datetime column not adequate for xgboost*

In [13]:
trade_data_features = trade_data_features.drop(['startTime'],axis=1)

### Model Training

In [14]:
import xgboost as xgb
from sklearn.model_selection import train_test_split

*Divide dataset into train and test data.*

In [15]:
X_train, X_test, y_train, y_test = train_test_split(trade_data_features, trade_data_target, test_size=0.33, random_state=42)

In [19]:
X_train_t,X_test_t,y_train_t,y_test_t = trade_data_features[:44000],trade_data_features[44000:],trade_data_target[:44000],trade_data_target[44000:]

*Initialize xgboost model*

In [20]:
model=xgb.XGBClassifier(random_state=1,learning_rate=0.01)
model_t = xgb.XGBClassifier(random_state=1,learning_rate=0.01)

*Train xgboost model.*

In [17]:
model.fit(X_train,y_train)

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.01, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=1, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method=None, validate_parameters=False, verbosity=None)

In [21]:
model_t.fit(X_train_t,y_train_t)

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.01, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=1, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method=None, validate_parameters=False, verbosity=None)

In [18]:
model.score(X_test,y_test)

0.8558816837315131

In [22]:
model_t.score(X_test,y_test)

0.8496928327645051

In [23]:
trade_data_target.value_counts()

 1    56865
-1     8249
 0     1474
Name: change, dtype: int64

In [30]:
new_y_train = y_train[y_train==1][:5573]
new_X_train = X_train[y_train==1][:5573]

In [31]:
new_y_train = pd.concat([y_train[y_train!=1],new_y_train])
new_X_train = pd.concat([X_train[y_train!=1],new_X_train])

In [32]:
model_new = xgb.XGBClassifier(random_state=1,learning_rate=0.01)

In [33]:
model_new.fit(new_X_train,new_y_train)

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.01, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=1, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method=None, validate_parameters=False, verbosity=None)

In [34]:
model_new.score(X_test,y_test)

0.5837087599544938