## Trading Algorithm XGBoost

*Import essential libraries.*

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Data Preprocessing

*Import dataset of historical prices.*

In [2]:
trade_data = pd.read_csv('ETHBULL-USD-historical.csv')
trade_data.describe()

Unnamed: 0,close,high,low,open,time,volume,avg
count,66634.0,66634.0,66634.0,66634.0,66634.0,66634.0,66634.0
mean,2164.67516,2172.15718,2157.222371,2164.81404,1575084000000.0,246.630589,2164.7446
std,1632.501927,1637.489664,1627.691797,1632.735182,5772850000.0,4306.713785,1632.592313
min,37.7,43.81,34.98,38.68,1565086000000.0,0.0,41.145
25%,927.5,930.25,924.5,927.5,1570083000000.0,0.0,927.25
50%,2030.5,2039.0,2024.0,2030.5,1575086000000.0,0.0,2031.0
75%,2873.75,2887.0,2860.0,2873.0,1580083000000.0,0.0,2874.5
max,10047.5,10077.5,9937.5,10047.5,1585081000000.0,608227.8244,10000.0


*Make max_trade_data, a dataframe that contains the maximum value of the next 20 closing prices for each row. Also, the 'change' columns is initialized.*

In [3]:
max_trade_data = trade_data[['close','high','low','open']].rolling(20).max()
max_trade_data = max_trade_data.shift(periods=-20)
max_trade_data['change']=0

*Delete last 20 rows as the rolling makes them NaN.*

In [4]:
trade_data = trade_data[:-20]
max_trade_data = max_trade_data[:-20]

*Assign 1 in the 'change' column for rows that have a max_trade_data 'close' value larger than the trade_data 'close', and viceversa for -1. This is done in such a way that the value of 'change' is 1 if the closing value increased in the next 20 rows, it's -1 if the closing value decreased and 0 if it reached the same value.*

In [5]:
max_trade_data.loc[max_trade_data['close'] > trade_data['close'],'change']=1
max_trade_data.loc[max_trade_data['close'] < trade_data['close'],'change']=-1

In [6]:
trade_data['change']=max_trade_data['change']

*Make indicators*

In [7]:
trade_data['close_slope'] = trade_data['close'].diff()

trade_data['ema_26']=pd.Series.ewm(trade_data['close'],span=26).mean()
trade_data['ema_26_slope'] = trade_data['ema_26'].diff()
trade_data['std_26']=trade_data['close'].rolling(26).std()
trade_data['std_26_slope'] = trade_data['std_26'].diff()

trade_data['ema_9']=pd.Series.ewm(trade_data['close'],span=9).mean()
trade_data['ema_9_slope'] = trade_data['ema_9'].diff()
trade_data['std_9']=trade_data['close'].rolling(9).std()
trade_data['std_9_slope'] = trade_data['std_9'].diff()

trade_data['ema_20']=pd.Series.ewm(trade_data['close'],span=9).mean()
trade_data['ema_20_slope'] = trade_data['ema_20'].diff()
trade_data['std_20']=trade_data['close'].rolling(20).std()
trade_data['std_20_slope'] = trade_data['std_20'].diff()

Delete the first 26 rows as they are NaN.

In [8]:
trade_data = trade_data[26:]

*Divide into target and features*

In [9]:
trade_data_target = trade_data['change']
trade_data_features = trade_data.drop(['change'],axis=1)

In [10]:
trade_data_features.head()

Unnamed: 0,close,high,low,open,startTime,time,volume,avg,close_slope,ema_26,...,std_26,std_26_slope,ema_9,ema_9_slope,std_9,std_9_slope,ema_20,ema_20_slope,std_20,std_20_slope
26,8867.5,8875.0,8785.0,8832.5,2019-08-06T12:10:00+00:00,1565093000000.0,0.0,8850.0,35.0,9048.948352,...,378.682128,-15.45954,8923.205288,-13.968539,78.708102,-7.614332,8923.205288,-13.968539,113.863015,4.044085
27,8912.5,8937.5,8860.0,8867.5,2019-08-06T12:15:00+00:00,1565094000000.0,0.0,8890.0,45.0,9037.515888,...,347.827765,-30.854363,8921.060081,-2.145207,57.605579,-21.102522,8921.060081,-2.145207,99.98059,-13.882425
28,8917.5,8940.0,8910.0,8912.5,2019-08-06T12:20:00+00:00,1565094000000.0,0.0,8915.0,5.0,9027.556954,...,297.213469,-50.614296,8920.346961,-0.71312,41.570723,-16.034857,8920.346961,-0.71312,86.10963,-13.87096
29,8877.5,8937.5,8875.0,8917.5,2019-08-06T12:25:00+00:00,1565094000000.0,0.0,8897.5,-40.0,9015.215126,...,251.483292,-45.730177,8911.766948,-8.580014,37.578621,-3.992102,8911.766948,-8.580014,87.04383,0.934199
30,8837.5,8920.0,8837.5,8877.5,2019-08-06T12:30:00+00:00,1565095000000.0,0.0,8857.5,-40.0,9000.716981,...,199.246103,-52.237189,8896.898833,-14.868114,39.747467,2.168845,8896.898833,-14.868114,87.777661,0.733831


*Drop datetime column not adequate for xgboost*

In [11]:
trade_data_features = trade_data_features.drop(['startTime'],axis=1)

### Model Training

In [12]:
import xgboost as xgb
from sklearn.model_selection import train_test_split

*Divide dataset into train and test data.*

In [13]:
X_train, X_test, y_train, y_test = train_test_split(trade_data_features, trade_data_target, test_size=0.33, random_state=42)

In [14]:
X_train_t,X_test_t,y_train_t,y_test_t = trade_data_features[:44000],trade_data_features[44000:],trade_data_target[:44000],trade_data_target[44000:]

*Initialize xgboost model*

In [15]:
model=xgb.XGBClassifier(random_state=1,learning_rate=0.01)
model_t = xgb.XGBClassifier(random_state=1,learning_rate=0.01, scale_pos_weight=99)

*Train xgboost model.*

In [16]:
model.fit(X_train,y_train)

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.01, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=1, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method=None, validate_parameters=False, verbosity=None)

In [17]:
X_train_t = X_train_t[['avg', 'close_slope', 'ema_26_slope', 'ema_9']]

model_t.fit(X_train_t,y_train_t)

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.01, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=1, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=99, subsample=1, tree_method=None,
              validate_parameters=False, verbosity=None)

In [18]:
model.score(X_test,y_test)

0.8558816837315131

In [20]:
model_t.score(X_test_t[['avg', 'close_slope', 'ema_26_slope', 'ema_9']],y_test_t)

0.8586417566849655

In [21]:
trade_data_target.value_counts()

 1    56865
-1     8249
 0     1474
Name: change, dtype: int64

In [22]:
new_y_train = y_train[y_train==1][:5573]
new_X_train = X_train[y_train==1][:5573]

In [23]:
new_y_train = pd.concat([y_train[y_train!=1],new_y_train])
new_X_train = pd.concat([X_train[y_train!=1],new_X_train])

In [24]:
model_new = xgb.XGBClassifier(random_state=1,learning_rate=0.01)

In [25]:
new_X_train = new_X_train[['avg', 'close_slope', 'ema_26_slope', 'ema_9']]

In [26]:
new_y_train.value_counts()

-1    5573
 1    5573
 0     981
Name: change, dtype: int64

In [27]:
model_new.fit(new_X_train,new_y_train)

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.01, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=1, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method=None, validate_parameters=False, verbosity=None)

In [28]:
model_new.score(X_test_t[['avg', 'close_slope', 'ema_26_slope', 'ema_9']],y_test_t)

0.5928811758455818

In [30]:
y = model_new.predict(X_test_t[['avg', 'close_slope', 'ema_26_slope', 'ema_9']])

In [31]:
y[y==0]

array([0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [32]:
model_new.feature_importances_

array([0.20943446, 0.30193612, 0.23852499, 0.25010446], dtype=float32)

In [33]:
X_test.tail()

Unnamed: 0,close,high,low,open,time,volume,avg,close_slope,ema_26,ema_26_slope,std_26,std_26_slope,ema_9,ema_9_slope,std_9,std_9_slope,ema_20,ema_20_slope,std_20,std_20_slope
11147,3515.0,3521.0,3515.0,3521.0,1568430000000.0,0.0,3518.0,-6.0,3526.306346,-0.904508,5.853204,0.402869,3524.298007,-2.324502,6.948221,1.483689,3524.298007,-2.324502,6.300376,0.479361
53280,2159.0,2168.5,2136.5,2136.5,1581075000000.0,10.5791,2147.75,22.5,2089.634178,5.549266,46.862187,0.891658,2127.255483,7.936129,18.556969,4.1261,2127.255483,7.936129,38.264179,0.182385
54007,2282.5,2286.5,2268.5,2278.5,1581293000000.0,0.0,2280.5,4.0,2248.82199,2.694241,24.615231,-1.598718,2271.006267,2.873433,9.663074,-0.528835,2271.006267,2.873433,15.867876,-0.479585
59527,1605.5,1613.0,1595.0,1598.0,1582949000000.0,0.0,1601.75,7.5,1567.598333,3.032133,18.121853,1.168194,1582.308372,5.797907,22.569953,2.473657,1582.308372,5.797907,17.882217,2.269385
29593,2034.5,2034.5,2025.0,2032.0,1573969000000.0,0.0,2033.25,2.5,2038.265502,-0.30124,11.384352,-0.666327,2032.391951,0.527012,4.981215,-0.130727,2032.391951,0.527012,7.877516,-1.25614


In [34]:
from random import randint

X_test_t = X_test_t[['avg', 'close_slope', 'ema_26_slope', 'ema_9']]

def calculate_roi(model, x_test, debug = True):
    df = x_test.copy()
    
    usd_balance = 100
    crypto_balance = 0
    n_actions = 0
    
    usd_balances = [usd_balance]
    
    df['prediction'] = model_new.predict(df)
    
    if debug:
        print(df)
    
    for index, row in df.iterrows():
        if row['prediction'] == 1 or row['prediction'] == 0:
            if usd_balance:
                crypto_balance = usd_balance / row['avg']
                usd_balance = 0
                
                if debug:
                    print("On index: ", index, "BUY")
                
                n_actions += 1
            else:
                pass
        elif row['prediction'] == -1:
            if usd_balance:
                pass
            else:
                usd_balance = crypto_balance * row['avg']
                
                if debug:
                    print("On index: ", index, "SELL")
                
                crypto_balance = 0
                
                usd_balances.append(usd_balance)
                n_actions += 1
                
    if usd_balance:
        pass
    else:
        usd_balance = crypto_balance * df.iloc[-1]['avg']
    
    if debug:
        print("Initial USD$ balance: 100")
        print("Final USD$ balance: ", usd_balance)

        print("Number of actions buy/sell taken: ", n_actions)

        print("Max USD$ Balance", max(usd_balances))
        
        print("Number of 0 predicted", len(df[df['prediction'] == 0]))
        
    
        
    return (usd_balance - 100)/100

calculate_roi(model_new, X_test_t, debug = False)

-0.9999970992101977

In [35]:
def get_avg_roi(model, x_test, iterations = 100):
    n = len(x_test)
    avg_roi = 0
    
    for i in range(iterations):
        initial_point = randint(0, n-288*7)
        x = calculate_roi(model, x_test[initial_point:initial_point + 288], debug = False)
        
        avg_roi += x / iterations
        
    return avg_roi
    
get_avg_roi(model_t, X_test_t)

-0.13056673902490673