# Machine Learning Trading Bot

This notebook aims to train machine learning algorithms to produce profitable trading signals.

## Procedure

1. Pull Bitcoin daily OHLCV candlestick data from the Binance API
2. Add technical indicators using `finta` and set up the entry/exit signals according to daily returns
3. Split the data into training and testing datasets and scale the data
4. Train machine learning models using `xgboost` on various combinations of technical indicators
5. Make predictions using the testing data
6. Review the classification report associated with the model predictions
7. Plot the cumulative returns

In [None]:
# Imports
# in-built
import itertools
import os.path
import datetime as dt
import json
from pathlib import Path

import pandas as pd
from pandas.tseries.offsets import DateOffset
import hvplot.pandas
import numpy as np
import matplotlib.pyplot as plt
from finta import TA
from binance import Client

# machine learning imports
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

# ML models
import xgboost as xgb
from sklearn.linear_model import LogisticRegression



## Step 1: Download the OHLCV data into a Pandas DataFrame

In [None]:
# Instantiate Binance client
# No API key or secret is passed here.
# NOTE(review): presumably this restricts the client to endpoints that need
# no authentication, which is all this notebook uses — confirm against python-binance.
client = Client()


In [None]:
# Helper that fetches daily candlesticks from Binance and returns them as OHLCV
def get_historical_data(base, quote='USDT'):
    """Download daily OHLCV data for the ``base``/``quote`` pair from Binance.

    Uses the notebook-level ``client`` to request daily klines for the symbol
    ``base + quote`` with the start string "5 year ago UTC".

    Args:
        base: Base asset ticker, e.g. ``"BTC"``.
        quote: Quote asset ticker appended to ``base`` to form the symbol.

    Returns:
        A DataFrame indexed by ``date`` with float columns ``open``, ``high``,
        ``low``, ``close`` and ``volume``.
    """
    ohlcv_names = ['open', 'high', 'low', 'close', 'volume']
    raw_klines = client.get_historical_klines(
        base + quote,
        Client.KLINE_INTERVAL_1DAY,
        "5 year ago UTC",
    )
    # Each kline row starts with: open time, open, high, low, close, volume.
    # The remaining fields are not needed here.
    frame = pd.DataFrame(
        [row[:6] for row in raw_klines],
        columns=['timestamp'] + ohlcv_names,
    )
    frame[ohlcv_names] = frame[ohlcv_names].astype(float)
    # The open time is interpreted as milliseconds since the epoch
    frame['date'] = pd.to_datetime(frame['timestamp'], unit='ms')
    # Index by date and discard the raw timestamp
    return frame.set_index('date').drop(columns='timestamp')


In [None]:
# Get OHLCV dataset from Binance, using a per-day CSV cache when available
today = (dt.datetime.today()).strftime('%Y-%m-%d')
root_dir = os.path.join(Path().resolve(), '..')
crypto = "BTC"
# Build the cache path once so the read and the write can never disagree.
# NOTE(review): the file name does not include `crypto`, so a cache written
# for one asset would be reused for another on the same day.
cache_path = os.path.join(root_dir, f"ohlcv_df_{today}.csv")
try:
    # load cached ohlcv_df from csv file
    # (`infer_datetime_format` is deprecated in pandas 2.x and is omitted;
    # `parse_dates=True` alone parses the index)
    ohlcv_df = pd.read_csv(
        cache_path,
        index_col='date',
        parse_dates=True,
    )
except FileNotFoundError:
    # no cache for today: download ohlcv data from binance
    ohlcv_df = get_historical_data(crypto)
    # save the data to a csv so later runs today skip the download
    ohlcv_df.to_csv(cache_path)

# Review the DataFrame
ohlcv_df.head()

## Step 2: Generate trading signals using `finta`

In [None]:
def _signed_streak(signal):
    """Return the signed length of the current run of equal values in ``signal``.

    Each row gets the 1-based position inside its run of consecutive identical
    values, positive when the value is > 0 and negative otherwise. A NaN never
    equals its neighbour, so it forms a run of its own and gets a negative sign.
    """
    # A new run starts wherever the value differs from the previous row;
    # the cumulative sum of those flags labels each run.
    run_id = (signal != signal.shift()).cumsum()
    # Position within the run, starting from 1
    run_length = signal.groupby(run_id).cumcount() + 1
    return run_length * np.where(signal > 0, 1, -1)


def add_TA_and_signal(
    ohlcv_df,
    sentiment_path='../Sentiment-analysis/BTC_2022-03-10_df.csv',
):
    """Add technical indicators, sentiment and the buy/sell signal to OHLCV data.

    Args:
        ohlcv_df: DataFrame with at least a ``close`` column; it is also passed
            unchanged to the ``finta`` indicator functions.
        sentiment_path: CSV file with a ``date`` column plus ``BTC_headline``
            and ``BTC_desc`` text columns (which are dropped). Defaults to the
            path that was previously hard-coded.

    Returns:
        A DataFrame concatenating, column-wise, the input data, the indicators,
        ``returns``, the target column ``test`` (1 if the next row's return is
        positive, else -1; NaN on the last row), ``consecutive`` and the
        sentiment columns. Rows may contain NaN; the caller is expected to
        drop them.
    """
    target_col = 'test'
    # Add bollinger bands
    bbands_df = TA.BBANDS(ohlcv_df)
    # Custom entry/exit hint: -1 when the close is above the upper band,
    # +1 when it is below the lower band, 0 otherwise
    bbands_df['close_vs_BB'] = np.select(
        [
            bbands_df['BB_UPPER'] < ohlcv_df['close'],
            bbands_df['BB_LOWER'] > ohlcv_df['close'],
        ],
        [-1, 1],
        default=0,
    )
    # Add EMA (5 and 12 periods) side by side, plus their difference
    ema_df = pd.DataFrame(
        [
            TA.EMA(ohlcv_df, 5),
            TA.EMA(ohlcv_df, 12),
        ]
    ).T
    ema_df['EMA_DIFFERENCE'] = ema_df.iloc[:, 1] - ema_df.iloc[:, 0]
    # calculate returns
    returns_df = pd.DataFrame(ohlcv_df['close'].pct_change())
    returns_df.columns = ['returns']
    # set up entry/exit signals: the sign of the return, moved back one row so
    # each row carries the direction of the NEXT row's return
    returns_df[target_col] = np.where(returns_df['returns'] > 0, 1, -1)
    returns_df[target_col] = returns_df[target_col].shift(-1)
    # Custom indicator counting consecutive green/red days. The streak is
    # computed on the shifted target, so shift it forward again to line it up
    # with the row's own return.
    returns_df['consecutive'] = _signed_streak(returns_df[target_col]).shift()
    # Load sentiment analysis
    sentiment_df = pd.read_csv(
        sentiment_path,
        index_col='date',
        parse_dates=True,
    )
    # drop source text columns
    sentiment_df = sentiment_df.drop(columns=['BTC_headline', 'BTC_desc'])
    # fill missing days with value of 0.0
    sentiment_df = sentiment_df.asfreq('D').fillna(0.0)

    return pd.concat(
        [
            ohlcv_df,
            TA.SMA(ohlcv_df, 4),
            TA.SMA(ohlcv_df, 100),
            bbands_df,
            ema_df,
            returns_df,
            TA.RSI(ohlcv_df, 14),
            TA.DMI(ohlcv_df),
            TA.VWAP(ohlcv_df),
            TA.PIVOT_FIB(ohlcv_df),
            sentiment_df,
        ],
        axis='columns',
    )


In [None]:
# Build the indicator/signal table and discard every row that has a missing value
master_df = add_TA_and_signal(ohlcv_df).dropna(axis=0, how='any')
# Preview the first rows
master_df.head()

## Step 3: Split the data into training and testing datasets

In [None]:
# Split training and test data

# Segment the features from the target
target_col = 'test'
y = master_df[target_col]
X = master_df.drop(columns=target_col)

# number of weeks of test data
num_weeks_test_data = 12
# Last timestamp that still belongs to the training period
end_training = X.index.max() - DateOffset(weeks=num_weeks_test_data)
# Label-based `.loc` slices include BOTH endpoints, so slicing the two sets at
# `end_training` would place that row in training and in test. Complementary
# boolean masks guarantee the sets are disjoint.
in_training = X.index <= end_training
# Generate the X_train, X_test, y_train and y_test DataFrames
X_train = X.loc[in_training]
X_test = X.loc[~in_training]
y_train = y.loc[in_training]
y_test = y.loc[~in_training]


In [None]:
# Scale the features DataFrames

# Fit the scaler on the training features only, so the test set does not
# influence the scaling statistics
X_scaler = StandardScaler().fit(X_train)

# Apply the fitted scaler to both feature sets
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)


## Step 4: Train a machine learning model using `xgboost`, fit it to the training data, and make predictions based on the testing data

In [None]:
# create the Xgboost base model
model = xgb.XGBClassifier(
    n_estimators=10,
    max_depth=5,
    # binary classification objective (the regression objective
    # 'reg:logistic' was used here before)
    objective='binary:logistic',
    learning_rate=0.1,
    random_state=1,
)
# The target holds -1/1, but current XGBoost versions require class labels
# 0..n_classes-1. Encode as 0 (sell, -1) / 1 (buy, +1) for fitting.
y_train_encoded = np.where(y_train > 0, 1, 0)
# fit the model using the training data
model.fit(X_train_scaled, y_train_encoded)

# make predictions and decode them back to the -1/1 signal used downstream
predictions = np.where(model.predict(X_test_scaled) > 0, 1.0, -1.0)

# Review the model's predicted values
predictions[:10]

## Step 5: Review the classification report associated with the `xgboost` model predictions

In [None]:
# Score the predictions against the held-out targets; ask for a dict so the
# result can be serialised
report = classification_report(
    y_true=y_test,
    y_pred=predictions,
    output_dict=True,
)

# Pretty-print the report as indented JSON
report_as_json = json.dumps(report, indent=4)
print(report_as_json)


## Step 6: Create a predictions DataFrame that contains columns for "predictions" values, "daily returns", and "model returns"

In [None]:
# Calculate the model's returns
# Start from a copy of the test features and attach the true target, the
# model predictions and the actual returns in one step
predictions_df = X_test.assign(**{
    'y_test': y_test,
    'predictions': predictions,
    'daily returns': master_df['returns'],
})
# Strategy return: a row's actual return times the prediction made on the
# previous row (each prediction targets the following row's return)
predictions_df['model returns'] = (
    predictions_df['predictions'].shift() * predictions_df['daily returns']
)

# Review the first and last rows of the DataFrame
for preview in (predictions_df.head(), predictions_df.tail()):
    display(preview)

## Step 7: Create a cumulative return plot that shows the actual returns vs. the strategy returns. Save a PNG image of this plot. This will serve as a baseline against which to compare the effects of tuning the trading algorithm.

In [None]:
# Compound the daily returns and the model returns into cumulative growth factors
plot_title = 'Daily BTC Returns VS XGBoost Model Returns'
return_columns = ['daily returns', 'model returns']
cumulative_returns = predictions_df[return_columns].add(1).cumprod()
# Draw the static plot and keep a handle on its figure
axes = cumulative_returns.plot(title=plot_title)
fig = axes.get_figure()
# save the plot as an image
fig.savefig('xgb_model_returns.png', bbox_inches='tight')
# Display as a hvplot
cumulative_returns.hvplot(title=plot_title)

