In [1]:
from google.colab import files

import json
import numpy as np
import pandas as pd

from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

import matplotlib.pyplot as plt

pd.options.mode.chained_assignment = None

In [2]:
!pip install treeinterpreter

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
from treeinterpreter import treeinterpreter as ti

In [4]:
# Upload the workbook through the Colab file picker and read it in.
# files.upload() returns a dict keyed by the name each uploaded file was saved
# under. That name is not guaranteed to match the expected one (a renamed file,
# or a re-upload that was saved under a de-duplicated name), so prefer the
# expected name when it is present and otherwise read what was actually uploaded.
defaultWorkbookName = 'FeatureEngineeringDataTemplate.xlsx'
dataset = files.upload()

if not dataset or defaultWorkbookName in dataset:
    # Nothing uploaded (file assumed to be on disk already) or the expected
    # file was uploaded: same behaviour as reading the fixed name.
    workbookName = defaultWorkbookName
else:
    workbookName = next(iter(dataset))

df = pd.read_excel(workbookName)

In [5]:
# data manipulation.
# One-hot encode 'Day of Week' and 'News Impact': each new column holds 1 on
# the rows where the source column equals the listed code and 0 elsewhere.
# The comparisons are vectorised, so the flags are created directly as integer
# columns instead of being filled cell by cell into string-initialised columns.
dayOfWeekFlags = {
    'IsMonday': 1,
    'IsTuesday': 2,
    'IsWednesday': 3,
    'IsThursday': 4,
    'IsFriday': 5,
}

newsImpactFlags = {
    'LowNewsImpact': 1,
    'MediumNewsImpact': 2,
    'HighNewsImpact': 3,
    'HolidayNewsImpact': 4,
}

# Column creation order is kept (day flags first, then news-impact flags)
# because later cells use the column order as the feature order.
for flagColumn, code in dayOfWeekFlags.items():
    df[flagColumn] = (df['Day of Week'] == code).astype(int)

for flagColumn, code in newsImpactFlags.items():
    df[flagColumn] = (df['News Impact'] == code).astype(int)


In [6]:
# Columns that are not model features: the raw categorical columns (already
# one-hot encoded above), the order type used for splitting, and the label.
columnsToExclude = ['Order Type', 'Day of Week', 'News Impact', 'Outcome']


def _splitFeaturesAndLabels(frame, orderType):
    """Return (features, labels) for the rows of ``frame`` with the given 'Order Type'.

    ``labels`` is the single-column ['Outcome'] frame and ``features`` is every
    column not listed in ``columnsToExclude``; both are cast to int.
    """
    subset = frame.loc[frame['Order Type'] == orderType]
    labels = subset[['Outcome']]
    features = subset.drop(columnsToExclude, axis=1)
    # NOTE(review): astype(int) drops the fractional part of any float-valued
    # feature column — confirm that is intended.
    features = features.astype(int)
    labels = labels.astype(int)
    return features, labels


# Work on a copy so the loaded frame keeps its original 'Win'/'Lose' labels.
dfCopy = df.copy()
for outcomeLabel, outcomeCode in (('Win', 1), ('Lose', 0)):
    dfCopy.loc[dfCopy['Outcome'] == outcomeLabel, 'Outcome'] = outcomeCode

buyX, buyY = _splitFeaturesAndLabels(dfCopy, 'Buy')
sellX, sellY = _splitFeaturesAndLabels(dfCopy, 'Sell')

In [7]:
class CustomForest:
    """A RandomForestClassifier bundled with its own train/validation/test split.

    The constructor splits ``x``/``y`` with fixed split seeds into 60% train,
    20% validation and 20% test (20% is held out first, then 25% of the
    remaining 80%). ``Train`` fits the forest on the training part and
    ``PrintResults`` reports accuracy, feature importances and the
    treeinterpreter breakdown for the first row of ``x``.
    """

    def __init__(self, x, y, random_state=None):
        """Create the (unfitted) forest and split the data.

        Args:
            x: DataFrame of features; its ``columns`` are used as feature names.
            y: Labels aligned with ``x`` (anything ``np.ravel`` can flatten to
                one label per row, e.g. a single-column DataFrame).
            random_state: Optional seed forwarded to RandomForestClassifier so
                a fit can be reproduced. The default ``None`` keeps the forest
                unseeded. The data split itself always uses fixed seeds.
        """
        self.trained = False
        self.forest = RandomForestClassifier(
            n_estimators=40,
            max_features=0.5,
            min_samples_leaf=3,
            random_state=random_state,
        )

        # Hold out 20% for testing, then 25% of the remaining 80% (20% overall)
        # for validation.
        xTrain, xTest, yTrain, yTest = train_test_split(x, y, test_size=0.2, random_state=1)
        xTrain, xVal, yTrain, yVal = train_test_split(xTrain, yTrain, test_size=0.25, random_state=2)

        self.x = x
        self.y = y

        self.xTrain = xTrain
        self.yTrain = yTrain

        self.xTest = xTest
        self.yTest = yTest

        # The validation split is stored but not used by any method of this class.
        self.xVal = xVal
        self.yVal = yVal

    def Train(self):
        """Fit the forest on the training split and mark it as trained."""
        # Flatten the labels to 1-D, which is the shape fit() expects.
        self.forest = self.forest.fit(self.xTrain, np.ravel(self.yTrain))
        self.trained = True

    def PrintResults(self):
        """Print accuracy, feature importances and one per-row explanation.

        Prints "Forest not trained yet" and returns if ``Train`` has not run.
        """
        if not self.trained:
            print("Forest not trained yet")
            return

        # score() returns mean accuracy (not an error rate); labels are
        # flattened the same way as in Train().
        train_accuracy = np.round(self.forest.score(self.xTrain, np.ravel(self.yTrain)), 2)
        test_accuracy = np.round(self.forest.score(self.xTest, np.ravel(self.yTest)), 2)

        print("Training Set Mean Accuracy = " + str(train_accuracy))
        print("Test Set Mean Accuracy = " + str(test_accuracy))

        print("\n")
        print("Overall Feature Importance:")
        # Pair every feature name with its rounded importance, most important first.
        feature_importances = [
            (feature, round(importance, 2))
            for feature, importance in zip(self.x.columns, self.forest.feature_importances_)
        ]
        feature_importances.sort(key=lambda pair: pair[1], reverse=True)
        for pair in feature_importances:
            print('Feature: {:20} Importance: {}'.format(*pair))

        print("\n")
        print("Feature Importance for Class:")
        # Explain the prediction for the first row of the full data set.
        prediction, bias, contributions = ti.predict(self.forest, self.x.iloc[0:1])
        print(f"Class Prediction: {prediction}")
        print(f"Bias (trainset prior): {bias}")

        for c, feature in zip(contributions[0], self.x.columns):
            print(f"Feature: {feature}, Contribution: {c}")


In [10]:
buyForest = CustomForest(buyX, buyY)
sellForest = CustomForest(sellX, sellY)

# RandomForestClassifier.fit() rebuilds the whole ensemble on every call
# (CustomForest does not enable warm_start), so calling Train() repeatedly in
# an "epochs" loop only redid the same work and kept the last fit.
# Train each forest exactly once.
buyForest.Train()
sellForest.Train()

In [11]:
# Report the buy-side model first, then the sell-side model.
for trainedForest in (buyForest, sellForest):
    trainedForest.PrintResults()

Training Set Mean Accuracy = 0.92
Test Set Mean Accuracy = 0.69


Overall Feature Importance:
Feature: 5 Period On Balance Volumn Average Change Importance: 0.18
Feature: 10 Period On Balance Volumn Average Change Importance: 0.17
Feature: 20 Period On Balance Volumn Average Change Importance: 0.17
Feature: 40 Period On Balance Volumn Average Change Importance: 0.17
Feature: Previous Consecutive Bullish Heikin Ashi Candles Importance: 0.09
Feature: Current Structure Is Bullish Importance: 0.03
Feature: Entry During RSI Above 70 Importance: 0.02
Feature: IsMonday             Importance: 0.02
Feature: During News          Importance: 0.01
Feature: Previous Candle Was Bullish Importance: 0.01
Feature: Previous Candle Was Bullish Engulfing Importance: 0.01
Feature: Previous Candle Was Hammer Pattern Importance: 0.01
Feature: Previuos Candle Was Shooting Star Pattern Importance: 0.01
Feature: Entry Above 200 EMA  Importance: 0.01
Feature: Previous Consecutive Bearish Heikin Ashi Candles Imp



Class Prediction: [[0.88272243 0.11727757]]
Bias (trainset prior): [[0.72593493 0.27406507]]
Feature: During News, Contribution: [ 0.00541372 -0.00541372]
Feature: Previous Candle Was Bullish, Contribution: [ 0.00239931 -0.00239931]
Feature: Previous Candle Was Bullish Engulfing, Contribution: [0. 0.]
Feature: Previous Candle Was Bearish Engulfing, Contribution: [-0.00144727  0.00144727]
Feature: Previous Candle Was Hammer Pattern, Contribution: [ 0.00353015 -0.00353015]
Feature: Previuos Candle Was Shooting Star Pattern, Contribution: [-0.00810982  0.00810982]
Feature: Entry Above 5 EMA, Contribution: [0. 0.]
Feature: Entry Above 50 EMA, Contribution: [-0.00037682  0.00037682]
Feature: Entry Above 200 EMA, Contribution: [-0.00621775  0.00621775]
Feature: 5 Period On Balance Volumn Average Change, Contribution: [ 0.00313198 -0.00313198]
Feature: 10 Period On Balance Volumn Average Change, Contribution: [ 0.02130926 -0.02130926]
Feature: 20 Period On Balance Volumn Average Change, Contr