In [None]:
import io

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from google.colab import files
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split


In [None]:
# Switch pandas' chained-assignment check off (neither warn nor raise) for the
# rest of the session.
pd.set_option('mode.chained_assignment', None)

In [None]:
# Name of the CSV this notebook expects to receive.
DATA_FILENAME = 'NDX100 Daily Indicator Signals.csv'

# files.upload() returns a dict mapping each uploaded file's name to its bytes.
# When a file with the same name already exists on disk, Colab saves the new
# upload under a de-duplicated name (e.g. "... (2).csv"), so reading
# DATA_FILENAME from disk would silently load an older upload. Parse the bytes
# that were just received instead.
dataset = files.upload()
if dataset:
  # Prefer the expected file; otherwise take the first file that was uploaded.
  uploaded_bytes = dataset.get(DATA_FILENAME, next(iter(dataset.values())))
  df = pd.read_csv(io.BytesIO(uploaded_bytes))
else:
  # Nothing was uploaded (dialog cancelled): fall back to the copy on disk.
  df = pd.read_csv(DATA_FILENAME)

Saving NDX100 Daily Indicator Signals.csv to NDX100 Daily Indicator Signals (2).csv


In [None]:
# Number of previous days whose direction is exposed as a feature.
N_LAGS = 5

# Direction of each day's candle: 'Increase' when close > open, else 'Decrease'.
# Building the strings in one vectorized step avoids writing str values into a
# bool column through masks, which newer pandas flags as an incompatible-dtype
# assignment.
df = df.assign(Change=np.where(df['close'] > df['open'], 'Increase', 'Decrease'))

# Target: the direction of the next row. The last row has no next row, so its
# target is NaN and it is removed by dropna() (which also removes any row that
# has a NaN in any other column).
df = df.assign(TomorrowsChange=df['Change'].shift(-1))
df = df.dropna()

# Lag features: the direction of each of the previous N_LAGS rows.
for i in range(1, N_LAGS + 1):
  df[f'-{i} Days Change'] = df.Change.shift(i)

# Drop the first N_LAGS rows since they are NaN for the longest lag column.
df = df.iloc[N_LAGS:, :]

In [None]:
featureName = df.columns
# Columns that must not be fed to the model: the timestamp and the target.
featuresToExclude = ['time', 'TomorrowsChange']

# Encode the target as integers (Increase -> 1, Decrease -> 0) in a single
# explicit mapping. This builds a new one-column DataFrame rather than
# mutating a subset of df through .loc, so it does not depend on the
# chained-assignment warning being disabled.
labels = (
  df['TomorrowsChange']
  .map({'Increase': 1, 'Decrease': 0})
  .astype(int)
  .to_frame()
)

# One-hot encode the remaining columns; get_dummies expands the string-valued
# ones (e.g. 'Change' and the lag columns) into indicator columns.
features = pd.get_dummies(df.drop(featuresToExclude, axis=1))

In [None]:
# Chronological 60/20/20 split: train -> validation -> test.
# The target and lag columns are built with shift(), i.e. the rows are treated
# as time-ordered. A shuffled split would place training rows after test rows,
# letting information from the evaluation period influence training and
# inflating the reported accuracy, so shuffling is disabled here.
# NOTE(review): assumes the CSV rows are sorted by 'time' ascending - confirm.
xTrain, xTest, yTrain, yTest = train_test_split(features, labels, test_size=0.2, shuffle=False)
xTrain, xVal, yTrain, yVal = train_test_split(xTrain, yTrain, test_size=0.25, shuffle=False)

In [None]:
# A fixed random_state makes the bootstrap samples and feature sub-sampling
# repeatable, so scores and feature importances are the same on every run.
forest = RandomForestClassifier(n_estimators=10, max_depth=10, random_state=0)
# fit() expects a 1-D target; yTrain is a one-column DataFrame, so flatten it.
forest = forest.fit(xTrain, yTrain.values.ravel())

# Predicted class (1 = Increase, 0 = Decrease) for every test row.
prediction = forest.predict(xTest)

In [None]:
# forest.score() returns mean accuracy (fraction of correct predictions), so
# the values are named as accuracies.
train_accuracy = np.round(forest.score(xTrain, yTrain), 2)
val_accuracy = np.round(forest.score(xVal, yVal), 2)
test_accuracy = np.round(forest.score(xTest, yTest), 2)

# Older names kept as aliases in case other cells still read them; despite the
# names they hold accuracy, not error.
train_error = train_accuracy
test_error = test_accuracy

print(f"Training Set Mean Accuracy = {train_accuracy}")
print(f"Validation Set Mean Accuracy = {val_accuracy}")
print(f"Test Set Mean Accuracy = {test_accuracy}")

Training Set Mean Accuracy = 0.99
Test Set Mean Accuracy = 0.93


In [None]:
# Numerical importance of each feature column, in column order
importances = list(forest.feature_importances_)
# Rank on the unrounded importances so that features which tie after rounding
# are still ordered by their true importance (most important first)
ranked = sorted(zip(features.columns, importances), key=lambda pair: pair[1], reverse=True)
# List of (variable, importance rounded to 2 decimals) tuples, in ranked order
feature_importances = [(feature, round(importance, 2)) for feature, importance in ranked]
# Print with a plain loop: a list comprehension used only for print() would
# leave a list of None as the cell's value
for feature, importance in feature_importances:
  print('Variable: {:20} Importance: {}'.format(feature, importance))

Variable: Bullish Engulfing    Importance: 0.38
Variable: Bearish Engulfing    Importance: 0.17
Variable: Change_Increase      Importance: 0.08
Variable: Upper Fractal        Importance: 0.05
Variable: Change_Decrease      Importance: 0.05
Variable: close                Importance: 0.04
Variable: open                 Importance: 0.04
Variable: Lower Fractal        Importance: 0.02
Variable: 14/21 EMA Crossunder Importance: 0.01
Variable: Above 50 EMA         Importance: 0.01
Variable: Above 200 EMA        Importance: 0.01
Variable: Parabolic SAR Above  Importance: 0.01
Variable: Alligator Lips Above Teeth Importance: 0.01
Variable: MACD Crossover       Importance: 0.01
Variable: MACD Crossunder      Importance: 0.01
Variable: Stoch Main Overbought Importance: 0.01
Variable: Stoch Signal Overbought Importance: 0.01
Variable: Stoch Main Oversold  Importance: 0.01
Variable: Doji                 Importance: 0.01
Variable: -1 Days Change_Increase Importance: 0.01
Variable: -2 Days Change_De

In [None]:
# Compared with the raw indicator values and the traditional signals derived from them, the Bearish and Bullish Engulfing patterns appear to carry real importance.
# My conclusion so far: indicators aren't good at predicting whether the very next candle will increase or decrease, because they are a derivative of price rather than price itself.
# As a result, my current thinking is that indicators should be used to establish a bias or direction for price, and price action or candlestick patterns should be used to confirm it.
# My guess is that indicators will perform better when they are taken into account over time.