In [None]:
import io

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from google.colab import files
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

pd.options.mode.chained_assignment = None

In [None]:
# Upload the CSV through the Colab file picker and load it into `df`.
# files.upload() returns a dict mapping each uploaded file name to its bytes,
# so the data is read from that payload instead of assuming a fixed name on disk.
dataset = files.upload()

if 'records.csv' in dataset:
    # Expected case: the file was uploaded under its canonical name.
    df = pd.read_csv(io.BytesIO(dataset['records.csv']))
elif len(dataset) == 1:
    # A single file under another name (e.g. a renamed copy): use it as-is.
    (uploaded_bytes,) = dataset.values()
    df = pd.read_csv(io.BytesIO(uploaded_bytes))
else:
    # Nothing uploaded, or an ambiguous multi-file upload: keep the original
    # behaviour and read 'records.csv' from the working directory.
    df = pd.read_csv('records.csv')

In [None]:
# data manipulation.
# Expand the categorical columns into one 0/1 indicator column per category.
# Each mapping is {new column name: value of the source column that sets it to 1};
# any other value (including missing values) yields 0.
indicatorSpecs = {
    'Order Type': {
        'IsBuy': 'Buy',
        'IsSell': 'Sell',
    },
    'Day of Week': {
        'IsMonday': 1,
        'IsTuesday': 2,
        'IsWednesday': 3,
        'IsThursday': 4,
        'IsFriday': 5,
    },
    'News Impact': {
        'LowNewsImpact': 1,
        'MediumNewsImpact': 2,
        'HighNewsImpact': 3,
        'HolidayNewsImpact': 4,
    },
}

for sourceColumn, indicators in indicatorSpecs.items():
    for indicatorColumn, matchValue in indicators.items():
        # Vectorized comparison over the whole column; the int cast keeps the
        # indicator numeric so a later pd.get_dummies call leaves it untouched
        # (object-dtype columns would be one-hot encoded a second time).
        df[indicatorColumn] = (df[sourceColumn] == matchValue).astype(int)


In [None]:
# Column index of the full frame (indicator columns included); not used in this cell.
featureName = df.columns
# Raw categorical columns (already expanded into indicators) plus the target.
featuresToExclude = ['Order Type', 'Day of Week', 'News Impact', 'Outcome']

# Encode the target as integers: 'Win' -> 1, 'Lose' -> 0.
outcomeCodes = {'Win': 1, 'Lose': 0}
encodedOutcome = df['Outcome'].map(outcomeCodes)
if encodedOutcome.isna().any():
    # Fail with a readable message instead of an opaque integer-cast error.
    unexpected = sorted(df.loc[encodedOutcome.isna(), 'Outcome'].astype(str).unique())
    raise ValueError("Unexpected 'Outcome' values: " + ", ".join(unexpected))
# Kept as a single-column DataFrame named 'Outcome', as before.
labels = encodedOutcome.astype(int).to_frame()

# Everything that is not excluded is a feature; get_dummies one-hot encodes
# whatever object/string/category columns remain.
features = df.drop(featuresToExclude, axis=1)
features = pd.get_dummies(features)

In [None]:
class CustomForest:
    """A small random forest bundled with its own train/validation/test split.

    The constructor splits the data, ``Train`` fits the forest on the training
    part, and ``PrintResults`` reports accuracy and feature importances.
    """

    def __init__(self, x, y):
        """Split ``x``/``y`` and create an unfitted forest.

        Args:
            x: Feature table. Iterating over it must yield the feature names
                (as a pandas DataFrame does), because ``PrintResults`` pairs
                those names with the fitted importances.
            y: Labels aligned with ``x``. The training part must expose
                ``.values`` (``Train`` calls ``.values.ravel()`` on it).
        """
        self.trained = False
        self.forest = RandomForestClassifier(n_estimators=10, max_depth=10)

        # 20% is held out for testing; 25% of the remaining 80% (20% of the
        # total) becomes the validation set, leaving 60% for training.
        xTrain, xTest, yTrain, yTest = train_test_split(x, y, test_size=0.2, random_state=1)
        xTrain, xVal, yTrain, yVal = train_test_split(xTrain, yTrain, test_size=0.25, random_state=2)

        self.x = x
        self.y = y

        self.xTrain = xTrain
        self.yTrain = yTrain

        self.xTest = xTest
        self.yTest = yTest

        # The validation split is stored but not used by the methods below.
        self.xVal = xVal
        self.yVal = yVal

    def Train(self):
        """Fit the forest on the training split and mark the instance as trained."""
        # ravel() flattens the label values to the 1-D array that fit() expects.
        self.forest = self.forest.fit(self.xTrain, self.yTrain.values.ravel())
        self.trained = True

    def PrintResults(self):
        """Print train/test mean accuracy and the feature importances, largest first.

        Prints "Forest not trained yet" instead if ``Train`` has not been called.
        """
        if not self.trained:
            print("Forest not trained yet")
            return

        # score() returns mean accuracy, not an error rate.
        train_accuracy = np.round(self.forest.score(self.xTrain, self.yTrain), 2)
        test_accuracy = np.round(self.forest.score(self.xTest, self.yTest), 2)

        print("Training Set Mean Accuracy = " + str(train_accuracy))
        print("Test Set Mean Accuracy = " + str(test_accuracy))

        print("\n Feature Importance:")
        # Get numerical feature importances
        importances = list(self.forest.feature_importances_)
        # Pair every feature name with its importance, rounded for display
        feature_importances = [
            (feature, round(importance, 2))
            for feature, importance in zip(self.x, importances)
        ]
        # Sort the feature importances by most important first
        feature_importances.sort(key=lambda pair: pair[1], reverse=True)
        # Print out the feature and importances
        for feature, importance in feature_importances:
            print('Feature: {:20} Importance: {}'.format(feature, importance))
    


In [None]:
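# TODO: build the buy / sell / win / lose feature (x) and label (y) subsets here.
# The next cell expects buyWinX, buyWinY, buyLoseX, buyLoseY, sellWinX, sellWinY,
# sellLoseX and sellLoseY to exist; none of them is defined in the visible cells.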

In [None]:
# Build one forest per (order type, outcome) subset. The individual names are
# kept so that other cells can still refer to each forest directly.
buyWinForest = CustomForest(buyWinX, buyWinY)
buyLoseForest = CustomForest(buyLoseX, buyLoseY)

sellWinForest = CustomForest(sellWinX, sellWinY)
sellLoseForest = CustomForest(sellLoseX, sellLoseY)

# Display label -> forest, in the order they are trained and reported.
forestsByName = {
    'Buy / Win': buyWinForest,
    'Buy / Lose': buyLoseForest,
    'Sell / Win': sellWinForest,
    'Sell / Lose': sellLoseForest,
}

# Train every forest first, then report, matching the original ordering.
for forest in forestsByName.values():
    forest.Train()

for forestName, forest in forestsByName.items():
    # Label each report so the four outputs can be told apart.
    print("\n=== " + forestName + " ===")
    forest.PrintResults()