In [1]:
# load dependencies
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

import random

In [2]:
def RFC(d, f, my_seed):
    """Fit a 50-tree random forest classifier and return it.

    Parameters
    ----------
    d : array-like
        Training feature matrix, passed straight to ``fit``.
    f : array-like
        Training targets, passed straight to ``fit``.
    my_seed : int
        Seed applied to the global NumPy and ``random`` generators and to
        the estimator's own ``random_state``.

    Returns
    -------
    RandomForestClassifier
        The fitted model.
    """
    # Kept for backward compatibility: code running after this call may rely
    # on the global generators having been reseeded.
    np.random.seed(my_seed)
    random.seed(my_seed)

    # Pin the estimator's RNG explicitly so the fit does not depend on the
    # state of the process-wide NumPy generator at the time of the call.
    model = RandomForestClassifier(n_estimators=50, random_state=my_seed)
    model.fit(d, f)
    return model

In [3]:
# Load the combined 2019 dataset and discard its 'sentiment' column.
df = pd.read_csv('Data/combined_2019.csv').drop(columns=['sentiment'])

# Positional column bounds consumed by later cells (half-open ranges).
begin, end = 67, 91              # columns sliced out as model features
begin_drop, end_drop = 91, 101   # columns removed once the 'ma*' columns exist

In [4]:
# Create 31 rolling-window columns ma1..ma31. Each one is the row-wise SUM
# (not the mean, despite the "ma" prefix) of 10 consecutive columns:
# ma1 covers column positions 8-17, ma2 covers 9-18, ..., ma31 covers 38-47.
for offset in range(8, 39):
    window = df.iloc[:, offset:offset + 10]
    df['ma' + str(offset - 7)] = window.sum(axis=1)

# Remove the columns sitting at positions begin_drop .. end_drop - 1.
df = df.drop(columns=df.columns[begin_drop:end_drop])

In [5]:
# Select training hour and test hour: keep only rows whose 'hour' is 6, 7 or 8.
# The explicit .copy() makes `data` an independent frame, so columns added to
# it later are never treated as writes through a slice of `df`.
data = df[df['hour'].isin([6, 7, 8])].copy()
del df  # release the full frame; only the filtered rows are used from here on

In [6]:
# Map a summed move (presumably in PIPs -- confirm against the data source)
# onto a three-way class label.
def setlabels(x):
    """Return 1 if ``x`` is above 5, -1 if it is below -5, otherwise 0.

    The bounds are exclusive: exactly 5 and exactly -5 both map to 0.
    """
    if x > 5:
        return 1
    if x < -5:
        return -1
    return 0

# Label every row from its 'ma31' value (-1 / 0 / 1, see setlabels).
data["labels"] = data["ma31"].map(setlabels)

In [7]:
# Walk-forward evaluation: fit on `train_int` consecutive rows, skip `gap`
# rows, predict the next `predict_int` rows, then slide the whole window
# forward by `predict_int` rows and repeat.
profit_tot = []
train_int = 1000    # rows in each training window
predict_int = 500   # rows in each prediction window; also the step between cohorts
gap = 100           # rows skipped between the training and prediction windows
count = int((data.shape[0] - train_int) / predict_int)

# Per-cohort frames are collected here and concatenated once after the loop.
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop re-copies all previously accumulated rows on every iteration.
predicted_frames = []

for j in range(count):

    start = predict_int * j

    # Get fitting data
    X_data = data.iloc[start:start + train_int]
    X = X_data.iloc[:, begin:end].to_numpy()
    y = X_data['labels']
    # Encode the labels present in this window for fitting; predictions are
    # mapped back to the original label values with inverse_transform below.
    le = LabelEncoder()
    y_encoded = le.fit_transform(y)

    rfc = RFC(X, y_encoded, 49)

    # Get predicting data
    pred_start = start + train_int + gap
    X_pred = data.iloc[pred_start:pred_start + predict_int].copy()
    X_predict = X_pred.iloc[:, begin:end].to_numpy()
    X_pred['class'] = le.inverse_transform(rfc.predict(X_predict))

    # Keep only rows with a non-zero predicted class. A boolean mask removes
    # exactly those rows even if index labels are not unique.
    X_pred = X_pred[X_pred['class'] != 0]
    predicted_frames.append(X_pred)

    # assign() returns a new frame, so the frame stored above (and therefore
    # df_predict) does not get the 'profit' column.
    X_pred = X_pred.assign(profit=X_pred['class'] * X_pred['ma31'])

    profit = X_pred['profit'].sum()
    print("Est Profit Cohort " + str(j) + ": " + str(profit))
    profit_tot.append(profit)

# pd.concat raises on an empty list, so fall back to an empty frame when the
# data is too short to form a single cohort.
df_predict = pd.concat(predicted_frames) if predicted_frames else pd.DataFrame()

print("Total Profit " + " : " + str(sum(profit_tot)))

Est Profit Cohort 0: 191.7
Est Profit Cohort 1: 16.200000000000003
Est Profit Cohort 2: -1.3999999999999986
Est Profit Cohort 3: 56.3
Est Profit Cohort 4: -51.5
Est Profit Cohort 5: 33.5
Est Profit Cohort 6: -3.0999999999999983
Est Profit Cohort 7: 8.5
Est Profit Cohort 8: -13.100000000000001
Est Profit Cohort 9: -1.2000000000000006
Est Profit Cohort 10: -79.8
Est Profit Cohort 11: -32.9
Est Profit Cohort 12: -66.19999999999999
Est Profit Cohort 13: 2.4000000000000004
Est Profit Cohort 14: 0.0
Est Profit Cohort 15: -1.4000000000000004
Est Profit Cohort 16: -18.800000000000008
Est Profit Cohort 17: 31.900000000000002
Est Profit Cohort 18: 30.1
Est Profit Cohort 19: 0.0
Est Profit Cohort 20: -40.8
Est Profit Cohort 21: -90.99999999999999
Est Profit Cohort 22: -17.599999999999994
Est Profit Cohort 23: 0.0
Est Profit Cohort 24: -26.100000000000005
Est Profit Cohort 25: -56.199999999999996
Est Profit Cohort 26: -0.6999999999999984
Est Profit Cohort 27: -9.799999999999999
Est Profit Cohort 2

In [8]:
# Write the collected non-zero-class predictions to disk (index included).
output_path = '2019_predict_5_rfc.csv'
df_predict.to_csv(output_path)