In [1]:
# load dependencies
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

import random
from datetime import datetime

In [2]:
def RFC(d, f, seed=None, n_estimators=50):
    """Fit a random-forest classifier on a feature matrix and its labels.

    Parameters
    ----------
    d : array-like
        Training features; passed unchanged to ``model.fit``.
    f : array-like
        Training labels; passed unchanged to ``model.fit``.
    seed : int or None, optional
        Forwarded to the classifier as ``random_state``. The default
        ``None`` keeps the previous, unseeded behaviour; pass an int to make
        a fit reproducible.
    n_estimators : int, optional
        Number of trees in the forest. Defaults to 50, the value that was
        previously hard-coded.

    Returns
    -------
    RandomForestClassifier
        The fitted model.
    """
    model = RandomForestClassifier(n_estimators=n_estimators, random_state=seed)
    model.fit(d, f)
    return model

In [4]:
# Load the combined 2019 dataset; the 'sentiment' column is discarded immediately.
df = pd.read_csv('../Data/combined_2019.csv').drop(columns='sentiment')

# Positional (0-based) column bounds used later in the notebook:
#   [begin_drop, end_drop) - columns removed once the 'ma*' columns have been added
#   [begin, end)           - columns sliced out as the model's feature matrix
begin_drop, end_drop = 91, 101
begin, end = 67, 91

In [5]:
# Build 31 rolling-window columns 'ma1' .. 'ma31'. Each one is the row-wise
# SUM (it is not divided by the window length) of ten adjacent columns:
# 'ma<n>' adds up the columns at 0-based positions n+7 .. n+16.
for offset in range(31):
    window = df.iloc[:, offset + 8:offset + 18]
    df[f"ma{offset + 1}"] = window.sum(axis=1)

# Remove the columns at positions begin_drop .. end_drop-1 now that the
# 'ma*' columns exist.
df = df.drop(columns=df.columns[begin_drop:end_drop])

In [6]:
# Select the training and test hours: keep only rows whose 'hour' is 6, 7 or 8,
# then release the full frame.
data = df.loc[df['hour'].isin([6, 7, 8])]
del df

In [7]:
# Three-way labelling of a numeric value (per the original note, the value is
# a number of PIPs over the selected range).
def setlabels(x):
    """Return 1 if ``x`` is above 5, -1 if it is below -5, and 0 otherwise.

    The bounds are exclusive, so exactly 5 and exactly -5 both map to 0.
    """
    if x > 5:
        return 1
    if x < -5:
        return -1
    return 0

# Derive each row's class label from its 'ma31' value.
data["labels"] = data["ma31"].map(setlabels)

In [10]:
# Walk-forward simulation, repeated n_sims times.
#
# For every cohort j a forest is trained on train_int consecutive rows and
# used to classify a later block of predict_int rows. Rows predicted as class
# 0 are ignored; for the rest, class * ma31 is summed as the cohort's
# estimated profit. Cohort profits are added up per simulation.

base_seed = 42     # simulation k seeds NumPy's global RNG with base_seed + k
n_sims = 10
train_int = 1000   # rows in each training window
predict_int = 500  # rows in each prediction window; also the step between cohorts
gap_int = 100      # rows skipped between the training and prediction windows
                   # NOTE(review): presumably keeps forward-looking label windows
                   # out of the test rows - confirm
count = int((data.shape[0] - train_int) / predict_int)

sims = []
for k in range(n_sims):

    # RFC() is called without a seed, so scikit-learn draws from NumPy's
    # global RNG. Seeding it here makes every simulation reproducible while
    # still giving each one a different random stream.
    np.random.seed(base_seed + k)

    profit_tot = []

    for j in range(count):

        start = predict_int * j
        train_stop = start + train_int
        test_start = train_stop + gap_int

        # Get fitting data
        X = data.iloc[start:train_stop, begin:end].to_numpy()
        y = data['labels'].iloc[start:train_stop]
        le = LabelEncoder()
        y_encoded = le.fit_transform(y)

        rfc = RFC(X, y_encoded)

        # Free the training arrays before building the prediction window.
        del X, y, y_encoded

        # Get predicting data (iloc silently truncates a window that runs
        # past the end of the frame, so the last cohort may be shorter).
        X_pred = data.iloc[test_start:test_start + predict_int].copy()
        X_predict = X_pred.iloc[:, begin:end].to_numpy()
        X_pred['class'] = le.inverse_transform(rfc.predict(X_predict))

        # Keep only rows with a non-zero predicted class; a boolean mask is
        # used so the filter does not depend on index labels being unique.
        trades = X_pred[X_pred['class'] != 0]
        profit = (trades['class'] * trades['ma31']).sum()
        #print("Est Profit Cohort " + str(j) + ": " + str(profit))
        profit_tot.append(profit)
        del X_pred, X_predict, trades

    sim_total = sum(profit_tot)
    now = datetime.now()
    print("Profit for Sim " + str(k) + " : " + str(sim_total) + " time " + now.strftime("%H:%M:%S"))
    sims.append(sim_total)

print("Total Sum " + " : " + str(sum(sims)))

Profit for Sim 0 : -333.6999999999999 time 08:40:44
Profit for Sim 1 : -428.7000000000001 time 08:40:55
Profit for Sim 2 : -512.8 time 08:41:07
Profit for Sim 3 : -368.0999999999999 time 08:41:18
Profit for Sim 4 : -384.1999999999998 time 08:41:29
Profit for Sim 5 : -561.6999999999998 time 08:41:39
Profit for Sim 6 : -305.00000000000006 time 08:41:50
Profit for Sim 7 : -253.6 time 08:42:02
Profit for Sim 8 : -479.0 time 08:42:13
Profit for Sim 9 : -427.2000000000001 time 08:42:24
Total Sum  : -4053.9999999999995


In [None]:
# Persist the per-simulation profit totals, one row per simulation.
df_sims = pd.DataFrame(data=sims)
df_sims.to_csv(path_or_buf='2019_predict_sims.csv')