# Intraday Random Forests

## 1) Introduction

## 2.1) Random Forests Regression

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from scipy.stats import spearmanr, pearsonr
from sklearn.metrics import mean_squared_error
from sklearn import metrics

import os
os.chdir('ml4t')

In [2]:
dataset = pd.read_parquet('data/us_equity/taq_min/GOOG.parquet')
dataset = dataset.swaplevel(1,0)
dataset = dataset.astype({'OpenBidPrice': 'float64',
                          'OpenAskPrice': 'float64',
                          'FirstTradePrice': 'float64',
                          'HighBidPrice': 'float64',
                          'HighAskPrice': 'float64',
                          'HighTradePrice': 'float64',
                          'LowBidPrice': 'float64',
                          'LowAskPrice': 'float64',
                          'LowTradePrice': 'float64',
                          'CloseBidPrice': 'float64',
                          'CloseAskPrice': 'float64',
                          'LastTradePrice': 'float64',
                          'MinSpread': 'float64',
                          'MaxSpread': 'float64',
                          'VolumeWeightPrice': 'float64',
                          'FinraVolumeWeightPrice': 'float64',
                          'TradeToMidVolWeight': 'float64',
                          'TradeToMidVolWeightRelative': 'float64',
                          'TimeWeightBid': 'float64',
                          'TimeWeightAsk': 'float64'})
dataset=dataset.dropna()

TypeError: Can only swap levels on a hierarchical axis.

In [3]:
dataset.head()

Unnamed: 0,TradeDate,BarDateTime,Ticker,SecId,OpenBarTimeOffset,OpenBidPrice,OpenBidSize,OpenAskPrice,OpenAskSize,FirstTradeTimeOffset,...,FinraVolumeWeightPrice,UptickVolume,DowntickVolume,RepeatUptickVolume,RepeatDowntickVolume,UnknownTickVolume,TradeToMidVolWeight,TradeToMidVolWeightRelative,TimeWeightBid,TimeWeightAsk
0,2012-01-03,2012-01-03 04:00:00-05:00,GOOG,0,0,0.0,0,0.0,0,0.0,...,0.0,0,0,0,0,0,0.0,0.0,655.5,656.0
1,2012-01-03,2012-01-03 04:01:00-05:00,GOOG,0,0,655.5,900,656.0,100,0.0,...,0.0,0,0,0,0,0,0.0,0.0,655.64,656.0
2,2012-01-03,2012-01-03 04:02:00-05:00,GOOG,0,0,655.64,100,656.0,100,23.793,...,0.0,0,0,0,0,100,-441.5,-0.45846,655.5,665.23
3,2012-01-03,2012-01-03 04:03:00-05:00,GOOG,0,0,655.5,900,665.23,100,0.0,...,0.0,0,0,0,0,0,0.0,0.0,655.5,665.23
4,2012-01-03,2012-01-03 04:04:00-05:00,GOOG,0,0,655.5,900,665.23,100,0.0,...,0.0,0,0,0,0,0,0.0,0.0,655.5,665.23


In [5]:
dataset.set_index('BarDateTime',inplace=True)

In [8]:
X = dataset.drop(['LastTradePrice','TradeDate','Ticker'], axis=1)
y = dataset['LastTradePrice']

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42)
scaler = StandardScaler().fit(X_train)
X_train_scaled = pd.DataFrame(scaler.transform(X_train), index=X_train.index.values, columns=X_train.columns.values)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), index=X_test.index.values, columns=X_test.columns.values)

In [9]:
pca=PCA()
pca.fit(X_train)
cpts = pd.DataFrame(pca.transform(X_train))
x_axis = np.arange(1, pca.n_components_+1)
pca_scaled = PCA()
pca_scaled.fit(X_train_scaled)
cpts_scaled = pd.DataFrame(pca.transform(X_train_scaled))

In [None]:
rf = RandomForestRegressor(n_estimators=500, oob_score=True, random_state=0)
rf.fit(X_train, y_train)

In [None]:
predicted_train = rf.predict(X_train)
predicted_test = rf.predict(X_test)

test_score = r2_score(y_test, predicted_test)
spearman = spearmanr(y_test, predicted_test)
pearson = pearsonr(y_test, predicted_test)

print('Out-of-bag R-2 score estimate:', rf.oob_score_)
print('Test data R-2 score:', test_score)
print('Test data Spearman correlation:',spearman[0])
print('Test data Pearson correlation:',pearson[0])

In [None]:
y_pred = rf.predict(X_test)


In [None]:
from sklearn.metrics import mean_squared_error
from sklearn import metrics

print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)

In [None]:
print('Random Forests Regression Score:', rf.score(X_test, y_test))
