### Random forest for stock prediction


Paper: https://arxiv.org/pdf/1605.00003v1.pdf

In [19]:
import src.get_data as get_data
import src.load_data as load
import src.indexes as idx

import pandas as pd
import seaborn as sns
from pylab import rcParams
import numpy as np
import matplotlib.pyplot as plt
import sqlalchemy

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Global seaborn theme: white-grid background, muted palette, enlarged fonts.
sns.set(style='whitegrid', palette='muted', font_scale=1.5)

# Default matplotlib figure size as (width, height).
rcParams['figure.figsize'] = 14, 8

# Load the autoreload extension and enable mode 2, so that edits to imported
# modules (e.g. the local src.* helpers) are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [31]:
# Fetch the raw frame and keep only the price/volume columns plus the date.
data = get_data.get_data_frame()[
    ['open', 'close', 'low', 'high', 'volume', 'date']
]




### Technical Analysis

In [36]:
# Compute additional technical-indicator features.
# NOTE(review): the second argument (14) is presumably each indicator's
# look-back window, and each helper presumably returns the frame with a new
# column added -- confirm against src.indexes.
data = idx.RSI(data, 14)
data = idx.PROC(data, 14)
data = idx.SO(data, 14)
data = idx.Williams_R(data, 14)

# Exponentially weighted moving average of the close price.
# `pd.ewma(...)` is deprecated (and removed in later pandas releases);
# `Series.ewm(...).mean()` is the replacement and its defaults
# (min_periods=0, adjust=True, ignore_na=False) reproduce the old result.
data["EWMA"] = data["close"].ewm(com=.5).mean()

data = idx.detrend_data(data)

	Series.ewm(com=0.5,min_periods=0,adjust=True,ignore_na=False).mean()
  import sys


In [37]:
# Add one target column per prediction horizon, in increasing order.
for horizon in (1, 3, 5, 10, 14, 30):
    data = idx.calculate_targets(data, horizon)

# Discard every row that still contains a NaN after feature/target creation.
data = data.dropna()

In [45]:
from sklearn.preprocessing import MinMaxScaler
import plotly as py
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot

# Enable plotly's offline notebook rendering (plotly.js loaded from the CDN).
init_notebook_mode(connected=True)

# Interactive plotly traces of the indicators and the close price over time.
# The former matplotlib version of this plot was kept as a string literal
# delimited by four quotes (`""""`), which is a syntax error on its closing
# line, so that dead code has been removed.
WR = go.Scatter(x=data.date, y=data["WR"], name="WR")
SO = go.Scatter(x=data.date, y=data["SO"], name="So")
RSI = go.Scatter(x=data.date, y=data["RSI"], name="RSI")
PROC = go.Scatter(x=data.date, y=data["PROC"], name="PROC")
close = go.Scatter(x=data.date, y=data["close"], name="close")

# iplot renders the figure itself; no matplotlib figure is created in this
# cell, so the trailing plt.show() was a no-op and has been dropped.
iplot([WR, SO, RSI, PROC, close])

# Implement Random Forest Strategy

In [49]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier as make_forest
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import accuracy_score as acc
import numpy as np
import tqdm

In [81]:
# Random-forest hyper-parameters, read as globals by train_on_df.
criterion = "gini"
num_features = 6
n_estimators = 65
prediction_window = 1
oob_score = True

# Feature columns fed to the classifier.
train_labels = ["Close_detrend", "volume", "EWMA", "SO", "WR", "RSI"]

# Work on a NaN-free copy restricted to the price, indicator and target columns.
selected_data = data.dropna(axis=0, how='any').copy()
selected_data = selected_data[
    ['close', 'volume', 'RSI', 'PROC', 'SO', 'WR', 'EWMA', 'Close_detrend']
    + ['Target({})'.format(h) for h in (1, 3, 5, 10, 14, 30)]
]

In [102]:
def split_x_y(df, train_labels, PREDICTION_WINDOW):
    """Extract the feature matrix and the target vector from a data frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns and a ``Target(<window>)`` column.
    train_labels : list of str
        Names of the columns used as features.
    PREDICTION_WINDOW : int
        Horizon selecting the target column ``'Target({})'.format(window)``.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``x`` with one row per frame row and one column per feature, and
        ``y`` with the matching target values.

    Raises
    ------
    KeyError
        If a feature column or the target column is missing from ``df``.
    """
    # DataFrame.as_matrix() was deprecated and later removed from pandas;
    # `.values` returns the same ndarray and works on old and new releases.
    x = df[train_labels].values
    y = df['Target({})'.format(PREDICTION_WINDOW)].values

    return x, y
    
def train_on_df(x, y, train_frac, random_state=None, shuffle=True):
    """Fit a random forest on a train split and score it on the held-out rows.

    The forest hyper-parameters (``n_estimators``, ``num_features``,
    ``oob_score``, ``criterion``) are read from module-level globals.

    Parameters
    ----------
    x : numpy.ndarray
        Feature matrix, one row per sample.
    y : numpy.ndarray
        Target vector aligned with ``x``.
    train_frac : float
        Fraction of rows used for training. With ``shuffle=True`` each row is
        assigned to the train set independently with this probability, so the
        realised fraction is only approximately ``train_frac``.
    random_state : int or None, optional
        Seed for both the split and the forest. ``None`` (default) keeps the
        original behaviour: the split draws from NumPy's global random state
        and the forest is unseeded.
    shuffle : bool, optional
        ``True`` (default) keeps the original random row-wise split.
        ``False`` trains on the first ``int(len(x) * train_frac)`` rows and
        tests on the remainder.

    Returns
    -------
    (RandomForestClassifier, float)
        The fitted forest and its accuracy on the held-out rows.

    Notes
    -----
    NOTE(review): the rows presumably form a time-ordered series with
    forward-looking targets. If so, a random row-wise split puts neighbouring
    (highly correlated) rows on both sides of the split and can inflate the
    test accuracy; ``shuffle=False`` gives a chronological hold-out instead.
    Confirm against how the targets are computed.
    """
    n_rows = len(x)
    if shuffle:
        # Use a private generator only when a seed is given, so the unseeded
        # path still consumes the global NumPy stream exactly as before.
        if random_state is None:
            msk = np.random.rand(n_rows) < train_frac
        else:
            msk = np.random.RandomState(random_state).rand(n_rows) < train_frac
    else:
        msk = np.arange(n_rows) < int(n_rows * train_frac)

    train_x, train_y = x[msk], y[msk]
    test_x, test_y = x[~msk], y[~msk]

    random_Forest = make_forest(n_estimators=n_estimators, max_features=num_features,
                                bootstrap=True, oob_score=oob_score, verbose=0,
                                criterion=criterion, n_jobs=-1,
                                random_state=random_state)
    random_Forest.fit(train_x, train_y)

    # score() predicts internally; the former separate predict() call produced
    # an unused result and has been removed.
    test_accuracy = random_Forest.score(test_x, test_y)
    return random_Forest, test_accuracy

#### Prediction

In [104]:
import tqdm

# Feature matrix / target vector for the 1-day and 30-day horizons.
x1, y1 = split_x_y(selected_data, train_labels, 1)
x30, y30 = split_x_y(selected_data, train_labels, 30)

# One forest per horizon, each evaluated on its own held-out rows.
forest1, accurency1 = train_on_df(x1, y1, 0.8)
forest30, accurency30 = train_on_df(x30, y30, 0.8)

# Collect prices, true targets and model predictions side by side.
df_stock = selected_data[["close", "Close_detrend", "Target(1)", "Target(30)"]].copy()

# x1 and x30 are the full feature matrix of selected_data[train_labels], so
# they are reused here instead of calling the removed DataFrame.as_matrix().
# These predictions cover every row, including rows the forests were trained
# on, so they are not an out-of-sample evaluation.
df_stock["Prediction(1)"] = forest1.predict(x1)
df_stock["Prediction(30)"] = forest30.predict(x30)


### Train forest

In [107]:
# Build the feature/target arrays for both horizons.
x_1_day, y_1_day = split_x_y(selected_data, train_labels, 1)
x_30_day, y_30_day = split_x_y(selected_data, train_labels, 30)

# Train and score the 1-day forest first, then the 30-day forest.
complete_forest1, complete_acc1 = train_on_df(x_1_day, y_1_day, 0.8)
complete_forest30, complete_acc30 = train_on_df(x_30_day, y_30_day, 0.8)

# Report the held-out accuracy as a percentage with two decimals.
print('Accuracy model in one day: ', round(complete_acc1 * 100, 2))
print('Accuracy model in 30 days: ', round(complete_acc30 * 100, 2))

Accuracy model in one day:  54.74
Accuracy model in 30 days:  91.39
