In [1]:
from __future__ import absolute_import

import pandas as pd
import numpy as np
import dask.dataframe as dd
import dask.array as da

from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import BayesianRidge
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score


# df = dd.read_csv('C:/Users/BTK/Desktop/Eric/Git/financial-models/database/CCM$N_M1.csv', sep='\t', usecols=[0, 1, 4, 5])
df = dd.read_csv('D:/Dados historicos-NOVO/Bovespa_02012017_30062021/SEQL3_BOV_T_bid_ask.csv', sep='\t', usecols=[4, 5])

df = df.dropna().reset_index(drop=True)

def RSI(data, period, applied_price):
    dataframe = data

    delta = dataframe[applied_price].diff(1)
    delta = delta.fillna(0)

    dataframe['<DELTA>'] = delta

    dataframe['<UP>'] = delta.copy()
    dataframe['<DOWN>'] = delta.copy()

    delta = dataframe['<DELTA>'].values.compute()
    up = []
    down = []

    for _i in range(len(dataframe)):
        if delta[_i] < 0:
            up.append(0)
            down.append(delta[_i])
        elif delta[_i] > 0:
            up.append(delta[_i])
            down.append(0)
        else:
            up.append(0)
            down.append(0)
    down = pd.Series(down)

    chunks = dataframe.map_partitions(lambda x: len(x)).compute().to_numpy()

    up = da.from_array(up, chunks=tuple(chunks))
    down = da.from_array(down, chunks=tuple(chunks))

    dataframe['<UP>'] = up
    dataframe['<DOWN>'] = down

    avg_gain = dataframe['<UP>'].rolling(window=period).mean()
    avg_loss = abs(dataframe['<DOWN>'].rolling(window=period).mean())

    RS = avg_gain/avg_loss

    RSI = 100.0 - (100.0/(1.0 + RS))

    dataframe['<INDICATOR>'] = RSI

    return dataframe

In [2]:
df = RSI(df, 7, '<LAST>').fillna(0)
df

Unnamed: 0_level_0,<LAST>,<VOLUME>,<DELTA>,<UP>,<DOWN>,<INDICATOR>
npartitions=2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
,float64,float64,float64,float64,float64,float64
,...,...,...,...,...,...
,...,...,...,...,...,...


In [3]:
model = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42, criterion="mae")

In [4]:
df_model = df.copy().compute()

y = pd.DataFrame()

y['<INDICATOR>'] = df_model['<INDICATOR>']

df_model = df_model.drop(['<DELTA>', '<UP>', '<DOWN>', '<INDICATOR>'], axis=1)

train_size = int(len(df_model) * (1 - 0.2))

scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(df_model)
df_scaled = pd.DataFrame(scaler.transform(df_model)).rename(columns={0: '<LAST>', 1: '<VOLUME>'})

scaler_y = MinMaxScaler(feature_range=(0, 1))
scaler_y.fit(y)
y_scaled = pd.DataFrame(scaler_y.transform(y)).rename(columns={0: '<LAST>', 1: '<VOLUME>'})

X_train, X_test = df_scaled[['<LAST>', '<VOLUME>']][:train_size], df_scaled[['<LAST>', '<VOLUME>']][train_size:len(df_scaled)]
y_train, y_test = y_scaled[['<INDICATOR>']][:train_size], y_scaled[['<INDICATOR>']][train_size:len(y_scaled)]

In [5]:
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

from sklearn.metrics import mean_squared_error

print(f'Model score: {mean_squared_error(y_test, y_pred)} RMSE')

  model.fit(X_train, y_train)
