### Exemplo de treinamento simples

In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import pickle
from simple_robot import feature_eng
# Download the DOGE COIN data (a parquet file served from Google Drive)
DATA_URL = 'https://drive.google.com/u/0/uc?id=17c2r9qbnsxPVxaYukrp6vhTY-CQy8WZa&export=download'
df = pd.read_parquet(DATA_URL)

#### Calculando o target (y)

In [2]:
# Mean of `close` over the 10 rows that follow each row. Reversing the series
# turns the trailing rolling window into one covering the current row plus the
# next 9; after restoring the order, shift(-1) moves it one row ahead so the
# current close is excluded from its own forward average.
close_reversed = df['close'][::-1]
reversed_window_mean = close_reversed.rolling(10).mean()
df['forward_average'] = reversed_window_mean[::-1].shift(-1)

# The target is the percentage difference between 'forward_average' and the current 'close'
current_close = df['close']
df['target'] = 100 * (df['forward_average'] - current_close) / current_close

df.head(3)
# Alternative: use the difference between the next row's close and the current one
# as the target: df['diff']= -df['close'].diff(-1)

Unnamed: 0_level_0,symbol,datetime,close_time,open,high,low,close,volume,number_of_trades,forward_average,target
open_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2019-07-05 12:00:00,DOGEUSDT,2019-07-05 12:00:00,1562328059999,0.00449,0.0046,0.00376,0.0042,60726008.0,521,0.004201,0.018099
2019-07-05 12:01:00,DOGEUSDT,2019-07-05 12:01:00,1562328119999,0.0042,0.004387,0.0042,0.0043,84307704.0,561,0.004158,-3.309539
2019-07-05 12:02:00,DOGEUSDT,2019-07-05 12:02:00,1562328179999,0.0043,0.004475,0.0043,0.004475,48182744.0,291,0.004099,-8.400496


#### Calculando as features (x)

In [3]:
# All feature creation lives in simple_robot.py; this cell only calls the function.
# Keeping it there lets the same function be reused when the robot goes to production.
df = df.pipe(feature_eng)
df.head(5)

Unnamed: 0_level_0,const,close,volume,number_of_trades,target,lag_1,lag_2,ma_10,ma_30,ratio_ma,time
open_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2019-07-05 12:29:00,1.0,0.00381,12415741.0,59,1.444208,0.761075,-0.763702,0.003874,0.003997,0.969227,0
2019-07-05 12:30:00,1.0,0.00381,4488370.0,68,1.678743,-0.010499,0.750656,0.00386,0.003984,0.9688,1
2019-07-05 12:31:00,1.0,0.00386,6436490.0,74,0.439895,1.295341,1.284978,0.003856,0.003969,0.971372,2
2019-07-05 12:32:00,1.0,0.003847,6321878.0,61,0.88768,-0.335317,0.964367,0.003845,0.003949,0.973784,3
2019-07-05 12:33:00,1.0,0.00384,1257411.0,18,1.221779,-0.177071,-0.512982,0.003837,0.003935,0.975138,4


#### Separando em treino/teste
Separando por data. Isso é importante, pois precisamos entender se os modelos criados em um tempo passado continuam sendo úteis em um tempo futuro.

In [4]:
# Rows with an index up to and including this timestamp are used for fitting;
# strictly later rows are held out for evaluation.
test_treshold = '2021-06-01 00:00:00'

in_train = df.index <= test_treshold
in_test = df.index > test_treshold
train, test = df[in_train], df[in_test]

# Features are every column except 'target'; 'target' is the response.
X_train, y_train = train.drop(columns=['target']), train['target']
X_test, y_test = test.drop(columns=['target']), test['target']

# Simple linear model
ols = sm.OLS(endog=y_train, exog=X_train)
model = ols.fit()
model.summary()

0,1,2,3
Dep. Variable:,target,R-squared:,0.012
Model:,OLS,Adj. R-squared:,0.012
Method:,Least Squares,F-statistic:,1320.0
Date:,"Sun, 08 Aug 2021",Prob (F-statistic):,0.0
Time:,12:54:02,Log-Likelihood:,-740210.0
No. Observations:,1000328,AIC:,1480000.0
Df Residuals:,1000318,BIC:,1481000.0
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.6935,0.122,22.041,0.000,2.454,2.933
close,-12.4440,0.651,-19.120,0.000,-13.720,-11.168
volume,4.515e-09,8.09e-11,55.808,0.000,4.36e-09,4.67e-09
number_of_trades,3.849e-06,6.4e-07,6.009,0.000,2.59e-06,5.1e-06
lag_1,-0.0266,0.003,-10.584,0.000,-0.031,-0.022
lag_2,-0.0665,0.002,-35.035,0.000,-0.070,-0.063
ma_10,13.1863,0.997,13.225,0.000,11.232,15.141
ma_30,-0.8399,0.617,-1.361,0.174,-2.050,0.370
ratio_ma,-2.6946,0.122,-22.052,0.000,-2.934,-2.455

0,1,2,3
Omnibus:,483216.165,Durbin-Watson:,0.279
Prob(Omnibus):,0.0,Jarque-Bera (JB):,690137035.388
Skew:,0.746,Prob(JB):,0.0
Kurtosis:,131.669,Cond. No.,20600000000.0


#### Resultado do Modelo Linear

In [6]:
# Predict on the held-out rows and compute the mean squared error.
y_hat = model.predict(X_test)
squared_errors = (y_hat - y_test) ** 2
MSE = squared_errors.mean()
MSE

0.17070794256499783

In [7]:
# Mean absolute error of the predictions on the held-out rows.
absolute_errors = (y_hat - y_test).abs()
MAE = absolute_errors.mean()
MAE

0.26346826055111083

#### Referência
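É sempre recomendado ter valores de referência, para saber se o seu modelo é ou não melhor do que alternativas "naive".

Abaixo, um exemplo de resultado naive, assumindo que a previsão é sempre 0 (ou seja, que o preço não muda). Note que, neste exemplo, a referência naive (MSE ≈ 0.1699, MAE ≈ 0.2634) é ligeiramente melhor do que o modelo linear (MSE ≈ 0.1707, MAE ≈ 0.2635).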

In [8]:
# Naive baseline: with a constant prediction of 0 the error is y_test itself,
# so the MSE reduces to the mean of the squared targets.
squared_targets = y_test ** 2
MSE_assuming_all_zero = squared_targets.mean()
MSE_assuming_all_zero

0.16986135624179463

In [9]:
# Naive baseline: with a constant prediction of 0 the MAE is the mean absolute target.
absolute_targets = y_test.abs()
MAE_assuming_all_zero = absolute_targets.mean()
MAE_assuming_all_zero

0.2633675092910254

In [10]:
# Save the fitted model to a pickle file so it can be used in the following steps.
filename = 'model_dummy.pickle'
# The context manager flushes and closes the file even if pickling raises;
# a bare open() passed straight to pickle.dump leaves the handle open.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)