In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.preprocessing import MinMaxScaler
from sklearn import datasets, linear_model
from sklearn.metrics import r2_score
from datetime import datetime, timedelta

import getDados
from enums import *
import etl
import graficos

In [28]:
## Processo de ETL
papel = papeis.BBAS3
intervalo = intervalos.um_dia.value
data_inicio = str(datetime.now().year - 5) + '-01-01'
data_fim = (datetime.now() + timedelta(days=-1)).strftime('%Y-%m-%d')
df = getDados.get_fromYahoo_por_datas(papel=papel, data_inicio=data_inicio, data_fim=data_fim, intervalo=intervalo)
df = etl.padroniza_df_yahoo(df_yahoo=df, papel=papel)
df = etl.add_media_movel_simples(df=df, periodo=9, campo='Fechamento')
df = etl.add_media_movel_exponencial(df=df, periodo=50, campo='Fechamento')
df = etl.desvioPadrao(df=df, periodo=20, campo='Fechamento')
df = etl.bandasBollinger(df=df, campo='MMS11', seCampoNaoExiste=True)
df = etl.ifr(df=df, campo='Fechamento', periodo=14)
df['Max20'] = df['Fechamento'].rolling(window=20).max()
df['Min20'] = df['Fechamento'].rolling(window=20).min()

In [29]:
df['FechamentoSeguinte'] = df.Fechamento.shift(-1)
df_original = df.copy()
df.reset_index(inplace=True)
df.dropna(inplace=True)
df_original.tail()

Unnamed: 0_level_0,Papel,Abertura,Alta,Baixa,Fechamento,Volume,MMS9,MME50,DesvioPadrao,BandaSuperior,BandaInferior,IFR,Max20,Min20,FechamentoSeguinte
Datas,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2022-11-08,BBAS3,37.779999,38.299999,37.32,38.07,14742300,38.261111,39.327449,1.874794,42.300496,34.801322,44.776356,44.68,37.02,37.060001
2022-11-09,BBAS3,37.639999,38.560001,36.919998,37.060001,18987500,38.144444,39.23853,1.950762,42.166979,34.363929,41.40061,44.68,37.02,36.330002
2022-11-10,BBAS3,37.0,37.68,35.669998,36.330002,64558100,37.893333,39.12447,2.068527,42.113418,33.839309,39.105792,44.68,36.330002,35.52
2022-11-11,BBAS3,36.310001,36.650002,35.360001,35.52,54156800,37.526667,38.983118,2.231205,42.20332,33.278498,36.676518,44.68,35.52,36.400002
2022-11-14,BBAS3,36.0,36.799999,35.650002,36.400002,23033900,37.457778,38.88182,2.309646,42.16111,32.922527,40.96706,44.68,35.52,


In [26]:
df.tail()

Unnamed: 0,Datas,Papel,Abertura,Alta,Baixa,Fechamento,Volume,MMS9,MME50,DesvioPadrao,BandaSuperior,BandaInferior,IFR,Max20,Min20,FechamentoSeguinte
1455,2022-11-07,BBAS3,38.91,39.189999,37.619999,37.889999,23671200,38.421111,39.378774,1.855995,42.863808,35.439829,44.020995,44.68,37.02,38.07
1456,2022-11-08,BBAS3,37.779999,38.299999,37.32,38.07,14742300,38.261111,39.327449,1.874794,42.300496,34.801322,44.776356,44.68,37.02,37.060001
1457,2022-11-09,BBAS3,37.639999,38.560001,36.919998,37.060001,18987500,38.144444,39.23853,1.950762,42.166979,34.363929,41.40061,44.68,37.02,36.330002
1458,2022-11-10,BBAS3,37.0,37.68,35.669998,36.330002,64558100,37.893333,39.12447,2.068527,42.113418,33.839309,39.105792,44.68,36.330002,35.52
1459,2022-11-11,BBAS3,36.310001,36.650002,35.360001,35.52,54156800,37.526667,38.983118,2.231205,42.20332,33.278498,36.676518,44.68,35.52,36.400002


In [22]:
## -- -- -- -- -- -- --
## distribuição da base
qtd_linhas = len(df)
qtd_linhas_treino = round(.70 * qtd_linhas)
qtd_linhas_teste = qtd_linhas - qtd_linhas_treino
treino_de = 0
treino_ate = qtd_linhas_treino
teste_de = qtd_linhas_treino
teste_ate = qtd_linhas_treino + qtd_linhas_teste

print({
    'Qtd': qtd_linhas,
    'Qtd Treino': f'{qtd_linhas_treino} de {treino_de} até {treino_ate}',
    'Qtd Teste': f'{qtd_linhas_teste} de {teste_de} até {teste_ate}'
})

{'Qtd': 1411, 'Qtd Treino': '988 de 0 até 988', 'Qtd Teste': '423 de 988 até 1411'}


In [17]:
## -- -- -- -- -- -- --
## separação de features e labels
features = df[['Abertura','Alta','Baixa','Volume','MME50','BandaInferior','Min20','Fechamento']].copy()
labels = df['FechamentoSeguinte'].copy()
X_train = features[:qtd_linhas_treino].copy()
X_test = features[teste_de:teste_ate].copy()
y_train = labels[:qtd_linhas_treino].copy()
y_test = labels[teste_de:teste_ate].copy()


## -- -- -- -- -- -- --
## testa os melhores campos
features_list = ('Abertura','Alta','Baixa','Volume','MMS9','MME50','DesvioPadrao','BandaSuperior','BandaInferior','IFR','Max20','Min20','Fechamento')
k_best_features = SelectKBest(k='all')
k_best_features.fit_transform(features, labels)
k_best_features_scores = k_best_features.scores_
raw_pairs = zip(features_list[1:], k_best_features_scores)
ordered_pairs = list(reversed(sorted(raw_pairs, key=lambda x: x[1])))

k_best_features_final = dict(ordered_pairs[:15])
best_features = k_best_features_final.keys()
df_k_best = pd.DataFrame.from_dict(k_best_features_final, orient='index')
df_k_best.rename(columns={0: 'Score'}, inplace=True)
df_k_best.Score = df_k_best.Score.map('{:.2f}'.format)
df_k_best

Unnamed: 0,Score
BandaInferior,207.27
Baixa,175.85
MME50,167.41
Volume,147.64
Alta,106.64
BandaSuperior,48.56
DesvioPadrao,35.2
MMS9,1.93


In [20]:
## -- -- -- -- -- -- --
## Normalizando os dados de entrada(features)
scaler = MinMaxScaler()
X_train_scale = scaler.fit_transform(X_train)
X_test_scale  = scaler.transform(X_test)


## -- -- -- -- -- -- --
## Treinamento usando regressão linear
lr = linear_model.LinearRegression()
lr.fit(X_train_scale, y_train)
pred= lr.predict(X_test_scale)
cd =r2_score(y_test, pred)

print(f'Coeficiente de determinação: {cd * 100:.2f}')

Coeficiente de determinação: 97.91


In [6]:
## -- -- -- -- -- -- --
## Usando regressão linear
datas_full = df.Datas[teste_de:teste_ate].copy()
datas_teste = datas_full[teste_de:teste_ate].copy()
fechamento_full = df.Fechamento.copy()
fechamento = fechamento_full[teste_de:teste_ate].copy()

df_previsao = pd.DataFrame({'Datas': datas_full, 'Real': fechamento, 'Previsao': pred})
# df_previsao.Previsao = df_previsao.Previsao.shift(+1)
df_previsao.set_index('Datas', inplace=True)
df_previsao['Diferenca'] =  (df_previsao.Previsao - df_previsao.Real)

a_prever = df_original[['Abertura','Alta','Baixa','Volume','MME50','BandaInferior','Min20','Fechamento']].tail(1)

# -- -- -- -- -- -- --
# Prevendo dados
previsao=scaler.transform( a_prever )
pred=lr.predict(previsao)
desv_pad = df_previsao.Diferenca.rolling(window=20).std()[-2]
print(f'Previsão: {pred[0]:,.2f}, Margem de erro: {desv_pad:,.2f}')

Previsão: 36.36, Margem de erro: 0.21


In [23]:
hoje = pd.DataFrame({'Datas': [datetime.now().strftime('%Y-%m-%d')], 'Previsao': [pred[0]]})
hoje.set_index('Datas', inplace=True)
df_analise = df_original[['Fechamento']].tail(31).join(df_previsao[['Previsao']].tail(31))
df_analise.rename(columns={'Fechamento': 'Real'}, inplace=True)
df_analise.Previsao = df_analise.Previsao.shift(+1)
df_analise['Diferenca'] =  (df_analise.Previsao - df_analise.Real)
df_analise = pd.concat([df_analise, hoje])
df_analise['DesvPad'] = df_analise.Diferenca.rolling(window=20).std().map('{:,.2f}'.format)
df_analise.DesvPad = df_analise.DesvPad.shift(+1)
df_analise.Real = df_analise.Real.map('{:,.2f}'.format)
df_analise.Previsao = df_analise.Previsao.map('{:,.2f}'.format)
df_analise.Diferenca = df_analise.Diferenca.map('{:,.2f}'.format)
# df_analise.tail(5)
graficos.LinhasPrevisão2(df_previsao=df_previsao.tail(50), papel='Banco do Brasil', df_prever=df_analise.tail(2))