In [30]:
# bibliotecas matemáticas
import math
import pandas as pd
import numpy as np

# bibliotecas para uso de redes neurais. Neste caso, do tipo LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dropout

# bibliotecas para o pré-processamento dos dados. Normalização, etc...
from sklearn.preprocessing import MinMaxScaler

# bibliotecas para representação dos dados em gráficos
import plotly.offline as py
import plotly.graph_objs as go

Carregamento do dataset ".csv" com as informações de preços da ação da Petrobras (PETR4) entre 01/01/2013 e 31/01/2020.

In [2]:
# Load the historical PETR4 quotes from a CSV file in the working directory.
df = pd.read_csv('PETR4.SA.csv')
# Preview the first rows to check which columns were read.
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2013-01-02,19.99,20.209999,19.690001,19.690001,16.812237,30182600.0
1,2013-01-03,19.809999,20.4,19.700001,20.4,17.418468,30552600.0
2,2013-01-04,20.33,20.620001,20.17,20.43,17.444086,36141000.0
3,2013-01-07,20.48,20.67,19.950001,20.08,17.145239,28069600.0
4,2013-01-08,20.110001,20.23,19.459999,19.5,16.650005,29091300.0


In [3]:
# Inspect the last rows to see where the series ends.
df.tail()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
1755,2020-01-24,29.559999,29.780001,29.139999,29.299999,29.297832,36898700.0
1756,2020-01-27,28.629999,28.639999,27.67,28.030001,28.027927,66105300.0
1757,2020-01-28,28.43,29.139999,28.42,28.799999,28.797869,47259100.0
1758,2020-01-29,29.0,29.08,28.67,28.85,28.847866,42297500.0
1759,2020-01-30,28.280001,29.040001,28.18,28.940001,28.93786,44266800.0


In [4]:
# Number of rows as loaded, before any cleaning.
len(df)

1760

Exclusão de valores nulos do dataset.

In [5]:
# Drop every row that contains at least one missing value.
# NOTE(review): this rebinds `df`, so the frame as loaded is no longer
# reachable afterwards without re-reading the CSV.
df = df.dropna()
# Number of rows left after the drop.
len(df)

1758

Gráfico dos dados de treinamento.

In [51]:
# Interactive line chart of the opening price over the training period.
# The trace is named after what it holds (the data is a stock, not bitcoin)
# and the figure gets a title and axis labels so it can be read on its own.
open_price_trace = go.Scatter(x=df['Date'], y=df['Open'], name='Price')
training_layout = go.Layout(
    title='PETR4 opening price (training data)',
    xaxis=dict(title='Date'),
    yaxis=dict(title='Open'),
)
py.iplot(go.Figure(data=[open_price_trace], layout=training_layout))

Separação da coluna de valores de abertura para treinamento.

In [6]:
# Training input: the opening price as a 2-D (n_rows, 1) array.
# The column is selected by name rather than by position, so the right
# column is still picked if the CSV column order ever changes.
train = df[['Open']].values
train

array([[19.99    ],
       [19.809999],
       [20.33    ],
       ...,
       [28.43    ],
       [29.      ],
       [28.280001]])

Normalização dos dados em escala de 0 a 1. Utilizada para otimização do tempo de processamento dos cálculos nas múltiplas camadas da rede neural.

In [7]:
# Fit a min-max scaler on the training prices and rescale them to [0, 1].
# The fitted scaler is kept because later cells reuse it on the test data
# and to map predictions back to prices.
normalizer = MinMaxScaler(feature_range=(0, 1)).fit(train)
normalized_train = normalizer.transform(train)
normalized_train[:10]

array([[0.59160737],
       [0.58486323],
       [0.60434622],
       [0.6099663 ],
       [0.59610347],
       [0.5784938 ],
       [0.58336458],
       [0.58636196],
       [0.59235671],
       [0.59235671]])

Função para separação dos dados em dados de entrada (previsores) e dados de saída (previstos). A variável look_back indica quantos valores anteriores serão utilizados para a previsão de cada saída.

In [8]:
def dataset_with_look_back(dataset, look_back=1):
    """Split a series into sliding-window inputs and next-step targets.

    Parameters
    ----------
    dataset : array-like of shape (n_rows, n_columns)
        Source values; only column 0 is used.
    look_back : int, default 1
        Number of consecutive previous values that make up each input window.

    Returns
    -------
    dataX : numpy.ndarray of shape (n_rows - look_back, look_back)
        Row ``i`` holds ``dataset[i:i + look_back, 0]``.
    dataY : numpy.ndarray of shape (n_rows - look_back,)
        Element ``i`` holds ``dataset[i + look_back, 0]``, the value that
        follows window ``i``.

    Raises
    ------
    ValueError
        If ``look_back`` is negative.
    """
    if look_back < 0:
        # A negative window would silently wrap around through negative indexing.
        raise ValueError("look_back must be non-negative, got %r" % (look_back,))

    series = np.asarray(dataset)[:, 0]
    n_windows = len(series) - look_back

    if n_windows <= 0:
        # Not enough rows for a single window: keep the documented 2-D shape
        # so callers can still read dataX.shape[1].
        return (np.empty((0, look_back), dtype=series.dtype),
                np.empty((0,), dtype=series.dtype))

    dataX = np.array([series[start:start + look_back] for start in range(n_windows)])
    # Copy so the targets do not alias the caller's array.
    dataY = series[look_back:].copy()
    return dataX, dataY

In [12]:
# Build the supervised samples from the normalized training series using a
# 90-value look-back window, then preview the first ten input windows.
predictors, predicted_prices = dataset_with_look_back(normalized_train, look_back=90)
predictors[:10]

array([[0.59160737, 0.58486323, 0.60434622, 0.6099663 , 0.59610347,
        0.5784938 , 0.58336458, 0.58636196, 0.59235671, 0.59235671,
        0.58786061, 0.58673666, 0.58448856, 0.57587113, 0.57025105,
        0.57025105, 0.56837773, 0.58186589, 0.56762835, 0.55414015,
        0.52678908, 0.52229298, 0.52304236, 0.49007124, 0.5144249 ,
        0.5076808 , 0.50018732, 0.51517428, 0.50917952, 0.50243543,
        0.50355937, 0.51367555, 0.52079432, 0.49718994, 0.49868867,
        0.48145371, 0.47133763, 0.47171231, 0.47058825, 0.46084678,
        0.47283629, 0.46571748, 0.50206072, 0.5286624 , 0.54664674,
        0.52791313, 0.54889474, 0.54739601, 0.55376548, 0.56950171,
        0.5507681 , 0.5507681 , 0.55638814, 0.54402396, 0.53765453,
        0.54439867, 0.54439867, 0.53952793, 0.53278383, 0.52716375,
        0.52716375, 0.50505809, 0.51517428, 0.50131137, 0.51180223,
        0.49756465, 0.52491568, 0.53465716, 0.51479961, 0.50580747,
        0.49718994, 0.496066  , 0.49156989, 0.52

In [13]:
# Preview the first ten targets (still on the normalized scale).
predicted_prices[0:10]

array([0.57324843, 0.57399777, 0.58860999, 0.58860999, 0.59910085,
       0.60209819, 0.57512175, 0.58786061, 0.59235671, 0.59797679])

Reorganização dos dados de treino e teste para o formato aceito pela camada LSTM do Keras. O formato é (batch_size, timesteps, input_dim), onde:

* __batch_size:__ quantidade de dados de entrada
* __timesteps:__ quantidade de intervalos temporais. Ex: 5 em 5 dados de entrada.
* __input_dim:__ quantidade de atributos utilizados na previsão. Ex: Open e High = 2 atributos

In [14]:
# Append a trailing axis of size 1, keeping the first two dimensions as they are.
predictors = predictors.reshape(*predictors.shape[:2], 1)
predictors.shape

(1668, 90, 1)

Estruturação da Rede Neural LSTM.

In [15]:
# Stacked LSTM regressor: four recurrent layers, each followed by dropout,
# and a single linear output unit.
model = Sequential([
    # layer 1: `units` is the number of memory cells
    LSTM(units=100, return_sequences=True, input_shape=(predictors.shape[1], 1)),
    # dropout is used to avoid overfitting
    Dropout(0.3),

    # layer 2
    LSTM(units=50, return_sequences=True),
    Dropout(0.3),

    # layer 3
    LSTM(units=50, return_sequences=True),
    Dropout(0.3),

    # layer 4 (return_sequences left at its default)
    LSTM(units=50),
    Dropout(0.3),

    # final layer with a single unit for a single output
    Dense(units=1, activation='linear'),
])

# optimizer: the function used for the gradient computation
# loss: the error used to adjust the weights
model.compile(
    loss="mean_squared_error",
    optimizer="rmsprop",
    metrics=["mean_absolute_error"],
)

model.fit(predictors, predicted_prices, batch_size=32, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100


Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<tensorflow.python.keras.callbacks.History at 0x7f25484f8c10>

Leitura de base de dados de teste.

In [22]:
# Load the test quotes and drop rows with missing values, as was done for
# the training file.
df_test = pd.read_csv('PETR4.SA_teste.csv').dropna()
# Opening prices as a 2-D (n_rows, 1) array. The column is selected by name
# rather than by position so a change in CSV column order cannot pick the
# wrong column.
test_prices = df_test[['Open']].values
test_prices

array([[28.52    ],
       [28.65    ],
       [29.049999],
       [28.700001],
       [29.      ],
       [28.889999],
       [29.4     ],
       [29.639999],
       [29.77    ],
       [29.860001],
       [29.629999],
       [29.059999],
       [29.870001],
       [30.799999],
       [29.51    ],
       [25.719999],
       [25.16    ]])

In [52]:
# Interactive line chart of the opening price over the test period.
# The trace is named after what it holds (the data is a stock, not bitcoin)
# and the figure gets a title and axis labels so it can be read on its own.
test_open_trace = go.Scatter(x=df_test['Date'], y=df_test['Open'], name='Price')
test_layout = go.Layout(
    title='PETR4 opening price (test data)',
    xaxis=dict(title='Date'),
    yaxis=dict(title='Open'),
)
py.iplot(go.Figure(data=[test_open_trace], layout=test_layout))

Definição e tratamento dos dados de entrada de teste utilizados.

In [46]:
# Each test prediction needs the 90 opening prices that precede it, so take
# the last 90 training prices together with the whole test series.
complete_df = pd.concat((df['Open'], df_test['Open']), axis=0)
# .iloc makes the slice explicitly positional; concat keeps the index labels
# of both frames, so labels are not a reliable way to address rows here.
test = complete_df.iloc[-(len(df_test) + 90):].values.reshape(-1, 1)
# Reuse the scaler fitted on the training prices; it is not refitted here.
normalized_test = normalizer.transform(test)
# Only the last expression of a cell is displayed, so the stray
# `normalized_test[-10:]` expression that used to sit here was removed.
test[0:10]

array([[27.83    ],
       [27.35    ],
       [27.      ],
       [27.42    ],
       [27.030001],
       [27.4     ],
       [27.549999],
       [27.559999],
       [27.6     ],
       [27.219999]])

In [26]:
# Build the test windows and targets from the normalized test series with
# the same 90-value look-back, then preview the first ten windows.
Xtest, Ytest = dataset_with_look_back(normalized_test, look_back=90)
Xtest[:10]

array([[0.46596867, 0.38219902, 0.32111698, 0.39441543, 0.32635276,
        0.39092502, 0.41710287, 0.41884807, 0.42582904, 0.35951123,
        0.27748696, 0.29493897, 0.2111695 , 0.1762653 , 0.20767892,
        0.24956374, 0.30890058, 0.32809796, 0.37521804, 0.41710287,
        0.48342068, 0.4764397 , 0.45200706, 0.46073289, 0.58813291,
        0.6963352 , 0.70506138, 0.70855143, 0.73996523, 0.80453766,
        0.82024465, 0.94764414, 1.        , 0.91623053, 0.85340312,
        0.84467729, 0.9232115 , 0.81849927, 0.85165826, 0.83595145,
        0.8237347 , 0.79232143, 0.71378721, 0.65794078, 0.80279246,
        0.83071568, 0.77835982, 0.73123922, 0.72600362, 0.75567203,
        0.73996523, 0.6998256 , 0.71553241, 0.78184987, 0.8708553 ,
        0.89703314, 0.8621293 , 0.93368254, 0.95636997, 1.        ,
        0.85340312, 0.78184987, 0.80104726, 0.9232115 , 0.94938935,
        0.87783628, 0.94764414, 0.96509633, 0.94066317, 0.93368254,
        0.9982548 , 0.91972093, 0.98778377, 0.96

In [35]:
# Display the test targets (normalized values at this point).
Ytest

array([0.58638754, 0.60907515, 0.67888302, 0.61780133, 0.67015719,
       0.6509598 , 0.73996523, 0.78184987, 0.80453766, 0.82024465,
       0.78010467, 0.68062822, 0.82198985, 0.98429319, 0.75916244,
       0.09773108, 0.        ])

In [28]:
# Append a trailing axis of size 1, keeping the first two dimensions as they are.
Xtest = Xtest.reshape(*Xtest.shape[:2], 1)
Xtest.shape

(17, 90, 1)

In [43]:
# Predict on the normalized scale, then map the predictions back to prices.
predict_test = normalizer.inverse_transform(model.predict(Xtest))
# Recover the real prices from `normalized_test` rather than transforming
# `Ytest` from itself. The targets are the rows that follow the initial
# look-back window, i.e. the last len(Xtest) rows. Reading them from an
# array this cell never modifies means re-running the cell cannot apply the
# inverse transform twice.
Ytest = normalizer.inverse_transform(
    normalized_test[len(normalized_test) - len(Xtest):]
)

In [49]:
# Print the mean real price and the mean predicted price over the test period.
for label, prices in (
    ("Média de preço real:", Ytest),
    ("Média de preço previsto:", predict_test),
):
    print(label, prices.mean())

Média de preço real: 28.895882117647062
Média de preço previsto: 29.317429


In [53]:
# Overlay predicted and real opening prices on the test dates.
# The figure gets a title and axis labels so it can be read on its own.
predictDates = df_test['Date']
actual_chart = go.Scatter(x=predictDates, y=Ytest[:,0], name='Actual Price')
predict_chart = go.Scatter(x=predictDates, y=predict_test[:,0], name='Predict Price')
comparison_layout = go.Layout(
    title='PETR4 opening price: predicted vs. actual',
    xaxis=dict(title='Date'),
    yaxis=dict(title='Open'),
)
py.iplot(go.Figure(data=[predict_chart, actual_chart], layout=comparison_layout))