<a href="https://colab.research.google.com/github/BrunoFUAL/modelos_previsao/blob/main/pycaret.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#instalar as bibliotecas que serão utilizadas

!pip install pycaret
!pip install yfinance


In [None]:
# para permitir que o pycaret seja utilizado dentro do google colab

from pycaret.utils import enable_colab
enable_colab()

In [4]:
#Importar as bibliotecas

import yfinance as yf
import pandas as pd

In [5]:
#Escolha da ação - neste caso a Apple

df = yf.Ticker('AAPL')

#Escolha do intervalo de dados - foi selecionado um periodo de 10 anos.

dataset = df.history(period='10y')
dataset

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2012-05-29,17.433207,17.527869,17.262509,17.475040,380508800,0.0,0.0
2012-05-30,17.381294,17.710781,17.300678,17.685741,529429600,0.0,0.0
2012-05-31,17.733684,17.756892,17.450308,17.641771,491674400,0.0,0.0
2012-06-01,17.380071,17.486643,17.116237,17.130589,520987600,0.0,0.0
2012-06-04,17.146162,17.329379,16.749190,17.231358,556995600,0.0,0.0
...,...,...,...,...,...,...,...
2022-05-23,137.789993,143.259995,137.649994,143.110001,117726300,0.0,0.0
2022-05-24,140.809998,141.970001,137.330002,140.360001,104132700,0.0,0.0
2022-05-25,138.429993,141.789993,138.339996,140.520004,92482700,0.0,0.0
2022-05-26,137.389999,144.339996,137.139999,143.779999,90601500,0.0,0.0


In [6]:
#Efetuar a "limpeza" do dataset, eliminando os campos que não vão ser utilizados

dataset = dataset.drop(['Dividends', 'Stock Splits'], axis=1)
dataset

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2012-05-29,17.433207,17.527869,17.262509,17.475040,380508800
2012-05-30,17.381294,17.710781,17.300678,17.685741,529429600
2012-05-31,17.733684,17.756892,17.450308,17.641771,491674400
2012-06-01,17.380071,17.486643,17.116237,17.130589,520987600
2012-06-04,17.146162,17.329379,16.749190,17.231358,556995600
...,...,...,...,...,...
2022-05-23,137.789993,143.259995,137.649994,143.110001,117726300
2022-05-24,140.809998,141.970001,137.330002,140.360001,104132700
2022-05-25,138.429993,141.789993,138.339996,140.520004,92482700
2022-05-26,137.389999,144.339996,137.139999,143.779999,90601500


In [7]:
#Criar novos atributos
#Media movel de 7 dias e de 30 dias


dataset['MM7d'] = dataset['Close'].rolling(window=7).mean().round(2)
dataset['MM30d'] = dataset['Close'].rolling(window=30).mean().round(2)
dataset


Unnamed: 0_level_0,Open,High,Low,Close,Volume,MM7d,MM30d
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2012-05-29,17.433207,17.527869,17.262509,17.475040,380508800,,
2012-05-30,17.381294,17.710781,17.300678,17.685741,529429600,,
2012-05-31,17.733684,17.756892,17.450308,17.641771,491674400,,
2012-06-01,17.380071,17.486643,17.116237,17.130589,520987600,,
2012-06-04,17.146162,17.329379,16.749190,17.231358,556995600,,
...,...,...,...,...,...,...,...
2022-05-23,137.789993,143.259995,137.649994,143.110001,117726300,142.97,156.14
2022-05-24,140.809998,141.970001,137.330002,140.360001,104132700,142.00,155.30
2022-05-25,138.429993,141.789993,138.339996,140.520004,92482700,141.28,154.40
2022-05-26,137.389999,144.339996,137.139999,143.779999,90601500,140.50,153.52


In [8]:
#5 dias para previsão

dataset_prever = dataset.tail(5)
dataset_prever

Unnamed: 0_level_0,Open,High,Low,Close,Volume,MM7d,MM30d
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2022-05-23,137.789993,143.259995,137.649994,143.110001,117726300,142.97,156.14
2022-05-24,140.809998,141.970001,137.330002,140.360001,104132700,142.0,155.3
2022-05-25,138.429993,141.789993,138.339996,140.520004,92482700,141.28,154.4
2022-05-26,137.389999,144.339996,137.139999,143.779999,90601500,140.5,153.52
2022-05-27,145.389999,149.679993,145.259995,149.639999,90796900,141.76,153.01


In [9]:
#Retirar os ultimos 5 dias do nosso dataset
#O inplace serve para retirar os dados efetivamente do dataset

dataset.drop(dataset.tail(5).index, inplace=True)
dataset

Unnamed: 0_level_0,Open,High,Low,Close,Volume,MM7d,MM30d
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2012-05-29,17.433207,17.527869,17.262509,17.475040,380508800,,
2012-05-30,17.381294,17.710781,17.300678,17.685741,529429600,,
2012-05-31,17.733684,17.756892,17.450308,17.641771,491674400,,
2012-06-01,17.380071,17.486643,17.116237,17.130589,520987600,,
2012-06-04,17.146162,17.329379,16.749190,17.231358,556995600,,
...,...,...,...,...,...,...,...
2022-05-16,145.550003,147.520004,144.179993,145.539993,86643800,149.37,161.41
2022-05-17,148.860001,149.770004,146.679993,149.240005,78336300,148.22,160.45
2022-05-18,146.850006,147.360001,139.899994,140.820007,109742900,146.61,159.31
2022-05-19,139.880005,141.660004,136.600006,137.350006,136095600,144.16,158.17


In [10]:
#Avançar para a frente os valores das ações uma vez que o valor a analisar é sempre o de fecho do dia seguinte

dataset['Close'] = dataset['Close'].shift(-1)
dataset

Unnamed: 0_level_0,Open,High,Low,Close,Volume,MM7d,MM30d
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2012-05-29,17.433207,17.527869,17.262509,17.685741,380508800,,
2012-05-30,17.381294,17.710781,17.300678,17.641771,529429600,,
2012-05-31,17.733684,17.756892,17.450308,17.130589,491674400,,
2012-06-01,17.380071,17.486643,17.116237,17.231358,520987600,,
2012-06-04,17.146162,17.329379,16.749190,17.186779,556995600,,
...,...,...,...,...,...,...,...
2022-05-16,145.550003,147.520004,144.179993,149.240005,86643800,149.37,161.41
2022-05-17,148.860001,149.770004,146.679993,140.820007,78336300,148.22,160.45
2022-05-18,146.850006,147.360001,139.899994,137.350006,109742900,146.61,159.31
2022-05-19,139.880005,141.660004,136.600006,137.589996,136095600,144.16,158.17


In [11]:
#Serão retirados os valores nulos (NaN)

dataset.dropna(inplace=True)
dataset

Unnamed: 0_level_0,Open,High,Low,Close,Volume,MM7d,MM30d
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2012-07-10,18.870551,18.928570,18.483961,18.457092,511957600,18.38,17.74
2012-07-11,18.508698,18.555724,18.236925,18.288223,469322000,18.47,17.77
2012-07-12,18.329142,18.427774,18.098287,18.473572,428041600,18.50,17.79
2012-07-13,18.411889,18.541362,18.321806,18.532825,311427200,18.52,17.82
2012-07-16,18.478165,18.676651,18.475111,18.533737,301260400,18.51,17.86
...,...,...,...,...,...,...,...
2022-05-13,144.589996,148.100006,143.110001,145.539993,113787000,150.94,162.36
2022-05-16,145.550003,147.520004,144.179993,149.240005,86643800,149.37,161.41
2022-05-17,148.860001,149.770004,146.679993,140.820007,78336300,148.22,160.45
2022-05-18,146.850006,147.360001,139.899994,137.350006,109742900,146.61,159.31


In [12]:
#Retirar o indice (a data neste caso)

dataset.reset_index(drop=True, inplace=True)
dataset_prever.reset_index(drop=True, inplace=True)

In [13]:
dataset

Unnamed: 0,Open,High,Low,Close,Volume,MM7d,MM30d
0,18.870551,18.928570,18.483961,18.457092,511957600,18.38,17.74
1,18.508698,18.555724,18.236925,18.288223,469322000,18.47,17.77
2,18.329142,18.427774,18.098287,18.473572,428041600,18.50,17.79
3,18.411889,18.541362,18.321806,18.532825,311427200,18.52,17.82
4,18.478165,18.676651,18.475111,18.533737,301260400,18.51,17.86
...,...,...,...,...,...,...,...
2478,144.589996,148.100006,143.110001,145.539993,113787000,150.94,162.36
2479,145.550003,147.520004,144.179993,149.240005,86643800,149.37,161.41
2480,148.860001,149.770004,146.679993,140.820007,78336300,148.22,160.45
2481,146.850006,147.360001,139.899994,137.350006,109742900,146.61,159.31


In [14]:
!pip install Jinja2
!pip install markupsafe==2.0.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting markupsafe==2.0.1
  Downloading MarkupSafe-2.0.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (31 kB)
Installing collected packages: markupsafe
  Attempting uninstall: markupsafe
    Found existing installation: MarkupSafe 2.1.1
    Uninstalling MarkupSafe-2.1.1:
      Successfully uninstalled MarkupSafe-2.1.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pandas-profiling 3.2.0 requires markupsafe~=2.1.1, but you have markupsafe 2.0.1 which is incompatible.
datascience 0.10.6 requires folium==0.2.1, but you have folium 0.8.3 which is incompatible.[0m
Successfully installed markupsafe-2.0.1


In [15]:
from pycaret.regression import *

  defaults = yaml.load(f)


In [16]:
#Neste ponto são definidos os parametros que serão trabalhados 
#o dataset, o atributo a ser previsto e um session id que permite reproduzir as mesmas informações
#O preço de fecho fica assinalado como LABEL

setup(data=dataset, target='Close', session_id=123)

Unnamed: 0,Description,Value
0,session_id,123
1,Target,Close
2,Original Data,"(2483, 7)"
3,Missing Values,False
4,Numeric Features,6
5,Categorical Features,0
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(1738, 2)"


(0        18.457092
 1        18.288223
 2        18.473572
 3        18.532825
 4        18.533737
            ...    
 2478    145.539993
 2479    149.240005
 2480    140.820007
 2481    137.350006
 2482    137.589996
 Name: Close, Length: 2483, dtype: float32,
             Open        High         Low       Close     Volume    MM7d  \
 0      18.870551   18.928570   18.483961   18.457092  511957600   18.38   
 1      18.508698   18.555724   18.236925   18.288223  469322000   18.47   
 2      18.329142   18.427774   18.098287   18.473572  428041600   18.50   
 3      18.411889   18.541362   18.321806   18.532825  311427200   18.52   
 4      18.478165   18.676651   18.475111   18.533737  301260400   18.51   
 ...          ...         ...         ...         ...        ...     ...   
 2478  144.589996  148.100006  143.110001  145.539993  113787000  150.94   
 2479  145.550003  147.520004  144.179993  149.240005   86643800  149.37   
 2480  148.860001  149.770004  146.679993  140.82000

In [17]:
#O pycret tras varios modelos dentro da biblioteca (aproximadamente 25 modelos)
#Neste caso será seleccionado o top 3 dos modelos com base en determinadas metricas (MAE, MSE, RMSE, etc)
#Os modelos estão organizados através do R2 que determina os valores que estão em cima da linha de regressão

top3 =compare_models(n_select = 3)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lr,Linear Regression,0.9331,3.0157,1.7218,0.9985,0.0214,0.0161,0.279
lasso,Lasso Regression,0.9341,3.0163,1.722,0.9985,0.0215,0.0162,0.011
ridge,Ridge Regression,0.9331,3.0157,1.7218,0.9985,0.0214,0.0161,0.011
en,Elastic Net,0.9341,3.0163,1.7219,0.9985,0.0215,0.0162,0.012
lar,Least Angle Regression,0.9331,3.0157,1.7218,0.9985,0.0214,0.0161,0.011
omp,Orthogonal Matching Pursuit,0.9332,3.0157,1.722,0.9985,0.0214,0.0161,0.012
br,Bayesian Ridge,0.9331,3.0157,1.7218,0.9985,0.0214,0.0161,0.011
gbr,Gradient Boosting Regressor,1.0412,3.6892,1.9042,0.9982,0.0234,0.018,0.114
lightgbm,Light Gradient Boosting Machine,1.057,4.0085,1.9903,0.998,0.024,0.0179,0.081
rf,Random Forest Regressor,1.0679,4.1334,2.0157,0.998,0.0242,0.0182,0.568


In [18]:
print(top3)

[LinearRegression(copy_X=True, fit_intercept=True, n_jobs=-1, normalize=False), Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=123,
      selection='cyclic', tol=0.0001, warm_start=False), Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=123, solver='auto', tol=0.001)]


In [19]:
#Para visualizar todos os modelos que estão dentro do pycaret podemos utilizar o seguinte comando

models()

Unnamed: 0_level_0,Name,Reference,Turbo
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
lr,Linear Regression,sklearn.linear_model._base.LinearRegression,True
lasso,Lasso Regression,sklearn.linear_model._coordinate_descent.Lasso,True
ridge,Ridge Regression,sklearn.linear_model._ridge.Ridge,True
en,Elastic Net,sklearn.linear_model._coordinate_descent.Elast...,True
lar,Least Angle Regression,sklearn.linear_model._least_angle.Lars,True
llar,Lasso Least Angle Regression,sklearn.linear_model._least_angle.LassoLars,True
omp,Orthogonal Matching Pursuit,sklearn.linear_model._omp.OrthogonalMatchingPu...,True
br,Bayesian Ridge,sklearn.linear_model._bayes.BayesianRidge,True
ard,Automatic Relevance Determination,sklearn.linear_model._bayes.ARDRegression,False
par,Passive Aggressive Regressor,sklearn.linear_model._passive_aggressive.Passi...,True


In [20]:
ridge = create_model('ridge', fold=10)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.8991,2.4969,1.5802,0.9988,0.023,0.0179
1,0.8237,2.6932,1.6411,0.9988,0.0194,0.0144
2,1.1768,4.9796,2.2315,0.9975,0.0245,0.0181
3,0.9869,3.4718,1.8633,0.9985,0.0213,0.0159
4,1.042,3.7598,1.939,0.9981,0.0252,0.0179
5,0.7742,1.9844,1.4087,0.999,0.0191,0.0139
6,0.9857,3.1075,1.7628,0.9985,0.0202,0.0163
7,0.8311,2.3219,1.5238,0.9988,0.0188,0.014
8,0.9113,2.5496,1.5968,0.9988,0.0204,0.016
9,0.8999,2.7927,1.6711,0.9983,0.0221,0.0167


In [21]:
lasso = create_model('lasso', fold=10)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.9028,2.5071,1.5834,0.9988,0.023,0.0179
1,0.829,2.706,1.645,0.9988,0.0196,0.0146
2,1.1787,4.9919,2.2343,0.9975,0.0246,0.0182
3,0.9872,3.4599,1.8601,0.9985,0.0215,0.016
4,1.0416,3.7536,1.9374,0.9982,0.0252,0.0179
5,0.7721,1.9705,1.4038,0.999,0.0191,0.0139
6,0.9856,3.1076,1.7628,0.9985,0.0202,0.0163
7,0.8329,2.3324,1.5272,0.9988,0.0188,0.014
8,0.914,2.5697,1.603,0.9988,0.0205,0.016
9,0.8969,2.7644,1.6627,0.9984,0.0221,0.0167


In [22]:
lr = create_model('lr', fold=10)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.8991,2.4969,1.5802,0.9988,0.023,0.0179
1,0.8237,2.6932,1.6411,0.9988,0.0194,0.0144
2,1.1768,4.9796,2.2315,0.9975,0.0245,0.0181
3,0.9869,3.4718,1.8633,0.9985,0.0213,0.0159
4,1.042,3.7598,1.939,0.9981,0.0252,0.0179
5,0.7742,1.9844,1.4087,0.999,0.0191,0.0139
6,0.9857,3.1075,1.7628,0.9985,0.0202,0.0163
7,0.8311,2.3219,1.5238,0.9988,0.0188,0.014
8,0.9113,2.5497,1.5968,0.9988,0.0204,0.016
9,0.8999,2.7927,1.6711,0.9983,0.0221,0.0167


In [26]:
#Efetuar tunning nos parametros dos modelos para eveitar que fique com "overfit"

ridge_params = {'alpha': [0.02,0.024,0.025,0.026,0.03]}
tunne_ridge = tune_model(ridge, n_iter=1000, optimize='RMSE', custom_grid=ridge_params)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.8991,2.4969,1.5802,0.9988,0.023,0.0179
1,0.8237,2.6932,1.6411,0.9988,0.0194,0.0144
2,1.1768,4.9796,2.2315,0.9975,0.0245,0.0181
3,0.9869,3.4718,1.8633,0.9985,0.0213,0.0159
4,1.042,3.7598,1.939,0.9981,0.0252,0.0179
5,0.7742,1.9844,1.4087,0.999,0.0191,0.0139
6,0.9857,3.1075,1.7628,0.9985,0.0202,0.0163
7,0.8311,2.3219,1.5238,0.9988,0.0188,0.014
8,0.9113,2.5496,1.5968,0.9988,0.0204,0.016
9,0.8999,2.7927,1.6711,0.9983,0.0221,0.0167


In [27]:
tunne_lasso = tune_model(lasso, n_iter=1000, optimize='RMSE')

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.8989,2.4988,1.5808,0.9988,0.0234,0.018
1,0.8098,2.6788,1.6367,0.9988,0.0189,0.0139
2,1.1743,4.9657,2.2284,0.9975,0.0243,0.0179
3,0.9844,3.4743,1.864,0.9985,0.021,0.0157
4,1.0424,3.7607,1.9392,0.9981,0.0252,0.0179
5,0.7742,1.9887,1.4102,0.999,0.0189,0.0139
6,0.9885,3.1137,1.7646,0.9985,0.0201,0.0163
7,0.8359,2.3214,1.5236,0.9988,0.0188,0.0141
8,0.914,2.5385,1.5933,0.9988,0.0204,0.016
9,0.9003,2.791,1.6706,0.9983,0.0221,0.0167


In [28]:
tunne_lr = tune_model(lr, n_iter=1000, optimize='RMSE')

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.8974,2.4946,1.5794,0.9988,0.0234,0.0179
1,0.8097,2.676,1.6359,0.9988,0.0189,0.0139
2,1.1738,4.963,2.2278,0.9975,0.0243,0.0179
3,0.9852,3.4791,1.8652,0.9985,0.021,0.0157
4,1.0426,3.7636,1.94,0.9981,0.0252,0.0179
5,0.7753,1.9939,1.4121,0.999,0.0189,0.0139
6,0.9883,3.113,1.7644,0.9985,0.0201,0.0163
7,0.835,2.3177,1.5224,0.9988,0.0188,0.0141
8,0.9125,2.5317,1.5911,0.9988,0.0204,0.016
9,0.901,2.8004,1.6734,0.9983,0.0221,0.0167
