In [1]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.stats.diagnostic import acorr_ljungbox
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.arima_process import ArmaProcess
from statsmodels.graphics.gofplots import qqplot
from statsmodels.tsa.stattools import adfuller
from tqdm import tqdm_notebook
from itertools import product
from typing import Union

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [3]:
data_path = 'C:\\Tera\\Projeto\\data\\milho-cepea_RS.csv'
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,date,RS
0,05/01/2004,20.83
1,12/01/2004,19.66
2,30/07/2004,18.14
3,02/08/2004,18.24
4,03/08/2004,18.04


In [3]:
year_period = 252

eps_diff = np.diff(df['RS'], n=year_period)
ad_fuller_result = adfuller(eps_diff)

print(f'ADF Statistic: {ad_fuller_result[0]}')
print(f'p-value: {ad_fuller_result[1]}')

ADF Statistic: -447.5166208784025
p-value: 0.0


In [2]:
from statsmodels.tsa.statespace.sarimax import SARIMAX
from tqdm import tqdm_notebook
from itertools import product
from typing import Union
import pandas as pd

def optimize_SARIMA(endog: Union[pd.Series, list], order_list: list, d: int, D: int, s: int) -> pd.DataFrame:
    
    results = []
    
    for order in tqdm_notebook(order_list):
        try: 
            model = SARIMAX(
                endog, 
                order=(order[0], d, order[1]),
                seasonal_order=(order[2], D, order[3], s),
                simple_differencing=False).fit(disp=False)
        except:
            continue
            
        aic = model.aic
        print(f'order: {order}||aic: {aic}')
        results.append([order, aic])
        
    result_df = pd.DataFrame(results)
    result_df.columns = ['(p,q,P,Q)', 'AIC']
    
    #Sort in ascending order, lower AIC is better
    result_df = result_df.sort_values(by='AIC', ascending=True).reset_index(drop=True)
    
    return result_df

In [3]:
data_path = 'C:\\Tera\\Projeto\\data\\milho-cepea_RS.csv'
df = pd.read_csv(data_path)

train_interval = int(len(df['RS'])*0.9)
test_interval = len(df['RS']) - train_interval

train = df['RS'][0:train_interval]

In [4]:
ps = range(0, 4, 1)
qs = range(0, 4, 1)
Ps = range(0, 4, 1)
Qs = range(0, 4, 1)

SARIMA_order_list = list(product(ps, qs, Ps, Qs))

d = 1
D = 0
s = 252

In [7]:
order = SARIMA_order_list[1]

In [8]:
model = SARIMAX(
                train, 
                order=(order[0], d, order[1]),
                seasonal_order=(order[2], D, order[3], s),
                simple_differencing=False).fit(disp=False)

In [8]:
SARIMA_order_list[1]

(0, 0, 0, 1)

In [5]:
SARIMA_result_df = optimize_SARIMA(train, SARIMA_order_list, d, D, s)
SARIMA_result_df

  0%|          | 0/256 [00:00<?, ?it/s]

order: (0, 0, 0, 0)||aic: 2880.885026723224
order: (0, 0, 0, 1)||aic: 2880.08107740004
