Reference links:
http://strftime.org/
http://machinelearningmastery.com/grid-search-arima-hyperparameters-with-python/#comment-401609

In [1]:
import warnings
#suppress unnecessary warnings
warnings.filterwarnings("ignore")
from pandas import read_csv
from pandas import datetime
from statsmodels.tsa.arima_model import ARIMA
from sklearn.metrics import mean_squared_error

In [2]:
# evaluate an ARIMA model for a given order (p,d,q)
def evaluate_arima_model(X, arima_order, train_fraction=0.66):
    """Walk-forward validate one ARIMA order and return its out-of-sample error.

    The leading ``train_fraction`` of ``X`` seeds the history. For each
    remaining observation a new ARIMA model is fitted on the history, a
    forecast is taken from it, and the true observation is then appended to
    the history so that it is available when forecasting the next one.

    Parameters
    ----------
    X : sequence
        Observations in time order; must support ``len()`` and slicing.
    arima_order : tuple
        The ``(p, d, q)`` order, passed unchanged as ``ARIMA(order=...)``.
    train_fraction : float, optional
        Share of ``X`` used as the initial training window. Defaults to 0.66,
        the split that was previously hard-coded.

    Returns
    -------
    The result of ``mean_squared_error(test, predictions)``.
    """
    # prepare training dataset
    train_size = int(len(X) * train_fraction)
    train, test = X[:train_size], X[train_size:]
    history = list(train)
    # make predictions
    predictions = []
    for actual in test:
        model_fit = ARIMA(history, order=arima_order).fit(disp=0)
        # the first element of what forecast() returns is used as the prediction
        predictions.append(model_fit.forecast()[0])
        # once the forecast is made, the real observation joins the history
        # so it can inform the forecast of the subsequent observation
        history.append(actual)
    # calculate out of sample error
    return mean_squared_error(test, predictions)

In [3]:
# evaluate combinations of p, d and q values for an ARIMA model
def evaluate_models(dataset, p_values, d_values, q_values):
    """Grid-search ARIMA orders, printing each order's MSE and the best one.

    Every ``(p, d, q)`` combination drawn from the three iterables is scored
    with ``evaluate_arima_model``. Orders whose evaluation raises are skipped.

    Parameters
    ----------
    dataset : array-like
        Observations in time order; must provide ``astype`` (it is cast to
        ``float32`` before evaluation).
    p_values, d_values, q_values : iterable
        Candidate values for each component of the ARIMA order. ``d_values``
        and ``q_values`` are iterated repeatedly, so they must be re-iterable
        (lists, ranges), not one-shot generators.

    Returns
    -------
    None. Results are reported via ``print`` only.
    """
    dataset = dataset.astype('float32')
    best_score, best_cfg = float("inf"), None
    for p in p_values:
        for d in d_values:
            for q in q_values:
                order = (p, d, q)
                try:
                    mse = evaluate_arima_model(dataset, order)
                    if mse < best_score:
                        best_score, best_cfg = mse, order
                    print('ARIMA%s MSE=%.3f' % (order, mse))
                except Exception:
                    # Best-effort search: an order that fails is skipped.
                    # Only Exception is caught so that KeyboardInterrupt and
                    # SystemExit still stop a long-running search.
                    continue
    print('Best ARIMA%s MSE=%.3f' % (best_cfg, best_score))

In [6]:
# load dataset
#The timestamps in the time series do not contain an absolute year component.
#We can use a custom date-parsing function when loading the data and baseline the year from 1900
def parser(x):
    """Turn a '<year offset>-<month>' label such as '1-Jan' into a datetime.

    The number before the first '-' is treated as an offset from 1900 and is
    zero-padded to two digits, so both single-digit offsets ('1-Jan' ->
    1901-01) and offsets of 10 or more ('10-Jan' -> 1910-01) produce a
    four-digit year. The remainder of the label is parsed with ``%b``.

    Parameters
    ----------
    x : str
        Label of the form '<int>-<month abbreviation>'.

    Returns
    -------
    datetime
        The result of ``datetime.strptime`` on the rebuilt 'YYYY-Mon' string.
    """
    year_offset, month_abbr = x.split('-', 1)
    return datetime.strptime("19%02d-%s" % (int(year_offset), month_abbr), "%Y-%b")


# The load must sit at module level: when it was indented under parser() it
# came after the return statements and `series` was never defined.
series = read_csv('sales-of-shampoo-over-a-three-ye.csv', header=0, parse_dates=[0], index_col=0,
                  squeeze=True, date_parser=parser)
# evaluate parameters
p_values = [0, 1, 2, 4, 6, 8, 10]
d_values = range(0, 3)
q_values = range(0, 3)
evaluate_models(series.values, p_values, d_values, q_values)

ARIMA(0, 0, 0) MSE=52425.268
ARIMA(0, 0, 1) MSE=38145.217
ARIMA(0, 0, 2) MSE=23989.597
ARIMA(0, 1, 0) MSE=18003.173
ARIMA(0, 1, 1) MSE=9558.206
ARIMA(0, 2, 0) MSE=67339.808
ARIMA(0, 2, 1) MSE=18322.029
ARIMA(1, 0, 0) MSE=23112.933
ARIMA(1, 1, 0) MSE=7121.365
ARIMA(1, 1, 1) MSE=7003.687
ARIMA(1, 2, 0) MSE=18608.045
ARIMA(2, 0, 0) MSE=10176.546
ARIMA(2, 1, 0) MSE=5689.929
ARIMA(2, 1, 1) MSE=7759.710
ARIMA(2, 2, 0) MSE=9860.932




ARIMA(4, 1, 0) MSE=6649.593
ARIMA(4, 1, 1) MSE=6796.293
ARIMA(4, 2, 0) MSE=7596.331
ARIMA(4, 2, 1) MSE=4694.878




ARIMA(6, 1, 0) MSE=6810.072




ARIMA(6, 2, 0) MSE=6261.110




ARIMA(8, 1, 0) MSE=6579.235




Best ARIMA(4, 2, 1) MSE=4694.878


In [19]:
#as a side note, the parse function from dateutil can also be looked at
from dateutil.parser import parse
a = '1901-Jan'
b = parse(a)
# NOTE(review): the input carries no day, yet the recorded output shows day 05;
# presumably parse() fills missing fields from the current date - confirm
# before relying on it for month-only labels.
print(b)

1901-01-05 00:00:00


In [64]:
#test code: check the year-prefix rule used by parser() on a sample label
x = "1-Jan"
y = "10-Jan"
test = int(y.split('-')[0])
print(test)
# single-digit offsets get the "190" prefix, everything else just "19"
print(datetime.strptime(("190" if test < 10 else "19") + str(y), "%Y-%b"))

10
1910-01-01 00:00:00
