# predict.py
import pandas as pd
import math
import datetime
import statsmodels.api as sm
import statistics
from functools import partial
import code
from scipy.optimize import brute
import json
# The brute-force ARIMA order search below is adapted from this answer:
# http://stackoverflow.com/questions/22770352/auto-arima-equivalent-for-python
def objective_function(data, order):
    """Return the AIC of an ARIMA model of the given order fitted to ``data``.

    This is the cost function minimised by the grid search in
    ``brute_search``.

    Args:
        data: The series to fit; handed to ``sm.tsa.ARIMA`` unchanged.
        order: The model order; handed to ``sm.tsa.ARIMA`` unchanged.

    Returns:
        The ``aic`` attribute of the fitted model.
    """
    fitted = sm.tsa.ARIMA(data, order).fit(disp=0)
    return fitted.aic
def brute_search(data):
    """Search for an ARIMA order for ``data`` by minimising AIC over a grid.

    The search proceeds in three stages:

    1. Grid-search every order from 1 up to (but excluding) the current upper
       bound on each axis. If any fit raises, shrink one bound and retry.
    2. If the bounds are exhausted, try the single order ``(1, 1, 1)``.
    3. If that fails too, fit ``(1, 0, 0)`` and ``(0, 0, 1)`` and return
       whichever has the lower AIC.

    Args:
        data: The series to model; passed through to ``objective_function``
            and ``sm.tsa.ARIMA``.

    Returns:
        The value returned by ``scipy.optimize.brute`` when a grid search
        succeeds, the tuple ``(1, 0, 0)`` or ``(0, 0, 1)`` when only the
        final fallback succeeds, or ``None`` when every attempt fails.
    """
    obj_func = partial(objective_function, data)

    # Back in graduate school Professor LeCun said in class that ARIMA models
    # typically only need a max parameter of 5, so it is doubled just in case.
    # The slice stop is exclusive, so the largest value actually tried is 9.
    upper_bound_AR = 10
    upper_bound_I = 10
    upper_bound_MA = 10

    # Stop as soon as any bound goes negative: the grid would be empty and a
    # further call to brute() could not succeed.
    while min(upper_bound_AR, upper_bound_I, upper_bound_MA) >= 0:
        grid = (
            slice(1, upper_bound_AR, 1),
            slice(1, upper_bound_I, 1),
            slice(1, upper_bound_MA, 1),
        )
        try:
            return brute(obj_func, grid, finish=None)
        except Exception as e:
            # Best-effort: guess which axis caused the failure from the error
            # text (case-sensitive substring match) and tighten that bound.
            # Anything that mentions neither "MA" nor "AR" shrinks the I bound.
            error_string = str(e)
            if "MA" in error_string:
                upper_bound_MA -= 1
            elif "AR" in error_string:
                upper_bound_AR -= 1
            else:
                upper_bound_I -= 1

    # No workable set of upper bounds was found; fall back to the single
    # order (1, 1, 1), which is expected to work in most cases.
    try:
        grid = (
            slice(1, 2, 1),
            slice(1, 2, 1),
            slice(1, 2, 1),
        )
        return brute(obj_func, grid, finish=None)
    except Exception:
        # The invertibility conditions are not always met, so fall through to
        # the explicit single-process comparison below.
        pass

    # Explicitly test whether a single AR or a single MA process fits better;
    # the one with the lower (better) AIC score wins.
    try:
        model_ar_one = sm.tsa.ARIMA(data, (1, 0, 0)).fit(disp=0)
        model_ma_one = sm.tsa.ARIMA(data, (0, 0, 1)).fit(disp=0)
    except Exception:
        return None
    if model_ar_one.aic < model_ma_one.aic:
        return (1, 0, 0)
    return (0, 0, 1)
def dict_to_dataframe(data):
    """Convert a ``{year: cpi}`` mapping into a date-indexed DataFrame.

    The frame is built in one step rather than appending one row per key:
    row-wise ``DataFrame.append`` is quadratic and is no longer available in
    current pandas releases.

    Args:
        data: Mapping whose keys are years (anything ``int()`` accepts, e.g.
            ``"1999"`` or ``1999``) and whose values are the CPI readings.

    Returns:
        A DataFrame with a single float column ``"cpi"`` and a datetime index
        named ``"year"`` (January 1st of each year), sorted ascending.
    """
    dates = [datetime.date(year=int(key), month=1, day=1) for key in data]
    index = pd.DatetimeIndex(pd.to_datetime(dates), name="year")
    # Store the readings as floats so that interpolation and model fitting
    # downstream always see a numeric series.
    cpi = pd.Series(list(data.values()), index=index, name="cpi", dtype=float)
    return cpi.to_frame().sort_index()
def predict(df, steps):
    """Fit an ARIMA model to a yearly series and forecast ``steps`` ahead.

    Years missing between the first and last index entries are inserted as
    NaN (dated January 1st) and filled with ``Series.interpolate()`` before
    fitting. The model order is chosen by ``brute_search``.

    Args:
        df: Single-column DataFrame with a datetime index sorted ascending.
        steps: Number of periods to forecast.

    Returns:
        A ``(forecast, end)`` tuple: the first element of the value returned
        by the fitted model's ``forecast(steps=steps)`` call, and the year of
        the last index entry of ``df``.

    Raises:
        ValueError: If ``brute_search`` cannot find any usable model order.
    """
    start = df.index[0].year
    end = df.index[-1].year
    years_captured = {idx.year for idx in df.index}
    missing = [
        datetime.datetime(year=year, month=1, day=1)
        for year in range(start, end + 1)
        if year not in years_captured
    ]

    # Squeeze only the column axis so a single-row frame still yields a
    # Series rather than collapsing to a scalar.
    s = df.squeeze(axis=1)
    if missing:
        # Series.set_value is gone from current pandas; add the gap years by
        # concatenating a NaN-filled Series instead.
        gaps = pd.Series(math.nan, index=pd.DatetimeIndex(missing), name=s.name)
        s = pd.concat([s, gaps])
    s = s.sort_index()
    s = s.interpolate()
    data = s.to_frame()

    model_order = brute_search(data)
    if model_order is None:
        raise ValueError("could not find an ARIMA order that fits the data")
    # brute_search may return a float array; ARIMA is given plain ints.
    model_order = tuple(int(elem) for elem in model_order)
    model = sm.tsa.ARIMA(data, model_order).fit(disp=0)
    # NOTE(review): indexing with [0] assumes forecast() returns a sequence
    # whose first element holds the point forecasts - confirm against the
    # installed statsmodels version.
    return model.forecast(steps=steps)[0], end
def main(data, steps):
    """Extend a ``{year: cpi}`` mapping with ``steps`` forecast years.

    Args:
        data: Mapping of year to CPI reading, as accepted by
            ``dict_to_dataframe``.
        steps: Number of years to forecast beyond the last observed year.

    Returns:
        A dict keyed by integer year holding every observed value plus one
        forecast value for each of the ``steps`` years that follow the last
        observed year.
    """
    df = dict_to_dataframe(data)
    new_results, last_year = predict(df, steps)

    observed = df.to_dict()["cpi"]
    result = {stamp.year: value for stamp, value in observed.items()}

    # Forecasts belong to the years AFTER the last observed one. Starting at
    # last_year itself would overwrite the final observation with the first
    # forecast and drop the last forecast year.
    for offset, value in enumerate(new_results[:steps], start=1):
        result[last_year + offset] = float(value)
    return result
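# References consulted for adding an item to a pandas Series and for
# appending a labelled row to a DataFrame:
# http://stackoverflow.com/questions/13331518/how-to-add-a-single-item-to-a-pandas-series
# http://stackoverflow.com/questions/16824607/pandas-appending-a-row-to-a-dataframe-and-specify-its-index-label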