-
Notifications
You must be signed in to change notification settings - Fork 108
/
auto_ts.py
487 lines (460 loc) · 26.3 KB
/
auto_ts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
import warnings
from typing import Dict, Optional
from datetime import datetime
import copy
import pdb
from collections import defaultdict
import operator
import time
# Tabular Data
import pandas as pd # type: ignore
import numpy as np # type: ignore
# Modeling
from sklearn.exceptions import DataConversionWarning # type: ignore
# Silence warnings for the whole process at import time. The blanket "ignore"
# filter already covers every warning category; the DataConversionWarning filter
# is kept to record the original motivation (noisy scikit-learn warnings).
warnings.filterwarnings("ignore")
warnings.filterwarnings(action='ignore', category=DataConversionWarning)
def warn(*args, **kwargs):
    """Accept any arguments and do nothing.

    Installed below as a replacement for ``warnings.warn`` so that every
    warning raised through that entry point after this module is imported
    is silently discarded.
    """
    return None


# Monkey-patch the standard library: from here on ``warnings.warn`` is a no-op.
warnings.warn = warn
############################################################
import seaborn as sns # type: ignore
import matplotlib.pyplot as plt # type: ignore
# Select the seaborn "white" style with color codes enabled when this module is imported.
sns.set(style="white", color_codes=True)
#######################################
# Models
from .models import build_arima_model, build_sarimax_model, build_var_model, \
build_pyflux_model, build_prophet_model, run_ensemble_model
#from .models import build_ml, build_prophet, build_pyflux
# Utils
from .utils import colorful, load_ts_data, convert_timeseries_dataframe_to_supervised, \
time_series_plot, print_static_rmse, print_dynamic_rmse
#from .utils import colors, eda, etl, metrics, val
class AutoTimeseries:
    """Build several time series models and keep track of the best one.

    Auto Time Series, version 0.0.19.
    Conceived and developed by Ram Seshadri. All rights reserved.

    Auto_Timeseries automates many of the tasks involved in building time
    series models and therefore assumes many defaults, all of which can be
    changed through the constructor. Depending on ``model_type`` it builds
    PyFlux, ARIMA, SARIMAX and VAR models ("stats"), a Facebook Prophet
    model ("prophet"), a machine learning ensemble ("ml"), or all of them
    ("best"), scores each one and records the results in ``ml_dict``.
    """

    # Values of ``model_type`` that ``fit`` knows how to handle.
    _VALID_MODEL_TYPES = ('stats', 'ml', 'prophet', 'best')

    # Shown when ``load_ts_data`` signals (by returning a string) that the
    # time series column could not be parsed as dates.
    _BAD_TS_COLUMN_MSG = (
        "Time Series column %s could not be converted to a Pandas date time column.\n"
        "Please convert your input into a date-time column and try again"
    )

    # (canonical name, seasonal period, accepted spellings) for every interval.
    _INTERVAL_ALIASES = (
        ('months', 12, ('months', 'month', 'm')),
        ('days', 30, ('days', 'daily', 'd')),
        ('weeks', 52, ('weeks', 'weekly', 'w')),
        ('qtr', 4, ('qtr', 'quarter', 'q')),
        ('years', 1, ('years', 'year', 'annual', 'y', 'a')),
        ('hours', 24, ('hours', 'hourly', 'h')),
        ('minutes', 60, ('minutes', 'minute', 'min', 'n')),
        ('seconds', 60, ('seconds', 'second', 'sec', 's')),
    )

    def __init__(self, score_type: str = 'rmse',
                 forecast_period: int = 5, time_interval='', non_seasonal_pdq=None,
                 seasonality: bool = False, seasonal_period: int = 12, seasonal_PDQ=None,
                 conf_int: float = 0.95, model_type: str = "stats", verbose: int = 0):
        """Store the modelling options; no data is touched until ``fit``.

        Args:
            score_type: Metric used to rank models. With 'rmse' (the default)
                the RMSE returned by each model builder is used; any other
                value selects the normalized RMSE (RMSE divided by the
                standard deviation of the actuals).
            forecast_period: Number of periods to forecast. Default is 5.
            time_interval: Spacing of the observations. The default '' makes
                ``fit`` infer it from the first two timestamps. Accepted
                spellings (case-insensitive) are 'months'/'month'/'m',
                'days'/'daily'/'d', 'weeks'/'weekly'/'w',
                'qtr'/'quarter'/'q', 'years'/'year'/'annual'/'y'/'a',
                'hours'/'hourly'/'h', 'minutes'/'minute'/'min'/'n' and
                'seconds'/'second'/'sec'/'s'. Anything else is treated as
                'months'.
            non_seasonal_pdq: Optional tuple ``(p, d, q)`` of maximum orders
                for the non seasonal search, e.g. ``(0, 0, 3)``. Only tuples
                are honoured; otherwise ``(3, 1, 3)`` is used.
            seasonality: Passed to the SARIMAX builder.
            seasonal_period: Length of the seasonal cycle. Note that ``fit``
                overwrites it from the time interval: months=12, days=30,
                weeks=52, qtr=4, years=1, hours=24, minutes=60, seconds=60.
            seasonal_PDQ: Optional tuple ``(P, D, Q)``, e.g. ``(2, 1, 2)``.
                Stored on the instance but not currently used by ``fit``.
            conf_int: Confidence level handed to the Prophet model builder.
            model_type: One of 'stats' (PyFlux, ARIMA, SARIMAX, VAR),
                'prophet' (requires FB Prophet), 'ml' (machine learning
                ensemble; needs explanatory variables) or 'best' (all of the
                above, which can take a while on large data sets).
            verbose: Verbosity level forwarded to the model builders.
        """
        self.ml_dict: Dict = {}
        self.score_type: str = score_type
        self.forecast_period = forecast_period
        self.time_interval = time_interval
        self.non_seasonal_pdq = non_seasonal_pdq
        self.seasonality = seasonality
        self.seasonal_period = seasonal_period
        self.seasonal_PDQ = seasonal_PDQ
        self.conf_int = conf_int
        self.model_type = model_type
        self.verbose = verbose

    def fit(self, traindata, ts_column, target, sep=','):
        """Load the data, build the requested models and report the best one.

        Args:
            traindata: Path of a data file, or a pandas DataFrame.
            ts_column: Name of the date-time column, or its integer position.
            target: Name of the column to predict (or a list, of which the
                first entry is used). When the loaded data has a single
                column, that column becomes the target.
            sep: Field separator used when ``traindata`` is a file path.

        Returns:
            ``self`` when at least the requested models were attempted;
            ``None`` when the data could not be loaded or the time interval
            could not be inferred; the (empty) ``ml_dict`` when
            ``model_type`` is not one of the supported values.

        Note:
            With ``model_type='best'`` the VAR step shifts the predictor
            columns of the working frame in place, so the ML step that runs
            afterwards sees the shifted data.
        """
        # Criterion used by the statsmodels based builders to pick orders.
        stats_scoring = 'aic'
        lag = self.forecast_period - 1
        if isinstance(self.non_seasonal_pdq, tuple):
            p_max = self.non_seasonal_pdq[0]
            d_max = self.non_seasonal_pdq[1]
            q_max = self.non_seasonal_pdq[2]
        else:
            p_max, d_max, q_max = 3, 1, 3

        ts_df = self._load_data(traindata, ts_column, target, sep)
        if ts_df is None:
            return None

        df_orig = copy.deepcopy(ts_df)
        if ts_df.shape[1] == 1:
            # A single column can only be the target.
            target = list(ts_df)[0]
        if not isinstance(ts_column, str):
            # A positional ts_column is translated into the column's name.
            ts_column = list(ts_df)[ts_column]
        if isinstance(target, list):
            target = target[0]
            print(' Taking the first column in target list as Target variable = %s' % target)
        else:
            print(' Target variable = %s' % target)
        preds = [x for x in list(ts_df) if x not in [ts_column, target]]

        if ts_df.index.dtype == 'int' or ts_df.index.dtype == 'float':
            # A numeric index means the dates still live in a regular column.
            ts_df = ts_df.set_index(ts_column)

        if self.time_interval == '':
            inferred = self._infer_time_interval(ts_df.index)
            if inferred is None:
                return None
            self.time_interval = inferred
        else:
            print('Time Interval is given as %s' % self.time_interval)
        self._normalize_time_interval()

        # Results are stored as ml_dict[model name][field].
        self.ml_dict = self._nested_dict()

        if isinstance(self.model_type, str):
            model_type = self.model_type.lower()
        else:
            print('Check if your model type is a string or one of the available types of models')
            model_type = None
        if model_type not in self._VALID_MODEL_TYPES:
            print('The model_type should be either stats, prophet, ml or best. Check your input and try again...')
            return self.ml_dict
        if model_type == 'best':
            print(colorful.BOLD + 'WARNING: Running best models will take time... Be Patient...' + colorful.END)

        if model_type in ('prophet', 'best'):
            self._fit_prophet(ts_df, ts_column, target)
        if model_type in ('stats', 'best'):
            self._fit_stats_models(ts_df, df_orig, target, preds,
                                   p_max, d_max, q_max, stats_scoring)
        if model_type in ('ml', 'best'):
            self._fit_ml_model(ts_df, df_orig, target, preds, lag)

        best_model_name = self.get_best_model_name()
        print(colorful.BOLD + '\nBest Model is:' + colorful.END)
        print(' %s' % best_model_name)
        print(' Best Model Forecasts: %s' % self.ml_dict[best_model_name]['forecast'])
        print(' Best Model Score: %0.2f' % self.ml_dict[best_model_name][self.score_type])
        return self

    @staticmethod
    def _nested_dict():
        """Return a dictionary whose missing keys create nested dictionaries."""
        return defaultdict(AutoTimeseries._nested_dict)

    def _select_score(self, rmse, norm_rmse):
        """Return the value that matches ``score_type`` (RMSE or normalized RMSE)."""
        return rmse if self.score_type == 'rmse' else norm_rmse

    def _load_data(self, traindata, ts_column, target, sep):
        """Load ``traindata`` through ``load_ts_data``; return ``None`` on failure."""
        if isinstance(traindata, str):
            if traindata == '':
                print('File name is an empty string. Please check your input and try again')
                return None
            try:
                ts_df = load_ts_data(traindata, ts_column, sep, target)
                if isinstance(ts_df, str):
                    print(self._BAD_TS_COLUMN_MSG % ts_column)
                    return None
                print(' File loaded successfully. Shape of data set = %s' % (ts_df.shape,))
                return ts_df
            except Exception:
                # Best effort: any loading problem is reported, not raised.
                print('File could not be loaded. Check the path or filename and try again')
                return None
        if isinstance(traindata, pd.DataFrame):
            print('Input is data frame. Performing Time Series Analysis')
            ts_df = load_ts_data(traindata, ts_column, sep, target)
            if isinstance(ts_df, str):
                print(self._BAD_TS_COLUMN_MSG % ts_column)
                return None
            print(' Dataframe loaded successfully. Shape of data set = %s' % (ts_df.shape,))
            return ts_df
        print('Input must be a file path or a pandas DataFrame. Please check your input and try again')
        return None

    @staticmethod
    def _infer_time_interval(ts_index) -> Optional[str]:
        """Guess the time interval from the gap between the first two timestamps.

        Returns the canonical interval name, or ``None`` when the gap is a
        whole number of days that matches no known calendar interval (or
        when fewer than two timestamps are available).
        """
        stamps = pd.to_datetime(ts_index)
        if len(stamps) < 2:
            print('Time Series time delta is unknown')
            return None
        step = (stamps[1] - stamps[0]).to_pytimedelta()
        # abs() so that a descending index is classified like an ascending one.
        total_secs = abs(step.days * 24 * 3600 + step.seconds)

        if total_secs >= 86400 and total_secs % 86400 == 0:
            diff_in_days = total_secs // 86400
            print('Time series input in days = %s' % diff_in_days)
            if diff_in_days == 7:
                print('it is a Weekly time series.')
                return 'weeks'
            if diff_in_days == 1:
                print('it is a Daily time series.')
                return 'days'
            if 28 <= diff_in_days < 89:
                print('it is a Monthly time series.')
                return 'months'
            if 89 <= diff_in_days < 178:
                print('it is a Quarterly time series.')
                return 'qtr'
            if 178 <= diff_in_days < 360:
                # Semi annual data is modelled with the quarterly settings.
                print('it is a Semi Annual time series.')
                return 'qtr'
            if diff_in_days >= 360:
                print('it is an Annual time series.')
                return 'years'
            print('Time Series time delta is unknown')
            return None

        diff_in_hours = total_secs / 3600
        if diff_in_hours >= 1:
            print('it is an Hourly time series.')
            return 'hours'
        print('Time series input in Minutes or Seconds = %s' % diff_in_hours)
        if 0 < total_secs < 60:
            print('it is a Seconds time series.')
            return 'seconds'
        # Also reached for a zero gap (duplicate leading timestamps), which
        # keeps the classification this case has always received.
        print('it is a Minute time series.')
        return 'minutes'

    def _normalize_time_interval(self) -> None:
        """Map ``time_interval`` to its canonical name and set ``seasonal_period``.

        Unrecognised values fall back to monthly data with a period of 12.
        """
        given = self.time_interval.strip().lower()
        for canonical, period, aliases in self._INTERVAL_ALIASES:
            if given in aliases:
                self.time_interval = canonical
                self.seasonal_period = period
                return
        self.time_interval = 'months'
        self.seasonal_period = 12

    def _fit_prophet(self, ts_df, ts_column, target) -> None:
        """Build the Facebook Prophet model; errors are deliberately not caught."""
        name = 'FB_Prophet'
        print(colorful.BOLD + '\nRunning Facebook Prophet Model...' + colorful.END)
        model, forecast_df, rmse, norm_rmse = build_prophet_model(
            ts_df, ts_column, target, self.forecast_period, self.time_interval,
            self.score_type, self.verbose, self.conf_int)
        self.ml_dict[name]['model'] = model
        self.ml_dict[name]['forecast'] = forecast_df['yhat'].values
        self.ml_dict[name][self.score_type] = self._select_score(rmse, norm_rmse)

    def _fit_stats_models(self, ts_df, df_orig, target, preds,
                          p_max, d_max, q_max, stats_scoring) -> None:
        """Build the PyFlux, ARIMA, SARIMAX and VAR models in that order.

        A model that fails (other than SARIMAX, whose errors propagate) is
        recorded with an infinite score so it can never be picked as best.
        """
        nsims = 100  # only needed for the M-H models in PyFlux

        name = 'PyFlux'
        print(colorful.BOLD + '\nRunning PyFlux Model...' + colorful.END)
        rmse = norm_rmse = np.inf
        try:
            model, forecast, rmse, norm_rmse = build_pyflux_model(
                ts_df, target, p_max, q_max, d_max, self.forecast_period,
                'MLE', nsims, self.score_type, self.verbose)
            self.ml_dict[name]['model'] = model
            self.ml_dict[name]['forecast'] = forecast
            if isinstance(rmse, str):
                # The builder reports a missing PyFlux install with a string.
                print(' PyFlux not installed. Install PyFlux and run it again')
                rmse = norm_rmse = np.inf
        except Exception:
            print(' PyFlux model error: predictions not available.')
            rmse = norm_rmse = np.inf
        self.ml_dict[name][self.score_type] = self._select_score(rmse, norm_rmse)

        name = 'ARIMA'
        print(colorful.BOLD + '\nRunning Non Seasonal ARIMA Model...' + colorful.END)
        # Reset so a failure cannot reuse the scores of the previous model.
        rmse = norm_rmse = np.inf
        try:
            model, forecast, rmse, norm_rmse = build_arima_model(
                ts_df[target], stats_scoring, p_max, d_max, q_max,
                forecast_period=self.forecast_period, method='mle', verbose=self.verbose)
            self.ml_dict[name]['model'] = model
            self.ml_dict[name]['forecast'] = forecast
        except Exception:
            print(' ARIMA model error: predictions not available.')
            rmse = norm_rmse = np.inf
        self.ml_dict[name][self.score_type] = self._select_score(rmse, norm_rmse)

        name = 'SARIMAX'
        print(colorful.BOLD + '\nRunning Seasonal SARIMAX Model...' + colorful.END)
        model, forecast, rmse, norm_rmse = build_sarimax_model(
            ts_df[target], stats_scoring, self.seasonality,
            self.seasonal_period, p_max, d_max, q_max,
            self.forecast_period, self.verbose)
        self.ml_dict[name]['model'] = model
        self.ml_dict[name]['forecast'] = forecast
        self.ml_dict[name][self.score_type] = self._select_score(rmse, norm_rmse)

        name = 'VAR'
        rmse = norm_rmse = np.inf
        if len(preds) == 0:
            print(colorful.BOLD + '\nNo VAR model created since no explanatory variables given in data set' + colorful.END)
        else:
            try:
                if df_orig.shape[1] > 1:
                    preds = [x for x in list(df_orig) if x not in [target]]
                    print(colorful.BOLD + '\nRunning VAR Model...' + colorful.END)
                    print(' Shifting %d predictors by 1 to align prior predictor values with current target values...'
                          % len(preds))
                    # In-place on purpose: later steps see the shifted frame.
                    ts_df[preds] = ts_df[preds].shift(1)
                    ts_df.dropna(axis=0, inplace=True)
                    model, forecast, rmse, norm_rmse = build_var_model(
                        ts_df[[target] + preds], stats_scoring,
                        self.forecast_period, p_max, q_max)
                    self.ml_dict[name]['model'] = model
                    self.ml_dict[name]['forecast'] = forecast
                else:
                    print(colorful.BOLD + '\nNo predictors available. Skipping VAR model...' + colorful.END)
            except Exception:
                print(' VAR model error: predictions not available.')
                rmse = norm_rmse = np.inf
        self.ml_dict[name][self.score_type] = self._select_score(rmse, norm_rmse)

    def _fit_ml_model(self, ts_df, df_orig, target, preds, lag) -> None:
        """Build the machine learning ensemble on lagged predictors and plot it."""
        name = 'ML'
        # Stay infinite unless the model is actually scored below.
        rmse = norm_rmse = np.inf
        if len(preds) == 0:
            print('No ML model since number of predictors is zero')
        else:
            try:
                if df_orig.shape[1] > 1:
                    preds = [x for x in list(ts_df) if x not in [target]]
                    print(colorful.BOLD + '\nRunning Machine Learning Models...' + colorful.END)
                    print(' Shifting %d predictors by lag=%d to align prior predictor with current target...'
                          % (len(preds), lag))
                    dfxs, target, preds = convert_timeseries_dataframe_to_supervised(
                        ts_df[preds + [target]],
                        preds + [target], target, n_in=lag, n_out=0, dropT=False)
                    # The last forecast_period rows are held out for scoring.
                    train = dfxs[:-self.forecast_period]
                    test = dfxs[-self.forecast_period:]
                    best = run_ensemble_model(train[preds], train[target], 'TimeSeries',
                                              self.score_type, self.verbose)
                    bestmodel = best[0]
                    self.ml_dict[name]['model'] = bestmodel
                    self.ml_dict[name]['forecast'] = bestmodel.fit(
                        train[preds], train[target]).predict(test[preds])
                    rmse, norm_rmse = print_dynamic_rmse(test[target].values,
                                                         bestmodel.predict(test[preds]),
                                                         train[target].values)
                    # Actual versus predicted over the train and test rows.
                    plt.figure(figsize=(5, 5))
                    plt.scatter(pd.concat([train, test])[target].values,
                                np.r_[bestmodel.predict(train[preds]), bestmodel.predict(test[preds])])
                    plt.xlabel('Actual')
                    plt.ylabel('Predicted')
                    plt.show(block=False)
                    time_series_plot(dfxs[target], chart_time=self.time_interval)
                else:
                    print(colorful.BOLD + '\nNo predictors available. Skipping Machine Learning model...' + colorful.END)
            except Exception:
                # Scores computed before the failure (e.g. while plotting) are kept.
                print(' For ML model, evaluation score is not available.')
        self.ml_dict[name][self.score_type] = self._select_score(rmse, norm_rmse)

    def get_best_model_name(self) -> str:
        """Return the name of the model with the lowest ``score_type`` value.

        Raises:
            ValueError: If no model results are available yet.
        """
        if not self.ml_dict:
            raise ValueError('No models are available. Call fit() with a valid model_type first.')
        scores = {name: results[self.score_type] for name, results in self.ml_dict.items()}
        return min(scores.items(), key=operator.itemgetter(1))[0]

    def get_best_model(self):
        """Return the model object stored for the best scoring model."""
        return self.ml_dict[self.get_best_model_name()].get('model')

    def get_ml_dict(self):
        """Return the dictionary holding every model's results."""
        return self.ml_dict

    def predict(self, model: str = 'best') -> Optional[np.ndarray]:
        """Placeholder for prediction; prints a notice and returns ``None``."""
        print("This function has not been implemented yet. But the idea would be that this would make the prediction using the best model or the model type passed as an argument.")
        return None