See: https://towardsdatascience.com/time-series-forecasting-with-pycaret-regression-module-237b703a0c63

# Experimenting with PyCaret

In [1]:
from setup import start_spark, extract_data

# Start a Spark session only for as long as the extraction needs it.
spark = start_spark(sesh_name='Jupyter Sktime Modelling')
try:
    # extract_data is a project helper; later cells use `train` and `val` as
    # date-indexed frames (they call `.head()` and `.index.month` on them).
    train, val = extract_data(spark)
finally:
    # Always release the session, even when extraction raises, so a failed
    # run does not leave a Spark session alive behind the kernel.
    spark.stop()

In [10]:
# Preview the first five rows of the training frame.
train.head(n=5)

Unnamed: 0_level_0,total_rides,total_takings
pickup_date,Unnamed: 1_level_1,Unnamed: 2_level_1
2013-08-01,307272.0,4567414.0
2013-08-02,236761.0,3534616.0
2013-08-03,226554.0,3182267.0
2013-08-04,202310.0,3073609.0
2013-08-05,406487.0,6657713.0


# PyCaret Training

With PyCaret we don't have proper time-series fitting for univariate data; we only have automated regression techniques on tabular data.
We would need something like tsfresh to generate suitable features from the series.

In [2]:
# import the regression module
from pycaret.regression import *

In [16]:
def _prepare_features(frame):
    """Return a modelling-ready copy of ``frame``.

    The result has ``total_rides`` cast to float, a ``month`` column taken
    from the frame's index, and no ``total_takings`` column. The input frame
    is not modified, so re-running the cell gives the same result.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame with a ``total_rides`` column and an index exposing ``.month``
        (presumably a DatetimeIndex of pickup dates -- confirm against
        ``extract_data``).

    Returns
    -------
    pandas.DataFrame
        New frame with the columns described above.
    """
    return (
        frame
        .assign(
            total_rides=frame['total_rides'].astype('float'),
            month=frame.index.month,
        )
        # errors='ignore' keeps this safe when the column was already removed.
        .drop(columns='total_takings', errors='ignore')
    )


# Apply the same preparation to both splits so they cannot drift apart.
train = _prepare_features(train)
val = _prepare_features(val)

In [17]:
# Configure the PyCaret regression experiment: fit on `train`, hold out `val`,
# and predict `total_rides`.
s = setup(
    data=train,
    test_data=val,
    target='total_rides',
    fold_strategy='timeseries',  # PyCaret's time-series fold strategy
    numeric_features=[],         # no column is explicitly declared numeric
    fold=3,
    transform_target=True,
    session_id=123,              # fixed id so reruns are comparable
)

Unnamed: 0,Description,Value
0,session_id,123
1,Target,total_rides
2,Original Data,"(730, 2)"
3,Missing Values,False
4,Numeric Features,0
5,Categorical Features,1
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(730, 12)"


In [18]:
# Evaluate the candidate regressors and keep the one ranked first by MAPE.
best = compare_models(sort='MAPE')

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
omp,Orthogonal Matching Pursuit,44830.9814,3509219941.249,59085.3053,-0.1768,0.1326,0.0999,0.32
lasso,Lasso Regression,45864.8346,3615132501.3333,59897.7227,-0.213,0.1338,0.1012,0.4133
en,Elastic Net,45864.8346,3615132501.3333,59897.7227,-0.213,0.1338,0.1012,0.2967
llar,Lasso Least Angle Regression,45864.8343,3615132778.632,59897.7206,-0.213,0.1338,0.1012,0.2933
ada,AdaBoost Regressor,45628.013,3577809396.2708,59725.2326,-0.2165,0.1334,0.1014,0.01
rf,Random Forest Regressor,46582.1807,3616513221.8696,60106.0301,-0.2366,0.1334,0.1025,0.0633
gbr,Gradient Boosting Regressor,46906.8847,3652724879.6589,60423.0187,-0.2512,0.134,0.1029,0.0133
lar,Least Angle Regression,47043.0027,3667179374.1527,60543.0431,-0.2566,0.1342,0.1032,0.2967
dt,Decision Tree Regressor,47043.0771,3667189318.1463,60543.1254,-0.2566,0.1342,0.1032,0.29
et,Extra Trees Regressor,47167.7086,3681724856.4727,60665.9186,-0.2622,0.1345,0.1034,0.0533


In [19]:
# No `data` argument is passed, so PyCaret scores `best` on the hold-out set
# registered in setup() and displays the metrics table.
prediction_holdout = predict_model(estimator=best)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Orthogonal Matching Pursuit,30210.1635,1536954186.9585,39204.0073,-1.1412,0.0966,0.0783


In [21]:
# Score the validation frame explicitly; the returned frame is `val` plus the
# model's prediction column.
predictions = predict_model(estimator=best, data=val)

In [22]:
# The predictions are flat: `month` is the only feature the model sees, so
# every row sharing a month value receives the same predicted label.
predictions

Unnamed: 0_level_0,total_rides,month,Label
pickup_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2015-08-01,457430.0,8,437093.282777
2015-08-02,405241.0,8,437093.282777
2015-08-03,390578.0,8,437093.282777
2015-08-04,418718.0,8,437093.282777
2015-08-05,435400.0,8,437093.282777
2015-08-06,437779.0,8,437093.282777
2015-08-07,440706.0,8,437093.282777
2015-08-08,428376.0,8,437093.282777
2015-08-09,371924.0,8,437093.282777
2015-08-10,359234.0,8,437093.282777
