# Google Colab Link
[Google Colab](https://colab.research.google.com/drive/13Dy-bKmne6Rp89Xdj6Ju0PqLZBaJfqvg?usp=sharing)

# References
- https://pycaret.gitbook.io/docs/learn-pycaret/official-blog/time-series-forecasting-with-pycaret-regression
- https://pycaret.gitbook.io/docs/learn-pycaret/official-blog/time-series-101-for-beginners
- https://pycaret.readthedocs.io/en/latest/api/time_series.html
- https://pycaret.readthedocs.io/en/latest/api/time_series.html#pycaret.time_series.compare_models
- https://pycaret.readthedocs.io/en/stable/api/regression.html


# Install Libraries

In [None]:
!pip install pycaret[full]



# Import Libraries

In [None]:
import pandas as pd
import plotly.express as px
import numpy as np
from sklearn.model_selection import train_test_split

# Data Understanding

## Download and Load Data

In [None]:
!wget https://github.com/hilmizr/world_fertilizer_price/raw/master/01-09-24-modified_fertilizer_datav5.xlsx

--2024-09-01 01:55:00--  https://github.com/hilmizr/world_fertilizer_price/raw/master/01-09-24-modified_fertilizer_datav5.xlsx
Resolving github.com (github.com)... 140.82.116.4
Connecting to github.com (github.com)|140.82.116.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/hilmizr/world_fertilizer_price/master/01-09-24-modified_fertilizer_datav5.xlsx [following]
--2024-09-01 01:55:00--  https://raw.githubusercontent.com/hilmizr/world_fertilizer_price/master/01-09-24-modified_fertilizer_datav5.xlsx
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 42737 (42K) [application/octet-stream]
Saving to: ‘01-09-24-modified_fertilizer_datav5.xlsx.2’


2024-09-01 01:55:00 (4.70 MB/s) - ‘01-09-24-modified_

In [None]:
# Column names used by the later cells: the timestamp column and the
# price column that is passed to the model as the target.
date_col, target_col = 'date', 'dap_price'

In [None]:
# Read the downloaded workbook, then convert the date column to datetimes.
workbook_path = '01-09-24-modified_fertilizer_datav5.xlsx'
data = pd.read_excel(workbook_path)
data = data.assign(**{date_col: pd.to_datetime(data[date_col])})

# Preview the first rows
data.head()

Unnamed: 0,date,dap_price,kcl_price,rock_price,tsp_price,urea_price
0,1993-11-01,140.4,112.5,31.5,112.5,82.8
1,1993-12-01,150.38,112.5,31.5,122.5,85.5
2,1994-01-01,150.0,112.5,31.5,125.0,85.5
3,1994-02-01,151.75,112.5,31.5,125.63,85.5
4,1994-03-01,155.88,112.5,31.5,127.5,98.63


In [None]:
# Rolling mean of the target over a 12-row window, stored next to the raw series.
ma_window = 12
data['MA12'] = data[target_col].rolling(window=ma_window).mean()

# Draw the raw price and its moving average on the same line chart.
series_to_plot = [target_col, 'MA12']
fig = px.line(data, x=date_col, y=series_to_plot, template='plotly_dark')
fig.show()

  v = v.dt.to_pydatetime()


# Data Preparation


## Datetime Feature Extraction

In [None]:
# Keep an independent snapshot of the loaded frame. A plain `temp = data`
# only creates a second name for the same object, so it protects nothing.
temp = data.copy()

# Derive the calendar features and a 1-based running index, then keep only the
# modeling columns. The explicit column selection also discards the plotting
# helper column (MA12) when it exists, so this cell no longer raises a KeyError
# if the moving-average cell was skipped. `assign` returns a new frame, so the
# loaded frame itself is not mutated.
data = data.assign(
    Month=data[date_col].dt.month,
    Year=data[date_col].dt.year,
    Series=np.arange(1, len(data) + 1),
).loc[:, ['Series', 'Year', 'Month', target_col]]

# check the head of the dataset
data.head()

Unnamed: 0,Series,Year,Month,dap_price
0,1,1993,11,140.4
1,2,1993,12,150.38
2,3,1994,1,150.0
3,4,1994,2,151.75
4,5,1994,3,155.88


## Train-Test Split

In [None]:
# Number of trailing rows held out as the test set; the future-prediction
# cell reuses it as the number of months to forecast.
n_test: int = 6

In [None]:
# Chronological hold-out: shuffling is off, so the final n_test rows become the test set.
split_options = {'test_size': n_test, 'shuffle': False}
train, test = train_test_split(data, **split_options)

# (rows, columns) of each partition
train.shape, test.shape

((360, 4), (6, 4))

# Modeling

## Initialize Setup

In [None]:
# Bring the PyCaret regression API into scope (setup, compare_models, predict_model, ...).
# NOTE(review): a star import hides where these names come from; consider
# explicit imports once the full list of functions used by the notebook is known.
from pycaret.regression import *

# Experiment configuration, gathered in one mapping before it is passed to setup().
setup_options = dict(
    data=train,
    test_data=test,
    target=target_col,
    fold_strategy='timeseries',
    numeric_features=['Year', 'Series'],
    fold=3,
    transform_target=True,
    data_split_shuffle=False,
    fold_shuffle=False,
    session_id=123,
)

# initialize setup
s = setup(**setup_options)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,dap_price
2,Target type,Regression
3,Original data shape,"(366, 4)"
4,Transformed data shape,"(366, 4)"
5,Transformed train set shape,"(360, 4)"
6,Transformed test set shape,"(6, 4)"
7,Numeric features,2
8,Preprocess,True
9,Imputation type,simple


## Train and Evaluate All Models

In [None]:
# Evaluate the candidate regressors and keep the one ranked first by MAPE.
ranking_metric = 'MAPE'
best = compare_models(sort=ranking_metric)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lightgbm,Light Gradient Boosting Machine,125.4524,40701.5096,189.3903,-0.3397,0.4515,0.285,0.11
xgboost,Extreme Gradient Boosting,176.5742,61772.7474,246.7766,-2.8636,0.657,0.372,0.0733
dummy,Dummy Regressor,196.0366,70547.3333,265.5197,-3.177,0.7144,0.4111,0.0633
lasso,Lasso Regression,200.0875,72756.7353,269.5813,-3.2171,0.7429,0.4223,0.0767
llar,Lasso Least Angle Regression,200.0875,72756.7353,269.5813,-3.2171,0.7429,0.4223,0.0533
en,Elastic Net,200.6612,73069.5456,270.142,-3.2227,0.7473,0.424,0.0467
ada,AdaBoost Regressor,215.0549,80164.8853,272.6249,-5.7022,0.5796,0.5157,0.0867
huber,Huber Regressor,223.3269,97437.6915,290.3575,-2.5477,0.625,0.5298,0.07
catboost,CatBoost Regressor,233.5664,109579.3147,314.3291,-9.213,0.626,0.5387,0.89
gbr,Gradient Boosting Regressor,254.1906,117943.0559,323.0457,-10.3388,0.6469,0.5907,0.1033


Processing:   0%|          | 0/85 [00:00<?, ?it/s]

## Predict Test Set

In [None]:
# No `data` argument is passed: this scores the selected model on the test rows
# supplied to setup().
prediction_holdout = predict_model(estimator=best)
prediction_holdout

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Light Gradient Boosting Machine,90.4438,8759.9761,93.5947,-9.724,0.1527,0.1596


Unnamed: 0,Series,Year,Month,dap_price,prediction_label
360,361,2023,11,535.630005,638.13863
361,362,2023,12,563.75,638.13863
362,363,2024,1,596.25,671.898464
363,364,2024,2,583.809998,671.898464
364,365,2024,3,617.5,682.264299
365,366,2024,4,545.0,682.264299


## Predict Entire Dataset

In [None]:
# Score every row of the modeling frame. The metrics printed here cover rows the
# model was trained on as well as the held-out rows.
predictions = predict_model(estimator=best, data=data)
predictions

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Light Gradient Boosting Machine,26.4877,3009.8721,54.8623,0.9165,0.0893,0.0577


Unnamed: 0,Series,Year,Month,dap_price,prediction_label
0,1,1993,11,140.399994,167.601036
1,2,1993,12,150.380005,167.601036
2,3,1994,1,150.000000,172.425246
3,4,1994,2,151.750000,172.425246
4,5,1994,3,155.880005,173.906369
...,...,...,...,...,...
361,362,2023,12,563.750000,638.138630
362,363,2024,1,596.250000,671.898464
363,364,2024,2,583.809998,671.898464
364,365,2024,3,617.500000,682.264299


## Compare and Visualize

In [None]:
# Rebuild a date for each row from its Year/Month features, with the day fixed to the 1st.
year_month_day = predictions[['Year', 'Month']].assign(DAY=1)
predictions['Date'] = pd.to_datetime(year_month_day)
predictions

Unnamed: 0,Series,Year,Month,dap_price,prediction_label,Date
0,1,1993,11,140.399994,167.601036,1993-11-01
1,2,1993,12,150.380005,167.601036,1993-12-01
2,3,1994,1,150.000000,172.425246,1994-01-01
3,4,1994,2,151.750000,172.425246,1994-02-01
4,5,1994,3,155.880005,173.906369,1994-03-01
...,...,...,...,...,...,...
361,362,2023,12,563.750000,638.138630,2023-12-01
362,363,2024,1,596.250000,671.898464,2024-01-01
363,364,2024,2,583.809998,671.898464,2024-02-01
364,365,2024,3,617.500000,682.264299,2024-03-01


In [None]:
# Name of the column holding the model output in the frames returned by predict_model.
pred_col: str = 'prediction_label'

In [None]:
# Keep only rows that have both an actual and a predicted value.
required_cols = [pred_col, target_col]
predictions = predictions.dropna(subset=required_cols)

# Bounds of the hold-out window: the last n_test dates.
date_axis = predictions['Date']
test_start_date = date_axis.iloc[-n_test]
test_end_date = date_axis.iloc[-1]

# Actual vs. predicted over time, with the hold-out window shaded.
fig = px.line(predictions, x='Date', y=[target_col, pred_col], template='plotly_dark')
fig.add_vrect(
    x0=test_start_date,
    x1=test_end_date,
    fillcolor="grey",
    opacity=0.25,
    line_width=0,
)

fig.show()

## Finalize Model

In [None]:
# Finalize the selected model; the future-prediction and save cells use this object.
final_best = finalize_model(estimator=best)

## Future Predictions

In [None]:
# First day of the last observed month, built directly from the Year/Month
# features instead of through a formatted string.
last_date = pd.Timestamp(
    year=int(data['Year'].iloc[-1]),
    month=int(data['Month'].iloc[-1]),
    day=1,
)

# Forecast horizon: the n_test month starts that follow the last observation.
future_dates = pd.date_range(start=last_date + pd.offsets.MonthBegin(1), periods=n_test, freq='MS')

# Feature frame for the future months. Series continues the running index of
# the observed data; the column order matches the original cell.
next_series = data['Series'].iloc[-1] + 1
future_df = pd.DataFrame({
    'Month': future_dates.month,
    'Year': future_dates.year,
    'Series': np.arange(next_series, next_series + len(future_dates)),
})

future_df.head()

Unnamed: 0,Month,Year,Series
0,5,2024,367
1,6,2024,368
2,7,2024,369
3,8,2024,370
4,9,2024,371


In [None]:
# Run the finalized model on the future feature frame; the result carries the
# features plus the prediction column.
predictions_future = predict_model(estimator=final_best, data=future_df)
predictions_future.head()

Unnamed: 0,Month,Year,Series,prediction_label
0,5,2024,367,580.575903
1,6,2024,368,580.575903
2,7,2024,369,580.575903
3,8,2024,370,597.57061
4,9,2024,371,599.820946


In [None]:
# Stack the observed rows on top of the forecast rows. Each frame fills only its
# own value column (actuals in target_col, forecasts in pred_col); the other is NaN.
concat_df = pd.concat([data, predictions_future], axis=0)

# Index every row by a date rebuilt from its own Year/Month features. A separately
# generated date_range only fits when the data has no gaps and its length happens
# to match; deriving the date per row cannot go out of step with the rows.
concat_df.index = pd.DatetimeIndex(
    pd.to_datetime(concat_df[['Year', 'Month']].assign(DAY=1))
)

# Plot the data including future predictions
fig = px.line(concat_df, x=concat_df.index, y=[target_col, pred_col], template='plotly_dark')
fig.show()


## Save Model


In [None]:
# Persist the finalized pipeline and model under the given name.
save_model(model=final_best, model_name='ts_regression_model')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('target_transformation',
                  TransformerWrapperWithInverse(transformer=TargetTransformer(estimator=PowerTransformer(standardize=False)))),
                 ('numerical_imputer',
                  TransformerWrapper(include=['Year', 'Series'],
                                     transformer=SimpleImputer())),
                 ('categorical_imputer',
                  TransformerWrapper(include=[],
                                     transformer=SimpleImputer(strategy='most_frequent'))),
                 ('actual_estimator',
                  LGBMRegressor(n_jobs=-1, random_state=123))]),
 'ts_regression_model.pkl')

# Ensemble Modeling

## Define Ensemble Model

In [None]:
# Repeat the model comparison, this time keeping the two top-ranked models
# (by MAPE) as candidates for blending.
n_blend_candidates = 2
best_models = compare_models(sort='MAPE', n_select=n_blend_candidates)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lightgbm,Light Gradient Boosting Machine,125.4524,40701.5096,189.3903,-0.3397,0.4515,0.285,0.1667
xgboost,Extreme Gradient Boosting,176.5742,61772.7474,246.7766,-2.8636,0.657,0.372,0.06
dummy,Dummy Regressor,196.0366,70547.3333,265.5197,-3.177,0.7144,0.4111,0.0667
lasso,Lasso Regression,200.0875,72756.7353,269.5813,-3.2171,0.7429,0.4223,0.0567
llar,Lasso Least Angle Regression,200.0875,72756.7353,269.5813,-3.2171,0.7429,0.4223,0.0433
en,Elastic Net,200.6612,73069.5456,270.142,-3.2227,0.7473,0.424,0.0533
ada,AdaBoost Regressor,215.0549,80164.8853,272.6249,-5.7022,0.5796,0.5157,0.0967
huber,Huber Regressor,223.3269,97437.6915,290.3575,-2.5477,0.625,0.5298,0.0567
catboost,CatBoost Regressor,233.5664,109579.3147,314.3291,-9.213,0.626,0.5387,1.05
gbr,Gradient Boosting Regressor,254.1906,117943.0559,323.0457,-10.3388,0.6469,0.5907,0.1067


Processing:   0%|          | 0/86 [00:00<?, ?it/s]

In [None]:
# Tune each of the selected models. The candidates were ranked by MAPE, so the
# hyperparameter search is told to optimize MAPE as well; otherwise tune_model
# falls back to its default metric and the two steps judge models differently.
tuned_models = [tune_model(candidate, optimize='MAPE') for candidate in best_models]

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,136.1644,70728.2803,265.9479,-0.2626,0.6434,0.2925
1,242.0097,66038.6362,256.9798,-7.8406,0.8359,0.5465
2,208.8677,74419.5442,272.7995,-1.4147,0.6603,0.3918
Mean,195.6806,70395.4869,265.2424,-3.1726,0.7132,0.4103
Std,44.2058,3429.5742,6.4776,3.3341,0.087,0.1045


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 3 folds for each of 10 candidates, totalling 30 fits


Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,148.5732,77105.2812,277.6784,-0.3764,0.7283,0.324
1,241.9764,66022.5391,256.9485,-7.8384,0.8357,0.5464
2,136.3782,34745.1953,186.4006,-0.1274,0.3672,0.2707
Mean,175.6426,59291.0052,240.3425,-2.7807,0.6437,0.3804
Std,47.1686,17936.5434,39.0702,3.5777,0.2004,0.1194


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 3 folds for each of 10 candidates, totalling 30 fits


In [None]:
# Combine the tuned models into a single blended estimator.
blender = blend_models(estimator_list=tuned_models)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,148.6625,77165.927,277.7876,-0.3775,0.7292,0.3242
1,182.0022,40594.2345,201.4801,-4.4343,0.5703,0.4
2,136.902,33884.2505,184.0768,-0.0995,0.3637,0.2778
Mean,155.8556,50548.1373,221.1148,-1.6371,0.5544,0.334
Std,19.1017,19019.9197,40.6986,1.9812,0.1496,0.0504


Processing:   0%|          | 0/6 [00:00<?, ?it/s]

## Predict Test Set

In [None]:
# No `data` argument is passed: this scores the blended model on the test rows
# supplied to setup().
prediction_holdout = predict_model(estimator=blender)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Voting Regressor,52.409,3364.8602,58.0074,-3.1193,0.0986,0.0936


## Predict Entire Dataset

In [None]:
# Score every row of the modeling frame with the blended model. The metrics
# printed here cover training rows as well as held-out rows.
predictions = predict_model(estimator=blender, data=data)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Voting Regressor,36.6818,6395.4167,79.9713,0.8226,0.1324,0.0823


## Compare and Visualize

In [None]:
# Column that holds the model output
pred_col = 'prediction_label'

# Rebuild a date for each row from its Year/Month features, with the day fixed to the 1st.
year_month_day = predictions[['Year', 'Month']].assign(DAY=1)
predictions['Date'] = pd.to_datetime(year_month_day)

# Keep only rows that have both an actual and a predicted value.
predictions = predictions.dropna(subset=[pred_col, target_col])
predictions

Unnamed: 0,Series,Year,Month,dap_price,prediction_label,Date
0,1,1993,11,140.399994,171.273340,1993-11-01
1,2,1993,12,150.380005,171.273340,1993-12-01
2,3,1994,1,150.000000,173.734228,1994-01-01
3,4,1994,2,151.750000,173.734228,1994-02-01
4,5,1994,3,155.880005,174.480321,1994-03-01
...,...,...,...,...,...,...
361,362,2023,12,563.750000,614.014475,2023-12-01
362,363,2024,1,596.250000,629.733925,2024-01-01
363,364,2024,2,583.809998,629.733925,2024-02-01
364,365,2024,3,617.500000,634.448746,2024-03-01


In [None]:
# Bounds of the hold-out window: the last n_test dates.
date_axis = predictions['Date']
test_start_date = date_axis.iloc[-n_test]
test_end_date = date_axis.iloc[-1]

# Actual vs. blended prediction over time, with the hold-out window shaded.
fig = px.line(predictions, x='Date', y=[target_col, pred_col], template='plotly_dark')
fig.add_vrect(
    x0=test_start_date,
    x1=test_end_date,
    fillcolor="grey",
    opacity=0.25,
    line_width=0,
)

fig.show()

## Finalize Ensemble Model

In [None]:
# Finalize the blended estimator. This rebinds final_best, which until now held
# the finalized single model.
final_best = finalize_model(estimator=blender)

## Future Predictions

In [None]:
# Feature frame for the future months (future_dates comes from the earlier
# future-prediction cell). Series continues the running index of the observed data.
first_future_step = data['Series'].iloc[-1] + 1
future_df = pd.DataFrame({
    'Month': [stamp.month for stamp in future_dates],
    'Year': [stamp.year for stamp in future_dates],
    'Series': np.arange(first_future_step, first_future_step + len(future_dates)),
})

# Forecast those months with the finalized blended model.
predictions_future = predict_model(estimator=final_best, data=future_df)
predictions_future

Unnamed: 0,Month,Year,Series,prediction_label
0,5,2024,367,586.178232
1,6,2024,368,586.178232
2,7,2024,369,586.178232
3,8,2024,370,594.702685
4,9,2024,371,595.819223
5,10,2024,372,595.819223


In [None]:
# Stack the observed rows on top of the forecast rows. Each frame fills only its
# own value column (actuals in target_col, forecasts in pred_col); the other is NaN.
concat_df = pd.concat([data, predictions_future], axis=0)

# Index every row by a date rebuilt from its own Year/Month features. A separately
# generated date_range only fits when the data has no gaps and its length happens
# to match; deriving the date per row cannot go out of step with the rows.
concat_df.index = pd.DatetimeIndex(
    pd.to_datetime(concat_df[['Year', 'Month']].assign(DAY=1))
)

# Plot the data including future predictions
fig = px.line(concat_df, x=concat_df.index, y=[target_col, pred_col], template='plotly_dark')
fig.show()

## Save Ensemble Model

In [None]:
# Persist the finalized blended pipeline and model under the given name.
save_model(model=final_best, model_name='ts_blended_regression_model')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('target_transformation',
                  TransformerWrapperWithInverse(transformer=TargetTransformer(estimator=PowerTransformer(standardize=False)))),
                 ('numerical_imputer',
                  TransformerWrapper(include=['Year', 'Series'],
                                     transformer=SimpleImputer())),
                 ('categorical_imputer',
                  TransformerWrapper(include=[],
                                     transformer=SimpleImpu...
                                                            grow_policy=None,
                                                            importance_type=None,
                                                            interaction_constraints=None,
                                                            learning_rate=0.15,
                                                            max_bin=None,
                                                            