In [1]:
import numpy as np
import pandas as pd
%matplotlib inline

In [2]:
# Read the raw wine sales table from the project's assets folder.
wine = pd.read_csv(filepath_or_buffer='assets/wines.csv')

In [3]:
# Preview the first five rows of the freshly loaded table.
wine.head(n=5)

Unnamed: 0,Fortified,Drywhite,Sweetwhite,Red,Rose,Sparkling,date
0,2585,1954,85,464,112.0,1686,Jan 1980
1,3368,2302,89,675,118.0,1591,Feb 1980
2,3210,3054,109,703,129.0,2304,Mar 1980
3,3111,2414,95,887,99.0,1712,Apr 1980
4,3756,2226,91,1139,116.0,1471,May 1980


In [4]:
# Promote the 'date' column to the index and parse its labels (e.g. 'Jan 1980')
# into timestamps. The frame is reassigned rather than mutated in place, and
# the step is guarded on the column still being present, so re-running this
# cell on a live kernel is a no-op instead of a KeyError.
if 'date' in wine.columns:
    wine = wine.set_index('date')
wine.index = pd.to_datetime(wine.index)

In [5]:
from src.tde import MultivariateTDE

In [6]:
# Build the time-delay embedding used as a supervised forecasting table:
# predicting the next 3 steps of 'Red' (horizon=3 -> Red+1, Red+2, Red+3)
# using the last 2 observations of every series (k=2 -> '<name>-1', '<name>-0'),
# as the column names in the preview below show.
wine_tde = MultivariateTDE(wine, horizon=3, k=2, target_col='Red')
wine_tde.head()

Unnamed: 0_level_0,Fortified-1,Fortified-0,Drywhite-1,Drywhite-0,Sweetwhite-1,Sweetwhite-0,Red-1,Red-0,Rose-1,Rose-0,Sparkling-1,Sparkling-0,Red+1,Red+2,Red+3
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1980-03-01,2585.0,3368.0,1954.0,2302.0,85.0,89.0,464.0,675.0,112.0,118.0,1686.0,1591.0,703,887.0,1139.0
1980-04-01,3368.0,3210.0,2302.0,3054.0,89.0,109.0,675.0,703.0,118.0,129.0,1591.0,2304.0,887,1139.0,1077.0
1980-05-01,3210.0,3111.0,3054.0,2414.0,109.0,95.0,703.0,887.0,129.0,99.0,2304.0,1712.0,1139,1077.0,1318.0
1980-06-01,3111.0,3756.0,2414.0,2226.0,95.0,91.0,887.0,1139.0,99.0,116.0,1712.0,1471.0,1077,1318.0,1260.0
1980-07-01,3756.0,4216.0,2226.0,2725.0,91.0,95.0,1139.0,1077.0,116.0,168.0,1471.0,1377.0,1318,1260.0,1120.0


In [7]:
# Split the embedding into explanatory lags (X) and future targets (Y).
# Target columns are the ones whose name contains a '+' (Red+1, Red+2, ...).
# The '+' is matched as a literal substring (regex=False), which avoids
# writing an escaped regex pattern inside a plain string literal -- an
# invalid escape sequence that recent Python versions warn about.
is_future = wine_tde.columns.str.contains('+', regex=False)
X = wine_tde.loc[:, ~is_future]
Y = wine_tde.loc[:, is_future]

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
# Hold out the final 30% of the rows for testing; shuffle=False keeps the
# original row order, so the test set is the most recent part of the series.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    shuffle=False,
)

In [10]:
# Preview the lagged features at the start of the training window.
X_train.head(n=5)

Unnamed: 0_level_0,Fortified-1,Fortified-0,Drywhite-1,Drywhite-0,Sweetwhite-1,Sweetwhite-0,Red-1,Red-0,Rose-1,Rose-0,Sparkling-1,Sparkling-0
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1980-03-01,2585.0,3368.0,1954.0,2302.0,85.0,89.0,464.0,675.0,112.0,118.0,1686.0,1591.0
1980-04-01,3368.0,3210.0,2302.0,3054.0,89.0,109.0,675.0,703.0,118.0,129.0,1591.0,2304.0
1980-05-01,3210.0,3111.0,3054.0,2414.0,109.0,95.0,703.0,887.0,129.0,99.0,2304.0,1712.0
1980-06-01,3111.0,3756.0,2414.0,2226.0,95.0,91.0,887.0,1139.0,99.0,116.0,1712.0,1471.0
1980-07-01,3756.0,4216.0,2226.0,2725.0,91.0,95.0,1139.0,1077.0,116.0,168.0,1471.0,1377.0


In [11]:
# Preview the matching multi-step targets for the same training rows.
Y_train.head(n=5)

Unnamed: 0_level_0,Red+1,Red+2,Red+3
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1980-03-01,703,887.0,1139.0
1980-04-01,887,1139.0,1077.0
1980-05-01,1139,1077.0,1318.0
1980-06-01,1077,1318.0,1260.0
1980-07-01,1318,1260.0,1120.0


In [12]:
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor

In [13]:
# MultiOutputRegressor fits one separate random forest per target column
# (Red+1, Red+2, Red+3). Random forests are stochastic, so random_state is
# pinned to make the forecasts and the error figures computed from them
# repeatable across kernel restarts.
model = MultiOutputRegressor(RandomForestRegressor(random_state=42))
model.fit(X_train, Y_train)

MultiOutputRegressor(estimator=RandomForestRegressor())

In [14]:
# Forecast every horizon for the held-out rows; the result is a 2-D array
# with one row per test sample and one column per horizon.
forecasts = model.predict(X_test)
forecasts[:5, :]

array([[2029.27, 2162.51, 2068.73],
       [2169.58, 2183.65, 2051.57],
       [2008.6 , 2188.03, 1789.79],
       [2150.04, 1302.78, 1540.48],
       [1115.99, 1237.82, 1527.23]])

In [15]:
from sklearn.metrics import mean_absolute_error

In [16]:
# Overall MAE: with the default settings the per-horizon errors are averaged
# uniformly into a single number.
mean_absolute_error(y_true=Y_test, y_pred=forecasts)

411.8068518518519

In [17]:
# MAE broken down by forecast horizon (one value per target column).
(Y_test - forecasts).abs().mean(axis=0)

Red+1    366.432778
Red+2    416.482222
Red+3    452.505556
dtype: float64

In [18]:
# Sanity check: averaging the per-horizon MAEs reproduces the overall MAE
# reported by mean_absolute_error above.
(Y_test - forecasts).abs().mean(axis=0).mean()

411.80685185185183