Experiment in which only one input variable (`x3`) influences the sales; `x1` and `x2` are random and carry no signal

In [1]:
import pandas as pd
import numpy as np
from scipy.optimize import nnls
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import scipy.io as sio
import os
from nextDoorForecaster import nextDoorForecaster
from math import sqrt
from joblib import Parallel, delayed

In [2]:
# Synthetic data set: `sales` equals x3 plus uniform noise drawn from [0, 10);
# x1 and x2 are independent random integers that do not enter `sales` at all.
num_samples = 500

# Seed NumPy's global generator right before the first draw so that a fresh
# kernel (Restart & Run All) regenerates exactly the same data, and hence the
# same train/validation/test splits and error metrics downstream.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

x_3_values = np.random.randint(0,200, size=(num_samples))
noisy_sales = 10 * np.random.random(size=(num_samples)) + x_3_values

df = pd.DataFrame({
    "x1" : np.random.randint(0, 100, size=(num_samples)),
    "x2" : np.random.randint(0, 200, size=(num_samples)),
    "x3" : x_3_values,
    "sales": noisy_sales
})


# Column roles used by the later cells: three candidate predictors and the
# target column of `df`.
responseVar = 'sales'
inputVars = ['x1', 'x2', 'x3']

# NOTE(review): the scaler is instantiated here but not applied in any cell
# visible in this notebook excerpt -- confirm whether a later cell needs it.
min_max_scaler = preprocessing.MinMaxScaler()

In [3]:
# Quick sanity check of the generated frame: show the first five rows.
df.head(n=5)

Unnamed: 0,x1,x2,x3,sales
0,88,3,49,58.265808
1,69,18,0,3.600923
2,35,161,39,44.050364
3,68,12,115,120.963018
4,58,17,139,146.538535


In [4]:
# Partition the frame by row position into three consecutive, non-overlapping
# slices: rows 0-299 for training, rows 300-399 for validation, and rows 400
# onward for testing.
train_rows = df.iloc[:300]
val_rows = df.iloc[300:400]
test_rows = df.iloc[400:]

# Training set
X = train_rows[inputVars].values
Y = train_rows[responseVar].values

# Validation set
X_val = val_rows[inputVars].values
y_val = val_rows[responseVar].values

# Test set
X_test = test_rows[inputVars].values
y_test = test_rows[responseVar].values

In [5]:
# Fit an ensemble of `num_frcs` forecasters on the training split (with the
# validation split passed alongside) and predict the test inputs.
# NOTE(review): the original comment says this runs in parallel; any
# parallelism lives inside nextDoorForecaster.fit and is not visible here.
num_frcs = 100
d_predictions = nextDoorForecaster.fit(X, Y, X_val, y_val, X_test, num_frcs)

# Score the test-set predictions against the true sales values.
y_hat = d_predictions['predictions']
errors = nextDoorForecaster.get_frc_errors(y_test, y_hat)

summary = f'{num_frcs} forecasters with MSE {errors["MSE"]:.2f} and MAPE {errors["MAPE"]:.2f} and mError {errors["meanError"]:.2f}'
print(summary)

...prediction with 100 forecasters done in 1.42 sec!
100 forecasters with MSE 10.93 and MAPE 4.73 and mError -0.32


In [6]:
# Turn the per-feature values returned by the fit into a table with one row
# per input variable.
# NOTE(review): the second argument (100) is presumably the scale the vector
# is normalised to -- confirm against nextDoorForecaster.normalise_vector.
feature_weights = d_predictions['features']
nV = nextDoorForecaster.normalise_vector(feature_weights, 100)
var_importance = pd.DataFrame(data=nV, index=inputVars)
var_importance

Unnamed: 0,0
x1,0.02086
x2,0.0
x3,100.0
