> **Copyright &copy; 2020 CertifAI Sdn. Bhd.**<br>
 **Copyright &copy; 2021 CertifAI Sdn. Bhd.**<br>
 <br>
This program and the accompanying materials are made available under the
terms of the [Apache License, Version 2.0](https://www.apache.org/licenses/LICENSE-2.0). \
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
License for the specific language governing permissions and limitations
under the License. <br>
<br>**SPDX-License-Identifier: Apache-2.0**

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

import random

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the series; the 'Time' column is parsed as dates and used as the index.
ads = pd.read_csv(
    '../../datasets/multivariate/ads.csv',
    index_col=['Time'],
    parse_dates=['Time'],
)

In [None]:
# Preview the first rows of the raw data.
ads.head()

In [None]:
# (number of rows, number of columns) of the raw data.
ads.shape

In [None]:
# Plot the raw series over time for a first visual inspection.
# The title names the data actually loaded (ads.csv, indexed by hourly
# timestamps) rather than a daily in-game currency series.
plt.figure(figsize=(16, 6))
plt.plot(ads)
plt.title('Ads (hourly data)')
plt.grid(True)
plt.show()

# Feature Engineering


## Date Features

Extract the following features from the timestamp index: `hour`, `weekday` and `is_weekend`.

In [None]:
# Work on a copy so the raw `ads` frame is left untouched.
ads_features = ads.copy()
# Rename the (single) data column to 'y', the forecasting target.
# Assigning a one-element list requires the frame to have exactly one column.
ads_features.columns = ['y']

In [None]:
# Calendar features read straight off the datetime index.
ads_features['hour'] = ads_features.index.hour
ads_features['weekday'] = ads_features.index.weekday

# pandas numbers weekdays Monday=0 ... Sunday=6, so codes >= 5 are the weekend.
# The flag is stored as 0/1 integers.
ads_features['is_weekend'] = (ads_features['weekday'] >= 5).astype('int64')

In [None]:
# Show the first 8 rows to check the new calendar columns.
ads_features.head(8)

## Target Encoding

In [None]:
# Mean of the target for each weekday value, then a quick look at its shape.
weekday_mean = ads_features['y'].groupby(ads_features['weekday']).mean()
plt.plot(weekday_mean)

In [None]:
# Mean of the target for each hour value, then a quick look at its shape.
hour_mean = ads_features['y'].groupby(ads_features['hour']).mean()
plt.plot(hour_mean)

In [None]:
# Target-encode the calendar features: each weekday / hour value is replaced
# by the mean of `y` observed for that value.
# Series.map accepts the lookup Series directly, so there is no need to
# rebuild a dict from it for every single row.
# NOTE(review): the means are computed over the whole series, including the
# rows that are later held out for testing, so these two features leak
# target information into the evaluation.
ads_features['weekday_mean'] = ads_features['weekday'].map(weekday_mean)
ads_features['hour_mean'] = ads_features['hour'].map(hour_mean)

In [None]:
# Preview the frame with the two target-encoded columns added.
ads_features.head()

## One-Hot Encoded Features

In [None]:
# One indicator column per distinct weekday value, named 'weekday_<value>'.
weekday_onehot = pd.get_dummies(ads_features['weekday'], prefix='weekday')
# Append the indicator columns to the right of the existing features.
ads_features = pd.concat((ads_features, weekday_onehot), axis='columns')

In [None]:
# Preview the frame with the one-hot weekday columns added.
ads_features.head()

## Lag Features

In [None]:
# Add the previous 1..23 observations of the target as lag_1 .. lag_23.
for lag in range(1, 24):
    ads_features[f"lag_{lag}"] = ads_features['y'].shift(lag)
ads_features.head(5)

In [None]:
# Discard every row that still has a missing value, e.g. the leading rows
# whose lag columns could not be filled by the shifts.
ads_features = ads_features.dropna(axis=0, how='any')
ads_features.head()

In [None]:
# Inspect the last rows of the feature frame.
ads_features.tail()

In [None]:
# Inspect the last 25 rows of the feature frame.
ads_features.tail(25)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

def timeseries_train_test_split(X, y, test_size):
    """Split X and y by position, keeping the order of the rows.

    The first ``int(len(X) * (1 - test_size))`` rows form the training set
    and the remaining rows form the test set; nothing is shuffled.

    Parameters
    ----------
    X, y : objects supporting ``.iloc`` slicing (e.g. DataFrame / Series)
    test_size : fraction of rows, taken from the end, reserved for testing

    Returns
    -------
    X_train, X_test, y_train, y_test
    """
    # position of the first test row
    split_at = int(len(X) * (1 - test_size))

    return (
        X.iloc[:split_at],
        X.iloc[split_at:],
        y.iloc[:split_at],
        y.iloc[split_at:],
    )

def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), in percent.

    Computed as ``mean(|y_true - y_pred| / |y_true|) * 100``.

    Both inputs are converted to float arrays first, so the values are
    compared position by position. Without this, two pandas Series would be
    subtracted by index label, which yields NaN when their indexes differ,
    and plain Python lists would not be accepted at all.

    Parameters
    ----------
    y_true : array-like of actual values (the denominator of the ratio)
    y_pred : array-like of predicted values, same length as ``y_true``

    Note: zeros in ``y_true`` lead to a division by zero (inf / nan result).
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [None]:
# data preparation

# One dataset per forecast horizon. 't' keeps the target as it is; 't+k'
# pairs each row's features with the target observed k rows later
# (the last k targets become NaN as a result of the shift).
datasets = {'t': ads_features}

for step in range(1, 24):
    datasets['t+' + str(step)] = ads_features.assign(
        y=ads_features['y'].shift(-step)
    )

In [None]:
# List the horizon labels that were created ('t', 't+1', ...).
datasets.keys()

In [None]:
# Train one linear regression per forecast horizon and report its MAPE on
# the held-out tail of the series.
models = []
scalers = []  # scaler fitted for each horizon, in the same order as `models`
errors = []

for horizon, dataset in datasets.items():
    # The shifted targets are NaN in the final rows; drop those rows.
    current_dataset = dataset.dropna()

    y = current_dataset['y']
    X = current_dataset.drop(['y'], axis=1)

    # reserve 20% of data for testing
    train_X, test_X, train_y, test_y = timeseries_train_test_split(X, y, test_size=0.2)

    # Fit the scaler on the training rows only, then apply it to both parts.
    scaler = StandardScaler().fit(train_X)
    train_X_scaled = scaler.transform(train_X)
    test_X_scaled = scaler.transform(test_X)

    lr = LinearRegression()
    lr.fit(train_X_scaled, train_y)

    predictions = lr.predict(test_X_scaled)

    # Pass the arguments by name: the error must be measured relative to the
    # actual values, not relative to the predictions.
    error = mean_absolute_percentage_error(y_true=test_y.values, y_pred=predictions)

    errors.append(error)
    models.append(lr)
    scalers.append(scaler)

    print('training ' + horizon + ', error: {}'.format(error))

print('average error {}'.format(np.mean(errors)))

In [None]:
# Feature row at the forecast origin, without the target column.
test_sample = ads_features.loc['2017-09-21 00:00:00'].drop('y')  # predict 2017-09-21

# NOTE(review): `scaler` is the object left over from the last iteration of
# the training loop, yet the scaled row is fed to every model; the other
# models were fitted with their own scaler instances. Confirm this is intended.
scaled_test_sample = scaler.transform(test_sample.to_numpy().reshape(1, -1))

scaled_test_sample

In [None]:
# One scalar prediction per trained model, in training order.
prediction_steps = [model.predict(scaled_test_sample)[0] for model in models]

In [None]:
# Spot-check a single model: models[2] is the third one trained, which
# corresponds to the 't+2' dataset given the insertion order of `datasets`.
models[2].predict(scaled_test_sample)

In [None]:
# Display the list of per-model predictions.
prediction_steps

In [None]:
# Overlay the multi-step forecast on the observed series.
# The forecast timestamps are the rows from the forecast origin onwards,
# truncated to the number of predictions so the Series lengths always match
# even if the data extends beyond the forecast window.
forecast_index = ads_features.loc['2017-09-21 00:00:00':].index[:len(prediction_steps)]

plt.figure(figsize=(16, 6))
plt.plot(ads_features['y'])
plt.plot(pd.Series(prediction_steps, index=forecast_index))
# Title names the data actually loaded (ads.csv) rather than in-game currency.
plt.title('Forecast of ads for 21-09-2017')
plt.grid(True)
plt.show()