# Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error
from catboost import CatBoostRegressor
import math


# Processing functions

In [2]:
def label_feature_split(df, column):
    """Separate a frame into model inputs and the target vector.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the target column alongside the feature columns.
    column : str
        Name of the target column.

    Returns
    -------
    tuple
        ``(feature, label)`` where ``feature`` is ``df`` without ``column``
        and ``label`` is a flat 1-D NumPy array holding that column's values.
        ``df`` itself is not modified.
    """
    # Everything except the target column is treated as a feature.
    features = df.drop(columns=[column])
    # Select as a one-column frame, then flatten the (n, 1) array to (n,).
    target = df[[column]].to_numpy().ravel()
    return features, target

def split_dataset(df, test_size=0.2):
    """Split a frame into a leading train part and a trailing test part.

    The split is positional (no shuffling): the last
    ``round(len(df) * test_size)`` rows form the test set and everything
    before them forms the train set.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to split; row order is preserved.
    test_size : float, default 0.2
        Fraction of rows to place in the test set.

    Returns
    -------
    tuple
        ``(train, test)`` slices of ``df``.
    """
    n_rows = len(df)
    # Clamp so that out-of-range fractions cannot produce a nonsensical slice.
    split_size = min(max(round(n_rows * test_size), 0), n_rows)
    # Slice at an explicit position rather than with a negative offset:
    # when split_size is 0, ``df[0:-0]`` is empty and ``df[-0:]`` is the whole
    # frame, which would silently swap the roles of train and test.
    split_at = n_rows - split_size
    train, test = df[:split_at], df[split_at:]
    return train, test

def load_data(filename):
    """Read a tab-separated file and prepare its timestamp columns.

    The ``DateTime`` column is parsed to datetimes and rounded to the nearest
    15 minutes, after which the calendar columns produced by
    ``add_date_data`` are appended.

    Parameters
    ----------
    filename : str
        Path of the tab-separated file; it must contain a ``DateTime`` column.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with the rounded ``DateTime`` and the added
        calendar columns.
    """
    frame = pd.read_csv(filename, sep='\t')
    parsed = pd.to_datetime(frame['DateTime'])
    # Snap every timestamp onto the 15-minute grid.
    frame['DateTime'] = parsed.dt.round('15min')
    return add_date_data(frame)

def get_diff(df, column):
    """Append the first difference of ``column`` as ``'<column>_diff'``.

    The new column is written onto ``df`` itself (in place) and the same
    frame is returned. The first row of the new column is NaN because it has
    no predecessor.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend.
    column : str
        Name of the column to difference.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the additional ``'<column>_diff'`` column.
    """
    diff_name = f'{column}_diff'
    source = df[column]
    df[diff_name] = source.diff()
    return df

def add_date_data(df):
    """Add calendar columns derived from the ``DateTime`` column.

    The columns ``day_of_week``, ``month_name``, ``day_of_month``,
    ``hour_of_day`` and ``minute_of_hour`` are written onto ``df`` in place,
    in that order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime-typed ``DateTime`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now carrying the five extra columns.
    """
    stamps = df['DateTime'].dt
    # Insertion order of this mapping is the order the columns are appended.
    derived = {
        'day_of_week': stamps.day_name(),
        'month_name': stamps.month_name(),
        'day_of_month': stamps.day,
        'hour_of_day': stamps.hour,
        'minute_of_hour': stamps.minute,
    }
    for name, values in derived.items():
        df[name] = values
    return df
    

# Heuristics

In [3]:
# Number of lagged difference columns (lag_1 .. lag_<lag>) generated as features.
lag=12
# Alternative dataset (disabled): filename="data/sales_data.csv"
filename="data/traffic_data.csv"

# Alternative target for the sales dataset (disabled): target_column = "Sales"
target_column ="Traffic"
# Categorical features passed to the model; these columns are generated during the loading step.
categorical_features=['day_of_week', 'month_name']


In [4]:
# Read the tab-separated file, round DateTime to 15 minutes and add the calendar columns.
df = load_data(filename)
# Append '<target_column>_diff', the first difference of the target series.
df = get_diff(df,target_column)


In [5]:
def generate_supervised(df, lag, column):
    """Build a supervised-learning frame by adding lagged difference columns.

    For every step ``i`` in ``1..lag`` a column ``lag_<i>`` is added that holds
    ``'<column>_diff'`` shifted down by ``i`` rows. Rows containing any NaN
    (including those introduced by the shifts) are dropped and the index is
    renumbered from 0. The input frame is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that already contains a ``'<column>_diff'`` column.
    lag : int
        Number of lag columns to create.
    column : str
        Base name of the differenced column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the lag columns appended after the existing ones.
    """
    diff_series = df[f'{column}_diff']
    lagged = {'lag_' + str(step): diff_series.shift(step) for step in range(1, lag + 1)}
    # assign() works on a copy, so the caller's frame is not modified.
    with_lags = df.assign(**lagged)
    return with_lags.dropna().reset_index(drop=True)
# Build the lagged feature table from the differenced frame.
supervised_df = generate_supervised(df, lag, target_column)
# NOTE(review): this displays the whole frame; supervised_df.head() would keep the output small.
supervised_df

Unnamed: 0,DateTime,Traffic,day_of_week,month_name,day_of_month,hour_of_day,minute_of_hour,Traffic_diff,lag_1,lag_2,lag_3,lag_4,lag_5,lag_6,lag_7,lag_8,lag_9,lag_10,lag_11,lag_12
0,2018-04-15 13:15:00,7,Sunday,April,15,13,15,-4.0,1.0,1.0,2.0,0.0,1.0,-3.0,7.0,-2.0,1.0,-4.0,3.0,2.0
1,2018-04-15 13:30:00,9,Sunday,April,15,13,30,2.0,-4.0,1.0,1.0,2.0,0.0,1.0,-3.0,7.0,-2.0,1.0,-4.0,3.0
2,2018-04-15 13:45:00,8,Sunday,April,15,13,45,-1.0,2.0,-4.0,1.0,1.0,2.0,0.0,1.0,-3.0,7.0,-2.0,1.0,-4.0
3,2018-04-15 14:00:00,8,Sunday,April,15,14,0,0.0,-1.0,2.0,-4.0,1.0,1.0,2.0,0.0,1.0,-3.0,7.0,-2.0,1.0
4,2018-04-15 14:15:00,18,Sunday,April,15,14,15,10.0,0.0,-1.0,2.0,-4.0,1.0,1.0,2.0,0.0,1.0,-3.0,7.0,-2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57577,2021-07-31 20:15:00,7,Saturday,July,31,20,15,0.0,4.0,-3.0,1.0,1.0,0.0,-2.0,1.0,2.0,-2.0,1.0,-1.0,4.0
57578,2021-07-31 20:30:00,5,Saturday,July,31,20,30,-2.0,0.0,4.0,-3.0,1.0,1.0,0.0,-2.0,1.0,2.0,-2.0,1.0,-1.0
57579,2021-07-31 20:45:00,3,Saturday,July,31,20,45,-2.0,-2.0,0.0,4.0,-3.0,1.0,1.0,0.0,-2.0,1.0,2.0,-2.0,1.0
57580,2021-07-31 21:00:00,2,Saturday,July,31,21,0,-1.0,-2.0,-2.0,0.0,4.0,-3.0,1.0,1.0,0.0,-2.0,1.0,2.0,-2.0


In [6]:
# Positional hold-out: with split_dataset's default test_size=0.2 the last 20% of rows form the test set.
train_df, test_df = split_dataset(supervised_df)
# label_feature_split removes only the target column, so the features keep every
# other column, including the current-row '<target_column>_diff'.
# NOTE(review): that diff is computed from the target itself - confirm this is intended (possible leakage).
X_train, y_train=label_feature_split(train_df, target_column)
X_test, y_test=label_feature_split(test_df, target_column)


# Machine learning models and metrics

In [7]:
def catboost_model(X_train, y_train, X_test, categorical_features):
    """Fit a CatBoost regressor and predict on the test features.

    Parameters
    ----------
    X_train, y_train
        Training features and targets passed to ``fit``.
    X_test
        Features to predict on.
    categorical_features : list
        Passed to the regressor as ``cat_features``.

    Returns
    -------
    The output of ``predict(X_test)`` from the fitted regressor.
    """
    # verbose=False silences CatBoost's training log.
    regressor = CatBoostRegressor(cat_features=categorical_features, verbose=False)
    regressor.fit(X_train, y_train)
    predictions = regressor.predict(X_test)
    return predictions

def dummy_model(y_train, X_test):
    """Baseline predictor that always outputs the training-target mean.

    Parameters
    ----------
    y_train
        Training targets; must provide a ``mean()`` method.
    X_test
        Test features; only their length is used.

    Returns
    -------
    numpy.ndarray
        A float64 array of length ``len(X_test)`` filled with
        ``y_train.mean()``.
    """
    baseline = y_train.mean()
    # Explicit float64 matches the dtype that np.empty() produces by default.
    return np.full(len(X_test), baseline, dtype=np.float64)

def metrics(y_pred, y_test):
    """Print MSE, RMSE and MAE of predictions against the true values.

    Parameters
    ----------
    y_pred
        Predicted values.
    y_test
        Ground-truth values, same length as ``y_pred``.

    Returns
    -------
    None
        The three scores are printed, each rounded to two decimals.
    """
    # scikit-learn documents the argument order as (y_true, y_pred).
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    print(f'- The mean square error is {round(mse, 2)}')
    print(f'- The root mean square error is {round(math.sqrt(mse), 2)}')
    # Label fixed: it previously read "The absolute is error is".
    print(f'- The mean absolute error is {round(mae, 2)}')

In [None]:
# Fit CatBoost on the training split and report its errors on the held-out rows.
predictions = catboost_model(X_train, y_train, X_test, categorical_features)
metrics(predictions, y_test)

In [None]:
# Baseline for comparison: predict the training-set mean for every test row.
predictions = dummy_model(y_train, X_test)
metrics(predictions, y_test)