In [6]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

def rmse(y_true, y_pred):
    """
    Compute the Root Mean Squared Error between two arrays.

    Parameters:
    y_true (array): The array of actual values
    y_pred (array): The array of predicted values

    Returns:
    float: The square root of mean_squared_error(y_true, y_pred)
    """
    mean_sq_err = mean_squared_error(y_true, y_pred)
    return np.sqrt(mean_sq_err)

def rmspe(y_true, y_pred):
    """
    Compute Root Mean Square Percentage Error between two arrays.

    Each error is measured relative to the actual value:
    sqrt(mean(((y_true - y_pred) / y_true) ** 2)) * 100

    Parameters:
    y_true (array): The array of actual values; must not contain zeros
    y_pred (array): The array of predicted values

    Returns:
    float: The RMSPE value, expressed as a percentage (10.0 means 10%)

    Raises:
    ValueError: If any actual value is zero, since the errors are divided by y_true
    """
    # Work on plain float arrays: lists are accepted, and two pandas Series
    # are compared position by position instead of being aligned by index.
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    # The denominator is the ACTUAL value, so that is what must be nonzero.
    # A zero prediction is perfectly valid (it is simply a 100% error).
    if np.any(actual == 0):
        raise ValueError("Actual values contain zero, which would lead to division by zero in RMSPE calculation.")

    # Squared relative error of every observation
    squared_pct_errors = ((actual - predicted) / actual) ** 2

    # Root of the mean, scaled by 100 to express it as a percentage
    return float(np.sqrt(np.mean(squared_pct_errors)) * 100)


In [7]:
# Read the processed sales CSV and keep only the rows where 'Open' equals 1
data = pd.read_csv('../data/processed/rossmann_sales_df.csv').loc[
    lambda frame: frame['Open'] == 1
]

In [8]:
# Sort the data by date. A new frame is bound instead of sorting the filtered
# frame in place, and a stable sort keeps rows that share a date in their
# existing order, so the rows falling on each side of the split point do not
# depend on how the sort algorithm happens to order ties.
data = data.sort_values(by='Date', kind='stable')

# Define the split point: the first 80% of the sorted rows are for training
split_index = int(len(data) * 0.8)

# Split the data positionally into training and testing sets
train_data = data.iloc[:split_index]
test_data = data.iloc[split_index:]

# Separate features (X) and target variable (y) for both sets.
# The same columns are removed from both feature frames: the target ('Sales'),
# 'Date', and 'Customers'.
non_feature_cols = ['Date', 'Customers', 'Sales']
X_train = train_data.drop(columns=non_feature_cols)
y_train = train_data['Sales']
X_test = test_data.drop(columns=non_feature_cols)
y_test = test_data['Sales']


In [9]:
# Convert object columns to categorical, sharing ONE category list per column.
# Converting train and test independently lets pandas infer the categories of
# each set separately, so the same value can end up with a different integer
# code in train and in test. The dtype learned from the training column is
# therefore reused for the test column.
for col in X_train.select_dtypes(include=['object']).columns:
    train_col = X_train[col].astype('category')
    X_train[col] = train_col
    # Test values that never occur in training become NaN (missing)
    X_test[col] = X_test[col].astype(train_col.dtype)

# Train the XGBoost model
model = XGBRegressor(enable_categorical=True)
model.fit(X_train, y_train)

In [10]:
# Predict on the test set, then keep only the positions whose actual sales are
# nonzero (rmspe divides by the actual values).
nonzero_sales = y_test.values != 0
y_pred = model.predict(X_test)[nonzero_sales]
y_test_filter = y_test[nonzero_sales]

# Score the filtered predictions
mse = mean_squared_error(y_test_filter, y_pred)
rmse_val = rmse(y_test_filter, y_pred)
mae = mean_absolute_error(y_test_filter, y_pred)
r2 = r2_score(y_test_filter, y_pred)
rmspe_val = rmspe(y_test_filter, y_pred)

# Report every metric, one per line
print(
    f'Mean Squared Error (MSE): {mse}',
    f'Root Mean Squared Error (RMSE): {rmse_val}',
    f'Mean Absolute Error (MAE): {mae}',
    f'R-squared (R2): {r2}',
    f'Root Mean Squared Percentage Error (RMSPE): {rmspe_val}',
    sep='\n',
)


Mean Squared Error (MSE): 1084198.3753876914
Root Mean Squared Error (RMSE): 1041.2484695727967
Mean Absolute Error (MAE): 715.4985709319209
R-squared (R2): 0.8852897882461548
Root Mean Squared Percentage Error (RMSPE): 16.507490381605695
