In [1]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

def rmse(y_true, y_pred):
    """
    Compute Root Mean Square Error between two arrays.

    Parameters:
    y_true (array): The array of actual values
    y_pred (array): The array of predicted values

    Returns:
    float: The square root of the mean squared error
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

def rmspe(y_true, y_pred):
    """
    Compute Root Mean Square Percentage Error between two arrays.

    Each error is taken relative to the actual value:
    sqrt(mean(((y_true - y_pred) / y_true) ** 2)) * 100

    Parameters:
    y_true (array): The array of actual values; must not contain zeros
    y_pred (array): The array of predicted values

    Returns:
    float: The RMSPE value, expressed as a percentage

    Raises:
    ValueError: If any actual value is zero, since the percentage error
        is undefined for that element
    """
    # Work on plain float arrays so lists, Series and ndarrays behave alike
    # and elements are paired by position.
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    # The denominator is the actual value, so that is what must be nonzero;
    # a zero prediction is a perfectly valid input.
    if np.any(actual == 0):
        raise ValueError("Actual values contain zero, which would lead to division by zero in RMSPE calculation.")

    # Squared relative errors
    squared_relative_errors = ((actual - predicted) / actual) ** 2

    # Root of the mean, multiplied by 100 to convert it into a percentage
    return np.sqrt(np.mean(squared_relative_errors)) * 100


In [None]:
# Load the processed Rossmann sales table from disk
data = pd.read_csv('../data/processed/rossmann_sales_df.csv')

# Keep only the rows whose 'Open' flag equals 1
open_mask = data['Open'] == 1
data = data[open_mask]

In [None]:
# Order the rows by date so that the hold-out set is the trailing slice.
# Reassigning (rather than sorting in place) keeps this cell idempotent and
# avoids mutating the filtered frame produced by the previous cell.
# NOTE(review): assumes 'Date' sorts chronologically as stored (e.g. ISO-formatted
# strings or datetimes) — confirm.
data = data.sort_values(by='Date')

# Define the split point (80% of the rows for training)
split_index = int(len(data) * 0.8)

# Split positionally into training (first 80%) and testing (last 20%) sets.
# NOTE(review): if several rows share the boundary date, that date may appear
# on both sides of the split — confirm this is acceptable.
train_data = data.iloc[:split_index]
test_data = data.iloc[split_index:]

# Columns that are not model inputs: the sort key, 'Customers', and the target
non_feature_columns = ['Date', 'Customers', 'Sales']

# Separate features (X) and target variable (y) for both sets
X_train = train_data.drop(columns=non_feature_columns)
y_train = train_data['Sales']
X_test = test_data.drop(columns=non_feature_columns)
y_test = test_data['Sales']


In [None]:
import optuna

def objective(trial, valid_fraction=0.2):
    """
    Optuna objective: fit an XGBoost regressor and score it on a validation slice.

    The candidate model is fitted on the leading rows of the training data and
    scored on its trailing ``valid_fraction`` of rows (the training data is
    sorted by date before being split, so these are the latest training rows).
    The test set is deliberately not touched here: selecting hyperparameters on
    it would make the final test metrics optimistically biased.

    Parameters:
    trial (optuna.trial.Trial): Trial object used to sample hyperparameters
    valid_fraction (float): Share of the training rows, taken from the end,
        that is held out for validation (default 0.2)

    Returns:
    float: RMSE on the validation slice (lower is better)
    """
    # Define the hyperparameters to optimize
    n_estimators = trial.suggest_int('n_estimators', 100, 1000, step=10)
    max_depth = trial.suggest_int('max_depth', 3, 10)
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3, log=True)
    min_child_weight = trial.suggest_int('min_child_weight', 1, 10)
    gamma = trial.suggest_float('gamma', 0, 1)
    subsample = trial.suggest_float('subsample', 0.5, 1)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.5, 1)
    reg_alpha = trial.suggest_float('reg_alpha', 0, 1)
    reg_lambda = trial.suggest_float('reg_lambda', 0, 1)

    # Create the XGBoost model with the suggested hyperparameters
    model = XGBRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        min_child_weight=min_child_weight,
        gamma=gamma,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        reg_alpha=reg_alpha,
        reg_lambda=reg_lambda,
        objective='reg:squarederror',
        random_state=42
    )

    # Hold out the trailing rows of the training data for validation
    # (at least one row, so the score is always defined)
    n_valid = max(1, int(len(X_train) * valid_fraction))
    cut = len(X_train) - n_valid
    X_fit, y_fit = X_train.iloc[:cut], y_train.iloc[:cut]
    X_valid, y_valid = X_train.iloc[cut:], y_train.iloc[cut:]

    # Train on the leading part only
    model.fit(X_fit, y_fit)

    # Score on the held-out validation slice, never on the test set
    y_pred = model.predict(X_valid)
    return rmse(y_valid, y_pred)

# Create a study that minimizes the value returned by the objective.
# TPE is Optuna's default sampler; seeding it makes the sequence of suggested
# hyperparameters repeatable after a kernel restart.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)

# Run the optimization
study.optimize(objective, n_trials=10)

# Get the best hyperparameters
best_params = study.best_params

# Print the best hyperparameters
print(f"Best parameters: {best_params}")




In [None]:
# Refit on the training set using the hyperparameters found by the study,
# with the same fixed objective and seed as in the search
best_model = XGBRegressor(
    objective='reg:squarederror',
    random_state=42,
    **best_params,
)
best_model.fit(X_train, y_train)


In [None]:
# Predict sales for the held-out test rows with the refitted model
y_pred_best = best_model.predict(X_test)

# Report each metric as "<label>: <value>", in a fixed order
metrics = {
    "RMSE": rmse,
    "RMSPE": rmspe,
    "MAE": mean_absolute_error,
    "R2 Score": r2_score,
}
for label, metric in metrics.items():
    print(f"{label}: {metric(y_test, y_pred_best)}")