# XGBoost with Auto-Tuning for Demand Prediction
This notebook uses XGBoost with `RandomizedSearchCV` for automatic hyperparameter tuning.
The goal is to predict `Units Sold` using pricing-related features and contextual variables.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
from scipy.stats import uniform, randint


In [None]:
# Load dataset
# Rows with a missing value in ANY column are discarded, not only rows with
# gaps in the columns used as features below.
df = pd.read_csv("retail_store_inventory.csv")
df.dropna(inplace=True)

# Feature engineering
# Rescale Discount by 1/100 (presumably stored as a percentage in the CSV -- TODO confirm).
df['Discount'] = df['Discount'] / 100.0

# Categorical columns to one-hot encode. Declared once so the encoding step and
# the feature selection below cannot drift apart.
categorical_cols = ['Region', 'Seasonality', 'Weather Condition']
# drop_first=True removes the first level of each column (k-1 indicators for k levels).
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Define features
numeric_features = ['Price', 'Discount', 'Competitor Pricing', 'Holiday/Promotion', 'Inventory Level']
# get_dummies names every indicator "<column>_<level>", so select them by prefix.
# A prefix test (instead of a substring test) cannot accidentally pull in an
# unrelated column that merely contains e.g. "Region_" somewhere in its name.
dummy_prefixes = tuple(f"{col}_" for col in categorical_cols)
features = numeric_features + [col for col in df.columns if col.startswith(dummy_prefixes)]

X = df[features]
y = df['Units Sold']

# Split into training and test sets
# 20% of the rows are held out for testing; random_state pins the shuffle so the
# split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Search space for the randomized hyperparameter search.
# scipy conventions: randint(low, high) draws integers in [low, high), and
# uniform(loc, scale) draws floats in [loc, loc + scale].
param_dist = dict(
    n_estimators=randint(100, 300),       # integers 100..299
    max_depth=randint(3, 8),              # integers 3..7
    learning_rate=uniform(0.01, 0.1),     # floats in [0.01, 0.11]
    subsample=uniform(0.7, 0.3),          # floats in [0.7, 1.0]
    colsample_bytree=uniform(0.7, 0.3),   # floats in [0.7, 1.0]
)

# Base estimator; the search clones it for every sampled configuration.
xgb_model = XGBRegressor(random_state=42, objective='reg:squarederror')

# Draw 10 candidate configurations and score each with 3-fold cross-validated
# R-squared, running the fits in parallel on all available cores.
search_settings = dict(
    n_iter=10,
    scoring='r2',
    cv=3,
    verbose=1,
    n_jobs=-1,
    random_state=42,
)
random_search = RandomizedSearchCV(xgb_model, param_dist, **search_settings)

# Run the search on the training split only (can take time on large datasets).
random_search.fit(X_train, y_train)

# Report the winning configuration and keep the refitted estimator for evaluation.
best_model = random_search.best_estimator_
print("Best Parameters:", random_search.best_params_)
print("Best CV R-squared:", random_search.best_score_)


In [None]:
# Score the tuned model on the held-out test split.
y_pred = best_model.predict(X_test)

# RMSE is the square root of the mean squared error, so it is expressed in the
# same units as the target.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("Test R-squared:", r2)
print("Test RMSE:", rmse)
