In [7]:
# import of libraries 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import LabelEncoder
import numpy as np
import xgboost as xgb

In [2]:
# Read the sales CSV into a DataFrame.
# The path is relative, so it resolves against the kernel's current working directory.
file_path = '17072024_sales_data/Clean_Data.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

In [3]:
# Parse both date columns into pandas datetime values.
# NOTE(review): no explicit format= / dayfirst= is passed, so pandas infers the
# layout — confirm this matches how the dates are written in the CSV.
for date_column in ('Order_Date', 'Ship_Date'):
    data[date_column] = pd.to_datetime(data[date_column])

In [4]:
# Derive numeric calendar features from the parsed order and ship dates.
order_dt = data['Order_Date'].dt
ship_dt = data['Ship_Date'].dt

# Year / month / day-of-month for the order date, then for the ship date
data['Order_Year'] = order_dt.year
data['Order_Month'] = order_dt.month
data['Order_Day'] = order_dt.day
data['Ship_Year'] = ship_dt.year
data['Ship_Month'] = ship_dt.month
data['Ship_Day'] = ship_dt.day

# Extra order-date features: weekday (Monday=0 ... Sunday=6) and quarter (1-4)
data['Order_Weekday'] = order_dt.weekday
data['Order_Quarter'] = order_dt.quarter

In [5]:
# Drop the raw datetime columns; only the numeric calendar features derived
# from them are kept as model inputs.
# errors='ignore' keeps this cell idempotent: re-running it after the columns
# are already gone is a no-op instead of raising KeyError.
data = data.drop(columns=['Order_Date', 'Ship_Date'], errors='ignore')

In [6]:
# Replace every object-dtype column with integer codes, keeping the fitted
# encoder per column in `label_encoders` so codes can be mapped back later.
# NOTE(review): the encoders are fit on the full dataset, i.e. before the
# train/test split below — confirm this is acceptable for the evaluation.
label_encoders = {}
categorical_columns = data.select_dtypes(include=['object']).columns
for column in categorical_columns:
    encoder = LabelEncoder()
    data[column] = encoder.fit_transform(data[column])
    label_encoders[column] = encoder

In [8]:
# Separate the regression target ('Sales') from the predictors (every other column)
y = data['Sales']
X = data.drop(columns=['Sales'])

In [9]:
# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [10]:
# Baseline XGBoost regressor: squared-error objective, fixed seed,
# every other hyperparameter left at the library default
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    random_state=42,
)


In [11]:
# Fit the baseline model on the training split
# (left as the cell's last expression so the fitted estimator is displayed)
xgb_model.fit(X=X_train, y=y_train)


In [12]:
# Generate baseline predictions for the held-out test features;
# these are compared against y_test in the evaluation cell
y_pred_xgb = xgb_model.predict(X_test)


In [13]:
# Score the baseline predictions against the held-out test targets
r2_xgb = r2_score(y_test, y_pred_xgb)
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)

# Displayed as (MSE, MAE, R^2)
(mse_xgb, mae_xgb, r2_xgb)

(317603.53944714594, 204.10121716743322, 0.3506815838170064)

# Results 

- # MSE: 317603.53944714594
- # MAE: 204.10121716743322
- # R^2: 0.3506815838170064

# English / improvement attempt 

- Hyperparameter tuning using randomized search and cross-validation 

In [16]:
#importing library
from sklearn.model_selection import RandomizedSearchCV

In [17]:
# Candidate hyperparameter values from which RandomizedSearchCV samples
# (3 * 3 * 3 * 4 * 4 = 432 possible combinations)
param_grid_xgb = dict(
    n_estimators=[100, 200, 300],
    max_depth=[3, 6, 9],
    learning_rate=[0.01, 0.1, 0.3],
    subsample=[0.7, 0.8, 0.9, 1.0],
    colsample_bytree=[0.7, 0.8, 0.9, 1.0],
)


In [18]:
# Randomized hyperparameter search over param_grid_xgb:
# 100 sampled candidates, each scored by 3-fold cross-validated R^2.
# n_jobs=-1 runs the fits in parallel; verbose=2 logs every individual fit.
random_search_xgb = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_grid_xgb,
    n_iter=100,
    cv=3,
    scoring='r2',
    random_state=42,
    n_jobs=-1,
    verbose=2,
)


In [19]:
# Run the search on the training split only, so the test split stays unseen
# during tuning
random_search_xgb.fit(X=X_train, y=y_train)


Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] END colsample_bytree=0.7, learning_rate=0.3, max_depth=3, n_estimators=100, subsample=1.0; total time=   0.0s
[CV] END colsample_bytree=0.7, learning_rate=0.3, max_depth=3, n_estimators=100, subsample=1.0; total time=   0.1s
[CV] END colsample_bytree=0.8, learning_rate=0.3, max_depth=3, n_estimators=100, subsample=0.7; total time=   0.0s
[CV] END colsample_bytree=0.7, learning_rate=0.3, max_depth=3, n_estimators=100, subsample=1.0; total time=   0.0s
[CV] END colsample_bytree=0.8, learning_rate=0.3, max_depth=3, n_estimators=100, subsample=0.7; total time=   0.1s
[CV] END colsample_bytree=0.8, learning_rate=0.3, max_depth=3, n_estimators=100, subsample=0.7; total time=   0.0s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=9, n_estimators=200, subsample=0.9; total time=   0.7s
[CV] END colsample_bytree=1.0, learning_rate=0.3, max_depth=9, n_estimators=200, subsample=0.7; total time=   0.8s
[CV] END colsamp

In [20]:
# Hyperparameter combination that RandomizedSearchCV selected as best
# under its scoring metric
best_params_xgb = random_search_xgb.best_params_


In [21]:
# Reuse the estimator that RandomizedSearchCV already refit on the whole
# training split with the best parameters (refit=True is the default and the
# search was created without overriding it). It is a clone of xgb_model, so it
# carries the same objective and random_state; building and training an
# identical XGBRegressor a second time would only repeat that work.
best_xgb_model = random_search_xgb.best_estimator_

# Last expression, so the fitted estimator is displayed
best_xgb_model


In [22]:
# Generate predictions from the tuned model for the held-out test features;
# these are compared against y_test in the evaluation cell
y_pred_best_xgb = best_xgb_model.predict(X_test)


In [23]:
# Score the tuned model's predictions against the held-out test targets
r2_best_xgb = r2_score(y_test, y_pred_best_xgb)
mae_best_xgb = mean_absolute_error(y_test, y_pred_best_xgb)
mse_best_xgb = mean_squared_error(y_test, y_pred_best_xgb)

# Displayed as (best parameters, MSE, MAE, R^2)
(best_params_xgb, mse_best_xgb, mae_best_xgb, r2_best_xgb)

({'subsample': 0.7,
  'n_estimators': 100,
  'max_depth': 9,
  'learning_rate': 0.01,
  'colsample_bytree': 0.8},
 382963.0383933282,
 220.42097686316828,
 0.21705861975267915)