In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb

In [None]:
# importing data
data = pd.read_csv("/content/Aus_grocery_synthetic_dataset2.csv")

# Fill missing prices with the mean price of the same item (same Sku).
# The filled Series is assigned back to the column: calling
# `data['unit_price_x'].fillna(..., inplace=True)` operates on an intermediate
# object, which pandas warns about and which stops updating `data` in pandas 3.0.
mean_prices = data.groupby('Sku')['unit_price_x'].transform('mean')
data['unit_price_x'] = data['unit_price_x'].fillna(mean_prices)

# Forward fill the remaining missing values in every column.
# `ffill()` replaces the deprecated `fillna(method='ffill')` spelling.
# Note: a value missing in the very first row has nothing before it to copy
# from, so it stays missing after this step.
data = data.ffill()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['unit_price_x'].fillna(mean_prices, inplace=True)
  data.fillna(method='ffill', inplace=True) #forward fill remaining missing values


In [None]:
def preprocess(data, group_col=None):
  """Turn the raw price table into a numeric feature table.

  Steps: drop ``Product_Name``, expand ``RunDate`` into calendar parts, add
  three robust-scaled price-lag columns, and one-hot encode the categorical
  and calendar columns.

  Args:
    data: DataFrame with at least the columns ``Product_Name``, ``RunDate``
      (strings in ``%m/%d/%Y`` format), ``unit_price_x``, ``Category``,
      ``Sub_category``, ``Product_Group``, ``Brand`` and ``Sku``. The frame
      passed in is not modified.
    group_col: Optional column name. When given, the lags are computed within
      each group of that column (for example ``'Sku'``), so a row never
      receives the price of a different group. The default ``None`` shifts
      over the whole frame in its current row order.

  Returns:
    A new DataFrame holding ``unit_price_x``, the scaled ``unit_price_x_lag1``
    to ``unit_price_x_lag3`` columns, any other untouched columns, and the
    dummy columns produced by the one-hot encoding.

  Raises:
    KeyError: if one of the required columns is missing.
    ValueError: if a ``RunDate`` value does not match ``%m/%d/%Y``.
  """
  # Parse the date column up front, before it is dropped.
  run_date = pd.to_datetime(data['RunDate'], format='%m/%d/%Y')

  # Product_Name is dropped because (per the original author) it has a
  # one-to-one relationship with Sku; RunDate is replaced by the parts below.
  # drop() returns a new frame, so the caller's DataFrame stays untouched.
  data = data.drop(columns=['Product_Name', 'RunDate'])

  # Calendar features extracted from the parsed date
  data['year'] = run_date.dt.year
  data['month'] = run_date.dt.month
  data['day_of_month'] = run_date.dt.day
  data['day_of_week'] = run_date.dt.dayofweek  # Monday=0, Sunday=6

  # Lag features for price.
  # NOTE(review): shifting only yields "previous prices" if rows are already
  # in chronological order -- TODO confirm against the CSV.
  price = data['unit_price_x']
  lag_source = price if group_col is None else price.groupby(data[group_col])
  lag_cols = []
  for lag in (1, 2, 3):
    col = f'unit_price_x_lag{lag}'
    # Rows without enough history get NaN from shift(); fall back to the row's
    # own price. The filled Series is assigned back to the column instead of
    # calling fillna(..., inplace=True) on `data[col]`, which pandas warns
    # about and which stops updating the frame in pandas 3.0.
    data[col] = lag_source.shift(lag).fillna(price)
    lag_cols.append(col)

  # Scale the lag features (the target column unit_price_x is left unscaled).
  # NOTE(review): the scaler is fitted on every row, i.e. before the
  # train/test split performed later in the notebook.
  data[lag_cols] = RobustScaler().fit_transform(data[lag_cols])

  # One hot encoding of the categorical and calendar columns
  data = pd.get_dummies(data, columns=['Category', 'Sub_category', 'Product_Group', 'Brand', 'Sku', 'year', 'month', 'day_of_month', 'day_of_week'])

  return data

data = preprocess(data)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['unit_price_x_lag1'].fillna(data['unit_price_x'], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['unit_price_x_lag2'].fillna(data['unit_price_x'], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the interm

In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split

# Separating into features and target
X = data.drop(['unit_price_x'], axis=1)
y = data['unit_price_x']

# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Parameters for XGBoost
params = {
    'objective': 'reg:squarederror',  # For regression tasks
    'learning_rate': 0.1,  # Step size shrinkage
    'max_depth': 6,  # Maximum tree depth
    'subsample': 0.8,  # Subsample ratio of training instances
    'colsample_bytree': 0.8,  # Subsample ratio of columns when constructing each tree
    'random_state': 42,
    'eval_metric': 'rmse'  # Evaluation metric for regression
}

# Batch size
batch_size = 1000  # Adjust based on memory

# Boosting rounds added per batch. Every xgb.train call below continues from
# the previous booster and adds this many rounds fitted on that batch only, so
# the final model has ceil(len(X_train) / batch_size) * rounds_per_batch rounds
# in total. This replaces an unused `num_boost_round = 100` setting that
# suggested 100 rounds were being trained.
rounds_per_batch = 1

# Initialize the model as None for incremental training
bst = None

# Train in batches
for start in range(0, len(X_train), batch_size):
    # iloc slicing stops at the end of the frame, so the last batch is
    # simply shorter when len(X_train) is not a multiple of batch_size.
    X_batch = X_train.iloc[start:start + batch_size]
    y_batch = y_train.iloc[start:start + batch_size]

    # Convert batch to DMatrix
    dtrain = xgb.DMatrix(X_batch, label=y_batch)

    # Train incrementally
    bst = xgb.train(
        params,
        dtrain,
        num_boost_round=rounds_per_batch,
        xgb_model=bst  # Use the previously trained model
    )

# Convert the test set into DMatrix format. Built only after training so the
# test matrix is not held in memory during the batch loop.
dtest = xgb.DMatrix(X_test, label=y_test)

# Evaluate the model on the test set
y_pred = bst.predict(dtest)


In [None]:
# # separating into features and target
# X = data.drop(['unit_price_x'], axis=1)
# y = data['unit_price_x']

# # Train test split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# # Initialize the XGBoost Regressor
# xgb_model = xgb.XGBRegressor(
#     n_estimators=100,  # Number of trees
#     learning_rate=0.1,  # Step size shrinkage
#     max_depth=6,  # Maximum tree depth
#     subsample=0.8,  # Subsample ratio of training instances
#     colsample_bytree=0.8,  # Subsample ratio of columns when constructing each tree
#     random_state=42
# )

# # Train the model
# xgb_model.fit(X_train, y_train)


In [None]:
# #Grid search

# from sklearn.model_selection import GridSearchCV

# param_grid = {
#     'n_estimators': [50, 100, 200],
#     'learning_rate': [0.01, 0.1, 0.2],
#     'max_depth': [3, 6, 10],
#     'subsample': [0.6, 0.8, 1.0],
#     'colsample_bytree': [0.6, 0.8, 1.0]
# }

# grid_search = GridSearchCV(estimator=xgb.XGBRegressor(random_state=42),
#                            param_grid=param_grid,
#                            scoring='neg_mean_squared_error',
#                            cv=3,
#                            verbose=2,
#                            n_jobs=-1)

# grid_search.fit(X_train, y_train)

# print("Best Parameters:", grid_search.best_params_)


In [None]:
# #testing
# y_pred = xgb_model.predict(X_test)

# Mean squared error between the held-out targets (y_test) and the
# predictions (y_pred) produced by the training cell.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("MSE = ", mse)

MSE =  83.93937915158811
