In [None]:
!pip install holidays



In [23]:
import pandas as pd
import numpy as np
import holidays
from datetime import timedelta
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import classification_report
from statsmodels.tsa.statespace.sarimax import SARIMAX
import warnings
warnings.filterwarnings('ignore')

In [24]:
# MORE PREPROCESSING
# Load the purchase records and add `order_date`, a parsed datetime version of
# the raw 'Order Date' column.
purchase_df = pd.read_csv('process-purchases(in).csv')
purchase_df = purchase_df.assign(
    order_date=pd.to_datetime(purchase_df['Order Date'])
)

print(purchase_df.columns)




Index(['Order Date', 'Category', 'Survey ResponseID', 'Q-demos-gender',
       'Q-demos-age', 'Q-demos-income', 'Q-demos-race', 'order_date'],
      dtype='object')


In [25]:
# Count orders per (Category, calendar month). The 'M' frequency buckets rows by
# month and labels each bucket with its month-end date.
# NOTE(review): recent pandas releases spell this alias 'ME' — confirm against the
# pandas version this notebook is pinned to before changing it.
monthly_orders = purchase_df.groupby(
    ['Category', pd.Grouper(key='order_date', freq='M')]
).size()

# Turn the grouped counts into a flat frame, ordered by category and then by month.
monthly_orders = monthly_orders.rename('num_orders').reset_index()
monthly_orders = monthly_orders.sort_values(['Category', 'order_date'])

print(monthly_orders.head())

     Category order_date  num_orders
0  3D_PRINTER 2018-03-31           1
1  3D_PRINTER 2018-06-30           3
2  3D_PRINTER 2018-11-30           2
3  3D_PRINTER 2018-12-31           1
4  3D_PRINTER 2019-02-28           1


In [26]:
# Keep only the five categories with the largest total number of orders.
top_cats = monthly_orders.groupby('Category')['num_orders'].sum().nlargest(5).index

# .copy() makes the filtered frame independent of the frame it was sliced from.
# The following cell adds many columns to `monthly_orders`; without the copy those
# assignments target a slice and trigger pandas' SettingWithCopyWarning (which the
# notebook-wide warnings filter would silently hide).
monthly_orders = monthly_orders[monthly_orders['Category'].isin(top_cats)].copy()
top_cats

Index(['ABIS_BOOK', 'PET_FOOD', 'SHIRT', 'GIFT_CARD',
       'NUTRITIONAL_SUPPLEMENT'],
      dtype='object', name='Category')

In [27]:
# Calendar features taken from each row's order_date (the month-end bucket date).
monthly_orders['month'] = monthly_orders['order_date'].dt.month
monthly_orders['year'] = monthly_orders['order_date'].dt.year

# Meteorological season for each month number (Dec-Feb = Winter, and so on).
monthly_orders['season'] = monthly_orders['month'].map({
    month_no: season_name
    for season_name, month_nos in (
        ('Winter', (12, 1, 2)),
        ('Spring', (3, 4, 5)),
        ('Summer', (6, 7, 8)),
        ('Fall', (9, 10, 11)),
    )
    for month_no in month_nos
})

# dayofweek runs Monday=0 .. Sunday=6, so values >= 5 are Saturday/Sunday.
# Because order_date is a month-end date, this flags whether the month's last
# day falls on a weekend.
monthly_orders['is_weekend'] = (monthly_orders['order_date'].dt.dayofweek >= 5).astype(int)

# Build the US holiday calendar for every year present in the data plus the year
# after the last one (so late-December dates can still find an upcoming holiday).
# Without explicit `years` the calendar object starts out empty and only fills in
# a year when a date from that year is looked up; code that iterates over it or
# reads its keys would then see no holidays at all.
us_holidays = holidays.UnitedStates(
    years=range(
        int(monthly_orders['order_date'].dt.year.min()),
        int(monthly_orders['order_date'].dt.year.max()) + 2,
    )
)

# True where the (time-stripped) order_date is itself a US holiday. The calendar
# keys are plain dates, so convert them to timestamps before comparing.
monthly_orders['is_holiday'] = monthly_orders['order_date'].dt.normalize().isin(
    pd.to_datetime(list(us_holidays.keys()))
)


def mark_pre_holiday_dates(dates, holiday_dict, window=7):
  """Flag dates that fall within `window` days before a holiday.

  Args:
    dates: iterable of date-likes accepted by ``pd.to_datetime``.
    holiday_dict: mapping whose keys are the holiday dates.
    window: number of days before a holiday that count as "pre-holiday".

  Returns:
    A list with one bool per input date: True when
    ``holiday - window <= date < holiday`` for at least one holiday
    (the holiday itself is excluded).
  """
  holiday_dates = pd.to_datetime(list(holiday_dict.keys()))
  lead = timedelta(days=window)
  return [
      any(h - lead <= d < h for h in holiday_dates)
      for d in pd.to_datetime(dates)
  ]


def time_to_nearest_holiday(date):
    """Return the number of days from `date` to the next holiday in `us_holidays`.

    A holiday on `date` itself counts as 0 days away. The value is not capped;
    30 is only a fallback, returned when the calendar holds no holiday on or
    after `date`.
    """
    # Compare plain dates with plain dates: the calendar keys are datetime.date
    # objects, and subtracting a pandas Timestamp from one raises TypeError.
    day = pd.Timestamp(date).date()
    gaps = [(h - day).days for h in us_holidays if h >= day]
    return min(gaps) if gaps else 30


monthly_orders['days_to_holiday'] = monthly_orders['order_date'].apply(time_to_nearest_holiday)

# 1 when the order_date lies in the 7 days leading up to a holiday, else 0.
monthly_orders['is_preholiday'] = np.asarray(
    mark_pre_holiday_dates(monthly_orders['order_date'], us_holidays, window=7),
    dtype=int,
)

# Seasonal flags based on the month number: Nov-Dec and Aug-Sep.
monthly_orders['is_holiday_season'] = monthly_orders['month'].between(11, 12).astype(int)
monthly_orders['is_back_to_school'] = monthly_orders['month'].between(8, 9).astype(int)

# Per-category lag features: the previous row's order count, the change from it,
# and that change relative to (previous count + 1); the +1 avoids dividing by zero.
monthly_orders['prev_orders'] = monthly_orders.groupby('Category')['num_orders'].shift()
monthly_orders['order_diff'] = monthly_orders['num_orders'].sub(monthly_orders['prev_orders'])
monthly_orders['growth_rate'] = monthly_orders['order_diff'].div(
    monthly_orders['prev_orders'].add(1)
)

# Per-category trailing means over the last 3 and 6 rows (current row included);
# the first rows of each category have no full window and are NaN until filled below.
monthly_orders['roll3_mean'] = monthly_orders.groupby('Category')['num_orders'].transform(
    lambda counts: counts.rolling(3).mean()
)
monthly_orders['roll6_mean'] = monthly_orders.groupby('Category')['num_orders'].transform(
    lambda counts: counts.rolling(6).mean()
)

# Momentum: how far the current count sits above or below its trailing mean.
monthly_orders['momentum_3'] = monthly_orders['num_orders'].sub(monthly_orders['roll3_mean'])
monthly_orders['momentum_6'] = monthly_orders['num_orders'].sub(monthly_orders['roll6_mean'])

# Replace every remaining NaN (lag/rolling warm-up rows) with 0.
monthly_orders = monthly_orders.fillna(value=0)

# Encode the month as a point on the unit circle so that December and January
# end up next to each other. The month is read from `monthly_orders` itself: the
# feature matrix `X` is only created in a later cell, so referencing it here
# raises NameError on a fresh kernel (or silently uses a stale frame otherwise).
monthly_orders['month_sin'] = np.sin(2 * np.pi * monthly_orders['month'] / 12)
monthly_orders['month_cos'] = np.cos(2 * np.pi * monthly_orders['month'] / 12)






In [28]:
from sklearn.preprocessing import LabelEncoder
# Feature matrix: the month, the holiday-season flag, lag/rolling/momentum
# statistics and the cyclical month encoding.
X = monthly_orders.loc[:, [
    'month', 'is_holiday_season',
    'prev_orders', 'growth_rate',
    'roll3_mean', 'roll6_mean',
    'momentum_3', 'momentum_6',
    'month_sin', 'month_cos',
]]

# Target: the product category, converted to integer class labels. `le` is kept
# so predictions can be mapped back to category names later.
y = monthly_orders['Category']
le = LabelEncoder().fit(y)
y_enc = le.transform(y)

In [29]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

from sklearn.model_selection import StratifiedShuffleSplit

# One stratified, shuffled 80/20 split with a fixed seed. The splitter is
# configured for a single split, so take that one pair of index arrays directly.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(sss.split(X, y_enc))

X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y_enc[train_idx], y_enc[test_idx]

In [37]:
import xgboost as xgb
import lightgbm as lgb

# Multi-class XGBoost classifier, fitted on the training split. fit() returns the
# estimator itself, so `model` is the trained classifier.
model = xgb.XGBClassifier(
    # learning task
    objective='multi:softprob',
    eval_metric='mlogloss',
    # boosting schedule and tree shape
    n_estimators=500,
    learning_rate=0.03,
    max_depth=5,
    min_child_weight=2,
    gamma=0.5,
    # row / column subsampling
    subsample=0.9,
    colsample_bytree=0.9,
    # L1 / L2 regularisation
    reg_alpha=0.2,
    reg_lambda=1.2,
    # runtime settings
    random_state=42,
    n_jobs=1,
    verbosity=0,
).fit(X_train, y_train)

# Predict encoded labels for the held-out rows, then map both the predictions
# and the ground truth back to category names for reporting.
y_pred_enc = model.predict(X_test)
y_true = le.inverse_transform(y_test)
y_pred = le.inverse_transform(y_pred_enc)

In [38]:
# Overall accuracy on the held-out rows, followed by the per-category
# precision / recall / F1 table.
print("Accuracy:", accuracy_score(y_true=y_true, y_pred=y_pred))
print(classification_report(y_true=y_true, y_pred=y_pred))

Accuracy: 0.578125
                        precision    recall  f1-score   support

             ABIS_BOOK       1.00      1.00      1.00        13
             GIFT_CARD       0.36      0.42      0.38        12
NUTRITIONAL_SUPPLEMENT       0.57      0.62      0.59        13
              PET_FOOD       0.64      0.69      0.67        13
                 SHIRT       0.22      0.15      0.18        13

              accuracy                           0.58        64
             macro avg       0.56      0.58      0.57        64
          weighted avg       0.56      0.58      0.57        64



**Part 2 - Predicting the Next Category**