# Return Prediction Baseline Model
This baseline uses the real ASOS Graphics and TheLook e-commerce data from our group project.

In [None]:
import os
import gc
import joblib
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score, f1_score
import xgboost as xgb
import warnings
warnings.filterwarnings("ignore")

# ===============================
# 1. LOAD YOUR AVAILABLE DATASETS
# ===============================

# Read the three raw CSV exports; each one lands in its own DataFrame.
order_items, products, users = map(
    pd.read_csv,
    ("data/order_items.csv", "data/products.csv", "data/users.csv"),
)

# Quick sanity check: report the (rows, columns) shape of every table.
print(f"Order items: {order_items.shape}")
print(f"Products:    {products.shape}")
print(f"Users:       {users.shape}")

# ===============================
# 2. CREATE TARGET VARIABLE
# ===============================
# The binary label comes from order_items["status"]: rows whose status is one
# of the values below are labelled 1, every other status is labelled 0.
# NOTE(review): "Cancelled" orders are counted as returns here — confirm that
# this is the intended definition of the target.
positive_statuses = ["Returned", "Cancelled"]
order_items["is_returned"] = (
    order_items["status"].isin(positive_statuses).astype(int)
)

# The mean of a 0/1 column is the share of positive rows.
return_rate = order_items["is_returned"].mean()
print(f"Return rate (from status): {return_rate:.1%}")

# ===============================
# 3. FIX DATETIME PARSING (UTC + MIXED FORMAT)
# ===============================

# Both timestamp columns get identical treatment: per-value format inference,
# conversion to UTC, and coercion of unparseable values to NaT instead of
# raising.
for ts_col in ("created_at", "delivered_at"):
    order_items[ts_col] = pd.to_datetime(
        order_items[ts_col], format="mixed", utc=True, errors="coerce"
    )

# Whole days between order creation and delivery. Rows where either timestamp
# is missing produce NaN and are filled with 30.
# NOTE(review): delivered_at is presumably empty for orders that were never
# delivered, so the constant fill may encode order status — confirm this is
# acceptable for a feature.
elapsed = order_items["delivered_at"] - order_items["created_at"]
order_items["delivery_time_days"] = elapsed.dt.days.fillna(30)

# ===============================
# 4. MERGE PRODUCT + USER FEATURES
# ===============================

# Attach product attributes to every order item. The left join keeps all
# order_items rows; columns whose names clash with order_items get the
# "_product" suffix. validate="many_to_one" makes pandas raise a MergeError if
# products["id"] is not unique, instead of silently duplicating order rows
# (which would distort the label distribution downstream).
df = order_items.merge(
    products,
    left_on="product_id",
    right_on="id",
    how="left",
    suffixes=("", "_product"),
    validate="many_to_one",
)

# Attach user attributes the same way; clashing user columns get "_user".
# The same uniqueness check is applied to users["id"].
df = df.merge(
    users,
    left_on="user_id",
    right_on="id",
    how="left",
    suffixes=("", "_user"),
    validate="many_to_one",
)

print(f"Merged df: {df.shape}")

# ===============================
# 5. SELECT FEATURES
# ===============================

# Model inputs, in the column order that is later saved as feature names.
features = [
    "sale_price",          # from order_items
    "cost",                # from products
    "retail_price",        # from products
    "delivery_time_days",  # engineered in section 3
    "category",            # from products
    "department",          # from products
    "brand",               # from products
    "age",                 # from users
    "traffic_source",      # from users
]

# Fail fast, listing every expected column absent from the merged frame.
missing = [name for name in features if name not in df.columns]
if missing:
    raise ValueError(f"Missing expected columns: {missing}")

# Take independent copies so the merged frame can be released right after.
X, y = df[features].copy(), df["is_returned"].copy()

# The merged frame is no longer needed; drop it to free some RAM.
del df
gc.collect()

# ===============================
# 6. ENCODE CATEGORICALS + CLEAN DATA
# ===============================

# One-hot encode the categorical columns; drop_first removes the first level
# of each column from the encoded output.
cat_cols = ["category", "department", "brand", "traffic_source"]
X = pd.get_dummies(X, columns=cat_cols, drop_first=True)

# Capture the column order now: it is lost once X becomes a bare numpy array.
feature_names = X.columns.tolist()

# Replace missing values with 0, touching only the numeric columns.
numeric_cols = X.select_dtypes(include="number").columns
X[numeric_cols] = X[numeric_cols].fillna(0)

print(f"Feature matrix after encoding: {X.shape}")

# ===============================
# 7. CONVERT TO NUMPY (float32) TO REDUCE MEMORY
# ===============================

# Features become a float32 matrix and labels an int32 vector.
X_np, y_np = X.to_numpy(dtype=np.float32), y.to_numpy(dtype=np.int32)

# The pandas originals are redundant now; release them.
del X, y
gc.collect()

# ===============================
# 8. TRAIN/TEST SPLIT
# ===============================

# 80/20 split with a fixed seed, stratified on the label so both parts keep
# the same class balance.
X_train, X_test, y_train, y_test = train_test_split(
    X_np,
    y_np,
    test_size=0.2,
    random_state=42,
    stratify=y_np,
)

print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")

# OPTIONAL: cap the number of training rows when RAM is still tight.
# Lower the cap (e.g. to 50000) if needed.
max_train_rows = 80000
n_train_rows = X_train.shape[0]
if n_train_rows > max_train_rows:
    # Seeded draw without replacement keeps the subsample reproducible.
    rng = np.random.RandomState(42)
    idx = rng.choice(n_train_rows, size=max_train_rows, replace=False)
    X_train, y_train = X_train[idx], y_train[idx]
    print(f"Subsampled training data to {X_train.shape[0]} rows")

# ===============================
# 9. TRAIN XGBOOST MODEL (LIGHTER CONFIG)
# ===============================

# Class counts in the (possibly subsampled) training labels.
n_pos = int(y_train.sum())
n_neg = len(y_train) - n_pos

# With no positives the ratio below would divide by zero (yielding inf rather
# than an error for numpy integers), and a single-class training set cannot
# produce a meaningful binary classifier either way. Fail loudly instead.
if n_pos == 0 or n_neg == 0:
    raise ValueError(
        "Training labels contain a single class "
        f"({n_neg} negative, {n_pos} positive); cannot train a binary classifier."
    )

# Negative-to-positive ratio, used to up-weight the positive class.
scale_pos_weight = n_neg / n_pos
print(f"scale_pos_weight: {scale_pos_weight:.2f}")

model = xgb.XGBClassifier(
    n_estimators=150,          # fewer trees to save memory
    max_depth=5,               # shallower trees
    learning_rate=0.1,
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    eval_metric="logloss",
    tree_method="hist",        # memory-efficient histogram-based
    subsample=0.9,
    colsample_bytree=0.8,
)

model.fit(X_train, y_train)

# Hard class predictions and positive-class probabilities on the held-out set.
preds = model.predict(X_test)
probs = model.predict_proba(X_test)[:, 1]

print("\n=== MODEL RESULTS ===")
print(f"F1-score  : {f1_score(y_test, preds):.3f}")
print(f"ROC-AUC   : {roc_auc_score(y_test, probs):.3f}")
print(classification_report(y_test, preds))

# ===============================
# 10. SAVE MODEL + FEATURE NAMES FOR FASTAPI
# ===============================

# Output locations for the two artifacts.
model_path = "src/model/return_predictor.json"
feature_path = "src/model/feature_names.pkl"

# Create the target directory if it does not exist yet.
os.makedirs("src/model", exist_ok=True)

# Persist the trained booster, then the encoded column order next to it.
model.save_model(model_path)
joblib.dump(feature_names, feature_path)

print(f"\nModel saved → {model_path}")
print(f"Feature names saved → {feature_path}")

Order items: (180952, 11)
Products:    (29120, 9)
Users:       (100000, 15)
Return rate (from status): 25.0%
Merged df: (180952, 37)
Feature matrix after encoding: (180952, 2789)
Train shape: (144761, 2789), Test shape: (36191, 2789)
Subsampled training data to 80000 rows
scale_pos_weight: 3.00

=== MODEL RESULTS ===
F1-score  : 0.334
ROC-AUC   : 0.536
              precision    recall  f1-score   support

           0       0.77      0.66      0.71     27127
           1       0.28      0.41      0.33      9064

    accuracy                           0.59     36191
   macro avg       0.53      0.53      0.52     36191
weighted avg       0.65      0.59      0.61     36191


Model saved → src/model/return_predictor.json
Feature names saved → src/model/feature_names.pkl
