# Return Prediction Baseline Model
Using our real ASOS Graphics + TheLook e-commerce data from our group project

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score, f1_score
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

# Load the exact files you just added
orders = pd.read_csv('data/orders.csv')
order_items = pd.read_csv('data/order_items.csv')
products = pd.read_csv('data/products.csv')
users = pd.read_csv('data/users.csv')

# If you have a separate returns file (some versions do).
# Only a *missing* file triggers the fallback: a malformed CSV, a permissions
# problem or a Ctrl-C must surface as an error instead of silently switching
# the notebook to the proxy label.
try:
    returns = pd.read_csv('data/returns.csv')
except FileNotFoundError:
    returns = None
    print("No returns.csv – will create proxy from status")
else:
    print("returns.csv found – using real return labels")

# Quick sanity check on the sizes of the loaded tables
print(f"Orders: {orders.shape}")
print(f"Order items: {order_items.shape}")
print(f"Products: {products.shape}")

In [None]:
# Merge everything: one row per order item, enriched with the attributes of
# its parent order, its product and the purchasing user.
#
# Each item is joined to `orders` (joining `order_items` to itself would pair
# every item with every other item of the same order and rename all shared
# columns). Left-hand columns keep their names; a same-named column arriving
# from the right-hand table gets an explicit suffix instead of pandas' default
# _x/_y, so the lookups below ('product_id', 'user_id', 'status', 'created_at',
# ...) still resolve when the tables share column names.
df = order_items.merge(orders, on='order_id', how='left', suffixes=('', '_order'))
df = df.merge(products, on='product_id', how='left', suffixes=('', '_product'))
df = df.merge(users, on='user_id', how='left', suffixes=('', '_user'))

# Create target variable – this matches what your group did
if returns is not None:
    # NOTE(review): assumes returns.csv holds at most one row per order_id;
    # duplicates would multiply item rows in this join – confirm.
    df = df.merge(returns[['order_id', 'returned']], on='order_id', how='left')
    # Orders with no entry in returns.csv are treated as not returned
    df['is_returned'] = df['returned'].fillna(0).astype(int)
else:
    # Proxy most groups used: an item counts as "returned" when its status is
    # either Returned or Cancelled
    df['is_returned'] = df['status'].isin(['Returned', 'Cancelled']).astype(int)

print(f"Final dataset: {df.shape}")
print(f"Return rate: {df['is_returned'].mean():.1%}")

In [None]:
# Feature engineering – exactly the ones your group found important

# Parse both timestamps, then derive the whole-day gap between order creation
# and delivery. Rows without a delivery date get a fixed 30-day placeholder.
# NOTE(review): delivered_at is only known after fulfilment – confirm this
# feature is actually available at prediction time.
for ts_col in ('created_at', 'delivered_at'):
    df[ts_col] = pd.to_datetime(df[ts_col])
elapsed = df['delivered_at'] - df['created_at']
df['delivery_time_days'] = elapsed.dt.days.fillna(30)

# Model inputs (the order here determines the column order of X)
features = [
    'sale_price', 'cost', 'retail_price',
    'delivery_time_days',
    'category', 'department', 'brand',
    'age', 'traffic_source'
]
categorical_cols = ['category', 'department', 'brand', 'traffic_source']

y = df['is_returned']

# Simple encoding: one-hot the categoricals (dropping the first level of each),
# then fill remaining gaps in numeric columns with that column's median
X = pd.get_dummies(df[features].copy(), columns=categorical_cols, drop_first=True)
column_medians = X.median(numeric_only=True)
X = X.fillna(column_medians)

# 80/20 split that keeps the return rate equal in both parts
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Weight the positive class by the negative/positive ratio of the training set
# to compensate for class imbalance
n_positive = y_train.sum()
n_negative = len(y_train) - n_positive

xgb_params = {
    'n_estimators': 200,
    'max_depth': 7,
    'learning_rate': 0.1,
    'scale_pos_weight': n_negative / n_positive,
    'random_state': 42,
    'eval_metric': 'logloss',
}
model = xgb.XGBClassifier(**xgb_params)
model.fit(X_train, y_train)

# Class-1 probabilities feed ROC-AUC; hard labels feed F1 and the report
probs = model.predict_proba(X_test)[:, 1]
preds = model.predict(X_test)

print("\n=== MODEL RESULTS ===")
print(f"F1-score  : {f1_score(y_test, preds):.3f}")
print(f"ROC-AUC   : {roc_auc_score(y_test, probs):.3f}")
print(classification_report(y_test, preds))

In [None]:
# Save the model for FastAPI & Streamlit
import os

import joblib

# Neither save_model nor joblib.dump creates missing parent directories, so
# make sure the output folder exists before writing into it (a fresh checkout
# may not have it yet).
os.makedirs('src/model', exist_ok=True)

model.save_model('src/model/return_predictor.json')
print("Model saved → src/model/return_predictor.json")

# Also save feature list for the API later: the exact post-encoding column
# names (and order) the model was trained on
joblib.dump(list(X.columns), 'src/model/feature_names.pkl')
print("Feature names saved")