# FINAL ROBUST MODEL – WILL WORK NO MATTER WHAT DATA YOU HAVE

# 🧠 AI-Powered Product Return Prediction (ET6 CDSP Group 17)

This notebook builds the **machine-learning model** used by the backend API and frontend dashboard.  
It follows the steps:

1. Load the e-commerce dataset tables (order_items, products, users)
2. Merge into a clean training dataset
3. Create features (price, delivery time, category, brand, age, etc.)
4. Train a gradient-boosted XGBoost return-prediction model
5. Evaluate with F1 score and ROC-AUC
6. Save the model + feature names for FastAPI

This notebook is the **official Milestone 4 modelling deliverable** and is connected to:

- `backend/main.py` (API)
- `app.py` (Streamlit UI)
- `src/model/return_predictor.json`
- `src/model/feature_names.pkl`

Run the next cell to perform all modelling steps.


CWD: c:\Users\ADMIN\ET6-CDSP-group-17-repo\ai-return-prevention
Root contents: ['.git', '.gitignore', '.ipynb_checkpoints', '.temp_structure_placeholder.txt', '.venv', '01_baseline_model.ipynb', 'app.py', 'app_cloud.py', 'backend', 'data', 'demo.py', 'frontend', 'notebooks', 'README.md', 'requirements.txt', 'src']
Data contents: ['distribution_centers.csv', 'order_items.csv', 'products.csv', 'README.md', 'users.csv']

Loaded shapes:
order_items: (180952, 11)
products:    (29120, 9)
users:       (100000, 15)

Merged df: (180952, 35)
Return rate: 0.25045315884875546
Final feature matrix: (180952, 2790)


ValueError: feature_names must be string, and may not contain [, ] or <

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score, f1_score
import joblib
import os
import warnings
warnings.filterwarnings('ignore')

# Show the working directory and what sits next to it, so a wrong launch
# directory is obvious before the relative "data/..." paths are used.
print("CWD:", os.getcwd())
for listing_label, folder in (("Root contents:", "."), ("Data contents:", "data")):
    print(listing_label, os.listdir(folder))

# =========================================================
# 1. LOAD DATA (ONLY FILES THAT EXIST)
# =========================================================

# Read the three CSV tables the rest of the script relies on, in a fixed
# order so the unpacking below lines up with the paths.
csv_paths = (
    'data/order_items.csv',
    'data/products.csv',
    'data/users.csv',
)
order_items, products, users = map(pd.read_csv, csv_paths)

# Report (rows, columns) per table; labels are padded to a common width so
# the shapes line up in the output.
print("\nLoaded shapes:")
for table_name, table in (
    ("order_items", order_items),
    ("products", products),
    ("users", users),
):
    print(f"{table_name + ':':<13}{table.shape}")

# =========================================================
# 2. MERGE INTO SINGLE DATAFRAME
#    (use product_id -> products.id, user_id -> users.id)
# =========================================================

# Both joins are left joins starting from order_items, so every order item is
# kept even when its product or user is missing from the lookup table.
#
# validate="many_to_one" makes pandas raise MergeError if products.id or
# users.id contains duplicate keys. Without it a duplicated key would silently
# multiply order-item rows, distorting both the return rate and the training
# set.
#
# When a column name exists on both sides, the left (order_items-based) column
# keeps its name and the lookup table's copy gets the "_prod" / "_user" suffix.
df = order_items.merge(
    products,
    left_on="product_id",
    right_on="id",
    how="left",
    suffixes=("", "_prod"),
    validate="many_to_one",
)

df = df.merge(
    users,
    left_on="user_id",
    right_on="id",
    how="left",
    suffixes=("", "_user"),
    validate="many_to_one",
)

# The row count here should equal the order_items row count printed above.
print("\nMerged df:", df.shape)

# =========================================================
# 3. CREATE RETURN LABEL
# =========================================================

# Binary target: 1 when the row's status is one of the values below, else 0.
# NOTE(review): 'Cancelled' is counted as a return here - confirm that this
# is the intended definition of the target.
positive_statuses = ['Returned', 'Cancelled']
is_positive = df['status'].isin(positive_statuses)
df['is_returned'] = is_positive.astype(int)

# The mean of a 0/1 column is the share of positive rows.
print("Return rate:", df['is_returned'].mean())

# =========================================================
# 4. FEATURE ENGINEERING
# =========================================================

# Parse the two timestamps used below; values that cannot be parsed become NaT.
for ts_col in ("created_at", "delivered_at"):
    df[ts_col] = pd.to_datetime(df[ts_col], errors='coerce')

# Whole days between creation and delivery. Rows where the difference cannot
# be computed are filled with the median of the rows where it can.
delivery_days = (df["delivered_at"] - df["created_at"]).dt.days
df["delivery_time_days"] = delivery_days.fillna(delivery_days.median())

# Column groups: numeric inputs are used as-is, categoricals are one-hot encoded.
num_cols = ["sale_price", "cost", "retail_price", "delivery_time_days", "age"]
cat_cols = ["category", "department", "brand", "traffic_source"]

# Model inputs, with the table each one is expected to come from.
features = [
    "sale_price",            # from order_items
    "cost",                  # from products
    "retail_price",          # from products
    "delivery_time_days",    # engineered
    "category",              # from products
    "department",            # from products
    "brand",                 # from products
    "age",                   # from users
    "traffic_source",        # from users
]

# Work on a copy so the edits below never touch df.
X = df[features].copy()
y = df["is_returned"]

# Force numeric dtype (non-numeric entries become NaN), then fill the gaps
# with each column's median.
for col in num_cols:
    X[col] = pd.to_numeric(X[col], errors="coerce")
medians = X[num_cols].median()
X[num_cols] = X[num_cols].fillna(medians)

# Missing categories become their own explicit "Unknown" level.
X[cat_cols] = X[cat_cols].fillna("Unknown")

# One-hot encode; drop_first=True drops the first level of each categorical.
X = pd.get_dummies(X, columns=cat_cols, drop_first=True)

# XGBoost rejects feature names containing '[', ']' or '<', so swap those
# characters for safe stand-ins (plain-text replacement, not regex).
safe_names = X.columns.astype(str)
for bad, good in (('[', '('), (']', ')'), ('<', 'lt')):
    safe_names = safe_names.str.replace(bad, good, regex=False)
X.columns = safe_names

print("Final feature matrix:", X.shape)

# =========================================================
# 5. TRAIN MODEL
# =========================================================

# 80/20 split, stratified on the label so both parts keep the same
# positive rate; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Hyperparameters collected in one place so they are easy to read and tweak.
xgb_params = {
    "n_estimators": 150,
    "max_depth": 6,
    "learning_rate": 0.1,
    "subsample": 0.7,
    "colsample_bytree": 0.7,
    "eval_metric": "logloss",
    "tree_method": "hist",
    "random_state": 42,
}
model = xgb.XGBClassifier(**xgb_params)
model.fit(X_train, y_train)

# Hard class predictions for F1 / the report, and the positive-class
# probability (column 1) for ROC-AUC.
preds = model.predict(X_test)
probs = model.predict_proba(X_test)[:, 1]

f1_value = round(f1_score(y_test, preds), 3)
auc_value = round(roc_auc_score(y_test, probs), 3)

print("\n=== MODEL RESULTS ===")
print("F1:", f1_value)
print("ROC-AUC:", auc_value)
print("\nClassification report:\n")
print(classification_report(y_test, preds))

# =========================================================
# 6. SAVE MODEL + FEATURE NAMES
# =========================================================

# Output locations, relative to the current working directory.
model_path = 'src/model/return_predictor.json'
feature_names_path = 'src/model/feature_names.pkl'

# Create the target folder if it is not there yet.
os.makedirs('src/model', exist_ok=True)

# Save the model with XGBoost's own writer, and the training column names,
# in training order, as a plain list.
model.save_model(model_path)
joblib.dump(list(X.columns), feature_names_path)

print("\nModel & feature names saved successfully!")
for saved_path in (model_path, feature_names_path):
    print(" - " + saved_path)
