 STEP 1: Load & Prepare Data

In [None]:
import kagglehub
path = kagglehub.dataset_download("yasserh/instacart-online-grocery-basket-analysis-dataset")

Using Colab cache for faster access to the 'instacart-online-grocery-basket-analysis-dataset' dataset.


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb
import joblib


In [None]:
df = pd.read_csv(f'{path}/aisles.csv')

In [None]:
# Load data
orders = pd.read_csv(f'{path}/orders.csv')
prior = pd.read_csv(f'{path}/order_products__prior.csv')
#train = pd.read_csv(f'{path}/order_products__train.csv')

# Merge
data = prior.merge(orders, on="order_id", how="left")

# Sort for temporal logic
data = data.sort_values(["user_id", "order_number"])


STEP 2: Define Cutoff (LEAK-PROOF)

In [None]:
CUTOFF_ORDER = 25

past_data = data[data["order_number"] <= CUTOFF_ORDER]
future_data = data[
    (data["order_number"] > CUTOFF_ORDER) &
    (data["order_number"] <= CUTOFF_ORDER + 3)  # ~30 days in Instacart cycles
]


 STEP 3: Feature Engineering (PAST ONLY)

In [None]:
features = past_data.groupby("user_id").agg(
    total_orders=("order_id", "nunique"),
    total_products=("product_id", "count"),
    reorder_rate=("reordered", "mean"),
    avg_days_between_orders=("days_since_prior_order", "mean"),
    recency_days=("days_since_prior_order", "last"),
    orders_last_5=("order_number", lambda x: (x > CUTOFF_ORDER - 5).sum())
).reset_index()


 STEP 4: Target Creation (NEXT 30 DAYS)

In [None]:
target = future_data.groupby("user_id").agg(
    future_spend_30d=("product_id", "count")
).reset_index()

dataset = features.merge(target, on="user_id", how="left")
dataset["future_spend_30d"] = dataset["future_spend_30d"].fillna(0)


STEP 5: Stage 1 — Buy / No Buy Model

In [None]:
dataset["will_buy_30d"] = (dataset["future_spend_30d"] > 0).astype(int)

X = dataset.drop(columns=["user_id", "future_spend_30d", "will_buy_30d"])
y_class = dataset["will_buy_30d"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y_class, test_size=0.2, random_state=42, stratify=y_class
)

clf = Pipeline([
    ("scaler", StandardScaler()),
    ("model", LogisticRegression(max_iter=1000))
])

clf.fit(X_train, y_train)

proba_test = clf.predict_proba(X_test)[:, 1]
print("Stage 1 AUC:", roc_auc_score(y_test, proba_test))


Stage 1 AUC: 0.9979030106826238


 STEP 6: Stage 2 — Spend Regression (ONLY BUYERS)

In [None]:
buyers = dataset[dataset["future_spend_30d"] > 0].copy()
buyers["log_spend"] = np.log1p(buyers["future_spend_30d"])

X_reg = buyers.drop(columns=["user_id", "future_spend_30d", "log_spend", "will_buy_30d"])
y_reg = buyers["log_spend"]

Xr_train, Xr_test, yr_train, yr_test = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42
)


MODEL 1: Linear Regression (Baseline)

In [None]:
lr = Pipeline([
    ("scaler", StandardScaler()),
    ("model", LinearRegression())
])

lr.fit(Xr_train, yr_train)
pred_lr = np.expm1(lr.predict(Xr_test))
true_lr = np.expm1(yr_test)

print("Linear MAE:", mean_absolute_error(true_lr, pred_lr))


Linear MAE: 10.217727325759661


MODEL 2: Random Forest Regressor

In [None]:
rf = RandomForestRegressor(
    n_estimators=200,
    max_depth=10,
    random_state=42,
    n_jobs=-1
)

rf.fit(Xr_train, yr_train)
pred_rf = np.expm1(rf.predict(Xr_test))

print("RandomForest MAE:", mean_absolute_error(true_lr, pred_rf))


RandomForest MAE: 8.490630678292819


MODEL 3: LightGBM (BEST MODEL)

In [None]:
lgb_model = lgb.LGBMRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    random_state=42
)

lgb_model.fit(Xr_train, yr_train)
pred_lgb = np.expm1(lgb_model.predict(Xr_test))

print("LightGBM MAE:", mean_absolute_error(true_lr, pred_lgb))


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001936 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 993
[LightGBM] [Info] Number of data points in the train set: 28772, number of used features: 5
[LightGBM] [Info] Start training from score 3.202597
LightGBM MAE: 8.509700508220291


STEP 7: Final CLV Prediction Logic

In [None]:
# Final prediction = P(buy) × Expected spend

test_users = X_test.copy()

buy_prob = clf.predict_proba(test_users)[:, 1]
expected_spend = np.expm1(lgb_model.predict(test_users))

final_clv = buy_prob * expected_spend


 RMSE & R²

In [None]:
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np


In [None]:
rmse_lgb = np.sqrt(mean_squared_error(true_lr, pred_lgb))
r2_lgb = r2_score(true_lr, pred_lgb)

print("LightGBM RMSE:", rmse_lgb)
print("LightGBM R2  :", r2_lgb)


LightGBM RMSE: 12.116639199355193
LightGBM R2  : 0.6127120741297154


In [None]:
rmse_rf = np.sqrt(mean_squared_error(true_lr, pred_rf))
r2_rf = r2_score(true_lr, pred_rf)

print("RandomForest RMSE:", rmse_rf)
print("RandomForest R2  :", r2_rf)


RandomForest RMSE: 12.108283967036447
RandomForest R2  : 0.6132460117775613


 STEP 8: Save Models (For FastAPI)

In [None]:
joblib.dump(clf, "buy_probability_model.pkl")
joblib.dump(lgb_model, "spend_regression_model.pkl")
joblib.dump(X.columns.tolist(), "feature_columns.pkl")

['feature_columns.pkl']

In [None]:
import joblib
import numpy as np
import pandas as pd

# ===============================
# 1. LOAD MODELS
# ===============================

buy_model = joblib.load("buy_probability_model.pkl")
spend_model = joblib.load("spend_regression_model.pkl")

# ===============================
# 2. FEATURE ORDER (KNOWN)
# ===============================

FEATURE_ORDER = [
    "total_orders",
    "total_products",
    "reorder_rate",
    "avg_days_between_orders",
    "recency_days",
    "orders_last_5"
]

# ===============================
# 3. PREDICTION FUNCTION
# ===============================

def predict_30d_clv(input_features: dict):
    """
    input_features: dict with keys exactly matching FEATURE_ORDER
    returns: predicted 30-day CLV
    """

    # Convert input dict → DataFrame
    X = pd.DataFrame([input_features])[FEATURE_ORDER]

    # Stage 1: Buy probability
    buy_prob = buy_model.predict_proba(X)[:, 1][0]

    # Stage 2: Expected spend (log scale → original)
    expected_spend = np.expm1(spend_model.predict(X))[0]

    # Final CLV
    final_clv = buy_prob * expected_spend

    return {
        "buy_probability": round(float(buy_prob), 4),
        "expected_spend_if_buy": round(float(expected_spend), 2),
        "predicted_30d_clv": round(float(final_clv), 2)
    }

# ===============================
# 4. EXAMPLE USAGE
# ===============================

if __name__ == "__main__":
    sample_input = {
        "total_orders": 30,
        "total_products": 180,
        "reorder_rate": 0.65,
        "avg_days_between_orders": 7.2,
        "recency_days": 3,
        "orders_last_5": 4
    }

    prediction = predict_30d_clv(sample_input)
    print(prediction)


{'buy_probability': 1.0, 'expected_spend_if_buy': 8.96, 'predicted_30d_clv': 8.96}
