
# Customer Segmentation & Targeting — MIP + QUBO on One Dataset (Colab-Ready)

This notebook demonstrates a **segmentation → targeting** workflow on the **same synthetic dataset** in two ways:

1. **Classical path**
   - Lightweight **k-means** (no sklearn) to build K segments
   - **MILP targeting** to maximize expected profit under a budget (PuLP with greedy fallback)

2. **QUBO path (quantum-ready)**
   - **k-medoids via QUBO**: pick exactly $K$ medoids + assign each customer to a medoid
   - Optional **QUBO knapsack**: choose targets under budget

All code is Colab-ready; it will install PuLP and D-Wave `dimod`/`neal` if needed.


In [None]:

# Install dependencies if needed
def _silent_imports():
    """Report which optional solver packages can be imported right now.

    Each of "pulp", "dimod" and "neal" is imported in turn; any exception
    raised by the import is swallowed and recorded as "not available".

    Returns:
        dict: maps "pulp", "dimod" and "neal" (in that order) to True when the
        import succeeded and False when it raised.
    """
    availability = {}
    for module_name in ("pulp", "dimod", "neal"):
        try:
            __import__(module_name)
        except Exception:
            availability[module_name] = False
        else:
            availability[module_name] = True
    return availability

flags = _silent_imports()
if not flags["pulp"]:
    %pip -q install pulp
if not flags["dimod"] or not flags["neal"]:
    %pip -q install dimod neal

flags = _silent_imports()
print("PuLP:", flags["pulp"], "| dimod:", flags["dimod"], "| neal:", flags["neal"])


In [None]:

# ==== One Synthetic Dataset ====
import numpy as np, pandas as pd

# One seeded generator feeds every draw below; the draw order is
# labels -> feature noise -> cost -> value -> uplift.
rng = np.random.default_rng(2026)

N = 250           # customers
D = 3             # feature dimensions
K = 4             # number of clusters
budget = 2000.0   # marketing budget

# Ground-truth cluster centres, one row per cluster.
true_centers = np.array(
    [
        [-2, 0, 1],
        [2, 1, -1],
        [0, -2, 0.5],
        [3, -2, 2],
    ],
    dtype=float,
)
labels_true = rng.integers(0, len(true_centers), size=N)
noise = rng.normal(0, 0.8, size=(N, D))
X = true_centers[labels_true] + noise

# Per-customer economics: contact cost, order value and response uplift.
cost = rng.uniform(3, 20, size=N)
value = rng.uniform(40, 200, size=N)
uplift = rng.uniform(0.02, 0.20, size=N)
profit = uplift * value - cost   # expected incremental profit of contacting

feature_names = [f"x{d+1}" for d in range(D)]
df = pd.DataFrame(X, columns=feature_names).assign(
    cost=cost,
    value=value,
    uplift=uplift,
    profit=profit,
    label_true=labels_true,
)

print("Customers:", N, " Features:", D, " K (clusters):", K, " Budget:", budget)
df.head()



## Part 1 — Classical Segmentation (k-means) + MILP Targeting


In [None]:

import numpy as np, pandas as pd

def kmeans(X, K, max_iter=100, rng=None):
    """Cluster the rows of ``X`` into ``K`` groups by alternating assign/update steps.

    Args:
        X: 2-D array with one sample per row.
        K: number of clusters; the initial centres are K distinct rows of X.
        max_iter: upper bound on the number of assign/update rounds.
        rng: optional NumPy Generator used to pick the initial centres and to
            re-seed a cluster that ends up empty. A Generator seeded with 0 is
            created when omitted.

    Returns:
        tuple: ``(centers, labels)`` where ``centers`` holds the K centre
        coordinates and ``labels[i]`` is the index of the centre that row i was
        assigned to in the last assignment step.
    """
    n_samples, _ = X.shape
    gen = rng if rng is not None else np.random.default_rng(0)
    seed_rows = gen.choice(n_samples, size=K, replace=False)
    centers = X[seed_rows].copy()
    labels = np.zeros(n_samples, dtype=int)

    for round_no in range(max_iter):
        # (n_samples, K) Euclidean distances from every row to every centre.
        dist_to_centers = np.linalg.norm(X[:, None, :] - centers[None, :, :], axis=2)
        nearest = dist_to_centers.argmin(axis=1)
        # The first round always proceeds to the update step, because the
        # initial all-zero labels are only a placeholder.
        if round_no > 0 and np.array_equal(nearest, labels):
            break
        labels = nearest
        for k in range(K):
            members = X[labels == k]
            if len(members) > 0:
                centers[k] = members.mean(axis=0)
            else:
                # Empty cluster: restart its centre on a randomly drawn row.
                centers[k] = X[gen.integers(0, n_samples)]
    return centers, labels

# Feature matrix: every column whose name starts with "x".
feature_cols = [c for c in df.columns if c.startswith("x")]
X_arr = df[feature_cols].values
centers_km, labels_km = kmeans(X_arr, K, max_iter=100, rng=np.random.default_rng(7))
df["segment_kmeans"] = labels_km

# Per-segment profile. The feature averages are generated from the same column
# list that feeds k-means, so the table follows the number of features instead
# of assuming exactly x1..x3.
profiles = df.groupby("segment_kmeans").agg(
    count=("segment_kmeans", "size"),
    **{f"avg_{c}": (c, "mean") for c in feature_cols},
    avg_value=("value", "mean"), avg_uplift=("uplift", "mean"),
    avg_cost=("cost", "mean"), avg_profit=("profit", "mean")
).reset_index().sort_values("segment_kmeans")

# PuLP is optional: HAVE_PULP selects between the MILP and the greedy fallback.
try:
    import pulp
    HAVE_PULP = True
except Exception:
    HAVE_PULP = False

def solve_targeting_milp(cost, profit, budget):
    """Choose which customers to target with a 0/1 knapsack MILP solved by CBC.

    Maximises ``sum(profit[i] * z[i])`` subject to
    ``sum(cost[i] * z[i]) <= budget`` with binary ``z[i]``.

    Args:
        cost: per-customer contact cost (indexable, length n).
        profit: per-customer expected profit (indexable, length n).
        budget: total spend allowed.

    Returns:
        tuple: ``(status, z_sol, obj)`` where ``status`` is PuLP's status
        string, ``z_sol`` is an integer 0/1 array of length n and ``obj`` is the
        objective value as a float.
    """
    n = len(cost)
    prob = pulp.LpProblem("Targeting", pulp.LpMaximize)
    z = [pulp.LpVariable(f"z_{i}", lowBound=0, upBound=1, cat="Binary") for i in range(n)]
    prob += pulp.lpSum(float(profit[i]) * z[i] for i in range(n))
    prob += pulp.lpSum(float(cost[i]) * z[i] for i in range(n)) <= float(budget)
    prob.solve(pulp.PULP_CBC_CMD(msg=False))
    status = pulp.LpStatus[prob.status]
    # Variables left unset by the solver read back as None; treat them as 0.
    z_sol = np.array([int(round(pulp.value(z[i]) or 0)) for i in range(n)], dtype=int)
    # The objective is None in the same situation. Fall back to the value implied
    # by z_sol so a failed solve is reported through `status`, not a TypeError.
    raw_obj = pulp.value(prob.objective)
    if raw_obj is None:
        obj = float(np.dot(np.asarray(profit, dtype=float), z_sol))
    else:
        obj = float(raw_obj)
    return status, z_sol, obj

def greedy_targeting(cost, profit, budget):
    """Budgeted selection heuristic: take customers by descending profit/cost ratio.

    Customers are visited from the highest to the lowest profit-per-cost ratio
    and selected when they still fit in the remaining budget and have strictly
    positive profit.

    Args:
        cost: array of per-customer contact costs.
        profit: array of per-customer expected profits.
        budget: total spend allowed.

    Returns:
        tuple: ``("Heuristic", z, obj)`` where ``z`` is an integer 0/1 selection
        array and ``obj`` is the summed profit of the selected customers.
    """
    n_items = len(cost)
    # Costs are floored at 1e-9 so a zero cost does not divide by zero.
    ratio = profit / np.maximum(cost, 1e-9)
    ranking = np.arange(n_items)[np.argsort(-ratio)]

    z = np.zeros(n_items, dtype=int)
    spent = 0.0
    for i in ranking:
        affordable = spent + cost[i] <= budget
        if affordable and profit[i] > 0:
            spent += cost[i]
            z[i] = 1

    obj = float((profit * z).sum())
    return "Heuristic", z, obj

# Exact MILP when PuLP imported, greedy heuristic otherwise; both return
# (status, 0/1 selection vector, objective value).
targeting_solver = solve_targeting_milp if HAVE_PULP else greedy_targeting
status_milp, z_milp, obj_milp = targeting_solver(
    df["cost"].values, df["profit"].values, budget
)

df["target_classical"] = z_milp
selected_idx = np.flatnonzero(z_milp == 1)   # row positions of targeted customers
spend_classical = float(df.loc[selected_idx, "cost"].sum())

print("Targeting status:", status_milp, "| Profit objective:", obj_milp, "| Spend:", spend_classical)
profiles.head(), df.head()


In [None]:

# Summaries
# Per-segment view: segment size, number of targeted customers, mean profit.
summary_segments = (
    df.groupby("segment_kmeans")
    .agg(
        size=("segment_kmeans", "size"),
        targeted=("target_classical", "sum"),
        avg_profit=("profit", "mean"),
    )
    .reset_index()
    .sort_values("segment_kmeans")
)

# One-row overall view of the classical targeting decision.
targeted_flags = df["target_classical"]
summary_target = pd.DataFrame(
    {
        "num_targeted": [int(targeted_flags.sum())],
        "total_spend": [float((targeted_flags * df["cost"]).sum())],
        "total_profit": [float((targeted_flags * df["profit"]).sum())],
    }
)
summary_segments, summary_target


In [None]:

import matplotlib.pyplot as plt

# Colour encodes the k-means segment; targeted customers get an "x" overlay.
fig, ax = plt.subplots()
ax.scatter(df["x1"], df["x2"], c=df["segment_kmeans"])
sel = df["target_classical"]==1
ax.scatter(df.loc[sel,"x1"], df.loc[sel,"x2"], marker='x')
ax.set_title("K-means Segments (color) + Targeted (x)")
ax.set_xlabel("x1")
ax.set_ylabel("x2")
fig.tight_layout()



## Part 2 — QUBO Segmentation (k-medoids / facility-location) + Optional QUBO Targeting


In [None]:

from collections import defaultdict
import numpy as np, pandas as pd
import dimod, neal

# Feature matrix and full pairwise Euclidean distance matrix (N x N).
X = df[[c for c in df.columns if c.startswith("x")]].values
N = X.shape[0]
Dmat = np.linalg.norm(X[:,None,:] - X[None,:,:], axis=2)

# --- Problem size ------------------------------------------------------------
# The model has one binary y_i per candidate medoid, one binary z_ij per
# (medoid, customer) pair, and a coupler for every pair of z's that share a
# customer. Over all N customers that is N**2 z-variables and roughly N**3 / 2
# couplers (62,500 and ~7.8 million for N = 250), which is impractical to build
# and anneal interactively. The QUBO is therefore solved on a seeded random
# subsample of customers; everyone else is attached to the nearest opened medoid
# afterwards. Set N_QUBO = N to solve the full formulation.
N_QUBO = min(N, max(K, 30))
QUBO_SEED = 2026
cand = sorted(np.random.default_rng(QUBO_SEED).choice(N, size=N_QUBO, replace=False).tolist())
in_sample = set(cand)

# Penalty weights, scaled well above any single distance term so that
# constraint violations dominate the distance objective.
penalty = float(10.0 * np.max(Dmat))
A = penalty  # assignment penalty
B = penalty  # open-if-assigned penalty
G = penalty  # cardinality penalty

def y_label(i): return f"y|{i}"          # y_i = 1 -> customer i is a medoid
def z_label(i,j): return f"z|{i}|{j}"    # z_ij = 1 -> customer j assigned to medoid i

Q = defaultdict(float)

# Cardinality: G * (sum_i y_i - K)^2, constant term dropped.
for a, i in enumerate(cand):
    Q[(y_label(i), y_label(i))] += G * (1 - 2*K)
    for k in cand[a+1:]:
        Q[(y_label(i), y_label(k))] += 2*G

# Assignment-equals-one for each sampled customer j: A * (sum_i z_ij - 1)^2.
for j in cand:
    for a, i in enumerate(cand):
        Q[(z_label(i,j), z_label(i,j))] += (-A)
        for k in cand[a+1:]:
            Q[(z_label(i,j), z_label(k,j))] += 2*A

# Open-if-assigned: B * z_ij * (1 - y_i), plus the distance objective d_ij * z_ij.
for i in cand:
    for j in cand:
        Q[(z_label(i,j), z_label(i,j))] += B + float(Dmat[i,j])
        Q[(z_label(i,j), y_label(i))] += (-B)

bqm = dimod.BinaryQuadraticModel.from_qubo(dict(Q))
# Seeded so that re-running the cell reproduces the same sample set.
sampleset = neal.SimulatedAnnealingSampler().sample(bqm, num_reads=300, seed=QUBO_SEED)
best = sampleset.first

# Decode the lowest-energy sample; keys are original customer indices.
assign = {(i,j): int(best.sample.get(z_label(i,j), 0)) for i in cand for j in cand}
open_medoids = [i for i in cand if int(best.sample.get(y_label(i), 0))==1]

labels_qubo = np.zeros(N, dtype=int)
viol = []
for j in range(N):
    # Customers outside the subsample have no z variables, so `choices` is empty.
    choices = [i for i in cand if assign.get((i,j), 0)==1]
    if choices:
        nn = min(choices, key=lambda i: Dmat[i,j])
    else:
        if open_medoids:
            nn = min(open_medoids, key=lambda i: Dmat[i,j])
        else:
            nn = min(cand, key=lambda i: Dmat[i,j])
        # Only a sampled customer without an assignment is a constraint violation.
        if j in in_sample:
            viol.append(f"No assignment for {j}, fallback used.")
    labels_qubo[j] = nn

df["segment_qubo_medoid"] = labels_qubo
df["is_medoid"] = 0
df.loc[open_medoids, "is_medoid"] = 1

print("QUBO solved on", N_QUBO, "of", N, "customers")
print("Medoids selected:", len(open_medoids), "(target K =", K, ")")
if viol:
    print("Assignment issues:", viol[:5], ("... (+more)" if len(viol)>5 else ""))


In [None]:

import matplotlib.pyplot as plt

# Colour encodes the medoid each customer is attached to; medoids get an "x" overlay.
fig, ax = plt.subplots()
ax.scatter(df["x1"], df["x2"], c=df["segment_qubo_medoid"])
med = df["is_medoid"]==1
ax.scatter(df.loc[med,"x1"], df.loc[med,"x2"], marker='x')
ax.set_title("QUBO k-medoids Segments (color) + Medoids (x)")
ax.set_xlabel("x1")
ax.set_ylabel("x2")
fig.tight_layout()


In [None]:

# Optional QUBO targeting (knapsack with penalty)
import numpy as np, dimod, neal

profit_vec = df["profit"].values
cost_vec = df["cost"].values
BUD = float(budget)
n_cust = len(cost_vec)

# Penalty weight: large relative to any single customer's profit so that
# breaking the budget is never worth the profit gained.
lam = float(10.0 * max(1.0, np.max(np.abs(profit_vec))))

# --- Slack variables -----------------------------------------------------------
# A penalty lam * (sum_i c_i s_i - BUD)^2 enforces spend == BUD, so it punishes
# leaving budget unspent and pulls unprofitable customers into the solution.
# Adding a non-negative slack term turns it into spend + slack == BUD, i.e.
# spend <= BUD. The slack is a sum of binary variables whose weights
# (1, 2, 4, ..., remainder) * SLACK_STEP can represent every multiple of
# SLACK_STEP from 0 up to the budget.
SLACK_STEP = 0.1
n_steps = max(0, int(round(BUD / SLACK_STEP)))
slack_w = []
remaining, chunk = n_steps, 1
while remaining > 0:
    take = min(chunk, remaining)
    slack_w.append(take * SLACK_STEP)
    remaining -= take
    chunk *= 2

# Variables 0..n_cust-1 are customers, n_cust.. are slack bits; `weights[u]` is
# the amount variable u contributes to the left-hand side of spend + slack == BUD.
weights = np.concatenate([np.asarray(cost_vec, dtype=float), np.array(slack_w, dtype=float)])
n_vars = len(weights)

# Expansion of lam * (sum_u w_u b_u - BUD)^2 for binary b_u, constant dropped.
Q2 = {}
for u in range(n_vars):
    Q2[(u,u)] = float(lam * (weights[u]**2 - 2*BUD*weights[u]))
    for v in range(u+1, n_vars):
        Q2[(u,v)] = float(lam * (2*weights[u]*weights[v]))
# Objective: maximise profit == minimise -profit (customers only).
for i in range(n_cust):
    Q2[(i,i)] -= float(profit_vec[i])

bqm2 = dimod.BinaryQuadraticModel.from_qubo(Q2)
# Seeded so that re-running the cell reproduces the same sample set.
sampleset2 = neal.SimulatedAnnealingSampler().sample(bqm2, num_reads=1200, seed=2026)
best2 = sampleset2.first
s = np.array([best2.sample.get(i,0) for i in range(n_cust)], dtype=int)

spend = float((s * cost_vec).sum())
gain = float((s * profit_vec).sum())
df["target_qubo"] = s

print("QUBO targeting — spend:", round(spend,2), " (budget:", BUD, ")  | profit:", round(gain,2))
# The penalty is soft and the sampler heuristic, so feasibility is reported explicitly.
print("Within budget:", bool(spend <= BUD + 1e-9))



## Wrap-up & Comparison

- **Segmentation:** k-means (centroids) vs k-medoids (representatives).  
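- **Targeting:** the MILP solves the budgeted selection exactly (linear profit, hard budget constraint); the QUBO knapsack replaces the budget constraint with a quadratic penalty and is solved heuristically by simulated annealing, so its result is approximate.  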
- Adjust **K** and **budget** at the top to explore different behaviors.
