In [5]:
!pip install pandas numpy matplotlib seaborn scikit-learn plotly streamlit



In [6]:
import pandas as pd
import numpy as np

# Build a synthetic customer table and write it to "colgate_customers.csv".
#
# The legacy global NumPy RNG is seeded so every draw below is reproducible.
# NOTE: the generated values depend on the ORDER of the np.random.* calls
# (ages, income, product, transactions, spend, recency, loyalty noise);
# reordering them changes the dataset.
np.random.seed(42)

n = 5000  # number of customers

# Ages are integers in [18, 64] (randint's upper bound is exclusive).
ages = np.random.randint(18, 65, n)
# Income ~ Normal(45000, 12000). The lower tail is clipped at 0 so that an
# extreme draw cannot produce a negative income.
income = np.clip(np.random.normal(45000, 12000, n), 0, None).astype(int)

# Product preferences: one product per customer, drawn uniformly.
products = ["Toothpaste", "Mouthwash", "Toothbrush", "Whitening Kit"]
product_pref = np.random.choice(products, n)

# Purchase behavior
transactions = np.random.poisson(8, n)
# Average spend ~ Normal(7.5, 2.5). Zero is three standard deviations below
# the mean, so a handful of the 5000 draws would otherwise be negative and
# yield a negative monetary value downstream; clip them to 0.
avg_spend = np.round(np.clip(np.random.normal(7.5, 2.5, n), 0, None), 2)
# Days since last purchase, integers in [1, 119].
recency = np.random.randint(1, 120, n)

# Loyalty score based on synthetic logic: a penalty that grows with fewer
# transactions and longer recency, plus unit Gaussian noise.
loyalty_penalty = np.round(
    0.4 * (20 - transactions)
    + 0.6 * (recency / 10)
    + np.random.normal(0, 1, n),
    2,
)
# Convert the penalty to a score and bound it to [0, 100]. Clipping (rather
# than taking the absolute value) keeps scores from exceeding 100 when the
# penalty is negative, and from being reflected back to a positive score if
# the penalty ever overshoots 100.
loyalty = np.clip(100 - (loyalty_penalty * 3), 0, 100).astype(int)

df = pd.DataFrame({
    "customer_id": np.arange(1, n + 1),
    "age": ages,
    "income": income,
    "preferred_product": product_pref,
    "transactions_last_year": transactions,
    "avg_spend": avg_spend,
    "recency_days": recency,
    "loyalty_score": loyalty,
})

# Sanity checks on the generated table before it is persisted.
assert df["customer_id"].is_unique
assert df["avg_spend"].ge(0).all()
assert df["loyalty_score"].between(0, 100).all()

df.to_csv("colgate_customers.csv", index=False)
df.head()

Unnamed: 0,customer_id,age,income,preferred_product,transactions_last_year,avg_spend,recency_days,loyalty_score
0,1,56,46738,Toothbrush,13,6.34,94,72
1,2,46,45286,Mouthwash,5,8.27,71,65
2,3,32,40783,Whitening Kit,12,10.36,20,89
3,4,60,63760,Toothbrush,4,9.85,39,76
4,5,25,35178,Whitening Kit,9,10.08,3,86


In [7]:
# Derive Recency / Frequency / Monetary columns on the customer table.
# R and F are straight copies of existing columns; M is the product of the
# average spend and the number of transactions in the last year.
rfm_direct_sources = {
    "R": "recency_days",
    "F": "transactions_last_year",
}
for rfm_name, source_name in rfm_direct_sources.items():
    df[rfm_name] = df[source_name]
df["M"] = df["avg_spend"].mul(df["transactions_last_year"])

# Keep only the identifier and the three RFM features for the next steps.
rfm_columns = ["customer_id", "R", "F", "M"]
rfm = df[rfm_columns]
rfm.head()


Unnamed: 0,customer_id,R,F,M
0,1,94,13,82.42
1,2,71,5,41.35
2,3,20,12,124.32
3,4,39,4,39.4
4,5,3,9,90.72


In [8]:
import joblib
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Segment customers with k-means on the three RFM features.
X = rfm[["R", "F", "M"]]

# Standardise the features first: k-means uses Euclidean distance, and R, F
# and M are on different numeric scales.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# n_init is set explicitly because its default changed between scikit-learn
# releases (10 -> 'auto'); pinning it keeps the cluster assignment
# reproducible across versions and avoids the associated FutureWarning.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
df["cluster"] = kmeans.fit_predict(X_scaled)

# Persist the labelled customers plus the fitted scaler and model so they
# can be reloaded later; new data must be transformed with this same scaler
# before calling kmeans.predict.
df.to_csv("colgate_customers_clustered.csv", index=False)
joblib.dump(kmeans, "colgate_kmeans_model.pkl")
joblib.dump(scaler, "colgate_scaler.pkl")


['colgate_scaler.pkl']