In [1]:
from pathlib import Path
import pandas as pd

BASE_DIR = Path.cwd()          # current working directory
DATA_DIR = BASE_DIR / "data"
MIN_ORDERS = 5

In [2]:
# 1. Load tables
customers = pd.read_csv(DATA_DIR / "olist_customers_dataset.csv")
orders    = pd.read_csv(DATA_DIR / "olist_orders_dataset.csv")
items     = pd.read_csv(DATA_DIR / "olist_order_items_dataset.csv")
reviews   = pd.read_csv(DATA_DIR / "olist_order_reviews_dataset.csv")
payments  = pd.read_csv(DATA_DIR / "olist_order_payments_dataset.csv")

# 2. Filter completed orders
orders_ok = orders[
    (orders.order_status == "delivered") &
    (orders.order_delivered_customer_date.notna())
]

# 3. Attach customer identity
orders_ok = orders_ok.merge(
    customers[["customer_id", "customer_unique_id"]],
    on="customer_id",
    how="left"
)

# 4. Aggregate items → order level
items_agg = (
    items
    .groupby("order_id")
    .agg(
        total_items=("order_item_id", "count"),
        total_price=("price", "sum"),
        total_freight=("freight_value", "sum"),
        avg_item_price=("price", "mean")
    )
    .reset_index()
)

# 5. Join everything
orders_enriched = (
    orders_ok
    .merge(items_agg, on="order_id", how="left")
    .merge(reviews[["order_id", "review_score"]], on="order_id", how="left")
)


In [3]:
for name, df in {
    "customers": customers,
    "orders": orders,
    "items": items,
    "reviews": reviews,
    "payments": payments
}.items():
    print(f"{name}: {df.shape}")


customers: (99441, 5)
orders: (99441, 8)
items: (112650, 7)
reviews: (99224, 7)
payments: (103886, 5)


In [4]:
customers[["customer_id", "customer_unique_id"]].nunique()

customer_id           99441
customer_unique_id    96096
dtype: int64

In [5]:
orders_ok = orders[
    (orders["order_status"] == "delivered") &
    (orders["order_delivered_customer_date"].notna())
].copy()

orders_ok.shape

(96470, 8)

In [6]:
orders_ok = orders_ok.merge(
    customers[["customer_id", "customer_unique_id"]],
    on="customer_id",
    how="left"
)

orders_ok["customer_unique_id"].isna().sum()

0

In [7]:
items_agg = (
    items
    .groupby("order_id")
    .agg(
        total_items=("order_item_id", "count"),
        total_price=("price", "sum"),
        total_freight=("freight_value", "sum"),
        avg_item_price=("price", "mean")
    )
    .reset_index()
)

In [8]:
reviews_agg = (
    reviews
    .groupby("order_id")
    .agg(
        review_score=("review_score", "mean")
    )
    .reset_index()
)

In [9]:
orders_enriched = (
    orders_ok
    .merge(items_agg, on="order_id", how="left")
    .merge(reviews_agg, on="order_id", how="left")
)

In [10]:
orders_enriched["order_purchase_timestamp"] = pd.to_datetime(
    orders_enriched["order_purchase_timestamp"],
    errors="raise"
)

In [11]:
assert orders_enriched["order_id"].is_unique
assert orders_enriched["customer_unique_id"].notna().all()
assert (orders_enriched["total_price"] >= 0).all()

In [12]:
orders_enriched["total_price"].describe()
orders_enriched["review_score"].value_counts(dropna=False)

review_score
5.000000    56697
4.000000    18868
1.000000     9312
3.000000     7915
2.000000     2916
NaN           646
4.500000       53
2.500000       30
3.500000       23
1.500000        8
3.333333        1
4.333333        1
Name: count, dtype: int64

In [13]:
reference_date = orders_enriched["order_purchase_timestamp"].max()
reference_date

Timestamp('2018-08-29 15:00:37')

# Customer Segmentation

In [14]:
customer_features = (
    orders_enriched
    .groupby("customer_unique_id")
    .agg(
        n_orders=("order_id", "count"),
        total_spend=("total_price", "sum"),
        avg_order_value=("total_price", "mean"),
        avg_items_per_order=("total_items", "mean"),
        avg_item_price=("avg_item_price", "mean"),

        first_purchase=("order_purchase_timestamp", "min"),
        last_purchase=("order_purchase_timestamp", "max"),
    )
    .reset_index()
)

In [15]:
customer_features["recency_days"] = (
    reference_date - customer_features["last_purchase"]
).dt.days

customer_features["customer_lifetime_days"] = (
    customer_features["last_purchase"] - customer_features["first_purchase"]
).dt.days

In [16]:
customer_features[[
    "n_orders",
    "total_spend",
    "recency_days"
]].describe(percentiles=[0.25, 0.5, 0.75, 0.9])

Unnamed: 0,n_orders,total_spend,recency_days
count,93350.0,93350.0,93350.0
mean,1.033423,141.620235,236.95007
std,0.209106,215.702028,152.589932
min,1.0,0.85,0.0
25%,1.0,47.65,113.0
50%,1.0,89.7,218.0
75%,1.0,154.6975,345.0
90%,1.0,279.99,465.0
max,15.0,13440.0,713.0


In [17]:
q_orders = customer_features["n_orders"].quantile([0.5, 0.75])
q_spend  = customer_features["total_spend"].quantile([0.5, 0.75])
q_rec    = customer_features["recency_days"].quantile([0.25, 0.5])

In [18]:
def customer_segment(row):
    if row.n_orders >= 3:
        return "Repeat customers"
    if row.n_orders == 2:
        return "Returning customers"
    if row.n_orders == 1 and row.total_spend >= customer_features["total_spend"].median():
        return "High value one-time"
    return "Low value one-time"

In [19]:
customer_features["customer_segment"] = customer_features.apply(
    customer_segment, axis=1
)

In [20]:
customer_features["customer_segment"].value_counts()

customer_segment
Low value one-time     46198
High value one-time    44351
Returning customers     2573
Repeat customers         228
Name: count, dtype: int64

In [21]:
customer_features.groupby("customer_segment")[[
    "n_orders",
    "total_spend",
    "recency_days"
]].mean()

Unnamed: 0_level_0,n_orders,total_spend,recency_days
customer_segment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
High value one-time,1.0,232.599135,236.653018
Low value one-time,1.0,47.098055,238.305706
Repeat customers,3.399123,426.025658,201.280702
Returning customers,2.0,245.345861,220.890789


# Product Segmentation

In [22]:
product_features = (
    items
    .groupby("product_id")
    .agg(
        n_orders=("order_id", "nunique"),
        total_units=("order_item_id", "count"),
        total_revenue=("price", "sum"),
        avg_price=("price", "mean"),
    )
    .reset_index()
)

In [23]:
product_features.describe(percentiles=[0.5, 0.75, 0.9])

Unnamed: 0,n_orders,total_units,total_revenue,avg_price
count,32951.0,32951.0,32951.0,32951.0
mean,3.108403,3.418713,412.480462,145.302464
std,9.456937,10.619709,1371.945598,246.895756
min,1.0,1.0,2.2,0.85
50%,1.0,1.0,136.75,79.0
75%,2.0,3.0,329.0,154.9
90%,6.0,6.0,801.6,295.225
max,467.0,527.0,63885.0,6735.0


In [24]:
items_with_customers = (
    items
    .merge(
        orders_enriched[["order_id", "customer_unique_id"]],
        on="order_id",
        how="left"
    )
    .merge(
        customer_features[["customer_unique_id", "customer_segment"]],
        on="customer_unique_id",
        how="left"
    )
)

In [25]:
product_segment_matrix = (
    items_with_customers
    .groupby(["product_id", "customer_segment"])
    .size()
    .unstack(fill_value=0)
)

In [26]:
def product_segment(row):
    if row["Repeat customers"] > row.sum() * 0.4:
        return "Loyalty products"
    if row["High value one-time"] > row.sum() * 0.5:
        return "Premium impulse"
    if row["Low value one-time"] > row.sum() * 0.5:
        return "Entry-level"
    return "Mixed audience"

In [27]:
product_segment_matrix

customer_segment,High value one-time,Low value one-time,Repeat customers,Returning customers
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
00066f42aeeb9f3007548bb9d3f33c38,1,0,0,0
00088930e925c41fd95ebfe695fd2655,1,0,0,0
0009406fd7479715e4bef61dd91f2462,1,0,0,0
000b8f95fcb9e0096488278317764d19,0,2,0,0
000d9be29b5207b54e86aa1b1ac54872,1,0,0,0
...,...,...,...,...
fff6177642830a9a94a0f2cba5e476d1,2,0,0,0
fff81cc3158d2725c0655ab9ba0f712c,1,0,0,0
fff9553ac224cec9d15d49f5a263411f,0,1,0,0
fffdb2d0ec8d6a61f0a0a0db3f25b441,0,5,0,0


# Normalize product preferences

In [28]:
product_segment_share = product_segment_matrix.div(
    product_segment_matrix.sum(axis=1),
    axis=0
)

In [29]:
product_segment_share.head()
product_segment_share.describe()

customer_segment,High value one-time,Low value one-time,Repeat customers,Returning customers
count,32214.0,32214.0,32214.0,32214.0
mean,0.483372,0.446048,0.01111,0.05947
std,0.480644,0.477458,0.093937,0.206153
min,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0
50%,0.369396,0.0,0.0,0.0
75%,1.0,1.0,0.0,0.0
max,1.0,1.0,1.0,1.0


In [30]:
def product_segment(row):
    if row["Repeat customers"] >= 0.4:
        return "Loyalty product"
    if row["Returning customers"] >= 0.4:
        return "Retention product"
    if row["High value one-time"] >= 0.5:
        return "Premium impulse"
    if row["Low value one-time"] >= 0.5:
        return "Entry-level"
    return "Mixed audience"

In [31]:
product_segments = (
    product_segment_share
    .apply(product_segment, axis=1)
    .rename("product_segment")
    .reset_index()
)

In [32]:
product_features = product_features.merge(
    product_segments,
    on="product_id",
    how="left"
)

In [33]:
product_features["product_segment"].value_counts()

product_segment
Premium impulse      15622
Entry-level          14261
Retention product     1883
Loyalty product        357
Mixed audience          91
Name: count, dtype: int64

In [34]:
recommendation_map = {
    "Low value one-time": ["Entry-level"],
    "High value one-time": ["Premium impulse"],
    "Returning customers": ["Retention product", "Mixed audience"],
    "Repeat customers": ["Loyalty product", "Mixed audience"],
}

In [35]:
def recommend_products(customer_id, top_n=10):
    segment = customer_features.loc[
        customer_features.customer_unique_id == customer_id,
        "customer_segment"
    ].iloc[0]

    target_segments = recommendation_map[segment]

    return (
        product_features[
            product_features.product_segment.isin(target_segments)
        ]
        .sort_values("total_revenue", ascending=False)
        .head(top_n)
    )

In [36]:
customer_products = (
    items_with_customers
    .groupby("customer_unique_id")["product_id"]
    .apply(set)
)

In [37]:
def recommend_products(customer_id, top_n=10):
    # 1. Identify customer segment
    segment = customer_features.loc[
        customer_features.customer_unique_id == customer_id,
        "customer_segment"
    ].iloc[0]

    # 2. Which product segments we allow for this customer
    target_segments = recommendation_map[segment]

    # 3. Build candidate set
    candidates = product_features[
        product_features.product_segment.isin(target_segments)
    ].copy()

    # 4. Attach affinity score (segment-aware!)
    affinity = (
        product_segment_share[[segment]]
        .rename(columns={segment: "affinity"})
        .reset_index()
    )

    candidates = candidates.merge(
        affinity,
        on="product_id",
        how="left"
    )

    # Missing affinity = 0
    candidates["affinity"] = candidates["affinity"].fillna(0)

    # 5. Rank: affinity first, revenue second
    return (
        candidates
        .sort_values(
            ["affinity", "total_revenue"],
            ascending=False
        )
        .head(top_n)
    )

In [38]:
customer_features["customer_segment"].sample(5)

76625     Low value one-time
8723     High value one-time
3545     High value one-time
72180     Low value one-time
3553     High value one-time
Name: customer_segment, dtype: object

In [39]:
customer_features[
    ["customer_unique_id", "customer_segment"]
].sample(10)

customer_id_1 = customer_features.loc[
    customer_features.customer_segment == "Low value one-time",
    "customer_unique_id"
].iloc[0]

customer_id_2 = customer_features.loc[
    customer_features.customer_segment == "Repeat customers",
    "customer_unique_id"
].iloc[0]

recommend_products(customer_id_1)
recommend_products(customer_id_2)

Unnamed: 0,product_id,n_orders,total_units,total_revenue,avg_price,product_segment,affinity
13,060965aa6dfa817b80abda0c9413cc77,2,2,1669.12,834.56,Loyalty product,1.0
370,d35d1fa56f3c5a2777f5542a84a085dd,1,1,1013.4,1013.4,Loyalty product,1.0
15,07755a0056db849613241c6d91eaf45f,1,1,849.0,849.0,Loyalty product,1.0
287,a3f487d2926f4af42e86454a8ec474c1,1,3,567.0,189.0,Loyalty product,1.0
184,65f037a133d9ffc71164a5d69dee187e,1,3,518.7,172.9,Loyalty product,1.0
194,6e2a68aa94eb2a3b5b3499bf2314d544,1,2,503.84,251.92,Loyalty product,1.0
298,ad1280b6c5b1af7cfd97372249be66c9,1,1,469.9,469.9,Loyalty product,1.0
24,0b00b16606aeb46da3543bbabff4d6b3,1,2,465.98,232.99,Loyalty product,1.0
111,3ed43e8f95f9801cd8099b6f73e07335,1,1,429.9,429.9,Loyalty product,1.0
166,596dd8bfc1289b4336ef857ffb960d47,1,1,429.9,429.9,Loyalty product,1.0


In [40]:
def sample_customers_by_segment(n=3):
    return (
        customer_features
        .groupby("customer_segment")
        .sample(n, random_state=42)
        [["customer_unique_id", "customer_segment"]]
    )

In [41]:
def recommend_products(customer_id, top_n=10):
    segment = customer_features.loc[
        customer_features.customer_unique_id == customer_id,
        "customer_segment"
    ].iloc[0]

    target_segments = recommendation_map[segment]

    return (
        product_features[
            (product_features.product_segment.isin(target_segments)) &
            (product_features.n_orders >= MIN_ORDERS)
        ]
        .sort_values("total_revenue", ascending=False)
        .head(top_n)
    )

In [42]:
customer_product_history = (
    items_with_customers
    .groupby(["customer_unique_id", "product_id"])
    .agg(
        n_purchases=("order_id", "count"),
        total_spent=("price", "sum")
    )
    .reset_index()
)

In [43]:
segment_product_affinity = (
    items_with_customers
    .groupby(["customer_segment", "product_id"])
    .size()
    .rename("segment_purchases")
    .reset_index()
)

In [44]:
segment_totals = (
    segment_product_affinity
    .groupby("customer_segment")["segment_purchases"]
    .sum()
    .rename("segment_total")
    .reset_index()
)

segment_product_affinity = segment_product_affinity.merge(
    segment_totals,
    on="customer_segment"
)

segment_product_affinity["affinity"] = (
    segment_product_affinity["segment_purchases"]
    / segment_product_affinity["segment_total"]
)

In [45]:
def recommend_products(customer_id, top_n=10):
    segment = customer_features.loc[
        customer_features.customer_unique_id == customer_id,
        "customer_segment"
    ].iloc[0]

    allowed_product_segments = recommendation_map[segment]

    candidates = (
        product_features[
            product_features.product_segment.isin(allowed_product_segments)
        ]
        .merge(
            segment_product_affinity[
                segment_product_affinity.customer_segment == segment
            ][["product_id", "affinity"]],
            on="product_id",
            how="left"
        )
        .fillna({"affinity": 0})
    )

    return (
        candidates
        .sort_values(
            ["affinity", "total_revenue"],
            ascending=False
        )
        .head(top_n)
    )

In [46]:
recommend_products(customer_id_1)
recommend_products(customer_id_2)

Unnamed: 0,product_id,n_orders,total_units,total_revenue,avg_price,product_segment,affinity
431,f3720bc68555b1bff49b9ffd41b017ac,7,20,3627.5,181.375,Loyalty product,0.011976
83,2fb9e46750ac55362f7b642f12b5835b,2,8,120.0,15.0,Loyalty product,0.007984
292,a8d2c5e8f29550a539f377d977f10a52,5,7,638.0,91.142857,Loyalty product,0.005988
201,70906e04da1eebf3d1b8791bd09ffe85,12,17,1193.39,70.199412,Mixed audience,0.00499
87,325e75d20ca67d859f707129be35878e,2,5,224.95,44.99,Loyalty product,0.00499
342,c2b534c5a4a6cbfc41aeaf362fb0c060,1,5,102.5,20.5,Loyalty product,0.00499
289,a5b15c8b0abb9a1e7eb10546441925d0,3,5,60.0,12.0,Loyalty product,0.00499
318,b655ebf10fa7727c97d82cffcfe96ab9,1,5,47.5,9.5,Loyalty product,0.00499
1,00faa46f36261af8bbf3a4d37fa4841b,10,10,2800.0,280.0,Loyalty product,0.003992
59,202bd859659a841de892b00c341300ff,11,13,455.0,35.0,Mixed audience,0.003992


In [47]:
def recommend_products(customer_id, top_n=10):
    segment = customer_features.loc[
        customer_features.customer_unique_id == customer_id,
        "customer_segment"
    ].iloc[0]

    target_segments = recommendation_map[segment]

    # products already bought by this customer
    bought_products = customer_product_history.loc[
        customer_product_history.customer_unique_id == customer_id,
        "product_id"
    ]

    candidates = (
        product_features[
            product_features.product_segment.isin(target_segments)
        ]
        .merge(
            segment_product_affinity[
                segment_product_affinity.customer_segment == segment
            ][["product_id", "affinity"]],
            on="product_id",
            how="left"
        )
        .fillna({"affinity": 0})
    )

    candidates = candidates[
        ~candidates.product_id.isin(bought_products)
    ]

    return (
        candidates
        .sort_values(
            ["affinity", "total_revenue"],
            ascending=False
        )
        .head(top_n)
    )

In [48]:
# 1. Compare two customers from different segments
recommend_products(customer_features.customer_unique_id.iloc[0])
recommend_products(customer_features.customer_unique_id.iloc[-1])

Unnamed: 0,product_id,n_orders,total_units,total_revenue,avg_price,product_segment,affinity
9525,aca2eb7d00ea1a7b8ebd4e68314663af,431,527,37608.9,71.364137,Entry-level,0.006669
1199,154e7e31ebfa092203795c972e5804a6,269,281,6325.19,22.509573,Entry-level,0.00448
3147,389d119b48cf3043d311335e499d9c6b,311,392,21440.59,54.695383,Entry-level,0.004075
3015,368c6c730842d78016ad823897a372db,291,388,21056.8,54.270103,Entry-level,0.003933
6932,7c1bd920dbdf22470b68bde975dd3ccf,225,231,13866.69,60.028961,Entry-level,0.003689
9317,a92930c327948861c015c919a0bcb4a8,158,160,12475.0,77.96875,Entry-level,0.002939
604,0aabfb375647d9738ad0f7b4ea3653b1,138,142,3416.7,24.061268,Entry-level,0.002757
3544,3fbc0ef745950c7932d5f2a446189725,144,150,9862.5,65.75,Entry-level,0.002635
12467,e0cf79767c5b016251fe139915c59a26,131,137,4096.3,29.9,Entry-level,0.002635
397,06edb72f1e0c64b14c5b79353f7abea3,130,143,5831.77,40.781608,Entry-level,0.002534


In [49]:
items_with_customers = (
    items_with_customers
    .merge(
        orders_enriched[["order_id", "order_purchase_timestamp"]],
        on="order_id",
        how="left"
    )
)

In [50]:
items_with_customers["order_purchase_timestamp"].isna().sum()

2461

In [51]:
items_with_customers["days_ago"] = (
    reference_date - items_with_customers["order_purchase_timestamp"]
).dt.days

In [52]:
import numpy as np

LAMBDA = 0.01   # decay speed (safe default)

items_with_customers["time_weight"] = np.exp(
    -LAMBDA * items_with_customers["days_ago"]
)

In [53]:
items_with_customers[["order_purchase_timestamp", "days_ago", "time_weight"]].head()

Unnamed: 0,order_purchase_timestamp,days_ago,time_weight
0,2017-09-13 08:59:02,350.0,0.030197
1,2017-04-26 10:53:06,490.0,0.007447
2,2018-01-14 14:33:31,227.0,0.103312
3,2018-08-08 10:00:35,21.0,0.810584
4,2017-02-04 13:57:51,571.0,0.003313


In [54]:
def recommend_products_static(customer_id, top_n=10):
    segment = customer_features.loc[
        customer_features.customer_unique_id == customer_id,
        "customer_segment"
    ].iloc[0]

    target_segments = recommendation_map[segment]

    return (
        product_features[
            product_features.product_segment.isin(target_segments)
        ]
        .sort_values("total_revenue", ascending=False)
        .head(top_n)
        .assign(method="static")
    )

In [55]:
product_affinity_time = (
    items_with_customers
    .groupby(["product_id", "customer_segment"])
    .agg(
        affinity=("time_weight", "sum")
    )
    .reset_index()
)

In [56]:
product_affinity_time["affinity"] = (
    product_affinity_time
    .groupby("customer_segment")["affinity"]
    .transform(lambda x: x / x.sum())
)

In [57]:
product_affinity_time = product_affinity_time.merge(
    product_features,
    on="product_id",
    how="left"
)

In [58]:
def recommend_products_timeaware(customer_id, top_n=10):
    segment = customer_features.loc[
        customer_features.customer_unique_id == customer_id,
        "customer_segment"
    ].iloc[0]

    target_segments = recommendation_map[segment]

    return (
        product_affinity_time[
            (product_affinity_time.customer_segment == segment) &
            (product_affinity_time.product_segment.isin(target_segments))
        ]
        .sort_values("affinity", ascending=False)
        .head(top_n)
        .assign(method="time_aware")
    )

In [59]:
customer_id = customer_features.customer_unique_id.sample(1).iloc[0]

In [60]:
static_rec = recommend_products_static(customer_id)
time_rec   = recommend_products_timeaware(customer_id)

comparison = pd.concat([
    static_rec[["product_id", "total_revenue", "product_segment", "method"]],
    time_rec[["product_id", "affinity", "product_segment", "method"]],
])

comparison

Unnamed: 0,product_id,total_revenue,product_segment,method,affinity
8613,422879e10f46682990de24d770e7f83d,26577.22,Mixed audience,static,
10840,53759a2ecddad2bb87a079a1f1519f73,20387.2,Mixed audience,static,
8227,3f14d740544f37ece8a9e7bc8349797e,7731.03,Mixed audience,static,
28327,dbb67791e405873b259e4656bf971246,6975.37,Mixed audience,static,
7032,362b773250263786dd58670d2df42c3b,3782.7,Mixed audience,static,
31347,f3720bc68555b1bff49b9ffd41b017ac,3627.5,Loyalty product,static,
28325,dbb4ce89c8ed5fb6fd901e2e51093179,3623.95,Mixed audience,static,
19262,962a6951154f98f2c8e9a5b8b2bcf4a9,3459.8,Mixed audience,static,
114,00faa46f36261af8bbf3a4d37fa4841b,2800.0,Loyalty product,static,
11575,593236d0ff46b4299b4787fb8d43f7f0,2742.0,Mixed audience,static,


In [61]:
overlap = set(static_rec.product_id) & set(time_rec.product_id)

len(overlap), overlap

(0, set())

In [62]:
def recommend_for_all_customers(top_n=10):
    results = []

    for customer_id in customer_features.customer_unique_id:
        recs = recommend_products_timeaware(customer_id, top_n=top_n)
        recs = recs.assign(customer_unique_id=customer_id)
        results.append(recs)

    return pd.concat(results, ignore_index=True)

In [63]:
final_recommendations = recommend_for_all_customers(top_n=10)

In [64]:
final_recommendations.to_csv(
    DATA_DIR / "recommendations_timeaware.csv",
    index=False
)

customer_features.to_csv(
    DATA_DIR / "customer_segments.csv",
    index=False
)

product_features.to_csv(
    DATA_DIR / "product_segments.csv",
    index=False
)

In [65]:
final_recommendations.groupby("product_segment").size()

product_segment
Entry-level          461980
Loyalty product        2052
Mixed audience        13093
Premium impulse      443510
Retention product     12865
dtype: int64

In [66]:
dataset_overview = pd.DataFrame({
    "table": [
        "customers",
        "orders",
        "order_items",
        "reviews"
    ],
    "rows": [
        customers.shape[0],
        orders.shape[0],
        items.shape[0],
        reviews.shape[0]
    ],
    "columns": [
        customers.shape[1],
        orders.shape[1],
        items.shape[1],
        reviews.shape[1]
    ]
})

dataset_overview


Unnamed: 0,table,rows,columns
0,customers,99441,5
1,orders,99441,8
2,order_items,112650,7
3,reviews,99224,7


In [67]:
podstawowe_statystyki = pd.DataFrame({
    "Metryka": [
        "Liczba rekordów klientów",
        "Liczba unikalnych klientów",
        "Liczba zamówień (wszystkie)",
        "Liczba zamówień dostarczonych",
    ],
    "Wartość": [
        customers.shape[0],
        customers["customer_unique_id"].nunique(),
        orders.shape[0],
        orders_ok.shape[0],
    ]
})

podstawowe_statystyki

Unnamed: 0,Metryka,Wartość
0,Liczba rekordów klientów,99441
1,Liczba unikalnych klientów,96096
2,Liczba zamówień (wszystkie),99441
3,Liczba zamówień dostarczonych,96470


In [68]:
klienci_zamowienia = (
    customer_features
    .groupby("n_orders")
    .size()
    .reset_index(name="liczba_klientów")
)

klienci_zamowienia["procent_klientów"] = (
    klienci_zamowienia["liczba_klientów"]
    / klienci_zamowienia["liczba_klientów"].sum()
    * 100
).round(2)

klienci_zamowienia


Unnamed: 0,n_orders,liczba_klientów,procent_klientów
0,1,90549,97.0
1,2,2573,2.76
2,3,181,0.19
3,4,28,0.03
4,5,9,0.01
5,6,5,0.01
6,7,3,0.0
7,9,1,0.0
8,15,1,0.0


In [69]:
segmenty_klientów = (
    customer_features
    .groupby("customer_segment")
    .agg(
        liczba_klientów=("customer_unique_id", "count"),
        łączny_przychód=("total_spend", "sum")
    )
    .reset_index()
)

segmenty_klientów["procent_klientów"] = (
    segmenty_klientów["liczba_klientów"]
    / segmenty_klientów["liczba_klientów"].sum()
    * 100
).round(2)

segmenty_klientów["procent_przychodu"] = (
    segmenty_klientów["łączny_przychód"]
    / segmenty_klientów["łączny_przychód"].sum()
    * 100
).round(2)

segmenty_klientów


Unnamed: 0,customer_segment,liczba_klientów,łączny_przychód,procent_klientów,procent_przychodu
0,High value one-time,44351,10316004.24,47.51,78.03
1,Low value one-time,46198,2175835.94,49.49,16.46
2,Repeat customers,228,97133.85,0.24,0.73
3,Returning customers,2573,631274.9,2.76,4.78


In [70]:
produkty_statystyki = pd.DataFrame({
    "Metryka": [
        "Liczba unikalnych produktów",
        "Produkty sprzedane tylko raz (%)",
        "Produkty sprzedane ≤ 3 razy (%)"
    ],
    "Wartość": [
        product_features.shape[0],
        (product_features["n_orders"] == 1).mean() * 100,
        (product_features["n_orders"] <= 3).mean() * 100
    ]
}).round(2)

produkty_statystyki


Unnamed: 0,Metryka,Wartość
0,Liczba unikalnych produktów,32951.0
1,Produkty sprzedane tylko raz (%),59.37
2,Produkty sprzedane ≤ 3 razy (%),83.19


In [71]:
segmenty_produktów = (
    product_features
    .groupby("product_segment")
    .size()
    .reset_index(name="liczba_produktów")
)

segmenty_produktów["procent_produktów"] = (
    segmenty_produktów["liczba_produktów"]
    / segmenty_produktów["liczba_produktów"].sum()
    * 100
).round(2)

segmenty_produktów


Unnamed: 0,product_segment,liczba_produktów,procent_produktów
0,Entry-level,14261,44.27
1,Loyalty product,357,1.11
2,Mixed audience,91,0.28
3,Premium impulse,15622,48.49
4,Retention product,1883,5.85


In [72]:
segmenty_produktów_przychód = (
    product_features
    .groupby("product_segment")
    .agg(
        liczba_produktów=("product_id", "count"),
        łączny_przychód=("total_revenue", "sum")
    )
    .reset_index()
)

segmenty_produktów_przychód["procent_przychodu"] = (
    segmenty_produktów_przychód["łączny_przychód"]
    / segmenty_produktów_przychód["łączny_przychód"].sum()
    * 100
).round(2)

segmenty_produktów_przychód


Unnamed: 0,product_segment,liczba_produktów,łączny_przychód,procent_przychodu
0,Entry-level,14261,2291391.18,17.07
1,Loyalty product,357,64897.45,0.48
2,Mixed audience,91,117917.32,0.88
3,Premium impulse,15622,10580623.55,78.82
4,Retention product,1883,368757.16,2.75


In [73]:
macierz_segmentów = (
    items_with_customers
    .merge(
        product_features[["product_id", "product_segment"]],
        on="product_id",
        how="left"
    )
    .groupby(["customer_segment", "product_segment"])
    .size()
    .reset_index(name="liczba_zakupów")
    .sort_values("liczba_zakupów", ascending=False)
)

macierz_segmentów.head(10)


Unnamed: 0,customer_segment,product_segment,liczba_zakupów
3,High value one-time,Premium impulse,48764
5,Low value one-time,Entry-level,45653
0,High value one-time,Entry-level,3577
19,Returning customers,Retention product,2555
8,Low value one-time,Premium impulse,2248
15,Returning customers,Entry-level,1770
18,Returning customers,Premium impulse,1604
7,Low value one-time,Mixed audience,867
2,High value one-time,Mixed audience,758
9,Low value one-time,Retention product,495


In [74]:
podsumowanie = pd.DataFrame({
    "Wniosek": [
        "Ponad 95% klientów to klienci jednorazowi",
        "Segment lojalny generuje wysoki przychód mimo małej liczby klientów",
        "Większość produktów sprzedaje się bardzo rzadko",
        "Rekomendacje oparte są na realnych wzorcach zakupowych"
    ]
})

podsumowanie


Unnamed: 0,Wniosek
0,Ponad 95% klientów to klienci jednorazowi
1,Segment lojalny generuje wysoki przychód mimo ...
2,Większość produktów sprzedaje się bardzo rzadko
3,Rekomendacje oparte są na realnych wzorcach za...


In [75]:
# Sprawdzenie zgodności rekomendacji z faktycznymi zakupami

# Mapowanie: segment klienta -> dozwolone segmenty produktów
recommendation_map = {
    "Low value one-time": ["Entry-level"],
    "High value one-time": ["Premium impulse"],
    "Returning customers": ["Retention product", "Mixed audience"],
    "Repeat customers": ["Loyalty product", "Mixed audience"],
}

# Dołączenie segmentu produktu do pozycji zamówień
items_eval = (
    items_with_customers
    .merge(
        product_features[["product_id", "product_segment"]],
        on="product_id",
        how="left"
    )
)

# Sprawdzenie czy zakup pasuje do rekomendowanego segmentu
items_eval["is_recommended_match"] = items_eval.apply(
    lambda row: row["product_segment"] in recommendation_map.get(row["customer_segment"], []),
    axis=1
)

# Agregacja wyników walidacji
validation_summary = (
    items_eval
    .groupby("customer_segment")
    .agg(
        liczba_zakupow=("product_id", "count"),
        trafione_rekomendacje=("is_recommended_match", "sum")
    )
)

validation_summary["trafnosc_%"] = (
    validation_summary["trafione_rekomendacje"] /
    validation_summary["liczba_zakupow"] * 100
).round(2)

validation_summary


Unnamed: 0_level_0,liczba_zakupow,trafione_rekomendacje,trafnosc_%
customer_segment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
High value one-time,53646,48764,90.9
Low value one-time,49330,45653,92.55
Repeat customers,1002,534,53.29
Returning customers,6211,2818,45.37


In [76]:
from pathlib import Path
import numpy as np
import pandas as pd

BASE_DIR = Path.cwd()
DATA_DIR = BASE_DIR / "data"

# Load only what we need for customer-level features
customers = pd.read_csv(DATA_DIR / "olist_customers_dataset.csv")
orders    = pd.read_csv(DATA_DIR / "olist_orders_dataset.csv")
items     = pd.read_csv(DATA_DIR / "olist_order_items_dataset.csv")

# Delivered orders only (same spirit as your notebook)
orders_ok = orders[
    (orders["order_status"] == "delivered") &
    (orders["order_delivered_customer_date"].notna())
].copy()

# Attach customer_unique_id
orders_ok = orders_ok.merge(
    customers[["customer_id", "customer_unique_id"]],
    on="customer_id",
    how="left"
)

# Aggregate item-level to order-level
items_agg = (
    items.groupby("order_id")
    .agg(
        total_items=("order_item_id", "count"),
        total_price=("price", "sum"),
        avg_item_price=("price", "mean"),
    )
    .reset_index()
)

orders_enriched = orders_ok.merge(items_agg, on="order_id", how="left")

orders_enriched["order_purchase_timestamp"] = pd.to_datetime(
    orders_enriched["order_purchase_timestamp"], errors="raise"
)

# sanity
assert orders_enriched["customer_unique_id"].notna().all()


In [77]:
reference_date = orders_enriched["order_purchase_timestamp"].max()

customer_features = (
    orders_enriched
    .groupby("customer_unique_id")
    .agg(
        n_orders=("order_id", "count"),
        total_spend=("total_price", "sum"),
        avg_order_value=("total_price", "mean"),
        avg_items_per_order=("total_items", "mean"),
        first_purchase=("order_purchase_timestamp", "min"),
        last_purchase=("order_purchase_timestamp", "max"),
    )
    .reset_index()
)

customer_features["recency_days"] = (reference_date - customer_features["last_purchase"]).dt.days
customer_features["customer_lifetime_days"] = (customer_features["last_purchase"] - customer_features["first_purchase"]).dt.days

customer_features[["n_orders","total_spend","recency_days"]].describe()


Unnamed: 0,n_orders,total_spend,recency_days
count,93350.0,93350.0,93350.0
mean,1.033423,141.620235,236.95007
std,0.209106,215.702028,152.589932
min,1.0,0.85,0.0
25%,1.0,47.65,113.0
50%,1.0,89.7,218.0
75%,1.0,154.6975,345.0
max,15.0,13440.0,713.0


In [78]:
spend_median = customer_features["total_spend"].median()

def customer_segment(row):
    if row.n_orders >= 3:
        return "Repeat customers"
    if row.n_orders == 2:
        return "Returning customers"
    if row.n_orders == 1 and row.total_spend >= spend_median:
        return "High value one-time"
    return "Low value one-time"

customer_features["customer_segment"] = customer_features.apply(customer_segment, axis=1)

customer_features["customer_segment"].value_counts(normalize=True).round(3)


customer_segment
Low value one-time     0.495
High value one-time    0.475
Returning customers    0.028
Repeat customers       0.002
Name: proportion, dtype: float64

In [79]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Keep it aligned with your segmentation intent: frequency, monetary, recency
X = customer_features[["n_orders", "total_spend", "recency_days"]].copy()

# handle skew
X["n_orders_log"] = np.log1p(X["n_orders"])
X["total_spend_log"] = np.log1p(X["total_spend"])

X_model = X[["n_orders_log", "total_spend_log", "recency_days"]]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_model)

kmeans = KMeans(n_clusters=4, n_init="auto", random_state=42)
customer_features["kmeans_cluster"] = kmeans.fit_predict(X_scaled)

customer_features["kmeans_cluster"].value_counts().sort_index()


kmeans_cluster
0    27138
1    31021
2     2801
3    32390
Name: count, dtype: int64

In [80]:
from sklearn.metrics import (
    silhouette_score,
    davies_bouldin_score,
    calinski_harabasz_score,
    adjusted_rand_score,
    normalized_mutual_info_score
)

# --- (A) Cluster quality (internal, for KMeans only) ---
sil = silhouette_score(X_scaled, customer_features["kmeans_cluster"])
db  = davies_bouldin_score(X_scaled, customer_features["kmeans_cluster"])
ch  = calinski_harabasz_score(X_scaled, customer_features["kmeans_cluster"])

quality_metrics = pd.DataFrame([{
    "silhouette (higher better)": sil,
    "davies_bouldin (lower better)": db,
    "calinski_harabasz (higher better)": ch
}]).round(4)

# --- (B) Agreement between rule segments and kmeans partition ---
ari = adjusted_rand_score(customer_features["customer_segment"], customer_features["kmeans_cluster"])
nmi = normalized_mutual_info_score(customer_features["customer_segment"], customer_features["kmeans_cluster"])

agreement_metrics = pd.DataFrame([{
    "ARI rule vs kmeans (higher better)": ari,
    "NMI rule vs kmeans (higher better)": nmi
}]).round(4)

# --- (C) Stability of KMeans (does it change with seed?) ---
labels = []
for seed in [0, 1, 2]:
    km = KMeans(n_clusters=4, n_init="auto", random_state=seed)
    labels.append(km.fit_predict(X_scaled))

stability = pd.DataFrame([{
    "ARI(seed0 vs seed1)": adjusted_rand_score(labels[0], labels[1]),
    "ARI(seed0 vs seed2)": adjusted_rand_score(labels[0], labels[2]),
}]).round(4)

# --- (D) Cross-tab: rule segment vs cluster (counts + row %) ---
ct_counts = pd.crosstab(customer_features["customer_segment"], customer_features["kmeans_cluster"])
ct_rowpct = pd.crosstab(
    customer_features["customer_segment"],
    customer_features["kmeans_cluster"],
    normalize="index"
).round(3)

# --- (E) Cluster profiling (so you can name clusters) ---
cluster_profile = (
    customer_features
    .groupby("kmeans_cluster")
    .agg(
        size=("customer_unique_id", "count"),
        pct=("customer_unique_id", lambda s: round(len(s)/len(customer_features)*100, 2)),
        n_orders_median=("n_orders", "median"),
        total_spend_median=("total_spend", "median"),
        recency_median=("recency_days", "median"),
    )
    .sort_values("total_spend_median", ascending=False)
)

quality_metrics, agreement_metrics, stability, ct_counts, ct_rowpct, cluster_profile

(   silhouette (higher better)  davies_bouldin (lower better)  \
 0                      0.3659                         0.7822   
 
    calinski_harabasz (higher better)  
 0                          69332.188  ,
    ARI rule vs kmeans (higher better)  NMI rule vs kmeans (higher better)
 0                              0.4909                              0.5678,
    ARI(seed0 vs seed1)  ARI(seed0 vs seed2)
 0               0.9975               0.9964,
 kmeans_cluster           0      1     2      3
 customer_segment                              
 High value one-time  12623      0     0  31728
 Low value one-time   14515  31021     0    662
 Repeat customers         0      0   228      0
 Returning customers      0      0  2573      0,
 kmeans_cluster           0      1    2      3
 customer_segment                             
 High value one-time  0.285  0.000  0.0  0.715
 Low value one-time   0.314  0.671  0.0  0.014
 Repeat customers     0.000  0.000  1.0  0.000
 Returning customers 

In [81]:
import numpy as np
import pandas as pd

from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score

# ---- 0) sanity check ----
required_cols = {"customer_segment", "kmeans_cluster"}
missing = required_cols - set(customer_features.columns)
if missing:
    raise ValueError(f"Missing columns in customer_features: {missing}")

y_rule = customer_features["customer_segment"].astype(str)
y_km   = customer_features["kmeans_cluster"].astype(int)

# ---- 1) Agreement metrics (ARI, NMI) ----
ari = adjusted_rand_score(y_rule, y_km)
nmi = normalized_mutual_info_score(y_rule, y_km)

agreement_metrics = pd.DataFrame([{
    "ARI rule vs kmeans (higher better)": ari,
    "NMI rule vs kmeans (higher better)": nmi
}]).round(4)

agreement_metrics


Unnamed: 0,ARI rule vs kmeans (higher better),NMI rule vs kmeans (higher better)
0,0.4909,0.5678


In [82]:
# ---- 2) Crosstab counts ----
ct_counts = pd.crosstab(
    index=customer_features["customer_segment"],
    columns=customer_features["kmeans_cluster"]
).sort_index()

ct_counts

kmeans_cluster,0,1,2,3
customer_segment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
High value one-time,12623,0,0,31728
Low value one-time,14515,31021,0,662
Repeat customers,0,0,228,0
Returning customers,0,0,2573,0


In [83]:
# ---- 3) Row-normalized matrix (heatmap-style table) ----
ct_rowpct = pd.crosstab(
    index=customer_features["customer_segment"],
    columns=customer_features["kmeans_cluster"],
    normalize="index"
)

# version as % with 1 decimal (e.g., 71.5%)
ct_rowpct_pct = (ct_rowpct * 100).round(1)

ct_rowpct_pct

kmeans_cluster,0,1,2,3
customer_segment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
High value one-time,28.5,0.0,0.0,71.5
Low value one-time,31.4,67.1,0.0,1.4
Repeat customers,0.0,0.0,100.0,0.0
Returning customers,0.0,0.0,100.0,0.0


In [84]:
def top_cluster_share(row, top_n=2):
    # row is a Series of percentages
    s = row.sort_values(ascending=False).head(top_n)
    parts = [f"{val:.1f}% cluster {idx}" for idx, val in s.items()]
    return " + ".join(parts)

summary_lines = (
    ct_rowpct_pct
    .apply(lambda r: top_cluster_share(r, top_n=2), axis=1)
    .rename("Top shares (row-normalized)")
    .to_frame()
)

summary_lines

Unnamed: 0_level_0,Top shares (row-normalized)
customer_segment,Unnamed: 1_level_1
High value one-time,71.5% cluster 3 + 28.5% cluster 0
Low value one-time,67.1% cluster 1 + 31.4% cluster 0
Repeat customers,100.0% cluster 2 + 0.0% cluster 0
Returning customers,100.0% cluster 2 + 0.0% cluster 0
