<a href="https://colab.research.google.com/github/Ashvitharavichandran/CI-CD/blob/main/MBA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [39]:
#library
import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import apriori, association_rules
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [40]:
#Load Dataset
df = pd.read_csv("Groceries_dataset.csv")
print(df.shape)

(38765, 3)


In [41]:
df.head()

Unnamed: 0,Member_number,Date,itemDescription
0,1808,21-07-2015,tropical fruit
1,2552,05-01-2015,whole milk
2,2300,19-09-2015,pip fruit
3,1187,12-12-2015,other vegetables
4,3037,01-02-2015,whole milk


In [None]:
# STEP 2: DATA CLEANING
# Convert date
df["Date"] = pd.to_datetime(df["Date"], dayfirst=True)
df["Date"]


In [47]:
# Clean itemDescription to avoid duplicate products
# - strip spaces
# - convert to lowercase
# - remove commas
df["itemDescription"] = (
    df["itemDescription"]
    .str.strip()
    .str.lower()
    .str.replace(",", "", regex=False)
)

In [48]:
print(df.shape)

(38765, 3)


In [49]:
# STEP 3: CREATE TRANSACTION ID
# Since there is NO invoice ID in this dataset,


df["Transaction_ID"] = (
    df["Member_number"].astype(str) + "_" +
    df["Date"].dt.strftime("%Y-%m-%d")
)

# Each Transaction_ID now represents one shopping basket


In [57]:
# STEP 4: CUSTOMER SEGMENTATION USING KMEANS

# Aggregate item-level data to customer-level behavior
customer_features = df.groupby("Member_number").agg(
    Num_Transactions=("Transaction_ID", "nunique"),  # how often customer shops
    Total_Items=("itemDescription", "count")          # how many items customer buys
)

customer_features.head()

Unnamed: 0_level_0,Num_Transactions,Total_Items
Member_number,Unnamed: 1_level_1,Unnamed: 2_level_1
1000,5,13
1001,5,12
1002,4,8
1003,4,8
1004,8,21


In [58]:
# 4.2 FEATURE SCALING (MANDATORY FOR KMEANS)

# KMeans is distance-based → features must be on same scale
scaler = StandardScaler()
scaled_features = scaler.fit_transform(customer_features)

In [59]:
#4.3 APPLY KMEANS CLUSTERING

# Create KMeans model with 3 clusters
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)



In [60]:
# Assign each customer to a cluster
customer_features["Segment"] = kmeans.fit_predict(scaled_features)

In [61]:
# 4.4 INTERPRET CLUSTERS USING CLUSTER CENTERS

# Convert cluster centers back to original scale
cluster_centers = pd.DataFrame(
    scaler.inverse_transform(kmeans.cluster_centers_),
    columns=customer_features.columns[:2]
)


In [62]:
# Add cluster id
cluster_centers["Cluster"] = cluster_centers.index

cluster_centers

Unnamed: 0,Num_Transactions,Total_Items,Cluster
0,4.454201,11.487682,0
1,2.192666,5.423749,1
2,6.943049,18.864322,2


In [64]:
# 4.5 MAP CLUSTERS TO MEANINGFUL SEGMENT NAMES

# Initialize all as Regular Buyers
cluster_centers["Segment_Name"] = "Regular Buyers"

# Cluster with highest transactions → Frequent Buyers
cluster_centers.loc[
    cluster_centers["Num_Transactions"].idxmax(),
    "Segment_Name"
] = "Frequent Buyers"

# Cluster with lowest transactions → Occasional Buyers
cluster_centers.loc[
    cluster_centers["Num_Transactions"].idxmin(),
    "Segment_Name"
] = "Occasional Buyers"


In [65]:
# Create mapping from cluster number to segment name
segment_map = cluster_centers.set_index("Cluster")["Segment_Name"].to_dict()
segment_map

{0: 'Regular Buyers', 1: 'Occasional Buyers', 2: 'Frequent Buyers'}

In [66]:
# Assign readable segment names to customers
customer_features["Segment_Name"] = customer_features["Segment"].map(segment_map)
customer_features

Unnamed: 0_level_0,Num_Transactions,Total_Items,Segment,Segment_Name
Member_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1000,5,13,0,Regular Buyers
1001,5,12,0,Regular Buyers
1002,4,8,0,Regular Buyers
1003,4,8,0,Regular Buyers
1004,8,21,2,Frequent Buyers
...,...,...,...,...
4996,3,10,0,Regular Buyers
4997,2,6,1,Occasional Buyers
4998,1,2,1,Occasional Buyers
4999,6,16,2,Frequent Buyers


STEP 5: MARKET BASKET ANALYSIS

In [67]:
# 5.1 CREATE BASKET MATRIX
# Rows → Transaction_ID (basket)
# Columns → itemDescription (products)
# Values → count of item in basket
basket = (
    df.groupby(["Transaction_ID", "itemDescription"])
    .size()
    .unstack(fill_value=0)
)

# Convert counts to binary (1 = item present, 0 = absent)
basket = (basket > 0).astype(int)
basket

itemDescription,abrasive cleaner,artif. sweetener,baby cosmetics,bags,baking powder,bathroom cleaner,beef,berries,beverages,bottled beer,...,uht-milk,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
Transaction_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1000_2014-06-24,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1000_2015-03-15,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
1000_2015-05-27,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1000_2015-07-24,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1000_2015-11-25,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4999_2015-05-16,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4999_2015-12-26,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5000_2014-03-09,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5000_2014-11-16,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [71]:
# 5.2 RUN APRIORI ALGORITHM
# Find frequent itemsets with minimum 2% support
frequent_itemsets = apriori(
    basket,
    min_support=0.001,
    use_colnames=True
)

In [72]:
# 5.3 GENERATE ASSOCIATION RULES
# Generate rules using lift as strength metric
rules = association_rules(
    frequent_itemsets,
    metric="lift",
    min_threshold=1.2
)

# View strongest rules
rules.sort_values("lift", ascending=False).head()


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
82,"(yogurt, whole milk)",(sausage),0.011161,0.060349,0.00147,0.131737,2.182917,1.0,0.000797,1.082219,0.548014,0.020992,0.075973,0.07805
87,(sausage),"(yogurt, whole milk)",0.060349,0.011161,0.00147,0.024363,2.182917,1.0,0.000797,1.013532,0.576701,0.020992,0.013351,0.07805
84,"(whole milk, sausage)",(yogurt),0.008955,0.085879,0.00147,0.164179,1.91176,1.0,0.000701,1.093681,0.481231,0.015748,0.085657,0.09065
85,(yogurt),"(whole milk, sausage)",0.085879,0.008955,0.00147,0.017121,1.91176,1.0,0.000701,1.008307,0.521727,0.015748,0.008239,0.09065
28,(specialty chocolate),(citrus fruit),0.015973,0.053131,0.001403,0.087866,1.653762,1.0,0.000555,1.038081,0.401735,0.020731,0.036684,0.057141


STEP 6: SEGMENT-WISE MARKET BASKET ANALYSIS

In [73]:
# 6.1 MERGE SEGMENT INFO BACK TO TRANSACTIONS
df_seg = df.merge(
    customer_features[["Segment_Name"]],
    left_on="Member_number",
    right_index=True
)

In [89]:
# 6.2 FUNCTION TO RUN MBA FOR A GIVEN SEGMENT

def mba_by_segment(segment_name):
    print("\nRunning for segment:", segment_name)

    temp = df_seg[df_seg["Segment_Name"] == segment_name]
    print("Rows after filtering:", temp.shape[0])

    if temp.empty:
        print(" No data for this segment")
        return pd.DataFrame()

    basket = (
        temp.groupby(["Transaction_ID", "itemDescription"])
        .size()
        .unstack(fill_value=0)
    )

    print("Basket shape:", basket.shape)

    if basket.empty:
        print(" Basket empty")
        return pd.DataFrame()

    basket = (basket > 0).astype(int)

    freq = apriori(basket, min_support=0.005, use_colnames=True)
    print("Frequent itemsets:", freq.shape)

    if freq.empty:
        print("No frequent itemsets")
        return pd.DataFrame()

    rules = association_rules(freq, metric="lift", min_threshold=1.1)
    print("Rules generated:", rules.shape)

    return rules.sort_values("lift", ascending=False)



In [90]:
# ------------------------------------------------------------
# 6.3 RUN SEGMENT-WISE MBA
# ------------------------------------------------------------

frequent_rules = mba_by_segment("Frequent Buyers")
regular_rules = mba_by_segment("Regular Buyers")
occasional_rules = mba_by_segment("Occasional Buyers")



Running for segment: Frequent Buyers
Rows after filtering: 11262
Basket shape: (4145, 164)
Frequent itemsets: (151, 2)
Rules generated: (18, 14)

Running for segment: Regular Buyers
Rows after filtering: 18185
Basket shape: (7051, 164)
Frequent itemsets: (127, 2)
Rules generated: (4, 14)

Running for segment: Occasional Buyers
Rows after filtering: 9318
Basket shape: (3767, 163)
Frequent itemsets: (116, 2)
Rules generated: (6, 14)


In [126]:
# STEP 7: CROSS-SELL RECOMMENDATIONS
recommendations = (
    frequent_rules
    .sort_values("lift", ascending=False)
    .head(10)[["antecedents", "consequents", "confidence", "lift"]]
)

recommendations.head()


Unnamed: 0,antecedents,consequents,confidence,lift
6,(rolls/buns),(chocolate),0.045952,1.685579
7,(chocolate),(rolls/buns),0.185841,1.685579
11,(rolls/buns),(frankfurter),0.056893,1.446752
10,(frankfurter),(rolls/buns),0.159509,1.446752
17,(sausage),(yogurt),0.109929,1.248373


In [129]:
recommendations = frequent_rules[
    (frequent_rules["confidence"] > 0.1) &
    (frequent_rules["lift"] > 1.4) &
    (frequent_rules["antecedents"].apply(lambda x: len(x) == 1))
][["antecedents", "consequents", "confidence", "lift"]]

recommendations.head(2)


Unnamed: 0,antecedents,consequents,confidence,lift
7,(chocolate),(rolls/buns),0.185841,1.685579
10,(frankfurter),(rolls/buns),0.159509,1.446752


In [121]:
print("Confidence > 0.4:",
      (frequent_rules["confidence"] > 0.1).sum())

print("Lift > 1.5:",
      (frequent_rules["lift"] > 1.5.).sum())


Confidence > 0.4: 7
Lift > 1.5: 0
