## Recommendation Engines
A Focus Friday Feature on Data Science<br>
Machine Learning - Association Rule Mining<br><br>
<span style="color:blue"> Created with: Gemini 2.5 Flash<br>
Modified by: Bertrand Tan, MDDI<br>
Date: 29 Jul 2025</span><hr>

In [25]:
# --- Google Colab Setup ---
# Install necessary libraries: mlxtend for association rules, plotly for interactive charts.
!pip install mlxtend plotly



In [26]:
# --- Import Libraries ---
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder
import plotly.express as px # Used for interactive visualizations

In [27]:
# Generate Synthetic Transactional Data (50 rows)
# This dataset simulates customer transactions with demographic details and varying item counts.
data = {
    'TransactionID': list(range(1, 51)),
    'Gender': ['Female', 'Male'] * 25, # Alternating Gender for 50 rows
    'Occupation': ['Engineer', 'Student', 'Teacher', 'Engineer', 'Student'] * 10, # Cycle through occupations
    'HomeOwnership': ['Own', 'Rent', 'Own', 'Own', 'Rent'] * 10, # Cycle through home ownership
    'Items': [
        'Milk,Bread,Eggs', 'Milk,Diapers,Beer,Chips', 'Bread,Butter,Jam', 'Milk,Cheese', 'Coffee,Sugar,Milk,Biscuits',
        'Diapers,Beer', 'Bread,Eggs,Coffee,Juice', 'Milk,Jam', 'Coffee,Sugar,Bread,Butter', 'Milk,Cheese,Jam',
        'Milk,Bread,Eggs', 'Diapers,Beer,Milk,Soda', 'Bread,Butter,Coffee', 'Milk,Eggs, Diapers, Beet', 'Coffee,Sugar,Jam,Cereal',
        'Diapers,Chips', 'Bread,Eggs,Milk', 'Milk,Coffee', 'Coffee,Sugar,Bread,Jam', 'Milk,Cheese,Biscuits',
        'Bread,Juice,Cereal', 'Milk,Diapers,Beer', 'Coffee,Biscuits', 'Bread,Jam,Butter', 'Milk,Eggs,Cheese,Coffee',
        'Diapers,Beer,Chips,Soda', 'Coffee,Sugar,Milk', 'Bread,Eggs,Juice', 'Milk,Butter,Jam,Cereal', 'Coffee,Biscuits,Jam',
        'Milk,Bread,Coffee', 'Diapers,Beer,Chips', 'Jam,Butter,Cereal', 'Cheese,Milk', 'Sugar,Biscuits,Coffee',
        'Beer,Diapers,Soda', 'Eggs,Juice,Bread', 'Jam,Milk', 'Butter,Coffee,Bread', 'Cheese,Jam,Milk',
        'Eggs,Milk,Bread', 'Soda,Diapers,Beer', 'Coffee,Butter,Bread', 'Eggs,Milk', 'Cereal,Jam,Coffee',
        'Chips,Diapers', 'Milk,Eggs,Bread', 'Coffee,Milk', 'Jam,Coffee,Sugar', 'Biscuits,Cheese,Milk'
    ]
}

#### --- Part 1: Market Basket Analysis of All Items ---

In [28]:
print("--- Part 1: Market Basket Analysis of All Items ---")

df = pd.DataFrame(data)

print("\nSample of the DataFrame (first 5 rows):")
print(df.head())
print(f"\nUnique occupations: {df['Occupation'].unique()}")

--- Part 1: Market Basket Analysis of All Items ---

Sample of the DataFrame (first 5 rows):
   TransactionID  Gender Occupation HomeOwnership                       Items
0              1  Female   Engineer           Own             Milk,Bread,Eggs
1              2    Male    Student          Rent     Milk,Diapers,Beer,Chips
2              3  Female    Teacher           Own            Bread,Butter,Jam
3              4    Male   Engineer           Own                 Milk,Cheese
4              5  Female    Student          Rent  Coffee,Sugar,Milk,Biscuits

Unique occupations: ['Engineer' 'Student' 'Teacher']


In [29]:
# Preprocessing: One-hot encode the 'Items' column for Apriori.
item_list = df['Items'].apply(lambda x: x.split(','))
te = TransactionEncoder()
df_items = pd.DataFrame(te.fit(item_list).transform(item_list), columns=te.columns_)

print("\nSample of One-Hot Encoded Transaction Data (first 5 rows):")
print(df_items.head())


Sample of One-Hot Encoded Transaction Data (first 5 rows):
    Beet   Diapers   Beer  Biscuits  Bread  Butter  Cereal  Cheese  Chips  \
0  False     False  False     False   True   False   False   False  False   
1  False     False   True     False  False   False   False   False   True   
2  False     False  False     False   True    True   False   False  False   
3  False     False  False     False  False   False   False    True  False   
4  False     False  False      True  False   False   False   False  False   

   Coffee  Diapers   Eggs    Jam  Juice   Milk   Soda  Sugar  
0   False    False   True  False  False   True  False  False  
1   False     True  False  False  False   True  False  False  
2   False    False  False   True  False  False  False  False  
3   False    False  False  False  False   True  False  False  
4    True    False  False  False  False   True  False   True  


In [30]:
# 2. Apply Apriori Algorithm to find Frequent Itemsets.
# min_support is set to 0.1 for general analysis with 50 rows to ensure enough rules are found.
frequent_itemsets = apriori(df_items, min_support=0.1, use_colnames=True)
print("\nFrequent Itemsets (min_support=0.1):")
print(frequent_itemsets)


Frequent Itemsets (min_support=0.1):
    support             itemsets
0      0.16               (Beer)
1      0.12           (Biscuits)
2      0.34              (Bread)
3      0.16             (Butter)
4      0.10             (Cereal)
5      0.14             (Cheese)
6      0.10              (Chips)
7      0.36             (Coffee)
8      0.20            (Diapers)
9      0.22               (Eggs)
10     0.26                (Jam)
11     0.50               (Milk)
12     0.14              (Sugar)
13     0.16      (Diapers, Beer)
14     0.12      (Butter, Bread)
15     0.14      (Bread, Coffee)
16     0.16        (Eggs, Bread)
17     0.12        (Milk, Bread)
18     0.14       (Cheese, Milk)
19     0.10     (Diapers, Chips)
20     0.10        (Jam, Coffee)
21     0.12       (Milk, Coffee)
22     0.14      (Sugar, Coffee)
23     0.16         (Milk, Eggs)
24     0.10          (Milk, Jam)
25     0.10  (Milk, Eggs, Bread)


In [31]:
# 3. Generate Association Rules from frequent itemsets.
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.0)
rules = rules.sort_values(by=['lift', 'confidence'], ascending=[False, False])

# Convert frozensets to strings for Plotly hover_data compatibility.
# Plotly's hover_data expects JSON-serializable types, and frozensets are not.
rules['antecedents_str'] = rules['antecedents'].apply(lambda x: ', '.join(list(x)))
rules['consequents_str'] = rules['consequents'].apply(lambda x: ', '.join(list(x)))


print("\nGenerated Association Rules (min_lift=1.0, sorted by Lift):")
print(rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']])


Generated Association Rules (min_lift=1.0, sorted by Lift):
      antecedents    consequents  support  confidence      lift
1          (Beer)      (Diapers)     0.16    1.000000  5.000000
11        (Chips)      (Diapers)     0.10    1.000000  5.000000
10      (Diapers)        (Chips)     0.10    0.500000  5.000000
0       (Diapers)         (Beer)     0.16    0.800000  5.000000
22         (Eggs)  (Milk, Bread)     0.10    0.454545  3.787879
19  (Milk, Bread)         (Eggs)     0.10    0.833333  3.787879
15       (Coffee)        (Sugar)     0.14    0.388889  2.777778
14        (Sugar)       (Coffee)     0.14    1.000000  2.777778
2        (Butter)        (Bread)     0.12    0.750000  2.205882
3         (Bread)       (Butter)     0.12    0.352941  2.205882
6          (Eggs)        (Bread)     0.16    0.727273  2.139037
7         (Bread)         (Eggs)     0.16    0.470588  2.139037
8        (Cheese)         (Milk)     0.14    1.000000  2.000000
9          (Milk)       (Cheese)     0.14  

In [32]:
# 4. Visualize Association Rules using Plotly.
# Plotly provides interactive charts, allowing mouseover to see details like itemsets.
fig = px.scatter(rules, x="support", y="confidence", size="lift", color="lift",
                 hover_data=["antecedents_str", "consequents_str", "support", "confidence", "lift"],
                 title='Association Rules: Support vs Confidence (sized and colored by Lift)',
                 labels={"support": "Support (Proportion of Transactions)",
                         "confidence": "Confidence (P(Consequent | Antecedent))",
                         "lift": "Lift (Measure of Significance)",
                         "antecedents_str": "Antecedents", # Label for hover data
                         "consequents_str": "Consequents"}, # Label for hover data
                 color_continuous_scale=px.colors.sequential.Viridis,
                 width=800, height=500) # Reduced chart size
fig.show()

**Support** - the proportion of orders that include the item set. If an item’s support is above a set threshold, its occurrence can be considered significant.<br>
**Confidence** – measures how many times Y is purchased / consumed when X is purchased / consumed; expressed as X → Y.<br>
**Lift** – measures how likely Y is purchased / consumed when X is purchased/ consumed; while controlling for how popular Y is. Lift > 1 means high association between X and Y.

#### --- Part 2: Market Basket Analysis with Combined Demographic Segmentation ---

In [33]:
print("\n--- Part 2: Market Basket Analysis with Combined Demographic Segmentation ---")
print("Analyzing purchasing patterns by combining Gender, Occupation, and Home Ownership.")

# Create a combined demographic column for granular segmentation.
df['CombinedDemographic'] = df['Gender'] + '_' + df['Occupation'] + '_' + df['HomeOwnership']
current_demographic_col = 'CombinedDemographic'

# Get all unique combined demographic segments.
unique_segments = df[current_demographic_col].unique()


--- Part 2: Market Basket Analysis with Combined Demographic Segmentation ---
Analyzing purchasing patterns by combining Gender, Occupation, and Home Ownership.


In [34]:
# Iterate through each unique combined demographic segment and perform analysis.
for segment_value in unique_segments:
    if pd.isna(segment_value): # Skip NaN segments if any.
        continue

    print(f"\n--- Current Segment: '{segment_value}' ---")

    # Filter data for the current segment.
    df_segment = df[df[current_demographic_col] == segment_value].copy()

    # Skip if segment is empty or has insufficient data (less than 2 transactions).
    if df_segment.empty or df_segment.shape[0] < 2:
        print(f"  Insufficient data for {segment_value}. Skipping analysis.")
        continue

    # Preprocess items for the current segment.
    item_list_segment = df_segment['Items'].apply(lambda x: x.split(','))
    te_segment = TransactionEncoder()
    df_items_segment = pd.DataFrame(te_segment.fit(item_list_segment).transform(item_list_segment),
                                    columns=te_segment.columns_)

    # Skip if no unique items after encoding.
    if df_items_segment.shape[1] == 0:
        print(f"  No unique items for {segment_value} after preprocessing. Skipping.")
        continue

    # Dynamically adjust min_support for smaller segments.
    # For combined segments, min_support is often lower (e.g., 0.05) to find niche rules.
    min_support_segment = 0.1 if df_items_segment.shape[0] >= 10 else 0.05

    try:
        # Find frequent itemsets for the segment.
        frequent_itemsets_segment = apriori(df_items_segment, min_support=min_support_segment, use_colnames=True)

        # Skip rule generation if no frequent itemsets are found.
        if frequent_itemsets_segment.empty:
            print(f"  No frequent itemsets found for {segment_value} with min_support={min_support_segment}. Skipping rule generation.")
            continue

        # Generate association rules for the segment.
        rules_segment = association_rules(frequent_itemsets_segment, metric="lift", min_threshold=1.0)
        rules_segment = rules_segment.sort_values(by=['lift', 'confidence'], ascending=[False, False])

        # Convert frozensets to strings for Plotly hover_data compatibility.
        rules_segment['antecedents_str'] = rules_segment['antecedents'].apply(lambda x: ', '.join(list(x)))
        rules_segment['consequents_str'] = rules_segment['consequents'].apply(lambda x: ', '.join(list(x)))

        print(f"\n  Association Rules for '{segment_value}' (min_support={min_support_segment}, min_lift=1.0):")
        if rules_segment.empty:
            print("  No significant rules found for this segment.")
        else:
            print(rules_segment[['antecedents', 'consequents', 'support', 'confidence', 'lift']])

        # Visualize rules for the segment using Plotly.
        fig_segment = px.scatter(rules_segment, x="support", y="confidence", size="lift", color="lift",
                                 hover_data=["antecedents_str", "consequents_str", "support", "confidence", "lift"],
                                 title=f'Association Rules for {segment_value} (sized and colored by Lift)',
                                 labels={"support": "Support (Proportion of Transactions)",
                                         "confidence": "Confidence (P(Consequent | Antecedent))",
                                         "lift": "Lift (Measure of Significance)",
                                         "antecedents_str": "Antecedents",
                                         "consequents_str": "Consequents"},
                                 color_continuous_scale=px.colors.sequential.Viridis,
                                 width=800, height=500) # Reduced chart size
        fig_segment.show()

    except Exception as e:
        print(f"  An error occurred while processing segment {segment_value}: {e}")
        print("  This might be due to insufficient data or no rules meeting criteria.")


--- Current Segment: 'Female' ---

  Association Rules for 'Female' (min_support=0.1, min_lift=1.0):
         antecedents       consequents  support  confidence      lift
13          (Cereal)             (Jam)     0.16    0.800000  2.857143
12             (Jam)          (Cereal)     0.16    0.571429  2.857143
33     (Jam, Coffee)           (Sugar)     0.12    0.750000  2.678571
36           (Sugar)     (Jam, Coffee)     0.12    0.428571  2.678571
30            (Eggs)     (Milk, Bread)     0.20    0.625000  2.604167
27     (Milk, Bread)            (Eggs)     0.20    0.833333  2.604167
21   (Bread, Coffee)          (Butter)     0.16    0.571429  2.040816
24          (Butter)   (Bread, Coffee)     0.16    0.571429  2.040816
17            (Eggs)            (Milk)     0.24    0.750000  1.875000
16            (Milk)            (Eggs)     0.24    0.600000  1.875000
28     (Eggs, Bread)            (Milk)     0.20    0.714286  1.785714
29            (Milk)     (Eggs, Bread)     0.20    0.50000


--- Current Segment: 'Male' ---

  Association Rules for 'Male' (min_support=0.1, min_lift=1.0):
         antecedents       consequents  support  confidence      lift
5             (Soda)            (Beer)     0.16    1.000000  3.125000
20   (Diapers, Milk)            (Beer)     0.12    1.000000  3.125000
25   (Diapers, Soda)            (Beer)     0.16    1.000000  3.125000
29            (Soda)   (Diapers, Beer)     0.16    1.000000  3.125000
4             (Beer)            (Soda)     0.16    0.500000  3.125000
24   (Diapers, Beer)            (Soda)     0.16    0.500000  3.125000
28            (Beer)   (Diapers, Soda)     0.16    0.500000  3.125000
23            (Beer)   (Diapers, Milk)     0.12    0.375000  3.125000
3             (Beer)         (Diapers)     0.32    1.000000  2.500000
9            (Chips)         (Diapers)     0.20    1.000000  2.500000
11            (Soda)         (Diapers)     0.16    1.000000  2.500000
16     (Beer, Chips)         (Diapers)     0.12    1.000000  2

In [35]:
print("\n--- Demonstration Complete! ---")


--- Demonstration Complete! ---
