In [1]:
# Import libraries & Load Dataset
import pandas as pd
import numpy as np  
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
df = pd.read_excel("Online retail.xlsx")

# Show basic info
print("Shape of dataset:", df.shape)
df.head()


Shape of dataset: (7500, 1)


Unnamed: 0,"shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil"
0,"burgers,meatballs,eggs"
1,chutney
2,"turkey,avocado"
3,"mineral water,milk,energy bar,whole wheat rice..."
4,low fat yogurt


# Data Preprocessing

In [None]:
# Show the actual column names in the dataset
print("Column names in dataset:\n", df.columns)

# Now try to standardize column names 
df.columns = df.columns.str.strip()

# Print again after cleaning
print("\nCleaned column names:\n", df.columns)

# Drop rows with missing CustomerID, InvoiceNo, or Description (if they exist after cleaning)
required_cols = [col for col in ['CustomerID', 'InvoiceNo', 'Description'] if col in df.columns]
df = df.dropna(subset=required_cols)

# Remove duplicates if any
df = df.drop_duplicates()

# Show dataset shape after cleaning
print("Shape after preprocessing:", df.shape)

# Display first few rows after cleaning
df.head()


Column names in dataset:
 Index(['shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil'], dtype='object')

Cleaned column names:
 Index(['shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil'], dtype='object')
Shape after preprocessing: (5175, 1)


Unnamed: 0,"shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil"
0,"burgers,meatballs,eggs"
1,chutney
2,"turkey,avocado"
3,"mineral water,milk,energy bar,whole wheat rice..."
4,low fat yogurt


# Convert Data to Basket Format

In [12]:
# ----- Convert transaction strings into basket format -----

# Each row is a string like "shrimp,almonds,avocado,..."
transactions = df.iloc[:,0].apply(lambda x: x.split(','))

# Remove duplicates inside each transaction
transactions = transactions.apply(lambda items: list(set(items)))

# Create list of unique items across all transactions
all_items = sorted(set(item for sublist in transactions for item in sublist))

# Create a binary dataframe (basket format)
basket = pd.DataFrame(0, index=np.arange(len(transactions)), columns=all_items)

# Fill with 1 if item exists in transaction
for i, items in enumerate(transactions):
    basket.loc[i, items] = 1

print("Basket shape:", basket.shape)
basket.head()


Basket shape: (5175, 120)


Unnamed: 0,asparagus,almonds,antioxydant juice,asparagus.1,avocado,babies food,bacon,barbecue sauce,black tea,blueberries,...,turkey,vegetables mix,water spray,white wine,whole weat flour,whole wheat pasta,whole wheat rice,yams,yogurt cake,zucchini
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Inspect column names (to see if there are hidden spaces or different spellings)
print("Dataset columns:\n")
for col in df.columns:
    print(repr(col))   # repr() shows exact text, including spaces


Dataset columns:

'shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil'


# Apriori & Association Rules

In [13]:
# ----- Apply Apriori and Generate Association Rules -----
from mlxtend.frequent_patterns import apriori, association_rules

# Find frequent itemsets with minimum support = 5%
frequent_itemsets = apriori(basket, min_support=0.05, use_colnames=True)

print("Frequent itemsets found:", frequent_itemsets.shape[0])
display(frequent_itemsets.head())

# Generate association rules
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.0)

# Sort rules by confidence
rules = rules.sort_values(by="confidence", ascending=False)

print("Total rules generated:", rules.shape[0])
display(rules.head())


Frequent itemsets found: 40




Unnamed: 0,support,itemsets
0,0.113816,(burgers)
1,0.103575,(cake)
2,0.054879,(champagne)
3,0.083865,(chicken)
4,0.205217,(chocolate)


Total rules generated: 20


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
10,(ground beef),(mineral water),0.135845,0.29971,0.058744,0.432432,1.442835,1.0,0.01803,1.233844,0.355168,0.155897,0.189525,0.314218
13,(ground beef),(spaghetti),0.135845,0.229565,0.055845,0.411095,1.790756,1.0,0.02466,1.30825,0.510993,0.1804,0.23562,0.327181
15,(milk),(mineral water),0.170048,0.29971,0.067826,0.398864,1.330831,1.0,0.016861,1.164943,0.299523,0.16875,0.141589,0.312585
8,(frozen vegetables),(mineral water),0.129855,0.29971,0.050435,0.388393,1.295895,1.0,0.011516,1.144999,0.262407,0.133028,0.126637,0.278336
18,(spaghetti),(mineral water),0.229565,0.29971,0.085024,0.37037,1.235762,1.0,0.016221,1.112225,0.24763,0.191388,0.100901,0.327029


# Analysis and Interpretation

From the generated association rules, we can observe interesting patterns:

- **Support** shows how frequently an itemset appears in the dataset.  
- **Confidence** tells us how often items are bought together, given the presence of the antecedent.  
- **Lift** indicates the strength of a rule compared to random chance.  
  - Lift > 1 → items are positively related (purchased together more often than random).  
  - Lift = 1 → items are independent.  
  - Lift < 1 → items are negatively related.  

By analyzing rules with **high lift and high confidence**, we can identify strong product relationships and customer buying behavior.  
For example, if `{mineral water} → {salmon}` has high support and lift, it means customers who buy mineral water often also buy salmon.  

This insight can help in:
- Product placement in stores
- Cross-selling strategies
- Designing promotions and combos


In [14]:

strong_rules = rules[(rules['confidence'] > 0.5) & (rules['lift'] > 1.2)]

print("Number of strong rules:", strong_rules.shape[0])
display(strong_rules.head(10))


Number of strong rules: 0


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski


# Interview Questions

**Q1. What are Association Rules?**  
Association rules are a rule-based machine learning method for discovering relationships between items in large datasets.  
- Example: `{bread, butter} → {jam}` means people who buy bread and butter are also likely to buy jam.  
- Key metrics: Support, Confidence, and Lift.


**Q2. What is the difference between Support, Confidence, and Lift?**  
- **Support:** Frequency of an itemset in the dataset.  
  Example: If 20 out of 100 transactions contain milk, Support(milk) = 20%.  

- **Confidence:** Probability that a rule is correct given the antecedent.  
  Example: If 15 out of 20 milk buyers also buy bread, Confidence(milk → bread) = 75%.  

- **Lift:** Strength of association compared to chance.  
  Example: If Lift(milk → bread) = 1.5, milk buyers are 1.5 times more likely to buy bread than random customers.  



**Q3. How can Association Rule Mining be used in business?**  
- **Market Basket Analysis:** Suggesting products that are often purchased together.  
- **Cross-Selling & Promotions:** Creating bundles (e.g., chips + soft drink).  
- **Store Layout Optimization:** Placing related items close together to increase sales.  
- **Customer Personalization:** Recommending items based on buying history.  
