In [18]:
# Step 1: Load Libraries
import pandas as pd
import numpy as np
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
import matplotlib.pyplot as plt
import seaborn as sns

# For display
%matplotlib inline

# Suppress DeprecationWarning messages for the rest of the session.
# A single registration is sufficient.
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)



 Data Preprocessing

In [19]:
# Read the workbook; header=None keeps the first spreadsheet row as data
file_path = "Online retail.xlsx"
df = (
    pd.read_excel(file_path, header=None)
    .set_axis(['Items'], axis=1)  # name the single column of comma-separated items
    .dropna()                     # discard rows with missing values
)

# Preview the first rows
df.head()


Unnamed: 0,Items
0,"shrimp,almonds,avocado,vegetables mix,green gr..."
1,"burgers,meatballs,eggs"
2,chutney
3,"turkey,avocado"
4,"mineral water,milk,energy bar,whole wheat rice..."


In [20]:
 #Step 4: Convert Text into List of Items (Transactions)
# Convert comma-separated strings into list of items
transactions = df['Items'].apply(lambda x: [item.strip() for item in str(x).split(',')])
transactions = transactions.tolist()

# Preview the first transaction
transactions[:5]

[['shrimp',
  'almonds',
  'avocado',
  'vegetables mix',
  'green grapes',
  'whole weat flour',
  'yams',
  'cottage cheese',
  'energy drink',
  'tomato juice',
  'low fat yogurt',
  'green tea',
  'honey',
  'salad',
  'mineral water',
  'salmon',
  'antioxydant juice',
  'frozen smoothie',
  'spinach',
  'olive oil'],
 ['burgers', 'meatballs', 'eggs'],
 ['chutney'],
 ['turkey', 'avocado'],
 ['mineral water', 'milk', 'energy bar', 'whole wheat rice', 'green tea']]

🔹 Step 5: One-Hot Encode Transactions

In [21]:
# Learn the item vocabulary, then map every transaction onto it
te = TransactionEncoder()
te.fit(transactions)
te_ary = te.transform(transactions)

# One column per distinct item, one row per transaction
df_encoded = pd.DataFrame(data=te_ary, columns=te.columns_)

# Peek at the encoded frame
df_encoded.head()


Unnamed: 0,almonds,antioxydant juice,asparagus,avocado,babies food,bacon,barbecue sauce,black tea,blueberries,body spray,...,turkey,vegetables mix,water spray,white wine,whole weat flour,whole wheat pasta,whole wheat rice,yams,yogurt cake,zucchini
0,True,True,False,True,False,False,False,False,False,False,...,False,True,False,False,True,False,False,True,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,True,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False


🔹 Step 6: Apply Apriori Algorithm

In [22]:
# Mine itemsets with support of at least 0.02 (2% of transactions);
# use_colnames=True reports item names instead of column indices
frequent_itemsets = apriori(
    df_encoded,
    min_support=0.02,
    use_colnames=True,
)

# Show the first few itemsets
frequent_itemsets.head()


Unnamed: 0,support,itemsets
0,0.020397,(almonds)
1,0.033329,(avocado)
2,0.033729,(brownies)
3,0.087188,(burgers)
4,0.030129,(butter)


 Step 7: Generate Association Rules

In [23]:
# Derive association rules from the frequent itemsets, filtering on the
# lift metric with a threshold of 1.0
rules = association_rules(
    frequent_itemsets,
    metric="lift",
    min_threshold=1.0,
)

# Show the first few rules
rules.head()


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,(burgers),(eggs),0.087188,0.179709,0.028796,0.330275,1.83783,1.0,0.013128,1.224818,0.499424,0.120941,0.183552,0.245256
1,(eggs),(burgers),0.179709,0.087188,0.028796,0.160237,1.83783,1.0,0.013128,1.086988,0.555754,0.120941,0.080026,0.245256
2,(burgers),(french fries),0.087188,0.170911,0.021997,0.252294,1.476173,1.0,0.007096,1.108844,0.353384,0.093168,0.09816,0.190499
3,(french fries),(burgers),0.170911,0.087188,0.021997,0.128705,1.476173,1.0,0.007096,1.04765,0.389069,0.093168,0.045482,0.190499
4,(burgers),(mineral water),0.087188,0.238368,0.024397,0.279817,1.173883,1.0,0.003614,1.057552,0.162275,0.081009,0.05442,0.191083


In [24]:
# Save itemsets and rules to Excel files.
# The itemset columns hold frozensets; writing them directly puts their Python
# repr (e.g. "frozenset({...})") with unstable element order into the sheet.
# Export readable, deterministically ordered strings instead. The in-memory
# frames are left untouched because .assign() works on copies.
def _format_itemset(itemset):
    """Render a collection of items as a sorted, comma-separated string."""
    return ", ".join(sorted(map(str, itemset)))


frequent_itemsets.assign(
    itemsets=frequent_itemsets["itemsets"].map(_format_itemset),
).to_excel("frequent_itemsets.xlsx", index=False)

rules.assign(
    antecedents=rules["antecedents"].map(_format_itemset),
    consequents=rules["consequents"].map(_format_itemset),
).to_excel("association_rules.xlsx", index=False)


In [25]:
# Print a plain-text preview of the frequent itemsets
print("Frequent Itemsets:", frequent_itemsets.head(), sep="\n")

# Print a preview of the rules, restricted to the key columns
print(
    "\nAssociation Rules:",
    rules.loc[:, ['antecedents', 'consequents', 'support', 'confidence', 'lift']].head(),
    sep="\n",
)

Frequent Itemsets:
    support    itemsets
0  0.020397   (almonds)
1  0.033329   (avocado)
2  0.033729  (brownies)
3  0.087188   (burgers)
4  0.030129    (butter)

Association Rules:
      antecedents      consequents   support  confidence      lift
0       (burgers)           (eggs)  0.028796    0.330275  1.837830
1          (eggs)        (burgers)  0.028796    0.160237  1.837830
2       (burgers)   (french fries)  0.021997    0.252294  1.476173
3  (french fries)        (burgers)  0.021997    0.128705  1.476173
4       (burgers)  (mineral water)  0.024397    0.279817  1.173883


1. What is Lift and why is it important in Association Rules?
Lift is a metric that measures the strength of an association rule relative to how often its items would be expected to co-occur if they were independent of each other.

Formula:

$$\text{Lift}(A \rightarrow B) = \frac{\text{Support}(A \cup B)}{\text{Support}(A) \times \text{Support}(B)}$$

Interpretation:
Lift = 1: A and B appear together as often as expected if they were statistically independent.
Lift > 1: A and B appear together more often than expected → positive correlation.
Lift < 1: A and B appear together less often than expected → negative correlation.

Why it’s important: Lift helps identify how much more likely two items are to be bought together than by chance. A high lift value indicates a strong relationship and more interesting rules.

2. What is Support and Confidence? How do you calculate them?
Support
Definition: The proportion of transactions in the dataset that contain the itemset.

Formula:
$$\text{Support}(A) = \frac{\text{Number of transactions containing } A}{\text{Total number of transactions}}$$

Purpose: Helps filter out rules that are not frequent or relevant.

Confidence
Definition: The conditional probability that a transaction contains B given that it contains A.

Formula:
$$\text{Confidence}(A \rightarrow B) = \frac{\text{Support}(A \cup B)}{\text{Support}(A)}$$
Purpose: Measures the reliability of the inference made by a rule.

3. What are some limitations or challenges of Association Rule Mining?
a. Large number of rules
Association rule mining can generate a vast number of rules, many of which might be redundant or uninteresting.

b. Sparsity
In many datasets, meaningful itemsets appear infrequently, making it difficult to extract strong rules.

c. Computational complexity
As the dataset grows, the number of item combinations increases exponentially (combinatorial explosion).

d. Choosing thresholds
Setting appropriate support, confidence, and lift thresholds is crucial and non-trivial. Thresholds that are too high may miss useful rules, while thresholds that are too low may produce many noisy, uninteresting rules.

e. Lack of semantic meaning
Rules may have strong metrics but no practical or logical significance unless interpreted properly.