In [10]:
# Import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

# Task:

## Data Preprocessing:
    Pre-process the dataset to ensure it is suitable for Association rules, this may include handling missing values, removing duplicates, and converting the data to appropriate format.  


## Association Rule Mining:
    •	Implement an Apriori algorithm using tool like python with libraries such as Pandas and Mlxtend etc.
    •	 Apply association rule mining techniques to the pre-processed dataset to discover interesting relationships between products purchased together.
    •	Set appropriate threshold for support, confidence and lift to extract meaning full rules.


## Analysis and Interpretation:
    •	Analyse the generated rules to identify interesting patterns and relationships between the products.
    •	Interpret the results and provide insights into customer purchasing behaviour based on the discovered rules.


In [6]:
# Load the dataset
df = pd.read_excel(r"F:\Drive\ExcelR\Assignments\Association Rules\Association Rules\Online retail.xlsx", header=None)

# Display the first few rows of the dataset
df.head()

Unnamed: 0,0
0,"shrimp,almonds,avocado,vegetables mix,green gr..."
1,"burgers,meatballs,eggs"
2,chutney
3,"turkey,avocado"
4,"mineral water,milk,energy bar,whole wheat rice..."


In [11]:
# Convert the single column into a list of transactions
transactions = df[0].apply(lambda x: x.split(','))

In [12]:
# One-hot encoding of the transaction data using TransactionEncoder
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)


In [13]:
basket = pd.DataFrame(te_ary, columns=te.columns_)

In [14]:
# Display the preprocessed basket (one-hot encoded) data
basket.head()

Unnamed: 0,asparagus,almonds,antioxydant juice,asparagus.1,avocado,babies food,bacon,barbecue sauce,black tea,blueberries,...,turkey,vegetables mix,water spray,white wine,whole weat flour,whole wheat pasta,whole wheat rice,yams,yogurt cake,zucchini
0,False,True,True,False,True,False,False,False,False,False,...,False,True,False,False,True,False,False,True,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,True,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False


In [27]:
# Apply the Apriori algorithm to generate frequent itemsets
frequent_itemsets = apriori(basket, min_support=0.005, use_colnames=True) # minimum support threshold to 0.01 

In [28]:
# Display the top 10 frequent itemsets
frequent_itemsets.head(10)

Unnamed: 0,support,itemsets
0,0.020397,(almonds)
1,0.008932,(antioxydant juice)
2,0.033329,(avocado)
3,0.008666,(bacon)
4,0.010799,(barbecue sauce)
5,0.014265,(black tea)
6,0.009199,(blueberries)
7,0.011465,(body spray)
8,0.033729,(brownies)
9,0.008666,(bug spray)


In [29]:
# Generate association rules using the 'lift' metric
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=0.5)

In [30]:
rules.head(5)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(almonds),(burgers),0.020397,0.087188,0.005199,0.254902,2.923577,0.003421,1.225089,0.671653
1,(burgers),(almonds),0.087188,0.020397,0.005199,0.059633,2.923577,0.003421,1.041724,0.720799
2,(almonds),(chocolate),0.020397,0.163845,0.005999,0.294118,1.795099,0.002657,1.184553,0.45215
3,(chocolate),(almonds),0.163845,0.020397,0.005999,0.036615,1.795099,0.002657,1.016834,0.529719
4,(eggs),(almonds),0.179709,0.020397,0.006532,0.03635,1.782108,0.002867,1.016555,0.535014


In [31]:
# Filter the rules based on a confidence of 20%, support of 1%, and lift of 3
rules_filtered = rules[(rules['confidence'] >= 0.1) & (rules['support'] >= 0.005) & (rules['lift'] >= 1)]

In [32]:
# Sort the filtered rules by lift for better insights
rules_sorted = rules_filtered.sort_values(by='lift', ascending=False)

In [34]:
# Display the sorted rules
rules_sorted.head(10)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
431,(pasta),(escalope),0.015731,0.079323,0.005866,0.372881,4.700812,0.004618,1.468107,0.799853
854,(pasta),(shrimp),0.015731,0.071457,0.005066,0.322034,4.506672,0.003942,1.369601,0.790543
839,(whole wheat pasta),(olive oil),0.029463,0.065858,0.007999,0.271493,4.12241,0.006059,1.28227,0.780417
838,(olive oil),(whole wheat pasta),0.065858,0.029463,0.007999,0.121457,4.12241,0.006059,1.104713,0.810823
1728,"(herb & pepper, spaghetti)",(ground beef),0.016264,0.098254,0.006399,0.393443,4.00436,0.004801,1.486663,0.762677
1722,"(herb & pepper, mineral water)",(ground beef),0.017064,0.098254,0.006666,0.390625,3.975683,0.004989,1.479789,0.761465
676,(tomato sauce),(ground beef),0.014131,0.098254,0.005333,0.377358,3.840659,0.003944,1.448259,0.75023
427,(mushroom cream sauce),(escalope),0.019064,0.079323,0.005733,0.300699,3.790833,0.00422,1.316568,0.750514
1882,(soup),"(mineral water, olive oil)",0.050527,0.027596,0.005199,0.102902,3.728844,0.003805,1.083944,0.770764
1879,"(mineral water, olive oil)",(soup),0.027596,0.050527,0.005199,0.188406,3.728844,0.003805,1.169887,0.752589


In [35]:
# Analyze the top 10 rules
for index, rule in rules_sorted.head(10).iterrows():
    print(f"Rule {index + 1}:")
    print(f"Antecedents: {rule['antecedents']}")
    print(f"Consequents: {rule['consequents']}")
    print(f"Support: {rule['support']}")
    print(f"Confidence: {rule['confidence']}")
    print(f"Lift: {rule['lift']}")
    print("----------\n")

Rule 432:
Antecedents: frozenset({'pasta'})
Consequents: frozenset({'escalope'})
Support: 0.005865884548726837
Confidence: 0.3728813559322034
Lift: 4.700811850163794
----------

Rule 855:
Antecedents: frozenset({'pasta'})
Consequents: frozenset({'shrimp'})
Support: 0.005065991201173177
Confidence: 0.3220338983050847
Lift: 4.506672147735896
----------

Rule 840:
Antecedents: frozenset({'whole wheat pasta'})
Consequents: frozenset({'olive oil'})
Support: 0.007998933475536596
Confidence: 0.2714932126696833
Lift: 4.122410097642296
----------

Rule 839:
Antecedents: frozenset({'olive oil'})
Consequents: frozenset({'whole wheat pasta'})
Support: 0.007998933475536596
Confidence: 0.12145748987854252
Lift: 4.1224100976422955
----------

Rule 1729:
Antecedents: frozenset({'herb & pepper', 'spaghetti'})
Consequents: frozenset({'ground beef'})
Support: 0.006399146780429276
Confidence: 0.3934426229508197
Lift: 4.004359721511667
----------

Rule 1723:
Antecedents: frozenset({'herb & pepper', 'minera

### Interpretation example:
    - If 'mineral water' is purchased, the chances of buying 'salmon' are 4 times higher than random chance.
    - Cross-selling strategies could place mineral water and salmon together in online recommendations.

# Interview Questions

### 1) What is Lift and Why is It Important in Association Rules?

- Lift measures the strength of a rule over the random occurrence of the antecedent and consequent.**It's calculated as:**
    
     **Lift=Support(𝐴∧𝐵) / Support(𝐴) × Support(𝐵)**

- A lift greater than 1 suggests a strong association, meaning that buying item A makes buying item B more likely.

### 2) What is Support and Confidence?

- **Support** measures how often a rule occurs in the dataset:

    **Support(𝐴)=Transactions containing 𝐴 / Total transactions**
 
- **Confidence** is the likelihood that the consequent is bought when the antecedent is bought:

    **Confidence(𝐴⇒𝐵) = Support(𝐴∧𝐵)/Support(𝐴)**

### 3) What are the Limitations of Association Rules?

- **Scalability:** Processing large datasets can be time-consuming.
- **Redundant Rules:** Many rules generated may not be useful.
- **Data Sparsity:** Low-frequency item combinations can result in poor-quality rules.