## Import necessary libraries



In [4]:
import pandas as pd

In [6]:
!pip install mlxtend


Defaulting to user installation because normal site-packages is not writeable
Collecting mlxtend
  Downloading mlxtend-0.23.4-py3-none-any.whl.metadata (7.3 kB)
Collecting scikit-learn>=1.3.1 (from mlxtend)
  Downloading scikit_learn-1.6.1-cp311-cp311-win_amd64.whl.metadata (15 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn>=1.3.1->mlxtend)
  Using cached threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading mlxtend-0.23.4-py3-none-any.whl (1.4 MB)
   ---------------------------------------- 0.0/1.4 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.4 MB ? eta -:--:--
   - -------------------------------------- 0.0/1.4 MB 495.5 kB/s eta 0:00:03
   --- ------------------------------------ 0.1/1.4 MB 939.4 kB/s eta 0:00:02
   ------ --------------------------------- 0.2/1.4 MB 1.5 MB/s eta 0:00:01
   ---------- ----------------------------- 0.4/1.4 MB 1.8 MB/s eta 0:00:01
   ---------------- ----------------------- 0.6/1.4 MB 2.3 MB/s eta 0:00:01
   --

In [53]:
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder

## Load dataset

In [57]:
df = pd.read_csv("Online retail.csv", encoding='latin1', header=None)


## Display basic information about the dataset

 ## Check for missing values

In [60]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7501 entries, 0 to 7500
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       7501 non-null   object
dtypes: object(1)
memory usage: 58.7+ KB


## Check for missing values

In [63]:
print("Missing values:")
print(df.isnull().sum())



Missing values:
0    0
dtype: int64


## Check the first few rows of the dataset

## Convert InvoiceNo to string

In [66]:
df.head()


Unnamed: 0,0
0,"shrimp,almonds,avocado,vegetables mix,green gr..."
1,"burgers,meatballs,eggs"
2,chutney
3,"turkey,avocado"
4,"mineral water,milk,energy bar,whole wheat rice..."


## Rename the single column to 'Transactions'

In [69]:
df.columns = ['Transactions']


## Drop any rows with missing values

In [72]:
df.dropna(inplace=True)


## Split items in each transaction into a list


In [75]:
df['Transactions'] = df['Transactions'].apply(lambda x: x.split(','))


## Convert transactions into a format suitable for market basket analysis

In [78]:
from mlxtend.preprocessing import TransactionEncoder

te = TransactionEncoder()
te_ary = te.fit(df['Transactions']).transform(df['Transactions'])
basket = pd.DataFrame(te_ary, columns=te.columns_)


In [82]:
basket.head()

Unnamed: 0,asparagus,almonds,antioxydant juice,asparagus.1,avocado,babies food,bacon,barbecue sauce,black tea,blueberries,...,turkey,vegetables mix,water spray,white wine,whole weat flour,whole wheat pasta,whole wheat rice,yams,yogurt cake,zucchini
0,False,True,True,False,True,False,False,False,False,False,...,False,True,False,False,True,False,False,True,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,True,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False


## Apply Apriori algorithm to find frequent item sets

In [85]:
frequent_itemsets = apriori(basket, min_support=0.05, use_colnames=True)


## Generate association rules

In [88]:
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.0)


## Display association rules

In [91]:
rules.head()


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,(chocolate),(mineral water),0.163845,0.238368,0.05266,0.3214,1.348332,1.0,0.013604,1.122357,0.308965,0.150648,0.109018,0.271158
1,(mineral water),(chocolate),0.238368,0.163845,0.05266,0.220917,1.348332,1.0,0.013604,1.073256,0.339197,0.150648,0.068256,0.271158
2,(eggs),(mineral water),0.179709,0.238368,0.050927,0.283383,1.188845,1.0,0.00809,1.062815,0.193648,0.138707,0.059103,0.248515
3,(mineral water),(eggs),0.238368,0.179709,0.050927,0.213647,1.188845,1.0,0.00809,1.043158,0.208562,0.138707,0.041372,0.248515
4,(spaghetti),(mineral water),0.17411,0.238368,0.059725,0.343032,1.439085,1.0,0.018223,1.159314,0.369437,0.169312,0.137421,0.296796


## Explanation of Columns

In [94]:
# antecedents: Items on the left-hand side of the rule.
# consequents: Items on the right-hand side of the rule.
# support: The proportion of transactions that contain both antecedents and consequents.
# confidence: How often the rule has been true (transactions with antecedents also had consequents).
# lift: Strength of the rule (greater than 1 indicates a strong association).
# leverage & conviction: Additional metrics that indicate the strength and reliability of the rule.


In [2]:
# Q1: What is lift and why is it important in Association rules?

print("Lift:\n"
      "- Lift is a metric used in Association Rule Mining to measure how much more likely two items are to be bought together than expected by chance.\n"
      "- Formula: Lift = (Support of A and B) / (Support of A * Support of B).\n"
      "- A lift value > 1 indicates a strong positive association, while < 1 indicates a negative correlation.\n\n"
      "Importance of Lift:\n"
      "- Helps identify truly meaningful associations beyond mere co-occurrence.\n"
      "- Used in market basket analysis, recommendation systems, and fraud detection.\n"
      "- A high lift value suggests strong predictive power for recommendation models.")


Lift:
- Lift is a metric used in Association Rule Mining to measure how much more likely two items are to be bought together than expected by chance.
- Formula: Lift = (Support of A and B) / (Support of A * Support of B).
- A lift value > 1 indicates a strong positive association, while < 1 indicates a negative correlation.

Importance of Lift:
- Helps identify truly meaningful associations beyond mere co-occurrence.
- Used in market basket analysis, recommendation systems, and fraud detection.
- A high lift value suggests strong predictive power for recommendation models.


In [4]:
# Q2: What is support and confidence? How do you calculate them?

print("Support:\n"
      "- Support measures how frequently an itemset appears in the dataset.\n"
      "- Formula: Support(A) = (Transactions containing A) / (Total transactions).\n\n"
      "Confidence:\n"
      "- Confidence measures the likelihood that item B is purchased when item A is purchased.\n"
      "- Formula: Confidence(A → B) = Support(A and B) / Support(A).\n\n"
      "These metrics are key in Association Rule Mining to determine the strength and reliability of discovered rules.")


Support:
- Support measures how frequently an itemset appears in the dataset.
- Formula: Support(A) = (Transactions containing A) / (Total transactions).

Confidence:
- Confidence measures the likelihood that item B is purchased when item A is purchased.
- Formula: Confidence(A → B) = Support(A and B) / Support(A).

These metrics are key in Association Rule Mining to determine the strength and reliability of discovered rules.
