In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("Online retail(1).csv")
df

Unnamed: 0,"shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil"
0,"burgers,meatballs,eggs"
1,chutney
2,"turkey,avocado"
3,"mineral water,milk,energy bar,whole wheat rice..."
4,low fat yogurt
...,...
7495,"butter,light mayo,fresh bread"
7496,"burgers,frozen vegetables,eggs,french fries,ma..."
7497,chicken
7498,"escalope,green tea"


# Data Preprocessing:

In [3]:
print(df.iloc[:, 0].head())  # Inspect the original column

0                               burgers,meatballs,eggs
1                                              chutney
2                                       turkey,avocado
3    mineral water,milk,energy bar,whole wheat rice...
4                                       low fat yogurt
Name: shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil, dtype: object


In [4]:
transactions = df.iloc[:, 0].str.strip().str.split(',')  # Ensure it splits by commas
print(transactions[:5]) 

0                           [burgers, meatballs, eggs]
1                                            [chutney]
2                                    [turkey, avocado]
3    [mineral water, milk, energy bar, whole wheat ...
4                                     [low fat yogurt]
Name: shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil, dtype: object


In [5]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

In [6]:
te=TransactionEncoder()

In [7]:
trans_arr = te.fit_transform(transactions)

In [8]:
trans_df = pd.DataFrame(trans_arr,columns=te.columns_)

In [9]:
trans_df.head()

Unnamed: 0,asparagus,almonds,antioxydant juice,asparagus.1,avocado,babies food,bacon,barbecue sauce,black tea,blueberries,...,turkey,vegetables mix,water spray,white wine,whole weat flour,whole wheat pasta,whole wheat rice,yams,yogurt cake,zucchini
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,True,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


# Association Rule Mining:

In [10]:
freq_items = apriori(trans_df,min_support=0.01,use_colnames=True)

In [11]:
freq_items

Unnamed: 0,support,itemsets
0,0.020267,(almonds)
1,0.033200,(avocado)
2,0.010800,(barbecue sauce)
3,0.014267,(black tea)
4,0.011467,(body spray)
...,...,...
254,0.011067,"(mineral water, ground beef, milk)"
255,0.017067,"(mineral water, ground beef, spaghetti)"
256,0.015733,"(mineral water, milk, spaghetti)"
257,0.010267,"(mineral water, olive oil, spaghetti)"


In [12]:
rules = association_rules(freq_items, metric='lift' , min_threshold=1 , num_itemsets=5)

In [13]:
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,(mineral water),(avocado),0.238267,0.033200,0.011467,0.048125,1.449559,1.0,0.003556,1.015680,0.407144,0.044103,0.015438,0.196753
1,(avocado),(mineral water),0.033200,0.238267,0.011467,0.345382,1.449559,1.0,0.003556,1.163629,0.320785,0.044103,0.140620,0.196753
2,(burgers),(cake),0.087200,0.081067,0.011467,0.131498,1.622103,1.0,0.004398,1.058068,0.420154,0.073129,0.054881,0.136473
3,(cake),(burgers),0.081067,0.087200,0.011467,0.141447,1.622103,1.0,0.004398,1.063185,0.417349,0.073129,0.059430,0.136473
4,(chocolate),(burgers),0.163867,0.087200,0.017067,0.104150,1.194377,1.0,0.002777,1.018920,0.194639,0.072934,0.018569,0.149934
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
403,"(mineral water, spaghetti)",(pancakes),0.059733,0.095067,0.011467,0.191964,2.019260,1.0,0.005788,1.119917,0.536836,0.080000,0.107077,0.156291
404,"(spaghetti, pancakes)",(mineral water),0.025200,0.238267,0.011467,0.455026,1.909736,1.0,0.005462,1.397744,0.488682,0.045503,0.284561,0.251576
405,(mineral water),"(spaghetti, pancakes)",0.238267,0.025200,0.011467,0.048125,1.909736,1.0,0.005462,1.024084,0.625373,0.045503,0.023518,0.251576
406,(pancakes),"(mineral water, spaghetti)",0.095067,0.059733,0.011467,0.120617,2.019260,1.0,0.005788,1.069235,0.557797,0.080000,0.064752,0.156291


# Analysis and Interpretation:

In [14]:
for index, rule in rules.head(10).iterrows():
    antecedents = list(rule['antecedents'])
    consequents = list(rule['consequents'])
    print(f"Rule: {antecedents} -> {consequents}")
    print(f"Support: {rule['support']}")
    print(f"Confidence: {rule['confidence']}")
    print(f"Lift: {rule['lift']}\n")

Rule: ['mineral water'] -> ['avocado']
Support: 0.011466666666666667
Confidence: 0.04812534974818131
Lift: 1.449558727354859

Rule: ['avocado'] -> ['mineral water']
Support: 0.011466666666666667
Confidence: 0.3453815261044177
Lift: 1.449558727354859

Rule: ['burgers'] -> ['cake']
Support: 0.011466666666666667
Confidence: 0.13149847094801223
Lift: 1.6221028488652824

Rule: ['cake'] -> ['burgers']
Support: 0.011466666666666667
Confidence: 0.14144736842105263
Lift: 1.6221028488652824

Rule: ['chocolate'] -> ['burgers']
Support: 0.017066666666666667
Confidence: 0.10414971521562247
Lift: 1.194377468069065

Rule: ['burgers'] -> ['chocolate']
Support: 0.017066666666666667
Confidence: 0.19571865443425077
Lift: 1.1943774680690649

Rule: ['eggs'] -> ['burgers']
Support: 0.0288
Confidence: 0.16023738872403562
Lift: 1.8375847330738029

Rule: ['burgers'] -> ['eggs']
Support: 0.0288
Confidence: 0.3302752293577982
Lift: 1.837584733073803

Rule: ['burgers'] -> ['french fries']
Support: 0.022
Confidenc

# Interview Questions:

## 1. What is lift and why is it important in Association rules?

Ans:- Lift: Lift is a measure used in association rule mining to assess the strength of a rule. It compares the probability of two items occurring together with the probability of them occurring independently. A lift value greater than 1 indicates that the items are more likely to occur together than by chance. It helps to find the most interesting rules.

## 2. What is support and Confidence. How do you calculate them?

Ans:- Support: Support is the proportion of transactions in the dataset that contain a specific item or itemset.

Confidence: Confidence is the probability that an item B is purchased when item A is purchased.

The formula for **Support** is:
$$
\text{Support}(A) = \frac{\text{Number of transactions containing } A}{\text{Total number of transactions}}
$$

The formula for **Confidence** is:
$$
\text{Confidence}(A \rightarrow B) = \frac{\text{Support}(A \cup B)}{\text{Support}(A)}
$$


## 3. What are some limitations or challenges of Association rules mining?

Ans:- Limitations of Association Rule Mining:

Scalability: It can be computationally expensive, especially with large datasets.
    
Overfitting: Too many rules can lead to irrelevant or overly complex patterns.
    
Sparsity: In sparse datasets, most rules might not be interesting or actionable.

Interpretability: Complex rules can be difficult to interpret and use in decision-making.

In [15]:
## end 