In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
import warnings
warnings.filterwarnings('ignore')
plt.rcParams['figure.figsize'] = (12,8)

In [2]:
data = pd.read_excel(r"/content/Online retail.xlsx", header = None)

In [3]:
data

Unnamed: 0,0
0,"shrimp,almonds,avocado,vegetables mix,green gr..."
1,"burgers,meatballs,eggs"
2,chutney
3,"turkey,avocado"
4,"mineral water,milk,energy bar,whole wheat rice..."
...,...
7496,"butter,light mayo,fresh bread"
7497,"burgers,frozen vegetables,eggs,french fries,ma..."
7498,chicken
7499,"escalope,green tea"


In [4]:
data.columns = ["items"]

In [5]:
data

Unnamed: 0,items
0,"shrimp,almonds,avocado,vegetables mix,green gr..."
1,"burgers,meatballs,eggs"
2,chutney
3,"turkey,avocado"
4,"mineral water,milk,energy bar,whole wheat rice..."
...,...
7496,"butter,light mayo,fresh bread"
7497,"burgers,frozen vegetables,eggs,french fries,ma..."
7498,chicken
7499,"escalope,green tea"


In [6]:
data.duplicated().sum()

np.int64(2325)

In [7]:
data.drop_duplicates(keep = 'first', inplace = True)

In [8]:
data.isnull().sum()

Unnamed: 0,0
items,0


In [9]:
data = [i.split(',') for i in data['items']]

In [10]:
te = TransactionEncoder()
te_data = te.fit_transform(data)

In [11]:
te_df = pd.DataFrame(te_data, columns = te.columns_)

In [12]:
te_df

Unnamed: 0,asparagus,almonds,antioxydant juice,asparagus.1,avocado,babies food,bacon,barbecue sauce,black tea,blueberries,...,turkey,vegetables mix,water spray,white wine,whole weat flour,whole wheat pasta,whole wheat rice,yams,yogurt cake,zucchini
0,False,True,True,False,True,False,False,False,False,False,...,False,True,False,False,True,False,False,True,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,True,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5171,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
5172,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,True,False,False,False
5173,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
5174,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [13]:
te_df.replace([True, False], [1, 0], inplace = True)

In [14]:
te_df

Unnamed: 0,asparagus,almonds,antioxydant juice,asparagus.1,avocado,babies food,bacon,barbecue sauce,black tea,blueberries,...,turkey,vegetables mix,water spray,white wine,whole weat flour,whole wheat pasta,whole wheat rice,yams,yogurt cake,zucchini
0,0,1,1,0,1,0,0,0,0,0,...,0,1,0,0,1,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5171,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5172,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0
5173,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5174,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
scores = apriori(te_df, min_support = 0.05, use_colnames = True)

In [16]:
scores

Unnamed: 0,support,itemsets
0,0.113794,(burgers)
1,0.103555,(cake)
2,0.054869,(champagne)
3,0.083849,(chicken)
4,0.205178,(chocolate)
5,0.060665,(cookies)
6,0.07187,(cooking oil)
7,0.208076,(eggs)
8,0.083849,(escalope)
9,0.19262,(french fries)


In [17]:
rules = association_rules(scores, min_threshold = 0.2)

In [18]:
rules[['antecedents','consequents','antecedent support','consequent support','support','confidence','lift']]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift
0,(chocolate),(mineral water),0.205178,0.299845,0.073223,0.356874,1.190193
1,(mineral water),(chocolate),0.299845,0.205178,0.073223,0.244201,1.190193
2,(chocolate),(spaghetti),0.205178,0.229521,0.055835,0.272128,1.185635
3,(spaghetti),(chocolate),0.229521,0.205178,0.055835,0.243266,1.185635
4,(eggs),(mineral water),0.208076,0.299845,0.070131,0.337047,1.12407
5,(mineral water),(eggs),0.299845,0.208076,0.070131,0.233892,1.12407
6,(eggs),(spaghetti),0.208076,0.229521,0.051391,0.246982,1.076078
7,(spaghetti),(eggs),0.229521,0.208076,0.051391,0.223906,1.076078
8,(frozen vegetables),(mineral water),0.12983,0.299845,0.050425,0.388393,1.29531
9,(ground beef),(mineral water),0.135819,0.299845,0.058733,0.432432,1.442184


The lift value being greater than 1 for all rules indicates a strong positive association between the antecedents and consequents. This suggests that customers who purchase the antecedent items are more likely to also purchase the consequent items.

### Interview Questions