# Data Preprocessing

In [None]:
import pandas as pd

# Load the dataset
data = pd.read_excel('Online retail.xlsx')

# Check for missing values
missing_values = data.isnull().sum()

# Split each transaction into a list of items and strip any leading/trailing whitespace
data['items'] = data.iloc[:, 0].str.split(',').apply(lambda x: [item.strip() for item in x])

# Ensure each item list contains unique items
data['items'] = data['items'].apply(lambda x: list(set(x)))

data

Unnamed: 0,"shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil",items
0,"burgers,meatballs,eggs","[eggs, burgers, meatballs]"
1,chutney,[chutney]
2,"turkey,avocado","[turkey, avocado]"
3,"mineral water,milk,energy bar,whole wheat rice...","[green tea, energy bar, mineral water, milk, w..."
4,low fat yogurt,[low fat yogurt]
...,...,...
7495,"butter,light mayo,fresh bread","[butter, fresh bread, light mayo]"
7496,"burgers,frozen vegetables,eggs,french fries,ma...","[green tea, magazines, burgers, french fries, ..."
7497,chicken,[chicken]
7498,"escalope,green tea","[green tea, escalope]"


In [None]:
# Create a list of all unique items
all_items = list(set(item for sublist in data['items'] for item in sublist))

# One-hot encode the transactions
encoded_data = pd.DataFrame(0, index=data.index, columns=all_items)

# unique is refered as 1 and 0 for non unique
for index, items in data['items'].items():
    encoded_data.loc[index, items] = 1
encoded_data

Unnamed: 0,whole wheat pasta,burgers,spinach,soda,fresh tuna,cereals,avocado,gluten free bar,whole weat flour,extra dark chocolate,...,frozen smoothie,chili,mint green tea,oatmeal,meatballs,salmon,barbecue sauce,pickles,mineral water,burger sauce
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7495,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7496,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7497,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7498,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Association Rules

In [None]:
from mlxtend.frequent_patterns import apriori, association_rules

# Apply the Apriori algorithm
frequent_itemsets = apriori(encoded_data, min_support=0.01, use_colnames=True)
frequent_itemsets

#frequent_itemsets = apriori(encoded_data, min_support=0.01, use_colnames=False)
#frequent_itemsets



Unnamed: 0,support,itemsets
0,0.029467,(whole wheat pasta)
1,0.087200,(burgers)
2,0.022267,(fresh tuna)
3,0.025733,(cereals)
4,0.033200,(avocado)
...,...,...
254,0.015867,"(spaghetti, chocolate, mineral water)"
255,0.010133,"(spaghetti, french fries, mineral water)"
256,0.010133,"(eggs, ground beef, mineral water)"
257,0.010933,"(ground beef, chocolate, mineral water)"


In [None]:
# Generate the association rules
rules1 = association_rules(frequent_itemsets, metric="lift", min_threshold=1.0)
rules1

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(cake),(burgers),0.081067,0.087200,0.011467,0.141447,1.622103,0.004398,1.063185,0.417349
1,(burgers),(cake),0.087200,0.081067,0.011467,0.131498,1.622103,0.004398,1.058068,0.420154
2,(burgers),(milk),0.087200,0.129600,0.017867,0.204893,1.580964,0.006566,1.094695,0.402580
3,(milk),(burgers),0.129600,0.087200,0.017867,0.137860,1.580964,0.006566,1.058761,0.422191
4,(frozen vegetables),(burgers),0.095333,0.087200,0.010533,0.110490,1.267082,0.002220,1.026182,0.232997
...,...,...,...,...,...,...,...,...,...,...
403,"(eggs, mineral water)",(chocolate),0.050933,0.163867,0.013467,0.264398,1.613494,0.005120,1.136665,0.400633
404,"(chocolate, mineral water)",(eggs),0.052667,0.179733,0.013467,0.255696,1.422642,0.004001,1.102059,0.313599
405,(eggs),"(chocolate, mineral water)",0.179733,0.052667,0.013467,0.074926,1.422642,0.004001,1.024062,0.362178
406,(chocolate),"(eggs, mineral water)",0.163867,0.050933,0.013467,0.082181,1.613494,0.005120,1.034045,0.454745


In [None]:
rules2 = rules1[["antecedents","consequents","support","confidence","lift"]]
rules2

Unnamed: 0,antecedents,consequents,support,confidence,lift
0,(cake),(burgers),0.011467,0.141447,1.622103
1,(burgers),(cake),0.011467,0.131498,1.622103
2,(burgers),(milk),0.017867,0.204893,1.580964
3,(milk),(burgers),0.017867,0.137860,1.580964
4,(frozen vegetables),(burgers),0.010533,0.110490,1.267082
...,...,...,...,...,...
403,"(eggs, mineral water)",(chocolate),0.013467,0.264398,1.613494
404,"(chocolate, mineral water)",(eggs),0.013467,0.255696,1.422642
405,(eggs),"(chocolate, mineral water)",0.013467,0.074926,1.422642
406,(chocolate),"(eggs, mineral water)",0.013467,0.082181,1.613494


In [None]:
rules3 = rules2[(rules2["confidence"] >= 0.5) & (rules2["support"] >= 0.01) & (rules2["lift"] >= 1.5)]
rules3

Unnamed: 0,antecedents,consequents,support,confidence,lift
326,"(ground beef, milk)",(mineral water),0.011067,0.50303,2.111207
390,"(eggs, ground beef)",(mineral water),0.010133,0.506667,2.126469


## Interpretation
### There are 2 strong associations given above after setting the approprate threshold, support and lift