In [1]:
## importing all the required library 
import numpy as np
import pandas as pd
import matplotlib.pyplot as ply 
import plotly as px
import seaborn as sns

In [2]:
# loading the dataset in the dtaframe 
dataset=pd.read_csv('Market_Basket_Optimisation .csv')
dataset.shape

(7500, 20)

In [3]:
## printing the first  five value from the dataframe 
dataset.head()

Unnamed: 0,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
0,burgers,meatballs,eggs,,,,,,,,,,,,,,,,,
1,chutney,,,,,,,,,,,,,,,,,,,
2,turkey,avocado,,,,,,,,,,,,,,,,,,
3,mineral water,milk,energy bar,whole wheat rice,green tea,,,,,,,,,,,,,,,
4,low fat yogurt,,,,,,,,,,,,,,,,,,,


**Next Step -** we need to convert the all purchase data item into numpy array, because it has so many null values.

In [4]:
## Gather all the items of each transaction into  numpy array 
transaction=[]
for i in range(0,dataset.shape[0]):
  for  j  in range(0,dataset.shape[1]):

    transaction.append(dataset.values[i,j])

## converting list into numpy array 
transaction = np.array(transaction)
print(transaction)

['burgers' 'meatballs' 'eggs' ... 'nan' 'nan' 'nan']


Now our data is in numpy format. We can delete all null values and printing the top5 frequent occouring items

In [5]:
# transform them a pandas dataframe 
df=pd.DataFrame(transaction,columns=['items'])
# put 1 to each item for making the countable table, for group by action 
df['incident_count']=1
## delete nan items from the dataframe
indexNames=df[df['items']=='nan'].index
df.drop(indexNames,inplace=True)

## making a new pandas Datframe for visualization 
df_table=df.groupby('items').sum().sort_values('incident_count',ascending=False).reset_index()
##initial vizualization 
df_table.head(5).style.background_gradient(cmap='Blues')

Unnamed: 0,items,incident_count
0,mineral water,1787
1,eggs,1348
2,spaghetti,1306
3,french fries,1282
4,chocolate,1230


These are the top five frequent purchased items by the cutomers.

In [6]:
## visualize the top 50 items with use of tree map 
import plotly.express as px
df_table['All']='Top 50 items'
# creating the tree map using plotly 
fig=px.treemap(df_table.head(50),path=['All','items'],values='incident_count',
               color=df_table['incident_count'].head(50),hover_data=['items'],
               color_continuous_scale='Blues') 


**Preprocessing Dataset**
we need to convert our dataset into true and flase format 

In [7]:
# installing  the required library
!pip install mlxtend --upgrade

Collecting mlxtend
  Downloading mlxtend-0.22.0-py2.py3-none-any.whl (1.4 MB)
     ---------------------------------------- 0.0/1.4 MB ? eta -:--:--
     - -------------------------------------- 0.0/1.4 MB 991.0 kB/s eta 0:00:02
     ---- ----------------------------------- 0.1/1.4 MB 1.7 MB/s eta 0:00:01
     -------- ------------------------------- 0.3/1.4 MB 2.0 MB/s eta 0:00:01
     ----------- ---------------------------- 0.4/1.4 MB 2.5 MB/s eta 0:00:01
     ---------------- ----------------------- 0.6/1.4 MB 2.5 MB/s eta 0:00:01
     ------------------ --------------------- 0.6/1.4 MB 2.3 MB/s eta 0:00:01
     ---------------------- ----------------- 0.7/1.4 MB 2.5 MB/s eta 0:00:01
     ---------------------- ----------------- 0.7/1.4 MB 2.5 MB/s eta 0:00:01
     ------------------------ --------------- 0.8/1.4 MB 2.1 MB/s eta 0:00:01
     -------------------------- ------------- 0.9/1.4 MB 2.1 MB/s eta 0:00:01
     ---------------------------------------- 1.4/1.4 MB 2.8 MB/s eta

In [8]:
## importing the required 
transaction = []
for i in range(dataset.shape[0]):
    transaction.append([str(dataset.values[i,j]) for j in range(dataset.shape[1])])
transaction = np.array(transaction)

from mlxtend.preprocessing import TransactionEncoder
#initialize the transaction encoder
te=TransactionEncoder()
te_ary=te.fit(transaction).transform(transaction)
dataset=pd.DataFrame(te_ary,columns=te.columns_)
## dataset after encoding operation done
dataset.head()

Unnamed: 0,asparagus,almonds,antioxydant juice,asparagus.1,avocado,babies food,bacon,barbecue sauce,black tea,blueberries,...,turkey,vegetables mix,water spray,white wine,whole weat flour,whole wheat pasta,whole wheat rice,yams,yogurt cake,zucchini
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,True,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [9]:
first30 = df_table["items"].head(30).values 
# Extract Top 30
dataset = dataset.loc[:,first30] 
# shape of the dataset
dataset.shape

(7500, 30)

In [10]:
#Importing Libraries -- FP Growth algorithm 
from mlxtend.frequent_patterns import fpgrowth
#running the fpgrowth algorithm
res=fpgrowth(dataset,min_support=0.05, use_colnames=True)
# printing top 10
res.head(10)

Unnamed: 0,support,itemsets
0,0.179733,(eggs)
1,0.0872,(burgers)
2,0.062533,(turkey)
3,0.238267,(mineral water)
4,0.132,(green tea)
5,0.1296,(milk)
6,0.058533,(whole wheat rice)
7,0.0764,(low fat yogurt)
8,0.170933,(french fries)
9,0.050533,(soup)


Our dataset is now ready, and we can implement an FP growth algorithm to find the frequent occurring items by setting minimum support to 0.05.

In [11]:
# importing required module  -- Association rule mining 
from mlxtend.frequent_patterns import association_rules
# creating asssociation rules
res=association_rules(res, metric="lift", min_threshold=1)
# printing association rules
res

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(eggs),(mineral water),0.179733,0.238267,0.050933,0.283383,1.189351,0.008109,1.062957,0.19409
1,(mineral water),(eggs),0.238267,0.179733,0.050933,0.213766,1.189351,0.008109,1.043286,0.209004
2,(spaghetti),(mineral water),0.174133,0.238267,0.059733,0.343032,1.439698,0.018243,1.159468,0.369806
3,(mineral water),(spaghetti),0.238267,0.174133,0.059733,0.250699,1.439698,0.018243,1.102184,0.400941
4,(chocolate),(mineral water),0.163867,0.238267,0.052667,0.3214,1.348907,0.013623,1.122506,0.309351
5,(mineral water),(chocolate),0.238267,0.163867,0.052667,0.221041,1.348907,0.013623,1.073398,0.339566


Now, we can create different associations rules from these frequently occurring elements.

In [12]:
#Sorting
res.sort_values("confidence",ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
2,(spaghetti),(mineral water),0.174133,0.238267,0.059733,0.343032,1.439698,0.018243,1.159468,0.369806
4,(chocolate),(mineral water),0.163867,0.238267,0.052667,0.3214,1.348907,0.013623,1.122506,0.309351
0,(eggs),(mineral water),0.179733,0.238267,0.050933,0.283383,1.189351,0.008109,1.062957,0.19409
3,(mineral water),(spaghetti),0.238267,0.174133,0.059733,0.250699,1.439698,0.018243,1.102184,0.400941
5,(mineral water),(chocolate),0.238267,0.163867,0.052667,0.221041,1.348907,0.013623,1.073398,0.339566
1,(mineral water),(eggs),0.238267,0.179733,0.050933,0.213766,1.189351,0.008109,1.043286,0.209004


**Conclusion-The output shows that {spaghetti} -> {mineral water} has the highest confidence, and they are more related to each other.**