# Import Library

In [3]:
import pandas as pd
import mlxtend
from mlxtend.frequent_patterns import association_rules
import datetime as dt
import networkx as nx

# Functions

## Dataset preparation for Market Basket Analysis (MBA)

In [4]:
# Convert the original DataFrame into a binary one that contains only zeros and ones, then return it as the input for the MBA function.
def prepare_dataframe_for_MBA(path):
    """Build a binary invoice-by-product basket matrix from the retail Excel file.

    Parameters
    ----------
    path : str or path-like
        Location of the Excel workbook. The sheet that is read must provide
        the columns 'Invoice', 'Description', 'Quantity' and 'Country'.

    Returns
    -------
    pandas.DataFrame
        One row per United Kingdom invoice (index 'Invoice') and one column
        per product description. A cell is 1 when the summed quantity of that
        product on that invoice is positive and 0 otherwise.
    """
    # Call our dataset
    transactions = pd.read_excel(path)

    # Keep only rows with a non-negative quantity and drop every row that has a missing value.
    transactions = transactions[transactions['Quantity'] >= 0].dropna().reset_index(drop=True)

    # Since most of the dataset was related to the United Kingdom, only the
    # information related to this country is selected.
    uk_transactions = transactions[transactions['Country'] == "United Kingdom"]

    # Make a DataFrame with invoices as rows and descriptions as columns.
    # Products that do not appear on an invoice become NaN after unstack, so
    # they are filled with 0.
    basket = (uk_transactions.groupby(['Invoice', 'Description'])['Quantity']
              .sum().unstack().fillna(0))

    # Make DataFrame binary with a single vectorised comparison. This replaces
    # the per-cell DataFrame.applymap call, which is deprecated in recent
    # pandas releases, and yields the same 0/1 integers.
    return (basket > 0).astype('int64')

def make_data_binary(x):
    """Map a single cell value to a presence flag: 1 if it is greater than zero, otherwise 0."""
    return 1 if x > 0 else 0

## Product Bundling 

In [15]:
#MBA: Market Basket Analysis
def product_bundling(df, min_support=0.03, max_len=10, metric='lift', min_threshold=1, use_colnames=False):
    """Mine association rules from a binary invoice-by-product DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Basket matrix with one row per invoice and one column per product;
        a value greater than zero means the product is on the invoice.
    min_support : float, default 0.03
        Minimum support passed to the apriori algorithm.
    max_len : int, default 10
        Maximum itemset length passed to the apriori algorithm.
    metric : str, default 'lift'
        Metric that ``association_rules`` filters on.
    min_threshold : float, default 1
        Minimum value of ``metric`` a rule needs in order to be kept.
    use_colnames : bool, default False
        Forwarded to apriori. With the default, itemsets hold column indices
        of ``df``; pass True to get the product names instead.

    Returns
    -------
    pandas.DataFrame
        The rules produced by ``association_rules``, sorted by descending
        confidence with a fresh 0..n-1 index.
    """
    # Keep invoices that contain at least two different products. We only need
    # the presence of the sold product, not the quantity, hence the bool cast.
    baskets = df[(df > 0).sum(axis=1) >= 2].astype(bool)

    # Implement apriori algorithm to calculate support of items.
    frequent_itemsets = (
        mlxtend.frequent_patterns.apriori(
            baskets, min_support=min_support, use_colnames=use_colnames, max_len=max_len
        )
        .sort_values('support', ascending=False)
        .reset_index(drop=True)
    )

    # Apply association rules on the support values prepared by apriori algorithm.
    rules = association_rules(frequent_itemsets, metric=metric, min_threshold=min_threshold)
    return rules.sort_values('confidence', ascending=False).reset_index(drop=True)

In [16]:
# main function which calls above functions.
def main(path='../Datasets/online_retail_II.xlsx'):
    """Run the whole analysis: load and binarise the dataset, then mine the rules.

    Parameters
    ----------
    path : str, default '../Datasets/online_retail_II.xlsx'
        Excel file handed to ``prepare_dataframe_for_MBA``. The default keeps
        the original location, so calling ``main()`` without arguments behaves
        as before.

    Returns
    -------
    The rules DataFrame returned by ``product_bundling``.
    """
    prepared_df = prepare_dataframe_for_MBA(path)
    return product_bundling(prepared_df)

In [17]:
# Call the main function to execute the project. The call is the last
# expression in the cell, so the rules DataFrame it returns is shown as the
# cell output.
main()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(3943),(3906),0.047415,0.076784,0.036277,0.765092,9.964175,0.032636,3.930114
1,(3125),(4233),0.058739,0.179578,0.042934,0.730932,4.070274,0.032386,3.049127
2,(4317),(4312),0.049095,0.060855,0.033788,0.688213,11.309069,0.0308,3.012136
3,(1920),(1919),0.052828,0.061166,0.030241,0.572438,9.358753,0.02701,2.195785
4,(4312),(4317),0.060855,0.049095,0.033788,0.555215,11.309069,0.0308,2.137898
5,(2211),(1955),0.057682,0.0705,0.031921,0.553398,7.849656,0.027854,2.081273
6,(1919),(1920),0.061166,0.052828,0.030241,0.494405,9.358753,0.02701,1.87338
7,(3906),(3943),0.076784,0.047415,0.036277,0.472447,9.964175,0.032636,1.805669
8,(1955),(2211),0.0705,0.057682,0.031921,0.45278,7.849656,0.027854,1.722011
9,(4233),(3125),0.179578,0.058739,0.042934,0.239085,4.070274,0.032386,1.237012


## Result:
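The results display pairs of items with a strong association: when the first item (antecedent) is in a basket, the second item (consequent) is more likely to be in it as well. The numbers shown in the antecedents and consequents columns are column indices of the basket matrix, not product names, because apriori was run with `use_colnames=False`. In the following lines, I will explain the important measures displayed in the table.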

Support: One of the measures of interestingness; it reflects how useful a rule is by telling how often it applies. A support of 5% means that 5% of all transactions in the database contain the items of the rule.

Confidence: A confidence of 60% means that 60% of the customers who purchased milk and bread also bought butter.

Lift: The higher the lift value, the stronger the association between the items. A lift value greater than 1 means the items appear together more often than would be expected if they were independent, which is enough for us to say that the two items are associated with each other.