In [1]:
import pandas as pd
import numpy as np
import Orange
from Orange.data import Domain, DiscreteVariable
from orangecontrib.associate.fpgrowth import *
import plotly.express as px
import plotly.graph_objects as go
from ipywidgets import widgets
from IPython.display import display

### Association Mining

In [2]:
# Read the processed order lines and preview the first two rows.
data = pd.read_csv(filepath_or_buffer='../data/processed/placed_orders.csv')
data.head(n=2)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,TotalCost
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850,United Kingdom,15.3
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34


In [3]:
# Unique item list; its order defines the column order of the one-hot frame
items = list(data['Description'].unique())

# Group by InvoiceNo to get item tuples (one tuple of descriptions per invoice)
transaction_level = (
    data.groupby('InvoiceNo')['Description']
    .apply(tuple)
    .reset_index()
)


def one_hot_baskets(baskets, item_names):
    """Return a 0/1 DataFrame with one row per basket and one column per item.

    Parameters
    ----------
    baskets : iterable of iterables
        Each element lists the item names contained in one transaction.
    item_names : list
        Column labels, in the desired column order.

    Returns
    -------
    pandas.DataFrame
        int64 frame with a default RangeIndex; cell (r, c) is 1 when basket r
        contains item c, else 0.
    """
    baskets = list(baskets)
    column_of = {name: position for position, name in enumerate(item_names)}
    matrix = np.zeros((len(baskets), len(item_names)), dtype=np.int64)
    for row, basket in enumerate(baskets):
        for name in basket:
            # Dict lookup follows the same identity-or-equality rule as the
            # `name in basket` test it replaces; names without a column are skipped.
            position = column_of.get(name)
            if position is not None:
                matrix[row, position] = 1
    return pd.DataFrame(matrix, columns=item_names)


# Convert to binary (one-hot) transaction dataframe.
# Filling a preallocated matrix costs one step per order line, instead of
# building a dict entry (and scanning a tuple) for every invoice x product pair.
transaction_df = one_hot_baskets(transaction_level['Description'], items)

transaction_df.shape


(18532, 3877)

In [4]:
def prune_dataset(df, length_trans=2, total_sales_perc=0.5, start_item=None, end_item=None, TopCols=None):
    """Restrict a one-hot transaction frame to selected items and large-enough baskets.

    The input frame is never modified; a leftover ``'total_items'`` helper
    column (as written by earlier versions of this function) is ignored.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame whose column sums give the per-item counts; the notebook
        passes a 0/1 matrix with one row per transaction and one column per item.
    length_trans : int, default 2
        Minimum row sum over the selected items for a row to be kept.
    total_sales_perc : float, default 0.5
        Used only when neither ``TopCols`` nor a ``start_item``/``end_item``
        pair is given: items are kept while their cumulative share of all
        item occurrences (most frequent first) is ``<= total_sales_perc``.
    start_item, end_item : int, optional
        Positional slice ``[start_item:end_item]`` into the items ranked by
        descending count. Both must be given to take effect.
    TopCols : list-like, optional
        Explicit item names to keep; takes precedence over the other
        selection modes when non-empty.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The rows of ``df`` that pass the ``length_trans`` filter, restricted to
        the selected columns, and the per-item statistics (``item_name``,
        ``item_count``, ``item_perc``, ``total_perc``) for the selected items.
    """
    # Drop the helper column on a copy instead of deleting it from the caller's frame.
    baskets = df.drop(columns='total_items', errors='ignore')

    item_count = baskets.sum().sort_values(ascending=False).reset_index()
    # Assign labels positionally: reset_index() names the first column after
    # the column index when that index has a name, which a rename would miss.
    item_count.columns = ['item_name', 'item_count']

    total_items = item_count['item_count'].sum()
    item_count['item_perc'] = item_count['item_count'] / total_items
    item_count['total_perc'] = item_count['item_perc'].cumsum()

    # len() rather than truthiness so numpy arrays / pandas Index objects work too;
    # an empty selection still falls through to the other modes, as before.
    if TopCols is not None and len(TopCols) > 0:
        selected = list(TopCols)
    elif start_item is not None and end_item is not None:
        selected = list(item_count.iloc[start_item:end_item]['item_name'])
    else:
        selected = list(item_count[item_count['total_perc'] <= total_sales_perc]['item_name'])

    # Row sum over the selected items, kept in a local Series rather than
    # written back into the frame.
    basket_sizes = baskets[selected].sum(axis=1)
    pruned = baskets.loc[basket_sizes >= length_trans, selected]
    return pruned, item_count[item_count['item_name'].isin(selected)]


In [5]:
# Revenue per order line, then revenue and distinct-invoice counts per product.
data['Amount'] = data['Quantity'] * data['UnitPrice']
amount_sum = data.groupby('Description')['Amount'].sum().sort_values(ascending=False)
inv = data.groupby('Description')['InvoiceNo'].nunique().sort_values(ascending=False)

# Number of best-selling products carried into the rule mining.
TOP_N = 15

# Keep the TOP_N most frequent products and the baskets holding at least two of them.
output_df, item_counts = prune_dataset(transaction_df, length_trans=2, start_item=0, end_item=TOP_N)

# Take the product list from the pruned frame itself, so the sales total below
# always describes exactly the products that are mined (a second, independent
# ranking could order tied products differently).
top15 = list(output_df.columns)

# reindex() tolerates a product with no revenue entry instead of raising KeyError.
print(f"Total Sales by Top {TOP_N} Products: {amount_sum.reindex(top15).sum():.2f}")
print(f"Transactions: {output_df.shape[0]}, Products: {output_df.shape[1]}")

Total Sales by Top 15 Products: 762898.32
Transactions: 4668, Products: 15


In [6]:
# One two-valued ('0' / '1') Orange variable per retained product column.
domain = Domain([
    DiscreteVariable.make(name=product, values=['0', '1'])
    for product in output_df.columns
])

# Wrap the pruned matrix in an Orange table without a class column (Y=None).
data_orange = Orange.data.Table.from_numpy(domain=domain, X=output_df.to_numpy(), Y=None)

# Encode the table for itemset mining; `mapping` is what OneHot.decode later
# uses to translate item ids back to (variable, value) pairs.
data_encoded, mapping = OneHot.encode(data_orange, include_class=True)

In [7]:
# Mining thresholds: minimum support (fraction of transactions) and minimum confidence.
support = 0.01
confidence = 0.6

print(f"Min transactions required: {int(output_df.shape[0] * support)}")

itemsets = dict(frequent_itemsets(data_encoded, support))

# Column layout of the rules table; the first four columns are numeric.
rule_columns = ['support', 'confidence', 'coverage', 'lift', 'antecedent', 'consequent']


def _rules_frame(records):
    """Build the rules table with a fixed schema, even when `records` is empty."""
    frame = pd.DataFrame(records, columns=rule_columns)
    if frame.empty:
        # Without this the empty frame gets object columns, which numeric
        # helpers such as nlargest() refuse to work with.
        frame = frame.astype({col: float for col in rule_columns[:4]})
    return frame


# Default result: an empty table that still exposes every expected column,
# so the plotting / widget cells do not fail on a missing column.
rules_df = _rules_frame([])

if len(itemsets) > 0:
    # Keep only rules that predict exactly one item.
    rules = [
        (P, Q, supp, conf)
        for P, Q, supp, conf in association_rules(itemsets, confidence)
        if len(Q) == 1
    ]

    # Human-readable "<product>=<value>" label for every encoded item id.
    names = {item: f"{var.name}={val}" for item, var, val in OneHot.decode(mapping, data_orange, mapping)}
    # Labels ending in '1', i.e. "product is in the basket" items.
    eligible = [v for v in names.values() if v.endswith('1')]
    eligible_lookup = set(eligible)

    N = output_df.shape[0]
    rule_stats = list(rules_stats(rules, itemsets, N))

    rule_list = []
    for rule_data in rule_stats:
        antecedent, consequent = rule_data[:2]
        named_consequent = names[next(iter(consequent))]
        if named_consequent not in eligible_lookup:
            continue

        # NOTE(review): "=0" items are stripped from the displayed antecedent,
        # but the statistics below still belong to the full antecedent that
        # includes them -- confirm this is the intended reading of the table.
        rule_lhs = [names[i][:-2] for i in antecedent if names[i] in eligible_lookup]
        if len(rule_lhs) > 1:  # at least two present items on the left-hand side
            rule_list.append({
                'support': rule_data[2],
                'confidence': rule_data[3],
                'coverage': rule_data[4],
                'lift': rule_data[6],
                'antecedent': ', '.join(rule_lhs),
                'consequent': named_consequent[:-2]  # drop the trailing "=1"
            })

    rules_df = _rules_frame(rule_list)
    print(f"Generated {len(rules_df)} association rules.")
else:
    print(" No frequent itemsets found.")


Min transactions required: 46
Generated 13925 association rules.


In [8]:
# Support vs. confidence for every rule. Lift is encoded as marker colour
# (there is no size encoding), so the title says "Color" rather than "Bubble".
fig = px.scatter(
    rules_df, x='support', y='confidence', color='lift',
    hover_data=['antecedent', 'consequent'],
    title='Association Rules — Support vs Confidence (Color = Lift)',
    template='plotly_white',
    height=600,
    width=1200
)
fig.show()


In [13]:
# The ten rules with the highest lift.
top_rules = rules_df.nlargest(10, 'lift')
# Shortened antecedent text used as tick labels.
truncated_labels = top_rules['antecedent'].astype(str).apply(lambda x: x[:15] + "..." if len(x) > 15 else x)

# One x position per rule. Plotting the antecedent text directly would merge
# rules that share an antecedent into a single category, so the positional
# tick labels below would no longer line up with the bars.
bar_positions = list(range(len(top_rules)))

fig = px.bar(
    top_rules,
    x=bar_positions,
    y='lift',
    color='confidence',
    # Full antecedent stays available on hover because the tick labels are truncated.
    hover_data=['antecedent', 'support', 'consequent'],
    title='Top 10 Association Rules by Lift',
    template='plotly_white'
)

fig.update_layout(
    xaxis_title='Antecedent',
    yaxis_title='Lift',
    xaxis=dict(
        tickmode='array',
        tickvals=bar_positions,
        ticktext=truncated_labels.tolist(),
        tickangle=45
    )
)

fig.show()


In [10]:
# Every product that appears as the consequent of at least one rule.
available_items = rules_df['consequent'].unique()

dropdown = widgets.Dropdown(
    options=available_items,
    description='Choose item:',
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='40%')
)

output = widgets.Output()


def show_recommendations(chosen_item):
    """Render up to ten antecedents of rules whose consequent is `chosen_item`.

    Antecedents are listed strongest rule first (descending lift); an antecedent
    that occurs in several rules is shown once, at its best position.
    """
    matching = rules_df[rules_df['consequent'] == chosen_item]
    recs = matching.sort_values('lift', ascending=False)['antecedent'].unique()
    with output:
        # wait=True swaps the old table for the new one without a blank frame.
        output.clear_output(wait=True)
        display(pd.DataFrame(recs, columns=[f'Recommended with "{chosen_item}"']).head(10))


def recommend(change):
    """Dropdown observer: refresh the recommendations for the newly selected item."""
    show_recommendations(change['new'])


dropdown.observe(recommend, names='value')
display(dropdown, output)

# The observer only fires on changes, so render the pre-selected item once;
# otherwise the output area stays empty until the user picks something else.
if dropdown.value is not None:
    show_recommendations(dropdown.value)


Dropdown(description='Choose item:', layout=Layout(width='40%'), options=('LUNCH BAG  BLACK SKULL.', 'LUNCH BA…

Output()