In [76]:
import os
from datetime import datetime
from warnings import filterwarnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import plotly.subplots as sp
from plotly.subplots import make_subplots
from scipy.stats import norm
from scipy.stats import probplot

# Silence warnings before importing statsmodels, as the original cell did.
filterwarnings('ignore')
import statsmodels.api as sm

### Exploratory Data Analysis

In [33]:
# Display settings: render floats with two decimals and never truncate columns.
pd.options.display.float_format = lambda value: '%.2f' % value
pd.options.display.max_columns = None

# Default theme applied to every Plotly figure created below.
pio.templates.default = "plotly_white"

In [34]:
# Load the processed order lines. The path is relative to the notebook's
# working directory, matching the '../figures' and '../data/rfm' outputs
# written further down, so the notebook runs on any checkout of the project
# rather than only on one user's machine.
DATA_PATH = os.path.join('..', 'data', 'processed', 'placed_orders.csv')
data = pd.read_csv(DATA_PATH)
data.head(2)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,TotalCost
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850,United Kingdom,15.3
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34


In [35]:
# Parse the timestamps first so min/max compare real dates chronologically
# instead of comparing the raw strings character by character.
data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'], errors='coerce')

# errors='coerce' turns unparseable values into NaT; surface them instead of
# letting them disappear silently from the date-based analyses below.
unparsed_dates = int(data['InvoiceDate'].isna().sum())
if unparsed_dates:
    print(f'Rows with missing or unparseable InvoiceDate: {unparsed_dates}')

print("In the dataset: ")
print(f'Oldest date is:{data.InvoiceDate.min()}')
print(f'Latest date is:{data.InvoiceDate.max()}\n')

In the dataset: 
Oldest date is:2010-12-01 08:26:00
Latest date is:2011-12-09 12:50:00



In [36]:
# Number of orders per calendar year.
# Each row of `data` is one invoice LINE, so counting rows would overstate the
# number of orders. An order is one distinct InvoiceNo (the same definition the
# RFM frequency uses), so count unique invoices per year.
orders_per_year = (
    data.groupby(data['InvoiceDate'].dt.year)['InvoiceNo']
    .nunique()
    .rename_axis('Year')
    .reset_index(name='NumberOfOrders')
)
# Years become categorical labels; the int cast avoids labels such as '2010.0'
# when NaT rows made the year column float.
orders_per_year['Year'] = orders_per_year['Year'].astype(int).astype(str)

fig1 = px.bar(
    orders_per_year,
    x='Year',
    y='NumberOfOrders',
    title='Number of Orders per Year'
)

fig1.update_layout(
    xaxis_title='Year',
    yaxis_title='Number of Orders',
    width=400,
    height=500
)

fig1.show()


In [37]:
# Monthly revenue trend: TotalCost summed per calendar month.
revenue_by_month = data.set_index('InvoiceDate')['TotalCost'].resample('M').sum()
monthly_sales = revenue_by_month.reset_index()

fig2 = px.line(
    monthly_sales,
    x='InvoiceDate',
    y='TotalCost',
    title='Monthly Revenue Trend',
    markers=True,
    width=1200,
    height=500,
)
fig2.show()

In [38]:
# Best-selling products ranked by total units sold.
# The cut-off lives in one constant so the query and the chart title cannot
# drift apart (the old comment said "Top 10" while the code selected 20).
TOP_N_PRODUCTS = 20

top_products = (
    data.groupby('Description')['Quantity']
        .sum()
        .nlargest(TOP_N_PRODUCTS)
        .reset_index()
)

fig3 = px.bar(top_products, x='Quantity', y='Description',
             orientation='h', title=f'Top {TOP_N_PRODUCTS} Products Sold')
# 'total ascending' puts the largest bar at the top of a horizontal bar chart.
fig3.update_layout(yaxis={'categoryorder':'total ascending'}, width=1200, height=500)
fig3.show()

In [39]:
# Revenue per country, largest first.
revenue_by_country = data.groupby('Country')['TotalCost'].sum()
country_sales = revenue_by_country.sort_values(ascending=False).reset_index()

# One treemap tile per country, sized by its total revenue.
fig4 = px.treemap(
    country_sales,
    path=['Country'],
    values='TotalCost',
    title='Sales by Country',
    width=800,
    height=500,
)
fig4.show()

In [40]:
# Horizontal box plot of the unit price of every order line.
box_options = dict(
    x='UnitPrice',
    title='Distribution of Unit Prices',
    width=800,
    height=300,
)
fig5 = px.box(data, **box_options)
fig5.show()

In [41]:
# Combine the four stand-alone EDA figures (fig2..fig5) into one 2x2 dashboard.
# The treemap is not a cartesian trace, so its cell must be of type "domain".
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=(
        "Monthly Revenue Trend",
        "Top 20 Products Sold",
        "Sales by Country",
        "Distribution of Unit Prices",
    ),
    specs=[
        [{"type": "xy"}, {"type": "xy"}],
        [{"type": "domain"}, {"type": "xy"}],
    ],
    horizontal_spacing=0.17,
    vertical_spacing=0.15,
)

# Copy each figure's traces into its dashboard cell.
dashboard_cells = [(fig2, 1, 1), (fig3, 1, 2), (fig4, 2, 1), (fig5, 2, 2)]
for source_fig, row, col in dashboard_cells:
    for trace in source_fig.data:
        fig.add_trace(trace, row=row, col=col)

# Only traces are copied, not the source layout, so the bar ordering set on
# fig3 has to be applied again to keep the best seller at the top.
fig.update_yaxes(categoryorder='total ascending', row=1, col=2)

# Overall dashboard size, title and margins.
fig.update_layout(
    title_text="Exploratory Analysis Dashboard",
    height=900,
    width=1600,
    showlegend=False,
    margin=dict(t=100, b=50, l=10, r=10)
)

fig.show()

# Save the dashboard as a stand-alone HTML file.
os.makedirs('../figures', exist_ok=True)
pio.write_html(fig, '../figures/market_basket_analysis.html')

In [43]:
# Preview the first five rows of the orders frame.
data.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,TotalCost
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850,United Kingdom,15.3
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850,United Kingdom,22.0
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34


### Cohort Analysis

In [64]:
# Work on a copy so the cohort helper columns do not leak into `data`.
cohort = data.copy()

# Truncate every invoice timestamp to the first day of its month, then label
# each customer with the month of their first purchase (their cohort).
cohort['InvoiceMonth'] = cohort['InvoiceDate'].dt.to_period('M').dt.to_timestamp()
cohort['CohortMonth'] = cohort.groupby('CustomerID')['InvoiceMonth'].transform('min')

# Cohort index: months elapsed since the cohort month, starting at 1 for the
# month of the first purchase.
cohort['CohortIndex'] = (
    (cohort['InvoiceMonth'].dt.year - cohort['CohortMonth'].dt.year) * 12 +
    (cohort['InvoiceMonth'].dt.month - cohort['CohortMonth'].dt.month) + 1
)

# Retention: share of each cohort's customers (the index-1 column) that
# purchased again in each later month, as a percentage.
grouping = cohort.groupby(['CohortMonth', 'CohortIndex'])
cohort_counts = grouping['CustomerID'].nunique().unstack()
cohort_sizes = cohort_counts[1]
retention = cohort_counts.divide(cohort_sizes, axis=0) * 100

# Per-line averages for each cohort / month cell.
avg_total_spend = grouping['TotalCost'].mean().unstack().round(1)
avg_unit_spend = grouping['UnitPrice'].mean().unstack().round(1)
avg_qty = grouping['Quantity'].mean().unstack().round(1)


def _cohort_heatmap(matrix, title):
    """Return a 900x500 annotated 'mint' heatmap of one cohort matrix."""
    heatmap = px.imshow(
        matrix,
        text_auto='.1f',
        color_continuous_scale='mint',
        aspect='auto',
        title=title
    )
    heatmap.update_layout(width=900, height=500)
    return heatmap


fig6 = _cohort_heatmap(retention, 'Customer Retention by Monthly Cohorts (%)')
fig7 = _cohort_heatmap(avg_total_spend, 'Average Total Spend by Monthly Cohorts')
fig8 = _cohort_heatmap(avg_qty, 'Average Quantity per Monthly Cohorts')
fig9 = _cohort_heatmap(avg_unit_spend, 'Average Unit Spend per Monthly Cohorts')

dashboard = make_subplots(
    rows=2, cols=2,
    subplot_titles=(
        "Retention by Monthly Cohorts",
        "Average Spend by Monthly Cohorts",
        "Average Quantity per Monthly Cohorts",
        "Average Unit Spend per Monthly Cohorts"
    ),
    vertical_spacing=0.10
)

# px.imshow binds every heatmap to the same layout color axis ("coloraxis").
# Copied as-is, the four panels would share one color range even though they
# hold unrelated quantities (percentages, currency, units), washing out the
# smaller-valued panels. Give each panel its own color axis instead.
cohort_panels = [(fig6, 1, 1), (fig7, 1, 2), (fig8, 2, 1), (fig9, 2, 2)]
for panel_number, (panel, row, col) in enumerate(cohort_panels, start=1):
    axis_name = 'coloraxis' if panel_number == 1 else f'coloraxis{panel_number}'
    for trace in panel.data:
        heat = go.Heatmap(trace)  # copy, so the stand-alone figure is untouched
        heat.coloraxis = axis_name
        dashboard.add_trace(heat, row=row, col=col)
    # Color bars are hidden: four of them would overlap at the right edge and
    # every cell already shows its value as text.
    dashboard.update_layout({axis_name: dict(colorscale='mint', showscale=False)})

# Only traces are copied, so restore the px.imshow orientation (first row of
# the matrix, i.e. the oldest cohort, at the top).
dashboard.update_yaxes(autorange='reversed')

dashboard.update_layout(
    title_text="Cohort Analysis Dashboard",
    title_x=0.5,
    height=800,
    width=1200,
    showlegend=False,
    margin=dict(l=80, r=80, t=100, b=80),
    paper_bgcolor='rgba(248,248,255,1)',
    plot_bgcolor='rgba(248,248,255,1)'
)

dashboard.show()

# Written to its own file: the exploratory dashboard already uses
# 'market_basket_analysis.html' and must not be overwritten.
os.makedirs('../figures', exist_ok=True)
pio.write_html(dashboard, '../figures/cohort_analysis.html')

### RFM Analysis

In [86]:
# Make sure timestamps are parsed even if this cell is run on its own.
data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'], errors='coerce')
# Reference point for recency: the most recent invoice in the dataset.
current_date = data['InvoiceDate'].max()

#  RFM Computation
# Recency: whole days between the reference date and the customer's last purchase.
recency = (
    data.groupby('CustomerID')['InvoiceDate']
    .max()
    .reset_index()
    .rename(columns={'InvoiceDate': 'LastPurchaseDate'})
)
recency['Recency'] = (current_date - recency['LastPurchaseDate']).dt.days
recency = recency.drop(columns='LastPurchaseDate')

# Frequency: number of distinct invoices per customer.
frequency = (
    data.groupby('CustomerID')['InvoiceNo']
    .nunique()
    .reset_index()
    .rename(columns={'InvoiceNo': 'Frequency'})
)

# Monetary: total amount spent per customer (line total = quantity x unit price).
data['TotalCost'] = data['Quantity'] * data['UnitPrice']
monetary = (
    data.groupby('CustomerID')['TotalCost']
    .sum()
    .reset_index()
    .rename(columns={'TotalCost': 'Monetary'})
)

# One row per customer with the three RFM measures.
rfm_table = (
    recency
    .merge(frequency, on='CustomerID')
    .merge(monetary, on='CustomerID')
    .set_index('CustomerID')
)

# Create the output folder first; to_csv does not create missing directories.
os.makedirs('../data/rfm', exist_ok=True)
rfm_table.to_csv("../data/rfm/rfm_table.csv")
rfm_table.head()

Unnamed: 0_level_0,Recency,Frequency,Monetary
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
12346,325,1,77183.6
12347,1,7,4310.0
12348,74,4,1797.24
12349,18,1,1757.55
12350,309,1,334.4


In [82]:
def QQ_plot(rfm_table, output_path='./figures/rfm_dashboard.html',
            measures=('Recency', 'Frequency', 'Monetary')):
    """
    Create an interactive Plotly dashboard with a distribution plot and a
    normal QQ plot for each measure, and save it as an HTML file.

    Each row shows one measure: on the left a density histogram with the
    fitted normal curve, on the right the QQ points with the least-squares
    line returned by ``scipy.stats.probplot``.

    Parameters
    ----------
    rfm_table : pandas.DataFrame
        Table holding one numeric column per measure.
    output_path : str
        Destination of the HTML file; missing parent folders are created.
    measures : sequence of str
        Columns to plot. Defaults to Recency, Frequency and Monetary.

    Returns
    -------
    plotly.graph_objects.Figure
        The assembled dashboard.

    Raises
    ------
    KeyError
        If a measure is not a column of ``rfm_table``.
    ValueError
        If ``measures`` is empty or a column has fewer than two non-null
        values (a distribution cannot be fitted to it).
    """
    measures = list(measures)
    if not measures:
        raise ValueError("measures must contain at least one column name")

    # Fit everything first so the subplot titles can be built with their
    # statistics directly, instead of patching annotations by position later.
    fits = []
    for measure in measures:
        values = rfm_table[measure].dropna()
        if len(values) < 2:
            raise ValueError(
                f"Column '{measure}' needs at least two non-null values "
                "to fit a distribution and draw a QQ plot"
            )
        mu, sigma = norm.fit(values)
        (theoretical, ordered), (slope, intercept, r) = probplot(
            values, dist='norm', fit=True
        )
        fits.append({
            'measure': measure,
            'values': values,
            'mu': mu,
            'sigma': sigma,
            'theoretical': theoretical,
            'ordered': ordered,
            'slope': slope,
            'intercept': intercept,
            'r': r,
        })

    # Titles are consumed row by row: distribution (left), QQ plot (right).
    subplot_titles = []
    for fit in fits:
        subplot_titles.append(
            f"{fit['measure']} Distribution"
            f"<br>(μ={fit['mu']:.2f}, σ={fit['sigma']:.2f})"
        )
        subplot_titles.append(
            f"{fit['measure']} QQ Plot<br>(r²={fit['r']**2:.3f})"
        )

    fig = make_subplots(
        rows=len(fits),
        cols=2,
        subplot_titles=subplot_titles,
        horizontal_spacing=0.15,
        vertical_spacing=0.10
    )

    for row, fit in enumerate(fits, start=1):
        measure = fit['measure']
        values = fit['values']

        # Histogram normalised to a density so it is comparable with the PDF.
        fig.add_trace(
            go.Histogram(
                x=values,
                nbinsx=30,
                histnorm='probability density',
                opacity=0.6,
                name=f'{measure} Histogram'
            ),
            row=row, col=1
        )

        # Fitted normal curve over the observed range.
        grid = np.linspace(values.min(), values.max(), 100)
        fig.add_trace(
            go.Scatter(
                x=grid,
                y=norm.pdf(grid, fit['mu'], fit['sigma']),
                mode='lines',
                name=f'{measure} Fitted Normal',
                line=dict(color='crimson', width=2)
            ),
            row=row, col=1
        )

        # QQ points: ordered sample values against theoretical normal quantiles.
        theoretical = fit['theoretical']
        fig.add_trace(
            go.Scatter(
                x=theoretical,
                y=fit['ordered'],
                mode='markers',
                name=f'{measure} QQ Points',
                marker=dict(size=5, color='royalblue', opacity=0.7)
            ),
            row=row, col=2
        )

        # Reference line: a straight line only needs its two end points
        # (the theoretical quantiles are returned in ascending order).
        line_x = np.array([theoretical[0], theoretical[-1]])
        fig.add_trace(
            go.Scatter(
                x=line_x,
                y=fit['slope'] * line_x + fit['intercept'],
                mode='lines',
                name=f'{measure} QQ Line',
                line=dict(color='crimson', dash='dash')
            ),
            row=row, col=2
        )

        fig.update_xaxes(title_text=measure, row=row, col=1)
        fig.update_yaxes(title_text='Density', row=row, col=1)
        fig.update_xaxes(title_text='Theoretical Quantiles', row=row, col=2)
        fig.update_yaxes(title_text='Sample Quantiles', row=row, col=2)

    fig.update_layout(
        title_text="RFM Distribution & QQ Plot Dashboard",
        title_x=0.5,
        height=400 * len(fits),  # 1200 for the default three measures
        width=1200,
        showlegend=False,
        template='plotly_white',
        paper_bgcolor='rgba(248,248,255,1)',
        plot_bgcolor='rgba(248,248,255,1)',
        margin=dict(l=80, r=80, t=100, b=80)
    )

    # dirname is '' for a bare file name; fall back to the current directory.
    os.makedirs(os.path.dirname(output_path) or '.', exist_ok=True)
    pio.write_html(fig, file=output_path, auto_open=False)

    return fig


In [84]:
# Build the RFM distribution / QQ dashboard, save it under ../figures, and display it.
fig = QQ_plot(rfm_table, output_path='../figures/rfm_dashboard.html')
fig.show()