In [None]:
# Install a pinned NumPy (1.26.4) into the environment of the running kernel.
# NOTE(review): presumably pinned to stay compatible with the other libraries
# used in this notebook — confirm the reason and keep the pin documented.
%pip install numpy==1.26.4 

In [None]:
pip install mlxtend


In [None]:
# importing libraries
import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder

#### TCN2 TASK 3: MARKET BASKET ANALYSIS

#### PART C1: DATA WRANGLING & ENCODING

In [None]:
# Read the raw order data from a CSV file in the working directory.
dataset_path = 'Megastore Dataset.csv'
df = pd.read_csv(dataset_path)

In [None]:
# 1. Nominal variable encoding (Region & Segment)
# These categories carry no inherent order, so each one is expanded into
# one-hot indicator columns prefixed with the name of its source column.
nominal_columns = ['Region', 'Segment']
df = pd.get_dummies(df, columns=nominal_columns, prefix=['Region', 'Segment'])

In [None]:
# 2. Ordinal variable encoding
# Variable A: CustomerOrderSatisfaction, ranked 0 (lowest) to 4 (highest).
# The position of a label in this list is the rank it is encoded as.
# NOTE(review): "Prefer not to answer" sits at the midpoint (2); treating a
# non-response as neutral is a modelling assumption — confirm it is intended.
satisfaction_levels = [
    'Very Dissatisfied',
    'Dissatisfied',
    'Prefer not to answer',
    'Satisfied',
    'Very Satisfied',
]
satisfaction_map = {level: rank for rank, level in enumerate(satisfaction_levels)}

# Labels that are not keys of the mapping come out of .map() as NaN.
df['CustomerOrderSatisfaction_Encoded'] = df['CustomerOrderSatisfaction'].map(satisfaction_map)

In [None]:
# Variable B: OrderPriority, ranked 0 to 1 ("Medium" below "High").
priority_map = dict(Medium=0, High=1)

# Labels that are not keys of the mapping come out of .map() as NaN.
priority_encoded = df['OrderPriority'].map(priority_map)
df['OrderPriority_Encoded'] = priority_encoded

In [None]:
# Persist the encoded dataset as CSV (without the row index) and confirm.
cleaned_csv = 'Cleaned_Data.csv'
df.to_csv(cleaned_csv, index=False)
print(f"Step 1 Complete: {cleaned_csv} exported.")

#### PART C2: MARKET BASKET ANALYSIS

In [None]:
# 1. Transactionalize the dataset
# Collect the product names of each order into one list per OrderID, then
# turn the grouped result into a plain list of lists (one per transaction).
products_per_order = df.groupby('OrderID')['ProductName'].apply(list)
transactions = products_per_order.tolist()

In [None]:
# 2. Convert the transactions to a one-hot encoded boolean matrix
# The encoder is fitted and applied in one step; the item labels it learned
# (te.columns_) become the column names of the resulting DataFrame.
te = TransactionEncoder()
te_ary = te.fit_transform(transactions)
basket_df = pd.DataFrame(data=te_ary, columns=te.columns_)

In [None]:
# Persist the transactionalized dataset as CSV (without the row index) and confirm.
basket_csv = 'Market_Basket_Data.csv'
basket_df.to_csv(basket_csv, index=False)
print(f"Step 2 Complete: {basket_csv} exported.")

In [None]:
# 3. Generate frequent itemsets (Apriori)
# Minimum support is 0.02 (2%); use_colnames=True reports itemsets by the
# basket_df column names rather than by column position.
apriori_options = {'min_support': 0.02, 'use_colnames': True}
frequent_itemsets = apriori(basket_df, **apriori_options)

In [None]:
# Derive association rules from the frequent itemsets, filtering on lift
# (the metric of choice) with a threshold of 1.0.
rule_filter = {'metric': "lift", 'min_threshold': 1.0}
rules = association_rules(frequent_itemsets, **rule_filter)

In [None]:
# Order the rules from highest to lowest lift and keep the first three.
rules_by_lift = rules.sort_values(by='lift', ascending=False)
top_rules = rules_by_lift.iloc[:3]

In [None]:
# Display the Top 3 Rules
print("\n" + "="*50)
print("TOP 3 ASSOCIATION RULES (Sorted by Lift)")
print("="*50)


def _format_itemset(items):
    """Return every item of one rule side as a single comma-separated string.

    Items are sorted so the output is deterministic; a one-item set renders as
    just that item.
    """
    return ", ".join(sorted(map(str, items)))


# Number the rules by display rank (1, 2, 3). The index yielded by iterrows()
# is the row label the rule had before sorting, so it cannot be used for this.
for rank, (_, row) in enumerate(top_rules.iterrows(), start=1):
    # Antecedents/consequents are frozensets and may hold several items;
    # show all of them instead of one arbitrary element.
    ant = _format_itemset(row['antecedents'])
    con = _format_itemset(row['consequents'])

    print(f"Rule #{rank}: {ant} -> {con}")
    print(f"   Support:    {row['support']:.4f}")
    print(f"   Confidence: {row['confidence']:.4f}")
    print(f"   Lift:       {row['lift']:.4f}")
    print("-" * 30)