# 🛒 Cart Optimization for Budget-Conscious Shopping
This notebook provides a comprehensive walkthrough:
1. **Objective**: We aim to help shoppers select items based on personalized preferences and budget constraints.
2. **Data Sources**: Supports both IGA and Woolworths datasets.
3. **Workflow**:
   - Load and normalize data
   - Compute multi-factor preference scores
   - Generate random shopping carts
   - Optimize carts under budget
   - Simulate and export multiple carts


## 1. Imports and Setup
We begin by importing required libraries:
- **pandas, numpy**: Data manipulation and numerical computations.
- **os**: File path operations.
- **tqdm**: Progress bars for simulations.
- **random**: Random cart generation.
- **collections.Counter**: Counting frequencies.
- **sklearn modules**: (Optional) Placeholder for text similarity if extending to product descriptions.


In [9]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import random
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


## 2. Data Loading & Normalization
### 2.1 `load_and_normalize_data`
- Reads a CSV using `pd.read_csv`.
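- Infers dataset type from the filename: names containing `woolworths` or `woolies` are treated as Woolworths data; everything else falls back to IGA.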
- Calls `preprocess_data` to standardize columns.

### 2.2 `preprocess_data`
- Renames columns depending on dataset type.
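- For Woolworths data, casts `item_id` to string and `unit_price` / `total_price` to float; IGA columns keep the types they were loaded with.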


In [10]:
def load_and_normalize_data(filepath):
    """
    Read a product CSV and return it in the notebook's standard schema.

    The data source is inferred from the file's base name (case-insensitive):
    names containing 'woolworths' or 'woolies' are treated as Woolworths
    data; every other name, including those containing 'iga', is treated as
    IGA data.

    Args:
        filepath (str): Path to the CSV file.
    Returns:
        df (DataFrame): The frame returned by preprocess_data.
        dataset_type (str): 'iga' or 'woolworths'.
    """
    raw = pd.read_csv(filepath)
    name = os.path.basename(filepath).lower()

    # IGA doubles as the fallback, so only the Woolworths markers need checking.
    is_woolworths = any(marker in name for marker in ('woolworths', 'woolies'))
    dataset_type = 'woolworths' if is_woolworths else 'iga'

    return preprocess_data(raw, dataset_type), dataset_type

def preprocess_data(df, dataset_type):
    """
    Map source-specific column names onto the shared schema.

    For 'iga' the rename mapping is an identity mapping (the listed columns
    already carry the standard names) and no type conversion is applied.
    For any other dataset_type the Woolworths headers are renamed, item_id
    is cast to str and unit_price / total_price are cast to float.

    Args:
        df (DataFrame): Raw data.
        dataset_type (str): 'iga' or 'woolworths'.
    Returns:
        DataFrame: A new frame with standardized columns.
    Raises:
        KeyError: For non-IGA data, when item_id, unit_price or total_price
            is absent after renaming.
    """
    if dataset_type == 'iga':
        iga_columns = (
            'best_price',
            'unit_price',
            'total_price',
            'item_name',
            'brand',
            'category',
            'item_id',
        )
        return df.rename(columns={column: column for column in iga_columns})

    woolworths_columns = {
        'Item ID': 'item_id',
        'Item Name': 'item_name',
        'Brand': 'brand',
        'Category': 'category',
        'Unit Price': 'unit_price',
        'Total': 'total_price',
    }
    renamed = df.rename(columns=woolworths_columns)
    # Convert types for consistency
    return renamed.astype(
        {'item_id': str, 'unit_price': float, 'total_price': float}
    )


## 3. Scoring Logic
We compute a **preference score** per item combining:
- **Necessity**: Is item category essential?
- **Purchase Frequency**: How often user bought it before.
- **Brand Loyalty**: Proportion of past purchases from same brand.
- **Common Brand**: Top 3 frequently bought brands.
- **Affordability**: Inverse price percentile within category.
- **Availability**: Logistic transform of stock count.
- **Rating**: Weighted by average rating and review volume.

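Every factor except **Common Brand** carries a weight in the final linear combination (necessity 2.0, purchase frequency 2.0, brand loyalty 1.5, affordability 1.0, availability 0.5, rating 1.0); the common-brand score is computed and stored but is not currently added to `preference_score`.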

In [11]:
def compute_brand_scores(df, user_purchase_history):
    """
    Derive purchase count per item and count by brand.

    Writes a 'purchase_count' column onto df in place (0 for items that are
    not in the history). Each history entry may be either a plain count or a
    detail dict; the two forms may be mixed within one history, since every
    entry is interpreted on its own rather than by looking at the first one.

    Args:
        df (DataFrame): Product dataset with an 'item_id' column.
        user_purchase_history (dict): {item_id: count or {'count': int, 'brand': str}}
    Returns:
        Counter: brand frequency. Each detail entry carrying a 'brand' key
            adds 1 for that brand, regardless of its 'count'; plain-count
            entries contribute no brand information.
        set: brands previously purchased
    """
    if not user_purchase_history:
        df['purchase_count'] = 0
        return Counter(), set()

    counts_by_item = {}
    brand_counter = Counter()
    for item_id, entry in user_purchase_history.items():
        if isinstance(entry, dict):
            counts_by_item[item_id] = entry.get('count', 0)
            if 'brand' in entry:
                brand_counter[entry['brand']] += 1
        else:
            counts_by_item[item_id] = entry

    df['purchase_count'] = df['item_id'].map(
        lambda pid: counts_by_item.get(pid, 0)
    )
    return brand_counter, set(brand_counter)


In [12]:
def calculate_scores(
    df, essential_categories, brand_counter, past_brands, stock_info, rating_info
):
    """
    Compute per-item scores and aggregate into preference_score.

    The score columns are written onto df in place and the same frame is
    returned. df must already contain 'item_id', 'category', 'brand',
    'unit_price' and 'purchase_count'.

    Args:
        df (DataFrame): Product data to score.
        essential_categories (iterable): Categories whose items get a
            necessity_score of 1 (others 0).
        brand_counter (Counter): Brand -> number of past purchases.
        past_brands (set): Accepted for interface compatibility; not used
            by the current scoring.
        stock_info (dict): {item_id: stock count}. Items missing from the
            dict are treated as having stock 1; when the dict is empty or
            None every item gets availability_score 1.
        rating_info (dict): {item_id: (average_rating, review_count)}. When
            empty or None every item gets rating_score 0.5; otherwise items
            missing from the dict get 0.
    Returns:
        DataFrame: df with necessity_score, purchase_frequency_score,
            brand_loyalty_score, common_brand_score, affordability_score,
            availability_score, rating_score and preference_score added.
    """
    top_brands = {brand for brand, _ in brand_counter.most_common(3)}
    # Floors of 1 keep the ratios defined when there is no history at all.
    max_freq = max(df['purchase_count'].max(), 1)
    total_brand = sum(brand_counter.values()) or 1

    df['necessity_score'] = df['category'].isin(essential_categories).astype(int)
    df['purchase_frequency_score'] = df['purchase_count'] / max_freq
    df['brand_loyalty_score'] = df['brand'].map(
        lambda b: brand_counter.get(b, 0) / total_brand
    )
    df['common_brand_score'] = df['brand'].isin(top_brands).astype(int)

    # Percentile rank of price within each category, inverted so that
    # cheaper items score higher.
    df['affordability_score'] = 1 - df.groupby('category')['unit_price'].rank(pct=True)
    df['affordability_score'] = df['affordability_score'].clip(0, 1)

    if stock_info:
        # Logistic squash of the stock count.
        df['availability_score'] = df['item_id'].map(
            lambda pid: 1 / (1 + np.exp(-stock_info.get(pid, 1)))
        )
    else:
        df['availability_score'] = 1

    if rating_info:
        max_rev = max(cnt for _, cnt in rating_info.values())
        if max_rev <= 0:
            # With no reviews anywhere, log1p(0) / log1p(0) would turn every
            # rating_score (and hence preference_score) into NaN.
            max_rev = 1
        log_max_rev = np.log1p(max_rev)

        def _rating_score(pid):
            # Average rating scaled by log review volume relative to the
            # most-reviewed item; unrated items have zero volume and so
            # score 0.
            entry = rating_info.get(pid, (0.5, 0))
            return entry[0] * (np.log1p(entry[1]) / log_max_rev)

        df['rating_score'] = df['item_id'].map(_rating_score)
    else:
        df['rating_score'] = 0.5

    # NOTE(review): common_brand_score is computed above but carries no
    # weight in this sum; confirm whether that is intentional.
    df['preference_score'] = (
        2.0 * df['necessity_score'] +
        2.0 * df['purchase_frequency_score'] +
        1.5 * df['brand_loyalty_score'] +
        1.0 * df['affordability_score'] +
        0.5 * df['availability_score'] +
        1.0 * df['rating_score']
    )
    return df


In [13]:
def add_smart_preference_scores_fast(df, **kwargs):
    """
    Convenience wrapper to compute and add preference scores.

    Scores are computed on a copy, so the DataFrame passed in is left
    unchanged; the scored copy is returned.

    Args:
        df (DataFrame): Product data in the standardized schema.
        **kwargs: Optional inputs, each defaulting to "nothing known":
            user_purchase_history (dict), essential_categories (set),
            stock_info (dict), rating_info (dict).
    Returns:
        DataFrame: Copy of df with purchase_count and all score columns.
    """
    # Copy before any helper runs: compute_brand_scores writes a
    # purchase_count column onto the frame it is given.
    scored = df.copy()
    brand_counter, past_brands = compute_brand_scores(
        scored, kwargs.get('user_purchase_history', {})
    )

    # An explicit None would otherwise reach Series.isin and fail there.
    essential_categories = kwargs.get('essential_categories')
    if essential_categories is None:
        essential_categories = set()

    return calculate_scores(
        scored,
        essential_categories,
        brand_counter,
        past_brands,
        kwargs.get('stock_info', {}),
        kwargs.get('rating_info', {}),
    )


## 4. Cart Simulation & Optimization
### 4.1 `generate_random_cart`
- Randomly sample items to recreate a user's cart.

### 4.2 `optimize_cart_for_budget`
- Calculate **value_score** = preference_score / total_price.
- Greedily add items in descending value order, skipping any item that would push spending over the budget.


In [14]:
def generate_random_cart(df, cart_size=5):
    """
    Build a random shopping cart from the product table.

    Args:
        df (DataFrame): Products to draw from.
        cart_size (int): Number of rows to sample.
    Returns:
        list[dict]: One dict per sampled row, keyed by column name.
    """
    sampled_rows = df.sample(n=cart_size)
    return sampled_rows.to_dict(orient='records')

def optimize_cart_for_budget(cart, budget):
    """
    Greedily keep the best value-per-dollar cart items within budget.

    Every item dict gets a 'value_score' key written in place
    (preference_score / total_price). Items are then visited from highest to
    lowest value_score: an item is kept if it still fits the remaining
    budget and skipped otherwise, so a cheaper item further down the ranking
    can still be kept. This is a greedy heuristic, not an exact knapsack
    solution.

    Args:
        cart (list[dict]): Items with 'preference_score' and 'total_price'.
        budget (float): Maximum total spend.
    Returns:
        selected (list[dict]): Kept items, in descending value_score order.
        dropped (list[dict]): Remaining items, in original cart order.
    """
    for item in cart:
        price = item['total_price']
        if price == 0:
            # A free item cannot be divided by its price; it costs nothing,
            # so rank it ahead of everything else.
            item['value_score'] = float('inf')
        else:
            item['value_score'] = item['preference_score'] / price

    ranked = sorted(cart, key=lambda entry: entry['value_score'], reverse=True)
    selected, spent = [], 0.0
    for entry in ranked:
        if spent + entry['total_price'] <= budget:
            selected.append(entry)
            spent += entry['total_price']

    # Match by identity rather than equality so that an unselected item that
    # merely looks identical to a selected one is still reported as dropped.
    selected_ids = {id(entry) for entry in selected}
    dropped = [entry for entry in cart if id(entry) not in selected_ids]
    return selected, dropped


## 5. Example Usage & Batch Export
Demonstrates running a single example and exporting 100 simulated carts.

In [15]:
# File paths (update accordingly) -- these are absolute paths on the
# author's machine and must be changed before running elsewhere.
iga_path = "/Users/macbook1/Documents/Submissions/Tri1_25/SIT764/Datamate/IGA/Original_updated_iga_data.csv"
wool_path = "/Users/macbook1/Documents/Submissions/Tri1_25/SIT764/Datamate/Test_Data/synthetic_woolworths_cleaned.csv"
# Only the Woolworths file is loaded in this cell; iga_path is defined but
# not used here. `dtype` holds the dataset-type string ('iga'/'woolworths').
df, dtype = load_and_normalize_data(wool_path)
# Score every product with no purchase history, stock data or ratings
# supplied, so only necessity, affordability and the constant
# availability/rating defaults differentiate items.
df = add_smart_preference_scores_fast(
    df,
    user_purchase_history={},
    essential_categories={'pantry', 'dairy'},
    stock_info={},
    rating_info={}
)

# Single example: draw a 5-item cart and trim it to a $30 budget.
cart = generate_random_cart(df, 5)
opt_cart, dropped = optimize_cart_for_budget(cart, 30.0)
print("Original:")
for i in cart:
    print(f" - {i['item_name']} (${i['total_price']:.2f})")
print("Optimized:")
for i in opt_cart:
    print(f" - {i['item_name']} (${i['total_price']:.2f})")

# Batch export: simulate 100 carts, each with a random size (3-6 items) and
# a random budget (20-50), and collect one summary row per cart.
rows = []
for cid in tqdm(range(1, 101)):
    size = random.randint(3,6)
    bud = random.uniform(20,50)
    c = generate_random_cart(df, size)
    sel, drop = optimize_cart_for_budget(c, bud)
    rows.append({
        'cart_id': cid,
        'budget': round(bud,2),
        'selected': ', '.join(it['item_name'] for it in sel),
        'dropped': ', '.join(it['item_name'] for it in drop)
    })
# The CSV is written relative to the current working directory, with the
# dataset type embedded in the file name.
out_df = pd.DataFrame(rows)
out_df.to_csv(f"Budget_optimized_carts_detailed_{dtype}.csv", index=False)


Original:
 - Darrell Lea Milk Chocolate Coconut Rough Bb's 168g ($44.00)
 - Biopak Art Series Paper Cups Paper Cups Small 295ml 25 Pack ($56.00)
 - Dairy Farmers Thick & Creamy Pineapple & Passionfruit Yoghurt 550g ($36.00)
 - Hawaiian Tropic Tropic Silk Hydration Sunscreen Lotion Spf 50 180ml ($19.00)
 - Fluffy Fabric Softener Freshen Up Spray Spice Allure 400ml ($70.00)
Optimized:
 - Hawaiian Tropic Tropic Silk Hydration Sunscreen Lotion Spf 50 180ml ($19.00)


100%|██████████| 100/100 [00:00<00:00, 432.32it/s]
