In [15]:
import pandas as pd
import random
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix
import joblib

In [33]:
df = pd.read_csv('/Users/rajpatel/Desktop/coles.csv')
df.columns = [col.strip() for col in df.columns]


columns = list(df.columns)
for i, col in enumerate(columns):
    if col == 'item_name':
        columns[i] = 'ori_name'
        break


df.columns = columns
print(df.columns.tolist())

['product_code', 'category', 'ori_name', 'essential_flag', 'item_name', 'comapny', 'subcat', 'item_price', 'unit_price', 'brand_name', 'weight/coume']


In [35]:
# === Section 1: Load the Dataset ===
# This function grabs our Coles CSV file and gets it ready. It cleans up messy column
# names (no extra spaces) and renames 'item_name' to 'ori_name' so we don’t mix it up
# later. Gotta have clean data to start with!
def load_data(file_path="/Users/rajpatel/Desktop/coles.csv"):
    """Load CSV, clean column names, and rename first 'item_name' to 'ori_name'."""
    try:
        items_df = pd.read_csv(file_path)
        items_df.columns = items_df.columns.str.strip()  # Bye, extra spaces!
        # Rename first 'item_name' to 'ori_name'
        columns = list(items_df.columns)
        if 'item_name' in columns:
            columns[columns.index('item_name')] = 'ori_name'
        items_df.columns = columns
        return items_df
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        raise


In [37]:

# === Section 2: Set Random Seeds ===
# Locking in the randomness with seeds so our results don’t change every time we run
# this. Keeps things consistent, like a science experiment!
random.seed(42)
np.random.seed(42)

# === Section 3: Clean Subcategory Names ===
# Fixing a typo in 'subcat' (Chiken -> Chicken). Clean data makes our model happy!
df['subcat'] = df['subcat'].str.strip().replace('Chiken', 'Chicken')


In [53]:

# === Section 4: Subcategory Weights and Importance ===
# These dictionaries set priorities for items. Stuff like milk gets high weights (2.0)
# because it’s essential, while snacks get lower (0.3) since they’re optional. Helps
# us figure out what’s worth keeping!
subcategory_weights = {
    # Meat & Seafood (High priority - protein essentials)
    'Beef': 1.8, 'Chicken': 1.8, 'Chiken': 1.8, 'Pork': 1.6, 'Lamb': 1.5, 'Fish': 1.5, 'Salmon': 1.5,
    'Turkey': 1.5, 'Prawns': 1.4, 'Tuna': 1.4, 'Mixed Meat': 1.4, 'Seafood': 1.3, 'Kangaroo': 1.2,
    'Veal': 1.2, 'Duck': 1.1, 'Venison': 1.0, 'Crab': 1.0, 'Trout': 1.0, 'Mussels': 1.0, 'Wallaby': 0.9,
    'Plant-Based': 1.5,
    # Fresh Produce (High priority - health essentials)
    'Fruit': 1.8, 'Vegetables (Leafy/Salad)': 1.8, 'Vegetables (Fruiting)': 1.7, 'Vegetables (Root/Onion/Garlic)': 1.7,
    'Vegetables (Stem/Flower/Pod)': 1.6, 'Mushrooms': 1.4, 'Herbs/Sprouts': 1.2, 'Value-Added Produce': 1.0,
    'Nuts/Seeds/Dried Fruit': 1.1, 'Other Items (F&V Section)': 0.8,
    # Dairy & Eggs (High priority - nutrition essentials)
    'Milk Standard': 2.0, 'Eggs Standard': 1.9, 'Yoghurt Standard': 1.7, 'Cheese Standard': 1.6, 'Butter Standard': 1.5,
    'Milk Specialty': 1.5, 'Cream Standard': 1.3, 'Yoghurt Specialty': 1.2, 'Cheese Specialty': 1.1,
    'Butter Specialty': 1.0, 'Outsider': 0.5,
    # Bakery (Medium priority)
    'Bread Loaves': 1.8, 'Sourdough & Artisan Breads': 1.4, 'Rolls & Buns': 1.3, 'Wraps & Flatbreads': 1.3,
    'Savoury Bakery Items': 1.0, 'Pancakes, Waffles & Crepes': 0.8, 'Muffins & Cupcakes': 0.7, 'Cakes & Slices': 0.6,
    'Sweet Pastries & Donuts': 0.5, 'Biscuits & Cookies': 0.5,
    # Deli & Processed Meats (Medium priority)
    'Ham': 1.2, 'Bacon': 1.1, 'Chicken (Processed/Cooked)': 1.0, 'Turkey (Processed/Cooked)': 1.0,
    'Salami/Pepperoni/Chorizo': 0.9, 'Beef (Processed/Cooked)': 0.9, 'Pork (Processed/Cooked)': 0.9,
    'Seafood (Processed/Cooked)': 0.8, 'Frankfurts/Sausages': 0.8, 'Platters/Kits': 0.7, 'Dips/Pate': 0.6,
    'Antipasto/Olives/Pickles': 0.5,
    # Pantry (Medium-Low priority)
    'Pasta/Rice/Noodles/Grains': 1.5, 'Canned Goods': 1.4, 'Breakfast Cereals': 1.3,
    'Meal Kits/Bases/Instant Meals': 1.2, 'Meal Kits/Bases/Instant Meals5': 1.2, 'Baking Ingredients': 1.1,
    'Spreads/Oils/Condiments': 1.0, 'Baking Mixes': 0.9, 'Crackers/Breadsticks': 0.7, 'Snacks (Savoury)': 0.5,
    'Snacks (Sweet)': 0.4, 'Confectionery': 0.3, 'Other Pantry Items': 0.6, 'Pantry': 0.8, 'Pantry/Other': 0.8,
    # Beverages (Low priority)
    'Water': 1.5, 'Milk': 1.8, 'Juice/Smoothie': 0.9, 'Tea': 0.8, 'Coffee': 0.8, 'Functional/Health Drink': 0.7,
    'Soft Drink/Mixer': 0.3, 'Alcoholic Beverages (Low/No Alc)': 0.2, 'Beverages (Shelf-Stable)': 0.5,
    'Other Drinks': 0.4, 'Non-Drink Item': 0.1,
    # Frozen Foods (Medium priority)
    'Frozen Vegetables': 1.6, 'Frozen Fruits': 1.5, 'Frozen Meat': 1.4, 'Frozen Poultry': 1.4, 'Frozen Seafood': 1.3,
    'Frozen Meals': 1.0, 'Frozen Chips': 0.7, 'Frozen Pastry': 0.6, 'Ice Cream': 0.4, 'Frozen Desserts': 0.3,
    'Other Frozen': 0.5,
    # Household & Cleaning (Variable priority)
    'Paper Products': 1.5, 'Dishwashing': 1.4, 'Laundry Care': 1.4, 'Cleaning Solutions & Wipes': 1.3,
    'Bags, Wraps & Foils': 1.2, 'Cleaning Tools & Accessories': 1.0, 'Kitchenware & Food Storage': 0.9,
    'Air Care & Pest Control': 0.8, 'Home Maintenance & General': 0.7, 'Stationery': 0.4,
    # Personal Care & Health (High priority)
    'First Aid & Wellness': 2.0, 'Health & Medicines': 2.0, 'Medicines & Health Treatments': 2.0,
    'Vitamins & Supplements': 1.5, 'Oral Care': 1.5, 'Feminine & Incontinence Care': 1.5, 'Wash Products': 1.4,
    'Skincare': 1.2, 'Hair Care': 1.1, 'Deodorants & Antiperspirants': 1.0, 'Deodorants & Body Sprays': 1.0,
    'Shaving & Hair Removal': 0.9, "Shaving & Men's Grooming": 0.9, 'First Aid & Wellness Accessories': 0.8,
    'Beauty, Wellness & First Aid': 1.3,
    # Other/Miscellaneous
    'Prepared Meals': 1.0, 'Cheese': 1.2, 'Bakery': 0.9, 'Other Deli': 0.6,
}

subcategory_importance = {
    # Health & Medical - Critical
    'First Aid & Wellness': 1.0, 'Health & Medicines': 1.0, 'Medicines & Health Treatments': 1.0,
    'Vitamins & Supplements': 0.8, 'Feminine & Incontinence Care': 0.7, 'Oral Care': 0.6,
    # Essential Food
    'Milk Standard': 0.7, 'Eggs Standard': 0.7, 'Bread Loaves': 0.6, 'Fruit': 0.6, 'Vegetables (Leafy/Salad)': 0.6,
    'Vegetables (Fruiting)': 0.5, 'Vegetables (Root/Onion/Garlic)': 0.5, 'Vegetables (Stem/Flower/Pod)': 0.5,
    # Protein Sources
    'Chicken': 0.5, 'Beef': 0.5, 'Fish': 0.5, 'Salmon': 0.4, 'Poultry': 0.4, 'Turkey': 0.4,
    # Important Household
    'Paper Products': 0.5, 'Dishwashing': 0.4, 'Laundry Care': 0.4, 'Cleaning Solutions & Wipes': 0.3,
    'Wash Products': 0.3,
    # Staples
    'Rice/Pasta/Grains': 0.4, 'Pasta/Rice/Noodles/Grains': 0.4, 'Canned Goods': 0.3, 'Frozen Vegetables': 0.3,
    'Frozen Fruits': 0.3, 'Water': 0.4,
    # Luxuries/Less Essential
    'Ice Cream': -0.2, 'Cakes & Slices': -0.2, 'Sweet Pastries & Donuts': -0.2, 'Confectionery': -0.3,
    'Snacks (Sweet)': -0.2, 'Soft Drink/Mixer': -0.2, 'Alcoholic Beverages (Low/No Alc)': -0.3
}


In [55]:
# === Section 5: Scoring Functions ===
# These functions give each item a score based on how important it is and its price.
# Helps us decide what’s a must-have vs. what’s too pricey to keep.

def get_subcategory_weight(subcat, default_weight=0.5):
    """Grab the weight for a subcategory (e.g., 2.0 for milk, 0.3 for candy)."""
    return subcategory_weights.get(subcat, default_weight)

def calculate_base_priority(row):
    """Give essentials a higher base score so they stick around."""
    subcategory_value = subcategory_importance.get(row['subcat'], 0)
    return row['essential_flag'] * 5 + 3 + subcategory_value

def calculate_price_sensitivity_penalty(items_df):
    """Penalize pricey items—expensive snacks get hit harder than pricey milk."""
    items_df = items_df.copy()
    items_df['price_rank'] = 0.0
    items_df['price_sensitivity_penalty'] = 0.0
    overall_ranks = items_df['unit_price'].rank(pct=True)
    items_df['price_rank'] = overall_ranks
    for category in items_df['category'].unique():
        category_mask = (items_df['category'] == category)
        category_df = items_df.loc[category_mask]
        if len(category_df) > 1:
            category_ranks = items_df.loc[category_mask, 'unit_price'].rank(pct=True)
            items_df.loc[category_mask, 'price_rank'] = category_ranks
            for subcategory in category_df['subcat'].unique():
                subcat_mask = category_mask & (items_df['subcat'] == subcategory)
                subcat_df = items_df.loc[subcat_mask]
                if len(subcat_df) > 1:
                    subcat_ranks = items_df.loc[subcat_mask, 'unit_price'].rank(pct=True)
                    items_df.loc[subcat_mask, 'price_rank'] = subcat_ranks
    for idx, row in items_df.iterrows():
        price_rank = row['price_rank']
        essential_flag = row['essential_flag']
        if essential_flag:
            if price_rank > 0.9:
                items_df.loc[idx, 'price_sensitivity_penalty'] = 1.0
            elif price_rank > 0.75:
                items_df.loc[idx, 'price_sensitivity_penalty'] = 0.5
        else:
            if price_rank > 0.9:
                items_df.loc[idx, 'price_sensitivity_penalty'] = 3.0
            elif price_rank > 0.75:
                items_df.loc[idx, 'price_sensitivity_penalty'] = 2.0
            elif price_rank > 0.6:
                items_df.loc[idx, 'price_sensitivity_penalty'] = 1.0
    return items_df

def calculate_preference_score(items_df):
    """Score items from 1-10 based on priority and price—milk wins, pricey candy loses."""
    items_df = items_df.copy()
    items_df['subcategory_weight'] = items_df['subcat'].apply(get_subcategory_weight)
    items_df['base_priority'] = items_df.apply(calculate_base_priority, axis=1)
    items_df = calculate_price_sensitivity_penalty(items_df)
    items_df['preference_score'] = (
        items_df['base_priority'] + items_df['subcategory_weight'] - items_df['price_sensitivity_penalty']
    ).clip(1, 10)
    return items_df

def calculate_value_score(items_df):
    """Score how good a deal each item is—cheap essentials get high marks!"""
    items_df = items_df.copy()
    items_df['price_percentile'] = items_df['unit_price'].rank(pct=True)
    items_df['value_score'] = (
        (items_df['preference_score'] * 0.6) + (items_df['essential_flag'] * 2.0) - (items_df['price_percentile'] * 1.0)
    ).clip(0, 10)
    return items_df


In [57]:
# === Section 6: Add Discounts and Best Prices ===
# Adds discount and best price columns, like a real sale! Essentials get smaller
# discounts (gotta keep milk steady), while snacks get bigger ones (sales bait!).
def add_discount_and_best_price_fixed(items_df):
    """Add discount and best price columns to ALL items with realistic rates."""
    items_df = items_df.copy()
    items_df['subcat'] = items_df['subcat'].str.strip().replace('Chiken', 'Chicken')
    core_foods = [
        'Pork', 'Beef', 'Chicken', 'Prawns', 'Lamb', 'Mixed Meat', 'Salmon', 'Fish', 'Turkey', 'Tuna',
        'Kangaroo', 'Seafood', 'Plant-Based', 'Veal', 'Duck', 'Trout', 'Mussels', 'Venison', 'Wallaby',
        'Crab', 'Fruit', 'Vegetables (Fruiting)', 'Vegetables (Root/Onion/Garlic)', 'Vegetables (Stem/Flower/Pod)',
        'Vegetables (Leafy/Salad)', 'Mushrooms', 'Herbs/Sprouts', 'Value-Added Produce', 'Frozen Fruits',
        'Frozen Vegetables', 'Other Items (F&V Section)', 'Nuts/Seeds/Dried Fruit', 'Milk Standard',
        'Milk Specialty', 'Cheese Standard', 'Cheese Specialty', 'Yoghurt Standard', 'Yoghurt Specialty',
        'Butter Standard', 'Butter Specialty', 'Cream Standard', 'Eggs Standard', 'Milk', 'Outsider',
        'Bread Loaves', 'Wraps & Flatbreads', 'Rolls & Buns', 'Sourdough & Artisan Breads'
    ]
    discretionary_foods = [
        'Bacon', 'Ham', 'Salami/Pepperoni/Chorizo', 'Frankfurts/Sausages', 'Pork (Processed/Cooked)',
        'Chicken (Processed/Cooked)', 'Turkey (Processed/Cooked)', 'Beef (Processed/Cooked)',
        'Seafood (Processed/Cooked)', 'Cakes & Slices', 'Sweet Pastries & Donuts', 'Biscuits & Cookies',
        'Pancakes, Waffles & Crepes', 'Muffins & Cupcakes', 'Savoury Bakery Items', 'Snacks (Sweet)',
        'Snacks (Savoury)', 'Confectionery', 'Juice/Smoothie', 'Functional/Health Drink', 'Soft Drink/Mixer',
        'Alcoholic Beverages (Low/No Alc)'
    ]
    items_df['discount'] = 0.0
    items_df['best_price'] = items_df['item_price'].copy()
    items_df['subcat'] = items_df['subcat'].fillna('Other')
    items_df['item_price'] = pd.to_numeric(items_df['item_price'], errors='coerce').fillna(items_df['item_price'].mean())
    valid_df = items_df[items_df['subcat'].notna() & items_df['item_price'].notna()].copy()
    core_items = valid_df[valid_df['subcat'].isin(core_foods)]
    disc_items = valid_df[valid_df['subcat'].isin(discretionary_foods)]
    other_items = valid_df[~valid_df['subcat'].isin(core_foods + discretionary_foods)]
    core_discount_indices = core_items.index if len(core_items) > 0 else []
    disc_discount_indices = disc_items.index if len(disc_items) > 0 else []
    other_discount_indices = other_items.index if len(other_items) > 0 else []
    for idx in core_discount_indices:
        item_price = items_df.loc[idx, 'item_price']
        discount_rate = 0.154 * np.random.uniform(0.7, 1.3)
        items_df.loc[idx, 'discount'] = discount_rate
        original_price = item_price / (1 - discount_rate)
        best_discount = 0.154 * np.random.uniform(1.1, 1.2)
        items_df.loc[idx, 'best_price'] = original_price * (1 - best_discount)
    for idx in disc_discount_indices:
        item_price = items_df.loc[idx, 'item_price']
        discount_rate = 0.259 * np.random.uniform(0.7, 1.3)
        items_df.loc[idx, 'discount'] = discount_rate
        original_price = item_price / (1 - discount_rate)
        best_discount = 0.259 * np.random.uniform(1.1, 1.2)
        items_df.loc[idx, 'best_price'] = original_price * (1 - best_discount)
    for idx in other_discount_indices:
        item_price = items_df.loc[idx, 'item_price']
        discount_rate = 0.15 * np.random.uniform(0.7, 1.3)
        items_df.loc[idx, 'discount'] = discount_rate
        original_price = item_price / (1 - discount_rate)
        best_discount = 0.15 * np.random.uniform(1.1, 1.2)
        items_df.loc[idx, 'best_price'] = original_price * (1 - best_discount)
    return items_df

In [59]:
# === Section 7: Group Subcategories ===
# This dictionary groups grocery subcategories into big categories (like 'Fresh Produce' for
# fruits and veggies). It’s like organizing a grocery store so we can pick items for carts later.
subcategory_groups = {
    'Fresh Produce': [
        'Fruit', 'Vegetables (Fruiting)', 'Vegetables (Root/Onion/Garlic)', 'Vegetables (Stem/Flower/Pod)',
        'Vegetables (Leafy/Salad)', 'Mushrooms', 'Herbs/Sprouts', 'Value-Added Produce',
        'Other Items (F&V Section)', 'Nuts/Seeds/Dried Fruit'
    ],
    'Dairy and Eggs': [
        'Milk Standard', 'Milk Specialty', 'Milk', 'Cheese Standard', 'Cheese Specialty', 'Cheese',
        'Yoghurt Standard', 'Yoghurt Specialty', 'Butter Standard', 'Butter Specialty', 'Cream Standard',
        'Eggs Standard'
    ],
    'Fresh Meat': [
        'Beef', 'Chicken', 'Pork', 'Lamb', 'Mixed Meat', 'Turkey', 'Kangaroo', 'Veal', 'Duck', 'Venison', 'Wallaby'
    ],
    'Fresh Seafood': [
        'Fish', 'Salmon', 'Prawns', 'Tuna', 'Seafood', 'Trout', 'Mussels', 'Crab'
    ],
    'Processed Meats': [
        'Bacon', 'Ham', 'Chicken (Processed/Cooked)', 'Turkey (Processed/Cooked)', 'Beef (Processed/Cooked)',
        'Pork (Processed/Cooked)', 'Frankfurts/Sausages', 'Salami/Pepperoni/Chorizo', 'Seafood (Processed/Cooked)'
    ],
    'Bakery': [
        'Bread Loaves', 'Wraps & Flatbreads', 'Rolls & Buns', 'Sourdough & Artisan Breads', 'Bakery',
        'Savoury Bakery Items'
    ],
    'Sweet Baked Goods': [
        'Cakes & Slices', 'Sweet Pastries & Donuts', 'Biscuits & Cookies', 'Pancakes, Waffles & Crepes',
        'Muffins & Cupcakes'
    ],
    'Pantry Staples': [
        'Pantry/Other', 'Pantry', 'Pasta/Rice/Noodles/Grains', 'Breakfast Cereals', 'Canned Goods',
        'Spreads/Oils/Condiments', 'Baking Ingredients', 'Baking Mixes', 'Crackers/Breadsticks',
        'Antipasto/Olives/Pickles', 'Other Pantry Items'
    ],
    'Prepared and Convenience Foods': [
        'Meal Kits/Bases/Instant Meals', 'Meal Kits/Bases/Instant Meals5', 'Prepared Meals', 'Platters/Kits',
        'Dips/Pate', 'Other Deli'
    ],
    'Snacks and Confectionery': [
        'Snacks (Sweet)', 'Snacks (Savoury)', 'Confectionery'
    ],
    'Beverages': [
        'Juice/Smoothie', 'Soft Drink/Mixer', 'Water', 'Tea', 'Coffee', 'Functional/Health Drink',
        'Beverages (Shelf-Stable)', 'Other Drinks', 'Alcoholic Beverages (Low/No Alc)'
    ],
    'Frozen Foods': [
        'Frozen Chips', 'Frozen Fruits', 'Frozen Vegetables', 'Frozen Meat', 'Frozen Poultry', 'Frozen Seafood',
        'Frozen Meals', 'Frozen Pastry', 'Other Frozen'
    ],
    'Frozen Desserts': [
        'Ice Cream', 'Frozen Desserts'
    ],
    'Household and Cleaning': [
        'Dishwashing', 'Bags, Wraps & Foils', 'Laundry Care', 'Kitchenware & Food Storage', 'Paper Products',
        'Cleaning Solutions & Wipes', 'Cleaning Tools & Accessories', 'Home Maintenance & General',
        'Air Care & Pest Control', 'Stationery'
    ],
    'Health and Beauty': [
        'Vitamins & Supplements', 'Skincare', 'Wash Products', 'First Aid & Wellness', 'Health & Medicines',
        'Feminine & Incontinence Care', 'Oral Care', 'Deodorants & Antiperspirants', 'Hair Care',
        'Shaving & Hair Removal', 'First Aid & Wellness Accessories', 'Deodorants & Body Sprays',
        'Medicines & Health Treatments', "Shaving & Men's Grooming", 'Beauty, Wellness & First Aid'
    ],
    'Plant-Based Alternatives': ['Plant-Based'],
    'Miscellaneous': ['Outsider', 'Non-Drink Item']
}

# Reverse lookup: subcategory -> group (e.g., 'Fruit' -> 'Fresh Produce'). Super handy for
# figuring out what group an item belongs to when we’re building carts!
subcat_to_group = {subcat: group for group, subcats in subcategory_groups.items() for subcat in subcats}


In [61]:

# === Section 8: Define Cart Sizes and Budgets ===
# These set how many items and how much cash shoppers get. Families get bigger carts and budgets,
# singles get less—it’s like mimicking real shopping habits!
cart_size_ranges = {
    ('Family', 'Budget'): (12, 18), ('Family', 'Mainstream'): (10, 16), ('Family', 'Premium'): (8, 14),
    ('Family', 'Health'): (10, 16), ('Couple', 'Budget'): (8, 12), ('Couple', 'Mainstream'): (7, 11),
    ('Couple', 'Premium'): (6, 10), ('Couple', 'Health'): (7, 11), ('Single', 'Budget'): (6, 9),
    ('Single', 'Mainstream'): (5, 8), ('Single', 'Premium'): (5, 7), ('Single', 'Health'): (5, 8),
    ('Shared', 'Budget'): (8, 12), ('Shared', 'Mainstream'): (7, 11), ('Shared', 'Premium'): (6, 10),
    ('Shared', 'Health'): (7, 11),
}

budget_ranges = {
    ('Family', 'Budget'): (150, 200), ('Family', 'Mainstream'): (180, 240), ('Family', 'Premium'): (220, 300),
    ('Family', 'Health'): (200, 280), ('Couple', 'Budget'): (100, 140), ('Couple', 'Mainstream'): (120, 180),
    ('Couple', 'Premium'): (160, 220), ('Couple', 'Health'): (140, 200), ('Single', 'Budget'): (50, 80),
    ('Single', 'Mainstream'): (70, 110), ('Single', 'Premium'): (100, 150), ('Single', 'Health'): (90, 130),
    ('Shared', 'Budget'): (120, 180), ('Shared', 'Mainstream'): (150, 210), ('Shared', 'Premium'): (180, 250),
    ('Shared', 'Health'): (160, 230),
}

# === Section 9: Generate Users ===
# This creates fake shoppers (like families or singles) with random cart sizes and budgets
# based on their type. Think of it as setting up players for our grocery simulation!
def generate_users(num_users=5000):
    """Generate user personas with household types and shopping styles."""
    household_types = ['Family', 'Couple', 'Single', 'Shared']
    household_probs = [0.35, 0.25, 0.25, 0.15]  # Families are most common, shared least
    shopping_styles = ['Budget', 'Mainstream', 'Premium', 'Health']
    shopping_probs = [0.30, 0.40, 0.15, 0.15]  # Mainstream wins, premium and health tie
    users = []
    for i in range(num_users):
        user_id = f'user_{i+1}'
        household = np.random.choice(household_types, p=household_probs)  # Pick a random household
        shopping_style = np.random.choice(shopping_styles, p=shopping_probs)  # Pick a random style
        min_size, max_size = cart_size_ranges[(household, shopping_style)]
        cart_size = np.random.randint(min_size, max_size + 1)  # Random cart size in range
        min_budget, max_budget = budget_ranges[(household, shopping_style)]
        budget = round(np.random.uniform(min_budget, max_budget))  # Random budget in range
        users.append({
            'user_id': user_id, 'household_type': household, 'shopping_style': shopping_style,
            'cart_size': cart_size, 'weekly_budget': budget
        })
    return pd.DataFrame(users)  # Spit out a nice table of users

In [63]:


# === Section 10: Adjust Group Probabilities ===
# This tweaks how likely shoppers pick certain groups (like families grabbing dairy or singles
# loving quick meals). It’s key for making carts match their vibe!
def get_group_probabilities(household, shopping_style):
    """Calculate product group probabilities based on household type and shopping style."""
    base_probs = {
        'Fresh Produce': 0.20, 'Dairy and Eggs': 0.15, 'Fresh Meat': 0.07, 'Fresh Seafood': 0.03,
        'Processed Meats': 0.05, 'Bakery': 0.07, 'Sweet Baked Goods': 0.03, 'Pantry Staples': 0.12,
        'Prepared and Convenience Foods': 0.03, 'Snacks and Confectionery': 0.05, 'Beverages': 0.05,
        'Frozen Foods': 0.02, 'Frozen Desserts': 0.01, 'Household and Cleaning': 0.02,
        'Health and Beauty': 0.01, 'Plant-Based Alternatives': 0.01, 'Miscellaneous': 0.01
    }
    adjusted_probs = base_probs.copy()  # Start with default chances
    
    # Families: more dairy and snacks, less fancy stuff
    if household == 'Family':
        adjusted_probs['Dairy and Eggs'] += 0.03
        adjusted_probs['Bakery'] += 0.02
        adjusted_probs['Snacks and Confectionery'] += 0.02
        adjusted_probs['Frozen Foods'] += 0.01
        adjusted_probs['Frozen Desserts'] += 0.01
        adjusted_probs['Fresh Produce'] -= 0.02
        adjusted_probs['Health and Beauty'] -= 0.01
        adjusted_probs['Fresh Seafood'] -= 0.01
        adjusted_probs['Miscellaneous'] -= 0.01
    
    # Couples: fresh meat and produce, skip the snacks
    elif household == 'Couple':
        adjusted_probs['Fresh Meat'] += 0.02
        adjusted_probs['Fresh Seafood'] += 0.01
        adjusted_probs['Fresh Produce'] += 0.02
        adjusted_probs['Pantry Staples'] += 0.01
        adjusted_probs['Beverages'] += 0.01
        adjusted_probs['Frozen Desserts'] -= 0.005
        adjusted_probs['Snacks and Confectionery'] -= 0.01
        adjusted_probs['Household and Cleaning'] -= 0.01
        adjusted_probs['Prepared and Convenience Foods'] -= 0.01
    
    # Singles: quick meals and drinks, not big on meat
    elif household == 'Single':
        adjusted_probs['Prepared and Convenience Foods'] += 0.05
        adjusted_probs['Beverages'] += 0.02
        adjusted_probs['Bakery'] += 0.01
        adjusted_probs['Fresh Meat'] -= 0.03
        adjusted_probs['Fresh Produce'] -= 0.02
        adjusted_probs['Dairy and Eggs'] -= 0.01
        adjusted_probs['Pantry Staples'] -= 0.01
    
    # Shared: snacks and drinks for the crew
    elif household == 'Shared':
        adjusted_probs['Snacks and Confectionery'] += 0.04
        adjusted_probs['Beverages'] += 0.03
        adjusted_probs['Pantry Staples'] += 0.01
        adjusted_probs['Dairy and Eggs'] -= 0.02
        adjusted_probs['Health and Beauty'] -= 0.01
        adjusted_probs['Household and Cleaning'] -= 0.01
        adjusted_probs['Fresh Seafood'] -= 0.01
    
    # Budget: cheap staples and frozen, skip the pricey stuff
    if shopping_style == 'Budget':
        adjusted_probs['Pantry Staples'] += 0.03
        adjusted_probs['Frozen Foods'] += 0.02
        adjusted_probs['Bakery'] += 0.01
        adjusted_probs['Health and Beauty'] -= 0.005
        adjusted_probs['Fresh Seafood'] -= 0.02
        adjusted_probs['Fresh Meat'] -= 0.01
        adjusted_probs['Prepared and Convenience Foods'] -= 0.02
    
    # Premium: splurge on meat and seafood, less pantry junk
    elif shopping_style == 'Premium':
        adjusted_probs['Fresh Meat'] += 0.02
        adjusted_probs['Fresh Seafood'] += 0.03
        adjusted_probs['Prepared and Convenience Foods'] += 0.02
        adjusted_probs['Beverages'] += 0.01
        adjusted_probs['Fresh Produce'] += 0.01
        adjusted_probs['Frozen Foods'] -= 0.01
        adjusted_probs['Pantry Staples'] -= 0.02
        adjusted_probs['Snacks and Confectionery'] -= 0.02
        adjusted_probs['Miscellaneous'] -= 0.005
    
    # Health: load up on produce and plant-based, ditch the junk
    elif shopping_style == 'Health':
        adjusted_probs['Fresh Produce'] += 0.05
        adjusted_probs['Plant-Based Alternatives'] += 0.03
        adjusted_probs['Dairy and Eggs'] += 0.01
        adjusted_probs['Health and Beauty'] += 0.02
        adjusted_probs['Snacks and Confectionery'] -= 0.03
        adjusted_probs['Beverages'] -= 0.01
        adjusted_probs['Processed Meats'] -= 0.03
        adjusted_probs['Sweet Baked Goods'] -= 0.02
        adjusted_probs['Frozen Desserts'] -= 0.005
    
    # No negative probs—keep a tiny chance for everything
    for category in adjusted_probs:
        if adjusted_probs[category] < 0:
            adjusted_probs[category] = 0.001
    
    # Normalize so it all adds to 1 (keeps it legit!)
    total = sum(adjusted_probs.values())
    return {k: v / total for k, v in adjusted_probs.items()}

In [65]:
def generate_shopping_carts(users_df, items_df, subcat_to_group):
    # Make a copy of items_df so we don’t mess up the original data—like a backup!
    items_df = items_df.copy()
    
    # Add a new column 'product_group' by linking subcategories to bigger groups (e.g., 'Apples' → 'Fresh Produce')
    items_df['product_group'] = items_df['subcat'].map(subcat_to_group)
    
    # Set up dictionaries to group items—like putting stuff on labeled shelves for quick access
    items_by_group = {group: items_df[items_df['product_group'] == group] for group in set(subcat_to_group.values())}
    items_by_subcat = {subcat: items_df[items_df['subcat'] == subcat] for subcat in items_df['subcat'].unique()}
    
    # Count how many users we’ve got and tell everyone we’re starting
    total_users = len(users_df)
    print(f"Generating carts for {total_users} users...")
    
    # This list will hold every single item from all users’ carts
    all_cart_items = []

    # Loop through each user—time to build some carts!
    for i, (_, user) in enumerate(users_df.iterrows()):
        # Every 500 users, let’s give a shoutout so we know it’s still running
        if i % 500 == 0:
            print(f"Processing user {i+1}/{total_users}...")
        
        # Grab the user’s info—like their ID, how they shop, and their budget
        user_id = user['user_id']
        household = user['household_type']
        style = user['shopping_style']
        cart_size = user['cart_size']
        budget = user['weekly_budget']
        
        # Figure out the max items—usually cart_size, but we can sneak in a few extra
        max_items = min(cart_size + 3, int(cart_size * 1.15))
        
        # Get probabilities for product groups—like, does this family love dairy or snacks?
        group_probs = get_group_probabilities(household, style)
        
        # Start fresh: empty cart, no items picked yet, and zero dollars spent
        user_cart_items = []
        selected_product_codes = set()  # To avoid picking the same thing twice
        current_spend = 0  # How much we’ve spent so far
        attempts = 0  # Don’t wanna loop forever—count how many tries we take
        max_attempts = max_items * 5  # Give it a few chances per item
        
        # Keep going until the cart’s full or we’ve tried too many times
        while len(user_cart_items) < max_items and attempts < max_attempts:
            attempts += 1
            
            # Pick a product group randomly, weighted by the user’s probabilities
            product_group = np.random.choice(list(group_probs.keys()), p=list(group_probs.values()))
            
            # Find subcategories in that group that actually exist in our items
            subcats_in_group = [subcat for subcat, group in subcat_to_group.items()
                                if group == product_group and subcat in items_by_subcat]
            if not subcats_in_group:
                continue  # Oops, no subcategories here—skip it!
            
            # Pick a random subcategory from the group—like choosing between apples or bananas
            subcategory = np.random.choice(subcats_in_group)
            
            # Grab all items in that subcategory
            subcat_items = items_by_subcat[subcategory]
            if len(subcat_items) == 0:
                continue  # No items? Move on!
            
            # Only look at items we haven’t picked yet—no duplicates!
            available_items = subcat_items[~subcat_items['product_code'].isin(selected_product_codes)]
            if len(available_items) == 0:
                continue  # Nothing left to pick? Next!
            
            # Budget shoppers get the cheap stuff (cheapest 70%)
            if style == 'Budget':
                available_items = available_items.sort_values('unit_price')
                available_items = available_items.iloc[:int(len(available_items) * 0.7)]
            # Premium shoppers get the fancy stuff (priciest 70%)
            elif style == 'Premium':
                available_items = available_items.sort_values('unit_price', ascending=False)
                available_items = available_items.iloc[:int(len(available_items) * 0.7)]
            
            if len(available_items) == 0:
                continue  # No items after filtering? Skip!
            
            # Pick one random item from what’s left
            selected_item = available_items.sample(1).iloc[0]
            
            # Check if adding this item blows the budget (we’re okay up to 125% of budget)
            if current_spend + selected_item['unit_price'] > budget * 1.25:
                continue  # Too expensive—skip it!
            
            # If we’re under the target cart size, add the item for sure
            if len(user_cart_items) < cart_size:
                selected_product_codes.add(selected_item['product_code'])
                user_cart_items.append({
                    'user_id': user_id, 'product_code': selected_item['product_code'],
                    'item_name': selected_item['item_name'], 'subcat': selected_item['subcat'],
                    'product_group': product_group, 'unit_price': selected_item['unit_price'],
                    'item_price': selected_item['item_price'], 'category': selected_item['category'],
                    'value_score': selected_item['value_score'], 'discount': selected_item['discount'],
                    'essential_flag': selected_item['essential_flag'], 'best_price': selected_item['best_price'],
                    'preference_score': selected_item['preference_score'],
                    'subcategory_weight': selected_item['subcategory_weight'],  # Extra detail from items_df
                    'price_rank': selected_item['price_rank'],  # Ranking from items_df
                    'price_sensitivity_penalty': selected_item['price_sensitivity_penalty'],  # Penalty from items_df
                    'price_percentile': selected_item['price_percentile'],  # Percentile from items_df
                    'household_type': household, 'shopping_style': style, 'weekly_budget': budget
                })
                current_spend += selected_item['unit_price']
            # If we’ve hit the target size, maybe add extra items (15% chance)
            elif len(user_cart_items) >= cart_size and len(user_cart_items) < max_items:
                if random.random() < 0.15:
                    selected_product_codes.add(selected_item['product_code'])
                    user_cart_items.append({
                        'user_id': user_id, 'product_code': selected_item['product_code'],
                        'item_name': selected_item['item_name'], 'subcat': selected_item['subcat'],
                        'product_group': product_group, 'unit_price': selected_item['unit_price'],
                        'item_price': selected_item['item_price'], 'category': selected_item['category'],
                        'value_score': selected_item['value_score'], 'discount': selected_item['discount'],
                        'essential_flag': selected_item['essential_flag'], 'best_price': selected_item['best_price'],
                        'preference_score': selected_item['preference_score'],
                        'subcategory_weight': selected_item['subcategory_weight'],
                        'price_rank': selected_item['price_rank'],
                        'price_sensitivity_penalty': selected_item['price_sensitivity_penalty'],
                        'price_percentile': selected_item['price_percentile'],
                        'household_type': household, 'shopping_style': style, 'weekly_budget': budget
                    })
                    current_spend += selected_item['unit_price']
        
        # Toss this user’s cart into the big pile
        all_cart_items.extend(user_cart_items)
        
        # If they spent too much, let’s call it out—how far over budget are we?
        if current_spend > budget:
            over_pct = ((current_spend - budget) / budget) * 100
            print(f"User {user_id} cart: {len(user_cart_items)} items, ${current_spend:.2f} (${current_spend - budget:.2f} or {over_pct:.1f}% over budget)")
    
    # Make a nice DataFrame out of all the cart items—like a big shopping list
    carts_df = pd.DataFrame(all_cart_items)
    
    # Add a flag to mark items that push us over budget
    carts_df['is_over_budget'] = False
    for user_id in carts_df['user_id'].unique():
        user_budget = carts_df.loc[carts_df['user_id'] == user_id, 'weekly_budget'].iloc[0]
        user_items = carts_df[carts_df['user_id'] == user_id].copy()
        user_items = user_items.sort_values('unit_price')  # Sort by price, cheap to pricey
        user_items['cumulative_spend'] = user_items['unit_price'].cumsum()  # Add up costs as we go
        over_budget_items = user_items[user_items['cumulative_spend'] > user_budget].index
        carts_df.loc[over_budget_items, 'is_over_budget'] = True  # Flag the budget-busters
    
    # All done—here’s the final cart DataFrame!
    return carts_df

In [67]:
# === Main Execution ===
# Alright, this is the big show! We’re simulating a grocery shopping spree—loading data,
# creating shoppers, filling carts, and checking who overspent. Let’s dive in!
def main():
    # Load and process items
    # First up, we’re grabbing the Coles dataset—like picking up the grocery list. Then we
    # slap on some discounts (who doesn’t love a deal?) and score items based on how much
    # we’d want them and their value. It’s like prepping the store shelves.
    items_df = load_data('/Users/rajpatel/Desktop/coles.csv')
    items_df = add_discount_and_best_price_fixed(items_df)
    items_df = calculate_preference_score(items_df)
    items_df = calculate_value_score(items_df)
    
    # Generate users
    # Time to make our shoppers! Each one gets a unique ID, a household type (like 'Family'),
    # a shopping style (like 'Budget'), and a budget. It’s like casting characters for our
    # shopping adventure.
    users_df = generate_users()
    
    # Generate shopping carts
    # Here’s where it gets fun—we’re building carts for each user based on their vibe and
    # budget. Imagine them strolling the aisles, tossing stuff in their cart!
    carts_df = generate_shopping_carts(users_df, items_df, subcat_to_group)
    
    # Save results
    # Let’s save our carts to a CSV—like hitting checkout and keeping the receipt for later.
    carts_df.to_csv('simulated_shopping_carts.csv', index=False)
    
    # Print summary statistics
    # Time to spill the tea! We’re counting users, items, and averaging cart sizes and spends.
    # It’s like the store manager’s quick recap after a busy day.
    print("\n==== Shopping Cart Simulation Summary ====")
    print(f"Total users: {users_df['user_id'].nunique()}")
    print(f"Total cart items: {len(carts_df)}")
    print(f"Average items per cart: {len(carts_df) / users_df['user_id'].nunique():.2f}")
    print(f"Average spend per cart: ${carts_df.groupby('user_id')['unit_price'].sum().mean():.2f}")
    
    # Budget analysis
    # Now, let’s see who splurged. We’re comparing what they spent to their budget, counting
    # the over-spenders, and figuring out by how much. It’s like spotting who grabbed that
    # extra ice cream tub!
    user_spending = carts_df.groupby('user_id')['unit_price'].sum()
    user_budgets = users_df.set_index('user_id')['weekly_budget']
    user_budget_diff = user_spending - user_budgets.loc[user_spending.index]
    
    over_budget_users = (user_budget_diff > 0).sum()
    over_budget_pct = over_budget_users / len(user_budget_diff) * 100
    avg_over_amount = user_budget_diff[user_budget_diff > 0].mean()
    max_over_amount = user_budget_diff.max()
    
    print("\nBudget Analysis:")
    print(f"Users over budget: {over_budget_users} ({over_budget_pct:.1f}%)")
    print(f"Average overspend amount: ${avg_over_amount:.2f}" if not np.isnan(avg_over_amount) else "Average overspend amount: $0.00")
    print(f"Maximum overspend amount: ${max_over_amount:.2f}")
    avg_pct = (user_budget_diff[user_budget_diff > 0] / user_budgets.loc[user_budget_diff[user_budget_diff > 0].index] * 100).mean()
    print(f"Average over-budget percentage: {avg_pct:.1f}%" if not np.isnan(avg_pct) else "Average over-budget percentage: 0.0%")
    
    # Product group distribution
    # Let’s peek at what’s flying off the shelves—showing the percentage of each product
    # group across all carts. Which aisle’s the busiest?
    print("\nProduct group distribution:")
    group_counts = carts_df['product_group'].value_counts(normalize=True) * 100
    for group, percentage in group_counts.items():
        print(f"{group}: {percentage:.1f}%")
    
    # Top 10 subcategories
    # Here’s the top 10 subcategories by item count—these are the crowd-pleasers!
    print("\nTop 10 most common subcategories:")
    print(carts_df['subcat'].value_counts().head(10))
    
    # Shopping style distribution
    # Breaking down the shoppers’ styles—are most of them penny-pinchers or big spenders?
    print("\nShopping style distribution:")
    style_counts = carts_df.groupby('user_id')['shopping_style'].first().value_counts(normalize=True) * 100
    for style, percentage in style_counts.items():
        print(f"{style}: {percentage:.1f}%")
    
    # Over-budget items analysis
    # If we marked over-budget items, let’s dig in—how many pushed the limit, which groups
    # they’re from, and how pricey they are. These are the budget busters!
    if 'is_over_budget' in carts_df.columns:
        over_budget_items = carts_df[carts_df['is_over_budget']]
        print("\nOver-budget items analysis:")
        print(f"Total items that push carts over budget: {len(over_budget_items)} ({len(over_budget_items)/len(carts_df)*100:.1f}% of all items)")
        if len(over_budget_items) > 0:
            print("\nTop product groups in over-budget items:")
            ob_groups = over_budget_items['product_group'].value_counts(normalize=True) * 100
            for group, pct in ob_groups.head(5).items():
                print(f"{group}: {pct:.1f}%")
            print(f"\nAverage price of over-budget items: ${over_budget_items['unit_price'].mean():.2f}")
            print(f"Average price of all items: ${carts_df['unit_price'].mean():.2f}")

# This kicks off the whole thing when we run the script—like pressing ‘start’ on our sim!
if __name__ == "__main__":
    main()

Generating carts for 5000 users...
Processing user 1/5000...
User user_3 cart: 4 items, $153.55 ($30.55 or 24.8% over budget)
User user_4 cart: 10 items, $225.43 ($34.43 or 18.0% over budget)
User user_8 cart: 11 items, $247.61 ($11.61 or 4.9% over budget)
User user_11 cart: 11 items, $127.62 ($1.62 or 1.3% over budget)
User user_15 cart: 12 items, $198.18 ($37.18 or 23.1% over budget)
User user_22 cart: 12 items, $162.92 ($23.92 or 17.2% over budget)
User user_26 cart: 12 items, $175.53 ($22.53 or 14.7% over budget)
User user_38 cart: 18 items, $248.66 ($25.66 or 11.5% over budget)
User user_56 cart: 13 items, $295.26 ($36.26 or 14.0% over budget)
User user_59 cart: 9 items, $89.73 ($15.73 or 21.3% over budget)
User user_67 cart: 8 items, $109.62 ($16.62 or 17.9% over budget)
User user_68 cart: 6 items, $115.60 ($18.60 or 19.2% over budget)
User user_70 cart: 18 items, $271.53 ($37.53 or 16.0% over budget)
User user_73 cart: 9 items, $202.62 ($31.62 or 18.5% over budget)
User user_79 

In [73]:
def trim_carts(carts_df, users_df, budget_threshold=0.9):
    """
    Trims shopping carts to fit within budget and cart size constraints by prioritizing essential items.

    Parameters:
    - carts_df (pd.DataFrame): DataFrame with cart items, including columns:
        'user_id', 'item_price', 'essential_flag', 'value_score', 'preference_score', 'price_rank'
    - users_df (pd.DataFrame): DataFrame with user info, including columns:
        'user_id', 'weekly_budget', 'cart_size'
    - budget_threshold (float): Fraction of weekly budget to use (default: 0.9)

    Returns:
    - pd.DataFrame: Updated carts_df with 'keep_item' column (1 to keep, 0 to remove)
    """
    # Print initial debug info to check the structure of input DataFrames
    print("Starting cart trimming...")
    print("carts_df Shape:", carts_df.shape)
    print("carts_df Columns:", carts_df.columns.tolist())
    print("users_df Shape:", users_df.shape)
    print("users_df Columns:", users_df.columns.tolist())
    
    # Validate that carts_df has all required columns
    required_carts_cols = ['user_id', 'item_price', 'essential_flag', 'value_score', 
                           'preference_score', 'price_rank']
    missing_carts_cols = [col for col in required_carts_cols if col not in carts_df.columns]
    if missing_carts_cols:
        print(f"Error: carts_df is missing required columns: {missing_carts_cols}")
        return carts_df  # Return original DataFrame if columns are missing
    
    # Validate that users_df has all required columns
    required_users_cols = ['user_id', 'weekly_budget', 'cart_size']
    missing_users_cols = [col for col in required_users_cols if col not in users_df.columns]
    if missing_users_cols:
        print(f"Error: users_df is missing required columns: {missing_users_cols}")
        return carts_df  # Return original DataFrame if columns are missing
    
    # Create a working copy of carts_df to avoid modifying the original
    df = carts_df.copy()
    
    # Initialize 'keep_item' column to 1, assuming all items are kept unless trimmed
    df['keep_item'] = 1
    
    # Iterate over each unique user in the carts DataFrame
    for user_id in df['user_id'].unique():
        # Extract the user's cart items into a separate DataFrame
        user_cart = df[df['user_id'] == user_id].copy()
        
        # Fetch the user's budget and cart size limit from users_df
        user_info = users_df[users_df['user_id'] == user_id]
        if user_info.empty:
            print(f"Warning: No info for user_id {user_id}")
            continue  # Skip this user if no info is found
        
        # Calculate the effective budget (e.g., 90% of weekly budget by default)
        budget = user_info['weekly_budget'].iloc[0] * budget_threshold
        max_items = user_info['cart_size'].iloc[0]  # Maximum number of items allowed
        
        # Compute the current total cost and item count of the cart
        total_cost = user_cart['item_price'].sum()
        num_items = len(user_cart)
        
        # Skip trimming if the cart is already within both budget and item limits
        if total_cost <= budget and num_items <= max_items:
            continue
        
        # Sort items to prioritize keeping essentials and preferred items
        # - 'essential_flag' descending: Keep essentials (1) before non-essentials (0)
        # - 'preference_score' descending: Keep highly preferred items first
        # - 'value_score' ascending: *Potential Error* - If higher value_score is better, 
        #   this should likely be descending to keep high-value items
        # - 'price_rank' ascending: Keep cheaper items first (lower rank = cheaper)
        user_cart = user_cart.sort_values(
            by=['essential_flag', 'preference_score', 'value_score', 'price_rank'],
            ascending=[False, False, True, True]  # Note: Check 'value_score' direction
        )
        user_cart = user_cart.reset_index()  # Reset index to preserve original indices
        
        # Calculate the cumulative cost of items in the sorted order
        user_cart['cumulative_cost'] = user_cart['item_price'].cumsum()
        
        # Determine how many items can be kept within the budget
        k_budget = (user_cart['cumulative_cost'] <= budget).sum()
        k = min(k_budget, max_items)  # Limit by both budget and max_items
        
        # If trimming is required, mark excess items for removal
        if k < len(user_cart):
            remove_indices = user_cart['index'].iloc[k:]  # Indices of items to remove
            df.loc[remove_indices, 'keep_item'] = 0  # Set 'keep_item' to 0 for removal
    
    # Print summary statistics of the trimming process
    print("keep_item Distribution:")
    print(df['keep_item'].value_counts())  # Count of items kept vs. removed
    print("Keep Percentage:", (df['keep_item'].mean() * 100).round(2), "%")  # Percentage kept
    
    return df  # Return the updated DataFrame with 'keep_item' column

In [69]:
# === XGBoost Model for Smart Cart Trimming ===
# We’re building a tool to help trim shopping carts based on a budget. Think of it as a
# virtual shopping buddy that uses XGBoost (a powerful machine learning algorithm) to
# decide which items to keep or ditch. Let’s walk through it step-by-step!

# First, we grab our tools—think of these as our data science toolkit.
import pandas as pd  # For handling data tables
import numpy as np  # For math and number crunching
from sklearn.model_selection import train_test_split  # To split data for training/testing
from sklearn.preprocessing import LabelEncoder  # To turn words into numbers
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score  # To grade our model
import xgboost as xgb  # The star of the show: XGBoost!

# === Step 1: Load and Prep the Data ===
# We’re starting with a grocery dataset (Coles) and sprucing it up with extra info like
# discounts and scores. Then, we create fake shoppers and their carts, and trim those
# carts to fit budgets—this gives us the data to teach our model.
items_df = load_data('/Users/rajpatel/Desktop/coles.csv')  # Load the grocery list
items_df = add_discount_and_best_price_fixed(items_df)  # Add sale prices and deals
items_df = calculate_preference_score(items_df)  # Score how much we like each item
items_df = calculate_value_score(items_df)  # Score how good a deal it is
users_df = generate_users()  # Make up some shoppers
carts_df = generate_shopping_carts(users_df, items_df, subcat_to_group)  # Fill their carts
carts_df = trim_carts(carts_df, users_df)  # Trim carts to budget, creating 'keep_item' (1 = keep, 0 = remove)

# === Step 2: Pick What Matters ===
# Features are the hints we give the model—like price, importance, or shopper habits.
# The target ('keep_item') is what we’re predicting: did the item stay in the cart?
features = [
    'essential_flag', 'preference_score', 'value_score', 'price_rank',
    'subcategory_weight', 'item_price', 'unit_price', 'price_percentile',
    'price_sensitivity_penalty', 'weekly_budget', 'household_type',
    'shopping_style', 'subcat', 'category'
]
target = 'keep_item'

# === Step 3: Set Up the Dataset ===
# We copy the features into X (our input) and the target into y (our goal). Then, we
# turn text like 'Family' or 'Budget' into numbers because models only speak math.
X = carts_df[features].copy()  # Features go here
y = carts_df[target]  # Target goes here

# Encode categorical stuff—think of it as translating words into a secret code for the model
categorical_cols = ['household_type', 'shopping_style', 'subcat', 'category']
label_encoders = {}  # We’ll save these to use later
for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))  # Turn text into numbers
    label_encoders[col] = le  # Store the translator

# === Step 4: Split for Training and Testing ===
# We’re splitting the data: 80% to teach the model, 20% to test it—like a practice quiz
# before the big exam. The random_state keeps it consistent.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# === Step 5: Train the XGBoost Model ===
# Now we build the model! XGBoost learns patterns from the data to predict what to keep.
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',  # It’s a yes/no choice: keep or remove
    eval_metric='logloss',  # Measures how confident the model is
    use_label_encoder=False,  # We’ve already encoded, so skip this
    random_state=42,  # Keeps results repeatable
    learning_rate=0.1,  # Learn slowly for accuracy
    max_depth=6,  # Don’t overcomplicate decisions
    n_estimators=100  # Use 100 mini-decisions to figure it out
)
xgb_model.fit(X_train, y_train)  # Teach the model with the training data

# === Step 6: Test How Good It Is ===
# Let’s see how the model does on the test data—like checking answers after a quiz.
y_pred = xgb_model.predict(X_test)  # Guess which items to keep
y_pred_proba = xgb_model.predict_proba(X_test)[:, 1]  # Get the confidence scores

# Print the report card—how well did we do?
print("Model Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")  # How often we’re right
print(f"Precision: {precision_score(y_test, y_pred):.4f}")  # How good our 'keep' guesses are
print(f"Recall: {recall_score(y_test, y_pred):.4f}")  # How many 'keep' items we caught
print(f"F1-Score: {f1_score(y_test, y_pred):.4f}")  # Balance of precision and recall
print(f"ROC-AUC: {roc_auc_score(y_test, y_pred_proba):.4f}")  # How well we separate keep vs. remove

# === Step 7: Build a Cart Trimmer ===
# This function uses the model to slim down a cart. It predicts what to keep, then
# removes the least important stuff until the budget fits.
def suggest_removals(user_cart, model, budget):
    """
    Suggest items to remove from a user's cart to fit the budget.
    
    Parameters:
    - user_cart (pd.DataFrame): Cart items for a single user
    - model: Trained XGBoost model
    - budget (float): User’s budget limit
    
    Returns:
    - pd.DataFrame: Trimmed cart
    """
    user_cart = user_cart.copy()  # Don’t mess with the original
    # Encode the cart’s text fields to match the model’s training
    for col in categorical_cols:
        user_cart[col] = label_encoders[col].transform(user_cart[col].astype(str))
    # Predict the odds of keeping each item
    user_cart['keep_prob'] = model.predict_proba(user_cart[features])[:, 1]
    # Sort by keep probability—lowest first, so we ditch the least wanted
    user_cart = user_cart.sort_values('keep_prob')
    total_cost = user_cart['item_price'].sum()  # Current cart cost
    # Keep removing items until we’re under budget
    while total_cost > budget and not user_cart.empty:
        item_to_remove = user_cart.iloc[0]  # Grab the least likely to keep
        total_cost -= item_to_remove['item_price']  # Subtract its cost
        user_cart = user_cart.iloc[1:]  # Drop it from the cart
    return user_cart

# === Step 8: Test It Out ===
# Let’s try it on user_1! We grab their cart, set a budget (90% of their weekly limit),
# trim it, and see what’s left. It’s like a real-world demo!
user_id = 'user_1'
user_cart = carts_df[carts_df['user_id'] == user_id].copy()  # Get their cart
budget = users_df[users_df['user_id'] == user_id]['weekly_budget'].iloc[0] * 0.9  # 90% of budget
trimmed_cart = suggest_removals(user_cart, xgb_model, budget)  # Trim it down



Generating carts for 5000 users...
Processing user 1/5000...
User user_1 cart: 4 items, $101.23 ($20.23 or 25.0% over budget)
User user_3 cart: 10 items, $189.56 ($9.56 or 5.3% over budget)
User user_21 cart: 16 items, $246.24 ($49.24 or 25.0% over budget)
User user_30 cart: 9 items, $123.38 ($5.38 or 4.6% over budget)
User user_34 cart: 7 items, $119.03 ($23.03 or 24.0% over budget)
User user_36 cart: 10 items, $311.37 ($60.37 or 24.1% over budget)
User user_37 cart: 17 items, $219.49 ($22.49 or 11.4% over budget)
User user_41 cart: 14 items, $218.04 ($28.04 or 14.8% over budget)
User user_43 cart: 5 items, $93.28 ($3.28 or 3.6% over budget)
User user_44 cart: 16 items, $284.50 ($49.50 or 21.1% over budget)
User user_47 cart: 11 items, $256.91 ($46.91 or 22.3% over budget)
User user_48 cart: 17 items, $215.81 ($30.81 or 16.7% over budget)
User user_51 cart: 10 items, $198.22 ($9.22 or 4.9% over budget)
User user_53 cart: 5 items, $115.73 ($15.73 or 15.7% over budget)
User user_54 cart

Parameters: { "use_label_encoder" } are not used.



Model Performance:
Accuracy: 0.9336
Precision: 0.9504
Recall: 0.9775
F1-Score: 0.9637
ROC-AUC: 0.9524


In [77]:
# This function is like your budget-savvy friend who checks your shopping cart and suggests what to ditch so you don’t overspend—it uses the XGBoost model to trim it down smartly
def suggest_removals(user_cart, model, budget, label_encoders):
    """
    Suggest items to remove from a user's cart to fit the budget.
    
    Parameters:
    - user_cart (pd.DataFrame): Cart items for a single user
    - model: Trained XGBoost model
    - budget (float): User's budget threshold
    - label_encoders (dict): Label encoders for categorical columns
    
    Returns:
    - pd.DataFrame: Trimmed cart
    - list: List of removed items with name, price, and subcategory
    """
    user_cart = user_cart.copy()
    # Ensure categorical columns are encoded consistently
    for col in categorical_cols:
        try:
            user_cart[col] = label_encoders[col].transform(user_cart[col].astype(str))
        except KeyError:
            print(f"Error: Label encoder for {col} not found or mismatch in categories.")
            return user_cart, []
    
    # Predict probability of keeping each item
    try:
        user_cart['keep_prob'] = model.predict_proba(user_cart[features])[:, 1]
    except Exception as e:
        print(f"Error predicting with model: {e}")
        return user_cart, []
    
    # Sort by keep_prob (ascending) to remove least important items first
    user_cart = user_cart.sort_values('keep_prob')
    total_cost = user_cart['item_price'].sum()
    removed_items = []
    
    # Remove items until budget is met
    while total_cost > budget and not user_cart.empty:
        item_to_remove = user_cart.iloc[0]
        removed_items.append({
            'item_name': item_to_remove['item_name'],
            'subcat': item_to_remove['subcat'],
            'item_price': item_to_remove['item_price']
        })
        total_cost -= item_to_remove['item_price']
        user_cart = user_cart.iloc[1:]
    
    return user_cart, removed_items

# Here we’re testing the trimming on 5 users—grabbing some who overspent to see how our function saves the day, and showing a before-and-after summary
print("\n=== Cart Trimming Results for 5 Users ===\n")

# Select 5 users likely to need trimming (cart cost > weekly_budget)
cart_costs = carts_df.groupby('user_id')['item_price'].sum().reset_index()
user_costs = cart_costs.merge(users_df[['user_id', 'weekly_budget']], on='user_id')
over_budget_users = user_costs[user_costs['item_price'] > user_costs['weekly_budget']]
if len(over_budget_users) >= 5:
    user_ids = over_budget_users['user_id'].sample(5, random_state=42).tolist()
else:
    print("Warning: Fewer than 5 users have cart costs exceeding their weekly budget. Selecting 5 random users.")
    user_ids = users_df['user_id'].sample(5, random_state=42).tolist()

for user_id in user_ids:
    # Get user's cart
    user_cart = carts_df[carts_df['user_id'] == user_id].copy()
    if user_cart.empty:
        print(f"\nNo cart found for {user_id}")
        continue
    
    # Get user's budget (full weekly budget)
    budget_row = users_df[users_df['user_id'] == user_id]['weekly_budget']
    if budget_row.empty:
        print(f"\nNo budget found for {user_id}")
        continue
    weekly_budget = budget_row.iloc[0]
    
    # Apply suggest_removals
    trimmed_cart, removed_items = suggest_removals(user_cart, xgb_model, weekly_budget, label_encoders)
    
    # Print results
    print(f"User {user_id} Cart Trimming:")
    print(f"Weekly Budget: ${weekly_budget:.2f}")
    print(f"Original items: {len(user_cart)}, Cost: ${user_cart['item_price'].sum():.2f}")
    print(f"Trimmed items: {len(trimmed_cart)}, Cost: ${trimmed_cart['item_price'].sum():.2f}")
    if removed_items:
        print("Removed Items:")
        for item in removed_items:
            # Decode subcategory
            try:
                subcat = label_encoders['subcat'].inverse_transform([int(item['subcat'])])[0] if item['subcat'].isdigit() else item['subcat']
            except Exception:
                subcat = item['subcat']
            print(f"- {item['item_name']} (${item['item_price']:.2f}, {subcat})")
    else:
        print("Removed Items: None")
    print()  # Empty line for readability


=== Cart Trimming Results for 5 Users ===

User user_1014 Cart Trimming:
Weekly Budget: $69.00
Original items: 10, Cost: $75.05
Trimmed items: 9, Cost: $67.55
Removed Items:
- Cranberry & Pumpkin Seed Crackers ($7.50, 29)

User user_1558 Cart Trimming:
Weekly Budget: $65.00
Original items: 8, Cost: $65.60
Trimmed items: 7, Cost: $58.10
Removed Items:
- Fig & Almond Crackers ($7.50, 29)

User user_3392 Cart Trimming:
Weekly Budget: $54.00
Original items: 10, Cost: $64.84
Trimmed items: 7, Cost: $50.64
Removed Items:
- Fudge Ice Cream 8Pack ($8.50, 58)
- Traditional Gravy Mix ($2.10, 85)
- Free Almond Long Life Milk ($3.60, 68)

User user_4401 Cart Trimming:
Weekly Budget: $52.00
Original items: 10, Cost: $78.40
Trimmed items: 7, Cost: $47.50
Removed Items:
- Hot Roast Pork ($21.00, 91)
- Crusher Original Lemon Soft Drink Bottle ($2.00, 105)
- Nespresso Caffe Verona Ca ($7.90, 26)

User user_2184 Cart Trimming:
Weekly Budget: $74.00
Original items: 9, Cost: $89.05
Trimmed items: 8, Cost