# (Exported from a Jupyter notebook cell: "In [None]:")
# =======================================================================
# Wings R Us Recommendation System
# =======================================================================
# This script implements a two-stage recommendation system for Wings R Us.
# Stage 1: Candidate Generation - Generates a list of potential items using weighted signals.
# Stage 2: Ranking - Uses LightGBM to rank candidates based on features.
#
# Key Design Choices:
# - Training: Uses ALL past months for comprehensive learning.
# - Validation: Uses the latest month to simulate real-world "future" performance.
# - Features: Focus on customer segments (e.g., Guest, Registered) for efficiency.
# - No date/time features: Ensures compatibility with test data that lacks them.
# - Manual encoding: Avoids pd.get_dummies() for explicit control.
# - Segment-based stats: Lightweight alternative to per-user computations.
#
# How to Run:
# 1. Ensure data files (order_data.csv, store_data.csv, customer_data.csv, test_data_question.csv) are in the working directory.
# 2. Install requirements: pandas, numpy, lightgbm, tqdm.
# 3. Run the script: python this_file.py
#
# Output:
# - Prints training/validation results and Recall@3.
# - Generates TeamName_Recommendation_Output.csv for competition submission.
#
# For GitHub Hosting:
# - Create a repository named "wings-r-us-recommendation".
# - Add this file as main.py.
# - Add a README.md with setup instructions and explanation.
# - Include requirements.txt: pandas\nnumpy\nlightgbm\ntqdm
#
# Author: Perplexity AI Assistant
# Date: August 11, 2025
# =======================================================================

import pandas as pd
import numpy as np
import json
import lightgbm as lgb
from collections import Counter, defaultdict
from tqdm import tqdm
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# -----------------------------------------------------------------------
# CONFIGURATION
# -----------------------------------------------------------------------
# Seed NumPy's global RNG so sampling is repeatable between runs
SEED = 42
np.random.seed(SEED)

# Training / candidate-generation sizes
NEG_PER_POS = 15        # negatives sampled for every positive example
TIME_DECAY = 0.99       # per-day decay applied to a user's past purchases
USER_AFF_TOPN = 25      # how many items of a user's history feed candidate generation
SEGMENT_TOPN = 50       # how many popular items each segment contributes
TOPK_CANDIDATES = 30    # cap on candidates produced for one cart

# Signal weights for candidate generation (larger = stronger signal)
W_COOC = 1.0      # co-occurrence with cart items
W_USER = 0.8      # user's own purchase history
W_STORE = 0.5     # popularity within the store
W_OCC = 0.4       # popularity within the occasion
W_STYPE = 0.2     # popularity within the store type
W_CTYPE = 0.2     # popularity within the customer type (segment)
W_GLOBAL = 0.1    # global popularity fallback

# State code -> region, written as region -> states and then inverted
_STATES_BY_REGION = {
    'West': ('AZ', 'CA', 'CO', 'HI', 'NV'),
    'South': ('FL', 'NC', 'OK', 'TN', 'TX'),
    'North': ('IL', 'MI', 'NE'),
    'East': ('NJ',),
    'Other': ('OT',),
}
REGION_MAP = {
    state: region
    for region, states in _STATES_BY_REGION.items()
    for state in states
}

# Category vocabularies used by the manual one-hot encoding
CUSTOMER_TYPE_CATEGORIES = ['Guest', 'Registered', 'eClub', 'Online', 'Deleted Account']
REGION_CATEGORIES = ['West', 'South', 'North', 'East', 'Other']
OCCASION_CATEGORIES = ['Delivery', 'ToGo']

for banner_line in (
    "🚀 Wings R Us Customer Segment-Based Recommendation System",
    "🎯 ALL PAST MONTHS TRAINING + LATEST MONTH VALIDATION",
    "=" * 70,
):
    print(banner_line)

# -----------------------------------------------------------------------
# 1. DATA LOADING & PREPROCESSING
# -----------------------------------------------------------------------
# Read the order, store and customer tables from the working directory
print("📁 Loading and preprocessing data...")
orders, stores, customers = (
    pd.read_csv(csv_name)
    for csv_name in ('order_data.csv', 'store_data.csv', 'customer_data.csv')
)

# Order timestamps arrive as text; parse them once up front
orders['ORDER_CREATED_DATE'] = pd.to_datetime(orders['ORDER_CREATED_DATE'])

# Attach store attributes, then customer attributes (left joins keep every order)
orders = (
    orders
    .merge(stores, on='STORE_NUMBER', how='left')
    .merge(customers, on='CUSTOMER_ID', how='left')
)

# Defaults for missing categorical values
for column, default in (('CUSTOMER_TYPE', 'Guest'), ('ORDER_OCCASION_NAME', 'ToGo')):
    orders[column] = orders[column].fillna(default)
if 'STORE_TYPE' not in orders.columns:
    orders['STORE_TYPE'] = 'Standard'

# -----------------------------------------------------------------------
# 2. MANUAL ENCODING (No Date/Time Features)
# -----------------------------------------------------------------------
# Hand-rolled one-hot encoding of customer type, region and occasion
print("🏷️ Creating manual customer segment and regional encoding...")

# Temporary region label derived from the state code (unknown states -> 'Other')
orders['REGION'] = orders['STATE'].map(REGION_MAP).fillna('Other')

# One-hot column names; kept at module level for later feature extraction
CUSTOMER_TYPE_COLS = ['customer_' + name.lower().replace(' ', '_') for name in CUSTOMER_TYPE_CATEGORIES]
REGION_COLS = ['region_' + name.lower() for name in REGION_CATEGORIES]
OCCASION_COLS = ['occasion_' + name.lower() for name in OCCASION_CATEGORIES]

# (source column, category values, indicator column names), in the order
# the indicator columns are appended to the frame
for source_column, category_values, indicator_columns in (
    ('CUSTOMER_TYPE', CUSTOMER_TYPE_CATEGORIES, CUSTOMER_TYPE_COLS),
    ('REGION', REGION_CATEGORIES, REGION_COLS),
    ('ORDER_OCCASION_NAME', OCCASION_CATEGORIES, OCCASION_COLS),
):
    for category_value, column_name in zip(category_values, indicator_columns):
        orders[column_name] = (orders[source_column] == category_value).astype(int)

# The label column was only needed to build the indicators
orders = orders.drop(columns=['REGION'])

print(f"   ✅ Added {len(CUSTOMER_TYPE_COLS)} customer segment features")
print(f"   ✅ Added {len(OCCASION_COLS)} occasion features and {len(REGION_COLS)} region features")
print(f"   🚫 Removed all date/time features for test data compatibility")

# Parse the JSON in the ORDERS column to extract item lists
def parse_items(j):
    """Return the paid item names contained in one ORDERS JSON payload.

    Each item with a positive ``item_price`` is repeated ``item_quantity``
    times (a missing or zero quantity counts as 1).

    Args:
        j: JSON text shaped like
            ``{"orders": [{"item_details": [{"item_name": ..., "item_price": ...,
            "item_quantity": ...}, ...]}, ...]}``.

    Returns:
        A list of item names. Unparseable payloads (non-string input such as
        NaN/None, invalid JSON, non-object top level) yield ``[]``; a single
        malformed item entry is skipped without discarding the rest of the order.
    """
    try:
        payload = json.loads(j)
    except (TypeError, ValueError):
        # TypeError: not str/bytes (e.g. NaN); ValueError covers JSONDecodeError
        return []

    if not isinstance(payload, dict):
        return []

    out = []
    for blk in payload.get('orders') or []:
        if not isinstance(blk, dict):
            continue
        for d in blk.get('item_details') or []:
            try:
                # Zero/negative priced lines are not counted as purchased items
                if not float(d.get('item_price', 0)) > 0:
                    continue
                qty = int(d.get('item_quantity', 1) or 1)
                name = d['item_name']
            except (AttributeError, KeyError, TypeError, ValueError):
                # Malformed entry: skip it, keep the remaining items
                continue
            out.extend([name] * qty)
    return out

# Expand every order's JSON payload into a flat list of item names
orders['ITEM_LIST'] = orders['ORDERS'].apply(parse_items)

# Keep orders with at least one paid item, then put them in chronological order
has_items = orders['ITEM_LIST'].str.len().gt(0)
orders = (
    orders.loc[has_items]
    .reset_index(drop=True)
    .sort_values('ORDER_CREATED_DATE')
    .reset_index(drop=True)
)

order_total = len(orders)
print(f"✅ Loaded {order_total:,} orders")

# -----------------------------------------------------------------------
# 3. ALL PAST MONTHS TRAINING + LATEST MONTH VALIDATION SPLIT
# -----------------------------------------------------------------------
# Assign monthly periods for splitting
print("📅 Creating ALL PAST MONTHS training + LATEST MONTH validation split...")

orders['YEAR_MONTH'] = orders['ORDER_CREATED_DATE'].dt.to_period('M')
# Drop NaT periods so the chronological sort only sees real months
months = sorted(orders['YEAR_MONTH'].dropna().unique())

if len(months) < 2:
    # Passing the message makes the interpreter print it to stderr and exit
    # with a non-zero status; a bare SystemExit would report success.
    raise SystemExit(f"⚠️ Only {len(months)} month(s) available, cannot create proper split")

# Latest month for validation, all previous for training
val_month = months[-1]
train_months = months[:-1]

val_df = orders[orders['YEAR_MONTH'] == val_month].copy()
train_df = orders[orders['YEAR_MONTH'].isin(train_months)].copy()

print(f"📊 Training months: {len(train_months)} months from {train_months[0]} to {train_months[-1]} ({len(train_df):,} orders)")
print(f"📊 Validation month: {val_month} ({len(val_df):,} orders)")
print(f"🎯 Using ALL {len(train_months)} historical months for training")

# -----------------------------------------------------------------------
# 4. SEGMENT-BASED Feature Building (No User-Level Stats)
# -----------------------------------------------------------------------
# Per-month cache of historical feature dictionaries
print("🏗️ Building SEGMENT-BASED features from historical data...")

feature_cache = {}


def get_previous_month_end(order_date):
    """Return the last calendar day of the month preceding ``order_date``.

    Args:
        order_date: a ``datetime``-like object supporting ``replace(day=...)``
            and subtraction of a ``timedelta``.

    Returns:
        A ``datetime.date`` for the final day of the previous month.
    """
    one_day = timedelta(days=1)
    start_of_month = order_date.replace(day=1)
    # The day before the 1st is, by definition, the previous month's last day
    last_of_previous = start_of_month - one_day
    return last_of_previous.date()

# Function to build features up to a cutoff date
def build_segment_based_features(cutoff_date):
    """Aggregate historical statistics from all orders dated on or before ``cutoff_date``.

    Reads the module-level ``orders`` frame (columns used: ORDER_CREATED_DATE,
    ITEM_LIST, CUSTOMER_ID, CUSTOMER_TYPE, STORE_NUMBER, ORDER_OCCASION_NAME and
    the one-hot columns named in ``REGION_COLS``).

    Args:
        cutoff_date: ``datetime.date``; orders after this day are ignored.

    Returns:
        ``None`` when no order falls on or before the cutoff, otherwise a dict with:
        ``global_support`` (item occurrences / number of orders), ``item_counts``,
        ``user_affinity`` (time-decayed item counts per customer),
        ``global_confidence_matrix`` (co-occurring baskets / item_counts of the
        antecedent), ``segment_pop`` / ``store_pop`` / ``occ_pop`` / ``region_pop``
        (item counts per group), ``segment_counts`` / ``store_counts`` /
        ``occ_counts`` / ``region_counts`` (number of orders per group) and
        ``segment_confidence_stats`` (mean/max confidence among a segment's
        15 most frequent items).
    """
    hist_df = orders[orders['ORDER_CREATED_DATE'].dt.date <= cutoff_date].copy()
    if len(hist_df) == 0:
        return None

    print(f"   📈 Building segment-based features from {len(hist_df):,} orders (up to {cutoff_date})")

    # Global item statistics (quantities count, so support can exceed 1.0)
    item_counts = Counter(item for basket in hist_df['ITEM_LIST'] for item in basket)
    total_baskets = len(hist_df)
    global_support = {item: count / total_baskets for item, count in item_counts.items()}

    # Simple user recency affinity: each purchase is weighted by
    # TIME_DECAY ** (days before the most recent historical order)
    recency_baseline = hist_df['ORDER_CREATED_DATE'].max()
    user_affinity = defaultdict(Counter)
    for cid, date, items in hist_df[['CUSTOMER_ID', 'ORDER_CREATED_DATE', 'ITEM_LIST']].itertuples(index=False):
        decay = TIME_DECAY ** (recency_baseline - date).days
        for item in items:
            user_affinity[cid][item] += decay

    # Item counts per customer segment (only for segments that have orders)
    segment_pop = {}
    for segment in CUSTOMER_TYPE_CATEGORIES:
        segment_orders = hist_df[hist_df['CUSTOMER_TYPE'] == segment]
        if len(segment_orders) > 0:
            segment_items = Counter(item for order_items in segment_orders['ITEM_LIST'] for item in order_items)
            segment_pop[segment] = dict(segment_items)

    # Item counts per store / occasion / region, plus orders per region
    store_pop = defaultdict(Counter)
    occ_pop = defaultdict(Counter)
    region_pop = defaultdict(Counter)
    region_order_counts = Counter()
    for row in hist_df.itertuples():
        # Recover the region label from the one-hot columns
        region = 'Other'
        for col in REGION_COLS:
            if getattr(row, col, 0) == 1:
                region = col.replace('region_', '').title()
                break

        region_order_counts[region] += 1
        for item in row.ITEM_LIST:
            store_pop[row.STORE_NUMBER][item] += 1
            occ_pop[row.ORDER_OCCASION_NAME][item] += 1
            region_pop[region][item] += 1

    # Denominators: number of orders in each group.  Region now uses real
    # order counts like the other groups (it used to be a constant 1, which
    # left region popularity un-normalised).
    store_counts = hist_df['STORE_NUMBER'].value_counts().to_dict()
    segment_counts = hist_df['CUSTOMER_TYPE'].value_counts().to_dict()
    occ_counts = hist_df['ORDER_OCCASION_NAME'].value_counts().to_dict()
    region_counts = dict(region_order_counts)

    # Global co-occurrence matrix over distinct items of each basket.
    # dict.fromkeys de-duplicates in first-seen order, so insertion order
    # (and therefore downstream tie-breaking) does not depend on the hash seed.
    co_occurrence = defaultdict(Counter)
    for basket in hist_df['ITEM_LIST']:
        unique_items = list(dict.fromkeys(basket))
        for idx, a in enumerate(unique_items):
            for b in unique_items[idx + 1:]:
                co_occurrence[a][b] += 1
                co_occurrence[b][a] += 1

    # Global confidence matrix: co-occurrences of (a, b) divided by item_counts[a]
    global_confidence_matrix = defaultdict(dict)
    for item_a, neighbors in co_occurrence.items():
        count_a = item_counts.get(item_a, 1)
        for item_b, cooccur_count in neighbors.items():
            if item_a != item_b:
                global_confidence_matrix[item_a][item_b] = cooccur_count / count_a

    # SEGMENT-LEVEL confidence statistics (instead of user-level)
    segment_confidence_stats = {}
    for segment in CUSTOMER_TYPE_CATEGORIES:
        if segment_counts.get(segment, 0) > 5:  # Only if enough data
            # Reuse the counts gathered above instead of recounting the segment
            top_segment_items = [item for item, _ in Counter(segment_pop[segment]).most_common(15)]

            confidences = [
                global_confidence_matrix[item_a][item_b]
                for item_a in top_segment_items
                for item_b in top_segment_items
                if item_a != item_b and item_b in global_confidence_matrix.get(item_a, {})
            ]

            segment_confidence_stats[segment] = {
                'avg': np.mean(confidences) if confidences else 0.0,
                'max': np.max(confidences) if confidences else 0.0
            }
        else:
            segment_confidence_stats[segment] = {'avg': 0.0, 'max': 0.0}

    return {
        'global_support': global_support,
        'user_affinity': dict(user_affinity),
        'global_confidence_matrix': dict(global_confidence_matrix),
        'segment_pop': segment_pop,
        'segment_confidence_stats': segment_confidence_stats,
        'segment_counts': segment_counts,
        'item_counts': dict(item_counts),
        'store_pop': dict(store_pop),
        'occ_pop': dict(occ_pop),
        'region_pop': dict(region_pop),
        'store_counts': store_counts,
        'occ_counts': occ_counts,
        'region_counts': region_counts
    }

# Build features for train and validation months
# Cache one feature dictionary per training month plus the validation month.
# Each month only sees history up to the last day of the month before it.
all_months_to_cache = [*train_months, val_month]

for month in all_months_to_cache:
    cutoff_date = get_previous_month_end(month.to_timestamp())
    features = build_segment_based_features(cutoff_date)
    if not features:
        # No history before this month (e.g. the very first month)
        continue
    feature_cache[str(month)] = features
    print(f"   ✅ Cached segment-based features for {month} (cutoff: {cutoff_date})")

# -----------------------------------------------------------------------
# 5. Candidate Generation
# -----------------------------------------------------------------------
# Function to generate candidates using weighted signals
print("Generating candidates...")
def generate_candidates(cart_items, cid, ctype, store, occ, region, feature_data):
    """Score items not yet in the cart and return the best TOPK_CANDIDATES names.

    Signals are accumulated in this order: cart co-occurrence, the customer's
    own history, store / customer-segment / occasion / region popularity
    (each divided by the group's order count), and global support.

    Args:
        cart_items: items currently in the cart; never returned as candidates.
        cid, ctype, store, occ, region: lookup keys for the customer, customer
            segment, store, order occasion and region.
        feature_data: dictionary produced by ``build_segment_based_features``;
            a falsy value yields an empty list.

    Returns:
        Item names ordered by descending accumulated score.
    """
    if not feature_data:
        return []

    scores = Counter()

    # 1. Co-occurrence with each item already in the cart
    confidence_matrix = feature_data['global_confidence_matrix']
    for cart_item in cart_items:
        for neighbor, conf in confidence_matrix.get(cart_item, {}).items():
            if neighbor in cart_items:
                continue
            scores[neighbor] += W_COOC * conf

    # 2. The customer's time-decayed purchase history
    user_affinity = feature_data['user_affinity']
    if cid in user_affinity:
        for name, affinity in Counter(user_affinity[cid]).most_common(USER_AFF_TOPN):
            if name not in cart_items:
                scores[name] += W_USER * affinity

    # 3-5. Group popularity signals: (item counts, weight, orders in the group).
    # An unknown key contributes an empty table and therefore nothing.
    popularity_signals = (
        (feature_data['store_pop'].get(store, {}), W_STORE, feature_data['store_counts'].get(store, 1)),
        (feature_data['segment_pop'].get(ctype, {}), W_CTYPE, feature_data['segment_counts'].get(ctype, 1)),
        (feature_data['occ_pop'].get(occ, {}), W_OCC, feature_data['occ_counts'].get(occ, 1)),
        (feature_data['region_pop'].get(region, {}), 0.3, feature_data['region_counts'].get(region, 1)),
    )
    for item_table, weight, denominator in popularity_signals:
        for name, count in Counter(item_table).most_common(SEGMENT_TOPN):
            if name not in cart_items:
                scores[name] += weight * count / denominator

    # 6. Global popularity as a fallback for every known item
    for name, support in feature_data['global_support'].items():
        if name not in cart_items:
            scores[name] += W_GLOBAL * support

    ranked = scores.most_common(TOPK_CANDIDATES)
    return [name for name, _ in ranked]

# -----------------------------------------------------------------------
# 6. SEGMENT-FOCUSED Feature Extraction
# -----------------------------------------------------------------------
# Function to extract features for ranking, focused on segments
print("Extracting features...")
def extract_segment_features(customer_id, input_cart, candidate_item, feature_data, order_context=None):
    """Build the ranking feature dict for one (cart, candidate item) pair.

    Args:
        customer_id: key into ``feature_data['user_affinity']``.
        input_cart: item names already in the cart.
        candidate_item: item name being scored.
        feature_data: dictionary produced by ``build_segment_based_features``;
            a falsy value yields ``{}``.
        order_context: optional dict with ``STORE_NUMBER``, ``CUSTOMER_TYPE`` and
            the one-hot columns; missing keys fall back to neutral defaults.
            When omitted (or empty), only the global/cart/item-name features
            are produced.

    Returns:
        Mapping of feature name to numeric value.
    """
    if not feature_data:
        return {}

    features = {}

    # Core global features
    features['global_support'] = feature_data['global_support'].get(candidate_item, 0.0)
    features['user_recency_score'] = feature_data['user_affinity'].get(customer_id, {}).get(candidate_item, 0.0)
    features['cart_size'] = len(input_cart)

    # Confidence cart_item -> candidate for every cart item
    confidence_matrix = feature_data['global_confidence_matrix']
    global_conf_scores = [
        confidence_matrix.get(cart_item, {}).get(candidate_item, 0.0)
        for cart_item in input_cart
        if cart_item != candidate_item
    ]

    features['global_avg_confidence_from_cart'] = float(np.mean(global_conf_scores)) if global_conf_scores else 0.0
    features['global_max_confidence_from_cart'] = float(np.max(global_conf_scores)) if global_conf_scores else 0.0

    # Item classification features from keywords in the (lower-cased) name
    candidate_lower = candidate_item.lower()
    features['is_dip'] = int('dip' in candidate_lower)
    features['is_combo'] = int('combo' in candidate_lower)
    features['is_wings'] = int('wings' in candidate_lower)
    features['is_sides'] = int(any(side in candidate_lower for side in ['fries', 'corn', 'sticks']))
    features['is_strips'] = int('strips' in candidate_lower)

    if order_context:
        # Customer SEGMENT features (instead of individual user stats)
        for col in CUSTOMER_TYPE_COLS:
            features[col] = order_context.get(col, 0)

        # SEGMENT-based confidence features
        customer_segment = order_context.get('CUSTOMER_TYPE', 'Guest')
        segment_stats = feature_data['segment_confidence_stats'].get(customer_segment, {})
        features['segment_avg_confidence'] = segment_stats.get('avg', 0.0)
        features['segment_max_confidence'] = segment_stats.get('max', 0.0)

        # Regional features
        for col in REGION_COLS:
            features[col] = order_context.get(col, 0)

        # Occasion features
        for col in OCCASION_COLS:
            features[col] = order_context.get(col, 0)

        # Cart composition: share of the four coarse categories present
        # (each cart item contributes at most one category, first match wins)
        categories = set()
        for item in set(input_cart):
            item_lower = item.lower()
            if 'wings' in item_lower:
                categories.add('wings')
            elif 'combo' in item_lower:
                categories.add('combo')
            elif 'dip' in item_lower:
                categories.add('dip')
            elif 'fries' in item_lower:
                categories.add('sides')
        features['category_diversity'] = len(categories) / 4

        # Store popularity for this item.  The fallback key must be hashable:
        # the former `{}` default raised TypeError when STORE_NUMBER was absent.
        store_items = feature_data['store_pop'].get(order_context.get('STORE_NUMBER'), {})
        features['store_popularity'] = store_items.get(candidate_item, 0) / max(sum(store_items.values()), 1)

        # Segment popularity for this item
        segment_items = feature_data['segment_pop'].get(customer_segment, {})
        features['segment_popularity'] = segment_items.get(candidate_item, 0) / max(sum(segment_items.values()), 1)

    return features

# -----------------------------------------------------------------------
# 7. Training Data Generation - ALL PAST MONTHS
# -----------------------------------------------------------------------
# Generate training data using leave-one-out with negative sampling.
# For every training order with >= 2 distinct items, each item in turn is
# held out as the positive and the remaining items form the input cart.
print("🎯 Creating segment-based training data from ALL PAST MONTHS...")

all_feature_rows = []
all_labels = []

# Process ALL training months
for month in train_months:
    month_data = train_df[train_df['YEAR_MONTH'] == month]
    # Features for a month only cover history before that month (no leakage)
    feature_data = feature_cache.get(str(month))

    if not feature_data:
        print(f"   ⚠️ No feature data for {month}, skipping...")
        continue

    print(f"   Processing {month} ({len(month_data)} orders)...")

    for row in tqdm(month_data.itertuples(), total=len(month_data), desc=f"Month {month}"):
        # De-duplicate in first-seen order.  A plain set() iterates strings in
        # a hash-seed-dependent order, which made the sampled negatives (and
        # therefore the trained model) differ between runs despite SEED.
        unique_items = list(dict.fromkeys(row.ITEM_LIST))
        if len(unique_items) < 2:
            continue

        # Extract customer segment and region
        customer_type = row.CUSTOMER_TYPE
        region = 'Other'
        for col in REGION_COLS:
            if hasattr(row, col) and getattr(row, col) == 1:
                region = col.replace('region_', '').title()
                break

        # Order context (NO DATE/TIME FEATURES)
        order_context = {
            'STORE_NUMBER': getattr(row, 'STORE_NUMBER', None),
            'CUSTOMER_TYPE': customer_type,
            **{col: getattr(row, col, 0) for col in OCCASION_COLS + REGION_COLS + CUSTOMER_TYPE_COLS}
        }

        # Leave-one-out training
        for i, positive_item in enumerate(unique_items):
            input_cart = unique_items[:i] + unique_items[i+1:]

            candidates = generate_candidates(
                input_cart, row.CUSTOMER_ID, customer_type,
                row.STORE_NUMBER, row.ORDER_OCCASION_NAME, region, feature_data
            )

            # Negative sampling: at most NEG_PER_POS generated candidates
            # other than the held-out item
            negative_candidates = [c for c in candidates if c != positive_item]
            if len(negative_candidates) > NEG_PER_POS:
                negative_candidates = np.random.choice(
                    negative_candidates, size=NEG_PER_POS, replace=False
                ).tolist()

            # Positive example (always included, even if candidate generation missed it)
            features = extract_segment_features(row.CUSTOMER_ID, input_cart, positive_item, feature_data, order_context)
            all_feature_rows.append(features)
            all_labels.append(1)

            # Negative examples
            for neg_item in negative_candidates:
                features = extract_segment_features(row.CUSTOMER_ID, input_cart, neg_item, feature_data, order_context)
                all_feature_rows.append(features)
                all_labels.append(0)

# Create DataFrame from all collected features
X_train = pd.DataFrame(all_feature_rows)
y_train = pd.Series(all_labels, name='label')
all_feature_columns = list(X_train.columns)

print(f"📊 Segment-based training data: {len(X_train):,} examples, {y_train.mean():.1%} positive")
print(f"🎛️ Using {len(all_feature_columns)} features including segment-based statistics")

# -----------------------------------------------------------------------
# 8. Model Training
# -----------------------------------------------------------------------
# Train LightGBM model on the training data
print("🤖 Training segment-focused LightGBM on full historical data...")

model = lgb.LGBMClassifier(
    objective='binary',
    random_state=SEED,
    n_estimators=200,
    learning_rate=0.05,
    num_leaves=63,
    subsample=0.9,
    # Row subsampling (bagging) is only active when the bagging frequency is
    # non-zero; without this, subsample=0.9 is silently ignored.
    subsample_freq=1,
    colsample_bytree=0.9,
    is_unbalance=True,  # re-weight classes: negatives outnumber positives
    verbose=-1
)

model.fit(X_train, y_train)


# -----------------------------------------------------------------------
# 9. Validation on Latest Month
# -----------------------------------------------------------------------
# Evaluate model on validation set using Recall@3.
# One randomly chosen distinct item is hidden from each validation order and
# the model must place it among its top-3 recommendations.
print("🎯 Validating segment-based model on latest month...")

val_feature_data = feature_cache.get(str(val_month))
hits = 0
total = 0

for row in tqdm(val_df.itertuples(), total=len(val_df), desc="Segment validation"):
    unique_cart = list(dict.fromkeys(row.ITEM_LIST))
    if len(unique_cart) < 2:
        continue

    # Single random item removal
    target_idx = np.random.randint(0, len(unique_cart))
    true_item = unique_cart[target_idx]
    input_cart = unique_cart[:target_idx] + unique_cart[target_idx+1:]
    total += 1

    # Extract customer segment and region
    customer_type = row.CUSTOMER_TYPE
    region = 'Other'
    for col in REGION_COLS:
        if hasattr(row, col) and getattr(row, col) == 1:
            region = col.replace('region_', '').title()
            break

    # Order context (NO DATE/TIME FEATURES)
    order_context = {
        'STORE_NUMBER': getattr(row, 'STORE_NUMBER', None),
        'CUSTOMER_TYPE': customer_type,
        **{col: getattr(row, col, 0) for col in OCCASION_COLS + REGION_COLS + CUSTOMER_TYPE_COLS}
    }

    candidates = generate_candidates(
        input_cart, row.CUSTOMER_ID, customer_type,
        row.STORE_NUMBER, row.ORDER_OCCASION_NAME, region, val_feature_data
    )

    # The hidden item is NOT injected into the candidate list: at prediction
    # time the answer is unknown, so adding it here would leak the label and
    # overstate Recall@3.  If candidate generation missed it, it is a miss.
    valid_candidates = [candidate for candidate in candidates if candidate not in input_cart]
    if true_item not in valid_candidates:
        continue

    # Score candidates
    candidate_features = [
        extract_segment_features(row.CUSTOMER_ID, input_cart, candidate, val_feature_data, order_context)
        for candidate in valid_candidates
    ]

    X_candidates = pd.DataFrame(candidate_features)
    # Ensure all training columns exist
    for col in all_feature_columns:
        if col not in X_candidates.columns:
            X_candidates[col] = 0

    X_candidates = X_candidates[all_feature_columns]
    scores = model.predict_proba(X_candidates)[:, 1]
    top_3_indices = np.argsort(scores)[-3:][::-1]
    top_3_recs = [valid_candidates[i] for i in top_3_indices]

    if true_item in top_3_recs:
        hits += 1

recall_at_3 = hits / total if total > 0 else 0.0

# -----------------------------------------------------------------------
# 10. Test Set Prediction
# -----------------------------------------------------------------------
# Generate predictions for test set if available
# Items recommended when the model cannot supply three suggestions
FALLBACK_RECS = ['10 pc Spicy Wings', 'Ranch Dip - Regular', 'Regular Buffalo Fries']

# Only the file read is guarded, so unrelated errors are not mistaken for a
# missing test file.
try:
    test_df = pd.read_csv('test_data_question.csv')
except FileNotFoundError:
    test_df = None
    print("⚠️ test_data_question.csv not found, skipping test predictions")

if test_df is not None:
    print("🏆 Generating test predictions with segment-based model...")

    # Encode on a working copy so the submitted columns keep their raw values
    scoring_df = test_df.copy()

    # Same missing-value defaults as the training data
    for column, default in (('CUSTOMER_TYPE', 'Guest'), ('ORDER_OCCASION_NAME', 'ToGo')):
        if column in scoring_df.columns:
            scoring_df[column] = scoring_df[column].fillna(default)
        else:
            scoring_df[column] = default

    # The raw test file is not merged with the store/customer tables, so look
    # STATE up from whichever table supplied it for the training orders.
    if 'STATE' not in scoring_df.columns:
        for lookup, key in ((stores, 'STORE_NUMBER'), (customers, 'CUSTOMER_ID')):
            if 'STATE' in lookup.columns and key in lookup.columns and key in scoring_df.columns:
                state_by_key = lookup.drop_duplicates(subset=key).set_index(key)['STATE']
                scoring_df['STATE'] = scoring_df[key].map(state_by_key)
                break

    if 'STATE' in scoring_df.columns:
        scoring_df['REGION'] = scoring_df['STATE'].map(REGION_MAP).fillna('Other')
    else:
        scoring_df['REGION'] = 'Other'

    # Manual one-hot encoding, always computed from the source columns.
    # (Previously the indicator columns were zero-filled first, which made the
    # "only if missing" encoding a no-op and left every indicator at 0.)
    for ctype in CUSTOMER_TYPE_CATEGORIES:
        column_name = f"customer_{ctype.lower().replace(' ', '_')}"
        scoring_df[column_name] = (scoring_df['CUSTOMER_TYPE'] == ctype).astype(int)

    for region_type in REGION_CATEGORIES:
        column_name = f"region_{region_type.lower()}"
        scoring_df[column_name] = (scoring_df['REGION'] == region_type).astype(int)

    for occasion_type in OCCASION_CATEGORIES:
        column_name = f"occasion_{occasion_type.lower()}"
        scoring_df[column_name] = (scoring_df['ORDER_OCCASION_NAME'] == occasion_type).astype(int)

    predictions = []
    # Most recent cached history; None makes candidate generation return []
    latest_feature_data = feature_cache.get(str(val_month))

    for row in tqdm(scoring_df.itertuples(), total=len(scoring_df), desc="Test predictions"):
        # Extract cart from test format
        cart_items = []
        for col in ('item1', 'item2', 'item3'):
            item = getattr(row, col, None)
            if pd.notna(item) and str(item) != '':
                cart_items.append(str(item))

        if not cart_items:
            predictions.append(list(FALLBACK_RECS))
            continue

        customer_id = getattr(row, 'CUSTOMER_ID', None)
        store_number = getattr(row, 'STORE_NUMBER', None)
        customer_type = row.CUSTOMER_TYPE
        region = row.REGION

        order_context = {
            'STORE_NUMBER': store_number,
            'CUSTOMER_TYPE': customer_type,
            **{col: getattr(row, col, 0) for col in OCCASION_COLS + REGION_COLS + CUSTOMER_TYPE_COLS}
        }

        candidates = generate_candidates(
            cart_items, customer_id, customer_type,
            store_number, row.ORDER_OCCASION_NAME, region, latest_feature_data
        )

        # Score candidates
        valid_candidates = [candidate for candidate in candidates if candidate not in cart_items]
        top_3_recs = []
        if valid_candidates:
            X_candidates = pd.DataFrame([
                extract_segment_features(customer_id, cart_items, candidate, latest_feature_data, order_context)
                for candidate in valid_candidates
            ])
            # Align with the training columns; columns absent here become 0
            X_candidates = X_candidates.reindex(columns=all_feature_columns, fill_value=0)
            scores = model.predict_proba(X_candidates)[:, 1]

            # Get top-3 recommendations (highest score first)
            top_3_indices = np.argsort(scores)[-3:][::-1]
            top_3_recs = [valid_candidates[i] for i in top_3_indices]

        # Top up to three suggestions with fallbacks not already used or in the cart
        for fallback in FALLBACK_RECS:
            if len(top_3_recs) >= 3:
                break
            if fallback not in top_3_recs and fallback not in cart_items:
                top_3_recs.append(fallback)

        predictions.append(top_3_recs)

    # Save in competition format
    output_df = test_df.copy()

    output_df['RECOMMENDATION_1'] = [pred[0] if len(pred) > 0 else '' for pred in predictions]
    output_df['RECOMMENDATION_2'] = [pred[1] if len(pred) > 1 else '' for pred in predictions]
    output_df['RECOMMENDATION_3'] = [pred[2] if len(pred) > 2 else '' for pred in predictions]

    # Correct column order (columns missing from the test file are skipped)
    required_cols = [
        'CUSTOMER_ID', 'STORE_NUMBER', 'ORDER_ID', 'ORDER_CHANNEL_NAME',
        'ORDER_SUBCHANNEL_NAME', 'ORDER_OCCASION_NAME', 'CUSTOMER_TYPE',
        'item1', 'item2', 'item3',
        'RECOMMENDATION_1', 'RECOMMENDATION_2', 'RECOMMENDATION_3'
    ]

    output_cols = [col for col in required_cols if col in output_df.columns]
    final_output = output_df[output_cols]

    final_output.to_csv('TeamName_Recommendation_Output.csv', index=False)
    print(f"💾 Saved competition output to TeamName_Recommendation_Output.csv")

# -----------------------------------------------------------------------
# 11. Final Results Summary
# -----------------------------------------------------------------------
# Console report: headline metric, data volumes, coverage and feature importance
rule = "=" * 70
month_count = len(train_months)

print("\n" + rule)
print("🎯 ENHANCED ALL-HISTORY RECOMMENDATION SYSTEM RESULTS")
print(rule)
for summary_line in (
    f"📈 Enhanced Recall@3: {recall_at_3:.4f} ({hits:,}/{total:,})",
    f"🎛️ Total Features: {len(all_feature_columns)}",
    f"📊 Training: {len(train_df):,} orders from ALL {month_count} historical months",
    f"📊 Validation: {len(val_df):,} orders from latest month {val_month}",
    f"📈 Training data improvement: {month_count} months vs previous 2-month limit",
):
    print(summary_line)

print(f"\n🏷️ Enhanced Training Coverage:")
for coverage_line in (
    f"   📅 Historical months used: {month_count} (from {train_months[0]} to {train_months[-1]})",
    f"   📊 Customer Segments: {CUSTOMER_TYPE_CATEGORIES}",
    f"   🌍 Regions: {REGION_CATEGORIES}",
    f"   📱 Occasions: {OCCASION_CATEGORIES}",
    f"   🎯 Key Features: segment_avg_confidence, segment_max_confidence, segment_popularity",
):
    print(coverage_line)

# Feature importance, highest first (only if the fitted model exposes it)
if hasattr(model, 'feature_importances_'):
    print("\n📊 Top 15 Feature Importance:")
    importance_df = pd.DataFrame({
        'feature': all_feature_columns,
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=False)

    for entry in importance_df.head(15).itertuples(index=False):
        print(f"   {entry.feature}: {entry.importance:.4f}")

for closing_line in (
    f"\n✅ Enhanced system using ALL available historical data for training",
    f"✅ Validation on most recent month for realistic performance assessment",
    f"✅ Customer segment-based features optimized for production deployment",
    rule,
):
    print(closing_line)
