# Amazon ML Challenge 2025 - Price Prediction (Optimized Solution)
**Goal:** Predict product prices from catalog content with SMAPE < 40%

**Strategy:**
- Advanced feature engineering (target encoding, text features, categories)
- Ensemble modeling (LightGBM + XGBoost + CatBoost)
- Weighted averaging based on validation performance

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_absolute_error
import lightgbm as lgb
import xgboost as xgb
import catboost as cb

from tqdm import tqdm
tqdm.pandas()

# Configuration
# One seed shared by the split and the models below so reruns are repeatable;
# it also seeds NumPy's global RNG.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Plot styling: apply the matplotlib style sheet first, then set the seaborn
# palette on top of it.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

print("✓ All libraries imported successfully!")

In [None]:
# Read the train/test CSV files from the current working directory.
train_df, test_df = (pd.read_csv(path) for path in ('train.csv', 'test.csv'))

# Quick sanity summary: row counts plus the spread of the target column.
_n_train, _n_test = len(train_df), len(test_df)
print(f"Training samples: {_n_train:,}")
print(f"Test samples: {_n_test:,}")

_price = train_df['price']
print(f"\nTarget (price) range: ${_price.min():.2f} - ${_price.max():.2f}")
print(f"Mean price: ${_price.mean():.2f}")

## Feature Engineering Pipeline
Extract comprehensive features from catalog content

In [None]:
# Patterns and keyword tables are built once at import time:
# extract_all_features runs once per catalog row, so rebuilding them on every
# call is wasted work. All patterns expect lowercased text.
_VALUE_RE = re.compile(r'value:\s*(\d+(?:\.\d+)?|\.\d+)')
_UNIT_RE = re.compile(r'unit:\s*(\w+(?:[^\S\n]+\w+)?)')
_PACK_RE = re.compile(r'pack of (\d+)|(\d+)\s*pack')
_ITEM_NAME_RE = re.compile(r'item name:\s*([^,\n]+?)(?:,|\n)')
_BULLET_RE = re.compile(r'bullet point \d+:')

_PREMIUM_KEYWORDS = (
    'premium', 'luxury', 'deluxe', 'gourmet', 'organic', 'natural',
    'professional', 'artisan', 'imported', 'certified', 'exclusive',
)

# (feature name, substrings that switch the flag on)
_CATEGORY_KEYWORDS = (
    ('is_food', ('food', 'snack', 'beverage', 'coffee', 'tea', 'spice')),
    ('is_electronics', ('electronic', 'battery', 'charger', 'cable')),
    ('is_health', ('health', 'beauty', 'vitamin', 'supplement', 'care')),
    ('is_home', ('home', 'kitchen', 'household', 'cleaning')),
)


def extract_all_features(text):
    """Extract all relevant features from catalog content.

    Args:
        text: Raw catalog text for one product. ``None`` and float NaN are
            treated as an empty document (the same convention the TF-IDF
            step uses) instead of being featurised as the literal text "nan".
            Any other value is converted with ``str()``.

    Returns:
        dict mapping feature name to value, in a fixed key order:
        ``value``, ``unit``, ``pack_size``, ``total_quantity``, ``brand``,
        ``char_count``, ``word_count``, ``bullet_count``, ``avg_word_len``,
        ``premium_count``, ``is_food``, ``is_electronics``, ``is_health``,
        ``is_home``, ``has_ounce``, ``has_pound``, ``has_gram``,
        ``digit_count``, ``upper_ratio``. ``unit`` and ``brand`` are strings
        ('unknown' when not found); everything else is numeric.

    Malformed fields never raise: an unparseable ``Value:`` yields 0.0 and a
    blank ``Item Name:`` yields brand 'unknown'.
    """
    # NaN is the only float that is not equal to itself.
    is_missing = text is None or (isinstance(text, float) and text != text)
    raw = '' if is_missing else str(text)
    text_str = raw.lower()
    features = {}

    # 1. Quantity features
    # The pattern only accepts a well-formed decimal, so float() cannot fail
    # on captures such as "." or "1.2.3" (the latter is read as 1.2).
    value_match = _VALUE_RE.search(text_str)
    features['value'] = float(value_match.group(1)) if value_match else 0.0

    # At most two words, both on the same line: the separator excludes "\n"
    # so the unit cannot swallow the first word of the following line.
    unit_match = _UNIT_RE.search(text_str)
    features['unit'] = unit_match.group(1) if unit_match else 'unknown'

    pack_match = _PACK_RE.search(text_str)
    if pack_match:
        features['pack_size'] = int(pack_match.group(1) or pack_match.group(2))
    else:
        features['pack_size'] = 1
    features['total_quantity'] = features['value'] * features['pack_size']

    # 2. Brand extraction: first word of the item name.
    name_match = _ITEM_NAME_RE.search(text_str)
    name_tokens = name_match.group(1).split() if name_match else []
    # A whitespace-only item name produces no tokens; fall back instead of
    # indexing into an empty list.
    features['brand'] = name_tokens[0] if name_tokens else 'unknown'

    # 3. Text statistics
    words = text_str.split()
    features['char_count'] = len(text_str)
    features['word_count'] = len(words)
    features['bullet_count'] = len(_BULLET_RE.findall(text_str))
    features['avg_word_len'] = (
        sum(len(word) for word in words) / len(words) if words else 0.0
    )

    # 4. Premium keywords (count occurrences)
    features['premium_count'] = sum(text_str.count(kw) for kw in _PREMIUM_KEYWORDS)

    # 5. Category detection (substring match, one 0/1 flag per category)
    for flag_name, keywords in _CATEGORY_KEYWORDS:
        features[flag_name] = int(any(kw in text_str for kw in keywords))

    # 6. Size indicators
    features['has_ounce'] = int('ounce' in text_str or ' oz' in text_str)
    features['has_pound'] = int('pound' in text_str or ' lb' in text_str)
    features['has_gram'] = int('gram' in text_str or ' g ' in text_str)

    # 7. Quality indicators
    features['digit_count'] = sum(c.isdigit() for c in text_str)
    # Uppercase share is measured on the original (non-lowercased) text.
    features['upper_ratio'] = sum(c.isupper() for c in raw) / len(raw) if raw else 0

    return features

# Extract features
# extract_all_features returns one dict per row. Building the frame directly
# from the list of dicts avoids `.apply(pd.Series)`, which constructs a
# separate Series object for every row and dominates runtime on large frames.
# The original index is passed through so the concat below stays row-aligned.
print("Extracting features from training data...")
_train_records = train_df['catalog_content'].progress_apply(extract_all_features)
train_features = pd.DataFrame(_train_records.tolist(), index=_train_records.index)
train_df = pd.concat([train_df, train_features], axis=1)

print("Extracting features from test data...")
_test_records = test_df['catalog_content'].progress_apply(extract_all_features)
test_features = pd.DataFrame(_test_records.tolist(), index=_test_records.index)
test_df = pd.concat([test_df, test_features], axis=1)

print(f"✓ Extracted {len(train_features.columns)} features")

In [None]:
# Target encoding for brand (CRITICAL for performance)
#
# Train rows get OUT-OF-FOLD statistics: each row is encoded using prices from
# the other folds only. Encoding a row with statistics that include its own
# price leaks the target (for a brand seen once, brand_mean == price), which
# makes the validation score look far better than what the test set will see.
# Test rows are encoded with statistics from the full training set.
# Note: out-of-fold counts are computed on (folds-1)/folds of the data, so
# train brand_count runs slightly lower than the test-side count.
print("Creating brand target encoding...")
BRAND_TE_FOLDS = 5
brand_aggs = [
    ('brand_mean', 'mean'),
    ('brand_median', 'median'),
    ('brand_std', 'std'),
    ('brand_count', 'count')
]
brand_cols = [col_name for col_name, _ in brand_aggs]

# Full-train statistics: applied to the test set only.
brand_stats = train_df.groupby('brand')['price'].agg(brand_aggs).reset_index()
# Dropping stale columns first keeps the cell safe to re-run (no _x/_y suffixes).
test_df = test_df.drop(columns=brand_cols, errors='ignore').merge(
    brand_stats, on='brand', how='left'
)

# Deterministic fold assignment that does not touch NumPy's global RNG state.
_brand_fold_id = (
    np.random.RandomState(RANDOM_SEED).permutation(len(train_df)) % BRAND_TE_FOLDS
)
_brand_oof = np.full((len(train_df), len(brand_cols)), np.nan)
for _fold in range(BRAND_TE_FOLDS):
    _held_out = _brand_fold_id == _fold
    _fold_stats = train_df.loc[~_held_out].groupby('brand')['price'].agg(brand_aggs)
    # reindex looks up each held-out row's brand; brands absent from the
    # other folds come back as NaN and are filled below.
    _brand_oof[_held_out] = (
        _fold_stats.reindex(train_df.loc[_held_out, 'brand'].to_numpy())[brand_cols]
        .to_numpy(dtype=float)
    )
for _position, _col in enumerate(brand_cols):
    train_df[_col] = _brand_oof[:, _position]

# Fill missing values (unseen brands, or std of a single observation) the same
# way in train and test. Reassigning instead of `df[col].fillna(inplace=True)`
# matters: the chained in-place form is a no-op under pandas Copy-on-Write.
brand_fill = {
    col: train_df[col].median()
    for col in ['brand_mean', 'brand_median', 'brand_std']
}
brand_fill['brand_count'] = 1
train_df = train_df.fillna(value=brand_fill)
test_df = test_df.fillna(value=brand_fill)

print("✓ Brand encoding complete")

In [None]:
# Target encoding for unit
#
# Train rows get OUT-OF-FOLD statistics (each row is encoded from the prices
# of the other folds only) so a row's own price never leaks into its features;
# rare units would otherwise be encoded with (almost) their own target.
# Test rows are encoded with statistics from the full training set.
print("Creating unit target encoding...")
UNIT_TE_FOLDS = 5
unit_aggs = [
    ('unit_mean', 'mean'),
    ('unit_median', 'median'),
    ('unit_count', 'count')
]
unit_cols = [col_name for col_name, _ in unit_aggs]

# Full-train statistics: applied to the test set only.
unit_stats = train_df.groupby('unit')['price'].agg(unit_aggs).reset_index()
# Dropping stale columns first keeps the cell safe to re-run (no _x/_y suffixes).
test_df = test_df.drop(columns=unit_cols, errors='ignore').merge(
    unit_stats, on='unit', how='left'
)

# Deterministic fold assignment that does not touch NumPy's global RNG state.
_unit_fold_id = (
    np.random.RandomState(RANDOM_SEED).permutation(len(train_df)) % UNIT_TE_FOLDS
)
_unit_oof = np.full((len(train_df), len(unit_cols)), np.nan)
for _fold in range(UNIT_TE_FOLDS):
    _held_out = _unit_fold_id == _fold
    _fold_stats = train_df.loc[~_held_out].groupby('unit')['price'].agg(unit_aggs)
    # Units absent from the other folds come back as NaN and are filled below.
    _unit_oof[_held_out] = (
        _fold_stats.reindex(train_df.loc[_held_out, 'unit'].to_numpy())[unit_cols]
        .to_numpy(dtype=float)
    )
for _position, _col in enumerate(unit_cols):
    train_df[_col] = _unit_oof[:, _position]

# Same fill values for train and test. Reassigning instead of
# `df[col].fillna(inplace=True)` matters: the chained in-place form is a no-op
# under pandas Copy-on-Write.
unit_fill = {col: train_df[col].median() for col in ['unit_mean', 'unit_median']}
unit_fill['unit_count'] = 1
train_df = train_df.fillna(value=unit_fill)
test_df = test_df.fillna(value=unit_fill)

print("✓ Unit encoding complete")

In [None]:
# TF-IDF features
print("Creating TF-IDF features...")
_tfidf_settings = dict(
    max_features=1000,
    stop_words='english',
    ngram_range=(1, 3),
    min_df=3,
    max_df=0.9,
    sublinear_tf=True,
)
tfidf = TfidfVectorizer(**_tfidf_settings)

# Missing catalog text is vectorised as an empty document.
_train_text = train_df['catalog_content'].fillna('')
_test_text = test_df['catalog_content'].fillna('')

# Vocabulary and IDF weights are learned from the training text only;
# the test text is projected onto that same vocabulary.
tfidf_train = tfidf.fit_transform(_train_text)
tfidf_test = tfidf.transform(_test_text)

_n_tfidf = tfidf_train.shape[1]
print(f"✓ TF-IDF: {_n_tfidf} features")

In [None]:
# Prepare feature matrix
# Column order below is the order of the numeric block in X_train / X_test.
feature_cols = (
    ['value', 'pack_size', 'total_quantity']
    + ['brand_mean', 'brand_median', 'brand_std', 'brand_count']
    + ['unit_mean', 'unit_median', 'unit_count']
    + ['char_count', 'word_count', 'bullet_count', 'avg_word_len']
    + ['premium_count', 'is_food', 'is_electronics', 'is_health', 'is_home']
    + ['has_ounce', 'has_pound', 'has_gram', 'digit_count', 'upper_ratio']
)

# Numeric features first, then the densified TF-IDF columns, side by side.
X_train_num = train_df[feature_cols].to_numpy()
X_test_num = test_df[feature_cols].to_numpy()
X_train = np.concatenate([X_train_num, tfidf_train.toarray()], axis=1)
X_test = np.concatenate([X_test_num, tfidf_test.toarray()], axis=1)

# Models are trained on log1p(price); predictions are mapped back with expm1.
y_train = np.log1p(train_df['price'].to_numpy())

print(f"Feature matrix: {X_train.shape} (train), {X_test.shape} (test)")

In [None]:
# Train/validation split: 15% of the rows are held out, seeded for repeatability.
_split_parts = train_test_split(
    X_train,
    y_train,
    test_size=0.15,
    random_state=RANDOM_SEED,
)
X_tr, X_val, y_tr, y_val = _split_parts
_n_tr, _n_val = X_tr.shape[0], X_val.shape[0]
print(f"Train: {_n_tr:,}, Validation: {_n_val:,}")

## Ensemble Modeling
Train three gradient-boosting models (LightGBM, XGBoost, CatBoost) and combine their predictions.

In [None]:
def calculate_smape(y_true, y_pred):
    """Calculate Symmetric Mean Absolute Percentage Error.

    SMAPE = mean(|y_true - y_pred| / ((|y_true| + |y_pred|) / 2)) * 100

    Args:
        y_true: Actual values (array-like: ndarray, list, Series, ...).
        y_pred: Predicted values, broadcastable against ``y_true``.

    Returns:
        SMAPE as a percentage in the range [0, 200]. A pair where both the
        actual and the predicted value are 0 counts as a perfect prediction
        (0% error) instead of turning the whole score into NaN via 0/0.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    abs_error = np.abs(y_true - y_pred)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    # The denominator is 0 only when both values are 0, in which case the
    # numerator is 0 too; dividing by 1 there yields the intended 0 error.
    safe_denominator = np.where(denominator == 0, 1.0, denominator)
    return np.mean(abs_error / safe_denominator) * 100

print("✓ SMAPE function ready")

In [None]:
# Model 1: LightGBM
print("Training LightGBM...")
params_lgb = dict(
    objective='regression',
    metric='mae',
    boosting_type='gbdt',
    num_leaves=63,
    learning_rate=0.03,
    feature_fraction=0.85,
    bagging_fraction=0.85,
    bagging_freq=5,
    min_child_samples=20,
    reg_alpha=0.1,
    reg_lambda=0.1,
    verbose=-1,
    random_state=RANDOM_SEED,
    n_jobs=-1,
)

train_lgb = lgb.Dataset(X_tr, label=y_tr)
val_lgb = lgb.Dataset(X_val, label=y_val, reference=train_lgb)

# Stop after 100 rounds without improvement; log every 200 rounds.
_lgb_callbacks = [lgb.early_stopping(100), lgb.log_evaluation(200)]
model_lgb = lgb.train(
    params_lgb,
    train_lgb,
    num_boost_round=2000,
    valid_sets=[train_lgb, val_lgb],
    valid_names=['train', 'valid'],
    callbacks=_lgb_callbacks,
)

# The model predicts log1p(price); convert predictions and targets back to
# price space before scoring.
_val_log_pred_lgb = model_lgb.predict(X_val, num_iteration=model_lgb.best_iteration)
y_val_pred_lgb = np.expm1(_val_log_pred_lgb)
y_val_actual = np.expm1(y_val)
smape_lgb = calculate_smape(y_val_actual, y_val_pred_lgb)
print(f"✓ LightGBM - SMAPE: {smape_lgb:.2f}%")

In [None]:
# Model 2: XGBoost
print("Training XGBoost...")
params_xgb = {
    'objective': 'reg:squarederror', 'eval_metric': 'mae',
    'max_depth': 7, 'learning_rate': 0.03, 'subsample': 0.85,
    'colsample_bytree': 0.85, 'min_child_weight': 3,
    'reg_alpha': 0.1, 'reg_lambda': 0.1, 'tree_method': 'hist',
    'random_state': RANDOM_SEED, 'n_jobs': -1
}

dtrain = xgb.DMatrix(X_tr, label=y_tr)
dval = xgb.DMatrix(X_val, label=y_val)

# 'valid' is listed last in `evals`, so it is the set early stopping monitors.
model_xgb = xgb.train(
    params_xgb, dtrain, num_boost_round=2000,
    evals=[(dtrain, 'train'), (dval, 'valid')],
    early_stopping_rounds=100, verbose_eval=200
)

# `best_iteration` is a 0-based index and `iteration_range` is half-open
# [begin, end), so the end must be best_iteration + 1; without the +1 the
# best tree itself is left out of the prediction.
y_val_pred_xgb = np.expm1(
    model_xgb.predict(dval, iteration_range=(0, model_xgb.best_iteration + 1))
)
smape_xgb = calculate_smape(y_val_actual, y_val_pred_xgb)
print(f"✓ XGBoost - SMAPE: {smape_xgb:.2f}%")

In [None]:
# Model 3: CatBoost
print("Training CatBoost...")
_catb_settings = dict(
    iterations=2000,
    learning_rate=0.03,
    depth=7,
    l2_leaf_reg=3,
    random_seed=RANDOM_SEED,
    verbose=200,
    early_stopping_rounds=100,
    task_type='CPU',
    thread_count=-1,
)
model_catb = cb.CatBoostRegressor(**_catb_settings)

# The validation split is passed as eval_set so early stopping can act on it.
model_catb.fit(X_tr, y_tr, eval_set=(X_val, y_val), verbose=200)

# Predictions are in log1p space; map back to prices before scoring.
_val_log_pred_catb = model_catb.predict(X_val)
y_val_pred_catb = np.expm1(_val_log_pred_catb)
smape_catb = calculate_smape(y_val_actual, y_val_pred_catb)
print(f"✓ CatBoost - SMAPE: {smape_catb:.2f}%")

In [None]:
# Ensemble weights: each model is weighted by the inverse of its validation
# SMAPE, normalised so the three weights sum to 1 (lower error -> more weight).
_inverse_smape = np.array([1/smape_lgb, 1/smape_xgb, 1/smape_catb])
weights = _inverse_smape / _inverse_smape.sum()

_w_lgb, _w_xgb, _w_catb = weights
print(f"\nEnsemble weights: LGB={_w_lgb:.3f}, XGB={_w_xgb:.3f}, CAT={_w_catb:.3f}")

# Weighted average of the three validation predictions (in price space).
_val_preds = (y_val_pred_lgb, y_val_pred_xgb, y_val_pred_catb)
y_val_pred_ens = sum(w * pred for w, pred in zip(weights, _val_preds))

smape_ens = calculate_smape(y_val_actual, y_val_pred_ens)
_best_single = min(smape_lgb, smape_xgb, smape_catb)
print(f"\n🎯 Ensemble SMAPE: {smape_ens:.2f}%")
print(f"Improvement: {_best_single - smape_ens:.2f}%")

## Final Training & Predictions

In [None]:
# Train final models on full data
# No validation set is available here, so each model is trained for the
# number of rounds that early stopping selected on the validation split.
print("Training final models on full training data...")

# LightGBM: best_iteration is reported as a 1-based round count, so it can be
# used directly as the number of boosting rounds.
full_lgb = lgb.Dataset(X_train, label=y_train)
final_lgb = lgb.train(params_lgb, full_lgb, num_boost_round=model_lgb.best_iteration)

# XGBoost: best_iteration is a 0-based index, so the matching number of
# rounds is best_iteration + 1 (otherwise the final model is one tree short).
dtrain_full = xgb.DMatrix(X_train, label=y_train)
final_xgb = xgb.train(
    params_xgb, dtrain_full, num_boost_round=model_xgb.best_iteration + 1
)

# CatBoost: best_iteration_ is also a 0-based index, hence the + 1.
final_catb = cb.CatBoostRegressor(
    iterations=model_catb.best_iteration_ + 1, learning_rate=0.03,
    depth=7, l2_leaf_reg=3, random_seed=RANDOM_SEED, verbose=0
)
final_catb.fit(X_train, y_train)

print("✓ All models trained on full data")

In [None]:
# Generate ensemble predictions
print("Generating ensemble predictions...")

dtest = xgb.DMatrix(X_test)

# The final models were trained without early stopping, so every tree is used.
# In particular `final_xgb.best_iteration` must not be read here: XGBoost only
# defines it when early stopping ran (newer versions raise AttributeError),
# and on older versions `iteration_range=(0, best_iteration)` silently
# dropped the last tree.
y_test_lgb = np.expm1(final_lgb.predict(X_test))
y_test_xgb = np.expm1(final_xgb.predict(dtest))
y_test_catb = np.expm1(final_catb.predict(X_test))

# Same validation-derived weights as the validation ensemble.
y_test_final = (
    weights[0] * y_test_lgb +
    weights[1] * y_test_xgb +
    weights[2] * y_test_catb
)

# expm1 of a negative log-prediction is negative; clamp to a small positive
# price so every submitted value is > 0.
y_test_final = np.maximum(y_test_final, 0.01)  # Ensure positive

print(f"Predictions: {len(y_test_final):,}")
print(f"Range: ${y_test_final.min():.2f} - ${y_test_final.max():.2f}")
print(f"Mean: ${y_test_final.mean():.2f}")

In [None]:
# Create submission: one row per test sample, id column followed by the price.
submission = test_df[['sample_id']].assign(price=y_test_final)

# Validate shape, column layout and value sanity before writing anything.
_submitted_price = submission['price']
assert len(submission) == len(test_df)
assert submission.columns.tolist() == ['sample_id', 'price']
assert (_submitted_price > 0).all()
assert not _submitted_price.isna().any()

# Save
submission.to_csv('test_out.csv', index=False)

_banner = "=" * 60
print("\n" + _banner)
print("✅ SUBMISSION READY: test_out.csv")
print(f"Expected SMAPE: ~{smape_ens:.2f}%")
print(_banner)

## Summary

**Key Improvements:**
1. ✅ Target encoding for brand & unit (captures price patterns)
2. ✅ Comprehensive text features (1000 TF-IDF + metadata)
3. ✅ Category detection (food, electronics, health, home)
4. ✅ Ensemble of 3 models weighted by inverse validation SMAPE
5. ✅ Log transformation for better price distribution

**Expected Performance:** SMAPE < 40%