In [None]:
"""
XGBoost-Only Product Price Prediction Solution

Key Features:
1. Advanced feature engineering with brand extraction
2. Cross-validation for stability
3. Better handling of different price ranges
4. Single XGBoost model for faster training
"""

import os
import re
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import KFold
from sklearn.preprocessing import PowerTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import xgboost as xgb


def extract_advanced_features(df):
    """Build hand-crafted numeric price features from catalog text.

    Each row's ``catalog_content`` is cast to ``str`` and parsed with regexes
    and keyword lookups. Every per-row parse (pack size, weight, volume,
    dimensions, numeric literals) is performed exactly once and all derived
    columns are computed from that single result.

    Args:
        df: DataFrame containing a ``catalog_content`` column.

    Returns:
        DataFrame sharing ``df``'s index with 66 numeric columns covering
        pack quantity, weight (grams), volume (millilitres), dimensions,
        brand strength, text statistics, premium/budget keyword counts,
        category flags and their pack interactions, material scores,
        numeric-literal statistics and a few cross-feature interactions.
    """
    text = df['catalog_content'].astype(str)
    # Lower-cased once; shared by every keyword-membership feature below.
    text_lower = text.str.lower()
    features = pd.DataFrame(index=text.index)

    def sum_converted(t, unit_table):
        """Sum every ``<number><unit>`` hit in ``t`` after unit conversion."""
        t = t.lower()
        total = 0
        # Table order is preserved so float accumulation order is stable.
        for pattern, factor in unit_table:
            for match in pattern.finditer(t):
                total += float(match.group(1)) * factor
        return total

    def keyword_hits(keywords):
        """Count how many of ``keywords`` occur as substrings in each row."""
        return text_lower.apply(lambda x: sum(1 for kw in keywords if kw in x))

    # ========== PACK QUANTITY (Critical Feature) ==========
    # Patterns are tried in priority order; the first in-range hit wins.
    pack_patterns = [re.compile(p) for p in (
        r'pack\s*of\s*(\d+)',
        r'(\d+)\s*pack',
        r'(\d+)\s*count',
        r'count[:\s]*(\d+)',
        r'(\d+)\s*(?:piece|pcs|units?)',
        r'set\s*of\s*(\d+)',
        r'(\d+)\s*in\s*1',
        r'quantity[:\s]*(\d+)',
    )]

    def extract_pack_qty(t):
        t = t.lower()
        for pattern in pack_patterns:
            match = pattern.search(t)
            if match:
                qty = int(match.group(1))
                # Out-of-range hits are ignored and later patterns get a try.
                if 0 < qty <= 1000:
                    return qty
        return 1

    features['pack_qty'] = text.apply(extract_pack_qty)
    features['pack_qty_log'] = np.log1p(features['pack_qty'])
    features['pack_qty_sqrt'] = np.sqrt(features['pack_qty'])
    features['is_multi_pack'] = (features['pack_qty'] > 1).astype(int)
    # pack_qty is always within [1, 1000], so every row lands in a bin.
    features['pack_tier'] = pd.cut(features['pack_qty'],
                                    bins=[0, 1, 3, 6, 12, 24, 1000],
                                    labels=[0, 1, 2, 3, 4, 5]).astype(int)

    # ========== WEIGHT EXTRACTION ==========
    # (pattern, grams per unit): pounds, ounces, kilograms, grams.
    weight_units = [
        (re.compile(r'(\d+\.?\d*)\s*(?:lb|pound)s?'), 453.592),
        (re.compile(r'(\d+\.?\d*)\s*oz(?!one)'), 28.3495),
        (re.compile(r'(\d+\.?\d*)\s*kg'), 1000),
        (re.compile(r'(\d+\.?\d*)\s*g(?!\w)'), 1),
    ]

    features['weight_g'] = text.apply(lambda t: sum_converted(t, weight_units))
    features['weight_log'] = np.log1p(features['weight_g'])
    features['has_weight'] = (features['weight_g'] > 0).astype(int)
    features['weight_per_pack'] = features['weight_g'] / features['pack_qty']
    features['weight_per_pack_log'] = np.log1p(features['weight_per_pack'])

    # ========== VOLUME EXTRACTION ==========
    # (pattern, millilitres per unit): liters, milliliters, gallons, fl oz.
    volume_units = [
        (re.compile(r'(\d+\.?\d*)\s*l(?:iter)?s?(?!\w)'), 1000),
        (re.compile(r'(\d+\.?\d*)\s*ml'), 1),
        (re.compile(r'(\d+\.?\d*)\s*gal(?:lon)?s?'), 3785.41),
        (re.compile(r'(\d+\.?\d*)\s*fl\.?\s*oz'), 29.5735),
    ]

    features['volume_ml'] = text.apply(lambda t: sum_converted(t, volume_units))
    features['volume_log'] = np.log1p(features['volume_ml'])
    features['has_volume'] = (features['volume_ml'] > 0).astype(int)
    features['volume_per_pack'] = features['volume_ml'] / features['pack_qty']
    features['volume_per_pack_log'] = np.log1p(features['volume_per_pack'])

    # ========== SIZE/DIMENSIONS ==========
    dims_3d = re.compile(r'(\d+\.?\d*)\s*x\s*(\d+\.?\d*)\s*x\s*(\d+\.?\d*)')
    dims_unit = re.compile(r'(\d+\.?\d*)\s*(?:inch|cm|mm|ft)')

    def extract_max_dim(t):
        """Largest number seen in an ``A x B x C`` triple or before a length unit."""
        t = t.lower()
        dims = [float(g) for m in dims_3d.finditer(t) for g in m.groups()]
        dims.extend(float(m.group(1)) for m in dims_unit.finditer(t))
        return max(dims) if dims else 0

    features['max_dim'] = text.apply(extract_max_dim)
    features['dim_log'] = np.log1p(features['max_dim'])
    features['has_dimensions'] = (features['max_dim'] > 0).astype(int)

    # ========== BRAND EXTRACTION (Important!) ==========
    premium_brands = ['apple', 'samsung', 'sony', 'nike', 'adidas', 'lego',
                      'nestlé', 'loreal', 'dove', 'organic', 'natural']
    leading_capitalized = re.compile(r'^[A-Z][a-z]+')

    def extract_brand_strength(t):
        t_lower = t.lower()
        score = sum(1 for brand in premium_brands if brand in t_lower)
        # A capitalized first word is treated as a likely brand name; this
        # check needs the original casing, not the lowered text.
        if leading_capitalized.match(t):
            score += 2
        return score

    features['brand_strength'] = text.apply(extract_brand_strength)

    # ========== TEXT FEATURES ==========
    features['text_len'] = text.str.len()
    features['text_len_log'] = np.log1p(features['text_len'])
    features['word_count'] = text.str.split().str.len()
    features['word_count_log'] = np.log1p(features['word_count'])
    # The +1 in the denominators avoids division by zero on empty text.
    features['avg_word_len'] = features['text_len'] / (features['word_count'] + 1)
    features['digit_count'] = text.apply(lambda x: sum(c.isdigit() for c in x))
    features['digit_ratio'] = features['digit_count'] / (features['text_len'] + 1)
    features['upper_count'] = text.apply(lambda x: sum(c.isupper() for c in x))
    features['upper_ratio'] = features['upper_count'] / (features['text_len'] + 1)

    # ========== PREMIUM/QUALITY INDICATORS ==========
    # NOTE: matching is by substring, so e.g. 'pro' also hits "product".
    premium_kw = ['premium', 'luxury', 'deluxe', 'professional', 'pro', 'organic',
                  'natural', 'gourmet', 'artisan', 'exclusive', 'elite', 'designer',
                  'imported', 'signature', 'collection', 'certified', 'authentic']
    features['premium_count'] = keyword_hits(premium_kw)

    budget_kw = ['budget', 'value', 'affordable', 'economy', 'basic', 'cheap',
                 'discount', 'clearance', 'sale']
    features['budget_count'] = keyword_hits(budget_kw)

    # ========== CATEGORY INDICATORS ==========
    categories = {
        'electronics': ['electronic', 'digital', 'wireless', 'bluetooth', 'smart', 'usb', 'cable'],
        'food': ['food', 'snack', 'coffee', 'tea', 'chocolate', 'candy', 'nutrition', 'protein'],
        'beauty': ['beauty', 'cosmetic', 'makeup', 'skincare', 'lotion', 'shampoo', 'soap'],
        'health': ['vitamin', 'supplement', 'health', 'medical', 'capsule', 'tablet'],
        'baby': ['baby', 'infant', 'toddler', 'diaper', 'newborn', 'children'],
        'pet': ['pet', 'dog', 'cat', 'animal', 'bird', 'fish'],
        'home': ['home', 'kitchen', 'bedding', 'towel', 'furniture', 'decor'],
        'clothing': ['shirt', 'pant', 'dress', 'wear', 'fashion', 'clothing'],
        'toy': ['toy', 'game', 'puzzle', 'play', 'doll', 'action figure'],
        'book': ['book', 'novel', 'paperback', 'hardcover', 'journal']
    }

    # All cat_* flags are written first, then all *_pack interactions, so the
    # column order matches what downstream positional stacking expects.
    for cat, keywords in categories.items():
        features[f'cat_{cat}'] = text_lower.apply(
            lambda x, kws=keywords: int(any(kw in x for kw in kws))
        )

    for cat in categories:
        features[f'{cat}_pack'] = features[f'cat_{cat}'] * features['pack_qty_log']

    # ========== MATERIAL INDICATORS ==========
    expensive_materials = {
        'gold': 10, 'platinum': 10, 'diamond': 10, 'silver': 8,
        'leather': 7, 'silk': 8, 'cashmere': 9, 'wool': 6,
        'stainless steel': 6, 'titanium': 8, 'ceramic': 5
    }

    cheap_materials = {
        'plastic': 2, 'rubber': 2, 'paper': 1, 'cardboard': 1
    }

    def material_score(weights):
        """Sum the weights of every material name found in each row."""
        return text_lower.apply(
            lambda x: sum(weight for mat, weight in weights.items() if mat in x)
        )

    features['expensive_material_score'] = material_score(expensive_materials)
    features['cheap_material_score'] = material_score(cheap_materials)
    features['material_price_indicator'] = (
        features['expensive_material_score'] - features['cheap_material_score']
    )

    # ========== NUMBER EXTRACTION ==========
    number_pattern = re.compile(r'\d+\.?\d*')

    def extract_numbers(t):
        """Return every numeric literal in ``t`` strictly between 0 and 10000."""
        values = (float(n) for n in number_pattern.findall(t))
        return [v for v in values if 0 < v < 10000]

    # Parse each row once; all five statistics reuse the same list.
    numbers = text.apply(extract_numbers)
    features['num_count'] = numbers.map(len)
    features['max_num'] = numbers.apply(lambda ns: max(ns) if ns else 0)
    features['min_num'] = numbers.apply(lambda ns: min(ns) if ns else 0)
    features['mean_num'] = numbers.apply(lambda ns: np.mean(ns) if ns else 0)
    features['std_num'] = numbers.apply(lambda ns: np.std(ns) if len(ns) > 1 else 0)

    # ========== ADVANCED INTERACTIONS ==========
    # Total content (weight or volume)
    features['total_content'] = features['weight_g'] + features['volume_ml']
    features['total_content_log'] = np.log1p(features['total_content'])

    # Price indicators
    features['bulk_indicator'] = features['pack_qty'] * features['total_content_log']
    features['premium_bulk'] = features['premium_count'] * features['bulk_indicator']
    features['quality_weight'] = features['brand_strength'] * features['weight_log']

    # Category-specific features
    features['food_weight'] = features['cat_food'] * features['weight_log']
    features['electronics_premium'] = features['cat_electronics'] * features['premium_count']
    features['beauty_volume'] = features['cat_beauty'] * features['volume_log']

    return features


class XGBoostPricePredictionModel:
    """Price regressor: TF-IDF/SVD text features plus manual features, fed to
    a K-fold ensemble of XGBoost models trained on log1p(price).

    Typical use: ``prepare_features(train, is_train=True)`` ->
    ``train_with_cv(X, y)`` -> ``prepare_features(test, is_train=False)`` ->
    ``predict(X_test)``.
    """

    def __init__(self):
        # Word 1- to 3-gram TF-IDF, capped at 4000 terms.
        self.tfidf = TfidfVectorizer(
            max_features=4000,
            ngram_range=(1, 3),
            min_df=3,
            max_df=0.9,
            strip_accents='unicode',
            sublinear_tf=True
        )
        # Dense 200-component projection of the sparse TF-IDF matrix.
        self.svd = TruncatedSVD(n_components=200, random_state=42)
        # Applied to the combined (SVD + manual) feature matrix.
        self.scaler = PowerTransformer(method='yeo-johnson', standardize=True)
        # One fitted XGBRegressor per CV fold; filled by train_with_cv().
        self.models = []

    def prepare_features(self, df, is_train=True):
        """Turn raw catalog rows into the scaled model input matrix.

        Args:
            df: DataFrame with a ``catalog_content`` column.
            is_train: When True the TF-IDF, SVD and scaler are fitted on
                ``df``; when False the previously fitted transformers are
                reused, so a training call must have happened first.

        Returns:
            2-D array: SVD text components followed by the manual features,
            all passed through the power transformer.
        """
        # Text cleaning: lower-case and replace punctuation with spaces.
        cleaned_text = df['catalog_content'].astype(str).apply(
            lambda x: re.sub(r'[^\w\s]', ' ', x.lower()).strip()
        )

        # TF-IDF
        if is_train:
            tfidf_feat = self.tfidf.fit_transform(cleaned_text)
            text_feat = self.svd.fit_transform(tfidf_feat)
        else:
            tfidf_feat = self.tfidf.transform(cleaned_text)
            text_feat = self.svd.transform(tfidf_feat)

        # Manual features
        print("   Extracting advanced features...")
        manual_feat = extract_advanced_features(df)

        # Combine (positional stacking: column order must match across calls).
        all_feat = np.hstack([text_feat, manual_feat.values])

        # Scale
        if is_train:
            all_feat = self.scaler.fit_transform(all_feat)
        else:
            all_feat = self.scaler.transform(all_feat)

        print(f"   Total features: {all_feat.shape[1]}")
        return all_feat

    def train_with_cv(self, X, y, n_folds=5):
        """Fit one XGBoost model per fold and report out-of-fold SMAPE.

        Args:
            X: Feature matrix supporting integer-array row indexing.
            y: Prices aligned row-for-row with ``X``; any array-like
                (including a pandas Series with an arbitrary index).
            n_folds: Number of shuffled K-fold splits.

        Returns:
            SMAPE (as a fraction, not a percentage) of the out-of-fold
            predictions over all rows. The fitted fold models replace
            ``self.models``.
        """
        # Positional array: a pandas Series would otherwise be indexed by
        # label rather than by the positional indices KFold yields.
        y = np.asarray(y)

        kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

        oof_preds = np.zeros(len(y))
        xgb_models = []

        for fold, (train_idx, val_idx) in enumerate(kf.split(X), 1):
            print(f"\n{'='*60}")
            print(f"Fold {fold}/{n_folds}")
            print(f"{'='*60}")

            X_tr, X_val = X[train_idx], X[val_idx]
            y_tr, y_val = y[train_idx], y[val_idx]

            # Log transform target
            y_tr_log = np.log1p(y_tr)
            y_val_log = np.log1p(y_val)

            # XGBoost
            print("Training XGBoost...")
            xgb_model = xgb.XGBRegressor(
                n_estimators=2000,
                learning_rate=0.02,
                max_depth=10,
                min_child_weight=3,
                subsample=0.8,
                colsample_bytree=0.8,
                reg_alpha=1.0,
                reg_lambda=1.0,
                random_state=42,
                verbosity=0
            )
            # The validation fold is passed as an eval set; no early-stopping
            # argument is set here.
            xgb_model.fit(
                X_tr, y_tr_log,
                eval_set=[(X_val, y_val_log)],
                verbose=False
            )
            # Back-transform to price space before scoring.
            oof_preds[val_idx] = np.expm1(xgb_model.predict(X_val))
            xgb_models.append(xgb_model)

            # Fold SMAPE
            fold_smape = self.calculate_smape(y_val, oof_preds[val_idx])
            print(f"\nFold {fold} XGBoost SMAPE: {fold_smape*100:.2f}%")

        # Store models
        self.models = xgb_models

        # Calculate overall CV score
        cv_smape = self.calculate_smape(y, oof_preds)

        print(f"\n{'='*60}")
        print(f"Overall CV SMAPE: {cv_smape*100:.2f}%")
        print(f"{'='*60}")

        return cv_smape

    def predict(self, X):
        """Predict prices by averaging the fold models in price space.

        Args:
            X: Feature matrix produced by ``prepare_features(..., is_train=False)``.

        Returns:
            1-D array of predictions clipped to the range [0.1, 3000].

        Raises:
            RuntimeError: If no fold models have been trained yet.
        """
        # Averaging an empty list would only emit a (globally suppressed)
        # warning and return NaN, so fail loudly instead.
        if not self.models:
            raise RuntimeError(
                "No trained models available; call train_with_cv() first."
            )

        # Get predictions from all folds
        preds = np.mean([np.expm1(model.predict(X)) for model in self.models], axis=0)
        preds = np.clip(preds, 0.1, 3000)

        return preds

    @staticmethod
    def calculate_smape(y_true, y_pred):
        """Symmetric mean absolute percentage error, as a fraction.

        Args:
            y_true: Array-like of actual values.
            y_pred: Array-like of predictions, same length as ``y_true``.

        Returns:
            ``mean(|y_true - y_pred| / ((|y_true| + |y_pred|) / 2))``; a tiny
            epsilon in the denominator keeps 0-vs-0 pairs at zero error.
        """
        # Coerce so plain lists / Series work as well as ndarrays.
        y_true = np.asarray(y_true, dtype=float)
        y_pred = np.asarray(y_pred, dtype=float)
        denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
        diff = np.abs(y_true - y_pred)
        return np.mean(diff / (denominator + 1e-10))


def main():
    """Run the end-to-end pipeline: load, trim, featurize, cross-validate,
    predict and write the submission CSV.

    Reads ``train.csv`` and ``test.csv`` from ``./dataset/`` (the code uses
    the ``catalog_content`` and ``price`` columns of train and the
    ``catalog_content`` and ``sample_id`` columns of test) and writes
    ``test_out_xgboost.csv`` with ``sample_id`` and ``price`` columns.
    """

    DATASET_FOLDER = './dataset/'
    OUTPUT_FILE = 'test_out_xgboost.csv'
    # Single source of truth for the fold count so the progress banner can
    # never disagree with the number of folds actually trained.
    N_FOLDS = 5

    print("=" * 70)
    print("XGBoost-Only Product Pricing Solution")
    print("=" * 70)

    # Load data
    print("\n1. Loading data...")
    train_df = pd.read_csv(os.path.join(DATASET_FOLDER, 'train.csv'))
    test_df = pd.read_csv(os.path.join(DATASET_FOLDER, 'test.csv'))

    print(f"   Train: {len(train_df):,} | Test: {len(test_df):,}")
    print(f"   Price range: ${train_df['price'].min():.2f} - ${train_df['price'].max():.2f}")
    print(f"   Price median: ${train_df['price'].median():.2f}")

    # Remove extreme outliers: keep prices within the 1st-99th percentiles.
    q1 = train_df['price'].quantile(0.01)
    q99 = train_df['price'].quantile(0.99)
    mask = (train_df['price'] >= q1) & (train_df['price'] <= q99)
    print(f"   Removing {(~mask).sum()} extreme outliers...")
    # Reset the index so rows line up positionally with the feature matrix.
    train_df = train_df[mask].reset_index(drop=True)

    # Initialize model
    model = XGBoostPricePredictionModel()

    # Prepare features (fits the text/scale transformers on train).
    print("\n2. Feature engineering...")
    X_train = model.prepare_features(train_df, is_train=True)
    y_train = train_df['price'].values

    # Train with CV
    print(f"\n3. Training with {N_FOLDS}-Fold Cross-Validation...")
    cv_score = model.train_with_cv(X_train, y_train, n_folds=N_FOLDS)

    # Prepare test features (reuses the transformers fitted above).
    print("\n4. Preparing test features...")
    X_test = model.prepare_features(test_df, is_train=False)

    # Predict
    print("\n5. Making predictions...")
    test_predictions = model.predict(X_test)

    # Save
    print("\n6. Creating submission...")
    submission = pd.DataFrame({
        'sample_id': test_df['sample_id'],
        'price': test_predictions
    })
    submission.to_csv(OUTPUT_FILE, index=False)

    print(f"\n✓ Saved to '{OUTPUT_FILE}'")
    print(f"\nPrediction stats:")
    print(f"   Min: ${test_predictions.min():.2f}")
    print(f"   Max: ${test_predictions.max():.2f}")
    print(f"   Median: ${np.median(test_predictions):.2f}")
    print(f"   Mean: ${np.mean(test_predictions):.2f}")

    # The out-of-fold CV score is reported as the estimate of test SMAPE.
    print("\n" + "=" * 70)
    print(f"✓ COMPLETE! Expected test SMAPE: {cv_score*100:.1f}%")
    print("=" * 70)

# Run the pipeline only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()

XGBoost-Only Product Pricing Solution

1. Loading data...
   Train: 75,000 | Test: 75,000
   Price range: $0.13 - $2796.00
   Price median: $14.00
   Removing 1488 extreme outliers...

2. Feature engineering...
   Extracting advanced features...
   Total features: 266

3. Training with 5-Fold Cross-Validation...

Fold 1/8
Training XGBoost...

Fold 1 XGBoost SMAPE: 50.69%

Fold 2/8
Training XGBoost...

Fold 2 XGBoost SMAPE: 50.24%

Fold 3/8
Training XGBoost...

Fold 3 XGBoost SMAPE: 50.11%

Fold 4/8
Training XGBoost...

Fold 4 XGBoost SMAPE: 50.90%

Fold 5/8
Training XGBoost...

Fold 5 XGBoost SMAPE: 49.91%

Fold 6/8
Training XGBoost...

Fold 6 XGBoost SMAPE: 50.23%

Fold 7/8
Training XGBoost...

Fold 7 XGBoost SMAPE: 50.30%

Fold 8/8
Training XGBoost...

Fold 8 XGBoost SMAPE: 50.17%

Overall CV SMAPE: 50.32%

4. Preparing test features...
   Extracting advanced features...
   Total features: 266

5. Making predictions...

6. Creating submission...

✓ Saved to 'test_out_xgboost.csv'

Pr