In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import joblib
import traceback
from sklearn.ensemble import StackingRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, make_scorer
import warnings
warnings.filterwarnings('ignore')

# Set up plotting style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

print("Libraries imported successfully!")

Libraries imported successfully!


In [13]:
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, make_scorer
import joblib

# Define SMAPE function and scorer
def smape(actual, predicted):
    """
    Calculate Symmetric Mean Absolute Percentage Error (SMAPE)
    """
    actual = np.array(actual)
    predicted = np.array(predicted)
    
    # Handle cases where both actual and predicted are zero
    denominator = (np.abs(actual) + np.abs(predicted))
    # Add small epsilon to avoid division by zero
    denominator = np.where(denominator == 0, 1e-10, denominator)
    
    smape_value = (100 / len(actual)) * np.sum(2 * np.abs(predicted - actual) / denominator)
    return smape_value

# Create SMAPE scorer for sklearn (lower is better)
smape_scorer = make_scorer(
    lambda y_true, y_pred: smape(y_true, y_pred),
    greater_is_better=False
)

# Feature extraction functions
def extract_features_from_catalog(catalog_text):
    """
    Extract structured features from the catalog_content field
    """
    features = {
        'item_name': '',
        'pack_size': '',
        'weight': 0.0,
        'volume': 0.0,
        'count': 0.0,
        'unit': '',
        'flavor': '',
        'brand': '',
        'is_organic': 0,
        'is_gluten_free': 0,
        'is_vegan': 0,
        'is_kosher': 0,
        'is_sugar_free': 0,
        'is_low_carb': 0,
        'is_non_gmo': 0,
        'calories_per_serving': 0.0,
        'protein_content': 0.0,
        'fiber_content': 0.0
    }
    
    if pd.isna(catalog_text):
        return features
    
    text = str(catalog_text).lower()
    
    # Extract item name
    item_name_match = re.search(r'item name:\s*([^\n]+)', catalog_text, re.IGNORECASE)
    if item_name_match:
        features['item_name'] = item_name_match.group(1).strip()
    
    # Extract pack size information
    pack_patterns = [
        r'pack of\s*(\d+)',
        r'(\d+)\s*count',
        r'(\d+)\s*pack',
        r'(\d+)\s*ct'
    ]
    for pattern in pack_patterns:
        match = re.search(pattern, text)
        if match:
            features['pack_size'] = match.group(1)
            break
    
    # Extract weight
    weight_patterns = [
        r'(\d+\.?\d*)\s*oz',
        r'(\d+\.?\d*)\s*ounce',
        r'(\d+\.?\d*)\s*lb',
        r'(\d+\.?\d*)\s*pound'
    ]
    for pattern in weight_patterns:
        match = re.search(pattern, text)
        if match:
            try:
                features['weight'] = float(match.group(1))
            except:
                features['weight'] = 0.0
            break
    
    # Extract volume
    volume_patterns = [
        r'(\d+\.?\d*)\s*fl\s*oz',
        r'(\d+\.?\d*)\s*fluid\s*ounce',
        r'(\d+\.?\d*)\s*ml',
        r'(\d+\.?\d*)\s*liter'
    ]
    for pattern in volume_patterns:
        match = re.search(pattern, text)
        if match:
            try:
                features['volume'] = float(match.group(1))
            except:
                features['volume'] = 0.0
            break
    
    # Extract count
    count_patterns = [
        r'(\d+)\s*tea bags',
        r'(\d+)\s*capsules',
        r'(\d+)\s*pods',
        r'(\d+)\s*cookies'
    ]
    for pattern in count_patterns:
        match = re.search(pattern, text)
        if match:
            try:
                features['count'] = float(match.group(1))
            except:
                features['count'] = 0.0
            break
    
    # Extract unit from the structured Unit field
    unit_match = re.search(r'unit:\s*([^\n]+)', catalog_text, re.IGNORECASE)
    if unit_match:
        features['unit'] = unit_match.group(1).strip().lower()
    
    # Extract flavor information
    flavor_keywords = ['vanilla', 'chocolate', 'strawberry', 'lemon', 'mint', 'berry', 
                      'caramel', 'honey', 'spice', 'cinnamon', 'ginger', 'peach']
    for flavor in flavor_keywords:
        if flavor in text:
            features['flavor'] = flavor
            break
    
    # Extract brand names
    brand_patterns = [
        r'manufacturer:\s*([^\n]+)',
        r'brand:\s*([^\n]+)'
    ]
    for pattern in brand_patterns:
        match = re.search(pattern, catalog_text, re.IGNORECASE)
        if match:
            features['brand'] = match.group(1).strip()
            break
    
    # Health and dietary attributes
    features['is_organic'] = 1 if any(term in text for term in ['organic', 'usda organic']) else 0
    features['is_gluten_free'] = 1 if 'gluten free' in text else 0
    features['is_vegan'] = 1 if 'vegan' in text else 0
    features['is_kosher'] = 1 if 'kosher' in text else 0
    features['is_sugar_free'] = 1 if any(term in text for term in ['sugar free', 'no sugar', 'zero sugar']) else 0
    features['is_low_carb'] = 1 if any(term in text for term in ['low carb', 'keto', 'keto-friendly']) else 0
    features['is_non_gmo'] = 1 if any(term in text for term in ['non-gmo', 'non gmo']) else 0
    
    # Extract nutritional information
    calorie_match = re.search(r'(\d+)\s*calories', text)
    if calorie_match:
        try:
            features['calories_per_serving'] = float(calorie_match.group(1))
        except:
            features['calories_per_serving'] = 0.0
    
    protein_match = re.search(r'(\d+\.?\d*)\s*g\s*protein', text)
    if protein_match:
        try:
            features['protein_content'] = float(protein_match.group(1))
        except:
            features['protein_content'] = 0.0
    
    fiber_match = re.search(r'(\d+\.?\d*)\s*g\s*fiber', text)
    if fiber_match:
        try:
            features['fiber_content'] = float(fiber_match.group(1))
        except:
            features['fiber_content'] = 0.0
    
    return features

def create_derived_features(df, has_price=True):
    """
    Create derived features, handling the case where price might be missing
    """
    # Product category based on item name
    def categorize_product(item_name):
        if pd.isna(item_name):
            return 'other'
        item_name = str(item_name).lower()
        if any(word in item_name for word in ['tea', 'chai']):
            return 'tea'
        elif any(word in item_name for word in ['coffee', 'brew']):
            return 'coffee'
        elif any(word in item_name for word in ['snack', 'chip', 'cracker', 'cookie']):
            return 'snack'
        elif any(word in item_name for word in ['sauce', 'dressing', 'oil']):
            return 'condiment'
        elif any(word in item_name for word in ['spice', 'seasoning']):
            return 'spice'
        elif any(word in item_name for word in ['candy', 'chocolate']):
            return 'candy'
        elif any(word in item_name for word in ['pasta', 'rice', 'grain']):
            return 'grain'
        else:
            return 'other'
    
    df['product_category'] = df['item_name'].apply(categorize_product)
    
    # Text length features
    df['catalog_content_length'] = df['catalog_content'].str.len().fillna(0)
    
    # Only create price-related features if price is available
    if has_price and 'price' in df.columns:
        if 'weight' in df.columns:
            weight_safe = df['weight'].replace(0, np.nan)
            df['price_per_oz'] = df['price'] / weight_safe
            df['price_per_oz'] = df['price_per_oz'].replace([np.inf, -np.inf], np.nan)
        
        if 'volume' in df.columns:
            volume_safe = df['volume'].replace(0, np.nan)
            df['price_per_fl_oz'] = df['price'] / volume_safe
            df['price_per_fl_oz'] = df['price_per_fl_oz'].replace([np.inf, -np.inf], np.nan)
        
        if 'count' in df.columns:
            count_safe = df['count'].replace(0, np.nan)
            df['price_per_count'] = df['price'] / count_safe
            df['price_per_count'] = df['price_per_count'].replace([np.inf, -np.inf], np.nan)
    
    return df

class RobustSMAPEStackingPricePredictor:
    def __init__(self):
        self.scaler = StandardScaler()
        self.imputer = SimpleImputer(strategy='median')
        self.feature_columns = []
        self.expected_columns = []
        self.stacking_model = None
        self.is_trained = False
        self.best_params_ = None
        
    def preprocess_data(self, df, is_training=True):
        """
        Preprocess the data: extract features, handle missing values, encode categorical variables
        """
        print("Step 1: Extracting features from catalog content...")
        
        # Make a copy to avoid modifying original data
        df_processed = df.copy()
        
        # Extract features
        catalog_features = df_processed['catalog_content'].apply(extract_features_from_catalog)
        features_df = pd.DataFrame(catalog_features.tolist(), index=df_processed.index)
        
        # Concatenate with original data
        df_enhanced = pd.concat([df_processed, features_df], axis=1)
        
        # Ensure all expected numerical columns exist with proper defaults
        numerical_columns = ['weight', 'volume', 'count', 'calories_per_serving', 'protein_content', 'fiber_content']
        for col in numerical_columns:
            if col not in df_enhanced.columns:
                df_enhanced[col] = 0.0
            else:
                df_enhanced[col] = pd.to_numeric(df_enhanced[col], errors='coerce').fillna(0.0)
        
        # Ensure all expected boolean columns exist
        boolean_columns = ['is_organic', 'is_gluten_free', 'is_vegan', 'is_kosher', 
                          'is_sugar_free', 'is_low_carb', 'is_non_gmo']
        for col in boolean_columns:
            if col not in df_enhanced.columns:
                df_enhanced[col] = 0
            else:
                df_enhanced[col] = df_enhanced[col].astype(int)
        
        # Handle categorical columns
        categorical_columns = ['unit', 'flavor', 'brand', 'item_name']
        for col in categorical_columns:
            if col not in df_enhanced.columns:
                df_enhanced[col] = 'unknown'
            else:
                df_enhanced[col] = df_enhanced[col].fillna('unknown')
        
        # Create derived features
        has_price = is_training and 'price' in df_enhanced.columns
        df_enhanced = create_derived_features(df_enhanced, has_price=has_price)
        
        # Handle outliers in price if it exists and we're training
        if is_training and 'price' in df_enhanced.columns:
            Q1 = df_enhanced['price'].quantile(0.25)
            Q3 = df_enhanced['price'].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = max(0, Q1 - 1.5 * IQR)  # Ensure lower bound is not negative
            upper_bound = Q3 + 1.5 * IQR
            
            df_enhanced['price'] = np.where(df_enhanced['price'] > upper_bound, upper_bound, df_enhanced['price'])
            df_enhanced['price'] = np.where(df_enhanced['price'] < lower_bound, lower_bound, df_enhanced['price'])
        
        return df_enhanced
    
    def prepare_features(self, df, is_training=True):
        """
        Prepare features for modeling with robust error handling
        """
        # Define core features that should always be present
        core_features = [
            'weight', 'volume', 'count', 'catalog_content_length',
            'is_organic', 'is_gluten_free', 'is_vegan', 'is_kosher', 
            'is_sugar_free', 'is_low_carb', 'is_non_gmo'
        ]
        
        # Add price per unit features if available (only in training)
        if is_training:
            price_per_features = [col for col in df.columns if col.startswith('price_per')]
            feature_columns = core_features + price_per_features
        else:
            feature_columns = core_features.copy()
        
        # Ensure all feature columns exist in the dataframe
        missing_features = set(feature_columns) - set(df.columns)
        for feature in missing_features:
            df[feature] = 0.0  # Add missing features with default value
        
        # Select only the feature columns we want
        feature_columns = [col for col in feature_columns if col in df.columns]
        X = df[feature_columns].copy()
        
        # Replace infinite values with NaN
        X = X.replace([np.inf, -np.inf], np.nan)
        
        if is_training:
            self.feature_columns = feature_columns
            self.expected_columns = feature_columns.copy()
            
            # Fit imputer and scaler on training data
            X_imputed = self.imputer.fit_transform(X)
            
            # Scale numerical features
            numerical_cols = [col for col in X.columns if X[col].dtype in ['float64', 'float32', 'int64']]
            if numerical_cols:
                X_imputed = self.scaler.fit_transform(X_imputed)
            
            X_processed = pd.DataFrame(X_imputed, columns=feature_columns, index=X.index)
            
            if 'price' in df.columns:
                y = df['price'].copy()
                return X_processed, y
            else:
                return X_processed
        else:
            # Ensure we have the same columns as training
            missing_cols = set(self.expected_columns) - set(X.columns)
            extra_cols = set(X.columns) - set(self.expected_columns)
            
            for col in missing_cols:
                X[col] = 0.0
            for col in extra_cols:
                X = X.drop(col, axis=1)
            
            # Reorder columns to match training
            X = X[self.expected_columns]
            
            # Transform using fitted imputer and scaler
            X_imputed = self.imputer.transform(X)
            X_imputed = self.scaler.transform(X_imputed)
            
            X_processed = pd.DataFrame(X_imputed, columns=self.expected_columns, index=X.index)
            return X_processed
    
    def build_stacking_regressor(self):
        """
        Build a StackingRegressor with multiple base models
        """
        # Base models with robust parameters
        base_models = [
            ('ridge', Ridge(alpha=1.0, random_state=42)),
            ('lasso', Lasso(alpha=0.1, random_state=42)),
            ('rf', RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)),
            ('gbm', GradientBoostingRegressor(n_estimators=100, random_state=42)),
            ('svr', SVR(kernel='rbf', C=1.0))
        ]
        
        # Meta-model
        meta_model = LinearRegression()
        
        # Create StackingRegressor
        stacking_regressor = StackingRegressor(
            estimators=base_models,
            final_estimator=meta_model,
            cv=5,
            passthrough=False,
            n_jobs=-1
        )
        
        return stacking_regressor
    
    def _calculate_aggregated_feature_importance(self):
        """
        Calculate feature importance by aggregating importances from all base models
        """
        # Initialize importance array
        aggregated_importance = np.zeros(len(self.feature_columns))
        
        # Get importances from each base model
        for name, model in self.stacking_model.estimators_:
            try:
                if hasattr(model, 'feature_importances_'):
                    # Tree-based models (RandomForest, GradientBoosting)
                    importance = model.feature_importances_
                elif hasattr(model, 'coef_'):
                    # Linear models (Ridge, Lasso)
                    importance = abs(model.coef_)
                else:
                    # Models without feature importance (SVR) - skip
                    continue
                
                # Ensure the importance array matches our feature dimensions
                if len(importance) == len(self.feature_columns):
                    aggregated_importance += importance
                    
            except Exception as e:
                print(f"Warning: Could not get importance from {name}: {e}")
                continue
        
        # Normalize the importance scores
        if aggregated_importance.sum() > 0:
            aggregated_importance = aggregated_importance / aggregated_importance.sum()
        
        # Create DataFrame
        feature_importance_df = pd.DataFrame({
            'feature': self.feature_columns,
            'importance': aggregated_importance
        }).sort_values('importance', ascending=False)
        
        return feature_importance_df
    
    def train(self, train_df, perform_tuning=True):
        print("Starting SMAPE-tuned StackingRegressor training...")

        # Preprocess training data
        df_processed = self.preprocess_data(train_df, is_training=True)

        # Prepare features
        X, y = self.prepare_features(df_processed, is_training=True)

        print(f"Training data shape: {X.shape}")
        print(f"Target variable shape: {y.shape}")
        print(f"Features used: {len(self.feature_columns)}")

        # Split data for validation
        X_train, X_val, y_train, y_val = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        # Build stacking regressor
        stacking_model = self.build_stacking_regressor()

        if perform_tuning:
            print("Performing randomized hyperparameter tuning with SMAPE scoring...")

            # Define hyperparameter search space
            param_distributions = {
                'ridge__alpha': np.logspace(-2, 2, 10),
                'lasso__alpha': np.logspace(-3, 1, 10),
                'rf__n_estimators': np.arange(50, 300, 50),
                'rf__max_depth': [None, 5, 10, 15, 20],
                'gbm__n_estimators': np.arange(50, 300, 50),
                'gbm__learning_rate': np.linspace(0.01, 0.2, 10),
                'svr__C': np.logspace(-2, 2, 10),
                'svr__gamma': ['scale', 'auto']
            }

            # Perform RandomizedSearchCV with SMAPE
            random_search = RandomizedSearchCV(
                estimator=stacking_model,
                param_distributions=param_distributions,
                n_iter=20,
                scoring=smape_scorer,
                cv=3,
                verbose=2,
                random_state=42,
                n_jobs=-1
            )

            random_search.fit(X_train, y_train)

            # Best model and parameters
            self.stacking_model = random_search.best_estimator_
            self.best_params_ = random_search.best_params_

            print(f"Best parameters found: {self.best_params_}")

        else:
            print("Training without hyperparameter tuning...")
            self.stacking_model = stacking_model
            self.stacking_model.fit(X_train, y_train)

        # Evaluate the model on validation set
        y_pred = self.stacking_model.predict(X_val)

        val_smape = smape(y_val, y_pred)
        val_mae = mean_absolute_error(y_val, y_pred)
        val_mse = mean_squared_error(y_val, y_pred)
        val_r2 = r2_score(y_val, y_pred)

        print("\n=== VALIDATION PERFORMANCE ===")
        print(f"SMAPE: {val_smape:.4f}%")
        print(f"MAE: ${val_mae:.2f}")
        print(f"MSE: ${val_mse:.2f}")
        print(f"R²: {val_r2:.4f}")

        # Cross-validation SMAPE
        cv_scores = cross_val_score(
            self.stacking_model, X, y, cv=3, scoring=smape_scorer, n_jobs=-1
        )
        cv_smape_scores = -cv_scores  # Convert back to positive values

        print(f"\nCross-validation SMAPE: {cv_smape_scores.mean():.4f}% (+/- {cv_smape_scores.std() * 2:.4f}%)")

        # Retrain on full dataset
        print("\nRetraining on full training dataset...")
        self.stacking_model.fit(X, y)
        self.is_trained = True

        # FIXED: Proper feature importance for stacking regressor
        print("\n=== STACKING MODEL ANALYSIS ===")
        
        # Show base model weights in the meta-model
        if hasattr(self.stacking_model.final_estimator_, 'coef_'):
            base_model_names = [name for name, _ in self.stacking_model.estimators_]
            meta_coefficients = abs(self.stacking_model.final_estimator_.coef_)
            
            # Create a DataFrame showing base model importance in the meta-model
            meta_importance = pd.DataFrame({
                'base_model': base_model_names,
                'meta_weight': meta_coefficients
            }).sort_values('meta_weight', ascending=False)
            
            print("Base Model Weights in Meta-Model:")
            print(meta_importance)
        
        # Calculate aggregated feature importance from base models
        try:
            feature_importance = self._calculate_aggregated_feature_importance()
            print("\nTop 10 Most Important Features (Aggregated from Base Models):")
            print(feature_importance.head(10))
        except Exception as e:
            print(f"\nNote: Feature importance calculation skipped: {e}")

        return {
            'smape': val_smape,
            'mae': val_mae,
            'mse': val_mse,
            'r2': val_r2,
            'cv_smape_mean': cv_smape_scores.mean(),
            'cv_smape_std': cv_smape_scores.std()
        }
    
    def predict(self, test_df):
        """
        Predict prices for test data using the trained StackingRegressor
        """
        if not self.is_trained:
            raise ValueError("Model must be trained before making predictions")
        
        print("Making predictions on test data...")
        
        # Preprocess test data
        df_processed = self.preprocess_data(test_df, is_training=False)
        
        # Prepare features
        X_test = self.prepare_features(df_processed, is_training=False)
        
        print(f"Test data shape after preprocessing: {X_test.shape}")
        print(f"Expected features: {len(self.expected_columns)}")
        print(f"Actual features: {len(X_test.columns)}")
        
        # Make predictions
        predictions = self.stacking_model.predict(X_test)
        
        # Create results dataframe
        results = test_df.copy()
        results['predicted_price'] = predictions
        
        return results
    
    def save_model(self, filepath):
        """
        Save the trained model and preprocessors
        """
        if not self.is_trained:
            raise ValueError("No trained model to save")
        
        model_data = {
            'scaler': self.scaler,
            'imputer': self.imputer,
            'feature_columns': self.feature_columns,
            'expected_columns': self.expected_columns,
            'stacking_model': self.stacking_model,
            'best_params': self.best_params_,
            'is_trained': self.is_trained
        }
        
        joblib.dump(model_data, filepath)
        print(f"SMAPE-tuned StackingRegressor saved to {filepath}")
    
    def load_model(self, filepath):
        """
        Load a trained model and preprocessors
        """
        model_data = joblib.load(filepath)
        
        self.scaler = model_data['scaler']
        self.imputer = model_data['imputer']
        self.feature_columns = model_data['feature_columns']
        self.expected_columns = model_data['expected_columns']
        self.stacking_model = model_data['stacking_model']
        self.best_params_ = model_data['best_params']
        self.is_trained = model_data['is_trained']
        
        print(f"SMAPE-tuned StackingRegressor loaded from {filepath}")
        if self.best_params_:
            print(f"Best parameters: {self.best_params_}")

# Example usage function
def run_complete_pipeline(train_df, test_df, perform_tuning=True):
    """
    Complete pipeline for training and prediction
    """
    print("=== AMAZON PRODUCT PRICE PREDICTION PIPELINE ===")
    
    # Initialize predictor
    smape_predictor = RobustSMAPEStackingPricePredictor()
    
    try:
        # Train the model
        training_results = smape_predictor.train(train_df, perform_tuning=perform_tuning)
        
        # Make predictions
        test_predictions = smape_predictor.predict(test_df)
        
        print("\n=== PREDICTION COMPLETE ===")
        print(f"Predicted prices for {len(test_predictions)} test samples")
        print(f"Price range: ${test_predictions['predicted_price'].min():.2f} - ${test_predictions['predicted_price'].max():.2f}")
        
        return smape_predictor, training_results, test_predictions
        
    except Exception as e:
        print(f"Error in pipeline: {e}")
        import traceback
        print(f"Traceback: {traceback.format_exc()}")
        return None, None, None

In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, make_scorer
import joblib

# Define SMAPE function and scorer
def smape(actual, predicted):
    """
    Calculate Symmetric Mean Absolute Percentage Error (SMAPE)
    """
    actual = np.array(actual)
    predicted = np.array(predicted)
    
    # Handle cases where both actual and predicted are zero
    denominator = (np.abs(actual) + np.abs(predicted))
    # Add small epsilon to avoid division by zero
    denominator = np.where(denominator == 0, 1e-10, denominator)
    
    smape_value = (100 / len(actual)) * np.sum(2 * np.abs(predicted - actual) / denominator)
    return smape_value

# Create SMAPE scorer for sklearn (lower is better)
smape_scorer = make_scorer(
    lambda y_true, y_pred: smape(y_true, y_pred),
    greater_is_better=False
)

# Feature extraction functions
def extract_features_from_catalog(catalog_text):
    """
    Extract structured features from the catalog_content field
    """
    features = {
        'item_name': '',
        'pack_size': '',
        'weight': 0.0,
        'volume': 0.0,
        'count': 0.0,
        'unit': '',
        'flavor': '',
        'brand': '',
        'is_organic': 0,
        'is_gluten_free': 0,
        'is_vegan': 0,
        'is_kosher': 0,
        'is_sugar_free': 0,
        'is_low_carb': 0,
        'is_non_gmo': 0,
        'calories_per_serving': 0.0,
        'protein_content': 0.0,
        'fiber_content': 0.0
    }
    
    if pd.isna(catalog_text):
        return features
    
    text = str(catalog_text).lower()
    
    # Extract item name
    item_name_match = re.search(r'item name:\s*([^\n]+)', catalog_text, re.IGNORECASE)
    if item_name_match:
        features['item_name'] = item_name_match.group(1).strip()
    
    # Extract pack size information
    pack_patterns = [
        r'pack of\s*(\d+)',
        r'(\d+)\s*count',
        r'(\d+)\s*pack',
        r'(\d+)\s*ct'
    ]
    for pattern in pack_patterns:
        match = re.search(pattern, text)
        if match:
            features['pack_size'] = match.group(1)
            break
    
    # Extract weight
    weight_patterns = [
        r'(\d+\.?\d*)\s*oz',
        r'(\d+\.?\d*)\s*ounce',
        r'(\d+\.?\d*)\s*lb',
        r'(\d+\.?\d*)\s*pound'
    ]
    for pattern in weight_patterns:
        match = re.search(pattern, text)
        if match:
            try:
                features['weight'] = float(match.group(1))
            except:
                features['weight'] = 0.0
            break
    
    # Extract volume
    volume_patterns = [
        r'(\d+\.?\d*)\s*fl\s*oz',
        r'(\d+\.?\d*)\s*fluid\s*ounce',
        r'(\d+\.?\d*)\s*ml',
        r'(\d+\.?\d*)\s*liter'
    ]
    for pattern in volume_patterns:
        match = re.search(pattern, text)
        if match:
            try:
                features['volume'] = float(match.group(1))
            except:
                features['volume'] = 0.0
            break
    
    # Extract count
    count_patterns = [
        r'(\d+)\s*tea bags',
        r'(\d+)\s*capsules',
        r'(\d+)\s*pods',
        r'(\d+)\s*cookies'
    ]
    for pattern in count_patterns:
        match = re.search(pattern, text)
        if match:
            try:
                features['count'] = float(match.group(1))
            except:
                features['count'] = 0.0
            break
    
    # Extract unit from the structured Unit field
    unit_match = re.search(r'unit:\s*([^\n]+)', catalog_text, re.IGNORECASE)
    if unit_match:
        features['unit'] = unit_match.group(1).strip().lower()
    
    # Extract flavor information
    flavor_keywords = ['vanilla', 'chocolate', 'strawberry', 'lemon', 'mint', 'berry', 
                      'caramel', 'honey', 'spice', 'cinnamon', 'ginger', 'peach']
    for flavor in flavor_keywords:
        if flavor in text:
            features['flavor'] = flavor
            break
    
    # Extract brand names
    brand_patterns = [
        r'manufacturer:\s*([^\n]+)',
        r'brand:\s*([^\n]+)'
    ]
    for pattern in brand_patterns:
        match = re.search(pattern, catalog_text, re.IGNORECASE)
        if match:
            features['brand'] = match.group(1).strip()
            break
    
    # Health and dietary attributes
    features['is_organic'] = 1 if any(term in text for term in ['organic', 'usda organic']) else 0
    features['is_gluten_free'] = 1 if 'gluten free' in text else 0
    features['is_vegan'] = 1 if 'vegan' in text else 0
    features['is_kosher'] = 1 if 'kosher' in text else 0
    features['is_sugar_free'] = 1 if any(term in text for term in ['sugar free', 'no sugar', 'zero sugar']) else 0
    features['is_low_carb'] = 1 if any(term in text for term in ['low carb', 'keto', 'keto-friendly']) else 0
    features['is_non_gmo'] = 1 if any(term in text for term in ['non-gmo', 'non gmo']) else 0
    
    # Extract nutritional information
    calorie_match = re.search(r'(\d+)\s*calories', text)
    if calorie_match:
        try:
            features['calories_per_serving'] = float(calorie_match.group(1))
        except:
            features['calories_per_serving'] = 0.0
    
    protein_match = re.search(r'(\d+\.?\d*)\s*g\s*protein', text)
    if protein_match:
        try:
            features['protein_content'] = float(protein_match.group(1))
        except:
            features['protein_content'] = 0.0
    
    fiber_match = re.search(r'(\d+\.?\d*)\s*g\s*fiber', text)
    if fiber_match:
        try:
            features['fiber_content'] = float(fiber_match.group(1))
        except:
            features['fiber_content'] = 0.0
    
    return features

def create_derived_features(df, has_price=True):
    """
    Create derived features, handling the case where price might be missing
    """
    # Product category based on item name
    def categorize_product(item_name):
        if pd.isna(item_name):
            return 'other'
        item_name = str(item_name).lower()
        if any(word in item_name for word in ['tea', 'chai']):
            return 'tea'
        elif any(word in item_name for word in ['coffee', 'brew']):
            return 'coffee'
        elif any(word in item_name for word in ['snack', 'chip', 'cracker', 'cookie']):
            return 'snack'
        elif any(word in item_name for word in ['sauce', 'dressing', 'oil']):
            return 'condiment'
        elif any(word in item_name for word in ['spice', 'seasoning']):
            return 'spice'
        elif any(word in item_name for word in ['candy', 'chocolate']):
            return 'candy'
        elif any(word in item_name for word in ['pasta', 'rice', 'grain']):
            return 'grain'
        else:
            return 'other'
    
    df['product_category'] = df['item_name'].apply(categorize_product)
    
    # Text length features
    df['catalog_content_length'] = df['catalog_content'].str.len().fillna(0)
    
    # Only create price-related features if price is available
    if has_price and 'price' in df.columns:
        if 'weight' in df.columns:
            weight_safe = df['weight'].replace(0, np.nan)
            df['price_per_oz'] = df['price'] / weight_safe
            df['price_per_oz'] = df['price_per_oz'].replace([np.inf, -np.inf], np.nan)
        
        if 'volume' in df.columns:
            volume_safe = df['volume'].replace(0, np.nan)
            df['price_per_fl_oz'] = df['price'] / volume_safe
            df['price_per_fl_oz'] = df['price_per_fl_oz'].replace([np.inf, -np.inf], np.nan)
        
        if 'count' in df.columns:
            count_safe = df['count'].replace(0, np.nan)
            df['price_per_count'] = df['price'] / count_safe
            df['price_per_count'] = df['price_per_count'].replace([np.inf, -np.inf], np.nan)
    
    return df

class RobustSMAPEStackingPricePredictor:
    def __init__(self):
        self.scaler = StandardScaler()
        self.imputer = SimpleImputer(strategy='median')
        self.feature_columns = []
        self.expected_columns = []
        self.stacking_model = None
        self.is_trained = False
        self.best_params_ = None
        
    def preprocess_data(self, df, is_training=True):
        """
        Preprocess the data: extract features, handle missing values, encode categorical variables
        """
        print("Step 1: Extracting features from catalog content...")
        
        # Make a copy to avoid modifying original data
        df_processed = df.copy()
        
        # Extract features
        catalog_features = df_processed['catalog_content'].apply(extract_features_from_catalog)
        features_df = pd.DataFrame(catalog_features.tolist(), index=df_processed.index)
        
        # Concatenate with original data
        df_enhanced = pd.concat([df_processed, features_df], axis=1)
        
        # Ensure all expected numerical columns exist with proper defaults
        numerical_columns = ['weight', 'volume', 'count', 'calories_per_serving', 'protein_content', 'fiber_content']
        for col in numerical_columns:
            if col not in df_enhanced.columns:
                df_enhanced[col] = 0.0
            else:
                df_enhanced[col] = pd.to_numeric(df_enhanced[col], errors='coerce').fillna(0.0)
        
        # Ensure all expected boolean columns exist
        boolean_columns = ['is_organic', 'is_gluten_free', 'is_vegan', 'is_kosher', 
                          'is_sugar_free', 'is_low_carb', 'is_non_gmo']
        for col in boolean_columns:
            if col not in df_enhanced.columns:
                df_enhanced[col] = 0
            else:
                df_enhanced[col] = df_enhanced[col].astype(int)
        
        # Handle categorical columns
        categorical_columns = ['unit', 'flavor', 'brand', 'item_name']
        for col in categorical_columns:
            if col not in df_enhanced.columns:
                df_enhanced[col] = 'unknown'
            else:
                df_enhanced[col] = df_enhanced[col].fillna('unknown')
        
        # Create derived features
        has_price = is_training and 'price' in df_enhanced.columns
        df_enhanced = create_derived_features(df_enhanced, has_price=has_price)
        
        # Handle outliers in price if it exists and we're training
        if is_training and 'price' in df_enhanced.columns:
            Q1 = df_enhanced['price'].quantile(0.25)
            Q3 = df_enhanced['price'].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = max(0, Q1 - 1.5 * IQR)  # Ensure lower bound is not negative
            upper_bound = Q3 + 1.5 * IQR
            
            df_enhanced['price'] = np.where(df_enhanced['price'] > upper_bound, upper_bound, df_enhanced['price'])
            df_enhanced['price'] = np.where(df_enhanced['price'] < lower_bound, lower_bound, df_enhanced['price'])
        
        return df_enhanced
    
    def prepare_features(self, df, is_training=True):
        """
        Prepare features for modeling with robust error handling
        """
        # Define core features that should always be present
        core_features = [
            'weight', 'volume', 'count', 'catalog_content_length',
            'is_organic', 'is_gluten_free', 'is_vegan', 'is_kosher', 
            'is_sugar_free', 'is_low_carb', 'is_non_gmo'
        ]
        
        # Add price per unit features if available (only in training)
        if is_training:
            price_per_features = [col for col in df.columns if col.startswith('price_per')]
            feature_columns = core_features + price_per_features
        else:
            feature_columns = core_features.copy()
        
        # Ensure all feature columns exist in the dataframe
        missing_features = set(feature_columns) - set(df.columns)
        for feature in missing_features:
            df[feature] = 0.0  # Add missing features with default value
        
        # Select only the feature columns we want
        feature_columns = [col for col in feature_columns if col in df.columns]
        X = df[feature_columns].copy()
        
        # Replace infinite values with NaN
        X = X.replace([np.inf, -np.inf], np.nan)
        
        if is_training:
            self.feature_columns = feature_columns
            self.expected_columns = feature_columns.copy()
            
            # Fit imputer and scaler on training data
            X_imputed = self.imputer.fit_transform(X)
            
            # Scale numerical features
            numerical_cols = [col for col in X.columns if X[col].dtype in ['float64', 'float32', 'int64']]
            if numerical_cols:
                X_imputed = self.scaler.fit_transform(X_imputed)
            
            X_processed = pd.DataFrame(X_imputed, columns=feature_columns, index=X.index)
            
            if 'price' in df.columns:
                y = df['price'].copy()
                return X_processed, y
            else:
                return X_processed
        else:
            # Ensure we have the same columns as training
            missing_cols = set(self.expected_columns) - set(X.columns)
            extra_cols = set(X.columns) - set(self.expected_columns)
            
            for col in missing_cols:
                X[col] = 0.0
            for col in extra_cols:
                X = X.drop(col, axis=1)
            
            # Reorder columns to match training
            X = X[self.expected_columns]
            
            # Transform using fitted imputer and scaler
            X_imputed = self.imputer.transform(X)
            X_imputed = self.scaler.transform(X_imputed)
            
            X_processed = pd.DataFrame(X_imputed, columns=self.expected_columns, index=X.index)
            return X_processed
    
    def build_stacking_regressor(self):
        """
        Build a StackingRegressor with multiple base models
        """
        # Base models with robust parameters
        base_models = [
            ('ridge', Ridge(alpha=1.0, random_state=42)),
            ('lasso', Lasso(alpha=0.1, random_state=42)),
            ('rf', RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)),
            ('gbm', GradientBoostingRegressor(n_estimators=100, random_state=42)),
            ('svr', SVR(kernel='rbf', C=1.0))
        ]
        
        # Meta-model
        meta_model = LinearRegression()
        
        # Create StackingRegressor
        stacking_regressor = StackingRegressor(
            estimators=base_models,
            final_estimator=meta_model,
            cv=5,
            passthrough=False,
            n_jobs=-1
        )
        
        return stacking_regressor
    
    def _calculate_aggregated_feature_importance(self):
        """
        Calculate feature importance by aggregating importances from all base models
        """
        # Initialize importance array
        aggregated_importance = np.zeros(len(self.feature_columns))
        
        # Get importances from each base model
        for name, model in self.stacking_model.estimators_:
            try:
                if hasattr(model, 'feature_importances_'):
                    # Tree-based models (RandomForest, GradientBoosting)
                    importance = model.feature_importances_
                elif hasattr(model, 'coef_'):
                    # Linear models (Ridge, Lasso)
                    importance = abs(model.coef_)
                else:
                    # Models without feature importance (SVR) - skip
                    continue
                
                # Ensure the importance array matches our feature dimensions
                if len(importance) == len(self.feature_columns):
                    aggregated_importance += importance
                    
            except Exception as e:
                print(f"Warning: Could not get importance from {name}: {e}")
                continue
        
        # Normalize the importance scores
        if aggregated_importance.sum() > 0:
            aggregated_importance = aggregated_importance / aggregated_importance.sum()
        
        # Create DataFrame
        feature_importance_df = pd.DataFrame({
            'feature': self.feature_columns,
            'importance': aggregated_importance
        }).sort_values('importance', ascending=False)
        
        return feature_importance_df
    
    def train(self, train_df, perform_tuning=True):
        print("Starting SMAPE-tuned StackingRegressor training...")

        # Preprocess training data
        df_processed = self.preprocess_data(train_df, is_training=True)

        # Prepare features
        X, y = self.prepare_features(df_processed, is_training=True)

        print(f"Training data shape: {X.shape}")
        print(f"Target variable shape: {y.shape}")
        print(f"Features used: {len(self.feature_columns)}")

        # Split data for validation
        X_train, X_val, y_train, y_val = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        # Build stacking regressor
        stacking_model = self.build_stacking_regressor()

        if perform_tuning:
            print("Performing randomized hyperparameter tuning with SMAPE scoring...")

            # Define hyperparameter search space
            param_distributions = {
                'ridge__alpha': np.logspace(-2, 2, 10),
                'lasso__alpha': np.logspace(-3, 1, 10),
                'rf__n_estimators': np.arange(50, 300, 50),
                'rf__max_depth': [None, 5, 10, 15, 20],
                'gbm__n_estimators': np.arange(50, 300, 50),
                'gbm__learning_rate': np.linspace(0.01, 0.2, 10),
                'svr__C': np.logspace(-2, 2, 10),
                'svr__gamma': ['scale', 'auto']
            }

            # Perform RandomizedSearchCV with SMAPE
            random_search = RandomizedSearchCV(
                estimator=stacking_model,
                param_distributions=param_distributions,
                n_iter=20,
                scoring=smape_scorer,
                cv=3,
                verbose=2,
                random_state=42,
                n_jobs=-1
            )

            random_search.fit(X_train, y_train)

            # Best model and parameters
            self.stacking_model = random_search.best_estimator_
            self.best_params_ = random_search.best_params_

            print(f"Best parameters found: {self.best_params_}")

        else:
            print("Training without hyperparameter tuning...")
            self.stacking_model = stacking_model
            self.stacking_model.fit(X_train, y_train)

        # Evaluate the model on validation set
        y_pred = self.stacking_model.predict(X_val)

        val_smape = smape(y_val, y_pred)
        val_mae = mean_absolute_error(y_val, y_pred)
        val_mse = mean_squared_error(y_val, y_pred)
        val_r2 = r2_score(y_val, y_pred)

        print("\n=== VALIDATION PERFORMANCE ===")
        print(f"SMAPE: {val_smape:.4f}%")
        print(f"MAE: ${val_mae:.2f}")
        print(f"MSE: ${val_mse:.2f}")
        print(f"R²: {val_r2:.4f}")

        # Cross-validation SMAPE
        cv_scores = cross_val_score(
            self.stacking_model, X, y, cv=3, scoring=smape_scorer, n_jobs=-1
        )
        cv_smape_scores = -cv_scores  # Convert back to positive values

        print(f"\nCross-validation SMAPE: {cv_smape_scores.mean():.4f}% (+/- {cv_smape_scores.std() * 2:.4f}%)")

        # Retrain on full dataset
        print("\nRetraining on full training dataset...")
        self.stacking_model.fit(X, y)
        self.is_trained = True

        # FIXED: Proper feature importance for stacking regressor
        print("\n=== STACKING MODEL ANALYSIS ===")
        
        # Show base model weights in the meta-model
        if hasattr(self.stacking_model.final_estimator_, 'coef_'):
            base_model_names = [name for name, _ in self.stacking_model.estimators_]
            meta_coefficients = abs(self.stacking_model.final_estimator_.coef_)
            
            # Create a DataFrame showing base model importance in the meta-model
            meta_importance = pd.DataFrame({
                'base_model': base_model_names,
                'meta_weight': meta_coefficients
            }).sort_values('meta_weight', ascending=False)
            
            print("Base Model Weights in Meta-Model:")
            print(meta_importance)
        
        # Calculate aggregated feature importance from base models
        try:
            feature_importance = self._calculate_aggregated_feature_importance()
            print("\nTop 10 Most Important Features (Aggregated from Base Models):")
            print(feature_importance.head(10))
        except Exception as e:
            print(f"\nNote: Feature importance calculation skipped: {e}")

        return {
            'smape': val_smape,
            'mae': val_mae,
            'mse': val_mse,
            'r2': val_r2,
            'cv_smape_mean': cv_smape_scores.mean(),
            'cv_smape_std': cv_smape_scores.std()
        }
    
    def predict(self, test_df):
        """
        Predict prices for test data using the trained StackingRegressor
        """
        if not self.is_trained:
            raise ValueError("Model must be trained before making predictions")
        
        print("Making predictions on test data...")
        
        # Preprocess test data
        df_processed = self.preprocess_data(test_df, is_training=False)
        
        # Prepare features
        X_test = self.prepare_features(df_processed, is_training=False)
        
        print(f"Test data shape after preprocessing: {X_test.shape}")
        print(f"Expected features: {len(self.expected_columns)}")
        print(f"Actual features: {len(X_test.columns)}")
        
        # Make predictions
        predictions = self.stacking_model.predict(X_test)
        
        # Create results dataframe
        results = test_df.copy()
        results['predicted_price'] = predictions
        
        return results
    
    def save_model(self, filepath):
        """
        Save the trained model and preprocessors
        """
        if not self.is_trained:
            raise ValueError("No trained model to save")
        
        model_data = {
            'scaler': self.scaler,
            'imputer': self.imputer,
            'feature_columns': self.feature_columns,
            'expected_columns': self.expected_columns,
            'stacking_model': self.stacking_model,
            'best_params': self.best_params_,
            'is_trained': self.is_trained
        }
        
        joblib.dump(model_data, filepath)
        print(f"SMAPE-tuned StackingRegressor saved to {filepath}")
    
    def load_model(self, filepath):
        """
        Load a trained model and preprocessors
        """
        model_data = joblib.load(filepath)
        
        self.scaler = model_data['scaler']
        self.imputer = model_data['imputer']
        self.feature_columns = model_data['feature_columns']
        self.expected_columns = model_data['expected_columns']
        self.stacking_model = model_data['stacking_model']
        self.best_params_ = model_data['best_params']
        self.is_trained = model_data['is_trained']
        
        print(f"SMAPE-tuned StackingRegressor loaded from {filepath}")
        if self.best_params_:
            print(f"Best parameters: {self.best_params_}")

# Usage example:
def run_complete_pipeline(train_df, test_df, perform_tuning=True):
    """
    Complete pipeline for training and prediction
    """
    print("=== AMAZON PRODUCT PRICE PREDICTION PIPELINE ===")
    
    # Initialize predictor
    smape_predictor = RobustSMAPEStackingPricePredictor()
    
    try:
        # Train the model
        training_results = smape_predictor.train(train_df, perform_tuning=perform_tuning)
        
        # Make predictions
        test_predictions = smape_predictor.predict(test_df)
        
        print("\n=== PREDICTION COMPLETE ===")
        print(f"Predicted prices for {len(test_predictions)} test samples")
        print(f"Price range: ${test_predictions['predicted_price'].min():.2f} - ${test_predictions['predicted_price'].max():.2f}")
        
        return smape_predictor, training_results, test_predictions
        
    except Exception as e:
        print(f"Error in pipeline: {e}")
        import traceback
        print(f"Traceback: {traceback.format_exc()}")
        return None, None, None

ModuleNotFoundError: No module named 'your_utils'

In [11]:
# MAIN EXECUTION WITH ERROR HANDLING

def run_complete_pipeline(train_file='../data/train.csv', test_file='../data/test.csv'):
    """
    Complete pipeline with robust error handling
    """
    try:
        print("=== AMAZON PRODUCT PRICE PREDICTION PIPELINE ===\n")
        
        # Step 1: Load training data (with prices)
        print("1. Loading training data...")
        train_df = pd.read_csv(train_file)
        
        print(f"Training data shape: {train_df.shape}")
        print("Training columns:", train_df.columns.tolist())
        
        if 'price' not in train_df.columns:
            raise ValueError("Training data must contain 'price' column")
        
        missing_prices = train_df['price'].isnull().sum()
        print(f"Available prices in training data: {len(train_df) - missing_prices}")
        print(f"Missing prices in training data: {missing_prices}")
        
        # Step 2: Load test data (without prices)
        print("\n2. Loading test data...")
        test_df = pd.read_csv(test_file)
        
        print(f"Test data shape: {test_df.shape}")
        print("Test columns:", test_df.columns.tolist())
        
        if 'price' in test_df.columns:
            if test_df['price'].isnull().all():
                print("Test data contains 'price' column with all null values - will ignore for predictions")
                test_df = test_df.drop('price', axis=1)
            else:
                print("Warning: Test data contains non-null 'price' column")
        
        # Step 3: Initialize and train the model
        print("\n3. Training SMAPE-tuned StackingRegressor...")
        smape_predictor = RobustSMAPEStackingPricePredictor()
        
        # Train with hyperparameter tuning
        training_results = smape_predictor.train(train_df, perform_tuning=True)
        
        # Step 4: Save the trained model
        print("\n4. Saving trained model...")
        smape_predictor.save_model('../Model/robust_smape_stacking_predictor.pkl')
        
        # Step 5: Make predictions on test data
        print("\n5. Making predictions on test data...")
        test_predictions = smape_predictor.predict(test_df)
        
        print("Prediction completed!")
        print(f"Predicted prices for {len(test_predictions)} test products")
        
        # Step 6: Display prediction results
        print("\n6. Prediction results:")
        print(test_predictions[['sample_id', 'predicted_price']].head(10))
        
        print("\nTest Prediction Statistics:")
        print(test_predictions['predicted_price'].describe())
        
        # Step 7: Save results
        print("\n7. Saving results...")
        test_predictions[['sample_id', 'predicted_price']].to_csv('../Model/final_test_predictions.csv', index=False)
        print("Predictions saved to 'final_test_predictions.csv'")
        
        return smape_predictor, test_predictions, training_results
        
    except Exception as e:
        print(f"Error in pipeline: {str(e)}")
        print("Traceback:", traceback.format_exc())
        return None, None, None

# Run the complete pipeline
predictor, predictions, results = run_complete_pipeline('../data/train.csv', '../data/test.csv')

=== AMAZON PRODUCT PRICE PREDICTION PIPELINE ===

1. Loading training data...
Training data shape: (50000, 4)
Training columns: ['sample_id', 'catalog_content', 'image_link', 'price']
Available prices in training data: 50000
Missing prices in training data: 0

2. Loading test data...
Test data shape: (25000, 3)
Test columns: ['sample_id', 'catalog_content', 'image_link']

3. Training SMAPE-tuned StackingRegressor...
Starting SMAPE-tuned StackingRegressor training...
Step 1: Extracting features from catalog content...
Training data shape: (50000, 14)
Target variable shape: (50000,)
Features used: 14
Performing randomized hyperparameter tuning with SMAPE scoring...
Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best parameters found: {'svr__gamma': 'auto', 'svr__C': np.float64(0.0774263682681127), 'ridge__alpha': np.float64(35.93813663804626), 'rf__n_estimators': np.int64(100), 'rf__max_depth': 20, 'lasso__alpha': np.float64(0.1668100537200059), 'gbm__n_estimators': np.int6

In [None]:
# VISUALIZATION AND ANALYSIS

if predictions is not None:
    # Enhanced visualization for test predictions
    plt.figure(figsize=(20, 6))
    
    # 1. Distribution of predicted prices in test set
    plt.subplot(1, 4, 1)
    sns.histplot(predictions['predicted_price'], bins=50, kde=True)
    plt.title('Distribution of Predicted Prices\n(Test Set)')
    plt.xlabel('Predicted Price ($)')
    
    # 2. Price by product category in test set
    plt.subplot(1, 4, 2)
    if 'product_category' in predictions.columns:
        category_prices = predictions.groupby('product_category')['predicted_price'].mean().sort_values(ascending=False)
        sns.barplot(y=category_prices.index, x=category_prices.values)
        plt.title('Average Predicted Price by Category\n(Test Set)')
        plt.xlabel('Average Price ($)')
    else:
        plt.text(0.5, 0.5, 'Product Category\nNot Available', 
                horizontalalignment='center', verticalalignment='center',
                transform=plt.gca().transAxes)
        plt.title('Product Category Analysis')
    
    # 3. Feature importance
    plt.subplot(1, 4, 3)
    if hasattr(predictor.stacking_model.final_estimator_, 'coef_'):
        feature_importance = pd.DataFrame({
            'feature': predictor.feature_columns,
            'importance': abs(predictor.stacking_model.final_estimator_.coef_)
        }).sort_values('importance', ascending=False).head(10)
        
        sns.barplot(data=feature_importance, y='feature', x='importance')
        plt.title('Top 10 Feature Importances')
        plt.xlabel('Importance')
    else:
        plt.text(0.5, 0.5, 'Feature Importance\nNot Available', 
                horizontalalignment='center', verticalalignment='center',
                transform=plt.gca().transAxes)
        plt.title('Feature Importance')
    
    # 4. SMAPE performance
    plt.subplot(1, 4, 4)
    metrics = ['SMAPE', 'MAE', 'R²']
    values = [results['smape'], results['mae'], results['r2']]
    
    # Normalize for visualization
    normalized_values = [results['smape']/100, results['mae']/max(values[1], 1), results['r2']]
    
    bars = plt.bar(metrics, normalized_values)
    plt.title('Model Performance Metrics\n(Normalized)')
    plt.xticks(rotation=45)
    
    # Add value labels
    for i, (metric, value) in enumerate(zip(metrics, values)):
        if metric == 'SMAPE':
            plt.text(i, normalized_values[i] + 0.02, f'{value:.2f}%', ha='center')
        elif metric == 'MAE':
            plt.text(i, normalized_values[i] + 0.02, f'${value:.2f}', ha='center')
        else:
            plt.text(i, normalized_values[i] + 0.02, f'{value:.4f}', ha='center')
    
    plt.tight_layout()
    plt.show()
    
    # Print detailed results
    print("\n=== DETAILED RESULTS ===")
    print(f"Best Model: {type(predictor.stacking_model).__name__}")
    print(f"Validation SMAPE: {results['smape']:.4f}%")
    print(f"Validation MAE: ${results['mae']:.2f}")
    print(f"Validation R²: {results['r2']:.4f}")
    print(f"Cross-validation SMAPE: {results['cv_smape_mean']:.4f}% ± {results['cv_smape_std']:.4f}%")
    
    if predictor.best_params_:
        print(f"\nBest Hyperparameters: {predictor.best_params_}")
    
    print(f"\nFeatures used: {len(predictor.feature_columns)}")
    print("All features:", predictor.feature_columns)

In [15]:
import pandas as pd
import numpy as np
import traceback
import os
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, make_scorer
import joblib

# ========== UTILITY FUNCTIONS ==========

def smape(actual, predicted):
    """
    Calculate Symmetric Mean Absolute Percentage Error (SMAPE)
    """
    actual = np.array(actual)
    predicted = np.array(predicted)
    
    # Handle cases where both actual and predicted are zero
    denominator = (np.abs(actual) + np.abs(predicted))
    # Add small epsilon to avoid division by zero
    denominator = np.where(denominator == 0, 1e-10, denominator)
    
    smape_value = (100 / len(actual)) * np.sum(2 * np.abs(predicted - actual) / denominator)
    return smape_value

# Create SMAPE scorer for sklearn (lower is better)
smape_scorer = make_scorer(
    lambda y_true, y_pred: smape(y_true, y_pred),
    greater_is_better=False
)

def extract_features_from_catalog(catalog_text):
    """
    Extract structured features from the catalog_content field
    """
    features = {
        'item_name': '',
        'pack_size': '',
        'weight': 0.0,
        'volume': 0.0,
        'count': 0.0,
        'unit': '',
        'flavor': '',
        'brand': '',
        'is_organic': 0,
        'is_gluten_free': 0,
        'is_vegan': 0,
        'is_kosher': 0,
        'is_sugar_free': 0,
        'is_low_carb': 0,
        'is_non_gmo': 0,
        'calories_per_serving': 0.0,
        'protein_content': 0.0,
        'fiber_content': 0.0
    }
    
    if pd.isna(catalog_text):
        return features
    
    text = str(catalog_text).lower()
    
    # Extract item name
    item_name_match = re.search(r'item name:\s*([^\n]+)', catalog_text, re.IGNORECASE)
    if item_name_match:
        features['item_name'] = item_name_match.group(1).strip()
    
    # Extract pack size information
    pack_patterns = [
        r'pack of\s*(\d+)',
        r'(\d+)\s*count',
        r'(\d+)\s*pack',
        r'(\d+)\s*ct'
    ]
    for pattern in pack_patterns:
        match = re.search(pattern, text)
        if match:
            features['pack_size'] = match.group(1)
            break
    
    # Extract weight
    weight_patterns = [
        r'(\d+\.?\d*)\s*oz',
        r'(\d+\.?\d*)\s*ounce',
        r'(\d+\.?\d*)\s*lb',
        r'(\d+\.?\d*)\s*pound'
    ]
    for pattern in weight_patterns:
        match = re.search(pattern, text)
        if match:
            try:
                features['weight'] = float(match.group(1))
            except:
                features['weight'] = 0.0
            break
    
    # Extract volume
    volume_patterns = [
        r'(\d+\.?\d*)\s*fl\s*oz',
        r'(\d+\.?\d*)\s*fluid\s*ounce',
        r'(\d+\.?\d*)\s*ml',
        r'(\d+\.?\d*)\s*liter'
    ]
    for pattern in volume_patterns:
        match = re.search(pattern, text)
        if match:
            try:
                features['volume'] = float(match.group(1))
            except:
                features['volume'] = 0.0
            break
    
    # Extract count
    count_patterns = [
        r'(\d+)\s*tea bags',
        r'(\d+)\s*capsules',
        r'(\d+)\s*pods',
        r'(\d+)\s*cookies'
    ]
    for pattern in count_patterns:
        match = re.search(pattern, text)
        if match:
            try:
                features['count'] = float(match.group(1))
            except:
                features['count'] = 0.0
            break
    
    # Extract unit from the structured Unit field
    unit_match = re.search(r'unit:\s*([^\n]+)', catalog_text, re.IGNORECASE)
    if unit_match:
        features['unit'] = unit_match.group(1).strip().lower()
    
    # Extract flavor information
    flavor_keywords = ['vanilla', 'chocolate', 'strawberry', 'lemon', 'mint', 'berry', 
                      'caramel', 'honey', 'spice', 'cinnamon', 'ginger', 'peach']
    for flavor in flavor_keywords:
        if flavor in text:
            features['flavor'] = flavor
            break
    
    # Extract brand names
    brand_patterns = [
        r'manufacturer:\s*([^\n]+)',
        r'brand:\s*([^\n]+)'
    ]
    for pattern in brand_patterns:
        match = re.search(pattern, catalog_text, re.IGNORECASE)
        if match:
            features['brand'] = match.group(1).strip()
            break
    
    # Health and dietary attributes
    features['is_organic'] = 1 if any(term in text for term in ['organic', 'usda organic']) else 0
    features['is_gluten_free'] = 1 if 'gluten free' in text else 0
    features['is_vegan'] = 1 if 'vegan' in text else 0
    features['is_kosher'] = 1 if 'kosher' in text else 0
    features['is_sugar_free'] = 1 if any(term in text for term in ['sugar free', 'no sugar', 'zero sugar']) else 0
    features['is_low_carb'] = 1 if any(term in text for term in ['low carb', 'keto', 'keto-friendly']) else 0
    features['is_non_gmo'] = 1 if any(term in text for term in ['non-gmo', 'non gmo']) else 0
    
    # Extract nutritional information
    calorie_match = re.search(r'(\d+)\s*calories', text)
    if calorie_match:
        try:
            features['calories_per_serving'] = float(calorie_match.group(1))
        except:
            features['calories_per_serving'] = 0.0
    
    protein_match = re.search(r'(\d+\.?\d*)\s*g\s*protein', text)
    if protein_match:
        try:
            features['protein_content'] = float(protein_match.group(1))
        except:
            features['protein_content'] = 0.0
    
    fiber_match = re.search(r'(\d+\.?\d*)\s*g\s*fiber', text)
    if fiber_match:
        try:
            features['fiber_content'] = float(fiber_match.group(1))
        except:
            features['fiber_content'] = 0.0
    
    return features

def create_derived_features(df, has_price=True):
    """
    Create derived features, handling the case where price might be missing
    """
    # Product category based on item name
    def categorize_product(item_name):
        if pd.isna(item_name):
            return 'other'
        item_name = str(item_name).lower()
        if any(word in item_name for word in ['tea', 'chai']):
            return 'tea'
        elif any(word in item_name for word in ['coffee', 'brew']):
            return 'coffee'
        elif any(word in item_name for word in ['snack', 'chip', 'cracker', 'cookie']):
            return 'snack'
        elif any(word in item_name for word in ['sauce', 'dressing', 'oil']):
            return 'condiment'
        elif any(word in item_name for word in ['spice', 'seasoning']):
            return 'spice'
        elif any(word in item_name for word in ['candy', 'chocolate']):
            return 'candy'
        elif any(word in item_name for word in ['pasta', 'rice', 'grain']):
            return 'grain'
        else:
            return 'other'
    
    df['product_category'] = df['item_name'].apply(categorize_product)
    
    # Text length features
    df['catalog_content_length'] = df['catalog_content'].str.len().fillna(0)
    
    # Only create price-related features if price is available
    if has_price and 'price' in df.columns:
        if 'weight' in df.columns:
            weight_safe = df['weight'].replace(0, np.nan)
            df['price_per_oz'] = df['price'] / weight_safe
            df['price_per_oz'] = df['price_per_oz'].replace([np.inf, -np.inf], np.nan)
        
        if 'volume' in df.columns:
            volume_safe = df['volume'].replace(0, np.nan)
            df['price_per_fl_oz'] = df['price'] / volume_safe
            df['price_per_fl_oz'] = df['price_per_fl_oz'].replace([np.inf, -np.inf], np.nan)
        
        if 'count' in df.columns:
            count_safe = df['count'].replace(0, np.nan)
            df['price_per_count'] = df['price'] / count_safe
            df['price_per_count'] = df['price_per_count'].replace([np.inf, -np.inf], np.nan)
    
    return df

# ========== MAIN PREDICTOR CLASS ==========

class RobustSMAPEStackingPricePredictor:
    def __init__(self):
        self.scaler = StandardScaler()
        self.imputer = SimpleImputer(strategy='median')
        self.feature_columns = []
        self.expected_columns = []
        self.stacking_model = None
        self.is_trained = False
        self.best_params_ = None
        
    def preprocess_data(self, df, is_training=True):
        """
        Preprocess the data: extract features, handle missing values, encode categorical variables
        """
        print("Step 1: Extracting features from catalog content...")
        
        # Make a copy to avoid modifying original data
        df_processed = df.copy()
        
        # Extract features
        catalog_features = df_processed['catalog_content'].apply(extract_features_from_catalog)
        features_df = pd.DataFrame(catalog_features.tolist(), index=df_processed.index)
        
        # Concatenate with original data
        df_enhanced = pd.concat([df_processed, features_df], axis=1)
        
        # Ensure all expected numerical columns exist with proper defaults
        numerical_columns = ['weight', 'volume', 'count', 'calories_per_serving', 'protein_content', 'fiber_content']
        for col in numerical_columns:
            if col not in df_enhanced.columns:
                df_enhanced[col] = 0.0
            else:
                df_enhanced[col] = pd.to_numeric(df_enhanced[col], errors='coerce').fillna(0.0)
        
        # Ensure all expected boolean columns exist
        boolean_columns = ['is_organic', 'is_gluten_free', 'is_vegan', 'is_kosher', 
                          'is_sugar_free', 'is_low_carb', 'is_non_gmo']
        for col in boolean_columns:
            if col not in df_enhanced.columns:
                df_enhanced[col] = 0
            else:
                df_enhanced[col] = df_enhanced[col].astype(int)
        
        # Handle categorical columns
        categorical_columns = ['unit', 'flavor', 'brand', 'item_name']
        for col in categorical_columns:
            if col not in df_enhanced.columns:
                df_enhanced[col] = 'unknown'
            else:
                df_enhanced[col] = df_enhanced[col].fillna('unknown')
        
        # Create derived features
        has_price = is_training and 'price' in df_enhanced.columns
        df_enhanced = create_derived_features(df_enhanced, has_price=has_price)
        
        # Handle outliers in price if it exists and we're training
        if is_training and 'price' in df_enhanced.columns:
            Q1 = df_enhanced['price'].quantile(0.25)
            Q3 = df_enhanced['price'].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = max(0, Q1 - 1.5 * IQR)  # Ensure lower bound is not negative
            upper_bound = Q3 + 1.5 * IQR
            
            df_enhanced['price'] = np.where(df_enhanced['price'] > upper_bound, upper_bound, df_enhanced['price'])
            df_enhanced['price'] = np.where(df_enhanced['price'] < lower_bound, lower_bound, df_enhanced['price'])
        
        return df_enhanced
    
    def prepare_features(self, df, is_training=True):
        """
        Prepare features for modeling with robust error handling
        """
        # Define core features that should always be present
        core_features = [
            'weight', 'volume', 'count', 'catalog_content_length',
            'is_organic', 'is_gluten_free', 'is_vegan', 'is_kosher', 
            'is_sugar_free', 'is_low_carb', 'is_non_gmo'
        ]
        
        # Add price per unit features if available (only in training)
        if is_training:
            price_per_features = [col for col in df.columns if col.startswith('price_per')]
            feature_columns = core_features + price_per_features
        else:
            feature_columns = core_features.copy()
        
        # Ensure all feature columns exist in the dataframe
        missing_features = set(feature_columns) - set(df.columns)
        for feature in missing_features:
            df[feature] = 0.0  # Add missing features with default value
        
        # Select only the feature columns we want
        feature_columns = [col for col in feature_columns if col in df.columns]
        X = df[feature_columns].copy()
        
        # Replace infinite values with NaN
        X = X.replace([np.inf, -np.inf], np.nan)
        
        if is_training:
            self.feature_columns = feature_columns
            self.expected_columns = feature_columns.copy()
            
            # Fit imputer and scaler on training data
            X_imputed = self.imputer.fit_transform(X)
            
            # Scale numerical features
            numerical_cols = [col for col in X.columns if X[col].dtype in ['float64', 'float32', 'int64']]
            if numerical_cols:
                X_imputed = self.scaler.fit_transform(X_imputed)
            
            X_processed = pd.DataFrame(X_imputed, columns=feature_columns, index=X.index)
            
            if 'price' in df.columns:
                y = df['price'].copy()
                return X_processed, y
            else:
                return X_processed
        else:
            # Ensure we have the same columns as training
            missing_cols = set(self.expected_columns) - set(X.columns)
            extra_cols = set(X.columns) - set(self.expected_columns)
            
            for col in missing_cols:
                X[col] = 0.0
            for col in extra_cols:
                X = X.drop(col, axis=1)
            
            # Reorder columns to match training
            X = X[self.expected_columns]
            
            # Transform using fitted imputer and scaler
            X_imputed = self.imputer.transform(X)
            X_imputed = self.scaler.transform(X_imputed)
            
            X_processed = pd.DataFrame(X_imputed, columns=self.expected_columns, index=X.index)
            return X_processed
    
    def build_stacking_regressor(self):
        """
        Build a StackingRegressor with multiple base models
        """
        # Base models with robust parameters
        base_models = [
            ('ridge', Ridge(alpha=1.0, random_state=42)),
            ('lasso', Lasso(alpha=0.1, random_state=42)),
            ('rf', RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)),
            ('gbm', GradientBoostingRegressor(n_estimators=100, random_state=42)),
            ('svr', SVR(kernel='rbf', C=1.0))
        ]
        
        # Meta-model
        meta_model = LinearRegression()
        
        # Create StackingRegressor
        stacking_regressor = StackingRegressor(
            estimators=base_models,
            final_estimator=meta_model,
            cv=5,
            passthrough=False,
            n_jobs=-1
        )
        
        return stacking_regressor
    
    def _calculate_aggregated_feature_importance(self):
        """
        Calculate feature importance by aggregating importances from all base models
        """
        # Initialize importance array
        aggregated_importance = np.zeros(len(self.feature_columns))
        
        # Get importances from each base model
        for name, model in self.stacking_model.estimators_:
            try:
                if hasattr(model, 'feature_importances_'):
                    # Tree-based models (RandomForest, GradientBoosting)
                    importance = model.feature_importances_
                elif hasattr(model, 'coef_'):
                    # Linear models (Ridge, Lasso)
                    importance = abs(model.coef_)
                else:
                    # Models without feature importance (SVR) - skip
                    continue
                
                # Ensure the importance array matches our feature dimensions
                if len(importance) == len(self.feature_columns):
                    aggregated_importance += importance
                    
            except Exception as e:
                print(f"Warning: Could not get importance from {name}: {e}")
                continue
        
        # Normalize the importance scores
        if aggregated_importance.sum() > 0:
            aggregated_importance = aggregated_importance / aggregated_importance.sum()
        
        # Create DataFrame
        feature_importance_df = pd.DataFrame({
            'feature': self.feature_columns,
            'importance': aggregated_importance
        }).sort_values('importance', ascending=False)
        
        return feature_importance_df
    
    def train(self, train_df, perform_tuning=True):
        print("Starting SMAPE-tuned StackingRegressor training...")

        # Preprocess training data
        df_processed = self.preprocess_data(train_df, is_training=True)

        # Prepare features
        X, y = self.prepare_features(df_processed, is_training=True)

        print(f"Training data shape: {X.shape}")
        print(f"Target variable shape: {y.shape}")
        print(f"Features used: {len(self.feature_columns)}")

        # Split data for validation
        X_train, X_val, y_train, y_val = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        # Build stacking regressor
        stacking_model = self.build_stacking_regressor()

        if perform_tuning:
            print("Performing randomized hyperparameter tuning with SMAPE scoring...")

            # Define hyperparameter search space
            param_distributions = {
                'ridge__alpha': np.logspace(-2, 2, 10),
                'lasso__alpha': np.logspace(-3, 1, 10),
                'rf__n_estimators': np.arange(50, 300, 50),
                'rf__max_depth': [None, 5, 10, 15, 20],
                'gbm__n_estimators': np.arange(50, 300, 50),
                'gbm__learning_rate': np.linspace(0.01, 0.2, 10),
                'svr__C': np.logspace(-2, 2, 10),
                'svr__gamma': ['scale', 'auto']
            }

            # Perform RandomizedSearchCV with SMAPE
            random_search = RandomizedSearchCV(
                estimator=stacking_model,
                param_distributions=param_distributions,
                n_iter=20,
                scoring=smape_scorer,
                cv=3,
                verbose=2,
                random_state=42,
                n_jobs=-1
            )

            random_search.fit(X_train, y_train)

            # Best model and parameters
            self.stacking_model = random_search.best_estimator_
            self.best_params_ = random_search.best_params_

            print(f"Best parameters found: {self.best_params_}")

        else:
            print("Training without hyperparameter tuning...")
            self.stacking_model = stacking_model
            self.stacking_model.fit(X_train, y_train)

        # Evaluate the model on validation set
        y_pred = self.stacking_model.predict(X_val)

        val_smape = smape(y_val, y_pred)
        val_mae = mean_absolute_error(y_val, y_pred)
        val_mse = mean_squared_error(y_val, y_pred)
        val_r2 = r2_score(y_val, y_pred)

        print("\n=== VALIDATION PERFORMANCE ===")
        print(f"SMAPE: {val_smape:.4f}%")
        print(f"MAE: ${val_mae:.2f}")
        print(f"MSE: ${val_mse:.2f}")
        print(f"R²: {val_r2:.4f}")

        # Cross-validation SMAPE
        cv_scores = cross_val_score(
            self.stacking_model, X, y, cv=3, scoring=smape_scorer, n_jobs=-1
        )
        cv_smape_scores = -cv_scores  # Convert back to positive values

        print(f"\nCross-validation SMAPE: {cv_smape_scores.mean():.4f}% (+/- {cv_smape_scores.std() * 2:.4f}%)")

        # Retrain on full dataset
        print("\nRetraining on full training dataset...")
        self.stacking_model.fit(X, y)
        self.is_trained = True

        # FIXED: Proper feature importance for stacking regressor
        print("\n=== STACKING MODEL ANALYSIS ===")
        
        # Show base model weights in the meta-model
        if hasattr(self.stacking_model.final_estimator_, 'coef_'):
            base_model_names = [name for name, _ in self.stacking_model.estimators_]
            meta_coefficients = abs(self.stacking_model.final_estimator_.coef_)
            
            # Create a DataFrame showing base model importance in the meta-model
            meta_importance = pd.DataFrame({
                'base_model': base_model_names,
                'meta_weight': meta_coefficients
            }).sort_values('meta_weight', ascending=False)
            
            print("Base Model Weights in Meta-Model:")
            print(meta_importance)
        
        # Calculate aggregated feature importance from base models
        try:
            feature_importance = self._calculate_aggregated_feature_importance()
            print("\nTop 10 Most Important Features (Aggregated from Base Models):")
            print(feature_importance.head(10))
        except Exception as e:
            print(f"\nNote: Feature importance calculation skipped: {e}")

        return {
            'smape': val_smape,
            'mae': val_mae,
            'mse': val_mse,
            'r2': val_r2,
            'cv_smape_mean': cv_smape_scores.mean(),
            'cv_smape_std': cv_smape_scores.std()
        }
    
    def predict(self, test_df):
        """
        Predict prices for test data using the trained StackingRegressor
        """
        if not self.is_trained:
            raise ValueError("Model must be trained before making predictions")
        
        print("Making predictions on test data...")
        
        # Preprocess test data
        df_processed = self.preprocess_data(test_df, is_training=False)
        
        # Prepare features
        X_test = self.prepare_features(df_processed, is_training=False)
        
        print(f"Test data shape after preprocessing: {X_test.shape}")
        print(f"Expected features: {len(self.expected_columns)}")
        print(f"Actual features: {len(X_test.columns)}")
        
        # Make predictions
        predictions = self.stacking_model.predict(X_test)
        
        # Create results dataframe
        results = test_df.copy()
        results['predicted_price'] = predictions
        
        return results
    
    def save_model(self, filepath):
        """
        Save the trained model and preprocessors
        """
        if not self.is_trained:
            raise ValueError("No trained model to save")
        
        # Create directory if it doesn't exist
        os.makedirs(os.path.dirname(filepath), exist_ok=True)
        
        model_data = {
            'scaler': self.scaler,
            'imputer': self.imputer,
            'feature_columns': self.feature_columns,
            'expected_columns': self.expected_columns,
            'stacking_model': self.stacking_model,
            'best_params': self.best_params_,
            'is_trained': self.is_trained
        }
        
        joblib.dump(model_data, filepath)
        print(f"SMAPE-tuned StackingRegressor saved to {filepath}")
    
    def load_model(self, filepath):
        """
        Load a trained model and preprocessors
        """
        model_data = joblib.load(filepath)
        
        self.scaler = model_data['scaler']
        self.imputer = model_data['imputer']
        self.feature_columns = model_data['feature_columns']
        self.expected_columns = model_data['expected_columns']
        self.stacking_model = model_data['stacking_model']
        self.best_params_ = model_data['best_params']
        self.is_trained = model_data['is_trained']
        
        print(f"SMAPE-tuned StackingRegressor loaded from {filepath}")
        if self.best_params_:
            print(f"Best parameters: {self.best_params_}")

# ========== MAIN EXECUTION PIPELINE ==========

def run_complete_pipeline(train_file='../data/train.csv', test_file='../data/test.csv'):
    """
    Complete pipeline with robust error handling
    """
    try:
        print("=== AMAZON PRODUCT PRICE PREDICTION PIPELINE ===\n")
        
        # Step 1: Load training data (with prices)
        print("1. Loading training data...")
        train_df = pd.read_csv(train_file)
        
        print(f"Training data shape: {train_df.shape}")
        print("Training columns:", train_df.columns.tolist())
        
        if 'price' not in train_df.columns:
            raise ValueError("Training data must contain 'price' column")
        
        missing_prices = train_df['price'].isnull().sum()
        print(f"Available prices in training data: {len(train_df) - missing_prices}")
        print(f"Missing prices in training data: {missing_prices}")
        
        # Step 2: Load test data (without prices)
        print("\n2. Loading test data...")
        test_df = pd.read_csv(test_file)
        
        print(f"Test data shape: {test_df.shape}")
        print("Test columns:", test_df.columns.tolist())
        
        if 'price' in test_df.columns:
            if test_df['price'].isnull().all():
                print("Test data contains 'price' column with all null values - will ignore for predictions")
                test_df = test_df.drop('price', axis=1)
            else:
                print("Warning: Test data contains non-null 'price' column")
        
        # Step 3: Initialize and train the model
        print("\n3. Training SMAPE-tuned StackingRegressor...")
        smape_predictor = RobustSMAPEStackingPricePredictor()
        
        # Train with hyperparameter tuning
        training_results = smape_predictor.train(train_df, perform_tuning=True)
        
        # Step 4: Save the trained model
        print("\n4. Saving trained model...")
        smape_predictor.save_model('../Model/robust_smape_stacking_predictor.pkl')
        
        # Step 5: Make predictions on test data
        print("\n5. Making predictions on test data...")
        test_predictions = smape_predictor.predict(test_df)
        
        print("Prediction completed!")
        print(f"Predicted prices for {len(test_predictions)} test products")
        
        # Step 6: Display prediction results
        print("\n6. Prediction results:")
        print(test_predictions[['sample_id', 'predicted_price']].head(10))
        
        print("\nTest Prediction Statistics:")
        print(test_predictions['predicted_price'].describe())
        
        # Step 7: Save results
        print("\n7. Saving results...")
        # Create Model directory if it doesn't exist
        os.makedirs('../Model', exist_ok=True)
        test_predictions[['sample_id', 'predicted_price']].to_csv('../Model/final_test_predictions.csv', index=False)
        print("Predictions saved to '../Model/final_test_predictions.csv'")
        
        return smape_predictor, test_predictions, training_results
        
    except FileNotFoundError as e:
        print(f"File not found error: {e}")
        print("Please check that the data files exist in the specified paths")
        return None, None, None
    except Exception as e:
        print(f"Error in pipeline: {str(e)}")
        print("Traceback:", traceback.format_exc())
        return None, None, None

# Run the complete pipeline
if __name__ == "__main__":
    predictor, predictions, results = run_complete_pipeline('../data/train.csv', '../data/test.csv')
    
    if predictor is not None:
        print("\n=== PIPELINE COMPLETED SUCCESSFULLY ===")
        print(f"Final SMAPE: {results['smape']:.4f}%")
        print(f"Final R²: {results['r2']:.4f}")
    else:
        print("\n=== PIPELINE FAILED ===")

=== AMAZON PRODUCT PRICE PREDICTION PIPELINE ===

1. Loading training data...


KeyboardInterrupt: 