# 1) Import Libraries and Data Loading

In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import json
import pickle
import warnings
from pathlib import Path
from collections import Counter
import time

# Machine Learning libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from scipy.sparse import csr_matrix
import scipy.sparse as sp

# Suppress warnings
# NOTE: the 'ignore' action with no category filter silences every warning
# raised after this point, including deprecation notices from the libraries
# imported above.
warnings.filterwarnings('ignore')

# Status banner printed once the import cell has run to completion.
print("‚úÖ All libraries imported successfully!")
print("üéØ Ready to build the Teddy Recommendation System!")

‚úÖ All libraries imported successfully!
üéØ Ready to build the Teddy Recommendation System!


In [2]:
# Load and preprocess data
def _read_ndjson(path):
    """Return the decoded JSON value of every non-blank line in *path*."""
    with open(path, 'r', encoding='utf-8') as f:
        # Blank or whitespace-only lines (e.g. a stray trailing newline) are
        # skipped; json.loads would raise on them.
        return [json.loads(line) for line in f if line.strip()]


def load_data(products_path='final_catalog_clean_urls.ndjson',
              events_path='catalog_user_events_gcp_final.ndjson'):
    """Load the product catalog and the user-event log from NDJSON files.

    Args:
        products_path: Path of the newline-delimited JSON product catalog.
            Defaults to the file name the notebook has always used.
        events_path: Path of the newline-delimited JSON user-event log.
            Defaults to the file name the notebook has always used.

    Returns:
        A tuple ``(raw_products, raw_events)`` of two lists holding one
        decoded JSON value per non-blank line of the respective file.

    Raises:
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    print("üîÑ Loading data...")

    # Load products
    raw_products = _read_ndjson(products_path)

    # Load user events
    raw_events = _read_ndjson(events_path)

    print(f"‚úÖ Loaded {len(raw_products):,} products and {len(raw_events):,} user events")
    return raw_products, raw_events

# Column order of the product table; passing it to the DataFrame constructor
# keeps the columns present even when the catalog is empty.
_PRODUCT_COLUMNS = [
    'product_id', 'title', 'description', 'category_main', 'brand_main',
    'price', 'age_group', 'color', 'features', 'tags', 'availability',
    'original_price', 'gender', 'discount_percent', 'combined_features',
    'content_text',
]


def _first_attribute_text(attributes, name):
    """Return the first ``text`` entry of attribute *name* as a string, or ''."""
    values = (attributes.get(name) or {}).get('text')
    return str(values[0]) if values else ''


def _join_values(values):
    """Join *values* with single spaces; a missing or empty value gives ''."""
    return ' '.join(map(str, values)) if values else ''


def preprocess_products(raw_products):
    """Flatten raw catalog records into a product DataFrame.

    For every record the function extracts id, title, description, the first
    category and brand, current and original price, the first ``text`` entry
    of the ``age_group``/``color``/``gender`` attributes, the joined
    ``features`` attribute and ``tags``, and the availability flag. It then
    derives ``discount_percent`` plus two text columns (``combined_features``
    and ``content_text``) that concatenate the fields above.

    Fields that are absent or explicitly ``null`` fall back to '' for text,
    'Unknown' for category/brand, 'UNKNOWN' for availability and 0.0 for
    prices.

    Args:
        raw_products: Iterable of dicts as decoded from the catalog file.

    Returns:
        pandas.DataFrame with one row per input record and the columns listed
        in ``_PRODUCT_COLUMNS`` (an empty frame with those columns when the
        input is empty).

    Raises:
        ValueError: If a price value is present but cannot be parsed by
            ``float``.
    """
    print("üîÑ Preprocessing products with enhanced fields...")

    processed_products = []
    for product in raw_products:
        # `or {}` also covers keys that are present with a JSON null value.
        attributes = product.get('attributes') or {}
        price_info = product.get('priceInfo') or {}
        categories = product.get('categories')
        brands = product.get('brands')

        # Extract and clean basic fields
        product_info = {
            'product_id': str(product.get('id', '')),
            'title': str(product.get('title', '')),
            'description': str(product.get('description', '')),
            'category_main': str(categories[0] if categories else 'Unknown'),
            'brand_main': str(brands[0] if brands else 'Unknown'),
            'price': float(price_info.get('price') or 0),

            # Enhanced fields for better recommendations
            'age_group': _first_attribute_text(attributes, 'age_group'),
            'color': _first_attribute_text(attributes, 'color'),
            'features': _join_values((attributes.get('features') or {}).get('text')),
            'tags': _join_values(product.get('tags')),
            'availability': str(product.get('availability', 'UNKNOWN')),
            'original_price': float(price_info.get('originalPrice') or 0),

            # Legacy fields
            'gender': _first_attribute_text(attributes, 'gender'),
        }

        # Calculate discount percentage for deal-based recommendations.
        # Only defined when both prices are positive; the value is negative
        # when the current price exceeds the original price.
        if product_info['original_price'] > 0 and product_info['price'] > 0:
            product_info['discount_percent'] = ((product_info['original_price'] - product_info['price']) / product_info['original_price']) * 100
        else:
            product_info['discount_percent'] = 0.0

        # Create enhanced combined text features with new fields
        product_info['combined_features'] = f"{product_info['category_main']} {product_info['brand_main']} {product_info['age_group']} {product_info['color']} {product_info['features']} {product_info['tags']} {product_info['gender']}"
        product_info['content_text'] = f"{product_info['title']} {product_info['description']} {product_info['combined_features']}"

        processed_products.append(product_info)

    products_df = pd.DataFrame(processed_products, columns=_PRODUCT_COLUMNS)
    print(f"‚úÖ Processed {len(products_df):,} products with enhanced fields")
    print(f"üìä Categories: {products_df['category_main'].nunique()}, Brands: {products_df['brand_main'].nunique()}")
    print(f"üéØ New Fields: Age Groups: {products_df['age_group'].nunique()}, Colors: {products_df['color'].nunique()}")
    print(f"üì¶ Availability: {products_df['availability'].value_counts().to_dict()}")

    return products_df

def preprocess_events(raw_events):
    """Flatten raw user events and aggregate them into weighted interactions.

    Every event is expanded into one row per product listed under
    ``productDetails``. Events without product details, without a visitor id,
    or products without an id are dropped; an id that is JSON ``null`` counts
    as missing. The raw ``eventType`` is mapped to 'view', 'add_to_cart' or
    'purchase'; any other event type is counted as a 'view'.

    Args:
        raw_events: Iterable of dicts as decoded from the event log.

    Returns:
        A tuple ``(events_df, interaction_matrix)``:

        * ``events_df`` - one row per (event, product) with columns
          ``user_id``, ``product_id``, ``event_type``, ``timestamp`` and
          ``weight`` (view=1, add_to_cart=2, purchase=3).
        * ``interaction_matrix`` - long-format frame with columns
          ``user_id``, ``product_id`` and ``weight``, where ``weight`` is the
          sum of event weights for that user/product pair.
    """
    print("üîÑ Preprocessing user events...")

    # Both lookup tables are loop-invariant, so they are built once up front.
    event_type_names = {
        'detail-page-view': 'view',
        'add-to-cart': 'add_to_cart',
        'purchase-complete': 'purchase'
    }
    # Weight: view=1, cart=2, purchase=3
    weight_map = {'view': 1, 'add_to_cart': 2, 'purchase': 3}

    events_data = []
    for event in raw_events:
        # Extract visitor/user ID. str(None) would yield the truthy string
        # 'None' and merge all anonymous events into one fake user.
        raw_visitor = event.get('visitorId')
        visitor_id = '' if raw_visitor is None else str(raw_visitor)

        # Extract product details (can be multiple products per event)
        product_details = event.get('productDetails') or []
        if not visitor_id or not product_details:
            continue

        # Extract event type and map it; unknown types default to 'view'
        event_type_mapped = event_type_names.get(str(event.get('eventType', '')), 'view')

        # Create event for each product in the event
        for product_detail in product_details:
            product_info = product_detail.get('product') or {}
            raw_product_id = product_info.get('id')
            product_id = '' if raw_product_id is None else str(raw_product_id)
            if not product_id:
                continue

            events_data.append({
                'user_id': visitor_id,
                'product_id': product_id,
                'event_type': event_type_mapped,
                'timestamp': event.get('eventTime', 0)
            })

    # Explicit columns keep the frame usable when no event survived filtering.
    events_df = pd.DataFrame(
        events_data, columns=['user_id', 'product_id', 'event_type', 'timestamp'])

    # Create interaction weights
    events_df['weight'] = events_df['event_type'].map(weight_map).fillna(1)

    # Aggregate interactions
    interaction_matrix = events_df.groupby(['user_id', 'product_id'])['weight'].sum().reset_index()

    print(f"‚úÖ Processed {len(events_df):,} events into {len(interaction_matrix):,} user-product interactions")
    print(f"üë• Users: {interaction_matrix['user_id'].nunique():,}")
    print(f"üì¶ Products: {interaction_matrix['product_id'].nunique():,}")

    return events_df, interaction_matrix

# Load and preprocess all data
# Runs when the cell executes: reads both NDJSON files, then derives the
# product table, the per-event table and the aggregated user-product weights.
# The results are module-level names used by the cells below.
raw_products, raw_events = load_data()
products_df = preprocess_products(raw_products)
events_df, interaction_matrix = preprocess_events(raw_events)

üîÑ Loading data...
‚úÖ Loaded 14,339 products and 787,416 user events
üîÑ Preprocessing products with enhanced fields...
‚úÖ Processed 14,339 products with enhanced fields
üìä Categories: 46, Brands: 981
üéØ New Fields: Age Groups: 27, Colors: 13
üì¶ Availability: {'IN_STOCK': 14339}
üîÑ Preprocessing user events...
‚úÖ Processed 787,416 events into 696,888 user-product interactions
üë• Users: 466,475
üì¶ Products: 14,339


In [3]:
# Create feature matrices and mappings
def create_feature_matrices(products_df, interaction_matrix):
    """Build the content TF-IDF matrix and the sparse user x product matrix.

    Args:
        products_df: Product table; its ``content_text`` column is vectorized.
        interaction_matrix: Long-format frame with ``user_id``,
            ``product_id`` and ``weight`` columns.

    Returns:
        A 7-tuple ``(tfidf_matrix, tfidf_vectorizer, sparse_matrix,
        user_to_idx, product_to_idx, idx_to_user, idx_to_product)``. Row and
        column numbers of ``sparse_matrix`` follow the order in which users
        and products first appear in ``interaction_matrix``.
    """
    print("üîÑ Creating feature matrices...")

    # --- Content side: TF-IDF over the combined product text ---------------
    vectorizer_options = dict(
        max_features=5000,
        stop_words='english',
        ngram_range=(1, 2),
        min_df=2,
        max_df=0.8,
    )
    tfidf_vectorizer = TfidfVectorizer(**vectorizer_options)
    tfidf_matrix = tfidf_vectorizer.fit_transform(products_df['content_text'])
    print(f"‚úÖ TF-IDF matrix created: {tfidf_matrix.shape}")

    # --- Collaborative side: id <-> position lookups -----------------------
    user_ids = interaction_matrix['user_id'].unique()
    product_ids = interaction_matrix['product_id'].unique()

    user_to_idx = dict(zip(user_ids, range(len(user_ids))))
    product_to_idx = dict(zip(product_ids, range(len(product_ids))))
    idx_to_user = {position: user for user, position in user_to_idx.items()}
    idx_to_product = {position: product for product, position in product_to_idx.items()}

    # Translate every interaction row into (row, col, value) coordinates.
    row_positions = list(map(user_to_idx.__getitem__, interaction_matrix['user_id']))
    col_positions = list(map(product_to_idx.__getitem__, interaction_matrix['product_id']))
    weights = interaction_matrix['weight'].values

    matrix_shape = (len(user_ids), len(product_ids))
    sparse_matrix = csr_matrix((weights, (row_positions, col_positions)), shape=matrix_shape)

    print(f"‚úÖ Sparse matrix created: {sparse_matrix.shape}")
    print(f"üìä Matrix density: {sparse_matrix.nnz / (sparse_matrix.shape[0] * sparse_matrix.shape[1]) * 100:.4f}%")

    return (
        tfidf_matrix,
        tfidf_vectorizer,
        sparse_matrix,
        user_to_idx,
        product_to_idx,
        idx_to_user,
        idx_to_product,
    )

# Create all feature matrices
# Runs when the cell executes; the seven returned objects become module-level
# names (TF-IDF matrix and vectorizer, sparse interaction matrix, and the
# four id/position lookup dicts).
tfidf_matrix, tfidf_vectorizer, sparse_matrix, user_to_idx, product_to_idx, idx_to_user, idx_to_product = create_feature_matrices(products_df, interaction_matrix)

üîÑ Creating feature matrices...
‚úÖ TF-IDF matrix created: (14339, 5000)
‚úÖ Sparse matrix created: (466475, 14339)
üìä Matrix density: 0.0104%


# 2) Model Training with Enhanced Approaches

## i) Content-Based Recommender with Enhanced Brand Coverage

In [4]:
class ContentBasedRecommender:
    """Content-based recommender with stock, age, colour, discount and brand rules.

    Users with usable history are scored by TF-IDF cosine similarity between
    a text profile (categories, brands, age groups and colours of the
    products they interacted with) and each product's ``content_text``.
    Users without usable history get popularity-based cold-start results.

    Both paths only return products whose ``availability`` is 'IN_STOCK' and
    allow at most ``MAX_PER_BRAND`` products per brand.
    """

    # Scoring knobs (values unchanged from the original inline constants).
    BRAND_BOOST_FACTOR = 1.5      # multiplier for brands the user already knows
    NEW_BRAND_BONUS = 1.2         # extra factor on top of the boost for unseen brands
    COLOR_MATCH_BOOST = 1.3       # multiplier when the product colour matches the user's
    REPEAT_BRAND_PENALTY = 0.8    # multiplier for the second product of a brand
    MAX_PER_BRAND = 2             # hard cap of products per brand in one result
    MAX_DISCOUNT_BOOST = 2.0      # upper bound of the discount multiplier

    def __init__(self, products_df, tfidf_matrix, interaction_matrix):
        """Index the catalog and keep references to the interaction data.

        Args:
            products_df: Product table with at least ``product_id``,
                ``brand_main``, ``category_main``, ``title``, ``price``,
                ``age_group``, ``color``, ``discount_percent``,
                ``availability`` and ``content_text`` columns.
            tfidf_matrix: Stored on the instance; not read by any method of
                this class.
            interaction_matrix: Long-format frame with ``user_id``,
                ``product_id`` and ``weight`` columns.
        """
        self.products_df = products_df.reset_index(drop=True)
        self.tfidf_matrix = tfidf_matrix
        self.interaction_matrix = interaction_matrix

        # Create mappings (positions refer to rows of self.products_df)
        self.product_id_to_idx = {pid: idx for idx, pid in enumerate(self.products_df['product_id'])}
        self.idx_to_product_id = {idx: pid for pid, idx in self.product_id_to_idx.items()}

        # Brand and category analytics
        self.brand_counts = Counter(self.products_df['brand_main'])
        self.category_counts = Counter(self.products_df['category_main'])
        self.total_brands = len(self.brand_counts)
        self.total_categories = len(self.category_counts)

        # Create brand-product and category-product mappings for diversity.
        # The stored number is the row position in self.products_df, so it
        # stays valid for .iloc even when the caller's frame had a
        # non-default index.
        self.brand_products = {}
        self.category_products = {}
        catalog_rows = zip(
            self.products_df['brand_main'],
            self.products_df['category_main'],
            self.products_df['product_id'],
        )
        for position, (brand, category, pid) in enumerate(catalog_rows):
            self.brand_products.setdefault(brand, []).append((pid, position))
            self.category_products.setdefault(category, []).append((pid, position))

        print(f"‚úÖ Enhanced Content-Based initialized")
        print(f"üìä {len(self.products_df)} products, {self.total_brands} brands, {self.total_categories} categories")

    def get_user_recommendations(self, user_id, n_recommendations=10):
        """Return up to ``n_recommendations`` content-based recommendations.

        Args:
            user_id: Id to look up in ``interaction_matrix['user_id']``.
            n_recommendations: Maximum number of results.

        Returns:
            List of dicts (keys: product_id, title, brand, category, price,
            age_group, color, discount_percent, availability,
            recommendation_score, source) sorted by descending
            ``recommendation_score``. Falls back to the cold-start list when
            the user has no interactions or none of them is in the catalog.
        """
        user_interactions = self.interaction_matrix[self.interaction_matrix['user_id'] == user_id]
        if user_interactions.empty:
            return self._cold_start_diverse_recommendations(n_recommendations)

        # Build user profile from interactions
        user_products = set(user_interactions['product_id'])
        user_brands, user_categories = set(), set()
        user_age_groups, user_colors = set(), set()

        for pid in user_products:
            idx = self.product_id_to_idx.get(pid)
            if idx is None:
                continue
            product = self.products_df.iloc[idx]
            user_brands.add(product['brand_main'])
            user_categories.add(product['category_main'])
            if product['age_group']:
                user_age_groups.add(product['age_group'])
            if product['color']:
                user_colors.add(product['color'])

        # Create user profile text. Each group is sorted so the string is
        # reproducible across runs; word order does not affect the unigram
        # TF-IDF vector built from it.
        profile_groups = (user_categories, user_brands, user_age_groups, user_colors)
        user_profile_text = ' '.join(' '.join(sorted(group)) for group in profile_groups).strip()

        if not user_profile_text:
            return self._cold_start_diverse_recommendations(n_recommendations)

        # Imported here so the cold-start path works without scikit-learn.
        from sklearn.feature_extraction.text import TfidfVectorizer
        from sklearn.metrics.pairwise import cosine_similarity

        # NOTE(review): a fresh vectorizer is fitted over the whole catalog on
        # every call, which is the dominant cost of this method. Reusing a
        # fitted vectorizer would change vocabulary/IDF and therefore scores,
        # so it is left as is.
        temp_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
        combined_texts = [user_profile_text] + self.products_df['content_text'].tolist()
        temp_matrix = temp_vectorizer.fit_transform(combined_texts)

        # Row 0 is the user profile, the remaining rows are the products.
        similarities = cosine_similarity(temp_matrix[0:1], temp_matrix[1:]).flatten()

        recommendations_list = []
        brand_count = {}
        pool_size = n_recommendations * 3  # candidates collected before the final sort

        # Walk products from most to least similar
        for idx in np.argsort(similarities)[::-1]:
            if len(recommendations_list) >= pool_size:
                break

            product = self.products_df.iloc[idx]
            product_id = product['product_id']

            # Skip already interacted products
            if product_id in user_products:
                continue

            # AVAILABILITY FILTERING - Only recommend IN_STOCK products
            if product['availability'] != 'IN_STOCK':
                continue

            # AGE-APPROPRIATE FILTERING - Match user's age groups if available
            if not self._is_age_compatible(user_age_groups, product['age_group']):
                continue

            brand = product['brand_main']
            similarity_score = similarities[idx]

            # Brand scoring: unseen brands get a larger multiplier than
            # familiar ones.
            if brand in user_brands:
                final_score = similarity_score * self.BRAND_BOOST_FACTOR
            else:
                final_score = similarity_score * (self.BRAND_BOOST_FACTOR * self.NEW_BRAND_BONUS)

            # DISCOUNT-BASED SCORING - Boost products with good discounts
            final_score *= self._discount_boost(product['discount_percent'])

            # COLOR PREFERENCE SCORING - Boost matching colors
            if user_colors and product['color'] and product['color'] in user_colors:
                final_score *= self.COLOR_MATCH_BOOST

            # Brand cap: penalise the second product of a brand, drop the third
            seen = brand_count.get(brand, 0)
            if seen >= self.MAX_PER_BRAND:
                continue
            if brand in brand_count:
                final_score *= self.REPEAT_BRAND_PENALTY
            brand_count[brand] = seen + 1

            recommendations_list.append(
                self._format_recommendation(product, final_score, 'enhanced_similarity'))

        # Sort by final score and return top recommendations
        recommendations_list.sort(key=lambda x: x['recommendation_score'], reverse=True)
        return recommendations_list[:n_recommendations]

    def _cold_start_diverse_recommendations(self, n_recommendations):
        """Return popular in-stock products with a per-brand cap.

        Products are visited by descending summed interaction weight. The
        first product of each brand is multiplied by the brand boost and a
        rarity multiplier; the second is multiplied by the repeat penalty;
        further products of that brand are skipped.

        Args:
            n_recommendations: Maximum number of results.

        Returns:
            List of recommendation dicts (same keys as
            ``get_user_recommendations``) with ``source`` set to
            'enhanced_cold_start', sorted by descending score.
        """
        popular_products = self.interaction_matrix.groupby('product_id')['weight'].sum().reset_index()
        popular_products = popular_products.sort_values('weight', ascending=False)

        recommendations = []
        brand_count = {}
        pool_size = n_recommendations * 2  # candidates collected before the final sort

        for _, product_interaction in popular_products.iterrows():
            if len(recommendations) >= pool_size:
                break

            product_id = product_interaction['product_id']
            idx = self.product_id_to_idx.get(product_id)
            if idx is None:
                continue

            product = self.products_df.iloc[idx]

            # AVAILABILITY FILTERING for cold start
            if product['availability'] != 'IN_STOCK':
                continue

            brand = product['brand_main']

            # DISCOUNT-BASED SCORING for cold start
            base_score = product_interaction['weight'] * self._discount_boost(product['discount_percent'])

            # Brand diversity scoring
            seen = brand_count.get(brand, 0)
            if seen >= self.MAX_PER_BRAND:
                continue
            if brand in brand_count:
                final_score = base_score * self.REPEAT_BRAND_PENALTY
            else:
                # First product of a brand: the fewer catalog products the
                # brand has, the larger the multiplier (capped at 100).
                frequency = self.brand_counts[brand] / len(self.products_df)
                rarity_multiplier = min(20.0 / frequency, 100.0)
                final_score = base_score * self.BRAND_BOOST_FACTOR * rarity_multiplier
            brand_count[brand] = seen + 1

            recommendations.append(
                self._format_recommendation(product, final_score, 'enhanced_cold_start'))

        # Sort by final score and return top recommendations
        recommendations.sort(key=lambda x: x['recommendation_score'], reverse=True)
        return recommendations[:n_recommendations]

    def _discount_boost(self, discount_percent):
        """Return the score multiplier for a discount (1.0 when not positive)."""
        if discount_percent > 0:
            return min(1 + (discount_percent / 100), self.MAX_DISCOUNT_BOOST)
        return 1.0

    @staticmethod
    def _is_age_compatible(user_age_groups, product_age_group):
        """Return True unless both sides state age groups and none overlap.

        Two age-group strings are treated as compatible when either one is a
        substring of the other.
        """
        if not user_age_groups or not product_age_group:
            return True
        return any(
            user_age in product_age_group or product_age_group in user_age
            for user_age in user_age_groups
        )

    @staticmethod
    def _format_recommendation(product, score, source):
        """Build the result dict for one product row."""
        return {
            'product_id': product['product_id'],
            'title': product['title'],
            'brand': product['brand_main'],
            'category': product['category_main'],
            'price': product['price'],
            'age_group': product['age_group'],
            'color': product['color'],
            'discount_percent': product['discount_percent'],
            'availability': product['availability'],
            'recommendation_score': score,
            'source': source
        }

# Initialize Enhanced Content-Based Recommender
# The constructor only indexes the catalog (no model is fitted here); the
# instance is kept as a module-level name for the cells below.
print("üîÑ Training Enhanced Content-Based Recommender...")
content_recommender = ContentBasedRecommender(products_df, tfidf_matrix, interaction_matrix)
print("‚úÖ Enhanced Content-Based Recommender ready!")

üîÑ Training Enhanced Content-Based Recommender...
‚úÖ Enhanced Content-Based initialized
üìä 14339 products, 981 brands, 46 categories
‚úÖ Enhanced Content-Based Recommender ready!


## ii) Collaborative Filtering with Matrix Factorization

In [5]:
from collections import Counter
from sklearn.metrics import mean_squared_error
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
import numpy as np

class CollaborativeFilteringRecommender:
    """Collaborative-filtering recommender with brand diversification.

    ``train_model`` factorizes the user x product weight matrix with a
    truncated SVD. Known users are scored with the factors; if no factors are
    available, or the user is unknown, a brand-aware popularity ranking is
    used instead. Results are limited to 'IN_STOCK' products and diversified
    so that distinct brands are preferred.
    """

    def __init__(self, interaction_matrix, products_df, min_interactions=1):
        """Filter the interactions and build id lookups and product metadata.

        Args:
            interaction_matrix: Long-format frame with ``user_id``,
                ``product_id`` and ``weight`` columns.
            products_df: Product table with ``product_id``, ``brand_main``,
                ``category_main``, ``title``, ``price``, ``age_group``,
                ``color``, ``discount_percent`` and ``availability`` columns.
            min_interactions: Users with fewer rows than this in
                ``interaction_matrix`` are excluded from the model.
        """
        self.interaction_matrix = interaction_matrix
        self.products_df = products_df

        # Filter data
        user_counts = interaction_matrix['user_id'].value_counts()
        product_counts = interaction_matrix['product_id'].value_counts()
        active_users = user_counts[user_counts >= min_interactions].index
        available_products = product_counts[product_counts >= 1].index

        self.filtered_interaction_matrix = interaction_matrix[
            (interaction_matrix['user_id'].isin(active_users)) &
            (interaction_matrix['product_id'].isin(available_products))
        ]

        self.unique_users = sorted(self.filtered_interaction_matrix['user_id'].unique())
        self.unique_products = sorted(self.filtered_interaction_matrix['product_id'].unique())

        # Create mappings and metadata
        self.user_to_idx = {user: idx for idx, user in enumerate(self.unique_users)}
        self.product_to_idx = {product: idx for idx, product in enumerate(self.unique_products)}
        self.idx_to_product = {idx: product for product, idx in self.product_to_idx.items()}
        self.brand_counts = Counter(products_df['brand_main'])
        self.product_metadata = {
            row['product_id']: {
                'brand': row['brand_main'], 'category': row['category_main'],
                'title': row['title'], 'price': row['price'],
                'age_group': row['age_group'], 'color': row['color'],
                'discount_percent': row['discount_percent'], 'availability': row['availability']
            }
            for _, row in products_df.iterrows()
        }

        # Model state: SVD factors are set by train_model(); the popularity
        # ranking is built by train_model() or lazily on first use.
        self.U = None
        self.sigma = None
        self.Vt = None
        self.brand_aware_popularity = None

        print(f"‚úÖ Enhanced CF initialized - {len(self.unique_users):,} users, {len(self.unique_products):,} products")

    def _create_sparse_matrix(self):
        """Create the sparse user x product matrix of interaction weights.

        Rows follow ``self.unique_users`` and columns follow
        ``self.unique_products``.
        """
        frame = self.filtered_interaction_matrix
        # Every id in the filtered frame is in the lookup dicts by
        # construction, so the column-wise map yields integers throughout.
        rows = np.asarray(frame['user_id'].map(self.user_to_idx), dtype=np.int64)
        cols = np.asarray(frame['product_id'].map(self.product_to_idx), dtype=np.int64)
        data = frame['weight'].to_numpy()

        return csr_matrix((data, (rows, cols)), shape=(len(self.unique_users), len(self.unique_products)))

    def train_model(self, n_factors=60):
        """Fit the truncated SVD and (re)build the popularity fallback.

        Args:
            n_factors: Number of singular values/vectors requested from
                ``svds``.

        If the factorization raises, the error is printed, no factors are
        stored and recommendations fall back to the popularity ranking. The
        printed RMSE is computed on a random sample of the entries the model
        was fitted on, not on held-out data.
        """
        sparse_matrix = self._create_sparse_matrix()

        # Drop factors from any earlier run so a failed retrain cannot leave
        # a stale model behind.
        self.U = self.sigma = self.Vt = None

        try:
            U, sigma, Vt = svds(sparse_matrix.astype(np.float64), k=n_factors, solver='arpack')

            # Store components; a small constant is added to every singular
            # value (labelled "light regularization" by the original author).
            self.U = U
            self.sigma = sigma + 0.01
            self.Vt = Vt

            # Calculate RMSE on a sample of the stored entries
            sample_size = min(10000, sparse_matrix.nnz)
            test_indices = np.random.choice(sparse_matrix.nnz, sample_size, replace=False)
            rows, cols = sparse_matrix.nonzero()

            actual = sparse_matrix.data[test_indices]
            # Row-wise dot product of (U * sigma)[user] with Vt[:, product].
            predicted = np.einsum(
                'ij,ij->i',
                U[rows[test_indices]] * self.sigma,
                Vt[:, cols[test_indices]].T,
            )

            rmse = np.sqrt(np.mean((actual - predicted) ** 2))
            print(f"‚úÖ Model trained - RMSE: {rmse:.4f}")

        except Exception as e:
            # Deliberately broad: any failure here degrades to the popularity
            # fallback instead of aborting the notebook.
            print(f"‚ö†Ô∏è SVD failed: {e}")

        self._create_brand_aware_popularity()

    def _create_brand_aware_popularity(self):
        """Build the popularity ranking used as fallback and for cold start.

        Each in-stock product's summed interaction weight is multiplied by a
        brand boost (larger for brands with few catalog products, capped at
        20) and by its discount boost. The result is stored in
        ``self.brand_aware_popularity`` sorted by descending score.
        """
        popularity = self.filtered_interaction_matrix.groupby('product_id')['weight'].sum()
        catalog_size = len(self.products_df)
        boosted = {}

        for product_id, score in popularity.items():
            metadata = self.product_metadata.get(product_id)
            if metadata is None:
                continue

            # Skip out-of-stock products
            if metadata['availability'] != 'IN_STOCK':
                continue

            frequency = self.brand_counts[metadata['brand']] / catalog_size
            boost = min(2.0 / frequency, 20.0)

            boosted[product_id] = score * boost * self._discount_boost(metadata['discount_percent'])

        self.brand_aware_popularity = pd.Series(boosted, dtype=float).sort_values(ascending=False)

    def get_user_recommendations(self, user_id, n_recommendations=10):
        """Return up to ``n_recommendations`` recommendations for ``user_id``.

        Args:
            user_id: Id to look up among the users kept by the constructor.
            n_recommendations: Maximum number of results.

        Returns:
            List of dicts with keys product_id, title, category, brand,
            price, age_group, color, discount_percent, availability and
            predicted_rating. Unknown users get the cold-start list.
        """
        if user_id not in self.user_to_idx:
            return self._cold_start_recommend(n_recommendations)

        user_idx = self.user_to_idx[user_id]
        frame = self.filtered_interaction_matrix
        user_interactions = set(frame.loc[frame['user_id'] == user_id, 'product_id'])

        # Extract user preferences for filtering
        user_age_groups = set()
        user_colors = set()
        for pid in user_interactions:
            metadata = self.product_metadata.get(pid)
            if metadata is None:
                continue
            if metadata['age_group']:
                user_age_groups.add(metadata['age_group'])
            if metadata['color']:
                user_colors.add(metadata['color'])

        # Generate predictions: SVD scores when factors exist, otherwise the
        # popularity ranking.
        if getattr(self, 'U', None) is not None:
            user_profile = self.U[user_idx, :]
            scores = np.dot(user_profile, self.sigma.reshape(-1, 1) * self.Vt).flatten()
            product_scores = list(zip(self.unique_products, scores))
        else:
            product_scores = list(self._get_popularity().items())

        # Filter out interacted products and apply enhanced filtering
        filtered_scores = []
        for pid, score in product_scores:
            if pid in user_interactions:
                continue
            metadata = self.product_metadata.get(pid)
            if metadata is None:
                continue

            # AVAILABILITY FILTERING
            if metadata['availability'] != 'IN_STOCK':
                continue

            # AGE-APPROPRIATE FILTERING
            if not self._is_age_compatible(user_age_groups, metadata['age_group']):
                continue

            # ENHANCED SCORING: discount boost, then colour preference boost
            enhanced_score = score * self._discount_boost(metadata['discount_percent'])
            if user_colors and metadata['color'] and metadata['color'] in user_colors:
                enhanced_score *= 1.3

            filtered_scores.append((pid, enhanced_score))

        return self._diversify_by_brand(filtered_scores, n_recommendations)

    def _diversify_by_brand(self, product_scores, num_recommendations):
        """Pick up to ``num_recommendations`` products, preferring new brands.

        Phase 1 walks the candidates by descending score and keeps the best
        product of each brand. Phase 2 runs only when phase 1 leaves slots
        open: it fills them with the best remaining in-stock candidates, so
        the caller gets a full list whenever enough candidates exist. (In the
        original code phase 2 searched brands that, by construction, had no
        product among the candidates and therefore never added anything.)

        Args:
            product_scores: Iterable of ``(product_id, score)`` pairs.
            num_recommendations: Maximum number of results.

        Returns:
            List of recommendation dicts; ``predicted_rating`` is the
            candidate score multiplied by the brand rarity multiplier.
        """
        ranked = sorted(product_scores, key=lambda x: x[1], reverse=True)
        recommendations, used_brands = [], set()
        deferred = []  # candidates skipped in phase 1 because their brand was taken

        # PHASE 1: one product per brand
        for product_id, score in ranked:
            if len(recommendations) >= num_recommendations:
                break

            metadata = self.product_metadata.get(product_id)
            if metadata is None:
                continue

            brand = metadata['brand']
            if brand in used_brands:
                deferred.append((product_id, score))
                continue

            used_brands.add(brand)
            recommendations.append(self._format_recommendation(
                product_id, metadata, score * self._rarity_multiplier(brand)))

        # PHASE 2: fill remaining slots with the next-best candidates
        for product_id, score in deferred:
            if len(recommendations) >= num_recommendations:
                break

            metadata = self.product_metadata[product_id]
            if metadata['availability'] != 'IN_STOCK':
                continue

            recommendations.append(self._format_recommendation(
                product_id, metadata, score * self._rarity_multiplier(metadata['brand'])))

        return recommendations[:num_recommendations]

    def _cold_start_recommend(self, num_recommendations):
        """Recommend from the top of the popularity ranking for unknown users.

        Only the ``2 * num_recommendations`` most popular in-stock products
        are passed to the brand diversification step.
        """
        available_products = [
            (pid, score) for pid, score in self._get_popularity().items()
            if pid in self.product_metadata and
            self.product_metadata[pid]['availability'] == 'IN_STOCK'
        ]
        return self._diversify_by_brand(available_products[:num_recommendations*2], num_recommendations)

    def _get_popularity(self):
        """Return the popularity ranking, building it on first use."""
        if getattr(self, 'brand_aware_popularity', None) is None:
            self._create_brand_aware_popularity()
        return self.brand_aware_popularity

    def _rarity_multiplier(self, brand):
        """Return the brand multiplier: 100 / catalog share, capped at 500."""
        frequency = self.brand_counts[brand] / len(self.products_df)
        return min(100.0 / frequency, 500.0)

    @staticmethod
    def _discount_boost(discount_percent):
        """Return the score multiplier for a discount (1.0 when not positive)."""
        if discount_percent > 0:
            return min(1 + (discount_percent / 100), 2.0)
        return 1.0

    @staticmethod
    def _is_age_compatible(user_age_groups, product_age_group):
        """Return True unless both sides state age groups and none overlap.

        Two age-group strings are treated as compatible when either one is a
        substring of the other.
        """
        if not user_age_groups or not product_age_group:
            return True
        return any(
            user_age in product_age_group or product_age_group in user_age
            for user_age in user_age_groups
        )

    @staticmethod
    def _format_recommendation(product_id, metadata, score):
        """Build the result dict for one product."""
        return {
            'product_id': product_id,
            'title': metadata['title'],
            'category': metadata['category'],
            'brand': metadata['brand'],
            'price': metadata['price'],
            'age_group': metadata['age_group'],
            'color': metadata['color'],
            'discount_percent': metadata['discount_percent'],
            'availability': metadata['availability'],
            'predicted_rating': float(score)
        }

# Initialize and train Enhanced Collaborative Filtering model
# min_interactions=1 keeps every user that appears in interaction_matrix;
# n_factors=60 is the number of singular values requested from svds.
cf_recommender = CollaborativeFilteringRecommender(interaction_matrix, products_df, min_interactions=1)
cf_recommender.train_model(n_factors=60)
print("‚úÖ Enhanced Collaborative Filtering ready!")

‚úÖ Enhanced CF initialized - 466,475 users, 14,339 products
‚úÖ Model trained - RMSE: 1.7233
‚úÖ Enhanced Collaborative Filtering ready!


## iii) Hybrid Recommendation System

In [16]:
from sklearn.preprocessing import MinMaxScaler

class HybridRecommendationSystem:
    """Blend content-based and collaborative-filtering (CF) recommendations.

    Both sub-recommenders must expose ``get_user_recommendations(user_id, n)``
    returning a list of dicts. Content records are scored through their
    ``'recommendation_score'`` key and CF records through ``'predicted_rating'``.
    Every record must carry ``product_id``, ``title``, ``category``, ``brand``
    and ``price``; ``age_group``, ``color``, ``discount_percent`` and
    ``availability`` are optional.
    """

    # Upper bound on how many items one brand may contribute to a result list
    _MAX_PER_BRAND = 2

    def __init__(self, content_recommender, cf_recommender, content_weight=0.65, cf_weight=0.35):
        """Store the two sub-recommenders and their base blending weights."""
        self.content_recommender = content_recommender
        self.cf_recommender = cf_recommender
        self.content_weight = content_weight
        self.cf_weight = cf_weight
        print(f"‚úÖ Enhanced Hybrid System initialized (Content: {content_weight*100}%, CF: {cf_weight*100}%)")

    @staticmethod
    def _discount(rec):
        """Return the record's discount percentage, treating a missing or None value as 0."""
        return rec.get('discount_percent') or 0

    @staticmethod
    def _build_recommendation(pid, rec, score):
        """Assemble the output dict for one selected product."""
        return {
            'product_id': pid, 'title': rec['title'], 'category': rec['category'],
            'brand': rec['brand'], 'price': rec['price'],
            'age_group': rec.get('age_group', ''),
            'color': rec.get('color', ''),
            'discount_percent': rec.get('discount_percent', 0),
            'availability': rec.get('availability', 'UNKNOWN'),
            'hybrid_score': score,
            'recommendation_type': 'enhanced_hybrid'
        }

    def _normalize_scores(self, recommendations, score_field):
        """Min-max scale ``score_field`` into ``normalized_<score_field>`` on each record.

        The records are modified in place and the same list is returned. When
        the list is empty or every score is identical there is no range to
        scale over, so each record receives 1.0.
        """
        target_field = f'normalized_{score_field}'
        scores = [rec[score_field] for rec in recommendations]

        if len(set(scores)) <= 1:
            for rec in recommendations:
                rec[target_field] = 1.0
            return recommendations

        scaler = MinMaxScaler()
        normalized = scaler.fit_transform(np.array(scores).reshape(-1, 1)).flatten()

        for rec, value in zip(recommendations, normalized):
            rec[target_field] = value
        return recommendations

    def get_user_recommendations(self, user_id, n_recommendations=10):
        """Return up to ``n_recommendations`` blended recommendations for ``user_id``.

        Requests ``3 * n`` content and ``2 * n`` CF candidates, normalizes each
        list's scores, combines them with the configured weights plus metadata
        and discount multipliers, then selects items so that every brand is
        represented once before any brand is allowed a second item.

        If only one sub-recommender returns results, its records are returned
        unchanged (truncated to ``n_recommendations``); in that case they do
        not carry ``hybrid_score`` or ``recommendation_type``. If neither
        returns results the list is empty.
        """
        content_recs = self.content_recommender.get_user_recommendations(user_id, n_recommendations * 3)
        cf_recs = self.cf_recommender.get_user_recommendations(user_id, n_recommendations * 2)

        # Fall back to whichever source produced something
        if not content_recs and not cf_recs:
            return []
        if not content_recs:
            return cf_recs[:n_recommendations]
        if not cf_recs:
            return content_recs[:n_recommendations]

        content_recs = self._normalize_scores(content_recs, 'recommendation_score')
        cf_recs = self._normalize_scores(cf_recs, 'predicted_rating')

        combined_scores = {}
        product_info = {}
        brand_sources = {}  # brand -> set of sources ('content' / 'cf') that proposed it

        for rec in content_recs:
            pid, brand = rec['product_id'], rec['brand']

            weight = self.content_weight
            # Records with both age group and color filled in get a boost
            if rec.get('age_group') and rec.get('color'):
                weight *= 1.2
            if self._discount(rec) > 10:
                weight *= 1.1

            combined_scores[pid] = weight * rec['normalized_recommendation_score']
            product_info[pid] = rec
            brand_sources.setdefault(brand, set()).add('content')

        for rec in cf_recs:
            pid, brand = rec['product_id'], rec['brand']

            weight = self.cf_weight
            if self._discount(rec) > 15:
                weight *= 1.15

            # A product proposed by both sources accumulates both contributions
            combined_scores[pid] = combined_scores.get(pid, 0) + weight * rec['normalized_predicted_rating']

            # The content record, when present, stays the source of display metadata
            if pid not in product_info:
                product_info[pid] = rec
            brand_sources.setdefault(brand, set()).add('cf')

        for pid in combined_scores:
            rec = product_info[pid]

            # Bonus is keyed on the brand: it applies when the product's brand
            # was proposed by both sources, even if this product came from one
            if len(brand_sources.get(rec['brand'], set())) == 2:
                combined_scores[pid] *= 1.2
            if rec.get('age_group'):
                combined_scores[pid] *= 1.05
            if self._discount(rec) > 20:
                combined_scores[pid] *= 1.1
            if rec.get('color') and rec['color'] not in ['Unknown', '']:
                combined_scores[pid] *= 1.03

        ranked = sorted(combined_scores.items(), key=lambda item: item[1], reverse=True)
        recommendations = []
        selected_ids = set()
        brand_counts = {}

        # First pass: the best-scoring product of each brand
        for pid, score in ranked:
            if len(recommendations) >= n_recommendations:
                break
            rec = product_info[pid]
            if rec['brand'] not in brand_counts:
                recommendations.append(self._build_recommendation(pid, rec, score))
                selected_ids.add(pid)
                brand_counts[rec['brand']] = 1

        # Second pass: fill the remaining slots, still capping each brand
        for pid, score in ranked:
            if len(recommendations) >= n_recommendations:
                break
            if pid in selected_ids:
                continue
            rec = product_info[pid]
            if brand_counts.get(rec['brand'], 0) < self._MAX_PER_BRAND:
                recommendations.append(self._build_recommendation(pid, rec, score))
                selected_ids.add(pid)
                brand_counts[rec['brand']] = brand_counts.get(rec['brand'], 0) + 1

        return recommendations

# Build the hybrid recommender on top of the two already-trained models,
# keeping the class's default content/CF weights.
print("üîÑ Creating Enhanced Hybrid Recommendation System...")
hybrid_recommender = HybridRecommendationSystem(
    content_recommender=content_recommender,
    cf_recommender=cf_recommender,
)
print("‚úÖ Enhanced Hybrid Recommendation System ready!")

üîÑ Creating Enhanced Hybrid Recommendation System...
‚úÖ Enhanced Hybrid System initialized (Content: 65.0%, CF: 35.0%)
‚úÖ Enhanced Hybrid Recommendation System ready!


# 🔍 4) Model Evaluation & Performance Analysis

In [7]:
# Enhanced Model Evaluation with Brand and Category Coverage
import time
import random

class EnhancedEvaluator:
    """Measure brand/category coverage and response time of recommender models.

    A fixed sample of users is drawn once at construction time and reused for
    every model evaluated, so results of different models are comparable.
    """

    def __init__(self, models, products_df, interaction_matrix, n_test_users=20, seed=None):
        """Record catalog totals and draw the test-user sample.

        Args:
            models: Mapping of model name to model object; stored as given.
            products_df: DataFrame with ``brand_main`` and ``category_main`` columns.
            interaction_matrix: DataFrame with a ``user_id`` column.
            n_test_users: Number of users to sample. Capped at the number of
                distinct users so small datasets do not raise.
            seed: Optional seed for a reproducible sample; ``None`` uses the
                module-level random generator.
        """
        self.models = models
        self.products_df = products_df
        self.total_brands = products_df['brand_main'].nunique()
        self.total_categories = products_df['category_main'].nunique()

        user_pool = list(interaction_matrix['user_id'].unique())
        sample_size = min(n_test_users, len(user_pool))
        sampler = random.Random(seed) if seed is not None else random
        self.test_users = sampler.sample(user_pool, sample_size)

    @staticmethod
    def _percentage(found, total):
        """Return ``found / total`` as a percentage, or 0.0 when ``total`` is zero."""
        return (found / total) * 100 if total else 0.0

    def evaluate_model(self, name, model):
        """Evaluate one model on the sampled users.

        Times a single 5-item request for the first test user, then requests
        20 recommendations for every test user and collects the distinct
        brands and categories seen (ignoring empty and ``'Unknown'`` values).

        Args:
            name: Display name of the model (not used in the computation).
            model: Object exposing ``get_user_recommendations(user_id, n)``.

        Returns:
            Dict with ``brand_coverage``, ``category_coverage``,
            ``coverage_score`` (70% brand + 30% category), ``response_time``
            in seconds, ``total_recs``, ``unique_brands``,
            ``unique_categories`` and ``success_rate``. If the evaluation as a
            whole fails, all metrics are 0 and an ``error`` message is added.
        """
        try:
            # perf_counter is monotonic, so the interval cannot be skewed by clock changes
            start_time = time.perf_counter()
            model.get_user_recommendations(self.test_users[0], 5)
            response_time = time.perf_counter() - start_time

            all_brands, all_categories = set(), set()
            total_recs = 0

            for user in self.test_users:
                try:
                    user_recs = model.get_user_recommendations(user, 20)

                    for rec in user_recs:
                        brand = rec.get('brand', '')
                        category = rec.get('category', '')
                        if brand and brand != 'Unknown':
                            all_brands.add(brand)
                        if category and category != 'Unknown':
                            all_categories.add(category)
                        total_recs += 1
                except Exception as e:
                    # One failing user must not abort the whole evaluation
                    print(f"Error with user {user}: {e}")
                    continue

            brand_coverage = self._percentage(len(all_brands), self.total_brands)
            category_coverage = self._percentage(len(all_categories), self.total_categories)
            coverage_score = (brand_coverage * 0.7 + category_coverage * 0.3)

            return {
                'brand_coverage': brand_coverage,
                'category_coverage': category_coverage,
                'coverage_score': coverage_score,
                'response_time': response_time,
                'total_recs': total_recs,
                'unique_brands': len(all_brands),
                'unique_categories': len(all_categories),
                'success_rate': 100 if total_recs > 0 else 0
            }

        except Exception as e:
            # Report the failure through the result dict so callers can display it
            return {
                'brand_coverage': 0, 'category_coverage': 0, 'coverage_score': 0,
                'response_time': 0, 'total_recs': 0, 'unique_brands': 0,
                'unique_categories': 0, 'success_rate': 0, 'error': str(e)
            }

# Create the shared evaluator; the same sampled users are reused for every model
print("Setting up enhanced evaluation framework...")
evaluator = EnhancedEvaluator(
    models={'Content-Based': content_recommender},
    products_df=products_df,
    interaction_matrix=interaction_matrix,
)
print("‚úÖ Enhanced evaluator ready!")

Setting up enhanced evaluation framework...
‚úÖ Enhanced evaluator ready!


In [13]:
# Evaluate the content-based recommender and print its coverage report
print("üéØ EVALUATING CONTENT-BASED RECOMMENDER")
print("="*50)

print("Testing Content-Based model...")
content_results = evaluator.evaluate_model('Content-Based', content_recommender)

print("\nCONTENT-BASED PERFORMANCE ANALYSIS")
print("=" * 45)

if 'error' not in content_results:
    score = content_results['coverage_score']
    # First threshold the score reaches decides the label; below all of them it is POOR
    rating = next(
        (label
         for floor, label in ((40, "OUTSTANDING"), (25, "EXCELLENT"), (15, "GOOD"), (8, "FAIR"))
         if score >= floor),
        "POOR",
    )

    print(
        f"\nContent-Based:",
        f"  Brand Coverage: {content_results['brand_coverage']:.1f}% ({content_results.get('unique_brands', 0)} brands)",
        f"  Category Coverage: {content_results['category_coverage']:.1f}% ({content_results.get('unique_categories', 0)} categories)",
        f"  Overall Score: {content_results['coverage_score']:.1f}% ({rating})",
        f"  Response: {content_results['response_time']:.3f}s",
        f"  Total Recs: {content_results['total_recs']}",
        f"  Success: {content_results['success_rate']:.0f}%",
        sep="\n",
    )
else:
    print(f"Content-Based: Error - {content_results['error']}")

print(f"\nCatalog Stats: {evaluator.total_brands} brands, {evaluator.total_categories} categories")
print("Content-Based evaluation complete!")

üéØ EVALUATING CONTENT-BASED RECOMMENDER
Testing Content-Based model...

CONTENT-BASED PERFORMANCE ANALYSIS

Content-Based:
  Brand Coverage: 18.9% (185 brands)
  Category Coverage: 69.6% (32 categories)
  Overall Score: 34.1% (EXCELLENT)
  Response: 0.643s
  Total Recs: 400
  Success: 100%

Catalog Stats: 981 brands, 46 categories
Content-Based evaluation complete!

CONTENT-BASED PERFORMANCE ANALYSIS

Content-Based:
  Brand Coverage: 18.9% (185 brands)
  Category Coverage: 69.6% (32 categories)
  Overall Score: 34.1% (EXCELLENT)
  Response: 0.643s
  Total Recs: 400
  Success: 100%

Catalog Stats: 981 brands, 46 categories
Content-Based evaluation complete!


In [14]:
# Evaluate the collaborative-filtering recommender and print its coverage report
print("ü§ù EVALUATING COLLABORATIVE FILTERING")
print("="*40)

print("Testing Collaborative Filtering model...")
cf_results = evaluator.evaluate_model('Collaborative Filtering', cf_recommender)

print("\nCOLLABORATIVE FILTERING PERFORMANCE ANALYSIS")
print("=" * 50)

if 'error' not in cf_results:
    score = cf_results['coverage_score']
    # First threshold the score reaches decides the label; below all of them it is POOR
    rating = next(
        (label
         for floor, label in ((40, "OUTSTANDING"), (25, "EXCELLENT"), (15, "GOOD"), (8, "FAIR"))
         if score >= floor),
        "POOR",
    )

    print(
        f"\nCollaborative Filtering:",
        f"  Brand Coverage: {cf_results['brand_coverage']:.1f}% ({cf_results.get('unique_brands', 0)} brands)",
        f"  Category Coverage: {cf_results['category_coverage']:.1f}% ({cf_results.get('unique_categories', 0)} categories)",
        f"  Overall Score: {cf_results['coverage_score']:.1f}% ({rating})",
        f"  Response: {cf_results['response_time']:.3f}s",
        f"  Total Recs: {cf_results['total_recs']}",
        f"  Success: {cf_results['success_rate']:.0f}%",
        sep="\n",
    )
else:
    print(f"Collaborative Filtering: Error - {cf_results['error']}")

print("Collaborative Filtering evaluation complete!")

ü§ù EVALUATING COLLABORATIVE FILTERING
Testing Collaborative Filtering model...

COLLABORATIVE FILTERING PERFORMANCE ANALYSIS

Collaborative Filtering:
  Brand Coverage: 9.4% (92 brands)
  Category Coverage: 60.9% (28 categories)
  Overall Score: 24.8% (GOOD)
  Response: 0.088s
  Total Recs: 400
  Success: 100%
Collaborative Filtering evaluation complete!

COLLABORATIVE FILTERING PERFORMANCE ANALYSIS

Collaborative Filtering:
  Brand Coverage: 9.4% (92 brands)
  Category Coverage: 60.9% (28 categories)
  Overall Score: 24.8% (GOOD)
  Response: 0.088s
  Total Recs: 400
  Success: 100%
Collaborative Filtering evaluation complete!


In [17]:
# Evaluate the hybrid recommender, then summarise all three models side by side
def _coverage_rating(coverage_score):
    """Translate a coverage score (in percent) into its qualitative label."""
    for floor, label in ((40, "OUTSTANDING"), (25, "EXCELLENT"), (15, "GOOD"), (8, "FAIR")):
        if coverage_score >= floor:
            return label
    return "POOR"


print("üîÑ EVALUATING HYBRID SYSTEM")
print("="*30)

print("Testing Hybrid System...")
hybrid_results = evaluator.evaluate_model('Hybrid System', hybrid_recommender)

print("\nHYBRID SYSTEM PERFORMANCE ANALYSIS")
print("=" * 40)

if 'error' not in hybrid_results:
    score = hybrid_results['coverage_score']
    rating = _coverage_rating(score)

    print(
        f"\nHybrid System:",
        f"  Brand Coverage: {hybrid_results['brand_coverage']:.1f}% ({hybrid_results.get('unique_brands', 0)} brands)",
        f"  Category Coverage: {hybrid_results['category_coverage']:.1f}% ({hybrid_results.get('unique_categories', 0)} categories)",
        f"  Overall Score: {hybrid_results['coverage_score']:.1f}% ({rating})",
        f"  Response: {hybrid_results['response_time']:.3f}s",
        f"  Total Recs: {hybrid_results['total_recs']}",
        f"  Success: {hybrid_results['success_rate']:.0f}%",
        sep="\n",
    )
else:
    print(f"Hybrid System: Error - {hybrid_results['error']}")

# Gather the per-model results (content and CF come from the earlier cells)
results = {
    'Content-Based': content_results,
    'Collaborative Filtering': cf_results,
    'Hybrid System': hybrid_results
}

print("\n" + "="*60)
print("üèÜ FINAL EVALUATION SUMMARY")
print("="*60)

for model_name, metrics in results.items():
    # Models whose evaluation failed are left out of the summary
    if 'error' in metrics:
        continue
    score = metrics['coverage_score']
    rating = _coverage_rating(score)
    print(f"{model_name}: {metrics['coverage_score']:.1f}% ({rating})")

print("\nEvaluation complete! ‚úÖ")

üîÑ EVALUATING HYBRID SYSTEM
Testing Hybrid System...

HYBRID SYSTEM PERFORMANCE ANALYSIS

Hybrid System:
  Brand Coverage: 18.8% (184 brands)
  Category Coverage: 71.7% (33 categories)
  Overall Score: 34.7% (EXCELLENT)
  Response: 1.312s
  Total Recs: 400
  Success: 100%

üèÜ FINAL EVALUATION SUMMARY
Content-Based: 34.1% (EXCELLENT)
Collaborative Filtering: 24.8% (GOOD)
Hybrid System: 34.7% (EXCELLENT)

Evaluation complete! ‚úÖ

HYBRID SYSTEM PERFORMANCE ANALYSIS

Hybrid System:
  Brand Coverage: 18.8% (184 brands)
  Category Coverage: 71.7% (33 categories)
  Overall Score: 34.7% (EXCELLENT)
  Response: 1.312s
  Total Recs: 400
  Success: 100%

üèÜ FINAL EVALUATION SUMMARY
Content-Based: 34.1% (EXCELLENT)
Collaborative Filtering: 24.8% (GOOD)
Hybrid System: 34.7% (EXCELLENT)

Evaluation complete! ‚úÖ


# 💾 5) Model Saving & Production Setup

In [18]:
# Select the best model by coverage score
print("üîç Selecting best model...")

# A result dict without a coverage score counts as 0
scores = {
    label: outcome.get('coverage_score', 0)
    for label, outcome in (
        ('Content-Based', content_results),
        ('Collaborative Filtering', cf_results),
        ('Hybrid System', hybrid_results),
    )
}

print("\nüìä MODEL SCORES:")
for model_name, score in scores.items():
    print(f"  {model_name}: {score:.1f}/100")

print(
    "\nüî¨ SCORE CALCULATION:",
    "  Coverage Score = (Brand Coverage √ó 70%) + (Category Coverage √ó 30%)",
    "  Brand Coverage = (Unique Brands Found / Total Brands) √ó 100",
    "  Category Coverage = (Unique Categories Found / Total Categories) √ó 100",
    sep="\n",
)

# Highest score wins; on a tie the model listed first is kept
best_model_name, best_score = max(scores.items(), key=lambda entry: entry[1])

# Resolve the winner's name to the corresponding model object
selected_model = (
    content_recommender if best_model_name == "Content-Based"
    else cf_recommender if best_model_name == "Collaborative Filtering"
    else hybrid_recommender
)

print(f"\nüèÜ WINNER: {best_model_name}")
print(f"üìä Best Score: {best_score:.1f}/100")
print("‚úÖ Best model selected!")

üîç Selecting best model...

üìä MODEL SCORES:
  Content-Based: 34.1/100
  Collaborative Filtering: 24.8/100
  Hybrid System: 34.7/100

üî¨ SCORE CALCULATION:
  Coverage Score = (Brand Coverage √ó 70%) + (Category Coverage √ó 30%)
  Brand Coverage = (Unique Brands Found / Total Brands) √ó 100
  Category Coverage = (Unique Categories Found / Total Categories) √ó 100

üèÜ WINNER: Hybrid System
üìä Best Score: 34.7/100
‚úÖ Best model selected!


In [None]:
# üíæ SAVE BEST MODEL (Minimal & Fast)
import pickle
import os
import json
from datetime import datetime
import shutil

print("üíæ Saving best model...")

# The new export is written to a staging directory first. The previous export
# is removed only once every file has been written, so a failure while
# pickling does not leave the project without any saved model.
root_dir = "saved_models_production"
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
model_dir_name = f"best_teddy_model_{timestamp}"
save_dir = f"{root_dir}/{model_dir_name}"
staging_root = f"{root_dir}.staging_{timestamp}"
staging_dir = os.path.join(staging_root, model_dir_name)

# Collect the attributes to persist, depending on which model won.
# This runs before any directory is touched so a missing attribute fails early.
if best_model_name == "Content-Based":
    model_data = {
        'model_type': 'content_based',
        'products_df': selected_model.products_df,
        'product_id_to_idx': selected_model.product_id_to_idx,
        'brand_counts': selected_model.brand_counts,
        'interaction_matrix': selected_model.interaction_matrix
    }
elif best_model_name == "Collaborative Filtering":
    model_data = {
        'model_type': 'collaborative_filtering',
        'user_to_idx': selected_model.user_to_idx,
        'product_to_idx': selected_model.product_to_idx,
        'unique_products': selected_model.unique_products,
        'filtered_interaction_matrix': selected_model.filtered_interaction_matrix,
        'product_metadata': selected_model.product_metadata,
        'brand_aware_popularity': getattr(selected_model, 'brand_aware_popularity', None)
    }
else:  # Hybrid
    model_data = {
        'model_type': 'hybrid',
        'content_weight': selected_model.content_weight,
        'cf_weight': selected_model.cf_weight,
        # Attributes of both sub-models are stored alongside the weights
        'products_df': selected_model.content_recommender.products_df,
        'product_id_to_idx': selected_model.content_recommender.product_id_to_idx,
        'brand_counts': selected_model.content_recommender.brand_counts,
        'interaction_matrix': selected_model.content_recommender.interaction_matrix,
        'user_to_idx': selected_model.cf_recommender.user_to_idx,
        'product_to_idx': selected_model.cf_recommender.product_to_idx,
        'unique_products': selected_model.cf_recommender.unique_products,
        'filtered_interaction_matrix': selected_model.cf_recommender.filtered_interaction_matrix,
        'product_metadata': selected_model.cf_recommender.product_metadata,
        'brand_aware_popularity': getattr(selected_model.cf_recommender, 'brand_aware_popularity', None)
    }

os.makedirs(staging_dir, exist_ok=True)
try:
    with open(os.path.join(staging_dir, "best_model.pkl"), "wb") as f:
        pickle.dump(model_data, f)

    with open(os.path.join(staging_dir, "preprocessors.pkl"), "wb") as f:
        pickle.dump({'tfidf_vectorizer': tfidf_vectorizer}, f)

    with open(os.path.join(staging_dir, "metadata.json"), "w", encoding="utf-8") as f:
        json.dump({'best_model': best_model_name, 'timestamp': timestamp}, f)
except Exception:
    # Discard the partial export; the previous one is still intact
    shutil.rmtree(staging_root, ignore_errors=True)
    raise

# Everything was written: replace the old export with the new one
if os.path.exists(root_dir):
    shutil.rmtree(root_dir)
os.rename(staging_root, root_dir)

print(f"‚úÖ Saved: {save_dir}")
print(f"üèÜ Model: {best_model_name}")
print("üöÄ Ready for production!")

üíæ Saving best model...
‚úÖ Saved: saved_models_production/best_teddy_model_20251109_164429
üèÜ Model: Hybrid System
üöÄ Ready for production!
‚úÖ Saved: saved_models_production/best_teddy_model_20251109_164429
üèÜ Model: Hybrid System
üöÄ Ready for production!


: 

# 📊 ACTUAL RESULTS FROM THIS NOTEBOOK RUN

Let's document the real performance results we achieved:

In [None]:
# Recap the metrics produced by the evaluation cells of this notebook run
def _print_result_section(heading, results_name):
    """Print one model's coverage metrics, or a fallback line when they are unavailable."""
    print(heading)
    # Looked up by name so the cell still runs when an evaluation cell was skipped
    metrics = globals().get(results_name)
    if not metrics:
        print("  Results not available - run evaluation cells above")
    elif 'error' in metrics:
        print(f"  Error: {metrics.get('error', 'Unknown error')}")
    else:
        print(f"  Brand Coverage: {metrics['brand_coverage']:.1f}%")
        print(f"  Category Coverage: {metrics['category_coverage']:.1f}%")
        print(f"  Coverage Score: {metrics['coverage_score']:.1f}%")
        print(f"  Response Time: {metrics['response_time']:.3f}s")


print("üìä ACTUAL RESULTS FROM THIS NOTEBOOK RUN:")
print("=" * 50)

_print_result_section("\nüéØ CONTENT-BASED RESULTS:", 'content_results')
_print_result_section("\nü§ù COLLABORATIVE FILTERING RESULTS:", 'cf_results')
_print_result_section("\nüéØ HYBRID SYSTEM RESULTS:", 'hybrid_results')

print("\nüèÜ BEST MODEL SELECTED:")
if all(name in globals() for name in ('best_model_name', 'best_score')):
    print(f"  Winner: {best_model_name}")
    print(f"  Score: {best_score:.1f}%")
else:
    print("  Model selection not completed - run selection cell above")

print("\n‚ö†Ô∏è NOTE: These are the ACTUAL results from this notebook run.")
print("Any results in README.md should match these numbers!")