In [None]:
import bigframes.pandas as bpd
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import networkx as nx
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score
import warnings
warnings.filterwarnings('ignore')


class SubstituteQualityValidator:
    """
    Validates and scores the quality of semantic substitute recommendations.
    This complements our BigQuery semantic analysis with quality metrics.

    Every candidate is scored against the original product on four
    dimensions (semantic similarity, price, brand, category). The dimensions
    are combined into a weighted overall score and a quality grade.
    """

    # Output columns of the validation query, in order. Used to shape the
    # empty result that is returned when there is nothing to validate.
    _RESULT_COLUMNS = (
        'substitute_id', 'substitute_name', 'substitute_category',
        'substitute_brand', 'substitute_price',
        'original_name', 'original_category', 'original_brand',
        'original_price',
        'semantic_similarity', 'price_appropriateness',
        'brand_compatibility', 'category_logic_score',
        'overall_quality_score', 'quality_grade',
    )

    def __init__(self, project_id, dataset_id):
        self.project_id = project_id
        self.dataset_id = dataset_id

    def validate_substitute_quality(self, product_id, substitute_candidates):
        """
        Validate substitute quality using multiple quality dimensions.

        Args:
            product_id (int): Original product ID
            substitute_candidates (list): List of substitute product IDs

        Returns:
            pandas.DataFrame: One row per candidate found in the table, with
            the per-dimension scores, ``overall_quality_score`` and
            ``quality_grade``, best score first. Empty (but with the usual
            columns) when ``substitute_candidates`` is empty.

        Raises:
            ValueError, TypeError: If an ID cannot be converted to ``int``.
        """
        # IDs are spliced into the SQL text, so force them to integers first;
        # anything that is not a number is rejected before a query is built.
        product_id = int(product_id)
        candidate_ids = [int(candidate) for candidate in substitute_candidates]

        # `IN ()` is not valid SQL, so an empty candidate list short-circuits.
        if not candidate_ids:
            return pd.DataFrame(columns=list(self._RESULT_COLUMNS))

        candidates_str = ','.join(map(str, candidate_ids))
        table = f"`{self.project_id}.{self.dataset_id}.product_embeddings_clean`"

        query = f"""
        WITH original_product AS (
          SELECT id, name, category, brand, retail_price, embedding
          FROM {table}
          WHERE id = {product_id}
        ),
        vector_results AS (
          -- The distance is computed directly for every requested candidate,
          -- so no candidate is dropped for falling outside a top-k cut-off.
          SELECT
            c.id AS substitute_id,
            COSINE_DISTANCE(o.embedding, c.embedding) AS distance
          FROM {table} c
          CROSS JOIN original_product o
          WHERE c.id IN ({candidates_str}) -- restrict to candidate IDs
        ),
        substitute_validation AS (
          SELECT
            s.id AS substitute_id,
            s.name AS substitute_name,
            s.category AS substitute_category,
            s.brand AS substitute_brand,
            s.retail_price AS substitute_price,
            o.name AS original_name,
            o.category AS original_category,
            o.brand AS original_brand,
            o.retail_price AS original_price,

            -- Quality Dimension 1: Semantic Similarity
            ROUND(1 - vr.distance, 3) AS semantic_similarity,

            -- Quality Dimension 2: Price Appropriateness
            -- SAFE_DIVIDE yields NULL for a zero original price, which falls
            -- through to the ELSE branch instead of failing the query.
            CASE
              WHEN SAFE_DIVIDE(ABS(s.retail_price - o.retail_price), o.retail_price) <= 0.2 THEN 1.0
              WHEN SAFE_DIVIDE(ABS(s.retail_price - o.retail_price), o.retail_price) <= 0.4 THEN 0.8
              WHEN SAFE_DIVIDE(ABS(s.retail_price - o.retail_price), o.retail_price) <= 0.6 THEN 0.6
              ELSE 0.3
            END AS price_appropriateness,

            -- Quality Dimension 3: Brand Compatibility
            CASE
              WHEN s.brand = o.brand THEN 1.0
              WHEN s.retail_price BETWEEN o.retail_price * 0.8 AND o.retail_price * 1.2 THEN 0.8
              ELSE 0.6
            END AS brand_compatibility,

            -- Quality Dimension 4: Category Logic
            CASE
              WHEN s.category = o.category THEN 1.0
              WHEN (1 - vr.distance) > 0.7 THEN 0.9
              WHEN (1 - vr.distance) > 0.5 THEN 0.7
              ELSE 0.4
            END AS category_logic_score

          FROM {table} s
          JOIN vector_results vr ON s.id = vr.substitute_id
          CROSS JOIN original_product o
        ),
        scored AS (
          -- Weighted combination, computed once so that the score and the
          -- grade can never drift apart.
          SELECT
            *,
            semantic_similarity * 0.4 +
            price_appropriateness * 0.25 +
            brand_compatibility * 0.2 +
            category_logic_score * 0.15 AS weighted_score
          FROM substitute_validation
        )
        SELECT
          * EXCEPT (weighted_score),
          -- Overall Quality Score (weighted combination)
          ROUND(weighted_score, 3) AS overall_quality_score,

          -- Quality Grade
          CASE
            WHEN weighted_score >= 0.8 THEN 'Excellent'
            WHEN weighted_score >= 0.7 THEN 'Good'
            WHEN weighted_score >= 0.6 THEN 'Fair'
            ELSE 'Poor'
          END AS quality_grade
        FROM scored
        ORDER BY overall_quality_score DESC;
        """

        return bpd.read_gbq(query).to_pandas()


class SubstitutePerformanceTracker:
    """
    Tracks real-time performance of substitute recommendations.
    This adds performance monitoring to our semantic recommendations.

    The "performance" figures are simulated from embedding similarity and
    price proximity of product pairs; no interaction data is queried.
    """

    def __init__(self, project_id, dataset_id):
        self.project_id = project_id
        self.dataset_id = dataset_id

    def analyze_substitute_effectiveness(self, time_window_days=30):
        """
        Analyze how effective different types of substitutes are.

        Args:
            time_window_days (int): Time window for analysis.
                NOTE: kept for interface compatibility; the query below has
                no time filter, so this value currently has no effect.

        Returns:
            dict: ``'performance_data'`` (pandas.DataFrame with up to 25
            category-pair rows) and ``'metrics'`` (summary dict with
            ``total_substitute_types``, ``avg_effectiveness``,
            ``best_performing_type`` and ``cross_category_performance``).
        """
        table = f"`{self.project_id}.{self.dataset_id}.product_embeddings_clean`"

        # Simulate substitute performance analysis
        query = f"""
        WITH product_pair_calculations AS (
            SELECT
                p1.category as original_category,
                p2.category as substitute_category,
                (1 - COSINE_DISTANCE(p1.embedding, p2.embedding)) as semantic_similarity_raw,
                p1.retail_price as original_price_raw,
                p2.retail_price as substitute_price_raw
            FROM {table} p1
            CROSS JOIN {table} p2
            WHERE p1.id != p2.id
              AND (1 - COSINE_DISTANCE(p1.embedding, p2.embedding)) > 0.5
        ),
        substitute_performance AS (
            SELECT
                original_category,
                substitute_category,
                -- Performance indicators
                CASE
                    WHEN original_category = substitute_category THEN 'Direct Substitute'
                    WHEN semantic_similarity_raw > 0.7 THEN 'Strong Cross-Category'
                    ELSE 'Weak Cross-Category'
                END as substitute_type,

                -- Simulated effectiveness score
                -- SAFE_DIVIDE keeps a pair of zero-priced products from
                -- failing the whole query; such a pair scores NULL and is
                -- ignored by the AVG below.
                ROUND(
                    semantic_similarity_raw * 0.6 +
                    (1 - SAFE_DIVIDE(
                        ABS(original_price_raw - substitute_price_raw),
                        GREATEST(original_price_raw, substitute_price_raw)
                    )) * 0.4, 3
                ) as effectiveness_score,
                semantic_similarity_raw,
                original_price_raw,
                substitute_price_raw
            FROM product_pair_calculations
        )
        SELECT
            original_category,
            substitute_category,
            substitute_type,
            COUNT(*) as substitute_pairs,
            ROUND(AVG(semantic_similarity_raw), 3) as avg_similarity,
            ROUND(AVG(original_price_raw), 2) as avg_original_price,
            ROUND(AVG(substitute_price_raw), 2) as avg_substitute_price,
            ROUND(AVG(effectiveness_score), 3) as effectiveness_score
        FROM substitute_performance
        GROUP BY original_category, substitute_category, substitute_type
        HAVING COUNT(*) >= 5
        ORDER BY effectiveness_score DESC
        LIMIT 25
        """

        performance_df = bpd.read_gbq(query).to_pandas()

        # Calculate performance metrics
        if len(performance_df) > 0:
            # Rank locally rather than trusting that the SQL ORDER BY survived
            # the BigFrames -> pandas round-trip; mergesort is stable, so an
            # already-sorted frame yields the same first row as before.
            ranked = performance_df.sort_values(
                'effectiveness_score', ascending=False, kind='mergesort'
            )
            best_type = ranked.iloc[0]['substitute_type']
            is_cross_category = performance_df['substitute_type'].str.contains('Cross-Category')
            cross_category_score = performance_df.loc[
                is_cross_category, 'effectiveness_score'
            ].mean()
        else:
            best_type = None
            cross_category_score = 0

        metrics = {
            'total_substitute_types': len(performance_df),
            'avg_effectiveness': performance_df['effectiveness_score'].mean(),
            'best_performing_type': best_type,
            'cross_category_performance': cross_category_score
        }

        return {
            'performance_data': performance_df,
            'metrics': metrics
        }


class AdvancedSubstituteClustering:
    """
    Advanced clustering analysis specifically for substitute relationships.
    This adds clustering insights to our semantic analysis.
    """

    def __init__(self, project_id, dataset_id):
        self.project_id = project_id
        self.dataset_id = dataset_id

    @staticmethod
    def _category_clause(category):
        """Build the optional WHERE clause restricting products to *category*.

        Returns an empty string when no category is given. Backslashes and
        single quotes are escaped so a value such as ``Men's`` cannot
        terminate the SQL string literal early.
        """
        if not category:
            return ""
        escaped = str(category).replace("\\", "\\\\").replace("'", "\\'")
        return f"WHERE category = '{escaped}'"

    def discover_substitute_clusters(self, category=None, min_similarity=0.6):
        """
        Discover clusters of products that serve as substitutes for each other.

        A random sample of up to 500 products is clustered with DBSCAN on
        cosine distance between embeddings.

        Args:
            category (str, optional): Focus on specific category
            min_similarity (float): Minimum similarity for clustering.
                NOTE: currently not applied; the DBSCAN radius is fixed at
                eps=0.3. Kept so existing callers continue to work.

        Returns:
            dict: ``'clustered_data'`` (sampled products with a
            ``substitute_cluster`` label, -1 meaning noise),
            ``'cluster_analysis'`` (per-cluster summaries, most cohesive
            first), ``'total_clusters'`` and ``'noise_products'``.
        """
        category_filter = self._category_clause(category)

        # Get product data for clustering
        query = f"""
        SELECT
            id, name, category, brand, retail_price, embedding
        FROM `{self.project_id}.{self.dataset_id}.product_embeddings_clean`
        {category_filter}
        ORDER BY RAND()
        LIMIT 500
        """

        # A 0..n-1 index keeps row labels aligned with rows of the embedding
        # matrix, which _calculate_cluster_cohesion relies on.
        df = bpd.read_gbq(query).to_pandas().reset_index(drop=True)

        # Nothing to cluster (e.g. unknown category): DBSCAN would raise on an
        # empty matrix, so return an empty but well-formed result instead.
        if df.empty:
            df['substitute_cluster'] = np.array([], dtype=int)
            return {
                'clustered_data': df,
                'cluster_analysis': [],
                'total_clusters': 0,
                'noise_products': 0
            }

        # Extract embeddings
        embeddings = np.array([np.array(emb) for emb in df['embedding'].values])

        # Use DBSCAN for substitute clustering
        dbscan = DBSCAN(eps=0.3, min_samples=3, metric='cosine')
        cluster_labels = dbscan.fit_predict(embeddings)

        df['substitute_cluster'] = cluster_labels

        # Analyze substitute clusters
        cluster_analysis = []
        unique_clusters = set(cluster_labels)
        unique_clusters.discard(-1)  # Remove noise cluster

        for cluster_id in sorted(unique_clusters):
            cluster_products = df[df['substitute_cluster'] == cluster_id]

            if len(cluster_products) < 3:  # Only meaningful clusters
                continue

            prices = cluster_products['retail_price']
            cluster_analysis.append({
                'cluster_id': cluster_id,
                'product_count': len(cluster_products),
                'categories': cluster_products['category'].value_counts().to_dict(),
                'brands': cluster_products['brand'].value_counts().to_dict(),
                'price_range': {
                    'min': prices.min(),
                    'max': prices.max(),
                    'avg': prices.mean()
                },
                'sample_products': cluster_products[['name', 'category', 'brand', 'retail_price']].head(5).to_dict('records'),
                'substitute_strength': self._calculate_cluster_cohesion(cluster_products, embeddings)
            })

        return {
            'clustered_data': df,
            'cluster_analysis': sorted(cluster_analysis, key=lambda x: x['substitute_strength'], reverse=True),
            'total_clusters': len(unique_clusters),
            'noise_products': int(np.count_nonzero(cluster_labels == -1))
        }

    def _calculate_cluster_cohesion(self, cluster_products, all_embeddings):
        """Calculate internal cohesion of a substitute cluster.

        Args:
            cluster_products (pandas.DataFrame): Rows of one cluster; its
                index values are used as row positions in ``all_embeddings``.
            all_embeddings (numpy.ndarray): 2-D matrix, one embedding per row.

        Returns:
            Mean pairwise cosine similarity of the cluster's embeddings, or 0
            when the cluster has fewer than two members.
        """
        cluster_indices = cluster_products.index.tolist()
        cluster_embeddings = np.asarray(all_embeddings)[cluster_indices]

        member_count = len(cluster_embeddings)
        if member_count < 2:
            return 0

        # Normalise rows once; the Gram matrix of unit vectors then holds every
        # pairwise cosine similarity, replacing the Python double loop.
        norms = np.linalg.norm(cluster_embeddings, axis=1, keepdims=True)
        unit_vectors = cluster_embeddings / norms
        similarity_matrix = unit_vectors @ unit_vectors.T

        # Strict upper triangle = each unordered pair exactly once.
        upper = np.triu_indices(member_count, k=1)
        return np.mean(similarity_matrix[upper])


class InteractiveSubstituteExplorer:
    """
    Interactive exploration tools for substitute relationships.
    This adds interactive analysis to our semantic recommendations.
    """

    def __init__(self, project_id, dataset_id):
        self.project_id = project_id
        self.dataset_id = dataset_id

    @staticmethod
    def _shorten(name, limit=20):
        """Truncate *name* to *limit* characters plus '...' for axis labels."""
        return name[:limit] + '...' if len(name) > limit else name

    def create_substitute_decision_tree_viz(self, product_id):
        """
        Create interactive decision tree visualization for substitute selection.

        The 20 most similar products (similarity > 0.5) are bucketed by
        category / price / brand agreement with the original product and
        drawn as a sunburst chart.

        Args:
            product_id (int): Product ID to analyze

        Returns:
            plotly.graph_objects.Figure: Interactive decision tree

        Raises:
            ValueError, TypeError: If ``product_id`` cannot be converted to
                ``int``.
        """
        # The ID is spliced into the SQL text; force it to an integer first.
        product_id = int(product_id)
        table = f"`{self.project_id}.{self.dataset_id}.product_embeddings_clean`"

        # Get substitute candidates
        query = f"""
        WITH original_product AS (
            SELECT id, name, category, brand, retail_price, embedding
            FROM {table}
            WHERE id = {product_id}
        ),
        substitute_tree AS (
            SELECT
                s.id,
                s.name,
                s.category,
                s.brand,
                s.retail_price,
                (1 - COSINE_DISTANCE(o.embedding, s.embedding)) as similarity,

                -- Decision tree branches
                CASE
                    WHEN s.category = o.category THEN 'Same Category'
                    ELSE 'Different Category'
                END as category_branch,

                -- SAFE_DIVIDE: a zero original price yields NULL and lands in
                -- 'Different Price' instead of failing the query.
                CASE
                    WHEN SAFE_DIVIDE(ABS(s.retail_price - o.retail_price), o.retail_price) <= 0.2 THEN 'Similar Price'
                    ELSE 'Different Price' -- Simplified for now
                END as price_branch,

                CASE
                    WHEN s.brand = o.brand THEN 'Same Brand'
                    ELSE 'Different Brand'
                END as brand_branch

            FROM {table} s
            CROSS JOIN original_product o
            WHERE s.id != {product_id}
              AND (1 - COSINE_DISTANCE(o.embedding, s.embedding)) > 0.5
            ORDER BY similarity DESC
            LIMIT 20
        )
        SELECT * FROM substitute_tree
        """

        df = bpd.read_gbq(query).to_pandas()

        # Create sunburst chart as decision tree
        fig = px.sunburst(
            df,
            path=['category_branch', 'price_branch', 'brand_branch'],
            values='similarity',
            title=f'Substitute Decision Tree for Product {product_id}',
            hover_data=['name', 'similarity']
        )

        fig.update_layout(width=700, height=700)

        return fig

    def create_substitute_comparison_matrix(self, product_ids):
        """
        Create comparison matrix for multiple products' substitute relationships.

        Args:
            product_ids (list): List of product IDs to compare

        Returns:
            plotly.graph_objects.Figure: Comparison matrix heatmap

        Raises:
            ValueError: If ``product_ids`` is empty or contains a value that
                cannot be converted to ``int``.
        """
        ids = [int(pid) for pid in product_ids]
        if not ids:
            # `IN ()` is not valid SQL; fail early with a clear message.
            raise ValueError("product_ids must contain at least one product ID")

        products_str = ','.join(map(str, ids))
        table = f"`{self.project_id}.{self.dataset_id}.product_embeddings_clean`"

        query = f"""
        WITH product_pairs AS (
            SELECT
                p1.id as product1_id,
                p1.name as product1_name,
                p2.id as product2_id,
                p2.name as product2_name,
                (1 - COSINE_DISTANCE(p1.embedding, p2.embedding)) as similarity
            FROM {table} p1
            CROSS JOIN {table} p2
            WHERE p1.id IN ({products_str})
              AND p2.id IN ({products_str})
        )
        SELECT * FROM product_pairs
        ORDER BY product1_id, product2_id
        """

        df = bpd.read_gbq(query).to_pandas()

        # Pivot on IDs rather than names: two products sharing a name would
        # otherwise make pivot() fail with duplicate index entries.
        pivot_table = df.pivot(
            index='product1_id',
            columns='product2_id',
            values='similarity'
        )

        # Order rows/columns by product name so the layout matches what a
        # name-based pivot produces when all names are distinct.
        name_by_id = df.drop_duplicates('product1_id').set_index('product1_id')['product1_name']
        ordered_ids = name_by_id.sort_values(kind='mergesort').index
        pivot_table = pivot_table.reindex(index=ordered_ids, columns=ordered_ids)
        labels = [self._shorten(name_by_id.loc[pid]) for pid in ordered_ids]

        # Create heatmap
        fig = px.imshow(
            pivot_table.values,
            x=labels,
            y=labels,
            color_continuous_scale='RdYlBu_r',
            title='Product Substitute Comparison Matrix',
            labels={'color': 'Similarity Score'}
        )

        fig.update_layout(width=800, height=600)

        return fig


class SubstituteABTestingFramework:
    """
    A/B testing framework for substitute recommendations.
    This adds testing capabilities to validate our semantic recommendations.
    """

    def __init__(self, project_id, dataset_id):
        self.project_id = project_id
        self.dataset_id = dataset_id

    def design_substitute_ab_test(self, original_product_id, substitute_candidates, test_metrics):
        """
        Design A/B test for substitute recommendations.

        Args:
            original_product_id (int): Original product ID
            substitute_candidates (list): List of substitute product IDs to test
            test_metrics (list): Metrics to track ['similarity', 'price_match', 'category_match'].
                Recorded in the design under ``'tracked_metrics'``; it does
                not change which baseline figures are computed.

        Returns:
            dict: A/B test design and baseline metrics, with keys
            ``'test_design'``, ``'variants_data'`` (pandas.DataFrame),
            ``'recommended_test_duration'`` and
            ``'sample_size_recommendation'``. With no candidates the design
            has an empty ``'test_groups'`` list and no query is run.

        Raises:
            ValueError, TypeError: If an ID cannot be converted to ``int``.
        """
        # IDs are spliced into the SQL text; force them to integers first.
        original_product_id = int(original_product_id)
        candidate_ids = [int(candidate) for candidate in substitute_candidates]

        if candidate_ids:
            variants_df = self._load_variants(original_product_id, candidate_ids)
        else:
            # `IN ()` is not valid SQL; with nothing to test, skip the query.
            variants_df = pd.DataFrame()

        # Design test groups
        test_design = {
            'original_product': {
                'id': original_product_id,
                'name': variants_df.iloc[0]['original_name'] if len(variants_df) > 0 else 'Unknown'
            },
            'control_group': {
                'variant_id': 'original',
                'description': 'No substitute recommendation'
            },
            'tracked_metrics': list(test_metrics) if test_metrics else [],
            'test_groups': []
        }

        for position, variant in enumerate(variants_df.itertuples(index=False), start=1):
            test_design['test_groups'].append({
                'group_name': f'Variant_{position}',
                'variant_id': variant.variant_id,
                'variant_name': variant.variant_name,
                'expected_performance': {
                    'similarity_score': variant.similarity_score,
                    'price_match_score': 1 - variant.price_difference_ratio,
                    'category_match': variant.category_match,
                    'overall_quality': variant.variant_quality_score
                }
            })

        return {
            'test_design': test_design,
            'variants_data': variants_df,
            'recommended_test_duration': '14 days',
            'sample_size_recommendation': 'Minimum 100 interactions per variant'
        }

    def _load_variants(self, original_product_id, candidate_ids):
        """Query baseline metrics for each candidate, best quality score first."""
        candidates_str = ','.join(map(str, candidate_ids))
        table = f"`{self.project_id}.{self.dataset_id}.product_embeddings_clean`"

        query = f"""
        WITH original_product AS (
            SELECT id, name, category, brand, retail_price, embedding
            FROM {table}
            WHERE id = {original_product_id}
        ),
        test_variants AS (
            SELECT
                s.id as variant_id,
                s.name as variant_name,
                s.category as variant_category,
                s.brand as variant_brand,
                s.retail_price as variant_price,
                o.name as original_name,
                o.category as original_category,
                o.retail_price as original_price,

                -- Test Metrics
                -- SAFE_DIVIDE: a zero original price gives NULL ratios rather
                -- than a division-by-zero failure.
                (1 - COSINE_DISTANCE(o.embedding, s.embedding)) as similarity_score,
                SAFE_DIVIDE(ABS(s.retail_price - o.retail_price), o.retail_price) as price_difference_ratio,
                CASE WHEN s.category = o.category THEN 1 ELSE 0 END as category_match,

                -- Variant Quality Score
                ROUND(
                    (1 - COSINE_DISTANCE(o.embedding, s.embedding)) * 0.5 +
                    (1 - SAFE_DIVIDE(ABS(s.retail_price - o.retail_price), o.retail_price)) * 0.3 +
                    CASE WHEN s.category = o.category THEN 0.2 ELSE 0 END, 3
                ) as variant_quality_score

            FROM {table} s
            CROSS JOIN original_product o
            WHERE s.id IN ({candidates_str})
        )
        SELECT
            *,
            ROW_NUMBER() OVER (ORDER BY variant_quality_score DESC) as test_variant_rank
        FROM test_variants
        ORDER BY variant_quality_score DESC
        """

        return bpd.read_gbq(query).to_pandas()


def run_complementary_analysis(project_id, dataset_id, sample_product_id=21018,
                               sample_substitutes=None, comparison_product_ids=None,
                               cluster_category="Jeans"):
    """
    Run all complementary analyses that enhance our existing notebook.

    Runs, in order: quality validation, performance tracking, clustering,
    interactive visualizations and A/B test design, printing a short
    progress report for each step.

    Args:
        project_id (str): Google Cloud Project ID
        dataset_id (str): BigQuery dataset ID
        sample_product_id (int): Sample product for demonstrations
        sample_substitutes (list, optional): Substitute product IDs used for
            quality validation and the A/B test. Defaults to the notebook's
            sample IDs.
        comparison_product_ids (list, optional): Product IDs for the
            comparison matrix. Defaults to the sample product plus the
            notebook's sample IDs.
        cluster_category (str, optional): Category passed to the clustering
            step. Defaults to "Jeans".

    Returns:
        dict: Results keyed by 'quality_validation', 'performance_analysis',
        'clustering_analysis', 'ab_test_framework' and, when the figures
        could be built, 'visualizations'.
    """
    # Resolve defaults here (not in the signature) to avoid shared mutable
    # default arguments.
    if sample_substitutes is None:
        sample_substitutes = [21284, 20850, 20921, 21255, 20585]  # Sample substitute IDs
    if comparison_product_ids is None:
        comparison_product_ids = [sample_product_id, 21019, 21020, 21021]

    print("COMPLEMENTARY SEMANTIC SUBSTITUTE ANALYSIS")
    print("=" * 55)
    print("Adding unique features to enhance your BigQuery semantic recommendations...")

    # Initialize all complementary analyzers
    quality_validator = SubstituteQualityValidator(project_id, dataset_id)
    performance_tracker = SubstitutePerformanceTracker(project_id, dataset_id)
    clustering_analyzer = AdvancedSubstituteClustering(project_id, dataset_id)
    interactive_explorer = InteractiveSubstituteExplorer(project_id, dataset_id)
    ab_testing = SubstituteABTestingFramework(project_id, dataset_id)

    results = {}

    # 1. Substitute Quality Validation
    print(f"\n Validating substitute quality for product {sample_product_id}...")
    quality_results = quality_validator.validate_substitute_quality(sample_product_id, sample_substitutes)
    results['quality_validation'] = quality_results

    print(f"Quality validation completed for {len(quality_results)} substitutes")
    if len(quality_results) > 0:
        print(f"Best substitute quality score: {quality_results['overall_quality_score'].max():.3f}")
        # The value shown is the mode of the grades, so label it as such.
        print(f"Most common quality grade: {quality_results['quality_grade'].mode().iloc[0]}")

    # 2. Performance Tracking
    print("\n Analyzing substitute performance patterns...")
    performance_results = performance_tracker.analyze_substitute_effectiveness()
    results['performance_analysis'] = performance_results

    metrics = performance_results['metrics']
    print(f"Analyzed {metrics['total_substitute_types']} substitute types")
    print(f"Average effectiveness score: {metrics['avg_effectiveness']:.3f}")
    print(f"Best performing type: {metrics['best_performing_type']}")

    # 3. Advanced Clustering
    print("\n Discovering substitute clusters...")
    clustering_results = clustering_analyzer.discover_substitute_clusters(category=cluster_category)
    results['clustering_analysis'] = clustering_results

    print(f"Discovered {clustering_results['total_clusters']} substitute clusters")
    print(f"Products in noise cluster: {clustering_results['noise_products']}")

    if clustering_results['cluster_analysis']:
        best_cluster = clustering_results['cluster_analysis'][0]
        print(f"Strongest cluster has {best_cluster['product_count']} products with strength {best_cluster['substitute_strength']:.3f}")

    # 4. Interactive Visualizations
    print("\n Creating interactive substitute visualizations...")
    # Deliberately best-effort: a rendering problem is reported and the
    # remaining analyses still run.
    try:
        decision_tree_viz = interactive_explorer.create_substitute_decision_tree_viz(sample_product_id)
        comparison_matrix = interactive_explorer.create_substitute_comparison_matrix(comparison_product_ids)

        results['visualizations'] = {
            'decision_tree': decision_tree_viz,
            'comparison_matrix': comparison_matrix
        }

        print("Interactive visualizations created successfully")
        decision_tree_viz.show()
        comparison_matrix.show()

    except Exception as e:
        print(f"Visualization creation encountered an issue: {e}")

    # 5. A/B Testing Framework
    print("\n Designing A/B test for substitute recommendations...")
    ab_test_design = ab_testing.design_substitute_ab_test(
        sample_product_id,
        sample_substitutes,
        ['similarity', 'price_match', 'category_match']
    )
    results['ab_test_framework'] = ab_test_design

    print(f"A/B test designed with {len(ab_test_design['test_design']['test_groups'])} variants")
    print(f"Recommended test duration: {ab_test_design['recommended_test_duration']}")

    print("\nCOMPLEMENTARY ANALYSIS COMPLETE!")
    print("=" * 55)
    print("These features enhance your BigQuery semantic substitute recommender with:")
    print("• Quality validation and scoring")
    print("• Performance tracking and monitoring")
    print("• Advanced clustering insights")
    print("• Interactive exploration tools")
    print("• A/B testing framework")

    return results


# Example usage functions for Jupyter notebook integration
def quick_quality_check(project_id, dataset_id, product_id, substitute_ids):
    """Score substitute candidates for one product in a single call.

    Builds a SubstituteQualityValidator for the given project and dataset and
    returns the result of its validate_substitute_quality method.
    """
    return SubstituteQualityValidator(project_id, dataset_id).validate_substitute_quality(
        product_id, substitute_ids
    )

def analyze_substitute_clusters(project_id, dataset_id, category=None):
    """Run substitute clustering in a single call.

    Builds an AdvancedSubstituteClustering for the given project and dataset
    and returns the result of discover_substitute_clusters, optionally
    restricted to one category.
    """
    analyzer = AdvancedSubstituteClustering(project_id, dataset_id)
    cluster_report = analyzer.discover_substitute_clusters(category)
    return cluster_report

def track_substitute_performance(project_id, dataset_id):
    """Run the substitute effectiveness analysis in a single call.

    Builds a SubstitutePerformanceTracker for the given project and dataset
    and returns the result of analyze_substitute_effectiveness with its
    default arguments.
    """
    return SubstitutePerformanceTracker(project_id, dataset_id).analyze_substitute_effectiveness()

In [None]:
# Run all complementary features
# 🔧 Project Configuration - CHANGE THESE TO YOUR VALUES
# NOTE(review): the `# @param` markers below look like notebook form
# annotations; keep them on the same line as the assignment — confirm.
PROJECT_ID = "bigquery-hackathon-471715"  # @param {type:"string"} # Your Google Cloud Project ID
DATASET_ID = "thelook_ecommerce"          # @param {type:"string"} # The BigQuery dataset containing the product data
LOCATION = "US"                           # @param {type:"string"} # The location of your BigQuery dataset (e.g., "US", "EU")

# Configure BigFrames to use the specified project and location
# (must happen before the analysis call below issues its first read_gbq query)
bpd.options.bigquery.project = PROJECT_ID
bpd.options.bigquery.location = LOCATION
bpd.options.display.max_columns = None # Allow displaying all columns in BigFrames DataFrames

# Echo the active configuration so the cell output records what was used
print("Environment configured successfully!")
print(f"Project: {PROJECT_ID}")
print(f"Dataset: {DATASET_ID}")
print(f"Location: {LOCATION}")
# Run every analysis for the sample product; `results` is the dict returned by
# run_complementary_analysis, keyed by analysis name.
results = run_complementary_analysis(PROJECT_ID, DATASET_ID, sample_product_id=21018)

Environment configured successfully!
Project: bigquery-hackathon-471715
Dataset: thelook_ecommerce
Location: US
COMPLEMENTARY SEMANTIC SUBSTITUTE ANALYSIS
Adding unique features to enhance your BigQuery semantic recommendations...

 Validating substitute quality for product 21018...


Quality validation completed for 5 substitutes
Best substitute quality score: 0.996
Average quality grade: Excellent

 Analyzing substitute performance patterns...


Analyzed 25 substitute types
Average effectiveness score: 0.763
Best performing type: Strong Cross-Category

 Discovering substitute clusters...


Discovered 1 substitute clusters
Products in noise cluster: 0
Strongest cluster has 500 products with strength 0.689

 Creating interactive substitute visualizations...


Interactive visualizations created successfully



 Designing A/B test for substitute recommendations...


A/B test designed with 5 variants
Recommended test duration: 14 days

COMPLEMENTARY ANALYSIS COMPLETE!
These features enhance your BigQuery semantic substitute recommender with:
• Quality validation and scoring
• Performance tracking and monitoring
• Advanced clustering insights
• Interactive exploration tools
• A/B testing framework
