In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# B2B Marketplace Data Analysis - Interactive Notebook\n",
    "\n",
    "This notebook provides an interactive environment for exploring and analyzing B2B marketplace data.\n",
    "\n",
    "## Setup and Imports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import required libraries\n",
    "import sys\n",
    "import os\n",
    "sys.path.append('../')\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "import plotly.express as px\n",
    "import plotly.graph_objects as go\n",
    "from plotly.subplots import make_subplots\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "\n",
    "# Import project modules\n",
    "from src.analysis.eda import B2BDataAnalyzer\n",
    "from src.analysis.visualizations import B2BDataVisualizer\n",
    "from src.data_processing.cleaner import DataCleaner\n",
    "\n",
    "# Configure plotting\n",
    "plt.style.use('seaborn-v0_8')\n",
    "sns.set_palette(\"husl\")\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load Data\n",
    "\n",
    "Load your cleaned dataset for analysis:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the latest cleaned data.\n",
    "# Replace `data_path` with your actual file path.\n",
    "data_path = '../data/processed/cleaned_b2b_data_latest.csv'\n",
    "\n",
    "try:\n",
    "    df = pd.read_csv(data_path)\n",
    "except FileNotFoundError:\n",
    "    print(\"❌ Data file not found. Please run the scraping pipeline first.\")\n",
    "    print(\"Run: python main.py --categories electronics textiles\")\n",
    "    # Later cells read `df` unconditionally; re-raise so the run stops here with\n",
    "    # the real cause instead of failing in the next cell with a NameError.\n",
    "    raise\n",
    "else:\n",
    "    # Only reached when the CSV was read successfully.\n",
    "    print(\"✅ Data loaded successfully!\")\n",
    "    print(f\"📊 Dataset shape: {df.shape}\")\n",
    "    print(f\"📋 Columns: {list(df.columns)}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Quick Data Overview"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Basic data information: shape, deep memory footprint and per-column summary\n",
    "print(\"Dataset Info:\")\n",
    "print(f\"Shape: {df.shape}\")\n",
    "print(f\"Memory usage: {df.memory_usage(deep=True).sum() / 1024 / 1024:.2f} MB\")\n",
    "print(\"\\nColumn Info:\")\n",
    "# DataFrame.info() writes its report to stdout itself and returns None, so\n",
    "# wrapping it in print() would append a stray \"None\" line to the output.\n",
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Render the leading rows of the dataset as a rich table\n",
    "head_preview = df.head()\n",
    "display(head_preview)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Missing data analysis: null count per column and its share of all rows\n",
    "missing_data = df.isna().sum()\n",
    "missing_pct = missing_data.div(len(df)).mul(100)\n",
    "\n",
    "# One row per column, worst offenders first\n",
    "missing_summary = pd.concat(\n",
    "    [\n",
    "        missing_data.rename('Missing_Count'),\n",
    "        missing_pct.round(2).rename('Missing_Percentage'),\n",
    "    ],\n",
    "    axis=1,\n",
    ").sort_values('Missing_Count', ascending=False)\n",
    "\n",
    "print(\"Missing Data Summary:\")\n",
    "# Show only the columns that actually contain nulls\n",
    "has_missing = missing_summary['Missing_Count'].gt(0)\n",
    "display(missing_summary.loc[has_missing])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Comprehensive Analysis\n",
    "\n",
    "Run the full EDA pipeline:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Wrap the loaded frame in the project analyzer and run its full report\n",
    "analyzer = B2BDataAnalyzer(df=df)\n",
    "insights = analyzer.generate_comprehensive_report()\n",
    "\n",
    "# Report completion and how many entries the report returned\n",
    "print(\n",
    "    \"✅ Analysis completed!\",\n",
    "    f\"📊 Generated insights for {len(insights)} categories\",\n",
    "    sep=\"\\n\",\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Interactive Visualizations\n",
    "\n",
    "### Category Distribution"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Category analysis\n",
    "if 'category' in df.columns:\n",
    "    category_counts = df['category'].value_counts()\n",
    "    \n",
    "    # Interactive pie chart\n",
    "    fig = px.pie(\n",
    "        values=category_counts.values,\n",
    "        names=category_counts.index,\n",
    "        title=\"Product Category Distribution\",\n",
    "        hole=0.4\n",
    "    )\n",
    "    fig.update_traces(textposition='inside', textinfo='percent+label')\n",
    "    fig.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Text Analysis and Word Clouds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Text analysis of product titles: summary statistics plus a word-frequency chart\n",
    "if 'title' in df.columns:\n",
    "    from collections import Counter\n",
    "    import re\n",
    "\n",
    "    titles = df['title'].dropna().astype(str)\n",
    "\n",
    "    # Basic text statistics: characters and whitespace-separated words per title\n",
    "    title_lengths = titles.str.len()\n",
    "    title_word_counts = titles.str.split().str.len()\n",
    "    avg_title_length = title_lengths.mean()\n",
    "    avg_word_count = title_word_counts.mean()\n",
    "\n",
    "    print(f\"Text Analysis - Product Titles:\")\n",
    "    print(f\"  • Total Titles: {len(titles):,}\")\n",
    "    print(f\"  • Average Length: {avg_title_length:.1f} characters\")\n",
    "    print(f\"  • Average Words: {avg_word_count:.1f} words\")\n",
    "\n",
    "    # Word frequency analysis: lower-cased alphabetic tokens of 3+ letters,\n",
    "    # collected in title order\n",
    "    word_pattern = re.compile(r'\\b[a-zA-Z]{3,}\\b')\n",
    "    all_words = [\n",
    "        word\n",
    "        for title in titles\n",
    "        for word in word_pattern.findall(title.lower())\n",
    "    ]\n",
    "\n",
    "    # Remove common stop words\n",
    "    stop_words = {'the', 'and', 'for', 'with', 'from', 'are', 'you', 'all', 'can', 'had', 'her', 'was', 'one', 'our', 'out', 'day', 'get', 'has', 'him', 'his', 'how', 'may', 'new', 'now', 'old', 'see', 'two', 'way', 'who', 'boy', 'did', 'its', 'let', 'put', 'say', 'she', 'too', 'use'}\n",
    "\n",
    "    filtered_words = list(filter(lambda word: word not in stop_words, all_words))\n",
    "    word_freq = Counter(filtered_words)\n",
    "\n",
    "    # Top words chart (20 most frequent, horizontal bars)\n",
    "    top_words = dict(word_freq.most_common(20))\n",
    "    bar_lengths = list(top_words.values())\n",
    "    bar_labels = list(top_words)\n",
    "\n",
    "    fig = px.bar(\n",
    "        x=bar_lengths,\n",
    "        y=bar_labels,\n",
    "        orientation='h',\n",
    "        title=\"Top 20 Most Common Words in Product Titles\",\n",
    "        labels={'x': 'Frequency', 'y': 'Words'}\n",
    "    )\n",
    "    fig.update_layout(height=600, yaxis={'categoryorder': 'total ascending'})\n",
    "    fig.show()\n",
    "\n",
    "    print(f\"\\n  • Most Common Words: {', '.join([word for word, _ in word_freq.most_common(10)])}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Data Quality Assessment"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Data quality dashboard: a 2x2 figure (missing % per column, completeness gauge,\n",
    "# price checks, title checks) followed by a printed summary.\n",
    "quality_metrics = {}\n",
    "\n",
    "# Missing data analysis: null count per column and its share of all rows\n",
    "missing_data = df.isnull().sum()\n",
    "missing_pct = (missing_data / len(df)) * 100\n",
    "\n",
    "# Create quality visualization; row 1 / col 2 is declared as an 'indicator'\n",
    "# cell because it receives the go.Indicator gauge below.\n",
    "fig = make_subplots(\n",
    "    rows=2, cols=2,\n",
    "    subplot_titles=[\n",
    "        'Missing Data by Column (%)',\n",
    "        'Data Completeness Score',\n",
    "        'Price Data Quality',\n",
    "        'Text Data Quality'\n",
    "    ],\n",
    "    specs=[\n",
    "        [{'type': 'bar'}, {'type': 'indicator'}],\n",
    "        [{'type': 'bar'}, {'type': 'bar'}]\n",
    "    ]\n",
    ")\n",
    "\n",
    "# Missing data percentages\n",
    "fig.add_trace(\n",
    "    go.Bar(\n",
    "        x=missing_pct.index,\n",
    "        y=missing_pct.values,\n",
    "        name='Missing %',\n",
    "        marker_color='red'\n",
    "    ),\n",
    "    row=1, col=1\n",
    ")\n",
    "\n",
    "# Overall completeness score: share of non-null cells across the whole frame\n",
    "completeness_score = (1 - (missing_data.sum() / (len(df) * len(df.columns)))) * 100\n",
    "\n",
    "fig.add_trace(\n",
    "    go.Indicator(\n",
    "        mode=\"gauge+number\",\n",
    "        value=completeness_score,\n",
    "        domain={'x': [0, 1], 'y': [0, 1]},\n",
    "        title={'text': \"Data Completeness\"},\n",
    "        gauge={\n",
    "            'axis': {'range': [None, 100]},\n",
    "            'bar': {'color': \"darkblue\"},\n",
    "            'steps': [\n",
    "                {'range': [0, 50], 'color': \"lightgray\"},\n",
    "                {'range': [50, 80], 'color': \"yellow\"},\n",
    "                {'range': [80, 100], 'color': \"lightgreen\"}\n",
    "            ],\n",
    "            'threshold': {\n",
    "                'line': {'color': \"red\", 'width': 4},\n",
    "                'thickness': 0.75,\n",
    "                'value': 90\n",
    "            }\n",
    "        }\n",
    "    ),\n",
    "    row=1, col=2\n",
    ")\n",
    "\n",
    "# Price quality metrics (counts over the non-null prices)\n",
    "if 'numeric_price' in df.columns:\n",
    "    price_data = df['numeric_price'].dropna()\n",
    "    price_metrics = {\n",
    "        'Valid Prices': len(price_data),\n",
    "        'Zero Prices': (price_data == 0).sum(),\n",
    "        'Negative Prices': (price_data < 0).sum(),\n",
    "    }\n",
    "    \n",
    "    fig.add_trace(\n",
    "        go.Bar(\n",
    "            x=list(price_metrics.keys()),\n",
    "            y=list(price_metrics.values()),\n",
    "            name='Price Quality',\n",
    "            marker_color=['green', 'orange', 'red']\n",
    "        ),\n",
    "        row=2, col=1\n",
    "    )\n",
    "\n",
    "# Text quality metrics\n",
    "if 'title' in df.columns:\n",
    "    # astype(str) alone turns NaN into the literal string 'nan', which would be\n",
    "    # counted as a non-empty title; blank out missing values first.\n",
    "    titles = df['title'].fillna('').astype(str)\n",
    "    text_metrics = {\n",
    "        'Non-empty Titles': (titles != '').sum(),\n",
    "        'Short Titles (<10 chars)': (titles.str.len() < 10).sum(),\n",
    "        'Long Titles (>100 chars)': (titles.str.len() > 100).sum()\n",
    "    }\n",
    "    \n",
    "    fig.add_trace(\n",
    "        go.Bar(\n",
    "            x=list(text_metrics.keys()),\n",
    "            y=list(text_metrics.values()),\n",
    "            name='Text Quality',\n",
    "            marker_color=['green', 'orange', 'blue']\n",
    "        ),\n",
    "        row=2, col=2\n",
    "    )\n",
    "\n",
    "fig.update_layout(\n",
    "    height=800,\n",
    "    title_text=\"Data Quality Assessment Dashboard\",\n",
    "    showlegend=False\n",
    ")\n",
    "\n",
    "# Tilt the category labels on the three bar panels\n",
    "fig.update_xaxes(tickangle=45, row=1, col=1)\n",
    "fig.update_xaxes(tickangle=45, row=2, col=1)\n",
    "fig.update_xaxes(tickangle=45, row=2, col=2)\n",
    "\n",
    "fig.show()\n",
    "\n",
    "print(f\"\\nData Quality Summary:\")\n",
    "print(f\"  • Overall Completeness: {completeness_score:.1f}%\")\n",
    "print(f\"  • Columns with Missing Data: {(missing_data > 0).sum()} / {len(df.columns)}\")\n",
    "if 'numeric_price' in df.columns:\n",
    "    price_coverage = (df['numeric_price'].notna().sum() / len(df)) * 100\n",
    "    print(f\"  • Price Data Coverage: {price_coverage:.1f}%\")\n",
    "if 'title' in df.columns:\n",
    "    # Share of rows whose title is longer than 5 characters\n",
    "    title_quality = (df['title'].astype(str).str.len() > 5).sum() / len(df) * 100\n",
    "    print(f\"  • Title Quality Score: {title_quality:.1f}%\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Custom Analysis\n",
    "\n",
    "Add your own custom analysis here:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Custom analysis playground\n",
    "# Example: print the three highest-priced products in each category\n",
    "\n",
    "if 'category' in df.columns and 'numeric_price' in df.columns:\n",
    "    print(\"Most Expensive Products by Category:\")\n",
    "    print(\"=\" * 50)\n",
    "    \n",
    "    # unique() also yields NaN for rows without a category; NaN is a float with\n",
    "    # no .upper(), so drop missing categories before iterating.\n",
    "    for category in df['category'].dropna().unique():\n",
    "        category_df = df[df['category'] == category]\n",
    "        most_expensive = category_df.nlargest(3, 'numeric_price')[['title', 'supplier_name', 'numeric_price', 'location']]\n",
    "        \n",
    "        # str() keeps non-string category labels (e.g. numeric codes) printable\n",
    "        print(f\"\\n{str(category).upper()}:\")\n",
    "        for _, row in most_expensive.iterrows():\n",
    "            price = f\"₹{row['numeric_price']:,.2f}\" if pd.notna(row['numeric_price']) else \"Price N/A\"\n",
    "            # A missing title is NaN (a float) and cannot be sliced\n",
    "            title_text = str(row['title']) if pd.notna(row['title']) else \"Title N/A\"\n",
    "            print(f\"  • {title_text[:60]}... - {price}\")\n",
    "            print(f\"    Supplier: {row['supplier_name']} | Location: {row['location']}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Price comparison across categories: summary table plus a log-scale box plot\n",
    "if {'category', 'numeric_price'}.issubset(df.columns):\n",
    "    summary_stats = ['count', 'mean', 'median', 'std', 'min', 'max']\n",
    "    prices_by_category = df.groupby('category')['numeric_price']\n",
    "    price_comparison = prices_by_category.agg(summary_stats).round(2)\n",
    "\n",
    "    print(\"\\nPrice Statistics by Category:\")\n",
    "    print(\"=\" * 60)\n",
    "    display(price_comparison)\n",
    "\n",
    "    # Visualization: only rows that actually carry a price\n",
    "    priced_rows = df.dropna(subset=['numeric_price'])\n",
    "    fig = px.box(\n",
    "        priced_rows,\n",
    "        x='category',\n",
    "        y='numeric_price',\n",
    "        title=\"Price Distribution by Category (Box Plot)\",\n",
    "        log_y=True,\n",
    "    )\n",
    "    fig.update_layout(height=500)\n",
    "    fig.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Export Results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Export key findings (dataset size, category distribution, price statistics)\n",
    "# to a JSON summary file.\n",
    "import json\n",
    "import os\n",
    "from datetime import datetime\n",
    "\n",
    "summary_findings = {\n",
    "    'analysis_date': datetime.now().isoformat(),\n",
    "    'dataset_summary': {\n",
    "        'total_records': len(df),\n",
    "        'columns': len(df.columns),\n",
    "        'memory_usage_mb': df.memory_usage(deep=True).sum() / 1024 / 1024\n",
    "    }\n",
    "}\n",
    "\n",
    "# Add category insights\n",
    "if 'category' in df.columns:\n",
    "    category_counts = df['category'].value_counts()\n",
    "    summary_findings['category_insights'] = {\n",
    "        'total_categories': len(category_counts),\n",
    "        'distribution': category_counts.to_dict()\n",
    "    }\n",
    "\n",
    "# Add price insights (skipped when no row has a price)\n",
    "if 'numeric_price' in df.columns:\n",
    "    price_data = df['numeric_price'].dropna()\n",
    "    if len(price_data) > 0:\n",
    "        summary_findings['price_insights'] = {\n",
    "            'products_with_price': len(price_data),\n",
    "            'avg_price': float(price_data.mean()),\n",
    "            'median_price': float(price_data.median()),\n",
    "            'price_range': [float(price_data.min()), float(price_data.max())]\n",
    "        }\n",
    "\n",
    "# Save summary\n",
    "output_file = '../data/outputs/notebook_analysis_summary.json'\n",
    "# open(..., 'w') does not create missing parent directories, so make sure the\n",
    "# output folder exists before writing.\n",
    "os.makedirs(os.path.dirname(output_file), exist_ok=True)\n",
    "# default=str stringifies any value the json module cannot serialize itself\n",
    "with open(output_file, 'w', encoding='utf-8') as f:\n",
    "    json.dump(summary_findings, f, indent=2, default=str)\n",
    "\n",
    "print(f\"✅ Analysis summary exported to: {output_file}\")\n",
    "print(\"\\n🎉 Interactive analysis completed!\")\n",
    "print(\"\\nNext steps:\")\n",
    "print(\"1. Review the generated visualizations\")\n",
    "print(\"2. Export findings for presentation\")\n",
    "print(\"3. Use insights for business decisions\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}\n",
    "    \n",
    "    print(\"\\nCategory Summary:\")\n",
    "    for cat, count in category_counts.items():\n",
    "        pct = (count / len(df)) * 100\n",
    "        print(f\"  • {cat}: {count:,} products ({pct:.1f}%)\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Price Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Price distribution analysis: a 2x2 figure (log histogram, box plot per\n",
    "# category, summary statistics, upper-outlier scatter) followed by printed stats.\n",
    "if 'numeric_price' in df.columns:\n",
    "    price_data = df['numeric_price'].dropna()\n",
    "    \n",
    "    if len(price_data) > 0:\n",
    "        # Create subplots\n",
    "        fig = make_subplots(\n",
    "            rows=2, cols=2,\n",
    "            subplot_titles=[\n",
    "                'Price Distribution (Log Scale)',\n",
    "                'Price by Category',\n",
    "                'Price Statistics',\n",
    "                'Price Outliers Detection'\n",
    "            ],\n",
    "            specs=[\n",
    "                [{'type': 'histogram'}, {'type': 'box'}],\n",
    "                [{'type': 'bar'}, {'type': 'scatter'}]\n",
    "            ]\n",
    "        )\n",
    "        \n",
    "        # Log-scale histogram; the +1 keeps zero prices finite under log10\n",
    "        fig.add_trace(\n",
    "            go.Histogram(\n",
    "                x=np.log10(price_data + 1),\n",
    "                nbinsx=50,\n",
    "                name='Price Distribution (Log)',\n",
    "                marker_color='lightblue'\n",
    "            ),\n",
    "            row=1, col=1\n",
    "        )\n",
    "        \n",
    "        # Box plot by category (one trace per category that has prices)\n",
    "        if 'category' in df.columns:\n",
    "            for category in df['category'].unique():\n",
    "                cat_prices = df[df['category'] == category]['numeric_price'].dropna()\n",
    "                if len(cat_prices) > 0:\n",
    "                    fig.add_trace(\n",
    "                        go.Box(\n",
    "                            y=cat_prices,\n",
    "                            name=category,\n",
    "                            boxpoints='outliers'\n",
    "                        ),\n",
    "                        row=1, col=2\n",
    "                    )\n",
    "        \n",
    "        # Price statistics bar chart\n",
    "        stats = {\n",
    "            'Mean': price_data.mean(),\n",
    "            'Median': price_data.median(),\n",
    "            'Q1': price_data.quantile(0.25),\n",
    "            'Q3': price_data.quantile(0.75)\n",
    "        }\n",
    "        \n",
    "        fig.add_trace(\n",
    "            go.Bar(\n",
    "                x=list(stats.keys()),\n",
    "                y=list(stats.values()),\n",
    "                name='Price Statistics',\n",
    "                marker_color='orange'\n",
    "            ),\n",
    "            row=2, col=1\n",
    "        )\n",
    "        \n",
    "        # Outlier detection scatter plot: prices above Q3 + 1.5 * IQR are outliers\n",
    "        q1 = price_data.quantile(0.25)\n",
    "        q3 = price_data.quantile(0.75)\n",
    "        iqr = q3 - q1\n",
    "        outlier_threshold = q3 + 1.5 * iqr\n",
    "        \n",
    "        outliers = df[df['numeric_price'] > outlier_threshold]['numeric_price']\n",
    "        normal_prices = df[df['numeric_price'] <= outlier_threshold]['numeric_price'].dropna()\n",
    "        \n",
    "        # graph_objects validates x/y as list, tuple, numpy array or pandas\n",
    "        # object; a bare range is rejected, so materialize the positions.\n",
    "        normal_positions = list(range(len(normal_prices)))\n",
    "        fig.add_trace(\n",
    "            go.Scatter(\n",
    "                x=normal_positions,\n",
    "                y=normal_prices,\n",
    "                mode='markers',\n",
    "                name='Normal Prices',\n",
    "                marker_color='green'\n",
    "            ),\n",
    "            row=2, col=2\n",
    "        )\n",
    "        \n",
    "        if len(outliers) > 0:\n",
    "            # Outliers are plotted to the right of the normal prices\n",
    "            outlier_positions = list(range(len(normal_prices), len(normal_prices) + len(outliers)))\n",
    "            fig.add_trace(\n",
    "                go.Scatter(\n",
    "                    x=outlier_positions,\n",
    "                    y=outliers,\n",
    "                    mode='markers',\n",
    "                    name='Outliers',\n",
    "                    marker_color='red',\n",
    "                    marker_size=8\n",
    "                ),\n",
    "                row=2, col=2\n",
    "            )\n",
    "        \n",
    "        fig.update_layout(\n",
    "            height=800,\n",
    "            title_text=\"Comprehensive Price Analysis\",\n",
    "            showlegend=True\n",
    "        )\n",
    "        \n",
    "        fig.show()\n",
    "        \n",
    "        print(f\"\\nPrice Statistics:\")\n",
    "        print(f\"  • Count: {len(price_data):,}\")\n",
    "        print(f\"  • Mean: ₹{price_data.mean():,.2f}\")\n",
    "        print(f\"  • Median: ₹{price_data.median():,.2f}\")\n",
    "        print(f\"  • Std Dev: ₹{price_data.std():,.2f}\")\n",
    "        print(f\"  • Min: ₹{price_data.min():,.2f}\")\n",
    "        print(f\"  • Max: ₹{price_data.max():,.2f}\")\n",
    "        print(f\"  • Outliers: {len(outliers):,} ({len(outliers)/len(price_data)*100:.1f}%)\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Supplier Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Supplier analysis: top-20 chart plus single- vs multi-product breakdown\n",
    "if 'supplier_name' in df.columns:\n",
    "    supplier_counts = df['supplier_name'].value_counts()\n",
    "\n",
    "    # Top suppliers chart (value_counts is already sorted by count, descending)\n",
    "    top_suppliers = supplier_counts.head(20)\n",
    "    chart_options = dict(\n",
    "        orientation='h',\n",
    "        title=\"Top 20 Suppliers by Product Count\",\n",
    "        labels={'x': 'Number of Products', 'y': 'Supplier Name'},\n",
    "    )\n",
    "    fig = px.bar(x=top_suppliers.values, y=top_suppliers.index, **chart_options)\n",
    "    fig.update_layout(height=600, yaxis={'categoryorder': 'total ascending'})\n",
    "    fig.show()\n",
    "\n",
    "    # Supplier distribution: suppliers with exactly one listing vs. more than one\n",
    "    total_suppliers = len(supplier_counts)\n",
    "    single_product_suppliers = supplier_counts.eq(1).sum()\n",
    "    multi_product_suppliers = supplier_counts.gt(1).sum()\n",
    "\n",
    "    print(f\"\\nSupplier Analysis:\")\n",
    "    print(f\"  • Total Suppliers: {total_suppliers:,}\")\n",
    "    print(f\"  • Single Product Suppliers: {single_product_suppliers:,} ({single_product_suppliers/total_suppliers*100:.1f}%)\")\n",
    "    print(f\"  • Multi Product Suppliers: {multi_product_suppliers:,} ({multi_product_suppliers/total_suppliers*100:.1f}%)\")\n",
    "    print(f\"  • Avg Products per Supplier: {supplier_counts.mean():.1f}\")\n",
    "    print(f\"  • Max Products by Single Supplier: {supplier_counts.max():,}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Geographic Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Geographic distribution\n",
    "if 'extracted_state' in df.columns:\n",
    "    state_data = df['extracted_state'].dropna().value_counts()\n",
    "    \n",
    "    if len(state_data) > 0:\n",
    "        # Interactive choropleth-style chart\n",
    "        fig = px.bar(\n",
    "            x=state_data.values,\n",
    "            y=state_data.index,\n",
    "            orientation='h',\n",
    "            title=\"Geographic Distribution of Suppliers by State\",\n",
    "            labels={'x': 'Number of Suppliers', 'y': 'State'},\n",
    "            color=state_data.values,\n",
    "            color_continuous_scale='Viridis'\n",
    "        )\n",
    "        fig.update_layout(\n",
    "            height=600,\n",
    "            yaxis={'categoryorder': 'total ascending'}\n",
    "        )\n",
    "        fig.show()\n",
    "        \n",
    "        print(f\"\\nGeographic Distribution:\")\n",
    "        print(f\"  • States Covered: {len(state_data)}\")\n",
    "        print(f\"  • Top 5 States:\")\n",
    "        for i, (state, count) in enumerate(state_data.head().items()):\n",
    "            pct = (count / state_data.sum()) * 100\n",
    "            print(f\"    {i