In [9]:
import polars as pl
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np
import os
from datetime import datetime

def load_and_explore_data():
    """Safely load and explore the master data"""
    print("Loading Master Sales Data for Regional Demographics Analysis...")
    
    try:
        # Load the master dataset
        master_data = pl.read_parquet("data/master_transactions.parquet")
        print(f"Master data loaded successfully!")
        print(f"Shape: {master_data.shape[0]:,} rows × {master_data.shape[1]} columns")
        
        # Display column info with data types
        print(f"\nAvailable Columns with Data Types:")
        for i, (col, dtype) in enumerate(zip(master_data.columns, master_data.dtypes), 1):
            print(f"   {i:2d}. {col} ({dtype})")
        
        # Check key columns for regional analysis
        key_columns = ['Customer ID', 'City', 'Country', 'Gender', 'Date Of Birth', 'Line_Total_USD']
        print(f"\nChecking Key Columns for Regional Analysis:")
        for col in key_columns:
            if col in master_data.columns:
                dtype = master_data[col].dtype
                print(f"   {col} - Available ({dtype})")
            else:
                print(f"   {col} - Missing")
        
        # Show sample data for key columns
        if all(col in master_data.columns for col in ['City', 'Country', 'Customer ID']):
            print(f"\nSample Geographic Data:")
            sample_geo = master_data.select(['Customer ID', 'City', 'Country', 'Gender', 'Date Of Birth']).head(5)
            print(sample_geo)
        
        return master_data
        
    except Exception as e:
        print(f"Error loading data: {str(e)}")
        return None

def process_regional_data(master_data):
    """Process the data for regional demographics analysis"""
    print("Processing Data for Regional Demographics Analysis...")
    
    try:
        # Get current year for age calculation
        current_year = datetime.now().year
        
        # Check the data type of Date Of Birth column
        dob_dtype = master_data['Date Of Birth'].dtype
        print(f"   Date Of Birth column type: {dob_dtype}")
        
        # Process the data step by step
        print("   Processing dates and calculating ages...")
        
        # Handle Date Of Birth based on its actual data type
        if str(dob_dtype) == 'Date':
            # If it's already a Date type, extract year directly
            processed_data = master_data.with_columns([
                pl.when(pl.col("Date Of Birth").is_not_null())
                .then(pl.lit(current_year) - pl.col("Date Of Birth").dt.year())
                .otherwise(None)
                .alias("Age")
            ])
        elif 'Utf8' in str(dob_dtype) or 'String' in str(dob_dtype):
            # If it's a string type, parse it first
            processed_data = master_data.with_columns([
                pl.when(pl.col("Date Of Birth").is_not_null())
                .then(
                    pl.lit(current_year) - 
                    pl.col("Date Of Birth").str.strptime(pl.Date, "%Y-%m-%d", strict=False).dt.year()
                )
                .otherwise(None)
                .alias("Age")
            ])
        else:
            # For other types, try to convert to string first then parse
            processed_data = master_data.with_columns([
                pl.when(pl.col("Date Of Birth").is_not_null())
                .then(
                    pl.lit(current_year) - 
                    pl.col("Date Of Birth").cast(pl.Utf8).str.strptime(pl.Date, "%Y-%m-%d", strict=False).dt.year()
                )
                .otherwise(None)
                .alias("Age")
            ])
        
        print(f"   Age calculation completed!")
        
        # Add age groups
        print("   Creating age groups...")
        processed_data = processed_data.with_columns([
            pl.when(pl.col("Age").is_null()).then(pl.lit("Unknown"))
            .when(pl.col("Age") < 18).then(pl.lit("Under 18"))
            .when(pl.col("Age") < 25).then(pl.lit("18-24"))
            .when(pl.col("Age") < 35).then(pl.lit("25-34"))
            .when(pl.col("Age") < 45).then(pl.lit("35-44"))
            .when(pl.col("Age") < 55).then(pl.lit("45-54"))
            .when(pl.col("Age") < 65).then(pl.lit("55-64"))
            .otherwise(pl.lit("65+"))
            .alias("Age_Group")
        ])
        
        # Add spending categories based on Line_Total_USD
        print("   Creating spending categories...")
        processed_data = processed_data.with_columns([
            pl.when(pl.col("Line_Total_USD").is_null()).then(pl.lit("Unknown"))
            .when(pl.col("Line_Total_USD") < 50).then(pl.lit("Low Spender"))
            .when(pl.col("Line_Total_USD") < 150).then(pl.lit("Medium Spender"))
            .when(pl.col("Line_Total_USD") < 300).then(pl.lit("High Spender"))
            .otherwise(pl.lit("Premium Spender"))
            .alias("Spending_Category")
        ])
        
        # Add purchase categories based on Invoice_Total_USD
        print("   Creating purchase categories...")
        processed_data = processed_data.with_columns([
            pl.when(pl.col("Invoice_Total_USD").is_null()).then(pl.lit("Unknown"))
            .when(pl.col("Invoice_Total_USD") < 100).then(pl.lit("Small Purchase"))
            .when(pl.col("Invoice_Total_USD") < 500).then(pl.lit("Medium Purchase"))
            .when(pl.col("Invoice_Total_USD") < 1000).then(pl.lit("Large Purchase"))
            .otherwise(pl.lit("Premium Purchase"))
            .alias("Purchase_Category")
        ])
        
        print(f"Data processing completed!")
        print(f"Processed {processed_data.shape[0]:,} records")
        
        # Show processing summary
        print(f"\nProcessing Summary:")
        print(f"   Countries: {processed_data['Country'].n_unique()}")
        print(f"   Cities: {processed_data['City'].n_unique()}")
        print(f"   Unique Customers: {processed_data['Customer ID'].n_unique():,}")
        print(f"   Total Transactions: {processed_data.shape[0]:,}")
        
        # Show age distribution
        age_stats = processed_data.group_by("Age_Group").agg(pl.count().alias("Count")).sort("Count", descending=True)
        print(f"\nAge Group Distribution:")
        for row in age_stats.iter_rows():
            print(f"   {row[0]}: {row[1]:,} customers")
        
        return processed_data
        
    except Exception as e:
        print(f"Error processing data: {str(e)}")
        print(f"   Debug info: Date Of Birth dtype = {master_data['Date Of Birth'].dtype}")
        return None

def analyze_regional_demographics(data):
    """Analyze regional demographics"""
    print("Analyzing Regional Demographics...")
    
    try:
        # City-level analysis
        print("   Analyzing city-level demographics...")
        city_stats = data.group_by(["City", "Country"]).agg([
            pl.col("Customer ID").n_unique().alias("Unique_Customers"),
            pl.count().alias("Total_Transactions"),
            pl.col("Age").mean().alias("Avg_Age"),
            pl.col("Line_Total_USD").sum().alias("Total_Revenue_USD"),
            pl.col("Line_Total_USD").mean().alias("Avg_Transaction_Value_USD"),
            pl.col("Invoice_Total_USD").mean().alias("Avg_Invoice_Value_USD"),
            pl.col("Quantity").sum().alias("Total_Items_Sold")
        ]).with_columns([
            (pl.col("Total_Transactions") / pl.col("Unique_Customers")).alias("Transactions_Per_Customer"),
            (pl.col("Total_Revenue_USD") / pl.col("Unique_Customers")).alias("Revenue_Per_Customer")
        ]).sort("Total_Revenue_USD", descending=True)
        
        # Country-level analysis
        print("   Analyzing country-level demographics...")
        country_stats = data.group_by("Country").agg([
            pl.col("Customer ID").n_unique().alias("Unique_Customers"),
            pl.count().alias("Total_Transactions"),
            pl.col("City").n_unique().alias("Unique_Cities"),
            pl.col("Age").mean().alias("Avg_Age"),
            pl.col("Line_Total_USD").sum().alias("Total_Revenue_USD"),
            pl.col("Line_Total_USD").mean().alias("Avg_Transaction_Value_USD"),
            pl.col("Store ID").n_unique().alias("Unique_Stores")
        ]).sort("Total_Revenue_USD", descending=True)
        
        print(f"Regional analysis completed!")
        print(f"   {city_stats.shape[0]} cities analyzed")
        print(f"   {country_stats.shape[0]} countries analyzed")
        
        # Show top 5 cities
        print(f"\nTop 5 Cities by Revenue:")
        top_cities = city_stats.head(5)
        for row in top_cities.iter_rows():
            print(f"   {row[0]}, {row[1]}: ${row[4]:,.2f} USD ({row[2]:,} customers)")
        
        return city_stats, country_stats
        
    except Exception as e:
        print(f"Error in regional analysis: {str(e)}")
        return None, None

def save_analysis_data(data, city_stats, country_stats):
    """Save analysis data to CSV files"""
    print("Saving Analysis Data...")
    
    try:
        # Create output directory
        os.makedirs("regional_demographics", exist_ok=True)
        
        # Save processed data (sample)
        print("   Saving processed data sample...")
        data_sample = data.select([
            "Customer ID", "City", "Country", "Gender", "Age", "Age_Group",
            "Line_Total_USD", "Spending_Category", "Invoice_Total_USD", "Purchase_Category"
        ]).head(10000)  # Save first 10k records as sample
        data_sample.write_csv("regional_demographics/processed_data_sample.csv")
        
        # Save city statistics
        print("   Saving city statistics...")
        city_stats.write_csv("regional_demographics/city_statistics.csv")
        
        # Save country statistics
        print("   Saving country statistics...")
        country_stats.write_csv("regional_demographics/country_statistics.csv")
        
        # Create summary statistics
        print("   Creating summary statistics...")
        summary_stats = {
            'analysis_date': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            'total_countries': country_stats.shape[0],
            'total_cities': city_stats.shape[0],
            'total_customers': data['Customer ID'].n_unique(),
            'total_transactions': data.shape[0],
            'total_revenue_usd': float(data['Line_Total_USD'].sum())
        }
        
        # Save summary as text file
        with open("regional_demographics/summary_statistics.txt", "w") as f:
            f.write("REGIONAL DEMOGRAPHICS ANALYSIS SUMMARY\n")
            f.write("=" * 50 + "\n\n")
            for key, value in summary_stats.items():
                f.write(f"{key}: {value}\n")
        
        print("All analysis data saved successfully!")
        return summary_stats
        
    except Exception as e:
        print(f"Error saving data: {str(e)}")
        return None

def create_regional_visualizations(data, city_stats, country_stats):
    """Create regional demographic visualizations"""
    print("Creating Regional Demographics Visualizations...")
    
    figures = []
    
    try:
        # 1. Top Cities by Revenue
        print("   Creating top cities by revenue chart...")
        top_20_cities = city_stats.head(20).to_pandas()
        top_20_cities['City_Country'] = top_20_cities['City'] + ', ' + top_20_cities['Country']
        
        fig_cities_revenue = px.bar(
            top_20_cities,
            x='Total_Revenue_USD',
            y='City_Country',
            orientation='h',
            title='Top 20 Cities by Total Revenue (USD)',
            color='Avg_Transaction_Value_USD',
            color_continuous_scale='Viridis',
            text='Total_Revenue_USD'
        )
        fig_cities_revenue.update_traces(texttemplate='$%{text:,.0f}', textposition='outside')
        fig_cities_revenue.update_layout(
            height=700,
            title_font_size=18,
            yaxis={'categoryorder': 'total ascending'},
            xaxis_title="Total Revenue (USD)"
        )
        figures.append(fig_cities_revenue)
        
        # 2. Country Distribution
        print("   Creating country distribution chart...")
        country_data = country_stats.to_pandas()
        
        fig_countries = px.pie(
            country_data,
            values='Unique_Customers',
            names='Country',
            title='Customer Distribution by Country',
            color_discrete_sequence=px.colors.qualitative.Set3
        )
        fig_countries.update_traces(textposition='inside', textinfo='percent+label')
        fig_countries.update_layout(height=500, title_font_size=18)
        figures.append(fig_countries)
        
        # 3. Customer Performance Scatter
        print("   Creating customer performance scatter plot...")
        fig_performance = px.scatter(
            top_20_cities,
            x='Unique_Customers',
            y='Total_Revenue_USD',
            size='Avg_Invoice_Value_USD',
            color='Revenue_Per_Customer',
            hover_name='City_Country',
            title='Customer Performance Metrics by City',
            labels={
                'Unique_Customers': 'Number of Unique Customers',
                'Total_Revenue_USD': 'Total Revenue (USD)',
                'Revenue_Per_Customer': 'Revenue per Customer (USD)'
            },
            color_continuous_scale='Viridis'
        )
        fig_performance.update_layout(height=600, title_font_size=18)
        figures.append(fig_performance)
        
        # 4. Age Distribution by Top Cities
        print("   Creating age distribution chart...")
        top_10_cities_list = city_stats.head(10).select(["City", "Country"]).to_pandas()
        
        age_city_data = []
        for _, row in top_10_cities_list.iterrows():
            city_data = data.filter(
                (pl.col("City") == row['City']) & (pl.col("Country") == row['Country'])
            )
            age_dist = city_data.group_by("Age_Group").agg(pl.count().alias("Count")).to_pandas()
            total_count = age_dist['Count'].sum()
            for _, age_row in age_dist.iterrows():
                percentage = round((age_row['Count'] / total_count) * 100)
                age_city_data.append({
                    'City_Country': f"{row['City']}, {row['Country']}",
                    'Age_Group': age_row['Age_Group'],
                    'Count': age_row['Count'],
                    'Percentage': f"{percentage}%"
                })
        
        if age_city_data:
            age_city_df = pl.DataFrame(age_city_data).to_pandas()
            
            fig_age_cities = px.bar(
                age_city_df,
                x='City_Country',
                y='Count',
                color='Age_Group',
                title='Age Distribution by Top 10 Cities',
                color_discrete_sequence=px.colors.qualitative.Pastel,
                text='Percentage'
            )
            fig_age_cities.update_traces(textposition='inside')
            fig_age_cities.update_layout(
                height=600,
                title_font_size=18,
                xaxis_tickangle=-45
            )
            figures.append(fig_age_cities)
        
        # 5. Gender Distribution
        print("   Creating gender distribution chart...")
        gender_city_data = []
        for _, row in top_10_cities_list.iterrows():
            city_data = data.filter(
                (pl.col("City") == row['City']) & (pl.col("Country") == row['Country'])
            )
            gender_dist = city_data.group_by("Gender").agg(pl.count().alias("Count")).to_pandas()
            total_count = gender_dist['Count'].sum()
            for _, gender_row in gender_dist.iterrows():
                percentage = round((gender_row['Count'] / total_count) * 100)
                gender_city_data.append({
                    'City_Country': f"{row['City']}, {row['Country']}",
                    'Gender': gender_row['Gender'],
                    'Count': gender_row['Count'],
                    'Percentage': f"{percentage}%"
                })
        
        if gender_city_data:
            gender_city_df = pl.DataFrame(gender_city_data).to_pandas()
            
            fig_gender_cities = px.bar(
                gender_city_df,
                x='City_Country',
                y='Count',
                color='Gender',
                title='Gender Distribution by Top 10 Cities',
                color_discrete_sequence=['#FF69B4', '#4169E1', '#32CD32'],
                text='Percentage'
            )
            fig_gender_cities.update_traces(textposition='inside')
            fig_gender_cities.update_layout(
                height=600,
                title_font_size=18,
                xaxis_tickangle=-45
            )
            figures.append(fig_gender_cities)

        # 6. Revenue Heatmap by Country and Age Group
        print("   Creating revenue heatmap...")
        
        # Get top 10 countries and create heatmap data
        top_countries = country_stats.head(10).select("Country").to_series().to_list()
        age_groups = ['18-24', '25-34', '35-44', '45-54', '55-64', '65+']
        
        heatmap_data = []
        heatmap_countries = []
        
        for country in top_countries:
            country_data = data.filter(pl.col("Country") == country)
            row_data = []
            for age_group in age_groups:
                revenue = country_data.filter(pl.col("Age_Group") == age_group)["Line_Total_USD"].sum()
                row_data.append(float(revenue))
            heatmap_data.append(row_data)
            heatmap_countries.append(country)
        
        fig_heatmap = go.Figure(data=go.Heatmap(
            z=heatmap_data,
            x=age_groups,
            y=heatmap_countries,
            colorscale='Viridis',
            text=[[f"${val:,.0f}" for val in row] for row in heatmap_data],
            texttemplate="%{text}",
            textfont={"size": 10}
        ))
        
        fig_heatmap.update_layout(
            title='Revenue Heatmap by Country and Age Group',
            title_font_size=18,
            height=500,
            xaxis_title="Age Group",
            yaxis_title="Country"
        )
        figures.append(fig_heatmap)
        
        print(f"Created {len(figures)} visualizations successfully!")
        return figures
        
    except Exception as e:
        print(f"Error creating visualizations: {str(e)}")
        return figures

def create_dashboard(figures, summary_stats):
    """Create HTML dashboard"""
    print("Creating Regional Demographics Dashboard...")
    
    try:
        html_content = f"""
        <!DOCTYPE html>
        <html>
        <head>
            <title>Regional Demographics Analysis Dashboard</title>
            <script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
            <style>
                body {{
                    font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
                    margin: 0;
                    padding: 20px;
                    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
                    min-height: 100vh;
                }}
                .container {{
                    max-width: 1400px;
                    margin: 0 auto;
                    background: white;
                    border-radius: 15px;
                    padding: 30px;
                    box-shadow: 0 20px 40px rgba(0,0,0,0.1);
                }}
                .header {{
                    text-align: center;
                    margin-bottom: 40px;
                    padding: 20px;
                    background: linear-gradient(135deg, #4CAF50, #45a049);
                    border-radius: 10px;
                    color: white;
                }}
                .stats-grid {{
                    display: grid;
                    grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
                    gap: 20px;
                    margin-bottom: 30px;
                }}
                .stat-card {{
                    background: #f8f9fa;
                    padding: 20px;
                    border-radius: 10px;
                    text-align: center;
                    border-left: 4px solid #4CAF50;
                }}
                .chart-container {{
                    margin: 30px 0;
                    padding: 20px;
                    border: 2px solid #e0e0e0;
                    border-radius: 10px;
                    background: #fafafa;
                }}
                .chart {{
                    width: 100%;
                    height: 600px;
                }}
                .footer {{
                    text-align: center;
                    margin-top: 40px;
                    padding: 20px;
                    background: #f5f5f5;
                    border-radius: 10px;
                    color: #666;
                }}
            </style>
        </head>
        <body>
            <div class="container">
                <div class="header">
                    <h1>Regional Demographics Analysis Dashboard</h1>
                    <p>Comprehensive Geographic Analysis of Customer Data</p>
                    <p>Generated on: {summary_stats['analysis_date']}</p>
                </div>
                
                <div class="stats-grid">
                    <div class="stat-card">
                        <h3>Countries</h3>
                        <h2>{summary_stats['total_countries']}</h2>
                    </div>
                    <div class="stat-card">
                        <h3>Cities</h3>
                        <h2>{summary_stats['total_cities']}</h2>
                    </div>
                    <div class="stat-card">
                        <h3>Customers</h3>
                        <h2>{summary_stats['total_customers']:,}</h2>
                    </div>
                    <div class="stat-card">
                        <h3>Transactions</h3>
                        <h2>{summary_stats['total_transactions']:,}</h2>
                    </div>
                    <div class="stat-card">
                        <h3>Total Revenue</h3>
                        <h2>${summary_stats['total_revenue_usd']:,.0f}</h2>
                    </div>
                </div>
        """
        
        # Add chart containers
        for i in range(len(figures)):
            html_content += f"""
                <div class="chart-container">
                    <div id="chart{i+1}" class="chart"></div>
                </div>
            """
        
        html_content += """
                <div class="footer">
                    <p>Regional Demographics Analysis | Based on Actual Transaction Data</p>
                </div>
            </div>
            <script>
        """
        
        # Add JavaScript for each plot
        for i, fig in enumerate(figures):
            plot_json = fig.to_json()
            html_content += f"""
            var plotData{i+1} = {plot_json};
            Plotly.newPlot('chart{i+1}', plotData{i+1}.data, plotData{i+1}.layout, {{responsive: true}});
            """
        
        html_content += """
            </script>
        </body>
        </html>
        """
        
        # Save dashboard
        with open("regional_demographics/regional_demographics_dashboard.html", "w", encoding='utf-8') as f:
            f.write(html_content)
        
        print("Dashboard created successfully!")
        return True
        
    except Exception as e:
        print(f"Error creating dashboard: {str(e)}")
        return False

def run_regional_demographics_analysis():
    """Run complete regional demographics analysis"""
    print("Starting Regional Demographics Analysis...")
    print("=" * 60)
    
    # Step 1: Load and explore data
    master_data = load_and_explore_data()
    if master_data is None:
        return None
    
    # Step 2: Process data
    processed_data = process_regional_data(master_data)
    if processed_data is None:
        return None
    
    # Step 3: Analyze regional demographics
    city_stats, country_stats = analyze_regional_demographics(processed_data)
    if city_stats is None or country_stats is None:
        return None
    
    # Step 4: Save analysis data
    summary_stats = save_analysis_data(processed_data, city_stats, country_stats)
    if summary_stats is None:
        return None
    
    # Step 5: Create visualizations
    figures = create_regional_visualizations(processed_data, city_stats, country_stats)
    
    # Step 6: Create dashboard
    dashboard_success = create_dashboard(figures, summary_stats)
    
    if dashboard_success:
        print("\n" + "=" * 60)
        print("REGIONAL DEMOGRAPHICS ANALYSIS COMPLETED!")
        print("=" * 60)
        print("\nFILES CREATED:")
        print("   regional_demographics/regional_demographics_dashboard.html")
        print("   regional_demographics/city_statistics.csv")
        print("   regional_demographics/country_statistics.csv")
        print("   regional_demographics/processed_data_sample.csv")
        print("   regional_demographics/summary_statistics.txt")
        
        print(f"\nANALYSIS SUMMARY:")
        print(f"   Countries Analyzed: {summary_stats['total_countries']}")
        print(f"   Cities Analyzed: {summary_stats['total_cities']}")
        print(f"   Unique Customers: {summary_stats['total_customers']:,}")
        print(f"   Total Transactions: {summary_stats['total_transactions']:,}")
        print(f"   Total Revenue: ${summary_stats['total_revenue_usd']:,.2f} USD")
        
        print(f"\nOpen 'regional_demographics/regional_demographics_dashboard.html' to view the complete analysis!")
        
        return {
            'processed_data': processed_data,
            'city_stats': city_stats,
            'country_stats': country_stats,
            'figures': figures,
            'summary_stats': summary_stats
        }
    else:
        return None

# Execute Regional Demographics Analysis
print("REGIONAL DEMOGRAPHICS INTEGRATION")
print("=" * 60)
regional_results = run_regional_demographics_analysis()

if regional_results:
    print("\nRegional Demographics Analysis completed successfully!")
else:
    print("\nRegional Demographics Analysis failed!")


REGIONAL DEMOGRAPHICS INTEGRATION
Starting Regional Demographics Analysis...
Loading Master Sales Data for Regional Demographics Analysis...
Master data loaded successfully!
Shape: 6,416,029 rows × 41 columns

Available Columns with Data Types:
    1. Invoice ID (String)
    2. Line (Int64)
    3. Customer ID (Int64)
    4. Product ID (Int64)
    5. Size (String)
    6. Color (String)
    7. Unit Price (Float64)
    8. Quantity (Int64)
    9. Date (Date)
   10. Discount (Float64)
   11. Line Total (Float64)
   12. Store ID (Int64)
   13. Employee ID (Int64)
   14. Currency (String)
   15. Currency Symbol (String)
   16. SKU (String)
   17. Transaction Type (String)
   18. Payment Method (String)
   19. Invoice Total (Float64)
   20. Exchange_Rate_to_USD (Float64)
   21. Unit_Price_USD (Float64)
   22. Line_Total_USD (Float64)
   23. Invoice_Total_USD (Float64)
   24. Category (String)
   25. Sub Category (String)
   26. Description EN (String)
   27. Color_right (String)
   28. Sizes (


`pl.count()` is deprecated. Please use `pl.len()` instead.
(Deprecated in version 0.20.5)


`pl.count()` is deprecated. Please use `pl.len()` instead.
(Deprecated in version 0.20.5)




Age Group Distribution:
   18-24: 2,302,156 customers
   25-34: 1,669,823 customers
   35-44: 1,411,650 customers
   45-54: 669,500 customers
   55-64: 294,461 customers
   65+: 68,439 customers
Analyzing Regional Demographics...
   Analyzing city-level demographics...
   Analyzing country-level demographics...



`pl.count()` is deprecated. Please use `pl.len()` instead.
(Deprecated in version 0.20.5)



Regional analysis completed!
   35 cities analyzed
   7 countries analyzed

Top 5 Cities by Revenue:
   New York, United States: $29.97 USD (102,758 customers)
   Los Angeles, United States: $29.97 USD (94,317 customers)
   上海, China: $30.54 USD (84,955 customers)
   广州, China: $30.39 USD (92,209 customers)
   深圳, China: $30.53 USD (90,888 customers)
Saving Analysis Data...
   Saving processed data sample...
   Saving city statistics...
   Saving country statistics...
   Creating summary statistics...
All analysis data saved successfully!
Creating Regional Demographics Visualizations...
   Creating top cities by revenue chart...
   Creating country distribution chart...
   Creating customer performance scatter plot...
   Creating age distribution chart...



`pl.count()` is deprecated. Please use `pl.len()` instead.
(Deprecated in version 0.20.5)



   Creating gender distribution chart...



`pl.count()` is deprecated. Please use `pl.len()` instead.
(Deprecated in version 0.20.5)



   Creating revenue heatmap...
Created 6 visualizations successfully!
Creating Regional Demographics Dashboard...
Dashboard created successfully!

REGIONAL DEMOGRAPHICS ANALYSIS COMPLETED!

FILES CREATED:
   regional_demographics/regional_demographics_dashboard.html
   regional_demographics/city_statistics.csv
   regional_demographics/country_statistics.csv
   regional_demographics/processed_data_sample.csv
   regional_demographics/summary_statistics.txt

ANALYSIS SUMMARY:
   Countries Analyzed: 7
   Cities Analyzed: 35
   Unique Customers: 1,283,707
   Total Transactions: 6,416,029
   Total Revenue: $288,786,449.85 USD

Open 'regional_demographics/regional_demographics_dashboard.html' to view the complete analysis!

Regional Demographics Analysis completed successfully!


Customer Demograohic

In [8]:
import polars as pl
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.offline as pyo
import numpy as np
from datetime import datetime, timedelta
import os

def load_master_data_for_demographics():
    """Load master data for demographics analysis"""
    print("Loading master data for demographics analysis...")
    
    try:
        if os.path.exists("data/master_transactions_with_coords.parquet"):
            master_data = pl.read_parquet("data/master_transactions_with_coords.parquet")
        else:
            master_data = pl.read_parquet("data/master_transactions.parquet")
        
        print(f"Master data loaded: {master_data.shape}")
        print(f"Available columns: {master_data.columns}")
        
        # Check for demographic columns
        demographic_cols = ['Date Of Birth', 'Gender', 'Age', 'Age Group', 'Occupation', 'Income Level']
        available_demo_cols = [col for col in demographic_cols if col in master_data.columns]
        print(f"Available demographic columns: {available_demo_cols}")
        
        return master_data
        
    except Exception as e:
        print(f"Error loading master data: {e}")
        return None

def prepare_demographics_analysis_data(master_data):
    """Prepare comprehensive demographics analysis data with age calculation"""
    print("Preparing demographics analysis data...")
    
    # Filter for sales only
    sales_data = master_data.filter(pl.col("Transaction Type") == "Sale")
    print(f"Sales data shape: {sales_data.shape}")
    
    # Calculate age from Date Of Birth
    current_date = datetime.now().date()
    print(f"Calculating age from Date Of Birth (current date: {current_date})")
    
    # Add age calculations and demographics
    sales_data = sales_data.with_columns([
        # Calculate age from Date Of Birth
        ((pl.lit(current_date) - pl.col("Date Of Birth")).dt.total_days() / 365.25).floor().cast(pl.Int32).alias("Age"),
        
        # Add time dimensions
        pl.col("Date").dt.strftime("%Y-%m").alias("Year_Month"),
        pl.col("Date").dt.quarter().alias("Quarter"),
        pl.col("Date").dt.weekday().alias("Day_of_Week"),
        pl.col("Date").dt.strftime("%A").alias("Day_Name"),
        
        # Calculate discount metrics
        (pl.col("Unit_Price_USD") * pl.col("Quantity") - pl.col("Line_Total_USD")).alias("Discount_Amount_USD"),
        ((pl.col("Unit_Price_USD") * pl.col("Quantity") - pl.col("Line_Total_USD")) / 
         (pl.col("Unit_Price_USD") * pl.col("Quantity")) * 100).alias("Discount_Percent"),
        
        # Customer identifier for aggregation
        pl.col("Customer ID").alias("Customer_ID")
    ])
    
    # Create age groups
    sales_data = sales_data.with_columns([
        pl.when(pl.col("Age") < 18)
        .then(pl.lit("Under 18"))
        .when(pl.col("Age") < 25)
        .then(pl.lit("18-24"))
        .when(pl.col("Age") < 35)
        .then(pl.lit("25-34"))
        .when(pl.col("Age") < 45)
        .then(pl.lit("35-44"))
        .when(pl.col("Age") < 55)
        .then(pl.lit("45-54"))
        .when(pl.col("Age") < 65)
        .then(pl.lit("55-64"))
        .otherwise(pl.lit("65+"))
        .alias("Age_Group")
    ])
    
    # Check age calculation results
    age_stats = sales_data.select([
        pl.col("Age").min().alias("Min_Age"),
        pl.col("Age").max().alias("Max_Age"),
        pl.col("Age").mean().alias("Avg_Age"),
        # pl.col("Age_Group").value_counts().alias("Age_Group_Counts")
    ])
    
    print("Age calculation results:")
    print(f"Age range: {age_stats.select('Min_Age').item()} - {age_stats.select('Max_Age').item()}")
    print(f"Average age: {age_stats.select('Avg_Age').item():.1f}")
    
    # Show age group distribution
    age_group_dist = sales_data.group_by("Age_Group").agg(pl.count().alias("count")).sort("count", descending=True)
    print("Age group distribution:")
    for row in age_group_dist.iter_rows():
        print(f"   {row[0]}: {row[1]:,} transactions")
    
    print("Creating age-based demographics analysis...")
    
    # 1. Age Demographics Analysis
    age_demographics = sales_data.group_by(["Age", "Age_Group"]).agg([
        pl.col("Customer_ID").n_unique().alias("Unique_Customers"),
        pl.col("Line_Total_USD").sum().alias("Total_Revenue_USD"),
        pl.col("Quantity").sum().alias("Total_Quantity"),
        pl.col("Line_Total_USD").count().alias("Total_Transactions"),
        pl.col("Unit_Price_USD").mean().alias("Avg_Unit_Price_USD"),
        (pl.col("Discount_Amount_USD").sum() / (pl.col("Unit_Price_USD") * pl.col("Quantity")).sum() * 100).alias("Avg_Discount_Percent"),
        pl.col("Category").n_unique().alias("Categories_Purchased"),
        pl.col("Store ID").n_unique().alias("Stores_Visited")
    ]).with_columns([
        (pl.col("Total_Revenue_USD") / pl.col("Unique_Customers")).alias("Revenue_Per_Customer"),
        (pl.col("Total_Transactions") / pl.col("Unique_Customers")).alias("Transactions_Per_Customer"),
        (pl.col("Total_Revenue_USD") / pl.col("Total_Transactions")).alias("Average_Order_Value")
    ]).sort("Age")
    
    # 2. Age Group Summary (for easier visualization)
    age_group_demographics = sales_data.group_by("Age_Group").agg([
        pl.col("Customer_ID").n_unique().alias("Unique_Customers"),
        pl.col("Line_Total_USD").sum().alias("Total_Revenue_USD"),
        pl.col("Quantity").sum().alias("Total_Quantity"),
        pl.col("Line_Total_USD").count().alias("Total_Transactions"),
        pl.col("Unit_Price_USD").mean().alias("Avg_Unit_Price_USD"),
        (pl.col("Discount_Amount_USD").sum() / (pl.col("Unit_Price_USD") * pl.col("Quantity")).sum() * 100).alias("Avg_Discount_Percent"),
        pl.col("Category").n_unique().alias("Categories_Purchased"),
        pl.col("Store ID").n_unique().alias("Stores_Visited"),
        pl.col("Age").mean().alias("Average_Age")
    ]).with_columns([
        (pl.col("Total_Revenue_USD") / pl.col("Unique_Customers")).alias("Revenue_Per_Customer"),
        (pl.col("Total_Transactions") / pl.col("Unique_Customers")).alias("Transactions_Per_Customer"),
        (pl.col("Total_Revenue_USD") / pl.col("Total_Transactions")).alias("Average_Order_Value")
    ])
    
    # Sort age groups in logical order
    age_group_order = ["Under 18", "18-24", "25-34", "35-44", "45-54", "55-64", "65+"]
    age_group_demographics = age_group_demographics.with_columns([
        pl.col("Age_Group").map_elements(
            lambda x: age_group_order.index(x) if x in age_group_order else 999,
            return_dtype=pl.Int32
        ).alias("Age_Group_Order")
    ]).sort("Age_Group_Order").drop("Age_Group_Order")
    
    # 3. Gender Demographics Analysis
    if "Gender" in sales_data.columns:
        gender_demographics = sales_data.group_by("Gender").agg([
            pl.col("Customer_ID").n_unique().alias("Unique_Customers"),
            pl.col("Line_Total_USD").sum().alias("Total_Revenue_USD"),
            pl.col("Quantity").sum().alias("Total_Quantity"),
            pl.col("Line_Total_USD").count().alias("Total_Transactions"),
            pl.col("Unit_Price_USD").mean().alias("Avg_Unit_Price_USD"),
            (pl.col("Discount_Amount_USD").sum() / (pl.col("Unit_Price_USD") * pl.col("Quantity")).sum() * 100).alias("Avg_Discount_Percent"),
            pl.col("Category").n_unique().alias("Categories_Purchased"),
            pl.col("Store ID").n_unique().alias("Stores_Visited"),
            pl.col("Age").mean().alias("Average_Age")
        ]).with_columns([
            (pl.col("Total_Revenue_USD") / pl.col("Unique_Customers")).alias("Revenue_Per_Customer"),
            (pl.col("Total_Transactions") / pl.col("Unique_Customers")).alias("Transactions_Per_Customer"),
            (pl.col("Total_Revenue_USD") / pl.col("Total_Transactions")).alias("Average_Order_Value")
        ])
    else:
        print("Gender column not found - creating placeholder")
        gender_demographics = None
    
    print("Creating category preferences by demographics...")
    
    # 4. Age Group vs Category Analysis
    age_category_analysis = sales_data.group_by(["Age_Group", "Category"]).agg([
        pl.col("Line_Total_USD").sum().alias("Revenue_USD"),
        pl.col("Quantity").sum().alias("Quantity_Sold"),
        pl.col("Customer_ID").n_unique().alias("Unique_Customers"),
        pl.col("Line_Total_USD").count().alias("Transactions")
    ]).with_columns([
        (pl.col("Revenue_USD") / pl.col("Unique_Customers")).alias("Revenue_Per_Customer"),
        (pl.col("Revenue_USD") / pl.col("Transactions")).alias("Average_Order_Value")
    ]).sort(["Age_Group", "Revenue_USD"], descending=[False, True])
    
    # 5. Gender vs Category Analysis (if gender available)
    if "Gender" in sales_data.columns:
        gender_category_analysis = sales_data.group_by(["Gender", "Category"]).agg([
            pl.col("Line_Total_USD").sum().alias("Revenue_USD"),
            pl.col("Quantity").sum().alias("Quantity_Sold"),
            pl.col("Customer_ID").n_unique().alias("Unique_Customers"),
            pl.col("Line_Total_USD").count().alias("Transactions")
        ]).with_columns([
            (pl.col("Revenue_USD") / pl.col("Unique_Customers")).alias("Revenue_Per_Customer"),
            (pl.col("Revenue_USD") / pl.col("Transactions")).alias("Average_Order_Value")
        ]).sort(["Gender", "Revenue_USD"], descending=[False, True])
    else:
        gender_category_analysis = None
    
    print("Creating customer lifetime value by demographics...")
    
    # 6. Customer-Level Demographics with CLV
    customer_demographics_clv = sales_data.group_by([
        "Customer_ID", "Age", "Age_Group", "Gender" if "Gender" in sales_data.columns else pl.lit("Unknown").alias("Gender")
    ]).agg([
        pl.col("Line_Total_USD").sum().alias("Total_Spent_USD"),
        pl.col("Quantity").sum().alias("Total_Items"),
        pl.col("Line_Total_USD").count().alias("Total_Transactions"),
        pl.col("Date").min().alias("First_Purchase"),
        pl.col("Date").max().alias("Last_Purchase"),
        pl.col("Category").n_unique().alias("Categories_Explored"),
        pl.col("Store ID").n_unique().alias("Stores_Visited"),
        pl.col("Unit_Price_USD").mean().alias("Avg_Unit_Price"),
        (pl.col("Discount_Amount_USD").sum() / (pl.col("Unit_Price_USD") * pl.col("Quantity")).sum() * 100).alias("Avg_Discount_Percent")
    ]).with_columns([
        (pl.col("Total_Spent_USD") / pl.col("Total_Transactions")).alias("Average_Order_Value"),
        (pl.col("Last_Purchase") - pl.col("First_Purchase")).dt.total_days().alias("Customer_Lifespan_Days")
    ])
    
    # Convert to pandas for advanced CLV calculations
    customer_clv_df = customer_demographics_clv.to_pandas()
    customer_clv_df['Customer_Lifespan_Days'] = customer_clv_df['Customer_Lifespan_Days'].fillna(1).replace(0, 1)
    customer_clv_df['Monthly_Frequency'] = (customer_clv_df['Total_Transactions'] / customer_clv_df['Customer_Lifespan_Days'] * 30).fillna(1.0)
    
    # Simple CLV calculation
    customer_clv_df['Predicted_CLV'] = customer_clv_df['Average_Order_Value'] * customer_clv_df['Monthly_Frequency'] * 12
    customer_clv_df['Total_CLV'] = customer_clv_df['Total_Spent_USD'] + customer_clv_df['Predicted_CLV']
    
    print("Creating temporal demographics patterns...")
    
    # 7. Monthly Demographics Trends
    monthly_age_trends = sales_data.group_by(["Year_Month", "Age_Group"]).agg([
        pl.col("Line_Total_USD").sum().alias("Monthly_Revenue_USD"),
        pl.col("Customer_ID").n_unique().alias("Active_Customers"),
        pl.col("Line_Total_USD").count().alias("Transactions")
    ]).sort(["Year_Month", "Age_Group"])
    
    # 8. Day of Week Demographics Patterns
    dow_demographics = sales_data.group_by(["Day_Name", "Age_Group"]).agg([
        pl.col("Line_Total_USD").sum().alias("Revenue_USD"),
        pl.col("Customer_ID").n_unique().alias("Unique_Customers"),
        pl.col("Line_Total_USD").count().alias("Transactions")
    ])
    
    print("Creating discount sensitivity by demographics...")
    
    # 9. Discount Sensitivity by Demographics
    discount_sensitivity = sales_data.with_columns([
        pl.when(pl.col("Discount_Percent") <= 5)
        .then(pl.lit("0-5%"))
        .when(pl.col("Discount_Percent") <= 15)
        .then(pl.lit("6-15%"))
        .when(pl.col("Discount_Percent") <= 25)
        .then(pl.lit("16-25%"))
        .when(pl.col("Discount_Percent") <= 35)
        .then(pl.lit("26-35%"))
        .otherwise(pl.lit("35%+"))
        .alias("Discount_Bucket")
    ]).group_by(["Age_Group", "Discount_Bucket"]).agg([
        pl.col("Line_Total_USD").sum().alias("Revenue_USD"),
        pl.col("Customer_ID").n_unique().alias("Customers"),
        pl.col("Line_Total_USD").count().alias("Transactions")
    ])
    
    print("Demographics analysis data prepared successfully!")
    
    return {
        "sales_data": sales_data,
        "age_demographics": age_demographics,
        "age_group_demographics": age_group_demographics,
        "gender_demographics": gender_demographics,
        "age_category_analysis": age_category_analysis,
        "gender_category_analysis": gender_category_analysis,
        "customer_demographics_clv": pl.from_pandas(customer_clv_df),
        "monthly_age_trends": monthly_age_trends,
        "dow_demographics": dow_demographics,
        "discount_sensitivity": discount_sensitivity
    }

def create_demographics_dashboard(data_dict):
    """Create comprehensive demographics analysis dashboard"""
    print("Creating Demographics Analysis Dashboard...")
    
    # Convert to pandas for plotting
    age_demo_df = data_dict["age_group_demographics"].to_pandas()
    age_category_df = data_dict["age_category_analysis"].to_pandas()
    customer_clv_df = data_dict["customer_demographics_clv"].to_pandas()
    monthly_trends_df = data_dict["monthly_age_trends"].to_pandas()
    dow_demo_df = data_dict["dow_demographics"].to_pandas()
    discount_sens_df = data_dict["discount_sensitivity"].to_pandas()
    
    # Handle gender data
    if data_dict["gender_demographics"] is not None:
        gender_demo_df = data_dict["gender_demographics"].to_pandas()
        gender_category_df = data_dict["gender_category_analysis"].to_pandas()
    else:
        gender_demo_df = None
        gender_category_df = None
    
    # Create HTML structure
    html_content = """
    <!DOCTYPE html>
    <html>
    <head>
        <title>Customer Demographics Analysis Dashboard</title>
        <script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
        <style>
            body { font-family: Arial, sans-serif; margin: 20px; background-color: #f5f5f5; }
            .chart-container { background-color: white; margin: 20px 0; padding: 20px; border-radius: 10px; box-shadow: 0 2px 5px rgba(0,0,0,0.1); }
            .chart-title { font-size: 24px; font-weight: bold; text-align: center; margin-bottom: 20px; color: #333; }
            .dashboard-title { font-size: 36px; font-weight: bold; text-align: center; margin-bottom: 30px; color: #2c3e50; }
            .insights-box { background-color: #e8f4f8; padding: 15px; margin: 10px 0; border-radius: 8px; font-size: 14px; }
            .metric-highlight { background-color: #fff3cd; padding: 10px; margin: 5px 0; border-radius: 5px; font-weight: bold; }
            .demo-grid { display: grid; grid-template-columns: repeat(auto-fit, minmax(250px, 1fr)); gap: 15px; margin: 20px 0; }
            .demo-card { background-color: #f8f9fa; padding: 15px; border-radius: 8px; border-left: 4px solid #007bff; }
        </style>
    </head>
    <body>
        <div class="dashboard-title">Customer Demographics Analysis Dashboard</div>
        
        <div class="insights-box">
            <strong>Demographics Analysis Insights:</strong><br>
            • <strong>Age Distribution</strong>: Customer behavior patterns across different age groups<br>
            • <strong>Gender Preferences</strong>: Product and category preferences by gender<br>
            • <strong>Demographic CLV</strong>: Customer lifetime value analysis by demographics<br>
            • <strong>Purchase Patterns</strong>: Temporal and behavioral patterns by demographic segments<br>
            • <strong>Discount Sensitivity</strong>: How different demographics respond to pricing strategies
        </div>
    """
    
    # Create figures (same as before but without emojis in titles)
    # 1. Age Distribution Analysis
    fig1 = make_subplots(specs=[[{"secondary_y": True}]])
    
    fig1.add_trace(
        go.Bar(
            x=age_demo_df['Age_Group'],
            y=age_demo_df['Unique_Customers'],
            name='Number of Customers',
            marker_color='lightblue',
            yaxis='y'
        ),
        secondary_y=False,
    )
    
    fig1.add_trace(
        go.Scatter(
            x=age_demo_df['Age_Group'],
            y=age_demo_df['Revenue_Per_Customer'],
            mode='lines+markers',
            name='Revenue Per Customer (USD)',
            line=dict(color='red', width=3),
            marker=dict(size=8),
            yaxis='y2'
        ),
        secondary_y=True,
    )
    
    fig1.update_xaxes(title_text="Age Group")
    fig1.update_yaxes(title_text="Number of Customers", secondary_y=False)
    fig1.update_yaxes(title_text="Revenue Per Customer (USD)", secondary_y=True)
    fig1.update_layout(
        title_text="Customer Distribution and Revenue by Age Group",
        height=600,
        template="plotly_white"
    )
    
    # 2. Age vs Category Heatmap
    age_cat_pivot = age_category_df.pivot_table(
        values='Revenue_USD', 
        index='Age_Group', 
        columns='Category', 
        fill_value=0
    )
    
    fig2 = px.imshow(
        age_cat_pivot.values,
        labels=dict(x="Category", y="Age Group", color="Revenue (USD)"),
        x=age_cat_pivot.columns,
        y=age_cat_pivot.index,
        title="Category Preferences by Age Group (Revenue Heatmap)",
        color_continuous_scale="Viridis",
        height=600
    )
    fig2.update_layout(template="plotly_white")
    
    # 3. Customer Lifetime Value by Age
    fig3 = px.box(
        customer_clv_df,
        x="Age_Group",
        y="Total_CLV",
        title="Customer Lifetime Value Distribution by Age Group",
        height=600
    )
    fig3.update_layout(template="plotly_white")
    
    # 4. Gender Analysis (if available)
    if gender_demo_df is not None:
        fig4 = px.pie(
            gender_demo_df,
            values='Unique_Customers',
            names='Gender',
            title="Customer Distribution by Gender",
            height=500
        )
        fig4.update_traces(textposition='inside', textinfo='percent+label')
        fig4.update_layout(template="plotly_white")
    else:
        fig4 = go.Figure()
        fig4.add_annotation(
            text="Gender data not available in dataset",
            xref="paper", yref="paper",
            x=0.5, y=0.5, xanchor='center', yanchor='middle',
            showarrow=False, font=dict(size=20)
        )
        fig4.update_layout(
            title="Gender Analysis - Data Not Available",
            height=500,
            template="plotly_white"
        )
    
    # 5. Monthly Trends by Age Group
    fig5 = px.line(
        monthly_trends_df,
        x="Year_Month",
        y="Monthly_Revenue_USD",
        color="Age_Group",
        title="Monthly Revenue Trends by Age Group",
        markers=True,
        height=600
    )
    fig5.update_xaxes(tickangle=45)
    fig5.update_layout(template="plotly_white")
    
    # 6. Day of Week Patterns by Age
    fig6 = px.bar(
        dow_demo_df,
        x="Day_Name",
        y="Revenue_USD",
        color="Age_Group",
        title="Daily Purchase Patterns by Age Group",
        height=600
    )
    fig6.update_layout(template="plotly_white")
    
    # 7. Discount Sensitivity by Age Group
    fig7 = px.bar(
        discount_sens_df,
        x="Age_Group",
        y="Revenue_USD",
        color="Discount_Bucket",
        title="Discount Sensitivity by Age Group",
        height=600
    )
    fig7.update_layout(template="plotly_white")
    
    # 8. Age vs Purchase Behavior Scatter
    fig8 = px.scatter(
        customer_clv_df,
        x="Average_Order_Value",
        y="Total_Transactions",
        color="Age_Group",
        size="Total_CLV",
        hover_data={
            "Customer_ID": True,
            "Total_Spent_USD": ":,.2f",
            "Categories_Explored": True,
            "Stores_Visited": True
        },
        title="Purchase Behavior: Order Value vs Frequency by Age",
        height=700
    )
    fig8.update_layout(template="plotly_white")
    
    # 9. Gender vs Category Analysis (if available)
    if gender_category_df is not None:
        fig9 = px.bar(
            gender_category_df,
            x="Category",
            y="Revenue_USD",
            color="Gender",
            title="Category Preferences by Gender",
            height=600
        )
        fig9.update_xaxes(tickangle=45)
        fig9.update_layout(template="plotly_white")
    else:
        fig9 = go.Figure()
        fig9.add_annotation(
            text="Gender category analysis not available",
            xref="paper", yref="paper",
            x=0.5, y=0.5, xanchor='center', yanchor='middle',
            showarrow=False, font=dict(size=20)
        )
        fig9.update_layout(
            title="Gender vs Category Analysis - Data Not Available",
            height=500,
            template="plotly_white"
        )
    
    # 10. Demographics Summary Metrics
    fig10 = go.Figure()
    
    fig10.add_trace(go.Bar(
        x=age_demo_df['Age_Group'],
        y=age_demo_df['Total_Revenue_USD'],
        name='Total Revenue',
        marker_color='skyblue'
    ))
    
    fig10.update_layout(
        title="Total Revenue by Age Group",
        xaxis_title="Age Group",
        yaxis_title="Total Revenue (USD)",
        height=500,
        template="plotly_white"
    )
    
    # Convert all figures to HTML
    figures = [fig1, fig2, fig3, fig4, fig5, fig6, fig7, fig8, fig9, fig10]
    titles = [
        "Customer Distribution and Revenue by Age Group",
        "Category Preferences by Age Group (Revenue Heatmap)",
        "Customer Lifetime Value Distribution by Age Group",
        "Customer Distribution by Gender",
        "Monthly Revenue Trends by Age Group",
        "Daily Purchase Patterns by Age Group",
        "Discount Sensitivity by Age Group",
        "Purchase Behavior: Order Value vs Frequency by Age",
        "Category Preferences by Gender",
        "Total Revenue by Age Group"
    ]
    
    for i, (fig, title) in enumerate(zip(figures, titles)):
        html_content += f"""
        <div class="chart-container">
            <div class="chart-title">{title}</div>
            <div id="chart{i+1}"></div>
        </div>
        """
    
    # Add demographic insights summary
    total_customers = age_demo_df['Unique_Customers'].sum()
    total_revenue = age_demo_df['Total_Revenue_USD'].sum()
    highest_clv_age = customer_clv_df.groupby('Age_Group')['Total_CLV'].mean().idxmax()
    most_active_age = age_demo_df.loc[age_demo_df['Transactions_Per_Customer'].idxmax(), 'Age_Group']
    
    html_content += f"""
        <div class="insights-box">
            <strong>Key Demographics Insights:</strong><br>
            <div class="metric-highlight">Total Customers Analyzed: {total_customers:,}</div>
            <div class="metric-highlight">Total Revenue: ${total_revenue:,.0f}</div>
            <div class="metric-highlight">Highest CLV Age Group: {highest_clv_age}</div>
            <div class="metric-highlight">Most Active Age Group: {most_active_age}</div>
        </div>
        
        <div class="chart-container">
            <div class="chart-title">Age Group Performance Summary</div>
            <div class="demo-grid">
    """
    
    # Add age group summary cards
    for _, row in age_demo_df.iterrows():
        html_content += f"""
            <div class="demo-card">
                <h3>{row['Age_Group']}</h3>
                <p><strong>Customers:</strong> {row['Unique_Customers']:,}</p>
                <p><strong>Revenue:</strong> ${row['Total_Revenue_USD']:,.0f}</p>
                <p><strong>Avg Order Value:</strong> ${row['Average_Order_Value']:.2f}</p>
                <p><strong>Revenue/Customer:</strong> ${row['Revenue_Per_Customer']:,.2f}</p>
                <p><strong>Avg Discount:</strong> {row['Avg_Discount_Percent']:.1f}%</p>
                <p><strong>Avg Age:</strong> {row['Average_Age']:.1f} years</p>
            </div>
        """
    
    html_content += """
            </div>
        </div>
    """
    
    html_content += """
    <script>
    """
    
    # Add JavaScript for each plot
    for i, fig in enumerate(figures):
        plot_json = fig.to_json()
        html_content += f"""
        var plotData{i+1} = {plot_json};
        Plotly.newPlot('chart{i+1}', plotData{i+1}.data, plotData{i+1}.layout, {{responsive: true}});
        """

    html_content += """
    </script>
    </body>
    </html>
    """

    return html_content


def save_demographics_analysis_data(data_dict):
    """Save demographics analysis data files"""
    print("Saving demographics analysis data files...")
    
    os.makedirs("demographics_analysis/data", exist_ok=True)
    
    # Save all datasets
    data_dict["age_demographics"].write_csv("demographics_analysis/data/age_demographics_detailed.csv")
    data_dict["age_group_demographics"].write_csv("demographics_analysis/data/age_group_demographics_summary.csv")
    data_dict["age_category_analysis"].write_csv("demographics_analysis/data/age_category_preferences.csv")
    data_dict["customer_demographics_clv"].write_csv("demographics_analysis/data/customer_demographics_clv.csv")
    data_dict["monthly_age_trends"].write_csv("demographics_analysis/data/monthly_age_trends.csv")
    data_dict["dow_demographics"].write_csv("demographics_analysis/data/day_of_week_demographics.csv")
    data_dict["discount_sensitivity"].write_csv("demographics_analysis/data/discount_sensitivity_by_age.csv")
    
    # Save gender data if available
    if data_dict["gender_demographics"] is not None:
        data_dict["gender_demographics"].write_csv("demographics_analysis/data/gender_demographics_analysis.csv")
        data_dict["gender_category_analysis"].write_csv("demographics_analysis/data/gender_category_preferences.csv")
    
    print("Demographics analysis data files saved!")

def run_demographics_analysis():
    """Run comprehensive demographics analysis"""
    print("Starting Customer Demographics Analysis...")
    
    # Load data
    master_data = load_master_data_for_demographics()
    if master_data is None:
        print("Failed to load master data!")
        return None
    
    # Prepare analysis data
    data_dict = prepare_demographics_analysis_data(master_data)
    if data_dict is None:
        print("Failed to prepare demographics data!")
        return None
    
    # Save data files
    save_demographics_analysis_data(data_dict)
    
    # Create dashboard
    html_content = create_demographics_dashboard(data_dict)
    
    # Create directory and save HTML
    os.makedirs("demographics_analysis", exist_ok=True)
    
    with open("demographics_analysis/demographics_analysis_dashboard.html", "w", encoding='utf-8') as f:
        f.write(html_content)
    
    # Print summary insights
    age_demo_df = data_dict["age_group_demographics"].to_pandas()
    customer_clv_df = data_dict["customer_demographics_clv"].to_pandas()
    
    print("\nCUSTOMER DEMOGRAPHICS ANALYSIS COMPLETE!")
    print(f"Total Customers Analyzed: {age_demo_df['Unique_Customers'].sum():,}")
    print(f"Total Revenue Analyzed: ${age_demo_df['Total_Revenue_USD'].sum():,.0f}")
    print(f"Age Groups Analyzed: {len(age_demo_df)}")
    
    # Age group insights
    top_age_group = age_demo_df.loc[age_demo_df['Total_Revenue_USD'].idxmax()]
    print(f"Top Revenue Age Group: {top_age_group['Age_Group']} (${top_age_group['Total_Revenue_USD']:,.0f})")
    
    highest_aov_age = age_demo_df.loc[age_demo_df['Average_Order_Value'].idxmax()]
    print(f"Highest AOV Age Group: {highest_aov_age['Age_Group']} (${highest_aov_age['Average_Order_Value']:.2f})")
    
    # CLV insights
    avg_clv_by_age = customer_clv_df.groupby('Age_Group')['Total_CLV'].mean()
    highest_clv_age = avg_clv_by_age.idxmax()
    print(f"Highest CLV Age Group: {highest_clv_age} (${avg_clv_by_age[highest_clv_age]:,.2f})")
    
    print("\nDashboard and Data Files Created:")
    print("   demographics_analysis/demographics_analysis_dashboard.html")
    print("   demographics_analysis/data/ (CSV files for detailed analysis)")
    
    return data_dict

def create_advanced_demographic_insights(data_dict):
    """Create advanced demographic insights and recommendations"""
    print("Generating advanced demographic insights...")
    
    # Convert key datasets to pandas for analysis
    age_demo_df = data_dict["age_group_demographics"].to_pandas()
    customer_clv_df = data_dict["customer_demographics_clv"].to_pandas()
    age_category_df = data_dict["age_category_analysis"].to_pandas()
    
    insights = {}
    
    # 1. Age Group Performance Analysis
    insights['age_performance'] = {}
    for _, row in age_demo_df.iterrows():
        age_group = row['Age_Group']
        insights['age_performance'][age_group] = {
            'customers': int(row['Unique_Customers']),
            'revenue': float(row['Total_Revenue_USD']),
            'revenue_per_customer': float(row['Revenue_Per_Customer']),
            'avg_order_value': float(row['Average_Order_Value']),
            'transactions_per_customer': float(row['Transactions_Per_Customer']),
            'discount_sensitivity': float(row['Avg_Discount_Percent']),
            'market_share': float(row['Unique_Customers'] / age_demo_df['Unique_Customers'].sum() * 100),
            'average_age': float(row['Average_Age'])
        }
    
    # 2. Category Preferences by Age
    insights['category_preferences'] = {}
    for age_group in age_category_df['Age_Group'].unique():
        age_data = age_category_df[age_category_df['Age_Group'] == age_group]
        top_categories = age_data.nlargest(3, 'Revenue_USD')
        
        insights['category_preferences'][age_group] = {
            'top_categories': top_categories[['Category', 'Revenue_USD']].to_dict('records'),
            'category_diversity': len(age_data),
            'total_revenue': float(age_data['Revenue_USD'].sum())
        }
    
    # 3. CLV Analysis by Demographics
    clv_by_age = customer_clv_df.groupby('Age_Group').agg({
        'Total_CLV': ['mean', 'median', 'std', 'count'],
        'Total_Spent_USD': 'mean',
        'Predicted_CLV': 'mean',
        'Average_Order_Value': 'mean',
        'Total_Transactions': 'mean'
    }).round(2)
    
    insights['clv_analysis'] = clv_by_age.to_dict()
    
    # 4. Strategic Recommendations
    insights['recommendations'] = {}
    
    # Find highest value age groups
    top_revenue_age = age_demo_df.loc[age_demo_df['Total_Revenue_USD'].idxmax(), 'Age_Group']
    top_clv_age = customer_clv_df.groupby('Age_Group')['Total_CLV'].mean().idxmax()
    highest_aov_age = age_demo_df.loc[age_demo_df['Average_Order_Value'].idxmax(), 'Age_Group']
    most_frequent_age = age_demo_df.loc[age_demo_df['Transactions_Per_Customer'].idxmax(), 'Age_Group']
    
    insights['recommendations'] = {
        'focus_segments': {
            'primary_revenue_driver': top_revenue_age,
            'highest_lifetime_value': top_clv_age,
            'premium_customers': highest_aov_age,
            'most_engaged': most_frequent_age
        },
        'strategies': {
            top_revenue_age: "Primary revenue driver - focus on retention and expansion",
            top_clv_age: "Highest CLV - invest in premium experiences and loyalty programs",
            highest_aov_age: "Premium segment - target with high-value products",
            most_frequent_age: "Most engaged - leverage for referrals and advocacy"
        }
    }
    
    # 5. Market Opportunities
    insights['opportunities'] = {
        'underperforming_segments': [],
        'growth_potential': [],
        'category_expansion': {}
    }
    
    # Identify underperforming segments (low revenue per customer)
    median_rpc = age_demo_df['Revenue_Per_Customer'].median()
    underperforming = age_demo_df[age_demo_df['Revenue_Per_Customer'] < median_rpc * 0.8]
    
    for _, row in underperforming.iterrows():
        insights['opportunities']['underperforming_segments'].append({
            'age_group': row['Age_Group'],
            'customers': int(row['Unique_Customers']),
            'current_rpc': float(row['Revenue_Per_Customer']),
            'potential_uplift': float(median_rpc - row['Revenue_Per_Customer'])
        })
    
    return insights

def generate_demographic_report(insights):
    """Generate a comprehensive demographic report"""
    print("Generating demographic insights report...")
    
    report = f"""
    
    Customer Demographics Analysis Report
    ====================================
    
    EXECUTIVE SUMMARY
    =================
    
    KEY FINDINGS:
    • Primary Revenue Driver: {insights['recommendations']['focus_segments']['primary_revenue_driver']}
    • Highest CLV Segment: {insights['recommendations']['focus_segments']['highest_lifetime_value']}
    • Premium Customer Group: {insights['recommendations']['focus_segments']['premium_customers']}
    • Most Engaged Segment: {insights['recommendations']['focus_segments']['most_engaged']}
    
    AGE GROUP PERFORMANCE ANALYSIS
    ==============================
    """
    
    # Add age group details
    for age_group, data in insights['age_performance'].items():
        report += f"""
    {age_group.upper()}:
       • Customers: {data['customers']:,} ({data['market_share']:.1f}% of total)
       • Average Age: {data['average_age']:.1f} years
       • Revenue: ${data['revenue']:,.0f}
       • Revenue per Customer: ${data['revenue_per_customer']:,.2f}
       • Average Order Value: ${data['avg_order_value']:.2f}
       • Purchase Frequency: {data['transactions_per_customer']:.1f} transactions/customer
       • Discount Sensitivity: {data['discount_sensitivity']:.1f}%
    """
    
    report += f"""
    
    CATEGORY PREFERENCES BY AGE GROUP
    =================================
    """
    
    # Add category preferences
    for age_group, prefs in insights['category_preferences'].items():
        report += f"""
    {age_group.upper()}:
       • Category Diversity: {prefs['category_diversity']} categories explored
       • Total Category Revenue: ${prefs['total_revenue']:,.0f}
       • Top 3 Categories:"""
        
        for i, cat in enumerate(prefs['top_categories'][:3], 1):
            report += f"""
         {i}. {cat['Category']}: ${cat['Revenue_USD']:,.0f}"""
    
    report += f"""
    
    STRATEGIC RECOMMENDATIONS
    =========================
    """
    
    for age_group, strategy in insights['recommendations']['strategies'].items():
        report += f"""
    • {age_group}: {strategy}
    """
    
    # Add opportunities section
    if insights['opportunities']['underperforming_segments']:
        report += f"""
    
    GROWTH OPPORTUNITIES
    ===================
    
    UNDERPERFORMING SEGMENTS (Revenue Enhancement Opportunities):
    """
        
        for segment in insights['opportunities']['underperforming_segments']:
            potential_revenue = segment['customers'] * segment['potential_uplift']
            report += f"""
    • {segment['age_group']}: {segment['customers']:,} customers
      Current RPC: ${segment['current_rpc']:.2f}
      Potential Uplift: ${segment['potential_uplift']:.2f}/customer
      Total Opportunity: ${potential_revenue:,.0f}
    """
    
    report += f"""
    
    END OF REPORT
    =============
    """
    
    return report

# Enhanced run function with insights
def run_complete_demographics_analysis():
    """Run complete demographics analysis with insights"""
    print("Starting Complete Customer Demographics Analysis...")
    
    # Run main analysis
    results = run_demographics_analysis()
    
    if results:
        print("\nGenerating Advanced Insights...")
        
        # Generate advanced insights
        insights = create_advanced_demographic_insights(results)
        
        # Generate comprehensive report
        report = generate_demographic_report(insights)
        
        # Save insights and report
        os.makedirs("demographics_analysis/reports", exist_ok=True)
        
        # Save report as text
        with open("demographics_analysis/reports/demographic_analysis_report.txt", "w", encoding='utf-8') as f:
            f.write(report)
        
        # Print the report
        print(report)
        
        print("\nADDITIONAL FILES CREATED:")
        print("   demographics_analysis/reports/demographic_analysis_report.txt")
        
        return results, insights
    
    else:
        return None, None

# Run the complete analysis
print("Executing Complete Demographics Analysis Pipeline...")
results, insights = run_complete_demographics_analysis()

if results and insights:
    print("\nDEMOGRAPHICS ANALYSIS PIPELINE COMPLETED SUCCESSFULLY!")
    print("\nANALYSIS OUTPUTS:")
    print("   Interactive Dashboard: demographics_analysis/demographics_analysis_dashboard.html")
    print("   Raw Data Files: demographics_analysis/data/")
    print("   Insights Report: demographics_analysis/reports/")
    print("   Strategic Recommendations: Available in report")
else:
    print("\nDemographics Analysis Pipeline failed!")
    print("Please check your data files and try again.")



Executing Complete Demographics Analysis Pipeline...
Starting Complete Customer Demographics Analysis...
Starting Customer Demographics Analysis...
Loading master data for demographics analysis...
Master data loaded: (6416827, 45)
Available columns: ['Invoice ID', 'Line', 'Customer ID', 'Product ID', 'Size', 'Color', 'Unit Price', 'Quantity', 'Date', 'Discount', 'Line Total', 'Store ID', 'Employee ID', 'Currency', 'Currency Symbol', 'SKU', 'Transaction Type', 'Payment Method', 'Invoice Total', 'Exchange_Rate_to_USD', 'Unit_Price_USD', 'Line_Total_USD', 'Invoice_Total_USD', 'Category', 'Sub Category', 'Description EN', 'Color_right', 'Sizes', 'Production Cost', 'Name', 'Email', 'City', 'Country', 'Gender', 'Date Of Birth', 'Job Title', 'Country_right', 'City_right', 'Store Name', 'Number of Employees', 'Store ID_right', 'Name_right', 'Position', 'Latitude', 'Longitude']
Available demographic columns: ['Date Of Birth', 'Gender']
Preparing demographics analysis data...
Sales data shape: (


`pl.count()` is deprecated. Please use `pl.len()` instead.
(Deprecated in version 0.20.5)



Creating category preferences by demographics...
Creating customer lifetime value by demographics...
Creating temporal demographics patterns...
Creating discount sensitivity by demographics...
Demographics analysis data prepared successfully!
Saving demographics analysis data files...
Demographics analysis data files saved!
Creating Demographics Analysis Dashboard...

CUSTOMER DEMOGRAPHICS ANALYSIS COMPLETE!
Total Customers Analyzed: 1,283,707
Total Revenue Analyzed: $305,884,837
Age Groups Analyzed: 6
Top Revenue Age Group: 18-24 ($115,682,536)
Highest AOV Age Group: 65+ ($52.97)
Highest CLV Age Group: 25-34 ($7,454.74)

Dashboard and Data Files Created:
   demographics_analysis/demographics_analysis_dashboard.html
   demographics_analysis/data/ (CSV files for detailed analysis)

Generating Advanced Insights...
Generating advanced demographic insights...
Generating demographic insights report...

    
    Customer Demographics Analysis Report
    
    EXECUTIVE SUMMARY
    
    KEY FI