In [35]:
# master_data = pl.read_parquet("data/master_transactions.parquet")

In [37]:
# Number of distinct values in the "Employee ID" column
master_data['Employee ID'].n_unique()

264

In [None]:
import polars as pl
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np
import os

def load_workforce_data():
    """Read the master transactions table used by the workforce analysis.

    The coordinate-enriched parquet file is preferred when it exists on disk;
    otherwise the standard master transactions file is read.

    Returns:
        The DataFrame produced by ``pl.read_parquet``, or ``None`` when
        anything goes wrong (the error is printed instead of being raised).
    """
    print("Loading workforce data...")

    enriched_path = "data/master_transactions_with_coords.parquet"
    standard_path = "data/master_transactions.parquet"

    try:
        chosen_path = enriched_path if os.path.exists(enriched_path) else standard_path
        frame = pl.read_parquet(chosen_path)
        print(f"Master data loaded successfully: {frame.shape}")
    except Exception as e:
        # Deliberately best-effort: report the failure and signal it with None
        # so the caller can stop the pipeline cleanly.
        print(f"Error loading workforce data: {e}")
        return None

    return frame

def prepare_separate_datasets(master_data):
    """Build one aggregated table per dashboard figure and save each as CSV.

    Only rows whose "Transaction Type" equals "Sale" are used. Five tables are
    derived and written under ``workforce_analysis/data`` (the directory is
    created when missing):

    * store_performance     - one row per "Store ID", sorted by revenue
    * store_efficiency      - store_performance plus rank/percentile columns
    * employee_productivity - store_performance plus per-employee ratios
    * location_performance  - store_performance rolled up by Country/City
    * customer_efficiency   - a column subset of store_performance

    Args:
        master_data: polars DataFrame of transactions. The columns read here
            are "Transaction Type", "Store ID", "Store Name", "Country",
            "City", "Line_Total_USD", "Quantity", "Employee ID",
            "Customer ID" and "Product ID".

    Returns:
        dict mapping each dataset name listed above to its DataFrame.
    """
    print("Preparing separate datasets for each visualization...")

    # Everything downstream is computed from sale rows only
    is_sale = pl.col("Transaction Type") == "Sale"
    sales_data = master_data.filter(is_sale)
    print(f"Sales data shape: {sales_data.shape}")

    # Make sure the CSV target directory exists
    os.makedirs("workforce_analysis/data", exist_ok=True)

    # ------------------------------------------------------------------
    # 1. Store performance (Fig1 & Fig2)
    # ------------------------------------------------------------------
    print("Creating store performance dataset...")

    # A row is usable only if the store can be fully identified
    has_store_identity = (
        pl.col("Store ID").is_not_null()
        & pl.col("Store Name").is_not_null()
        & pl.col("Country").is_not_null()
        & pl.col("City").is_not_null()
    )

    line_total = pl.col("Line_Total_USD")
    per_store_aggregates = [
        pl.col("Store Name").first().alias("Store_Name"),
        pl.col("Country").first().alias("Country"),
        pl.col("City").first().alias("City"),
        line_total.sum().alias("Total_Revenue_USD"),
        pl.col("Quantity").sum().alias("Total_Quantity"),
        # Counted per Line_Total_USD entry, i.e. per sales row of the store
        line_total.count().alias("Total_Transactions"),
        pl.col("Employee ID").n_unique().alias("Actual_Employees"),
        pl.col("Customer ID").n_unique().alias("Unique_Customers"),
        pl.col("Product ID").n_unique().alias("Unique_Products_Sold"),
    ]

    revenue = pl.col("Total_Revenue_USD")
    transactions = pl.col("Total_Transactions")
    per_store_ratios = [
        (revenue / transactions).alias("Avg_Transaction_Value"),
        (revenue / pl.col("Unique_Customers")).alias("Revenue_Per_Customer"),
        (pl.col("Total_Quantity") / transactions).alias("Items_Per_Transaction"),
    ]

    store_performance = (
        sales_data
        .filter(has_store_identity)
        .group_by(["Store ID"])
        .agg(per_store_aggregates)
        .with_columns(per_store_ratios)
        .sort("Total_Revenue_USD", descending=True)
    )

    print(f"Store Performance Dataset: {store_performance.shape}")
    store_performance.write_csv("workforce_analysis/data/store_performance.csv")

    # ------------------------------------------------------------------
    # 2. Store efficiency (Fig3 - efficiency matrix)
    # ------------------------------------------------------------------
    print("Creating store efficiency dataset...")

    raw_ranks = [
        pl.col("Total_Revenue_USD").rank(method="average").alias("Revenue_Rank_Raw"),
        pl.col("Avg_Transaction_Value").rank(method="average").alias("Transaction_Value_Rank_Raw"),
        pl.col("Revenue_Per_Customer").rank(method="average").alias("Customer_Value_Rank_Raw"),
    ]
    # Each percentile is the raw rank divided by the largest rank in its column
    rank_to_percentile = (
        ("Revenue_Rank_Raw", "Revenue_Percentile"),
        ("Transaction_Value_Rank_Raw", "Transaction_Value_Percentile"),
        ("Customer_Value_Rank_Raw", "Customer_Value_Percentile"),
    )
    percentiles = [
        (pl.col(rank_name) / pl.col(rank_name).max()).alias(percentile_name)
        for rank_name, percentile_name in rank_to_percentile
    ]

    store_efficiency = store_performance.with_columns(raw_ranks).with_columns(percentiles)

    print(f"Store Efficiency Dataset: {store_efficiency.shape}")
    store_efficiency.write_csv("workforce_analysis/data/store_efficiency.csv")

    # ------------------------------------------------------------------
    # 3. Employee productivity (Fig4)
    # ------------------------------------------------------------------
    print("Creating employee productivity dataset...")

    headcount = pl.col("Actual_Employees")
    per_employee_metrics = (
        ("Total_Revenue_USD", "Revenue_Per_Employee"),
        ("Total_Transactions", "Transactions_Per_Employee"),
        ("Unique_Customers", "Customers_Per_Employee"),
    )
    employee_productivity = store_performance.with_columns([
        (pl.col(store_total) / headcount).alias(ratio_name)
        for store_total, ratio_name in per_employee_metrics
    ])

    print(f"Employee Productivity Dataset: {employee_productivity.shape}")
    employee_productivity.write_csv("workforce_analysis/data/employee_productivity.csv")

    # ------------------------------------------------------------------
    # 4. Location performance (Fig5)
    # ------------------------------------------------------------------
    print("Creating location performance dataset...")

    location_totals = [
        pl.col("Store ID").count().alias("Number_of_Stores"),
        pl.col("Total_Revenue_USD").sum().alias("Total_Location_Revenue_USD"),
        pl.col("Total_Transactions").sum().alias("Total_Location_Transactions"),
        # NOTE: this adds up per-store unique-customer counts, so a customer
        # who bought at two stores in the same city contributes twice.
        pl.col("Unique_Customers").sum().alias("Total_Location_Customers"),
    ]
    store_count = pl.col("Number_of_Stores")
    per_store_averages = (
        ("Total_Location_Revenue_USD", "Avg_Revenue_Per_Store"),
        ("Total_Location_Transactions", "Avg_Transactions_Per_Store"),
        ("Total_Location_Customers", "Avg_Customers_Per_Store"),
    )

    location_performance = (
        store_performance
        .group_by(["Country", "City"])
        .agg(location_totals)
        .with_columns([
            (pl.col(location_total) / store_count).alias(average_name)
            for location_total, average_name in per_store_averages
        ])
        .sort("Total_Location_Revenue_USD", descending=True)
    )
    location_performance.write_csv("workforce_analysis/data/location_performance.csv")

    # ------------------------------------------------------------------
    # 5. Customer efficiency (Fig6)
    # ------------------------------------------------------------------
    print("Creating customer efficiency dataset...")

    customer_columns = [
        "Store ID", "Store_Name", "Country", "City",
        "Unique_Customers", "Revenue_Per_Customer", "Total_Revenue_USD",
    ]
    customer_efficiency = store_performance.select(customer_columns)

    print(f"Customer Efficiency Dataset: {customer_efficiency.shape}")
    customer_efficiency.write_csv("workforce_analysis/data/customer_efficiency.csv")

    print("All datasets created and saved successfully!")

    return dict(
        store_performance=store_performance,
        store_efficiency=store_efficiency,
        employee_productivity=employee_productivity,
        location_performance=location_performance,
        customer_efficiency=customer_efficiency,
    )

def create_workforce_dashboard_from_csv():
    """Build the workforce dashboard HTML page from the saved CSV datasets.

    Reads the five CSV files under ``workforce_analysis/data`` (as written by
    ``prepare_separate_datasets``), computes headline KPIs, builds six Plotly
    figures and embeds them, together with a short insights box, into one
    self-contained HTML document. plotly.js itself is loaded from the CDN.

    Returns:
        str: the complete HTML document.

    Raises:
        FileNotFoundError: if one of the CSV files is missing
            (raised by ``pd.read_csv``).
        IndexError: if the store or location CSV has no rows
            (the insights box reads the first row of each).
    """
    # Local import keeps this block self-contained: the page must load the
    # plotly.js build that matches the installed Python plotly package.
    from plotly.offline import get_plotlyjs_version

    print("Creating dashboard from separate CSV datasets...")

    # Load each dataset separately
    store_perf_df = pd.read_csv("workforce_analysis/data/store_performance.csv")
    store_eff_df = pd.read_csv("workforce_analysis/data/store_efficiency.csv")
    employee_prod_df = pd.read_csv("workforce_analysis/data/employee_productivity.csv")
    location_perf_df = pd.read_csv("workforce_analysis/data/location_performance.csv")
    customer_eff_df = pd.read_csv("workforce_analysis/data/customer_efficiency.csv")

    print(f"Store Performance: {store_perf_df.shape}")
    print(f"Store Efficiency: {store_eff_df.shape}")
    print(f"Employee Productivity: {employee_prod_df.shape}")
    print(f"Location Performance: {location_perf_df.shape}")
    print(f"Customer Efficiency: {customer_eff_df.shape}")

    # "plotly-latest.min.js" is frozen at plotly.js v1.58.5 on the CDN, which
    # can mis-render figures serialized by a newer Python plotly. Pin the URL
    # to the plotly.js version bundled with the installed package instead.
    plotlyjs_url = f"https://cdn.plot.ly/plotly-{get_plotlyjs_version()}.min.js"

    # The page is assembled as a list of fragments and joined once at the end.
    # The head is a plain string (CSS braces would clash with f-string syntax),
    # so the script URL is substituted via an explicit placeholder.
    html_parts = ["""
    <!DOCTYPE html>
    <html>
    <head>
        <title>Workforce Optimization Dashboard</title>
        <script src="{plotlyjs_url}"></script>
        <style>
            body { font-family: Arial, sans-serif; margin: 20px; background-color: #f5f5f5; }
            .chart-container { background-color: white; margin: 20px 0; padding: 20px; border-radius: 10px; box-shadow: 0 2px 5px rgba(0,0,0,0.1); }
            .chart-title { font-size: 24px; font-weight: bold; text-align: center; margin-bottom: 20px; color: #333; }
            .dashboard-title { font-size: 36px; font-weight: bold; text-align: center; margin-bottom: 30px; color: #2c3e50; }
            .kpi-container { display: flex; justify-content: space-around; margin: 20px 0; }
            .kpi-box { background-color: #ffffff; padding: 20px; border-radius: 10px; text-align: center; box-shadow: 0 2px 5px rgba(0,0,0,0.1); min-width: 150px; }
            .kpi-value { font-size: 28px; font-weight: bold; color: #2c3e50; }
            .kpi-label { font-size: 14px; color: #7f8c8d; margin-top: 5px; }
            .insights-box { background-color: #e8f4f8; padding: 15px; margin: 10px 0; border-radius: 8px; font-size: 14px; }
        </style>
    </head>
    <body>
        <div class="dashboard-title">Workforce Optimization Dashboard</div>
    """.replace("{plotlyjs_url}", plotlyjs_url)]

    # Calculate KPIs
    total_stores = len(store_perf_df)
    total_revenue = store_perf_df['Total_Revenue_USD'].sum()
    avg_revenue_per_store = store_perf_df['Total_Revenue_USD'].mean()
    # Unweighted mean of the per-store averages (every store counts equally)
    avg_transaction_value = store_perf_df['Avg_Transaction_Value'].mean()
    # Sum of per-store distinct employee counts; an employee who appears at
    # more than one store is counted once per store.
    total_employees_est = employee_prod_df['Actual_Employees'].sum()

    html_parts.append(f"""
        <div class="kpi-container">
            <div class="kpi-box">
                <div class="kpi-value">{total_stores}</div>
                <div class="kpi-label">Total Stores</div>
            </div>
            <div class="kpi-box">
                <div class="kpi-value">${total_revenue:,.0f}</div>
                <div class="kpi-label">Total Revenue</div>
            </div>
            <div class="kpi-box">
                <div class="kpi-value">${avg_revenue_per_store:,.0f}</div>
                <div class="kpi-label">Avg Revenue/Store</div>
            </div>
            <div class="kpi-box">
                <div class="kpi-value">${avg_transaction_value:.2f}</div>
                <div class="kpi-label">Avg Transaction Value</div>
            </div>
            <div class="kpi-box">
                <div class="kpi-value">{total_employees_est}</div>
                <div class="kpi-label">Total Employees</div>
            </div>
        </div>
    """)

    # Figure 1: Store Performance Matrix
    # Uses the first 50 rows of the CSV, which prepare_separate_datasets
    # writes in descending-revenue order.
    fig1 = px.scatter(
        store_perf_df.head(50),
        x="Total_Revenue_USD",
        y="Avg_Transaction_Value",
        size="Total_Transactions",
        color="Country",
        hover_name="Store_Name",
        hover_data={
            "City": True,
            "Total_Transactions": ":,",
            "Unique_Customers": ":,",
            "Revenue_Per_Customer": ":,.2f"
        },
        title="Store Performance: Revenue vs Transaction Value",
        labels={
            "Total_Revenue_USD": "Total Revenue (USD)",
            "Avg_Transaction_Value": "Average Transaction Value (USD)"
        },
        height=600
    )
    fig1.update_layout(template="plotly_white")

    # Figure 2: Top 20 Stores (first 20 rows of the revenue-sorted CSV)
    fig2 = px.bar(
        store_perf_df.head(20),
        x="Store_Name",
        y="Total_Revenue_USD",
        color="Country",
        title="Top 20 Store Performance by Revenue",
        labels={"Total_Revenue_USD": "Total Revenue (USD)"},
        height=600
    )
    fig2.update_xaxes(tickangle=45)
    fig2.update_layout(template="plotly_white")

    # Figure 3: Store Efficiency Matrix
    fig3 = px.scatter(
        store_eff_df,
        x="Revenue_Percentile",
        y="Customer_Value_Percentile",
        size="Total_Revenue_USD",
        color="Transaction_Value_Percentile",
        hover_name="Store_Name",
        hover_data={
            "Country": True,
            "City": True,
            "Total_Revenue_USD": ":,.0f",
            "Revenue_Per_Customer": ":,.2f"
        },
        title="Store Efficiency Matrix",
        labels={
            "Revenue_Percentile": "Revenue Performance (Percentile)",
            "Customer_Value_Percentile": "Customer Value Performance (Percentile)",
            "Transaction_Value_Percentile": "Transaction Value"
        },
        color_continuous_scale="RdYlBu_r",
        height=600,
        size_max=20
    )

    fig3.update_traces(
        marker=dict(sizemin=5, sizemode='diameter', line=dict(width=1, color='white'))
    )
    # Split the matrix into four quadrants at the 0.5 percentile lines
    fig3.add_hline(y=0.5, line_dash="dash", line_color="gray", opacity=0.7)
    fig3.add_vline(x=0.5, line_dash="dash", line_color="gray", opacity=0.7)
    quadrant_labels = (
        (0.25, 0.75, "Low Revenue<br>High Customer Value"),
        (0.75, 0.75, "High Revenue<br>High Customer Value"),
        (0.25, 0.25, "Low Revenue<br>Low Customer Value"),
        (0.75, 0.25, "High Revenue<br>Low Customer Value"),
    )
    for label_x, label_y, label_text in quadrant_labels:
        fig3.add_annotation(x=label_x, y=label_y, text=label_text,
                            showarrow=False, font=dict(size=10, color="gray"))
    # The flat `titleside` colorbar property is deprecated and rejected by
    # newer plotly releases; the nested title.side form is accepted by both
    # old and new versions.
    fig3.update_layout(
        template="plotly_white",
        showlegend=True,
        coloraxis_colorbar=dict(
            title=dict(text="Transaction Value Percentile", side="right")
        ),
    )

    # Figure 4: Employee Productivity (first 30 rows of the revenue-sorted CSV)
    fig4 = px.scatter(
        employee_prod_df.head(30),
        x="Revenue_Per_Employee",
        y="Transactions_Per_Employee",
        size="Total_Revenue_USD",
        color="Country",
        hover_name="Store_Name",
        hover_data={
            "City": True,
            "Actual_Employees": True,
            "Customers_Per_Employee": ":,.0f"
        },
        title="Employee Productivity Analysis",
        labels={
            "Revenue_Per_Employee": "Revenue Per Employee (USD)",
            "Transactions_Per_Employee": "Transactions Per Employee"
        },
        height=600
    )
    fig4.update_layout(template="plotly_white")

    # Figure 5: Location Performance
    fig5 = px.treemap(
        location_perf_df,
        path=['Country', 'City'],
        values='Total_Location_Revenue_USD',
        color='Avg_Revenue_Per_Store',
        hover_data={
            'Number_of_Stores': ':,',
            'Total_Location_Revenue_USD': ':,.0f',
            'Avg_Revenue_Per_Store': ':,.0f'
        },
        title='Location Performance by Revenue',
        color_continuous_scale='Viridis',
        height=600
    )
    fig5.update_layout(template="plotly_white")

    # Figure 6: Customer Efficiency
    fig6 = px.scatter(
        customer_eff_df,
        x="Unique_Customers",
        y="Revenue_Per_Customer",
        size="Total_Revenue_USD",
        color="Country",
        hover_name="Store_Name",
        title="Customer Efficiency: Volume vs Value",
        labels={
            "Unique_Customers": "Number of Unique Customers",
            "Revenue_Per_Customer": "Revenue Per Customer (USD)"
        },
        height=600
    )
    fig6.update_layout(template="plotly_white")

    # Add one titled container per chart; the div ids (chart1..chart6) are
    # the targets of the Plotly.newPlot calls emitted further down.
    figures = [fig1, fig2, fig3, fig4, fig5, fig6]
    titles = [
        "Store Performance: Revenue vs Transaction Value",
        "Top 20 Store Performance by Revenue",
        "Store Efficiency Matrix",
        "Employee Productivity Analysis",
        "Location Performance by Revenue",
        "Customer Efficiency: Volume vs Value"
    ]

    for chart_number, title in enumerate(titles, start=1):
        html_parts.append(f"""
        <div class="chart-container">
            <div class="chart-title">{title}</div>
            <div id="chart{chart_number}"></div>
        </div>
        """)

    # Add insights. Row 0 is the top entry because both CSVs are written in
    # descending-revenue order by prepare_separate_datasets.
    best_store = store_perf_df.iloc[0]
    best_location = location_perf_df.iloc[0]
    # Unweighted mean of the per-store revenue-per-employee ratios
    avg_productivity = employee_prod_df['Revenue_Per_Employee'].mean()

    html_parts.append(f"""
        <div class="insights-box">
            <strong>Key Workforce Insights:</strong><br>
            • <strong>Top Store:</strong> {best_store['Store_Name']} ({best_store['City']}, {best_store['Country']}) - ${best_store['Total_Revenue_USD']:,.0f}<br>
            • <strong>Best Location:</strong> {best_location['City']}, {best_location['Country']} ({best_location['Number_of_Stores']} stores)<br>
            • <strong>Avg Employee Productivity:</strong> ${avg_productivity:,.0f} revenue per employee<br>
            • <strong>Revenue Range:</strong> ${store_perf_df['Total_Revenue_USD'].min():,.0f} - ${store_perf_df['Total_Revenue_USD'].max():,.0f}
        </div>
    """)

    # Emit one Plotly.newPlot call per figure, fed by the figure's JSON
    html_parts.append("<script>")

    for chart_number, fig in enumerate(figures, start=1):
        plot_json = fig.to_json()
        html_parts.append(f"""
        var plotData{chart_number} = {plot_json};
        Plotly.newPlot('chart{chart_number}', plotData{chart_number}.data, plotData{chart_number}.layout, {{responsive: true}});
        """)

    html_parts.append("""
    </script>
    </body>
    </html>
    """)

    return "".join(html_parts)

def run_workforce_analysis():
    """Drive the workforce analysis end to end.

    Loads the master data, derives and saves the per-figure CSV datasets,
    renders the dashboard from those CSVs and writes it to
    ``workforce_analysis/workforce_dashboard.html``.

    Returns:
        The dict of datasets produced by ``prepare_separate_datasets``, or
        ``None`` when the master data could not be loaded.
    """
    print("Starting Workforce Analysis...")

    # Step 1: load the source table; bail out early if that failed
    transactions = load_workforce_data()
    if transactions is None:
        return None

    # Step 2: aggregate and persist the per-figure datasets
    datasets = prepare_separate_datasets(transactions)

    # Steps 3 and 4: render the dashboard from the CSVs and write it to disk
    dashboard_html = create_workforce_dashboard_from_csv()
    with open("workforce_analysis/workforce_dashboard.html", "w", encoding='utf-8') as f:
        f.write(dashboard_html)

    summary_lines = (
        "\nWorkforce Analysis Complete!",
        "Files Created:",
        "   Interactive Dashboard: workforce_analysis/workforce_dashboard.html",
        "   Separate Data Files:",
        "     - store_performance.csv",
        "     - store_efficiency.csv",
        "     - employee_productivity.csv",
        "     - location_performance.csv",
        "     - customer_efficiency.csv",
    )
    for line in summary_lines:
        print(line)

    return datasets

# Execute the analysis when this cell runs. `results` holds the dict of
# datasets returned by run_workforce_analysis(), or None if loading failed.
results = run_workforce_analysis()


Starting Workforce Analysis...
Loading workforce data...
Master data loaded successfully: (6416827, 45)
Preparing separate datasets for each visualization...
Sales data shape: (6077200, 45)
Creating store performance dataset...
Store Performance Dataset: (35, 13)
Creating store efficiency dataset...
Store Efficiency Dataset: (35, 19)
Creating employee productivity dataset...
Employee Productivity Dataset: (35, 16)
Creating location performance dataset...
Creating customer efficiency dataset...
Customer Efficiency Dataset: (35, 7)
All datasets created and saved successfully!
Creating dashboard from separate CSV datasets...
Store Performance: (35, 13)
Store Efficiency: (35, 19)
Employee Productivity: (35, 16)
Location Performance: (35, 9)
Customer Efficiency: (35, 7)

Workforce Analysis Complete!
Files Created:
   Interactive Dashboard: workforce_analysis/workforce_dashboard.html
   Separate Data Files:
     - store_performance.csv
     - store_efficiency.csv
     - employee_productivity