In [4]:
import polars as pl
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
import os

def load_master_data():
    """Read the master transactions parquet file and keep only sale rows.

    Returns:
        A polars DataFrame with the rows of
        ``data/master_transactions.parquet`` whose ``Transaction Type``
        column equals ``"Sale"``.
    """
    print("üìä Loading Master Data...")
    is_sale = pl.col("Transaction Type") == "Sale"
    sales = pl.read_parquet("data/master_transactions.parquet").filter(is_sale)
    n_sales = sales.height
    print(f"‚úì Loaded {n_sales:,} sales transactions")
    return sales

def _country_price_elasticity(price_qty):
    """Estimate one country's price elasticity from a price/quantity frame.

    Rows are bucketed into price deciles; each bucket's mean price and total
    quantity form one observation of a log-log least-squares fit.

    Args:
        price_qty: pandas DataFrame with ``Unit_Price_USD`` and ``Quantity``
            columns.

    Returns:
        The absolute value of the fitted slope, rounded to 4 decimals (the
        sign of the slope is discarded), or ``None`` when there are fewer
        than 50 usable rows, fewer than 5 price buckets, or fewer than 3
        finite log observations.
    """
    # scipy is not part of the file-level imports, so it is imported here.
    from scipy import stats

    df = price_qty.dropna()
    # Logs are taken below, so only strictly positive values are usable.
    # .copy() makes the column assignment below act on an independent frame
    # instead of a filtered view of the caller's data.
    df = df[(df['Unit_Price_USD'] > 0) & (df['Quantity'] > 0)].copy()
    if len(df) < 50:
        return None

    # duplicates='drop' merges tied bin edges, so fewer than 10 buckets may
    # come back when many rows share the same price.
    df['Price_Decile'] = pd.qcut(df['Unit_Price_USD'], q=10, labels=False, duplicates='drop')
    price_demand = df.groupby('Price_Decile').agg({
        'Unit_Price_USD': 'mean',
        'Quantity': 'sum'
    }).reset_index()
    if len(price_demand) < 5:
        return None

    log_price = np.log(price_demand['Unit_Price_USD'])
    log_qty = np.log(price_demand['Quantity'])
    finite = np.isfinite(log_price) & np.isfinite(log_qty)
    if finite.sum() < 3:
        return None

    slope = stats.linregress(log_price[finite], log_qty[finite]).slope
    return round(abs(slope), 4)


def prepare_analysis_datasets(sales_data):
    """Derive the analysis datasets from the sales rows and save them as CSV.

    Files written under ``price_elasticity_analysis/`` (created if missing):

    * ``price_elasticity_data.csv`` - one ``Country`` / ``Price_Elasticity``
      row per country for which an elasticity could be estimated.
    * ``{employee,country,city,customer,product}_analysis_data.csv`` -
      summed ``Line_Total_USD`` (``Sales``) and ``Quantity`` (``Qty``) per
      group; the country file also carries ``Staff_Count`` (distinct
      ``Employee ID`` values).
    * ``multiple_regression_data.csv`` - the country dataset restricted to
      rows with ``Staff_Count > 0``.

    Args:
        sales_data: polars DataFrame providing the columns ``Country``,
            ``City``, ``Employee ID``, ``Customer ID``, ``Name``,
            ``Unit_Price_USD``, ``Quantity`` and ``Line_Total_USD``.

    Returns:
        None. All results are written to disk.
    """
    print("üìã Preparing Analysis Datasets...")

    # Create output directory
    os.makedirs("price_elasticity_analysis", exist_ok=True)

    # 1. Price Elasticity by Country
    print("  Computing price elasticity...")
    # Project once so each per-country filter scans three columns rather
    # than the full transaction frame.
    price_qty = sales_data.select(["Country", "Unit_Price_USD", "Quantity"])
    elasticity_results = []
    for country in price_qty["Country"].unique().to_list():
        country_df = (
            price_qty.filter(pl.col("Country") == country)
            .select(["Unit_Price_USD", "Quantity"])
            .to_pandas()
        )
        elasticity = _country_price_elasticity(country_df)
        if elasticity is not None:
            elasticity_results.append({
                'Country': country,
                'Price_Elasticity': elasticity
            })

    if elasticity_results:
        elasticity_df = pl.DataFrame(elasticity_results)
    else:
        # An empty list would give a frame without columns and therefore a
        # CSV without a header row, which cannot be read back later. Keep
        # the column names so downstream readers get an empty table instead.
        elasticity_df = pl.DataFrame(
            schema={"Country": pl.Utf8, "Price_Elasticity": pl.Float64}
        )
    elasticity_df.write_csv("price_elasticity_analysis/price_elasticity_data.csv")

    # 2. Sales vs Quantity Analysis Datasets
    totals = [
        pl.col("Line_Total_USD").sum().alias("Sales"),
        pl.col("Quantity").sum().alias("Qty"),
    ]
    has_sales = pl.col("Sales") > 0

    datasets = {
        'employee': sales_data.group_by("Employee ID").agg(totals).filter(has_sales),
        'country': sales_data.group_by("Country").agg(
            totals + [pl.col("Employee ID").n_unique().alias("Staff_Count")]
        ),
        'city': sales_data.group_by(["City", "Country"]).agg(totals),
        'customer': sales_data.group_by("Customer ID").agg(totals).filter(has_sales),
        'product': sales_data.group_by("Name").agg(totals).filter(has_sales),
    }

    # Save all datasets
    for name, data in datasets.items():
        data.write_csv(f"price_elasticity_analysis/{name}_analysis_data.csv")

    # 3. Multiple Regression Dataset (Country level)
    multiple_reg_data = datasets['country'].filter(pl.col("Staff_Count") > 0)
    multiple_reg_data.write_csv("price_elasticity_analysis/multiple_regression_data.csv")

    print(f"‚úì Prepared {len(datasets)} analysis datasets + elasticity + multiple regression")

def perform_regression_analysis():
    """Fit the Sales-vs-Qty regressions and write the OLS summary CSV.

    For each of the employee, country, city, customer and product datasets
    saved under ``price_elasticity_analysis/``, fits ``Sales ~ const + Qty``
    on the rows where both values are positive (datasets with fewer than 3
    such rows are skipped). Then fits ``Sales ~ const + Staff_Count + Qty``
    on ``multiple_regression_data.csv``. One formatted row per fitted model
    is written to ``price_elasticity_analysis/ols_summary.csv``.

    Returns:
        None. The summary is written to disk.
    """
    print("üìà Performing Regression Analysis...")

    summary_columns = [
        'Analysis_Type', 'Equation', 'R_Squared', 'P_Value',
        'Coefficient', 'Intercept', 'Data_Points',
    ]
    ols_summary_data = []

    for analysis_type in ('employee', 'country', 'city', 'customer', 'product'):
        # Load data
        df = pl.read_csv(f"price_elasticity_analysis/{analysis_type}_analysis_data.csv").to_pandas()
        df = df.dropna()
        df = df[(df['Sales'] > 0) & (df['Qty'] > 0)]

        if len(df) < 3:
            continue

        # OLS Regression: Sales = coefficient * Qty + intercept
        model = sm.OLS(df['Sales'].values, sm.add_constant(df['Qty'].values)).fit()
        # add_constant prepends the constant column, so index 0 is the
        # intercept and index 1 the Qty slope.
        intercept = model.params[0]
        coefficient = model.params[1]

        ols_summary_data.append({
            'Analysis_Type': analysis_type.title(),
            'Equation': f"Sales = {coefficient:.1f} √ó Qty + {intercept:.0f}",
            'R_Squared': f"{model.rsquared:.4f}",
            'P_Value': f"{model.pvalues[1]:.6f}",
            'Coefficient': f"{coefficient:.2f}",
            'Intercept': f"{intercept:.2f}",
            'Data_Points': len(df)
        })

    # Multiple Regression
    df_multi = pl.read_csv("price_elasticity_analysis/multiple_regression_data.csv").to_pandas().dropna()

    # Three parameters are estimated (constant, Staff_Count, Qty). With only
    # three rows there are no residual degrees of freedom, so the fit is
    # exact and its p-value undefined; require at least one spare row.
    if len(df_multi) >= 4 and 'Staff_Count' in df_multi.columns:
        X_multi = sm.add_constant(df_multi[['Staff_Count', 'Qty']].values)
        model_multi = sm.OLS(df_multi['Sales'].values, X_multi).fit()

        intercept_multi = model_multi.params[0]
        coef_staff = model_multi.params[1]
        coef_qty = model_multi.params[2]

        ols_summary_data.append({
            'Analysis_Type': 'Multiple Regression',
            'Equation': f"Sales = {coef_staff:.1f} √ó Staff + {coef_qty:.1f} √ó Qty + {intercept_multi:.0f}",
            'R_Squared': f"{model_multi.rsquared:.4f}",
            # Overall F-test p-value, not a per-coefficient one.
            'P_Value': f"{model_multi.f_pvalue:.6f}",
            'Coefficient': f"Staff:{coef_staff:.1f}, Qty:{coef_qty:.1f}",
            'Intercept': f"{intercept_multi:.2f}",
            'Data_Points': len(df_multi)
        })

    # Save OLS Summary
    if ols_summary_data:
        ols_summary_df = pl.DataFrame(ols_summary_data)
    else:
        # Without any fitted model the frame would have no columns and the
        # CSV no header row; keep the header so the file stays readable.
        ols_summary_df = pl.DataFrame(schema={name: pl.Utf8 for name in summary_columns})
    ols_summary_df.write_csv("price_elasticity_analysis/ols_summary.csv")

    print("‚úì Regression analysis completed")

def _sales_vs_quantity_figure(analysis_type):
    """Build the Sales-vs-Quantity scatter with its OLS line for one dataset.

    Reads ``price_elasticity_analysis/{analysis_type}_analysis_data.csv`` and
    keeps the rows where both ``Sales`` and ``Qty`` are positive.

    Returns:
        A plotly Figure, or ``None`` when fewer than 3 usable rows remain.
    """
    df = pl.read_csv(f"price_elasticity_analysis/{analysis_type}_analysis_data.csv").to_pandas()
    df = df.dropna()
    df = df[(df['Sales'] > 0) & (df['Qty'] > 0)]

    if len(df) < 3:
        return None

    # Calculate regression
    qty = df['Qty'].values
    sales = df['Sales'].values
    model = sm.OLS(sales, sm.add_constant(qty)).fit()
    intercept, slope = model.params[0], model.params[1]

    fig = go.Figure()

    # Add scatter points
    fig.add_trace(go.Scatter(
        x=df['Qty'],
        y=df['Sales'],
        mode='markers',
        name='Data Points',
        marker=dict(size=6, opacity=0.6)
    ))

    # Add regression line. A straight line is fully determined by its two
    # end points, so only those are serialised instead of one vertex per
    # data point.
    line_x = np.array([qty.min(), qty.max()])
    fig.add_trace(go.Scatter(
        x=line_x,
        y=intercept + slope * line_x,
        mode='lines',
        name='Regression Line',
        line=dict(color='red', width=2)
    ))

    # Add median lines (dashed, spanning the data range)
    sales_median = df['Sales'].median()
    qty_median = df['Qty'].median()
    fig.add_shape(
        type="line",
        x0=qty_median, y0=df['Sales'].min(),
        x1=qty_median, y1=df['Sales'].max(),
        line=dict(color="gray", width=1, dash="dash")
    )
    fig.add_shape(
        type="line",
        x0=df['Qty'].min(), y0=sales_median,
        x1=df['Qty'].max(), y1=sales_median,
        line=dict(color="gray", width=1, dash="dash")
    )

    # Add equation annotation in the top-left corner of the plot area
    equation = f"Sales = {slope:.1f} √ó Qty + {intercept:.0f}"
    r2 = model.rsquared
    fig.add_annotation(
        x=0.05, y=0.95,
        xref="paper", yref="paper",
        text=f"{equation}<br>R¬≤ = {r2:.3f}",
        showarrow=False,
        font=dict(size=12),
        bgcolor="rgba(255,255,255,0.8)",
        bordercolor="gray",
        borderwidth=1
    )

    fig.update_layout(
        title=f'{analysis_type.title()}: Sales vs Quantity',
        xaxis_title='Quantity',
        yaxis_title='Sales (USD)',
        height=400
    )
    return fig


def _multiple_regression_3d_figure():
    """Build the 3D Staff_Count / Qty / Sales scatter from the saved CSV.

    Returns:
        A plotly Figure, or ``None`` when the CSV holds fewer than 3 rows.
    """
    multiple_reg_data = pl.read_csv("price_elasticity_analysis/multiple_regression_data.csv").to_pandas()

    if len(multiple_reg_data) < 3:
        return None

    fig3d = go.Figure(data=[go.Scatter3d(
        x=multiple_reg_data['Staff_Count'],
        y=multiple_reg_data['Qty'],
        z=multiple_reg_data['Sales'],
        mode='markers+text',
        text=multiple_reg_data['Country'] if 'Country' in multiple_reg_data.columns else None,
        marker=dict(
            size=8,
            color=multiple_reg_data['Sales'],
            colorscale='Viridis',
            showscale=True
        )
    )])

    fig3d.update_layout(
        title='Multiple Regression: Sales = f(Staff Count, Quantity)',
        scene=dict(
            xaxis_title='Staff Count',
            yaxis_title='Quantity',
            zaxis_title='Sales (USD)'
        ),
        height=500
    )
    return fig3d


def create_dashboard():
    """Assemble all charts into one HTML dashboard and save it.

    Reads the CSV files under ``price_elasticity_analysis/`` and builds, in
    order: the price-elasticity bar chart, the OLS summary table, one
    Sales-vs-Quantity regression chart per analysis dataset, and a 3D
    multiple-regression scatter. A regression or 3D chart that cannot be
    built is reported on stdout and skipped. The result is written to
    ``price_elasticity_analysis/price_elasticity_dashboard.html``.

    Returns:
        The first figure (the elasticity bar chart), or ``None`` if no
        figure was created.

    Raises:
        Whatever ``pl.read_csv`` raises when the elasticity or OLS summary
        CSV is missing or unreadable; those two reads are not guarded.
    """
    print("üìä Creating Dashboard...")

    # Load data from CSV files
    elasticity_df = pl.read_csv("price_elasticity_analysis/price_elasticity_data.csv").to_pandas()
    ols_summary_df = pl.read_csv("price_elasticity_analysis/ols_summary.csv").to_pandas()

    figures = []

    # 1. Price Elasticity Chart
    fig1 = px.bar(
        elasticity_df,
        x='Country',
        y='Price_Elasticity',
        title='Price Elasticity by Country',
        text='Price_Elasticity'
    )
    fig1.update_traces(texttemplate='%{text:.4f}', textposition='outside')
    fig1.update_layout(height=400)
    figures.append(fig1)

    # 2. OLS Summary Table
    fig2 = go.Figure(data=[go.Table(
        header=dict(values=list(ols_summary_df.columns),
                    fill_color='lightblue',
                    align='center'),
        cells=dict(values=[ols_summary_df[col] for col in ols_summary_df.columns],
                   fill_color='white',
                   align='center')
    )])
    fig2.update_layout(title="OLS Summary Statistics", height=400)
    figures.append(fig2)

    # 3-7. Sales vs Quantity Regression Charts
    for analysis_type in ('employee', 'country', 'city', 'customer', 'product'):
        try:
            fig = _sales_vs_quantity_figure(analysis_type)
        except Exception as e:
            # Best effort: one broken dataset must not abort the dashboard.
            print(f"  ‚ö†Ô∏è Could not create chart for {analysis_type}: {e}")
            continue
        if fig is not None:
            figures.append(fig)

    # 8. Multiple Regression 3D
    try:
        fig3d = _multiple_regression_3d_figure()
    except Exception as e:
        # Still best effort, but say why the chart is missing instead of
        # dropping it silently.
        print(f"  ‚ö†Ô∏è Could not create 3D multiple regression chart: {e}")
    else:
        if fig3d is not None:
            figures.append(fig3d)

    # Combine all figures into one page. The charset is declared explicitly
    # because the file is written as UTF-8 and the chart text is not pure
    # ASCII.
    page_head = """
    <!DOCTYPE html>
    <html>
    <head>
        <meta charset="utf-8">
        <title>Price Elasticity & Regression Analysis Dashboard</title>
        <style>
            body { font-family: Arial, sans-serif; margin: 20px; }
            .chart-container { margin-bottom: 30px; }
            h1 { text-align: center; color: #2E86AB; }
        </style>
    </head>
    <body>
        <h1>Price Elasticity & Sales-Quantity Regression Analysis Dashboard</h1>
    """
    page_tail = """
    </body>
    </html>
    """

    # full_html=False yields only the chart <div>; the default would nest a
    # complete <html> document inside every container. plotly.js is embedded
    # with the first chart only and reused by the following ones.
    chart_divs = [
        '<div class="chart-container">'
        + fig.to_html(include_plotlyjs=(i == 0), full_html=False, div_id=f"chart_{i}")
        + '</div>'
        for i, fig in enumerate(figures)
    ]
    dashboard_html = page_head + "".join(chart_divs) + page_tail

    # Save dashboard
    with open("price_elasticity_analysis/price_elasticity_dashboard.html", "w", encoding='utf-8') as f:
        f.write(dashboard_html)

    print("‚úì Dashboard saved as: price_elasticity_dashboard.html")

    # Return first figure for display
    return figures[0] if figures else None

def main():
    """Run the full pipeline: load, prepare CSVs, regress, build dashboard.

    Each stage after loading communicates through the CSV files written by
    the previous one; the first dashboard figure is shown when one is
    returned.
    """
    print("üöÄ PRICE ELASTICITY & REGRESSION ANALYSIS")
    print("=" * 60)

    # Stage 1-2: load the sale rows and persist the analysis datasets.
    sales = load_master_data()
    prepare_analysis_datasets(sales)

    # Stage 3: regressions are computed from the saved CSV files.
    perform_regression_analysis()

    # Stage 4-5: build the HTML dashboard, then display its first chart.
    first_chart = create_dashboard()
    if first_chart:
        print("\nüìä Displaying Sample Chart...")
        first_chart.show()

    closing_lines = (
        "\nüéâ ANALYSIS COMPLETED!",
        "üìÅ Results saved in: price_elasticity_analysis/",
        "üìä Dashboard: price_elasticity_dashboard.html",
    )
    for line in closing_lines:
        print(line)


# Run the pipeline only when executed as a script / notebook cell.
if __name__ == "__main__":
    main()


üöÄ PRICE ELASTICITY & REGRESSION ANALYSIS
üìä Loading Master Data...
‚úì Loaded 6,077,200 sales transactions
üìã Preparing Analysis Datasets...
  Computing price elasticity...
‚úì Prepared 5 analysis datasets + elasticity + multiple regression
üìà Performing Regression Analysis...
‚úì Regression analysis completed
üìä Creating Dashboard...
‚úì Dashboard saved as: price_elasticity_dashboard.html

üìä Displaying Sample Chart...



üéâ ANALYSIS COMPLETED!
üìÅ Results saved in: price_elasticity_analysis/
üìä Dashboard: price_elasticity_dashboard.html


In [6]:
import polars as pl
import pandas as pd
import statsmodels.api as sm
import os

def load_master_data():
    """Return the sale rows of the master transactions parquet file.

    Returns:
        A polars DataFrame containing the rows of
        ``data/master_transactions.parquet`` whose ``Transaction Type``
        column equals ``"Sale"``.
    """
    print("üìä Loading Master Data...")
    transactions = pl.read_parquet("data/master_transactions.parquet")
    sales_only = transactions.filter(pl.col("Transaction Type") == "Sale")
    row_count, _ = sales_only.shape
    print(f"‚úì Loaded {row_count:,} sales transactions")
    return sales_only

def prepare_regression_data(sales_data):
    """Aggregate sales per store and save the regression input as CSV.

    Groups by ``Store ID`` and computes ``Total_Sales`` (sum of
    ``Line_Total_USD``), ``Staff_Count`` (distinct ``Employee ID`` values)
    and ``Total_Quantity`` (sum of ``Quantity``), keeping only stores where
    all three are positive. The result is written to
    ``ols_regression_results/regression_data.csv``.

    Args:
        sales_data: polars DataFrame with the columns named above.

    Returns:
        The filtered per-store polars DataFrame.
    """
    print("üìã Preparing Regression Dataset by Store ID...")

    # Make sure the output folder exists before anything is written.
    os.makedirs("ols_regression_results", exist_ok=True)

    store_metrics = [
        pl.col("Line_Total_USD").sum().alias("Total_Sales"),
        pl.col("Employee ID").n_unique().alias("Staff_Count"),
        pl.col("Quantity").sum().alias("Total_Quantity"),
    ]
    per_store = sales_data.group_by("Store ID").agg(store_metrics)

    # Keep only stores where every metric is strictly positive.
    all_positive = (
        (pl.col("Total_Sales") > 0)
        & (pl.col("Staff_Count") > 0)
        & (pl.col("Total_Quantity") > 0)
    )
    regression_data = per_store.filter(all_positive)

    regression_data.write_csv("ols_regression_results/regression_data.csv")

    print(f"‚úì Dataset prepared: {regression_data.height} stores")
    print("‚úì Variables: Total_Sales, Staff_Count, Total_Quantity")
    return regression_data

def run_ols_regression():
    """Fit Total_Sales on Staff_Count and Total_Quantity and save the results.

    Reads ``ols_regression_results/regression_data.csv``, drops rows with
    missing values, fits an OLS model with an added constant, prints the
    statsmodels summary and writes two files to ``ols_regression_results/``:

    * ``regression_summary.txt`` - the printed summary table.
    * ``coefficients_table.csv`` - coefficient, standard error, t statistic,
      p-value and confidence bounds (``conf_int()`` defaults) per term.

    Returns:
        The fitted statsmodels results object.
    """
    print("üìà Running OLS Regression...")

    # Load data
    df = pl.read_csv("ols_regression_results/regression_data.csv").to_pandas()
    df = df.dropna()

    print(f"‚úì Analysis dataset: {len(df)} stores")
    print(f"‚úì Variables: {list(df.columns)}")

    # Define variables
    Y = df['Total_Sales']  # Dependent variable
    X = df[['Staff_Count', 'Total_Quantity']]  # Independent variables

    # Add constant (intercept)
    X = sm.add_constant(X)

    # Fit OLS model
    model = sm.OLS(Y, X).fit()
    summary_text = str(model.summary())

    # Print the regression results table
    print("\n" + "="*80)
    print("OLS REGRESSION RESULTS")
    print("Total_Sales = Œ≤‚ÇÄ + Œ≤‚ÇÅ√óStaff_Count + Œ≤‚ÇÇ√óTotal_Quantity + Œµ")
    print("="*80)
    print(summary_text)

    # Save results to file. The equation line contains non-ASCII characters,
    # so the encoding is fixed to UTF-8 rather than left to the platform
    # default, which may be unable to encode them.
    with open("ols_regression_results/regression_summary.txt", "w", encoding="utf-8") as f:
        f.write("OLS REGRESSION RESULTS\n")
        f.write("Total_Sales = Œ≤‚ÇÄ + Œ≤‚ÇÅ√óStaff_Count + Œ≤‚ÇÇ√óTotal_Quantity + Œµ\n")
        f.write("="*80 + "\n")
        f.write(summary_text)

    # Save coefficients table. Term names come from the fitted model so the
    # label column always lines up with the value columns.
    conf_int = model.conf_int()
    results_df = pd.DataFrame({
        'Variable': list(model.params.index),
        'Coefficient': model.params.values,
        'Std_Error': model.bse.values,
        't_Statistic': model.tvalues.values,
        'P_Value': model.pvalues.values,
        'Conf_Int_Lower': conf_int[0].values,
        'Conf_Int_Upper': conf_int[1].values
    })

    results_df.to_csv("ols_regression_results/coefficients_table.csv", index=False)

    # Print summary statistics
    print("\nüìä DATASET SUMMARY:")
    print(f"   Stores analyzed: {len(df)}")
    print(f"   Total Sales range: ${df['Total_Sales'].min():,.0f} - ${df['Total_Sales'].max():,.0f}")
    print(f"   Staff Count range: {df['Staff_Count'].min()} - {df['Staff_Count'].max()}")
    print(f"   Total Quantity range: {df['Total_Quantity'].min():,.0f} - {df['Total_Quantity'].max():,.0f}")

    print("\n‚úì Results saved to: ols_regression_results/")
    print("‚úì Summary: regression_summary.txt")
    print("‚úì Coefficients: coefficients_table.csv")

    return model

def main():
    """Run the store-level OLS workflow: load, aggregate per store, regress."""
    banner = (
        "üöÄ OLS REGRESSION ANALYSIS BY STORE",
        "Model: Total_Sales = Œ≤‚ÇÄ + Œ≤‚ÇÅ√óStaff_Count + Œ≤‚ÇÇ√óTotal_Quantity + Œµ",
        "Aggregation Level: Store ID",
        "="*70,
    )
    for line in banner:
        print(line)

    # The per-store dataset is handed to the regression step through the CSV
    # written by prepare_regression_data, so the return values are not needed.
    sales = load_master_data()
    prepare_regression_data(sales)
    run_ols_regression()

    print("\nüéâ OLS REGRESSION COMPLETED!")


# Run the workflow only when executed as a script / notebook cell.
if __name__ == "__main__":
    main()


üöÄ OLS REGRESSION ANALYSIS BY STORE
Model: Total_Sales = Œ≤‚ÇÄ + Œ≤‚ÇÅ√óStaff_Count + Œ≤‚ÇÇ√óTotal_Quantity + Œµ
Aggregation Level: Store ID
üìä Loading Master Data...
‚úì Loaded 6,077,200 sales transactions
üìã Preparing Regression Dataset by Store ID...
‚úì Dataset prepared: 35 stores
‚úì Variables: Total_Sales, Staff_Count, Total_Quantity
üìà Running OLS Regression...
‚úì Analysis dataset: 35 stores
‚úì Variables: ['Store ID', 'Total_Sales', 'Staff_Count', 'Total_Quantity']

OLS REGRESSION RESULTS
Total_Sales = Œ≤‚ÇÄ + Œ≤‚ÇÅ√óStaff_Count + Œ≤‚ÇÇ√óTotal_Quantity + Œµ
                            OLS Regression Results                            
Dep. Variable:            Total_Sales   R-squared:                       0.990
Model:                            OLS   Adj. R-squared:                  0.989
Method:                 Least Squares   F-statistic:                     1560.
Date:                Sun, 03 Aug 2025   Prob (F-statistic):           1.27e-32
Time:                   