In [1]:
import pandas as pd
import glob 
import time
import duckdb

In [2]:
conn = duckdb.connect()

### 1. Data Modelling

In [3]:
# Manually define the order of schema files
schema_files_order = [
    "dim_seller_locations.sql", 
    "dim_product_categories.sql", 
    "dim_payment_methods.sql", 
    "dim_geography.sql", 
    "dim_demographics.sql", 
    "dim_delivery_status.sql", 
    "dim_brands.sql", 
    "dim_customers.sql", 
    "dim_products.sql", 
    "dim_sellers.sql", 
    "sales_fact.sql"
]

schema_directory = "./schema/"

# Execute each SQL file in the correct order
for sql_file_name in schema_files_order:
    sql_file_path = schema_directory + sql_file_name
    with open(sql_file_path, 'r') as file:
        sql_query = file.read()
    conn.execute(sql_query)

In [4]:
# Define the directory where CSV data is stored
csv_directory = "./olap-table/"

# List of CSV file names corresponding to the schema
csv_files = [
    ("seller_locations.csv", "dim_seller_locations"),
    ("product_categories.csv", "dim_product_categories"),
    ("payment_methods.csv", "dim_payment_methods"),
    ("geography.csv", "dim_geography"),
    ("demographics.csv", "dim_demographics"),
    ("delivery_status.csv", "dim_delivery_status"),
    ("brands.csv", "dim_brands"),
    ("customers.csv", "dim_customers"),
    ("products.csv", "dim_products"),
    ("sellers.csv", "dim_sellers"),
    ("sales_fact.csv", "sales_fact"),
]

# Load data into DuckDB from CSV files
for csv_file, table_name in csv_files:
    csv_path = csv_directory + csv_file
    conn.execute(f"COPY {table_name} FROM '{csv_path}' (DELIMITER ',', HEADER TRUE);")

### 2. Configuring the OLAP Cube

- Pre-aggregating the data to mimic the behavior of an OLAP cube.
- Storing these aggregations in tables or materialized views for reusability.

In [5]:
# Define aggregation queries for OLAP functionality
olap_queries = [
    # Aggregate sales by product category and geography
    {
        "name": "sales_by_category_region",
        "query": """
            CREATE TABLE sales_by_category_region AS
            SELECT
                pc.Category AS product_category,
                geo.Country AS country,
                geo.State AS state,
                SUM(fact.Total_Amount) AS total_sales,
                COUNT(fact.Order_ID) AS transaction_count
            FROM sales_fact fact
            JOIN dim_products prod ON fact.Product_ID = prod.Product_ID
            JOIN dim_product_categories pc ON prod.Category_ID = pc.Category_ID
            JOIN dim_customers cust ON fact.Customer_ID = cust.Customer_ID
            JOIN dim_geography geo ON cust.Geography_ID = geo.Geography_ID
            GROUP BY ROLLUP (pc.Category, geo.Country, geo.State);
        """
    },
    # Aggregate sales by product and customer demographics
    {
        "name": "sales_by_product_customer",
        "query": """
            CREATE TABLE sales_by_product_customer AS
            SELECT
                prod.Product_Name AS product_name,
                demo.Age_Group AS customer_age_group,
                demo.Gender AS customer_gender,
                SUM(fact.Total_Amount) AS total_sales,
                AVG(fact.Total_Amount) AS average_order_value,
                COUNT(fact.Order_ID) AS purchase_frequency
            FROM sales_fact fact
            JOIN dim_products prod ON fact.Product_ID = prod.Product_ID
            JOIN dim_customers cust ON fact.Customer_ID = cust.Customer_ID
            JOIN dim_demographics demo ON cust.Demographics_ID = demo.Demographics_ID
            GROUP BY prod.Product_Name, demo.Age_Group, demo.Gender;
        """
    },
    # Time-based aggregation: Sales by year and month
    {
        "name": "sales_by_time",
        "query": """
            CREATE TABLE sales_by_time AS
            SELECT
                EXTRACT(YEAR FROM fact.Order_Date) AS year,
                EXTRACT(MONTH FROM fact.Order_Date) AS month,
                SUM(fact.Total_Amount) AS total_sales,
                COUNT(fact.Order_ID) AS transaction_count
            FROM sales_fact fact
            GROUP BY CUBE (EXTRACT(YEAR FROM fact.Order_Date), EXTRACT(MONTH FROM fact.Order_Date));
        """
    },
]

# Execute OLAP queries to create pre-aggregated tables
for olap in olap_queries:
    conn.execute(olap["query"])
    print(f"Created OLAP table: {olap['name']}")

Created OLAP table: sales_by_category_region
Created OLAP table: sales_by_product_customer
Created OLAP table: sales_by_time


### 3. Perform OLAP Operations <br/>

#### a) Roll-up (Summarize Data)

Explanation: <br />
The roll-up operation aggregates sales data by product category and country, summarizing total sales and transaction counts across different regions. <br />
Use Case: <br />
This helps managers understand the overall performance of different product categories across countries, enabling them to identify trends and make data-driven decisions on which product categories are performing well.

In [6]:
# Step 2: Define OLAP operations
rollup_query = """
SELECT 
    product_category,
    SUM(total_sales) AS total_sales,
    SUM(transaction_count) AS total_transactions,
FROM sales_by_category_region
WHERE country IS NULL AND state IS NULL
GROUP BY product_category
ORDER BY total_sales DESC;
"""
rollup_result = conn.execute(rollup_query).fetch_df()
print("Roll-up Result:")
rollup_result

Roll-up Result:


Unnamed: 0,product_category,total_sales,total_transactions
0,,147578200.0,50000.0
1,Fashion,25526500.0,8636.0
2,Beauty,25234980.0,8576.0
3,Electronics,25157360.0,8519.0
4,Groceries,24107360.0,8148.0
5,Books,23996280.0,8150.0
6,Home Appliances,23555750.0,7971.0


#### b) Drill-down (Detailed Data)

Explanation: <br />
Drill-down retrieves detailed sales data for the '18-25' age group, broken down by product name and gender, allowing for deeper analysis of specific customer segments. <br />

Use Case: <br />
Retailers can use this information to develop targeted marketing campaigns aimed at younger customers, promoting products that are popular within this demographic.

In [7]:
# Drill-down: Get detailed data
drilldown_query = """
SELECT 
    product_name,
    customer_age_group,
    customer_gender,
    total_sales
FROM sales_by_product_customer
WHERE customer_age_group = '18-25'
ORDER BY total_sales DESC;
"""

drilldown_result = conn.execute(drilldown_query).fetch_df()
print("Drill-down Result:")
drilldown_result

Drill-down Result:


Unnamed: 0,product_name,customer_age_group,customer_gender,total_sales
0,Product_8840,18-25,Female,23151.14
1,Product_9653,18-25,Male,21992.52
2,Product_2231,18-25,Other,20321.79
3,Product_9018,18-25,Other,20097.86
4,Product_7866,18-25,Male,19584.20
...,...,...,...,...
10593,Product_9013,18-25,Female,-141.92
10594,Product_6522,18-25,Other,-160.89
10595,Product_8488,18-25,Other,-161.11
10596,Product_4706,18-25,Female,-174.19


### c) Slice and Dice (Subset of Data)

Explanation: <br />
This query extracts sales data for a specific period (May to July 2023), allowing a focused analysis on seasonal trends. <br /> 
 
Use Case:<br />
Helps in evaluating the impact of mid-year promotions and understanding seasonal demand patterns.

In [8]:
# Slice and Dice: View specific subsets of data
slice_dice_query = """
SELECT 
    year,
    month,
    total_sales
FROM sales_by_time
WHERE year = 2023 AND month IN (5, 6, 7)  -- Slice for specific year and months
ORDER BY year, month;
"""

slice_dice_query_result = conn.execute(slice_dice_query).fetch_df()
print("Slide-and-Dice Result:")
slice_dice_query_result

Slide-and-Dice Result:


Unnamed: 0,year,month,total_sales
0,2023,5,12479284.45
1,2023,6,12456978.49
2,2023,7,12422850.89


#### Pivot (Reorient Data Dimensions)

Explanation: <br />
This pivot query reorganizes the data to show sales across different product categories for each country, allowing for easy comparison. <br />

Use Case: <br />
Retailers can quickly identify which product categories are most popular in different countries and allocate marketing budgets accordingly. <br />

In [9]:
# Slice and Dice: View specific subsets of data
slice_dice_query = """
SELECT country, 
       SUM(CASE WHEN product_category = 'Beauty' THEN total_sales ELSE 0 END) AS beauty_sales,
       SUM(CASE WHEN product_category = 'Books' THEN total_sales ELSE 0 END) AS books_sales,
       SUM(CASE WHEN product_category = 'Fashion' THEN total_sales ELSE 0 END) AS fashion_sales,
       SUM(CASE WHEN product_category = 'Electronics' THEN total_sales ELSE 0 END) AS electronics_sales,  
       SUM(CASE WHEN product_category = 'Groceries' THEN total_sales ELSE 0 END) AS groceries_sales,
       SUM(CASE WHEN product_category = 'Home Appliances' THEN total_sales ELSE 0 END) AS homeAppliances_sales 
FROM sales_by_category_region
GROUP BY country;
"""

slice_dice_query_result = conn.execute(slice_dice_query).fetch_df()
print("Slide-and-Dice Result:")
slice_dice_query_result

Slide-and-Dice Result:


Unnamed: 0,country,beauty_sales,books_sales,fashion_sales,electronics_sales,groceries_sales,homeAppliances_sales
0,,25234977.52,23996278.0,25526498.03,25157359.96,24107356.25,23555751.36
1,India,50469955.04,47992556.0,51052996.06,50314719.92,48214712.5,47111502.72


Explanation: <br />
This pivot query shows sales for each product broken down by customer age groups, enabling side-by-side comparison of product popularity across demographics. <br />

Use Case: <br />
Retailers can tailor product promotions based on the age group that shows the highest demand, maximizing sales for specific demographics. <br />

In [10]:
# Slice and Dice: View specific subsets of data
slice_dice_query = """
SELECT product_name, 
       SUM(CASE WHEN customer_age_group = '18-25' THEN total_sales ELSE 0 END) AS sales_18_25,
       SUM(CASE WHEN customer_age_group = '26-35' THEN total_sales ELSE 0 END) AS sales_26_35,
       SUM(CASE WHEN customer_age_group = '36-45' THEN total_sales ELSE 0 END) AS sales_36_45,
       SUM(CASE WHEN customer_age_group = '46-55' THEN total_sales ELSE 0 END) AS sales_46_55
FROM sales_by_product_customer
GROUP BY product_name;
"""

slice_dice_query_result = conn.execute(slice_dice_query).fetch_df()
print("Slide-and-Dice Result:")
slice_dice_query_result

Slide-and-Dice Result:


Unnamed: 0,product_name,sales_18_25,sales_26_35,sales_36_45,sales_46_55
0,Product_9527,1681.73,10578.62,11739.88,414.56
1,Product_8843,4127.25,0.00,671.59,3868.77
2,Product_2554,941.58,5139.04,7477.59,1755.70
3,Product_3808,962.01,7831.97,0.00,6327.18
4,Product_5774,834.73,18028.10,0.00,5847.94
...,...,...,...,...,...
9934,Product_3203,6801.10,2438.17,7165.27,5552.64
9935,Product_9667,0.00,5426.02,5613.21,2749.16
9936,Product_2760,0.00,10149.86,0.00,712.39
9937,Product_7791,2591.23,0.00,0.00,0.00


In [11]:
conn.close()