## Build Data Mart
#### Contributors:
##### Team 8: Anthony Ung, Sean Jerzewski, Gideon Kipkorir
##### Team 9: Rohith, Sneha Dasarla
##### Team 10: Anmol Brahmbhatt, Nikita Brahmbhatt, Satya

## 0. Dependencies and Global Variables

In [1]:
import os
from enum import Enum
import csv
import sqlite3 as lite
from decimal import Decimal
from datetime import date, datetime, timedelta

In [2]:
# Wall-clock timestamp taken when this cell is executed.
DATA_MART_START = datetime.now()

# Starts empty; database handles are stored here under string keys.
DB_HANDLES = dict()

#

## 1. Gather the file paths
  
  
## IMPORTANT: 
#### Most of these files are untracked on GitHub.  
####   &emsp; &emsp; It is each team member's individual responsibility  
####   &emsp; &emsp; to build the database and CSV files for themselves using the other Jupyter notebooks. 

In [3]:
# Input files this notebook expects to find, keyed by a short label.
FILE_PATHS = dict(
    DB_TEAM_8='./../0_SD_Team_8/store_team_8.db',
    DB_TEAM_9='./../0_SD_Team_9/grocery_store.db',
    DB_TEAM_10='./../0_SD_Team_10/grocery_team_10_v2.db',
    PRODUCTS_CSV='./../2_Product_Mapping/PRODUCTS_MAPPED.csv',
)

# Location of the data mart database file.
DATA_MART_PATH = './Region_C_Data_Mart.db'

In [4]:
# Report the presence of every required input file, then stop the notebook
# if at least one of them is missing.
ALL_FILES_OK = True

for file_key, file_name in FILE_PATHS.items():
    if os.path.isfile(file_name):
        status = 'OK'
    else:
        status = 'MISSING'
        ALL_FILES_OK = False
    print(f"{status} - {file_key} - '{file_name}'")

if not ALL_FILES_OK:
    raise SystemExit('\n'.join([
        '',
        'ERROR!',
        'You are missing files!',
        'Read and Follow the Cell instructions provided.',
    ]))

OK - DB_TEAM_8 - './../0_SD_Team_8/store_team_8.db'
OK - DB_TEAM_9 - './../0_SD_Team_9/grocery_store.db'
OK - DB_TEAM_10 - './../0_SD_Team_10/grocery_team_10_v2.db'
OK - PRODUCTS_CSV - './../2_Product_Mapping/PRODUCTS_MAPPED.csv'


#

## 2. Compile the table definitions
- Modified the product table to also hold the cost to the store to assist some computations
- If more tables need to be built, it is VITAL that the name of the table in the  
    &ensp; &ensp; CREATE TABLE statement is the same name as the dictionary's key.

In [5]:
# TABLE_DEFINITIONS is a dict as follows:
#     Key   - the name of the table in the database
#     Value - the CREATE TABLE statement for the table
# Several of these definitions are not used yet; they are kept because they
# will be useful in a later HW.
#
# Each statement is assembled from adjacent string literals inside
# parentheses, one column per line.
TABLE_DEFINITIONS = {
    'date': (
        'CREATE TABLE date('
        'DateKey INT, '
        'PrettyDate TEXT, '
        'DayNumberInMonth INT, '
        'DayNumberInYear INT, '
        'WeekNumberInYear INT, '
        'MonthNum INT, '
        'MonthTxt TEXT, '
        'Quarter INT, '
        'Year INT,'
        'FiscalYear INT, '
        'isHoliday INT, '
        'isWeekend INT, '
        'Season TEXT)'
    ),

    'product': (
        'CREATE TABLE product('
        'ProductKey INT,'
        'sku INT,'
        'product_name TEXT, '
        'product_class_id INT, '
        'subcategory TEXT, '
        'category TEXT, '
        'department TEXT, '
        'product_family TEXT, '
        'size TEXT, '
        'case_count INT, '
        'BrandName TEXT, '
        'Manufacturer TEXT, '
        'Supplier TEXT, '
        'CostToStore REAL)'
    ),

    'product_metadata': (
        'CREATE TABLE product_metadata('
        'ProductKey INT,'
        'sku INT,'
        'product_name TEXT, '
        'old_type TEXT, '
        'subcategory TEXT, '
        'category TEXT, '
        'department TEXT, '
        'product_family TEXT, '
        'meta_code INT,'
        'meta_mapped_by TEXT, '
        'meta_reason TEXT)'
    ),

    'store': (
        'CREATE TABLE store('
        'StoreKey INT, '
        'StoreManager TEXT, '
        'StoreStreetAddr TEXT, '
        'StoreTown TEXT, '
        'StoreZipCode TEXT, '
        'StorePhoneNumber TEXT, '
        'StoreState TEXT)'
    ),

    'sales_transactions': (
        'CREATE TABLE sales_transactions('
        'DateKey INT, '
        'DailyCustomerNumber INT, '
        'ProductKey INT, '
        'StoreKey INT, '
        'QuantitySold INT, '
        'TotalDollarSales REAL, '
        'TotalCostToStore REAL, '
        'GrossProfit REAL)'
    ),

    'sales_daily': (
        'CREATE TABLE sales_daily('
        'DateKey INT, '
        'ProductKey INT, '
        'StoreKey INT, '
        'QuantitySoldToday INT, '
        'CostOfItemsSold REAL, '
        'SalesTotal REAL, '
        'GrossProfit REAL)'
    ),

    'inventory_daily': (
        'CREATE TABLE inventory_daily('
        'DateKey INT, '
        'ProductKey INT, '
        'StoreKey INT, '
        'NumAvailable INT, '
        'CostToStoreItem FLOAT, '
        'CostToStore FLOAT, '
        'NumCasesPurchasedToDate INT)'
    ),

    'inventory_quarterly': (
        'CREATE TABLE inventory_quarterly('
        'ProductKey INT, '
        'StoreKey INT, '
        'QuarterAndYear TEXT, '
        'Quarter INT, '
        'Year INT, '
        'CasesPurchasedToDate INT, '
        'CasesPurchasedThisQuarter INT, '
        'CasesOnHand INT, '
        'TotalCostToStoreThisQuarter FLOAT, '
        'TotalSoldByStoreThisQuarter FLOAT, '
        'TotalCostToStoreThisYTD FLOAT, '
        'TotalSoldByStoreThisYTD FLOAT)'
    ),
}


#

## 3. Initialize the Database File and the Database API

I originally made this Database API back in HW 2.

#### Note: The first cell in this block is destructive.
#### If you need to see multiple versions of the database side-by-side, rename the db file before rerunning this notebook.

In [6]:
# Destructive step: delete any existing data mart file at DATA_MART_PATH.
# Only regular files are removed; if nothing is there, this is a no-op.
if os.path.isfile(DATA_MART_PATH):
    os.remove(DATA_MART_PATH)

In [7]:
# db_options is kept at module level: nesting it inside `db` produced an
# error saying that the class is undefined.
class db_options(Enum):
    """Flags that select what the `db.execute_*` methods do with query results."""
    DEFAULT = 0
    RETURN_RESULTS = 1
    PRINT_RESULTS = 2


class db:
    """Small convenience wrapper around one sqlite3 connection and its cursor."""

    def __init__(self, name):
        # Only the path is remembered here; nothing is opened until connect().
        self.name = f"{name}"

    def connect(self):
        """Open the database file and create a cursor on the new connection."""
        self.con = lite.connect(self.name)
        self.cur = self.con.cursor()

    def build_table(self, name):
        """Drop table `name` if it exists, then recreate it from TABLE_DEFINITIONS."""
        self.execute_sql(f'DROP TABLE IF EXISTS {name}')
        create_statement = TABLE_DEFINITIONS[name]
        self.execute_sql(create_statement)

    def _deliver(self, cursor, flags):
        # Shared tail of both execute methods: fetch and return the rows,
        # fetch and print them, or leave the executed cursor untouched.
        if flags & db_options.RETURN_RESULTS.value:
            return cursor.fetchall()
        if flags & db_options.PRINT_RESULTS.value:
            for row in cursor.fetchall():
                print(row)
        return None

    def execute_sql(self, sql, options=db_options.DEFAULT):
        """Run `sql`; return the rows only when RETURN_RESULTS is requested."""
        flags = options.value
        return self._deliver(self.cur.execute(sql), flags)

    def execute_sql_values(self, sql, values, options=db_options.DEFAULT):
        """Run `sql` with bound `values`; same result handling as execute_sql."""
        flags = options.value
        return self._deliver(self.cur.execute(sql, values), flags)

    def commit(self):
        """Commit the current transaction."""
        self.con.commit()

    def close(self):
        """Commit any pending work, then close the connection."""
        self.con.commit()
        self.con.close()

In [8]:
# One `db` wrapper per team database (paths come from FILE_PATHS), followed by
# the wrapper for the data mart itself. No connection is opened here.
DB_HANDLES.update({
    team_key: db(FILE_PATHS[team_key])
    for team_key in ('DB_TEAM_8', 'DB_TEAM_9', 'DB_TEAM_10')
})
DB_HANDLES['DATA_MART'] = db(DATA_MART_PATH)

#

## 4. Build the Dimension Tables

#### Product Dimension
The presence of the CSV generated by the script is checked earlier.

In [9]:
def build_product_table():
    """Rebuild the `product` and `product_metadata` tables from the products CSV.

    Both tables are dropped and recreated in the data mart, then one row per
    CSV record is inserted into each of them. Every product is given a case
    count of 12, and the CSV's `BasePrice` (with any '$' stripped) is stored
    as `CostToStore`, rounded to two decimal places.
    """
    db_handle = DB_HANDLES['DATA_MART']

    # newline='' hands newline handling to the csv module, as its
    # documentation requires for files passed to a reader.
    with open(FILE_PATHS['PRODUCTS_CSV'], 'r', newline='') as csvfile:
        db_handle.connect()

        db_handle.build_table('product')
        db_handle.build_table('product_metadata')

        for row in csv.DictReader(csvfile):
            product_key = row['product_id']
            sku = row['SKU']
            product_name = row['Product Name']
            product_class_id = row['product_class_id']
            product_subcategory = row['product_subcategory']
            product_category = row['product_category']
            product_department = row['product_department']
            product_family = row['product_family']
            size = row['Size']
            case_count = 12
            # NOTE(review): BrandName is filled from the subcategory column.
            # This looks like a placeholder or a copy-paste slip; confirm
            # whether the CSV has a dedicated brand column.
            brand_name = row['product_subcategory']
            manufacturer = row['Manufacturer']
            supplier = row['Supplier']
            cost_to_store = round(float(Decimal(row['BasePrice'].strip('$'))), 2)

            # Columns that only exist in product_metadata.
            old_type = row['itemType']
            meta_code = row['meta_code']
            meta_mapped_by = row['meta_mapped_by']
            meta_reason = row['meta_reason']

            db_handle.execute_sql_values(sql='insert into product values \
                                    (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)', \
                                 values=(product_key, sku, product_name, \
                                        product_class_id, product_subcategory, product_category, product_department, product_family, \
                                        size, case_count,
                                        brand_name, manufacturer, supplier, cost_to_store))

            db_handle.execute_sql_values(sql='insert into product_metadata values \
                                    (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)', \
                                    values=(product_key, sku, product_name, \
                                            old_type, product_subcategory, product_category, product_department, product_family, \
                                            meta_code, meta_mapped_by, meta_reason))

        print('Product and Product Metadata Tables Populated')
        db_handle.commit()
        db_handle.close()


build_product_table()

Product and Product Metadata Tables Populated


#### Store Dimension
Code originally written by Gideon Kipkorir

In [10]:
# One dict per store. The field names are listed once and zipped with each
# store's values, in the same order.
data = [
    dict(zip(
        (
            "StoreKey",
            "StoreManager",
            "StoreStreetAddr",
            "StoreTown",
            "StoreZipCode",
            "StorePhone#",
            "StoreState",
        ),
        store_record,
    ))
    for store_record in (
        (8, "Anthony-Sean-Gideon", "1180 Seven Seas Dr", "Orlando",
         "32836", "(407) 824-4500", "FL"),
        (9, "Rohith-Sneha", "201 Mullica Hill Road", "Glassboro",
         "08028", "(856) 424-2222 x2500", "NJ"),
        (10, "Anmol-Nikita-Satya", "620 Anthony Ung Drive", "Miami",
         "33130", "(856) 663-8006", "FL"),
    )
]

def build_store_dimension():
    """Rebuild the `store` table and insert one row for each entry in `data`."""
    mart = DB_HANDLES['DATA_MART']
    mart.connect()
    mart.build_table('store')

    insert_sql = 'insert into store values \
                                    (?, ?, ?, ?, ?, ?, ?)'
    # Dict keys listed in the column order of the `store` table.
    field_order = ('StoreKey', 'StoreManager', 'StoreStreetAddr', 'StoreTown',
                   'StoreZipCode', 'StorePhone#', 'StoreState')

    for store in data:
        record = tuple(store[field] for field in field_order)
        mart.execute_sql_values(sql=insert_sql, values=record)

    mart.commit()
    mart.close()
    print('Store Dimension Successfully Built')


build_store_dimension()

Store Dimension Successfully Built


#### Date Dimension
Logic originally written by Sean Jerzewski  
AU changed the dates of the equinoxes.

In [11]:
def build_date_dimension():
    """Rebuild the `date` table with one row for every day of 2024.

    DateKey and DayNumberInYear are both the 1-based day count from
    2024-01-01. The holiday and weekend flags are written as the strings
    'True' / 'False'.
    """
    mart = DB_HANDLES['DATA_MART']
    mart.connect()
    mart.build_table('date')

    first_day = date(2024, 1, 1)
    last_day = date(2024, 12, 31)
    total_days = (last_day - first_day).days + 1

    holidays = [
        "2024-01-01", "2024-01-15", "2024-02-19", "2024-03-29",
        "2024-05-27", "2024-06-21", "2024-07-04", "2024-09-02",
        "2024-10-14", "2024-11-05", "2024-11-11", "2024-11-28",
        "2024-12-25",
    ]

    # First day of each season; any date outside [spring, winter) is Winter.
    spring = date(2024, 3, 21)
    summer = date(2024, 6, 21)
    fall = date(2024, 9, 21)
    winter = date(2024, 12, 21)

    insert_sql = 'insert into date values \
                                    (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)'

    for day_number in range(1, total_days + 1):
        today = first_day + timedelta(days=day_number - 1)
        pretty_date = today.strftime('%Y-%m-%d')

        if today < spring or today >= winter:
            season = "Winter"
        elif today < summer:
            season = "Spring"
        elif today < fall:
            season = "Summer"
        else:
            season = "Fall"

        # NOTE(review): the `date` table declares isHoliday / isWeekend as
        # INT, but text flags are stored here. Confirm what the downstream
        # queries compare against before changing either side.
        is_holiday = 'True' if pretty_date in holidays else 'False'
        is_weekend = 'True' if today.weekday() >= 5 else 'False'

        record = (
            day_number,                       # DateKey
            pretty_date,                      # PrettyDate
            today.strftime('%d'),             # DayNumberInMonth
            day_number,                       # DayNumberInYear
            today.strftime('%W'),             # WeekNumberInYear
            today.strftime('%m'),             # MonthNum
            today.strftime('%B'),             # MonthTxt
            (today.month + 2) // 3,           # Quarter
            today.year,                       # Year
            2023 if today.month < 8 else 2024,  # FiscalYear
            is_holiday,
            is_weekend,
            season,
        )
        mart.execute_sql_values(sql=insert_sql, values=record)

    mart.commit()
    mart.close()
    print('Date Dimension Successfully Built')


build_date_dimension()

Date Dimension Successfully Built


#

## 5. Build the tables

#### I use my own Database API to build the table.

In [12]:
# Drop and recreate the four tables below (they start empty), in this order.
db_handle = DB_HANDLES['DATA_MART']
db_handle.connect()
for _table in ('sales_transactions', 'inventory_daily',
               'sales_daily', 'inventory_quarterly'):
    db_handle.build_table(_table)
db_handle.commit()
db_handle.close()


#### Build an auxiliary lookup table in memory
Given a fact table of size `m` and a dimension table of size `n`, I note the following about time and space complexity:
Joins are O(m*n) whereas one lookup per row is O(m). The space requirement changes from O(1) to O(n)

In [13]:
# Read the whole product dimension once and index it by SKU (as a string),
# so the ETL loops can resolve ProductKey and CostToStore with a dict lookup.
db_handle = DB_HANDLES['DATA_MART']
db_handle.connect()

sql = 'SELECT sku, ProductKey, CostToStore FROM product'
results = db_handle.execute_sql(sql, options=db_options.RETURN_RESULTS)

db_handle.close()

PRODUCTS_LOOKUP = {
    str(sku): {'ProductKey': product_key, 'CostToStore': cost_to_store}
    for sku, product_key, cost_to_store in results
}

#### Create Utility One-Line Functions
This was done to improve code readability.

In [14]:
def round_money(amount):
    """Return `amount` rounded to two decimal places."""
    decimal_places = 2
    return round(amount, decimal_places)
def get_product_cost(sku):
    """Return the `CostToStore` value stored in PRODUCTS_LOOKUP for `sku`."""
    product = PRODUCTS_LOOKUP[str(sku)]
    return product['CostToStore']
def get_case_count(qty):
    """Return how many 12-item cases are needed to hold `qty` items."""
    # Adding 11 before the floor division rounds any partial case up.
    padded_qty = qty + 11
    return padded_qty // 12

#

## 6. Team 8's ETL

#### I. Build the Data Structures Necessary to ETL from Team 8's Database

In [15]:
DATE_KEYS = {}

def build_data_structures_8():
    """Fill DATE_KEYS with 'YYYY-MM-DD' -> 1-based day number for all of 2024."""
    first_day = date(2024, 1, 1)
    day_count = (date(2024, 12, 31) - first_day).days + 1

    for offset in range(day_count):
        day = first_day + timedelta(days=offset)
        DATE_KEYS[day.strftime('%Y-%m-%d')] = offset + 1

#### II. Sales

In [16]:
def etl_team_8_sales():
    """Copy Team 8's sales into the data mart's `sales_transactions` table.

    The source rows are grouped by (date, customer_number, sku); every group
    becomes one row tagged with StoreKey 8. Inserts are committed after each
    1,000,000 rows and once more when the loop ends.
    """
    store_key = 8
    commit_every = 1000000

    source = DB_HANDLES['DB_TEAM_8']
    source.connect()
    target = DB_HANDLES['DATA_MART']
    target.connect()

    select_sql = ('SELECT date, sku, customer_number, COUNT(*), SUM(salesPrice )'
                  'FROM sales_transactions GROUP BY date, customer_number, sku')
    insert_sql = 'INSERT INTO sales_transactions VALUES (?, ?, ?, ?, ?, ?, ?, ?)'

    print(f'{datetime.now()} - Started Query')
    grouped_sales = source.execute_sql(select_sql, options=db_options.RETURN_RESULTS)

    print(f'{datetime.now()} - Started Insertions')
    num_records = 0
    for num_records, sale in enumerate(grouped_sales, start=1):
        sale_date, sku, customer_number, line_count, revenue = sale

        date_key = DATE_KEYS[sale_date]
        product_key = PRODUCTS_LOOKUP[str(sku)]['ProductKey']
        quantity_sold = round_money(line_count)
        total_sales = round_money(revenue)
        total_cost = round_money(line_count * get_product_cost(sku))
        gross_profit = round(total_sales - total_cost, 2)

        target.execute_sql_values(
            insert_sql,
            values=(date_key, customer_number, product_key, store_key,
                    quantity_sold, total_sales, total_cost, gross_profit),
        )

        if num_records % commit_every == 0:
            target.commit()
            print(f'{datetime.now()} - Committed record {num_records}')

    print(f'{datetime.now()} - Committed record {num_records}')
    target.commit()
    target.close()

    source.close()

#### III. Roll Sales Up

In [17]:
def etl_team_8_sales_daily():
    """Roll Team 8's sales up to one `sales_daily` row per (date, sku).

    Every row is tagged with StoreKey 8. Inserts are committed after each
    50,000 rows and once more when the loop ends.

    The INSERT names its target columns explicitly. `sales_daily` declares
    CostOfItemsSold before SalesTotal, so a positional insert of
    (..., sales, cost, ...) would store the two amounts in each other's
    column.
    """
    db_handle_old = DB_HANDLES['DB_TEAM_8']
    db_handle_old.connect()

    sql_retrieve = 'SELECT date, sku, COUNT(*), SUM(salesPrice )' \
                    'FROM sales_transactions ' \
                    'GROUP BY date, sku'

    db_handle_new = DB_HANDLES['DATA_MART']
    db_handle_new.connect()

    sql_insert = 'INSERT INTO sales_daily ' \
                 '(DateKey, ProductKey, StoreKey, QuantitySoldToday, ' \
                 'SalesTotal, CostOfItemsSold, GrossProfit) ' \
                 'VALUES (?, ?, ?, ?, ?, ?, ?)'

    num_records = 0

    print(f'{datetime.now()} - Started Query')
    results = db_handle_old.execute_sql(sql_retrieve, options=db_options.RETURN_RESULTS)

    print(f'{datetime.now()} - Started Insertions')
    for row in results:
        DateKey = DATE_KEYS[row[0]]
        ProductKey = PRODUCTS_LOOKUP[str(row[1])]['ProductKey']
        StoreKey = 8
        QuantitySold = row[2]
        TotalDollarSales = round_money(row[3])
        TotalCostToStore = round_money(row[2] * get_product_cost(row[1]))
        GrossProfit = round_money(TotalDollarSales - TotalCostToStore)

        # Order matches the column list in sql_insert: sales first, then cost.
        values = (DateKey, ProductKey, StoreKey,
                  QuantitySold, TotalDollarSales, TotalCostToStore, GrossProfit)

        num_records += 1
        db_handle_new.execute_sql_values(sql_insert, values=values)

        if num_records % 50000 == 0:
            db_handle_new.commit()
            print(f'{datetime.now()} - Committed record {num_records}')

    print(f'{datetime.now()} - Committed record {num_records}')
    db_handle_new.commit()
    db_handle_new.close()

    db_handle_old.close()

#### IV. Inventory

In [18]:
def etl_team_8_inventory():
    """Load Team 8's per-day inventory snapshot into `inventory_daily`.

    For every (date, sku) group the lowest `items_left` and the highest
    `cases_ordered` are read from the source. Rows are tagged with StoreKey 8
    and committed after each 100,000 inserts and once more at the end.
    """
    store_key = 8
    commit_every = 100000

    source = DB_HANDLES['DB_TEAM_8']
    source.connect()

    select_sql = ('SELECT sku, date, MIN(items_left), MAX(cases_ordered)'
                  'FROM sales_transactions '
                  'GROUP BY date, sku;')

    print(f'{datetime.now()} - Started Query')
    snapshots = source.execute_sql(select_sql, options=db_options.RETURN_RESULTS)
    print(f'{datetime.now()} - Started Insertions')

    target = DB_HANDLES['DATA_MART']
    target.connect()

    insert_sql = 'INSERT INTO inventory_daily VALUES (?, ?, ?, ?, ?, ?, ?)'

    num_records = 0
    for num_records, snapshot in enumerate(snapshots, start=1):
        sku, snapshot_date, items_left, cases_to_date = snapshot

        date_key = DATE_KEYS[snapshot_date]
        product_key = PRODUCTS_LOOKUP[str(sku)]['ProductKey']
        unit_cost = get_product_cost(sku)
        # Cost of the items on hand, then cost of the whole cases holding them.
        cost_of_items = round_money(items_left * unit_cost)
        cost_of_cases = round_money(12 * get_case_count(items_left) * unit_cost)

        target.execute_sql_values(
            insert_sql,
            values=(date_key, product_key, store_key, items_left,
                    cost_of_items, cost_of_cases, cases_to_date),
        )

        if num_records % commit_every == 0:
            target.commit()
            print(f'{datetime.now()} - Committed record {num_records}')

    print(f'{datetime.now()} - Committed record {num_records}')
    target.commit()
    target.close()

    source.close()


#### V. Run
Comment out the call to `run_8()` to verify the functionality for other ETLs.

In [19]:
def run_8():
    """Run Team 8's ETL steps in order: date keys, sales, daily roll-up, inventory."""
    steps = (
        build_data_structures_8,
        etl_team_8_sales,
        etl_team_8_sales_daily,
        etl_team_8_inventory,
    )
    for step in steps:
        step()


run_8()

2025-03-30 10:39:39.207816 - Started Query
2025-03-30 10:40:08.760618 - Started Insertions
2025-03-30 10:40:12.497608 - Committed record 1000000
2025-03-30 10:40:16.430363 - Committed record 2000000
2025-03-30 10:40:19.837188 - Committed record 3000000
2025-03-30 10:40:24.192999 - Committed record 4000000
2025-03-30 10:40:27.837626 - Committed record 5000000
2025-03-30 10:40:31.337902 - Committed record 6000000
2025-03-30 10:40:34.773815 - Committed record 7000000
2025-03-30 10:40:38.270479 - Committed record 8000000
2025-03-30 10:40:41.860063 - Committed record 9000000
2025-03-30 10:40:46.080682 - Committed record 10000000
2025-03-30 10:40:50.097034 - Committed record 11000000
2025-03-30 10:40:54.234691 - Committed record 12000000
2025-03-30 10:40:58.030055 - Committed record 13000000
2025-03-30 10:41:02.017002 - Committed record 14000000
2025-03-30 10:41:06.319127 - Committed record 15000000
2025-03-30 10:41:10.497112 - Committed record 16000000
2025-03-30 10:41:14.479395 - Committed

#

## 7. Team 9's ETL

#### I. Build the Data Structures Necessary to ETL from Team 9's Database

In [20]:
DATE_KEYS = {}

def build_data_structures_9():
    """Fill DATE_KEYS with 'YYYY-MM-DD' -> 1-based day number for all of 2024."""
    first_day = date(2024, 1, 1)
    day_count = (date(2024, 12, 31) - first_day).days + 1

    for offset in range(day_count):
        day = first_day + timedelta(days=offset)
        DATE_KEYS[day.strftime('%Y-%m-%d')] = offset + 1

#### II. Sales

In [21]:
def etl_team_9_sales():
    """Copy Team 9's sales into the data mart's `sales_transactions` table.

    The source rows are grouped by (date1, customerID, sku); every group
    becomes one row tagged with StoreKey 9. Inserts are committed after each
    1,000,000 rows and once more when the loop ends.

    TotalDollarSales is rounded to two decimals, matching the Team 8 and
    Team 10 loaders and Team 9's own daily roll-up, so the summed float is
    not stored with stray trailing digits.
    """
    db_handle_old = DB_HANDLES['DB_TEAM_9']
    db_handle_old.connect()

    sql_retrieve = 'SELECT date1, sku, customerID , COUNT(*), SUM(salePrice) ' \
                    'FROM transactions ' \
                    'GROUP BY date1, customerID , sku'

    db_handle_new = DB_HANDLES['DATA_MART']
    db_handle_new.connect()

    sql_insert = 'INSERT INTO sales_transactions VALUES (?, ?, ?, ?, ?, ?, ?, ?)'

    num_records = 0

    print(f'{datetime.now()} - Started Query')
    results = db_handle_old.execute_sql(sql_retrieve, options=db_options.RETURN_RESULTS)

    print(f'{datetime.now()} - Started Insertions')
    for row in results:
        DateKey = DATE_KEYS[row[0]]
        DailyCustomerNumber = row[2]
        ProductKey = PRODUCTS_LOOKUP[str(row[1])]['ProductKey']
        StoreKey = 9
        QuantitySold = row[3]
        # Rounded like every other sales loader in this notebook.
        TotalDollarSales = round_money(row[4])
        TotalCostToStore = round_money(row[3] * PRODUCTS_LOOKUP[str(row[1])]['CostToStore'])
        GrossProfit = round_money(TotalDollarSales - TotalCostToStore)

        values = (DateKey, DailyCustomerNumber, ProductKey, StoreKey,
                  QuantitySold, TotalDollarSales, TotalCostToStore, GrossProfit)

        num_records += 1
        db_handle_new.execute_sql_values(sql_insert, values=values)

        if num_records % 1000000 == 0:
            db_handle_new.commit()
            print(f'{datetime.now()} - Committed record {num_records}')

    print(f'{datetime.now()} - Committed record {num_records}')

    db_handle_new.commit()
    db_handle_new.close()

    db_handle_old.close()


#### III. Roll Sales Up

In [22]:
def etl_team_9_sales_daily():
    """Roll Team 9's sales up to one `sales_daily` row per (date1, sku).

    Every row is tagged with StoreKey 9. Inserts are committed after each
    50,000 rows and once more when the loop ends.

    The INSERT names its target columns explicitly. `sales_daily` declares
    CostOfItemsSold before SalesTotal, so a positional insert of
    (..., sales, cost, ...) would store the two amounts in each other's
    column.
    """
    db_handle_old = DB_HANDLES['DB_TEAM_9']
    db_handle_old.connect()

    sql_retrieve = 'SELECT date1, sku, COUNT(*), SUM(salePrice) ' \
                    'FROM transactions ' \
                    'GROUP BY date1, sku'

    db_handle_new = DB_HANDLES['DATA_MART']
    db_handle_new.connect()

    sql_insert = 'INSERT INTO sales_daily ' \
                 '(DateKey, ProductKey, StoreKey, QuantitySoldToday, ' \
                 'SalesTotal, CostOfItemsSold, GrossProfit) ' \
                 'VALUES (?, ?, ?, ?, ?, ?, ?)'

    num_records = 0

    print(f'{datetime.now()} - Started Query')
    results = db_handle_old.execute_sql(sql_retrieve, options=db_options.RETURN_RESULTS)

    print(f'{datetime.now()} - Started Insertions')
    for row in results:
        DateKey = DATE_KEYS[row[0]]
        ProductKey = PRODUCTS_LOOKUP[str(row[1])]['ProductKey']
        StoreKey = 9
        QuantitySold = row[2]
        TotalDollarSales = round_money(row[3])
        TotalCostToStore = round_money(row[2] * PRODUCTS_LOOKUP[str(row[1])]['CostToStore'])
        GrossProfit = round_money(TotalDollarSales - TotalCostToStore)

        # Order matches the column list in sql_insert: sales first, then cost.
        values = (DateKey, ProductKey, StoreKey,
                  QuantitySold, TotalDollarSales, TotalCostToStore, GrossProfit)

        num_records += 1
        db_handle_new.execute_sql_values(sql_insert, values=values)

        if num_records % 50000 == 0:
            db_handle_new.commit()
            print(f'{datetime.now()} - Committed record {num_records}')

    print(f'{datetime.now()} - Committed record {num_records}')
    db_handle_new.commit()
    db_handle_new.close()

    db_handle_old.close()

#### IV. Inventory

In [23]:
def etl_team_9_inventory():
    """Load Team 9's per-day inventory snapshot into `inventory_daily`.

    For every (date1, sku) group the lowest `itemsLeft` and the highest `co`
    are read from the source. Rows are tagged with StoreKey 9 and committed
    after each 100,000 inserts and once more at the end.
    """
    db_handle_old = DB_HANDLES['DB_TEAM_9']
    db_handle_old.connect()

    sql_retrieve = 'SELECT sku, date1, MIN(itemsLeft), MAX(co)' \
                    'FROM transactions ' \
                    'GROUP BY date1, sku;'

    print(f'{datetime.now()} - Started Query')
    results = db_handle_old.execute_sql(sql_retrieve, options=db_options.RETURN_RESULTS)
    print(f'{datetime.now()} - Started Insertions')

    db_handle_new = DB_HANDLES['DATA_MART']
    db_handle_new.connect()

    sql_insert = 'INSERT INTO inventory_daily VALUES (?, ?, ?, ?, ?, ?, ?)'

    num_records = 0
    for row in results:
        DateKey = DATE_KEYS[row[1]]
        ProductKey = PRODUCTS_LOOKUP[str(row[0])]['ProductKey']
        # Team 9's store. This was 8, which filed Team 9's inventory under
        # Team 8's store.
        StoreKey = 9
        NumAvailable = row[2]
        CostToStoreItem = round_money(row[2] * get_product_cost(row[0]))
        CostToStore = round_money(12 * get_case_count(row[2]) * get_product_cost(row[0]))
        NumCasesPurchasedToDate = row[3]

        values = (DateKey, ProductKey, StoreKey, NumAvailable,
                  CostToStoreItem, CostToStore, NumCasesPurchasedToDate)

        num_records += 1
        db_handle_new.execute_sql_values(sql_insert, values=values)

        if num_records % 100000 == 0:
            db_handle_new.commit()
            print(f'{datetime.now()} - Committed record {num_records}')

    print(f'{datetime.now()} - Committed record {num_records}')
    db_handle_new.commit()
    db_handle_new.close()

    db_handle_old.close()


In [24]:
def run_9():
    """Run Team 9's ETL steps in order: date keys, sales, daily roll-up, inventory."""
    steps = (
        build_data_structures_9,
        etl_team_9_sales,
        etl_team_9_sales_daily,
        etl_team_9_inventory,
    )
    for step in steps:
        step()


run_9()

2025-03-30 10:41:53.066435 - Started Query
2025-03-30 10:42:13.974345 - Started Insertions
2025-03-30 10:42:17.455074 - Committed record 1000000
2025-03-30 10:42:21.507888 - Committed record 2000000
2025-03-30 10:42:25.322084 - Committed record 3000000
2025-03-30 10:42:29.086200 - Committed record 4000000
2025-03-30 10:42:32.698338 - Committed record 5000000
2025-03-30 10:42:36.229669 - Committed record 6000000
2025-03-30 10:42:39.740935 - Committed record 7000000
2025-03-30 10:42:43.182545 - Committed record 8000000
2025-03-30 10:42:46.312600 - Committed record 9000000
2025-03-30 10:42:49.450930 - Committed record 10000000
2025-03-30 10:42:52.628669 - Committed record 11000000
2025-03-30 10:42:55.768726 - Committed record 12000000
2025-03-30 10:42:58.866413 - Committed record 13000000
2025-03-30 10:43:01.526314 - Committed record 13854769
2025-03-30 10:43:02.465879 - Started Query
2025-03-30 10:43:13.610801 - Started Insertions
2025-03-30 10:43:13.804134 - Committed record 50000
2025-

#

## 8. Team 10's ETL

#### I. Build the data structures necessary for Team 10's ETL

In [25]:
DATE_KEYS = {}

def build_data_structures_10():
    """Fill DATE_KEYS with 'YYYYMMDD' -> 1-based day number for all of 2024."""
    first_day = date(2024, 1, 1)
    day_count = (date(2024, 12, 31) - first_day).days + 1

    for offset in range(day_count):
        day = first_day + timedelta(days=offset)
        # Keys here carry no dashes, unlike the 'YYYY-MM-DD' form.
        DATE_KEYS[day.strftime('%Y%m%d')] = offset + 1

#### II. Sales

In [26]:
def etl_team_10_sales():
    """Copy Team 10's sales into the data mart's `sales_transactions` table.

    The source rows are grouped by (date, customer_number, sku); every group
    becomes one row tagged with StoreKey 10. Inserts are committed after each
    1,000,000 rows and once more when the loop ends.
    """
    store_key = 10
    commit_every = 1000000

    source = DB_HANDLES['DB_TEAM_10']
    source.connect()
    target = DB_HANDLES['DATA_MART']
    target.connect()

    select_sql = ('SELECT date, sku, customer_number, COUNT(*), SUM(salesPrice )'
                  'FROM sales_transactions GROUP BY date, customer_number, sku')
    insert_sql = 'INSERT INTO sales_transactions VALUES (?, ?, ?, ?, ?, ?, ?, ?)'

    print(f'{datetime.now()} - Started Query')
    grouped_sales = source.execute_sql(select_sql, options=db_options.RETURN_RESULTS)

    print(f'{datetime.now()} - Started Insertions')
    num_records = 0
    for num_records, sale in enumerate(grouped_sales, start=1):
        sale_date, sku, customer_number, line_count, revenue = sale

        date_key = DATE_KEYS[sale_date]
        product_key = PRODUCTS_LOOKUP[str(sku)]['ProductKey']
        quantity_sold = round_money(line_count)
        total_sales = round_money(revenue)
        total_cost = round_money(line_count * get_product_cost(sku))
        gross_profit = round(total_sales - total_cost, 2)

        target.execute_sql_values(
            insert_sql,
            values=(date_key, customer_number, product_key, store_key,
                    quantity_sold, total_sales, total_cost, gross_profit),
        )

        if num_records % commit_every == 0:
            target.commit()
            print(f'{datetime.now()} - Committed record {num_records}')

    print(f'{datetime.now()} - Committed record {num_records}')
    target.commit()
    target.close()

    source.close()

#### III. Roll Sales Up

In [27]:
def etl_team_10_sales_daily():
    """Roll Team 10's sales up to (date, sku) grain and load them into sales_daily.

    Reads the per-day, per-sku unit count and summed sales price from the
    source ``sales_transactions`` table and inserts one ``sales_daily`` row per
    group into the data mart with StoreKey 10. Commits every 50,000 rows and
    once more at the end, printing a timestamped progress line each time.
    """
    source_db = DB_HANDLES['DB_TEAM_10']
    source_db.connect()

    select_sql = (
        'SELECT date, sku, COUNT(*), SUM(salesPrice )'
        'FROM sales_transactions '
        'GROUP BY date, sku'
    )

    mart_db = DB_HANDLES['DATA_MART']
    mart_db.connect()

    insert_sql = 'INSERT INTO sales_daily VALUES (?, ?, ?, ?, ?, ?, ?)'

    print(f'{datetime.now()} - Started Query')
    grouped_rows = source_db.execute_sql(select_sql, options=db_options.RETURN_RESULTS)

    print(f'{datetime.now()} - Started Insertions')
    inserted = 0
    for inserted, (sale_date, sku, units_sold, dollars) in enumerate(grouped_rows, start=1):
        date_key = DATE_KEYS[sale_date]
        product_key = PRODUCTS_LOOKUP[str(sku)]['ProductKey']
        total_sales = round_money(dollars)
        total_cost = round_money((units_sold * get_product_cost(sku)))
        gross_profit = round_money((total_sales - total_cost))

        # Column order: DateKey, ProductKey, StoreKey, QuantitySold,
        # TotalDollarSales, TotalCostToStore, GrossProfit.
        mart_db.execute_sql_values(
            insert_sql,
            values=(date_key, product_key, 10,
                    units_sold, total_sales, total_cost, gross_profit),
        )

        if inserted % 50000 == 0:
            mart_db.commit()
            print(f'{datetime.now()} - Committed record {inserted}')

    print(f'{datetime.now()} - Committed record {inserted}')
    mart_db.commit()
    mart_db.close()

    source_db.close()

#### IV. Inventory

In [28]:
def etl_team_10_inventory():
    """Load Team 10's daily inventory facts into the data mart's inventory_daily table.

    For every (date, sku) group in the source ``sales_transactions`` table this
    takes MIN(items_left) and MAX(cases_ordered), derives the two cost columns
    via get_product_cost / get_case_count, and inserts one row with StoreKey 10.
    Commits every 100,000 rows and once more at the end.
    """
    source_db = DB_HANDLES['DB_TEAM_10']
    source_db.connect()

    select_sql = (
        'SELECT sku, date, MIN(items_left), MAX(cases_ordered)'
        'FROM sales_transactions '
        'GROUP BY date, sku;'
    )

    print(f'{datetime.now()} - Started Query')
    grouped_rows = source_db.execute_sql(select_sql, options=db_options.RETURN_RESULTS)
    print(f'{datetime.now()} - Started Insertions')

    mart_db = DB_HANDLES['DATA_MART']
    mart_db.connect()

    insert_sql = 'INSERT INTO inventory_daily VALUES (?, ?, ?, ?, ?, ?, ?)'

    inserted = 0
    for inserted, (sku, stock_date, items_left, cases_to_date) in enumerate(grouped_rows, start=1):
        date_key = DATE_KEYS[stock_date]
        product_key = PRODUCTS_LOOKUP[str(sku)]['ProductKey']
        # Cost of the individual items still on the shelf.
        item_cost = round_money((items_left*get_product_cost(sku)))
        # Cost expressed via get_case_count, at 12 items per case.
        case_cost = round_money(12*get_case_count(items_left)*get_product_cost(sku))

        # Column order: DateKey, ProductKey, StoreKey, NumAvailable,
        # CostToStoreItem, CostToStore, NumCasesPurchasedToDate.
        mart_db.execute_sql_values(
            insert_sql,
            values=(date_key, product_key, 10, items_left,
                    item_cost, case_cost, cases_to_date),
        )

        if inserted % 100000 == 0:
            mart_db.commit()
            print(f'{datetime.now()} - Committed record {inserted}')

    print(f'{datetime.now()} - Committed record {inserted}')
    mart_db.commit()
    mart_db.close()

    source_db.close()


In [29]:
def run_10():
    """Run the complete Team 10 load: the data-structure build, then the three ETL steps in order."""
    # Runs first: the ETL steps below index DATE_KEYS and PRODUCTS_LOOKUP,
    # which this call is presumably responsible for populating.
    build_data_structures_10()
    etl_team_10_sales()
    etl_team_10_sales_daily()
    etl_team_10_inventory()

# Executes the Team 10 pipeline as soon as this cell is run.
run_10()

2025-03-30 10:43:31.034852 - Started Query
2025-03-30 10:43:49.465554 - Started Insertions
2025-03-30 10:43:52.983311 - Committed record 1000000
2025-03-30 10:43:56.437251 - Committed record 2000000
2025-03-30 10:43:59.895285 - Committed record 3000000
2025-03-30 10:44:03.358557 - Committed record 4000000
2025-03-30 10:44:06.785438 - Committed record 5000000
2025-03-30 10:44:10.293627 - Committed record 6000000
2025-03-30 10:44:13.755413 - Committed record 7000000
2025-03-30 10:44:17.178091 - Committed record 8000000
2025-03-30 10:44:20.615837 - Committed record 9000000
2025-03-30 10:44:24.098771 - Committed record 10000000
2025-03-30 10:44:27.574938 - Committed record 11000000
2025-03-30 10:44:31.003418 - Committed record 12000000
2025-03-30 10:44:34.487752 - Committed record 13000000
2025-03-30 10:44:34.866649 - Committed record 13109316
2025-03-30 10:44:35.662577 - Started Query
2025-03-30 10:44:45.686457 - Started Insertions
2025-03-30 10:44:45.906347 - Committed record 50000
2025-

## 9 - Generate Quarterly Snapshots

Conceptual hurdles identified
1. We need to do aggregation by quarter, which suggests a JOIN between the `inventory_daily` and the `date` tables
2. We need the last inventory fact for each (Store, Date, Product) tuple  
    &emsp; &emsp; for each tuple's `CasesOnHand` and `CasesPurchasedToDate`  
    &emsp; &emsp; and these are non-additive.  
    &emsp; &emsp; Some (Store, Date, Product) keys may not have an Inventory fact associated with them  
    &emsp; &emsp; because they sold 0 and we need to LEFT-JOIN multiple tables  
    &emsp; &emsp; and do a full table scan of each table at least once for each missing (Store, Date, Product) tuple.  
3. We need to aggregate by quarter to generate the following:  
    &emsp; &emsp; (1) Total costs and Counts sold by the store in the current quarter  
    &emsp; &emsp; (2) Total costs and counts sold by the store YTD.  
    &emsp; &emsp; Generating (2) involves a self-JOIN on already-aggregated data (which warrants the use of a CTE)


In [30]:
def build_date_mapping_tables():
    """(Re)create a ``date`` -> ``quarter`` lookup table in each team's source database.

    Every day of 2024 is inserted with its calendar quarter (1-4). Teams 8 and
    9 receive the date formatted as 'YYYY-MM-DD'; Team 10 receives 'YYYYMMDD'.
    Any existing ``date`` table is dropped first, so the function can be re-run.
    """
    team_8_db = DB_HANDLES['DB_TEAM_8']
    team_9_db = DB_HANDLES['DB_TEAM_9']
    team_10_db = DB_HANDLES['DB_TEAM_10']
    source_dbs = [team_8_db, team_9_db, team_10_db]

    first_day = date(2024, 1, 1)
    last_day = date(2024, 12, 31)

    create_sql = '''
                            CREATE TABLE date(
                                date INT, 
                                quarter INT
                            )
                        '''
    insert_sql = 'INSERT INTO date VALUES (?, ?)'

    # Start from a clean table in every source database.
    for handle in source_dbs:
        handle.connect()
        handle.execute_sql('DROP TABLE IF EXISTS date')
        handle.execute_sql(create_sql)

    for offset in range((last_day - first_day).days + 1):
        day = first_day + timedelta(days=offset)
        quarter = (day.month + 2)//3  # months 1-3 -> 1, 4-6 -> 2, ...

        dashed_row = (day.strftime('%Y-%m-%d'), quarter)
        compact_row = (day.strftime('%Y%m%d'), quarter)

        team_8_db.execute_sql_values(insert_sql, dashed_row)
        team_9_db.execute_sql_values(insert_sql, dashed_row)
        team_10_db.execute_sql_values(insert_sql, compact_row)

    for handle in source_dbs:
        handle.commit()
        handle.close()

build_date_mapping_tables()

In [31]:
def etl_team_8_quarterly():
    """Build Team 8's quarterly inventory snapshots and insert them into inventory_quarterly.

    Two queries run against the Team 8 source database:
      1. per (sku, quarter): items_left, cases_ordered, and the previous
         quarter's cases_ordered (0 when there is none);
      2. per (sku, quarter): the number of sales rows in the quarter and the
         running total across quarters.
    The results are merged in memory under a 'ProductKey|Quarter' key and then
    written to the data mart with StoreKey 8 and Year 2024.
    """
    source_db = DB_HANDLES['DB_TEAM_8']
    source_db.connect()

    # The window functions order by date DESC, so the FIRST_VALUE columns are
    # intended to carry the values from the latest date in each quarter.
    inventory_sql =  '''
                        WITH quarterly_inventory AS (
                            SELECT sku, d.date, d.quarter, 
                                FIRST_VALUE(items_left) OVER 
                                    (PARTITION BY sku, quarter ORDER BY sku ASC, d.quarter ASC, d.date DESC, items_left ASC)
                                    AS items_left, 
                                FIRST_VALUE(cases_ordered) OVER 
                                    (PARTITION BY sku, quarter ORDER BY sku ASC, d.quarter ASC, d.date DESC, items_left ASC)
                                    AS cases_ordered
                            FROM sales_transactions AS st
                            JOIN date AS d USING (date)
                            GROUP BY sku, d.date
                            ORDER BY sku ASC, d.date DESC, items_left DESC
                        )
                        SELECT sku, quarter, items_left, cases_ordered, 
                                COALESCE(
                                    (LAG(cases_ordered, 1) OVER (PARTITION BY sku ORDER BY quarter ASC))
                                    ,0)
                        FROM quarterly_inventory
                        GROUP BY sku, quarter
                    '''

    print(f'{datetime.now()} - Started Query 1')
    inventory_rows = source_db.execute_sql(inventory_sql, options=db_options.RETURN_RESULTS)
    print(f'{datetime.now()} - Finished Query 1')

    snapshots = {}

    for sku, quarter, items_left, cases_to_date, cases_before_quarter in inventory_rows:
        product_key = PRODUCTS_LOOKUP[str(sku)]['ProductKey']

        # cases_ordered is cumulative, so this quarter's purchases are the
        # difference from the previous quarter's figure.
        cases_this_quarter = (cases_to_date - cases_before_quarter)
        cases_on_hand = (items_left // 12)

        cost_this_quarter = round_money(12*cases_this_quarter * get_product_cost(sku))
        cost_ytd = round_money(12*cases_to_date * get_product_cost(sku))

        snapshots[f'{product_key}|{quarter}'] = {
            'ProductKey': product_key,
            'StoreKey': 8,
            'QuarterAndYear': f'Q{quarter} 2024',
            'Quarter': quarter,
            'Year': 2024,
            'CasesPurchasedToDate': cases_to_date,
            'CasesPurchasedThisQuarter': cases_this_quarter,
            'CasesOnHand': cases_on_hand,
            'TotalCostToStoreThisQuarter': cost_this_quarter,
            'TotalCostToStoreThisYTD': cost_ytd
        }

    sales_sql =    '''
                            WITH quarterly_sales AS (
                                SELECT sku, quarter, COUNT(*) AS current_quarter_count
                                FROM sales_transactions
                                JOIN date USING (date)
                                GROUP BY sku, quarter
                                ORDER BY sku, quarter ASC
                            )
                            SELECT sku, quarter, current_quarter_count,
                                SUM(current_quarter_count) 
                                    OVER (PARTITION BY sku ORDER BY quarter ASC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS ytd_sales
                            FROM quarterly_sales
                            ORDER BY sku, quarter;
                        '''

    print(f'{datetime.now()} - Started Query 2')
    sales_rows = source_db.execute_sql(sales_sql, options=db_options.RETURN_RESULTS)
    print(f'{datetime.now()} - Finished Query 2')

    # Attach the sales counts to the snapshot built from the first query.
    for sku, quarter, sold_this_quarter, sold_ytd in sales_rows:
        product_key = PRODUCTS_LOOKUP[str(sku)]['ProductKey']
        snapshot = snapshots[f'{product_key}|{quarter}']
        snapshot['TotalSoldByStoreThisQuarter'] = sold_this_quarter
        snapshot['TotalSoldByStoreThisYTD'] = sold_ytd

    mart_db = DB_HANDLES['DATA_MART']
    mart_db.connect()

    insert_sql = 'INSERT INTO inventory_quarterly VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)'
    # Order in which the snapshot fields are bound to the INSERT placeholders.
    column_order = (
        'ProductKey',
        'StoreKey',
        'QuarterAndYear',
        'Quarter',
        'Year',
        'CasesPurchasedToDate',
        'CasesPurchasedThisQuarter',
        'CasesOnHand',
        'TotalCostToStoreThisQuarter',
        'TotalSoldByStoreThisQuarter',
        'TotalCostToStoreThisYTD',
        'TotalSoldByStoreThisYTD',
    )

    inserted = 0
    for snapshot in snapshots.values():
        values = tuple(snapshot[column] for column in column_order)
        inserted += 1
        mart_db.execute_sql_values(insert_sql, values=values)

    print(f'{datetime.now()} - Committed record {inserted}')
    mart_db.commit()
    mart_db.close()

    source_db.close()

build_data_structures_8()
etl_team_8_quarterly()


2025-03-30 10:45:01.910482 - Started Query 1
2025-03-30 10:45:21.417070 - Finished Query 1
2025-03-30 10:45:21.433723 - Started Query 2
2025-03-30 10:45:34.183098 - Finished Query 2
2025-03-30 10:45:34.202216 - Committed record 8300


In [32]:
def etl_team_9_quarterly():
    """Build Team 9's quarterly inventory snapshots and insert them into inventory_quarterly.

    Two queries run against the Team 9 source database (table ``transactions``,
    joined to ``date`` on ``date1``):
      1. per (sku, quarter): items left, cases ordered, and the previous
         quarter's cases ordered (0 when there is none);
      2. per (sku, quarter): the number of transaction rows in the quarter and
         the running total across quarters.
    The results are merged in memory under a 'ProductKey|Quarter' key and then
    written to the data mart with StoreKey 9 and Year 2024.
    """
    source_db = DB_HANDLES['DB_TEAM_9']
    source_db.connect()

    # The window functions order by date DESC, so the FIRST_VALUE columns are
    # intended to carry the values from the latest date in each quarter.
    inventory_sql =  '''
                        WITH quarterly_inventory AS (
                            SELECT sku, d.date, d.quarter, 
                                FIRST_VALUE(itemsLeft) OVER 
                                    (PARTITION BY sku, quarter ORDER BY sku ASC, d.quarter ASC, d.date DESC, itemsLeft ASC)
                                    AS items_left, 
                                FIRST_VALUE(co) OVER 
                                    (PARTITION BY sku, quarter ORDER BY sku ASC, d.quarter ASC, d.date DESC, itemsLeft ASC)
                                    AS cases_ordered
                            FROM transactions AS t
                            JOIN date AS d ON t.date1 = d.date
                            GROUP BY sku, d.date
                            ORDER BY sku ASC, d.date DESC, itemsLeft DESC
                        )
                        SELECT sku, quarter, items_left, cases_ordered, 
                                COALESCE(
                                    (LAG(cases_ordered, 1) OVER (PARTITION BY sku ORDER BY quarter ASC))
                                    ,0)
                        FROM quarterly_inventory
                        GROUP BY sku, quarter
                    '''

    print(f'{datetime.now()} - Started Query 1')
    inventory_rows = source_db.execute_sql(inventory_sql, options=db_options.RETURN_RESULTS)
    print(f'{datetime.now()} - Finished Query 1')

    snapshots = {}

    for sku, quarter, items_left, cases_to_date, cases_before_quarter in inventory_rows:
        product_key = PRODUCTS_LOOKUP[str(sku)]['ProductKey']

        # cases ordered is cumulative, so this quarter's purchases are the
        # difference from the previous quarter's figure.
        cases_this_quarter = (cases_to_date - cases_before_quarter)
        cases_on_hand = (items_left // 12)

        cost_this_quarter = round_money(12*cases_this_quarter * get_product_cost(sku))
        cost_ytd = round_money(12*cases_to_date * get_product_cost(sku))

        snapshots[f'{product_key}|{quarter}'] = {
            'ProductKey': product_key,
            'StoreKey': 9,
            'QuarterAndYear': f'Q{quarter} 2024',
            'Quarter': quarter,
            'Year': 2024,
            'CasesPurchasedToDate': cases_to_date,
            'CasesPurchasedThisQuarter': cases_this_quarter,
            'CasesOnHand': cases_on_hand,
            'TotalCostToStoreThisQuarter': cost_this_quarter,
            'TotalCostToStoreThisYTD': cost_ytd
        }

    sales_sql =    '''
                            WITH quarterly_sales AS (
                                SELECT sku, quarter, COUNT(*) AS current_quarter_count
                                FROM transactions AS t
                                JOIN date AS d ON t.date1 = d.date
                                GROUP BY sku, quarter
                                ORDER BY sku, quarter ASC
                            )
                            SELECT sku, quarter, current_quarter_count,
                                SUM(current_quarter_count) 
                                    OVER (PARTITION BY sku ORDER BY quarter ASC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS ytd_sales
                            FROM quarterly_sales
                            ORDER BY sku, quarter;
                        '''

    print(f'{datetime.now()} - Started Query 2')
    sales_rows = source_db.execute_sql(sales_sql, options=db_options.RETURN_RESULTS)
    print(f'{datetime.now()} - Finished Query 2')

    # Attach the sales counts to the snapshot built from the first query.
    for sku, quarter, sold_this_quarter, sold_ytd in sales_rows:
        product_key = PRODUCTS_LOOKUP[str(sku)]['ProductKey']
        snapshot = snapshots[f'{product_key}|{quarter}']
        snapshot['TotalSoldByStoreThisQuarter'] = sold_this_quarter
        snapshot['TotalSoldByStoreThisYTD'] = sold_ytd

    mart_db = DB_HANDLES['DATA_MART']
    mart_db.connect()

    insert_sql = 'INSERT INTO inventory_quarterly VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)'
    # Order in which the snapshot fields are bound to the INSERT placeholders.
    column_order = (
        'ProductKey',
        'StoreKey',
        'QuarterAndYear',
        'Quarter',
        'Year',
        'CasesPurchasedToDate',
        'CasesPurchasedThisQuarter',
        'CasesOnHand',
        'TotalCostToStoreThisQuarter',
        'TotalSoldByStoreThisQuarter',
        'TotalCostToStoreThisYTD',
        'TotalSoldByStoreThisYTD',
    )

    inserted = 0
    for snapshot in snapshots.values():
        values = tuple(snapshot[column] for column in column_order)
        inserted += 1
        mart_db.execute_sql_values(insert_sql, values=values)

    print(f'{datetime.now()} - Committed record {inserted}')
    mart_db.commit()
    mart_db.close()

    source_db.close()

build_data_structures_9()
etl_team_9_quarterly()


2025-03-30 10:45:34.229915 - Started Query 1
2025-03-30 10:45:49.745122 - Finished Query 1
2025-03-30 10:45:49.764548 - Started Query 2
2025-03-30 10:45:59.529521 - Finished Query 2
2025-03-30 10:45:59.559205 - Committed record 8300


In [33]:
def etl_team_10_quarterly():
    """Build Team 10's quarterly inventory snapshots and insert them into inventory_quarterly.

    Two queries run against the Team 10 source database:
      1. per (sku, quarter): items_left, cases_ordered, and the previous
         quarter's cases_ordered (0 when there is none);
      2. per (sku, quarter): the number of sales rows in the quarter and the
         running total across quarters.
    The results are merged in memory under a 'ProductKey|Quarter' key and then
    written to the data mart with StoreKey 10 and Year 2024.
    """
    source_db = DB_HANDLES['DB_TEAM_10']
    source_db.connect()

    # The window functions order by date DESC, so the FIRST_VALUE columns are
    # intended to carry the values from the latest date in each quarter.
    inventory_sql =  '''
                        WITH quarterly_inventory AS (
                            SELECT sku, d.date, d.quarter, 
                                FIRST_VALUE(items_left) OVER 
                                    (PARTITION BY sku, quarter ORDER BY sku ASC, d.quarter ASC, d.date DESC, items_left ASC)
                                    AS items_left, 
                                FIRST_VALUE(cases_ordered) OVER 
                                    (PARTITION BY sku, quarter ORDER BY sku ASC, d.quarter ASC, d.date DESC, items_left ASC)
                                    AS cases_ordered
                            FROM sales_transactions AS st
                            JOIN date AS d USING (date)
                            GROUP BY sku, d.date
                            ORDER BY sku ASC, d.date DESC, items_left DESC
                        )
                        SELECT sku, quarter, items_left, cases_ordered, 
                                COALESCE(
                                    (LAG(cases_ordered, 1) OVER (PARTITION BY sku ORDER BY quarter ASC))
                                    ,0)
                        FROM quarterly_inventory
                        GROUP BY sku, quarter
                    '''

    print(f'{datetime.now()} - Started Query 1')
    inventory_rows = source_db.execute_sql(inventory_sql, options=db_options.RETURN_RESULTS)
    print(f'{datetime.now()} - Finished Query 1')

    snapshots = {}

    for sku, quarter, items_left, cases_to_date, cases_before_quarter in inventory_rows:
        product_key = PRODUCTS_LOOKUP[str(sku)]['ProductKey']

        # cases_ordered is cumulative, so this quarter's purchases are the
        # difference from the previous quarter's figure.
        cases_this_quarter = (cases_to_date - cases_before_quarter)
        cases_on_hand = (items_left // 12)

        cost_this_quarter = round_money(12*cases_this_quarter * get_product_cost(sku))
        cost_ytd = round_money(12*cases_to_date * get_product_cost(sku))

        snapshots[f'{product_key}|{quarter}'] = {
            'ProductKey': product_key,
            'StoreKey': 10,
            'QuarterAndYear': f'Q{quarter} 2024',
            'Quarter': quarter,
            'Year': 2024,
            'CasesPurchasedToDate': cases_to_date,
            'CasesPurchasedThisQuarter': cases_this_quarter,
            'CasesOnHand': cases_on_hand,
            'TotalCostToStoreThisQuarter': cost_this_quarter,
            'TotalCostToStoreThisYTD': cost_ytd
        }

    sales_sql =    '''
                            WITH quarterly_sales AS (
                                SELECT sku, quarter, COUNT(*) AS current_quarter_count
                                FROM sales_transactions
                                JOIN date USING (date)
                                GROUP BY sku, quarter
                                ORDER BY sku, quarter ASC
                            )
                            SELECT sku, quarter, current_quarter_count,
                                SUM(current_quarter_count) 
                                    OVER (PARTITION BY sku ORDER BY quarter ASC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS ytd_sales
                            FROM quarterly_sales
                            ORDER BY sku, quarter;
                        '''

    print(f'{datetime.now()} - Started Query 2')
    sales_rows = source_db.execute_sql(sales_sql, options=db_options.RETURN_RESULTS)
    print(f'{datetime.now()} - Finished Query 2')

    # Attach the sales counts to the snapshot built from the first query.
    for sku, quarter, sold_this_quarter, sold_ytd in sales_rows:
        product_key = PRODUCTS_LOOKUP[str(sku)]['ProductKey']
        snapshot = snapshots[f'{product_key}|{quarter}']
        snapshot['TotalSoldByStoreThisQuarter'] = sold_this_quarter
        snapshot['TotalSoldByStoreThisYTD'] = sold_ytd

    mart_db = DB_HANDLES['DATA_MART']
    mart_db.connect()

    insert_sql = 'INSERT INTO inventory_quarterly VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)'
    # Order in which the snapshot fields are bound to the INSERT placeholders.
    column_order = (
        'ProductKey',
        'StoreKey',
        'QuarterAndYear',
        'Quarter',
        'Year',
        'CasesPurchasedToDate',
        'CasesPurchasedThisQuarter',
        'CasesOnHand',
        'TotalCostToStoreThisQuarter',
        'TotalSoldByStoreThisQuarter',
        'TotalCostToStoreThisYTD',
        'TotalSoldByStoreThisYTD',
    )

    inserted = 0
    for snapshot in snapshots.values():
        values = tuple(snapshot[column] for column in column_order)
        inserted += 1
        mart_db.execute_sql_values(insert_sql, values=values)

    print(f'{datetime.now()} - Committed record {inserted}')
    mart_db.commit()
    mart_db.close()

    source_db.close()

build_data_structures_10()
etl_team_10_quarterly()


2025-03-30 10:45:59.577813 - Started Query 1
2025-03-30 10:46:10.727229 - Finished Query 1
2025-03-30 10:46:10.743027 - Started Query 2
2025-03-30 10:46:18.623679 - Finished Query 2
2025-03-30 10:46:18.661734 - Committed record 8300


#### Duration of the batch job

In [34]:
# Stamp the end of the run and derive the elapsed wall-clock time from the
# DATA_MART_START captured at the top of the notebook.
DATA_MART_END = datetime.now()
DATA_MART_DURATION = DATA_MART_END - DATA_MART_START

# Single print call; sep='\n' yields the same three output lines.
print(
    f'START: {DATA_MART_START}',
    f'END: {DATA_MART_END}',
    f'This batch job took {DATA_MART_DURATION}',
    sep='\n',
)

START: 2025-03-30 10:39:38.538135
END: 2025-03-30 10:46:18.676874
This batch job took 0:06:40.138739
