## Data Warehousing HW 3 - Grocery Store Data Profiling
## Anthony Ung, Sean Jerzewski, Gideon Kipkorir

### 0. Connecting to Database

In [3]:
import sqlite3 as lite
import csv

def GroceryDatabaseAccess():
    GroceryDatabaseAccess.grocerDBConnection
    GroceryDatabaseAccess.groceryDBCursor
    GroceryDatabaseAccess.salesTransactionsToCommitCount
    GroceryDatabaseAccess.maxTransactionsBeforeCommit

def init():
    GroceryDatabaseAccess.groceryDBConnection = None
    GroceryDatabaseAccess.groceryDBCursor = None
    GroceryDatabaseAccess.salesTransactionsToCommitCount = 0   
    GroceryDatabaseAccess.maxTransactionsBeforeCommit = 10000 

def run():
    print("Connecting to the grocerydb database")
    con = lite.connect(r'store.db')
    print('Database successfully connected to')
    GroceryDatabaseAccess.groceryDBConnection = con
    GroceryDatabaseAccess.groceryDBCursor = con.cursor()

def build_products_table():
    sql = 'DROP TABLE IF EXISTS products'
    GroceryDatabaseAccess.groceryDBCursor.execute(sql)
    
    sql = 'CREATE TABLE products(sku INT, product_name TEXT, product_type TEXT)'
    GroceryDatabaseAccess.groceryDBCursor.execute(sql)

    csv.register_dialect('piper', delimiter='|', quoting=csv.QUOTE_NONE)
    
    with open('Products1.txt', 'r') as csvfile:
        i = 0
        
        for row in csv.DictReader(csvfile, dialect='piper'):
            sku = row.get('SKU')
            product_name = row.get('Product Name')
            product_type = row.get('itemType')
            GroceryDatabaseAccess.groceryDBCursor.execute('insert into products values (?, ?, ?)',
                                                      (sku,product_name, product_type))
            i += 1
            if i % 10000 == 0:
                GroceryDatabaseAccess.groceryDBConnection.commit()
                print(f"Committed row {i}")
            
        GroceryDatabaseAccess.groceryDBConnection.commit()
        print(f"Committed row {i}")

run()

Connecting to the grocerydb database
Database successfully connected to


### 0.1. Build Transactions Table

In [23]:
from datetime import date, datetime, timedelta


def test_dates():
    print('''
            This should print the first 5 and last 5 dates.
            I need subqueries because in SQLite, ORDER BY comes after UNION
                and I need subqueries to order by ASC and DESC in the two parts independently.
            In this case, we are missing the dates '2024-01-01' and '2024-12-31'.
        ''')

    
    sql =   '''
                SELECT * FROM
                    (SELECT DISTINCT date
                    FROM sales_transactions
                    ORDER BY date ASC
                    LIMIT 5
                    )

                UNION

                SELECT * FROM
                    (SELECT DISTINCT date
                    FROM sales_transactions
                    ORDER BY date DESC
                    LIMIT 5
                    )
            '''
    results = GroceryDatabaseAccess.groceryDBCursor.execute(sql)
    for row in results:
        print(row)

def build_transactions_table():
    cur = GroceryDatabaseAccess.groceryDBCursor
    
    sql = "DROP TABLE IF EXISTS transacs"
    cur.execute(sql)

    sql = '''
            CREATE TABLE transactions(date VARCHAR(8), customerNumber INT, total FLOAT)
        '''
    cur.execute(sql)

    current_date = date(2024, 1, 1)
    end_date = date(2024, 12, 31)

    
def query():
    cur = GroceryDatabaseAccess.groceryDBCursor
    con = GroceryDatabaseAccess.groceryDBConnection
    
    
    current_date = date(2024, 1, 1)
    end_date = date(2024, 12, 31)
    
    num_days = 1
    
    while(current_date <= end_date):
        sql = '''
                SELECT date, customerNumber, ROUND(SUM(salesPrice),2) 
                FROM sales_transactions 
                WHERE date == (?)
                GROUP BY customerNumber
            '''
        num_records = 0
        
        results = cur.execute(sql, (current_date,)).fetchall()
        for row in results:
            num_records += 1
            data = (row[0], row[1], row[2])
            cur.execute('INSERT INTO transactions VALUES (?, ?, ?)', data)
        con.commit()

        if num_days % 30 == 1:
            print(f'{datetime.now()} - Committed transaction results for date {current_date} - {num_records} records committed')

        num_days += 1
        current_date += timedelta(1)
    

def run_0_1():
    #test_dates()
    build_transactions_table()
    query()

run_0_1()

  results = cur.execute(sql, (current_date,)).fetchall()


2025-02-14 14:48:59.549605 - Committed transaction results for date 2024-01-01 - 1052 records committed
2025-02-14 14:49:32.927967 - Committed transaction results for date 2024-01-31 - 1044 records committed
2025-02-14 14:50:05.563190 - Committed transaction results for date 2024-03-01 - 1023 records committed
2025-02-14 14:50:37.575598 - Committed transaction results for date 2024-03-31 - 1130 records committed
2025-02-14 14:51:12.133306 - Committed transaction results for date 2024-04-30 - 1042 records committed
2025-02-14 14:51:42.364399 - Committed transaction results for date 2024-05-30 - 1044 records committed
2025-02-14 14:52:13.629320 - Committed transaction results for date 2024-06-29 - 1130 records committed
2025-02-14 14:52:46.482562 - Committed transaction results for date 2024-07-29 - 1029 records committed
2025-02-14 14:53:20.241322 - Committed transaction results for date 2024-08-28 - 1058 records committed
2025-02-14 15:36:56.251112 - Committed transaction results for d

## 0.2 - Various tests

In [6]:
def test_daily_customer_counts():
    cur = GroceryDatabaseAccess.groceryDBCursor

    sql = \
        '''
            WITH customer_counts AS (
                SELECT date, COUNT (DISTINCT customerNumber)
                FROM sales_transactions
                GROUP BY date
            )
        '''


tests = \
    { \
        'Daily Customer Counts' : True
    }

if tests['Daily Customer Counts']:
    test_daily_customer_counts()

## 0.3 - Preliminary Profiling
Given our parameters,
1. There should be about 15,000 transactions
2. There should be about 10,500 transactions involving sales of milk.

In [None]:
def run_0_3():
    cur = GroceryDatabaseAccess.groceryDBCursor
    con = GroceryDatabaseAccess.groceryDBConnection

    sql = '''
            SELECT COUNT(*) AS count
            FROM transactions
            WHERE (date >= '2024-01-02') AND (date <= '2024-01-16')
        '''

    results = cur.execute(sql)
    for row in results:
        print(f'{row[0]} transactions in database')
    
    sql = '''
            SELECT COUNT(*) AS count
            FROM sales_transactions
            JOIN products on sales_transactions.sku == products.sku
            WHERE (date >= '2024-01-02') AND (date <= '2024-01-16') AND (product_type == 'Milk')
        '''

    results = cur.execute(sql)
    for row in results:
        print(f'{row[0]} transactions involving milk')

run_0_3()

## 1.1 - Average Sales Per Day
Compute the average sales per day of each item

Our parameters
- 1020-1060 Customers Daily + 75 for Weekends
- 1-90 Items Per Customer

Assumptions Given
- About 70% of our customers are supposed to buy milk
  - 35% will buy milk and cereal
  - 1.5% will buy cereal but no milk
- 20% will buy baby food
  - 16% will buy baby food and diapers
  - 0.8% will buy diapers but no baby food
- 50% will buy bread
- 10% will buy peanut butter
  - 9% will buy peanut butter + jam/jelly
  - 4.5% will buy jam + jelly but no peanut butter
- All other products should be equally likely

In [25]:
def run_1_1():
    cur = GroceryDatabaseAccess.groceryDBCursor
    con = GroceryDatabaseAccess.groceryDBConnection

    sql = '''
            WITH selected_transacs AS (
                SELECT *
                FROM sales_transactions
                WHERE (date >= '2024-01-02') AND (date <= '2024-01-16')
            )
            SELECT 
                products.product_name, 
                products.product_type,
                count(*)/14 AS count
            FROM products
            JOIN selected_transacs USING(sku)
            GROUP BY products.sku
            ORDER BY count DESC
        '''

    results = cur.execute(sql)
    for row in results:
        print(row)

run_1_1()

('1.00% Milk', 'Milk', 160)
('2.00% Milk', 'Milk', 158)
('1.00% Milk', 'Milk', 157)
('Whole Milk Milk', 'Milk', 153)
('Whole Milk Milk', 'Milk', 151)
('2.00% Milk', 'Milk', 151)
('Squeeze Jelly Grape', 'Jelly/Jam', 62)
('Jelly Grape', 'Jelly/Jam', 61)
('Jam Strawberry', 'Jelly/Jam', 60)
('Jam Grape', 'Jelly/Jam', 59)
('Farmhouse Bread Potato', 'Bread', 39)
('Ezekiel 4:9 Bread Sesame Sprouted Grain', 'Bread', 38)
('Whole Grain Bread 100% Wheat Bread', 'Bread', 38)
('Flatbread Angus Cheeseburgers', 'Bread', 38)
('Bread Honey Wheat', 'Bread', 38)
('Sandwich Thins Potato Rolls', 'Bread', 37)
('Swirl Oatmeal Bread Apple & Cinnamon', 'Bread', 37)
('Light Style Bread Soft Wheat', 'Bread', 37)
('Italian Bread With Sesame Seeds', 'Bread', 37)
('Farmhouse Bread Oatmeal', 'Bread', 37)
('Bread Swirl Raisin Cinnamon', 'Bread', 37)
('Bread Texas Toast', 'Bread', 37)
('Bread Amazin Raisin', 'Bread', 37)
('Sandwich Thins Whole Wheat', 'Bread', 36)
('Whole Grain Bread Honey Wheat', 'Bread', 36)
('Very 

## 1.1 - Average Sales Per Item Type Per Day
It may also be valuable to see the numbers of each product type sold per day

In [32]:
def run_1_1a():
    cur = GroceryDatabaseAccess.groceryDBCursor
    con = GroceryDatabaseAccess.groceryDBConnection

    sql = '''
            WITH selected_transacs AS (
                SELECT *
                FROM sales_transactions
                WHERE (date >= '2024-01-02') AND (date <= '2024-01-16')
            )
            SELECT 
                products.product_type,
                count(*)/14 AS count
            FROM products
            JOIN selected_transacs USING(sku)
            GROUP BY products.product_type
            HAVING products.product_type IN ('Milk', 'Cereal', 'Diapers', 'Baby Food', 'Bread', 'Peanut Butter', 'Jelly/Jam')
            ORDER BY count DESC
        '''

    results = cur.execute(sql)
    for row in results:
        print(row)

run_1_1a()

('Baby Food', 4075)
('Cereal', 2614)
('Diapers', 2140)
('Bread', 1721)
('Milk', 932)
('Peanut Butter', 585)
('Jelly/Jam', 243)


## 1.1 - Average Sales Per Item Type Per Transaction Per Day
It may also be valuable to see the numbers of each product type sold per transaction per day

In [48]:
def run_1_1b():
    cur = GroceryDatabaseAccess.groceryDBCursor
    con = GroceryDatabaseAccess.groceryDBConnection

    sql = '''
            WITH selected_transacs AS (
                SELECT *
                FROM sales_transactions
                WHERE (date >= '2024-01-02') AND (date <= '2024-01-16')
            ),
            item_type_counts_per_transaction AS (
                SELECT date, customerNumber, products.product_type AS type, COUNT(*) as count
                FROM selected_transacs
                JOIN products USING(sku)
                GROUP BY date, customerNumber, products.product_type
            )
            SELECT type, COUNT(*)/14 AS freq
            FROM item_type_counts_per_transaction
            GROUP BY type
            HAVING type IN ('Milk', 'Cereal', 'Diapers', 'Baby Food', 'Bread', 'Peanut Butter', 'Jelly/Jam')
            ORDER BY freq DESC
        '''

    results = cur.execute(sql)
    for row in results:
        print(row)

run_1_1b()

('Baby Food', 991)
('Cereal', 950)
('Bread', 892)
('Diapers', 865)
('Milk', 838)
('Peanut Butter', 440)
('Jelly/Jam', 226)
