## Data Warehousing HW 3 - Grocery Store Data Profiling
## Anthony Ung, Sean Jerzewski, Gideon Kipkorir

## 0. Databse Classes of Utility Functions

In [1]:
import sqlite3 as lite

class db:
    con = None
    cur = None
    commit_pending = 0

    def connect():
        db.con = lite.connect(r'store.db')
        db.cur = db.con.cursor()
        print('Database Successfully Connected To')

    def execute_sql(sql):
        assert type(sql) == str, \
            f"""Error! This function expected a string. 
                Got {print(type(sql))} instead"""
        return db.cur.execute(sql).fetchall()

    def execute_sql_values(sql, values):
        assert type(sql) == str, \
            f"""Error! This function expected a string. 
                Got {print(type(sql))} instead"""
        assert isinstance(values, tuple), \
            f"""Error! This function expected a string. 
                Got {print(type(values))} instead"""
        results = db.cur.execute(sql, values).fetchall()

    def commit():
        db.con.commit()
        db.commit_pending = 0

    def close():
        db.con.commit()
        db.con.close()
        print('Database Connection Closed')


class db_debug():
    
    def execute_sql(sql):
        assert isinstance(sql, str), \
            f"""Error! This function expected a string. 
                Got {type(sql)} instead"""

        '''
        In my testing, the db_debug class does not play nicely with
            the db class because even though I invoke db.connect(),
            I still get error messages saying that the database is closed.
        Each invocation creates its own database connection since
            these methods are meant to be used very rarely.
        '''
        con = lite.connect(r'store.db')
        cur = con.cursor()
        
        results = cur.execute(sql).fetchall()
        for row in results:
            print(row)

        con.close()

    def execute_sql_values(sql, values):
        assert type(sql) == str, \
            f"""Error! This function expected a string. 
                Got {print(type(sql))} instead"""
        assert isinstance(values, tuple), \
            f"""Error! This function expected a tuple. 
                Got {print(type(values))} instead"""

        con = lite.connect(r'store.db')
        cur = con.cursor()
        
        results = cur.execute(sql, values)
        for row in results:
            print(row)
        
        con.close()

## 0.1 - Preliminary Profiling
Given our parameters,
1. There should be about 15,000 transactions
2. There should be about 10,500 transactions involving sales of milk.

In [21]:
def run_0_3():
    db.connect()
    
    sql = '''
            SELECT COUNT(*) AS count
            FROM transactions_customers
            WHERE (date >= '2024-01-02') AND (date <= '2024-01-15')
        '''

    results = db.execute_sql(sql)
    for row in results:
        print(f'{row[0]} transactions in database')
    
    sql = '''
            WITH filter AS (
                SELECT *
                FROM transactions_sales
                WHERE (date >= '2024-01-02') AND (date <= '2024-01-15')
            )
            SELECT COUNT(*) AS count
            FROM filter
            JOIN products on filter.sku == products.sku
            WHERE (product_type == 'Milk')
        '''

    results = db.execute_sql(sql)
    for row in results:
        print(f'{row[0]} transactions involving milk')

    db.close()

run_0_3()

Database Successfully Connected To
14947 transactions in database
12314 transactions involving milk
Database Connection Closed


## 1.1 - Average Sales Per Day
Compute the average sales per day of each item

Our parameters
- 1020-1060 Customers Daily + 75 for Weekends
- 1-90 Items Per Customer

Assumptions Given
- About 70% of our customers are supposed to buy milk
  - 35% will buy milk and cereal
  - 1.5% will buy cereal but no milk
- 20% will buy baby food
  - 16% will buy baby food and diapers
  - 0.8% will buy diapers but no baby food
- 50% will buy bread
- 10% will buy peanut butter
  - 9% will buy peanut butter + jam/jelly
  - 4.5% will buy jam + jelly but no peanut butter
- All other products should be equally likely

In [22]:
def run_1_1():
    
    sql = '''
            WITH selected_transacs AS (
                SELECT *
                FROM transactions_sales
                WHERE (date >= '2024-01-02') AND (date <= '2024-01-16')
            )
            SELECT 
                products.product_name, 
                products.product_type,
                count(*)/14 AS count
            FROM products
            JOIN selected_transacs USING(sku)
            GROUP BY products.sku
            ORDER BY count DESC
            LIMIT 25
        '''

    db_debug.execute_sql(sql)

run_1_1()

('Whole Milk Milk', 'Milk', 160)
('Whole Milk Milk', 'Milk', 159)
('2.00% Milk', 'Milk', 158)
('1.00% Milk', 'Milk', 158)
('2.00% Milk', 'Milk', 153)
('1.00% Milk', 'Milk', 150)
('Squeeze Jelly Grape', 'Jelly/Jam', 65)
('Jelly Grape', 'Jelly/Jam', 63)
('Jam Grape', 'Jelly/Jam', 63)
('Jam Strawberry', 'Jelly/Jam', 59)
('Swirl Oatmeal Bread Apple & Cinnamon', 'Bread', 39)
('Farmhouse Bread Oatmeal', 'Bread', 39)
('Bread Swirl Raisin Cinnamon', 'Bread', 39)
('Italian Bread With Sesame Seeds', 'Bread', 38)
('Sandwich Thins Honey Wheat', 'Bread', 37)
('Ezekiel 4:9 Bread Whole Grain', 'Bread', 37)
('Bread Cinnamon Raisin Swirl', 'Bread', 37)
('Whole Grain Bread 100% Wheat Bread', 'Bread', 37)
('Swirl Bread French Toast', 'Bread', 37)
('Light Style Bread 7 Grain', 'Bread', 37)
('Farmhouse Bread Sourdough', 'Bread', 37)
('Bread White Original', 'Bread', 37)
('Bread Honey Wheat', 'Bread', 37)
('Sandwich Thins Potato Rolls', 'Bread', 36)
('Ezekiel 4:9 Bread Sesame Sprouted Grain', 'Bread', 36)


## 1.1 - Average Sales Per Item Type Per Day
It may also be valuable to see the numbers of each product type sold per day

In [23]:
def run_1_1a():
    
    sql = '''
            WITH selected_transacs AS (
                SELECT *
                FROM transactions_sales
                WHERE (date >= '2024-01-02') AND (date <= '2024-01-15')
            )
            SELECT 
                products.product_type,
                count(*)/14 AS count
            FROM products
            JOIN selected_transacs USING(sku)
            GROUP BY products.product_type
            HAVING products.product_type IN ('Milk', 'Cereal', 'Diapers', 'Baby Food', 'Bread', 'Peanut Butter', 'Jelly/Jam')
            ORDER BY count DESC
        '''

    db_debug.execute_sql(sql)

run_1_1a()

('Baby Food', 3784)
('Cereal', 2443)
('Diapers', 2011)
('Bread', 1600)
('Milk', 879)
('Peanut Butter', 550)
('Jelly/Jam', 235)


## 1.1 - Average Sales Per Item Type Per Transaction Per Day
It may also be valuable to see the numbers of each product type sold per transaction per day

In [24]:
def run_1_1b():
   
    sql = '''
            WITH selected_transacs AS (
                SELECT date, customerNumber, sku
                FROM transactions_sales
                WHERE (date >= '2024-01-02') AND (date <= '2024-01-16')
            ),
            type_counts_per_transaction AS (
                SELECT date, customerNumber, products.product_type AS type, COUNT(*) as count
                FROM selected_transacs
                JOIN products USING(sku)
                GROUP BY date, customerNumber, products.product_type
            ),
            type_frequencies_per_day AS (
                SELECT type, COUNT(*)/14 AS freq
                FROM type_counts_per_transaction
                GROUP BY type
                HAVING type IN ('Milk', 'Cereal', 'Diapers', 'Baby Food', 'Bread', 'Peanut Butter', 'Jelly/Jam')
                ORDER BY freq DESC
            )
            SELECT * from type_frequencies_per_day
        '''

    db_debug.execute_sql(sql)

run_1_1b()

('Baby Food', 994)
('Cereal', 952)
('Bread', 896)
('Diapers', 867)
('Milk', 841)
('Peanut Butter', 441)
('Jelly/Jam', 234)


In [19]:
def run_test():

    sql = '''
            WITH selected_transacs AS (
                SELECT date, customerNumber, sku
                FROM transactions_sales
                WHERE (date >= '2024-01-02') AND (date <= '2024-01-15')
            ),
            type_counts_per_transaction AS (
                SELECT date, customerNumber, products.product_type AS type, COUNT(*) as count
                FROM selected_transacs
                JOIN products USING(sku)
                GROUP BY date, customerNumber, products.product_type
            ),
            type_frequencies_per_day AS (
                SELECT type, COUNT(*)/14 AS freq
                FROM type_counts_per_transaction
                GROUP BY type
                HAVING type IN ('Milk', 'Cereal', 'Diapers', 'Baby Food', 'Bread', 'Peanut Butter', 'Jelly/Jam')
                ORDER BY freq DESC
            )
            SELECT * from type_frequencies_per_day
        '''

    db_debug.execute_sql(sql)

run_test()

('Baby Food', 930)
('Cereal', 891)
('Bread', 837)
('Diapers', 810)
('Milk', 786)
('Peanut Butter', 412)
('Jelly/Jam', 219)


## Average number of customer numbers per day

In [27]:
def run_test():

    sql = '''
            WITH customers_per_day AS (
                SELECT date, MAX(customerNumber) as numCustomers
                FROM transactions_sales
                WHERE (date >= '2024-01-02') AND (date <= '2024-01-15')
                GROUP BY date
            )
            SELECT ROUND(AVG(numCustomers), 0)
            FROM customers_per_day
        '''

    db_debug.execute_sql(sql)

run_test()

(1068.0,)


## Average Number of Items Sold Per Day

In [30]:
def run_test():

    sql = '''
            WITH customers_per_day AS (
                SELECT date, COUNT(*) as itemsSold
                FROM transactions_sales
                WHERE (date >= '2024-01-02') AND (date <= '2024-01-15')
                GROUP BY date
            )
            SELECT ROUND(AVG(itemsSold), 0)
            FROM customers_per_day
        '''

    db_debug.execute_sql(sql)

run_test()

(48126.0,)


## Minimum and Maximum for non-special items

In [33]:
def run_test():
    
    sql = '''
            WITH items_sold AS (
                SELECT sku
                FROM transactions_sales
                WHERE (date >= '2024-01-02') AND (date <= '2024-01-15')
            ),
            items_sold_count AS (
                SELECT sku, COUNT(*) as count
                FROM items_sold
                GROUP BY sku
            ),
            filtered_items_list AS (
                SELECT isc.sku, isc.count AS count, p.product_type
                FROM items_sold_count AS isc
                JOIN products AS p USING(sku)
                WHERE p.product_type NOT IN ('Milk', 'Cereal', 'Diapers', 'Baby Food', 'Bread', 'Peanut Butter', 'Jelly/Jam')
            )
            SELECT MIN(fil.count), MAX(fil.count) FROM filtered_items_list AS fil
        '''
    
    db_debug.execute_sql(sql)

run_test()

(258, 379)


In [34]:
def run_test():

    sql = '''
            SELECT p.product_type, COUNT(*)
            FROM products AS p
            WHERE p.product_type IN ('Milk', 'Cereal', 'Diapers', 'Baby Food', 'Bread', 'Peanut Butter', 'Jelly/Jam')
            GROUP BY p.product_type
        '''
    
    db_debug.execute_sql(sql)

run_test()

('Baby Food', 162)
('Bread', 48)
('Cereal', 93)
('Diapers', 82)
('Jelly/Jam', 4)
('Milk', 6)
('Peanut Butter', 20)


In [35]:
def run_test():

    sql = '''
            SELECT DISTINCT p.product_type
            FROM products AS p
            ORDER BY 1 ASC
        '''
    
    db_debug.execute_sql(sql)

run_test()

('',)
('Acetominifen',)
('Aspirin',)
('Baby Food',)
('Baked Goods Other than Bread',)
('Baking Supplies',)
('Bologna',)
('Bread',)
('Cake Snacks',)
('Cake/Baking Mixes',)
('Candy',)
('Canned Fruit',)
('Canned Goods',)
('Canned Vegetables',)
('Cereal',)
('Cheese',)
('Chocolate Candy',)
('Cleaners',)
('Coffee/Creamer',)
('Cookies',)
('Cooking Oil',)
('Cottage Cheese',)
('Deli Meats',)
('Deli Salads',)
('Deodorizers',)
('Diapers',)
('Dips',)
('Drink',)
('French Fries',)
('Fresh Chicken',)
('Fresh Fish',)
('Fresh Fruit',)
('Fresh Vegetables',)
('Frozen Chicken',)
('Frozen Food',)
('Frozen Vegetables',)
('Gravy/Sauce',)
('Hamburger',)
('Hard Candy',)
('Hot Dogs',)
('Household',)
('Ibuprofen',)
('Ice Cream',)
('Jelly/Jam',)
('Juice',)
('Mac & Cheese',)
('Milk',)
('Mouthwash',)
('Nuts',)
('Other Dairy/Not Milk',)
('Pasta/Noodles',)
('Peanut Butter',)
('Pet Food',)
('Pizza',)
('Popcorn',)
('Popsicles',)
('Produce',)
('Rice',)
('Rice/Rice Mix',)
('Salad Dressing',)
('Sardines',)
('Snacks',)
('S

In [37]:
def run_test():
    
    sql = '''
            WITH items_sold AS (
                SELECT sku
                FROM transactions_sales
                WHERE (date >= '2024-01-02') AND (date <= '2024-01-15')
            )
            SELECT COUNT(*) FROM items_sold
        '''
    
    db_debug.execute_sql(sql)

run_test()

(673760,)
