## Data Warehousing HW 3 - Grocery Store Data Profiling
## Anthony Ung, Sean Jerzewski, Gideon Kipkorir
### This is the code we used to generate the numbers in our spreadsheet.

ALL TABLE DEFINITIONS  
    'products': 'CREATE TABLE products(sku INT, product_name TEXT, product_type TEXT)',  
    'transactions_sales': 'CREATE TABLE transactions_sales(date TEXT, customerNumber INT, sku INT, salesPrice REAL)',  
    'transactions_customers': 'CREATE TABLE transactions_customers(date VARCHAR(8), customerNumber INT, total FLOAT)'  

## 0. Database Classes of Utility Functions
### IMPORTANT: You need to run this cell to make the other cells runnable.

In [1]:
import sqlite3 as lite

class db:
    """SQLite helpers that share a single connection to ``store.db``.

    All state lives on the class itself, so the helpers are called directly
    on the class (``db.connect()``, ``db.execute_sql(...)``) without
    creating an instance.
    """

    # Shared connection and cursor; both stay None until connect() runs.
    con = None
    cur = None
    # Reset to 0 by commit(); nothing in this class increments it.
    commit_pending = 0

    @staticmethod
    def connect():
        """Open ``store.db`` and create the shared cursor."""
        db.con = lite.connect(r'store.db')
        db.cur = db.con.cursor()
        print('Database Successfully Connected To')

    @staticmethod
    def execute_sql(sql):
        """Run ``sql`` on the shared cursor and return all result rows.

        Args:
            sql: The SQL statement to execute, as a string.

        Returns:
            A list of row tuples (empty when the statement yields no rows).

        Raises:
            AssertionError: If ``sql`` is not a string.
        """
        assert isinstance(sql, str), \
            f"""Error! This function expected a string.
                Got {type(sql)} instead"""
        return db.cur.execute(sql).fetchall()

    @staticmethod
    def execute_sql_values(sql, values):
        """Run a parameterized ``sql`` statement and return all result rows.

        Args:
            sql: The SQL statement, with ``?`` placeholders, as a string.
            values: Tuple of values bound to the placeholders.

        Returns:
            A list of row tuples (empty when the statement yields no rows).

        Raises:
            AssertionError: If ``sql`` is not a string or ``values`` is not
                a tuple.
        """
        assert isinstance(sql, str), \
            f"""Error! This function expected a string.
                Got {type(sql)} instead"""
        assert isinstance(values, tuple), \
            f"""Error! This function expected a tuple.
                Got {type(values)} instead"""
        # Hand the rows back to the caller, mirroring execute_sql().
        return db.cur.execute(sql, values).fetchall()

    @staticmethod
    def commit():
        """Commit the open transaction and clear the pending counter."""
        db.con.commit()
        db.commit_pending = 0

    @staticmethod
    def close():
        """Commit any outstanding work, then close the shared connection."""
        db.con.commit()
        db.con.close()
        print('Database Connection Closed')


class db_debug:
    """Print-oriented query helpers that open a private connection per call.

    Each method connects to ``store.db``, runs one statement, prints every
    result row, and closes the connection again.
    """

    @staticmethod
    def execute_sql(sql):
        """Run ``sql`` against ``store.db`` and print each result row.

        Args:
            sql: The SQL statement to execute, as a string.

        Raises:
            AssertionError: If ``sql`` is not a string.
        """
        assert isinstance(sql, str), \
            f"""Error! This function expected a string.
                Got {type(sql)} instead"""

        # Author's note: in testing, the db_debug class did not play nicely
        # with the db class; even after invoking db.connect(), errors said
        # the database was closed. Each invocation therefore creates its own
        # database connection, since these methods are meant to be used
        # very rarely.
        con = lite.connect(r'store.db')
        try:
            for row in con.cursor().execute(sql).fetchall():
                print(row)
        finally:
            # Release the connection even when the statement fails.
            con.close()

    @staticmethod
    def execute_sql_values(sql, values):
        """Run a parameterized ``sql`` statement and print each result row.

        Args:
            sql: The SQL statement, with ``?`` placeholders, as a string.
            values: Tuple of values bound to the placeholders.

        Raises:
            AssertionError: If ``sql`` is not a string or ``values`` is not
                a tuple.
        """
        assert isinstance(sql, str), \
            f"""Error! This function expected a string.
                Got {type(sql)} instead"""
        assert isinstance(values, tuple), \
            f"""Error! This function expected a tuple.
                Got {type(values)} instead"""

        con = lite.connect(r'store.db')
        try:
            for row in con.cursor().execute(sql, values).fetchall():
                print(row)
        finally:
            # Release the connection even when the statement fails.
            con.close()

## 1.0 - Preliminary Profiling
Take each of the special items Milk, baby food etc. and compute how many should sell per day given your parameters.  
Do the same for the non-special items  
(you can lump these together. Don’t do a separate line for 2000 items)  
  
Table 1 (Expected Probabilities of Each Item Type) in the spreadsheet shows the numbers I expect.
For the "Other", I computed the expected values of special items and then I subtracted those totals from the average sales per day after I did the other computations.

## 1.1 - Average Sales Per Day
Compute the average sales per day of each item

Our parameters
- 1020-1060 Customers Daily + 75 for Weekends
- 1-90 Items Per Customer

Assumptions Given
- About 70% of our customers are supposed to buy milk
  - 35% will buy milk and cereal
  - 1.5% will buy cereal but no milk
- 20% will buy baby food
  - 16% will buy baby food and diapers
  - 0.8% will buy diapers but no baby food
- 50% will buy bread
- 10% will buy peanut butter
  - 9% will buy peanut butter + jam/jelly
  - 4.5% will buy jam + jelly but no peanut butter
- All other products should be equally likely

In [2]:
def run_1_1():
    """Print the 25 SKUs with the highest per-day sales count.

    The query counts transactions_sales rows dated 2024-01-02 through
    2024-01-15 for each SKU, divides the count by 14 (integer division,
    so the figure is truncated), and prints (count, type, name) rows in
    descending order of that figure.
    """
    top_sellers_query = '''
            WITH selected_transacs AS (
                SELECT *
                FROM transactions_sales
                WHERE (date >= '2024-01-02') AND (date <= '2024-01-15')
            ),
            selected_products AS (
                SELECT
                    product_type AS type,
                    product_name AS name,
                    sku
                FROM products
            )
            SELECT COUNT(*)/14 AS count, type, name
            FROM selected_transacs
            JOIN selected_products USING(sku)
            GROUP BY sku
            ORDER BY count DESC
            LIMIT 25
        '''

    # db_debug prints every row of the result set.
    db_debug.execute_sql(top_sellers_query)

run_1_1()

(149, 'Milk', 'Whole Milk Milk')
(149, 'Milk', 'Whole Milk Milk')
(148, 'Milk', '1.00% Milk')
(147, 'Milk', '2.00% Milk')
(143, 'Milk', '2.00% Milk')
(141, 'Milk', '1.00% Milk')
(61, 'Jelly/Jam', 'Squeeze Jelly Grape')
(59, 'Jelly/Jam', 'Jelly Grape')
(58, 'Jelly/Jam', 'Jam Grape')
(55, 'Jelly/Jam', 'Jam Strawberry')
(37, 'Bread', 'Swirl Oatmeal Bread Apple & Cinnamon')
(36, 'Bread', 'Farmhouse Bread Oatmeal')
(36, 'Bread', 'Bread Swirl Raisin Cinnamon')
(35, 'Bread', 'Bread Cinnamon Raisin Swirl')
(35, 'Bread', 'Whole Grain Bread 100% Wheat Bread')
(35, 'Bread', 'Light Style Bread 7 Grain')
(35, 'Bread', 'Italian Bread With Sesame Seeds')
(35, 'Bread', 'Farmhouse Bread Sourdough')
(35, 'Bread', 'Bread Honey Wheat')
(34, 'Bread', 'Sandwich Thins Potato Rolls')
(34, 'Bread', 'Sandwich Thins Honey Wheat')
(34, 'Bread', 'Ezekiel 4:9 Bread Whole Grain')
(34, 'Bread', 'Ezekiel 4:9 Bread Sesame Sprouted Grain')
(34, 'Bread', 'Very Thin Bread White')
(34, 'Bread', 'Swirl Bread Raisin Cinnamon

## 1.1 - Average Sales Per Item Type Per Day
It may also be valuable to see the numbers of each product type sold per day

In [3]:
def run_1_1a():
    """Print the per-day sales count of each special product type.

    Only the seven special product types are kept. Sales rows dated
    2024-01-02 through 2024-01-15 are counted per type and divided by 14
    (integer division), then printed in descending order.
    """
    per_type_query = '''
            WITH selected_transacs AS (
                SELECT *
                FROM transactions_sales
                WHERE (date >= '2024-01-02') AND (date <= '2024-01-15')
            ),
            selected_products AS (
                SELECT sku, product_type AS type
                from products
                WHERE products.product_type IN ('Milk', 'Cereal', 'Diapers', 'Baby Food', 'Bread', 'Peanut Butter', 'Jelly/Jam')
            )
            SELECT 
                count(*)/14 AS count,
                sp.type AS type
            FROM selected_transacs
            JOIN selected_products AS sp USING(sku)
            GROUP BY sp.type
            ORDER BY count DESC
        '''

    # db_debug prints every row of the result set.
    db_debug.execute_sql(per_type_query)

run_1_1a()

(3784, 'Baby Food')
(2443, 'Cereal')
(2011, 'Diapers')
(1600, 'Bread')
(879, 'Milk')
(550, 'Peanut Butter')
(235, 'Jelly/Jam')


## 1.1 - Average Sales Per Item Type Per Transaction Per Day
It may also be valuable to see the numbers of each product type sold per transaction per day

In [4]:
def run_1_1b():
    """Print, per special product type, the daily number of transactions containing it.

    A transaction is a (date, customerNumber) pair. For each special product
    type, the query counts the transactions dated 2024-01-02 through
    2024-01-15 that include at least one item of that type, then divides by
    14 (integer division). Rows are printed in descending order.
    """
    sql = '''
            WITH selected_transacs AS (
                SELECT date, customerNumber, sku
                FROM transactions_sales
                WHERE (date >= '2024-01-02') AND (date <= '2024-01-15')
            ),
            selected_products AS (
                SELECT sku, product_type AS type
                FROM products
                WHERE product_type IN ('Milk', 'Cereal', 'Diapers', 'Baby Food', 'Bread', 'Peanut Butter', 'Jelly/Jam')
            ),
            type_counts_per_transaction AS (
                SELECT date, customerNumber, type, COUNT(*) as count
                FROM selected_transacs
                JOIN selected_products USING(sku)
                GROUP BY date, customerNumber, type
            ),
            type_frequencies_per_day AS (
                SELECT COUNT(*)/14 AS freq, type
                FROM type_counts_per_transaction
                GROUP BY type
            )
            SELECT * FROM type_frequencies_per_day
            ORDER BY freq DESC
        '''

    # The ORDER BY sits on the outermost SELECT: a sort inside a CTE does
    # not guarantee the order of the rows the outer query returns.
    db_debug.execute_sql(sql)

run_1_1b()

(930, 'Baby Food')
(891, 'Cereal')
(837, 'Bread')
(810, 'Diapers')
(786, 'Milk')
(412, 'Peanut Butter')
(219, 'Jelly/Jam')


## Average number of customers per day

In [5]:
def run_test():
    """Print the average of the daily maximum customerNumber.

    For each date from 2024-01-02 through 2024-01-15 the query takes
    MAX(customerNumber) as that day's customer count, then prints the
    average across days rounded to zero decimals.
    NOTE(review): presumably customer numbers restart at 1 each day and
    have no gaps; confirm against the data generator.
    """
    avg_customers_query = '''
            WITH customers_per_day AS (
                SELECT date, MAX(customerNumber) as numCustomers
                FROM transactions_sales
                WHERE (date >= '2024-01-02') AND (date <= '2024-01-15')
                GROUP BY date
            )
            SELECT 'Average Customer Count Per Day', ROUND(AVG(numCustomers), 0)
            FROM customers_per_day
        '''

    # db_debug prints every row of the result set.
    db_debug.execute_sql(avg_customers_query)

run_test()

('Average Customer Count Per Day', 1068.0)


## Average Number of Items Sold Per Day

In [6]:
def run_test():
    """Print the average number of sales rows per day.

    Counts transactions_sales rows for each date from 2024-01-02 through
    2024-01-15 and prints the average of those daily counts rounded to
    zero decimals. The CTE is named customers_per_day in the SQL even
    though it holds item counts.
    """
    avg_items_query = '''
            WITH customers_per_day AS (
                SELECT date, COUNT(*) as itemsSold
                FROM transactions_sales
                WHERE (date >= '2024-01-02') AND (date <= '2024-01-15')
                GROUP BY date
            )
            SELECT 'Average Number of Items Sold Per Day', ROUND(AVG(itemsSold), 0)
            FROM customers_per_day
        '''

    # db_debug prints every row of the result set.
    db_debug.execute_sql(avg_items_query)

run_test()

('Average Number of Items Sold Per Day', 48126.0)


## Minimum and Maximum for non-special items

In [7]:
def run_test():
    """Print the minimum and maximum per-day sales among non-special items.

    For every SKU whose product type is not one of the seven special types,
    the query counts its sales rows dated 2024-01-02 through 2024-01-15,
    divides by 14 days, and rounds to zero decimals. The smallest and
    largest of those rounded averages are printed.
    """
    sql = '''
            WITH items_sold AS (
                SELECT sku
                FROM transactions_sales
                WHERE (date >= '2024-01-02') AND (date <= '2024-01-15')
            ),
            selected_products AS (
                SELECT *
                FROM products AS p
                WHERE p.product_type NOT IN ('Milk', 'Cereal', 'Diapers', 'Baby Food', 'Bread', 'Peanut Butter', 'Jelly/Jam')
            ),
            items_sold_count AS (
                SELECT sku, COUNT(*) as count
                FROM items_sold
                GROUP BY sku
            ),
            filtered_items_list AS (
                SELECT isc.sku, ROUND(isc.count/14.0, 0) AS count, sp.product_type
                FROM items_sold_count AS isc
                JOIN selected_products AS sp USING(sku)
            )
            SELECT 'Minimum:', MIN(fil.count), 'Maximum', MAX(fil.count) FROM filtered_items_list AS fil
        '''

    # Dividing by 14.0 forces real division. With the integer 14, the
    # quotient was truncated before ROUND ever saw it, so ROUND did nothing.
    db_debug.execute_sql(sql)

run_test()

('Minimum:', 18.0, 'Maximum', 27.0)


## Count per Item Type

In [8]:
def run_test():
    """Print how many products exist for each special product type.

    Counts rows of the products table grouped by product_type, restricted
    to the seven special types, and prints (count, product_type) rows.
    """
    products_per_type_query = '''
            SELECT COUNT(*), p.product_type
            FROM products AS p
            WHERE p.product_type IN ('Milk', 'Cereal', 'Diapers', 'Baby Food', 'Bread', 'Peanut Butter', 'Jelly/Jam')
            GROUP BY p.product_type
        '''

    # db_debug prints every row of the result set.
    db_debug.execute_sql(products_per_type_query)

run_test()

(162, 'Baby Food')
(48, 'Bread')
(93, 'Cereal')
(82, 'Diapers')
(4, 'Jelly/Jam')
(6, 'Milk')
(20, 'Peanut Butter')
