## Data Warehousing HW 3 - Grocery Store Data Profiling
## Anthony Ung, Sean Jerzewski, Gideon Kipkorir

### 0. Connecting to Database

In [15]:
import sqlite3 as lite
import csv

def GroceryDatabaseAccess():
    GroceryDatabaseAccess.grocerDBConnection
    GroceryDatabaseAccess.groceryDBCursor
    GroceryDatabaseAccess.salesTransactionsToCommitCount
    GroceryDatabaseAccess.maxTransactionsBeforeCommit

def init():
    GroceryDatabaseAccess.groceryDBConnection = None
    GroceryDatabaseAccess.groceryDBCursor = None
    GroceryDatabaseAccess.salesTransactionsToCommitCount = 0   
    GroceryDatabaseAccess.maxTransactionsBeforeCommit = 10000 

def run():
    print("Connecting to the grocerydb database")
    con = lite.connect(r'store.db')
    print('Database successfully connected to')
    GroceryDatabaseAccess.groceryDBConnection = con
    GroceryDatabaseAccess.groceryDBCursor = con.cursor()

def build_products_table():
    sql = 'DROP TABLE IF EXISTS products'
    GroceryDatabaseAccess.groceryDBCursor.execute(sql)
    
    sql = 'CREATE TABLE products(sku INT, product_name TEXT, product_type TEXT)'
    GroceryDatabaseAccess.groceryDBCursor.execute(sql)

    csv.register_dialect('piper', delimiter='|', quoting=csv.QUOTE_NONE)
    
    with open('Products1.txt', 'r') as csvfile:
        i = 0
        
        for row in csv.DictReader(csvfile, dialect='piper'):
            sku = row.get('SKU')
            product_name = row.get('Product Name')
            product_type = row.get('itemType')
            GroceryDatabaseAccess.groceryDBCursor.execute('insert into products values (?, ?, ?)',
                                                      (sku,product_name, product_type))
            i += 1
            if i % 10000 == 0:
                GroceryDatabaseAccess.groceryDBConnection.commit()
                print(f"Committed row {i}")
            
        GroceryDatabaseAccess.groceryDBConnection.commit()
        print(f"Committed row {i}")

run()

Connecting to the grocerydb database
Database successfully connected to


### 0.1. Build Transactions Table

In [None]:
from datetime import date, datetime, timedelta


def test_dates():
    print('''
            This should print the first 5 and last 5 dates.
            I need subqueries because in SQLite, ORDER BY comes after UNION
                and I need subqueries to order by ASC and DESC in the two parts independently.
            In this case, we are missing the dates '2024-01-01' and '2024-12-31'.
        ''')

    
    sql =   '''
                SELECT * FROM
                    (SELECT DISTINCT date
                    FROM sales_transactions
                    ORDER BY date ASC
                    LIMIT 5
                    )

                UNION

                SELECT * FROM
                    (SELECT DISTINCT date
                    FROM sales_transactions
                    ORDER BY date DESC
                    LIMIT 5
                    )
            '''
    results = GroceryDatabaseAccess.groceryDBCursor.execute(sql)
    for row in results:
        print(row)

def build_transactions_table():
    cur = GroceryDatabaseAccess.groceryDBCursor
    
    sql = "DROP TABLE IF EXISTS transacs"
    cur.execute(sql)

    sql = '''
            CREATE TABLE transacs(textDate VARCHAR(8), transacNum INT, total FLOAT)
        '''
    cur.execute(sql)

    current_date = date(2024, 1, 1)
    end_date = date(2024, 12, 31)

    '''
    while(current_date <= end_date):

        sql = ''
        
        current_date += timedelta(1)
    '''
    
def query():
    cur = GroceryDatabaseAccess.groceryDBCursor
    con = GroceryDatabaseAccess.groceryDBConnection
    
    
    current_date = date(2024, 1, 1)
    end_date = date(2024, 12, 31)
    
    while(current_date <= end_date):
        sql = '''
                SELECT date, customerNumber, ROUND(SUM(salesPrice),2) 
                FROM sales_transactions 
                WHERE date == (?) 
                GROUP BY customerNumber
            '''
        i = 0
        
        results = cur.execute(sql, (current_date,))
        print(results)
        for row in results:
            i += 1
            data = (row[0], row[1], row[2])
            cur.execute('INSERT INTO transacs VALUES (?, ?, ?)', data)
        con.commit()
        print(f'{datetime.now()} - Committed transaction results for date {current_date} - {i} records committed')
        current_date += timedelta(1)
    

def run_0_1():
    #test_dates()
    build_transactions_table()
    query()

run_0_1()

  results = cur.execute(sql, (current_date,))


<sqlite3.Cursor object at 0x000002117A8882C0>
2025-02-14 12:40:51.511320 - Committed transaction results for date 2024-01-01 - 0 records committed
<sqlite3.Cursor object at 0x000002117A8882C0>
2025-02-14 12:40:52.537854 - Committed transaction results for date 2024-01-02 - 1 records committed
<sqlite3.Cursor object at 0x000002117A8882C0>
2025-02-14 12:40:53.535723 - Committed transaction results for date 2024-01-03 - 1 records committed
<sqlite3.Cursor object at 0x000002117A8882C0>
2025-02-14 12:40:54.540722 - Committed transaction results for date 2024-01-04 - 1 records committed
<sqlite3.Cursor object at 0x000002117A8882C0>
2025-02-14 12:40:55.584249 - Committed transaction results for date 2024-01-05 - 1 records committed
<sqlite3.Cursor object at 0x000002117A8882C0>
2025-02-14 12:40:56.726794 - Committed transaction results for date 2024-01-06 - 1 records committed
<sqlite3.Cursor object at 0x000002117A8882C0>
2025-02-14 12:40:57.881920 - Committed transaction results for date 2024

In [62]:
cur = GroceryDatabaseAccess.groceryDBCursor
con = GroceryDatabaseAccess.groceryDBConnection

current_date = date(2024, 1, 2)

sql = '''
        SELECT date, customerNumber, ROUND(SUM(salesPrice),2) 
        FROM sales_transactions 
        WHERE date == '2024-01-02' 
        GROUP BY customerNumber
    '''
results = cur.execute(sql)
for row in results:
    print(row)

('2024-01-02', 1, 125.63)
('2024-01-02', 2, 341.71)
('2024-01-02', 3, 28.62)
('2024-01-02', 4, 50.7)
('2024-01-02', 5, 104.15)
('2024-01-02', 6, 62.47)
('2024-01-02', 7, 2.27)
('2024-01-02', 8, 338.16)
('2024-01-02', 9, 44.35)
('2024-01-02', 10, 9.58)
('2024-01-02', 11, 21.79)
('2024-01-02', 12, 26.59)
('2024-01-02', 13, 292.6)
('2024-01-02', 14, 26.18)
('2024-01-02', 15, 167.29)
('2024-01-02', 16, 4.79)
('2024-01-02', 17, 217.43)
('2024-01-02', 18, 27.1)
('2024-01-02', 19, 326.71)
('2024-01-02', 20, 31.03)
('2024-01-02', 21, 210.54)
('2024-01-02', 22, 366.5)
('2024-01-02', 23, 123.19)
('2024-01-02', 24, 203.77)
('2024-01-02', 25, 248.71)
('2024-01-02', 26, 310.14)
('2024-01-02', 27, 83.36)
('2024-01-02', 28, 282.05)
('2024-01-02', 29, 359.86)
('2024-01-02', 30, 38.34)
('2024-01-02', 31, 210.47)
('2024-01-02', 32, 226.67)
('2024-01-02', 33, 69.48)
('2024-01-02', 34, 327.13)
('2024-01-02', 35, 150.74)
('2024-01-02', 36, 369.06)
('2024-01-02', 37, 283.68)
('2024-01-02', 38, 144.9)
('2024

In [51]:
def run_0_2():
    cur = GroceryDatabaseAccess.groceryDBCursor
    con = GroceryDatabaseAccess.groceryDBConnection

    sql = '''
            SELECT *
            FROM transacs
            WHERE (textDate >= '2024-01-02') AND (textDate <= '2024-01-16')
        '''

    results = cur.execute(sql)
    for row in results:
        print(row)
    
    sql = '''
            SELECT COUNT(*) AS count, product_type
            FROM sales_transactions
            JOIN products on sales_transactions.sku == products.sku
            WHERE (date >= '2024-01-02') AND (date <= '2024-01-16')
            GROUP BY product_type
            ORDER BY count DESC
        '''

    results = cur.execute(sql)
    for row in results:
        print(row)

run_0_2()

('2024-01-02', 1, 125.63)
('2024-01-03', 1, 77.65)
('2024-01-04', 1, 225.8)
('2024-01-05', 1, 117.47)
('2024-01-06', 1, 214.14)
('2024-01-07', 1, 9.94)
('2024-01-08', 1, 119.77)
('2024-01-09', 1, 160.81)
('2024-01-10', 1, 223.52)
('2024-01-11', 1, 329.02)
('2024-01-12', 1, 201.94)
('2024-01-13', 1, 132.23)
('2024-01-14', 1, 330.68)
('2024-01-15', 1, 183.08)
('2024-01-16', 1, 36.19)
(98565, '')
(57008, 'Baby Food')
(55174, 'Snacks')
(44279, 'Coffee/Creamer')
(39742, 'Fresh Vegetables')
(36631, 'Cereal')
(29993, 'Diapers')
(25429, 'Baked Goods Other than Bread')
(23819, 'Bread')
(23394, 'Fresh Fruit')
(19766, 'Pasta/Noodles')
(15022, 'Cheese')
(14991, 'Soup')
(13391, 'Cookies')
(13058, 'Milk')
(11371, 'Frozen Food')
(10253, 'Canned Vegetables')
(8527, 'Frozen Vegetables')
(8393, 'Nuts')
(8292, 'Chocolate Candy')
(8190, 'Peanut Butter')
(7081, 'Canned Goods')
(6772, 'Dips')
(6666, 'Deli Meats')
(6655, 'Cooking Oil')
(6639, 'Popsicles')
(6623, 'Pizza')
(5103, 'French Fries')
(5075, 'Cleane