## Data Warehousing HW 3 - Grocery Store Data Profiling
## Anthony Ung, Sean Jerzewski, Gideon Kipkorir

### 0. Connecting to Database

In [22]:
import sqlite3 as lite
import csv

def GroceryDatabaseAccess():
    GroceryDatabaseAccess.grocerDBConnection
    GroceryDatabaseAccess.groceryDBCursor
    GroceryDatabaseAccess.salesTransactionsToCommitCount
    GroceryDatabaseAccess.maxTransactionsBeforeCommit

def init():
    GroceryDatabaseAccess.groceryDBConnection = None
    GroceryDatabaseAccess.groceryDBCursor = None
    GroceryDatabaseAccess.salesTransactionsToCommitCount = 0   
    GroceryDatabaseAccess.maxTransactionsBeforeCommit = 10000 

def run():
    print("Connecting to the grocerydb database")
    con = lite.connect(r'store.db')
    print('Database successfully connected to')
    GroceryDatabaseAccess.groceryDBConnection = con
    GroceryDatabaseAccess.groceryDBCursor = con.cursor()

def build_products_table():
    sql = 'DROP TABLE IF EXISTS products'
    GroceryDatabaseAccess.groceryDBCursor.execute(sql)
    
    sql = 'CREATE TABLE products(sku INT, product_name TEXT, product_type TEXT)'
    GroceryDatabaseAccess.groceryDBCursor.execute(sql)

    csv.register_dialect('piper', delimiter='|', quoting=csv.QUOTE_NONE)
    
    with open('Products1.txt', 'r') as csvfile:
        i = 0
        
        for row in csv.DictReader(csvfile, dialect='piper'):
            sku = row.get('SKU')
            product_name = row.get('Product Name')
            product_type = row.get('itemType')
            GroceryDatabaseAccess.groceryDBCursor.execute('insert into products values (?, ?, ?)',
                                                      (sku,product_name, product_type))
            i += 1
            if i % 10000 == 0:
                GroceryDatabaseAccess.groceryDBConnection.commit()
                print(f"Committed row {i}")
            
        GroceryDatabaseAccess.groceryDBConnection.commit()
        print(f"Committed row {i}")

run()
build_products_table()

Connecting to the grocerydb database
Database successfully connected to
Committed row 2075


### 0.1. Build Transactions Table

In [23]:
from datetime import date, datetime, timedelta


def test_dates():
    print('''
            This should print the first 5 and last 5 dates.
            I need subqueries because in SQLite, ORDER BY comes after UNION
                and I need subqueries to order by ASC and DESC in the two parts independently.
            In this case, we are missing the dates '2024-01-01' and '2024-12-31'.
        ''')

    
    sql =   '''
                SELECT * FROM
                    (SELECT DISTINCT date
                    FROM sales_transactions
                    ORDER BY date ASC
                    LIMIT 5
                    )

                UNION

                SELECT * FROM
                    (SELECT DISTINCT date
                    FROM sales_transactions
                    ORDER BY date DESC
                    LIMIT 5
                    )
            '''
    results = GroceryDatabaseAccess.groceryDBCursor.execute(sql)
    for row in results:
        print(row)

def build_transactions_table():
    cur = GroceryDatabaseAccess.groceryDBCursor
    
    sql = "DROP TABLE IF EXISTS transacs"
    cur.execute(sql)

    sql = '''
            CREATE TABLE transactions(date VARCHAR(8), customerNumber INT, total FLOAT)
        '''
    cur.execute(sql)

    current_date = date(2024, 1, 1)
    end_date = date(2024, 12, 31)

    
def query():
    cur = GroceryDatabaseAccess.groceryDBCursor
    con = GroceryDatabaseAccess.groceryDBConnection
    
    
    current_date = date(2024, 1, 1)
    end_date = date(2024, 12, 31)
    
    num_days = 1
    
    while(current_date <= end_date):
        sql = '''
                SELECT date, customerNumber, ROUND(SUM(salesPrice),2) 
                FROM sales_transactions 
                WHERE date == (?)
                GROUP BY customerNumber
            '''
        num_records = 0
        
        results = cur.execute(sql, (current_date,)).fetchall()
        for row in results:
            num_records += 1
            data = (row[0], row[1], row[2])
            cur.execute('INSERT INTO transactions VALUES (?, ?, ?)', data)
        con.commit()

        if num_days % 30 == 1:
            print(f'{datetime.now()} - Committed transaction results for date {current_date} - {num_records} records committed')

        num_days += 1
        current_date += timedelta(1)
    

def run_0_1():
    #test_dates()
    build_transactions_table()
    query()

run_0_1()

  results = cur.execute(sql, (current_date,)).fetchall()


2025-02-14 14:48:59.549605 - Committed transaction results for date 2024-01-01 - 1052 records committed
2025-02-14 14:49:32.927967 - Committed transaction results for date 2024-01-31 - 1044 records committed
2025-02-14 14:50:05.563190 - Committed transaction results for date 2024-03-01 - 1023 records committed
2025-02-14 14:50:37.575598 - Committed transaction results for date 2024-03-31 - 1130 records committed
2025-02-14 14:51:12.133306 - Committed transaction results for date 2024-04-30 - 1042 records committed
2025-02-14 14:51:42.364399 - Committed transaction results for date 2024-05-30 - 1044 records committed
2025-02-14 14:52:13.629320 - Committed transaction results for date 2024-06-29 - 1130 records committed
2025-02-14 14:52:46.482562 - Committed transaction results for date 2024-07-29 - 1029 records committed
2025-02-14 14:53:20.241322 - Committed transaction results for date 2024-08-28 - 1058 records committed
2025-02-14 15:36:56.251112 - Committed transaction results for d

In [None]:
def test_daily_customer_counts():
    cur = GroceryDatabaseAccess.groceryDBCursor

    sql = \
        '''
            WITH customer_counts AS (
                SELECT date, COUNT (DISTINCT customerNumber)
                FROM sales_transactions
                GROUP BY date
            )
        '''


tests = \
    { \
        'Daily Customer Counts' : True
    }

if tests['Daily Customer Counts']:
    test_daily_customer_counts()

In [13]:
def run_0_2():
    cur = GroceryDatabaseAccess.groceryDBCursor
    con = GroceryDatabaseAccess.groceryDBConnection

    sql = '''
            SELECT COUNT(*) AS count
            FROM transacs
            WHERE (textDate >= '2024-01-02') AND (textDate <= '2024-01-16')
        '''

    results = cur.execute(sql)
    for row in results:
        print(row)
    
    sql = '''
            SELECT COUNT(*) AS count
            FROM sales_transactions
            JOIN products on sales_transactions.sku == products.sku
            WHERE (date >= '2024-01-02') AND (date <= '2024-01-16') AND (product_type == 'Milk')
        '''

    results = cur.execute(sql)
    for row in results:
        print(row)

run_0_2()

(15922,)
(13058,)
