## Database Build Script
## Anthony Ung, Sean Jerzewski, Gideon Kipkorir

Any new modifications to the database will happen in this Notebook.

Modifications to Chelsea Cantone's Code
1. Changed from using Postgres to SQLite3.
2. Created classes of functions to delineate separate modules.
3. Created a Product class with the necessary attributes. (Chelsea was using parallel arrays)

In [1]:
import sqlite3 as lite
import csv
from datetime import datetime, date, timedelta
from decimal import Decimal
import random

'''
    If you are building the grocery database, 
        you should only touch
        ARGS, TABLE_DEFINITIONS, and params.

    If you get an error message, 
        set all three values in ARGS to True
        and then re-run the script.

    It took about 10 minutes to do all 3
        on Anthony Ung's Thinkpad P14s
        with Ryzen 7 8840HS, 32GB RAM, and 4TB 990 Pro

    The ARGS dict, as-is, will create the Products table
        and then populate the sales transactions data.

    The populating of the Customers Transactions Data was more useful
        in a later HW where we did some profiling.
        The last operation takes the majority (~8 min.) of the stated
            10 minutes.
'''

# Switches for the three build stages. Each stage checks its own flag and
# returns immediately (after printing a notice) when the flag is False.
ARGS = {
    'Create Products Table': True,
    'Populate Sales Transactions Data': True,
    'Populate Customers Transactions Data': False
}

# CREATE TABLE statements keyed by table name.
# Dates are written as 'YYYY-MM-DD' strings (10 characters), so every date
# column is declared TEXT and every money column REAL. SQLite gives
# VARCHAR(n)/FLOAT the same TEXT/REAL affinity, so stored values are
# unaffected by using the native names consistently.
TABLE_DEFINITIONS = {
    'products': 'CREATE TABLE products(sku INT, product_name TEXT, product_type TEXT)',
    'transactions_sales': 'CREATE TABLE transactions_sales(date TEXT, customerNumber INT, sku INT, salesPrice REAL)',
    'transactions_customers': 'CREATE TABLE transactions_customers(date TEXT, customerNumber INT, total REAL)'
}


class params:
    """Tunable settings for the build, grouped into nested namespaces.

    Nothing here is instantiated; values are read as
    ``params.<namespace>.<name>``.
    """

    class group:
        # Factor every product's base price is multiplied by.
        price_multiplier: float = 1.2
        # Inclusive bounds for the random number of customers per day.
        customers_low: int = 1020
        customers_high: int = 1060
        # Added to both customer bounds on Saturdays and Sundays.
        weekend_increase: int = 75
        # Upper bound for the random target basket size of one customer.
        maximum_items: int = 90

    class simulation:
        # First and last simulated day (both inclusive).
        start_date: date = date(year=2024, month=1, day=1)
        end_date: date = date(year=2024, month=12, day=31)

    class debug:
        # A progress line is printed once every this many simulated days.
        display_daily_commits: int = 14

        
class db:
    """Holder for the single SQLite connection shared by the build stages.

    The functions are called on the class itself (``db.connect()``); no
    instance is ever created.

    Attributes:
        con: the open ``sqlite3`` connection, or None before ``connect()``.
        cur: a cursor on ``con``, or None before ``connect()``.
        commit_pending: counter of writes since the last ``commit()``.
            Callers increment it; ``commit()`` and ``close()`` reset it.
    """
    con = None
    cur = None
    commit_pending = 0

    def connect():
        """Open ``store.db`` in the working directory and create the cursor."""
        db.con = lite.connect(r'store.db')
        db.cur = db.con.cursor()
        print('Database Successfully Connected To')

    def execute_sql(sql):
        """Execute one SQL statement that takes no bound parameters.

        Raises:
            AssertionError: if ``sql`` is not a string.
        """
        assert isinstance(sql, str), \
            f"""Error! This function expected a string.
                Got {type(sql)} instead"""
        db.cur.execute(sql)

    def execute_sql_values(sql, values):
        """Execute one SQL statement, binding ``values`` to its placeholders.

        Raises:
            AssertionError: if ``sql`` is not a string or ``values`` is not
                a tuple.
        """
        assert isinstance(sql, str), \
            f"""Error! This function expected a string.
                Got {type(sql)} instead"""
        assert isinstance(values, tuple), \
            f"""Error! This function expected a tuple.
                Got {type(values)} instead"""
        db.cur.execute(sql, values)

    def commit():
        """Commit the open transaction and reset the pending-write counter."""
        db.con.commit()
        db.commit_pending = 0

    def close():
        """Commit any outstanding work, then close the connection."""
        db.con.commit()
        # The commit above flushed everything, so nothing is pending anymore.
        db.commit_pending = 0
        db.con.close()
        print('Database Connection Closed')


class db_debug():
    """Ad-hoc query helpers that print every result row.

    In the original author's testing this class did not play nicely with
    the ``db`` class: even after ``db.connect()`` the queries failed with
    "database is closed" errors. Each call therefore opens (and closes) its
    own connection to ``store.db``; these helpers are meant to be used very
    rarely, so the extra connections are acceptable.
    """

    def _run_and_print(sql, values=()):
        """Run ``sql`` on a private connection and print each result row."""
        con = lite.connect(r'store.db')
        try:
            for row in con.cursor().execute(sql, values).fetchall():
                print(row)
        finally:
            # Always release the connection, even when the query fails.
            con.close()

    def execute_sql(sql):
        """Run a parameterless query and print its rows.

        Raises:
            AssertionError: if ``sql`` is not a string.
        """
        assert isinstance(sql, str), \
            f"""Error! This function expected a string.
                Got {type(sql)} instead"""
        db_debug._run_and_print(sql)

    def execute_sql_values(sql, values):
        """Run a query with bound parameters and print its rows.

        Raises:
            AssertionError: if ``sql`` is not a string or ``values`` is not
                a tuple.
        """
        assert isinstance(sql, str), \
            f"""Error! This function expected a string.
                Got {type(sql)} instead"""
        assert isinstance(values, tuple), \
            f"""Error! This function expected a tuple.
                Got {type(values)} instead"""
        db_debug._run_and_print(sql, values)
        

def create_products_table():
    """(Re)create the ``products`` table and load it from ``Products1.txt``.

    Does nothing except print a notice when the 'Create Products Table'
    flag in ARGS is False. Otherwise the table is dropped, recreated, and
    filled with one row per line of the pipe-delimited products file.
    Inserts are committed every 10000 rows and once more at the end.
    """
    if not ARGS['Create Products Table']:
        print("You don't want to create the Products table")
        return
    db.execute_sql('DROP TABLE IF EXISTS products')
    db.execute_sql(TABLE_DEFINITIONS['products'])
    db.commit()

    csv.register_dialect('piper', delimiter='|', quoting=csv.QUOTE_NONE)

    inserted = 0
    # newline='' lets the csv module do its own line-ending handling.
    with open('Products1.txt', 'r', newline='') as csvfile:
        for inserted, row in enumerate(csv.DictReader(csvfile, dialect='piper'), start=1):
            # The header has been spelled both with and without a space in
            # this file's readers; accept either so the name is not
            # silently stored as NULL.
            product_name = row.get('Product Name')
            if product_name is None:
                product_name = row.get('ProductName')

            db.execute_sql_values(
                sql='insert into products values (?, ?, ?)',
                values=(row.get('SKU'), product_name, row.get('itemType')))

            if inserted % 10000 == 0:
                db.commit()
                print(f"Committed {inserted} products")

    db.commit()
    print(f"Committed {inserted} products")


class products:
    """In-memory product data held as class-level lists.

    All lists start empty. ``sales_transactions_utils.build_products_arrays``
    appends to them; lists that belong together are index-aligned (entry
    ``i`` of each describes the same product).
    """
    # Every product in the file, one entry per row.
    manufacturer_list: list = []
    product_name_list: list = []
    size_list: list = []
    item_type_list: list = []
    sku_list: list = []
    base_price_list: list = []  # price after the group multiplier

    # Per-category price/SKU pairs for the item types the simulation
    # picks from by name.
    milk_price: list = []
    milk_sku: list = []

    cereal_price: list = []
    cereal_sku: list = []

    baby_food_price: list = []
    baby_food_sku: list = []

    diapers_price: list = []
    diapers_sku: list = []

    bread_price: list = []
    bread_sku: list = []

    peanut_butter_price: list = []
    peanut_butter_sku: list = []

    jelly_jam_price: list = []
    jelly_jam_sku: list = []

    # Not written to by any code visible in this file.
    all_items: list = []


class sales_transactions_utils:
    """Load the product file into ``products`` and pick random items from it.

    The functions are called on the class itself; no instance is created.
    """

    def _category_lists():
        """Map each tracked itemType to its (price list, sku list) pair."""
        return {
            'Milk': (products.milk_price, products.milk_sku),
            'Cereal': (products.cereal_price, products.cereal_sku),
            'Baby Food': (products.baby_food_price, products.baby_food_sku),
            'Diapers': (products.diapers_price, products.diapers_sku),
            'Bread': (products.bread_price, products.bread_sku),
            'Peanut Butter': (products.peanut_butter_price, products.peanut_butter_sku),
            'Jelly/Jam': (products.jelly_jam_price, products.jelly_jam_sku),
        }

    def _pick(sku_list, price_list):
        """Return the (sku, price) found at one random index of two parallel lists.

        Raises:
            ValueError: if the lists are empty (nothing was loaded for the
                category).
        """
        if not sku_list:
            raise ValueError(
                'No products loaded for this category; '
                'was build_products_arrays() run on a complete file?')
        random_index = random.randrange(len(sku_list))
        return sku_list[random_index], price_list[random_index]

    def build_products_arrays():
        """Read ``Products1.txt`` and append every row to the ``products`` lists.

        The price stored is the file's BasePrice (leading/trailing '$'
        stripped) multiplied by ``params.group.price_multiplier``. Rows whose
        itemType is one of the tracked categories are also appended to that
        category's price/SKU lists.

        Raises:
            KeyError: if the file has no BasePrice, itemType or SKU column.
        """
        csv.register_dialect('piper', delimiter='|', quoting=csv.QUOTE_NONE)
        by_type = sales_transactions_utils._category_lists()

        # newline='' lets the csv module do its own line-ending handling.
        with open('Products1.txt', 'r', newline='') as csvfile:
            for row in csv.DictReader(csvfile, dialect='piper'):
                price = float(Decimal(row['BasePrice'].strip('$')))
                price = price * params.group.price_multiplier
                sku = row['SKU']
                item_type = row['itemType']

                # The header has been spelled both with and without a space
                # in this file's readers; accept either spelling.
                product_name = row.get('ProductName')
                if product_name is None:
                    product_name = row.get('Product Name')

                products.manufacturer_list.append(row.get('Manufacturer'))
                products.product_name_list.append(product_name)
                products.size_list.append(row.get('Size'))
                products.item_type_list.append(item_type)
                products.sku_list.append(sku)
                products.base_price_list.append(price)

                category = by_type.get(item_type)
                if category is not None:
                    category_prices, category_skus = category
                    category_prices.append(price)
                    category_skus.append(sku)

    def get_milk_sku_and_price():
        """Return a random (sku, price) pair from the Milk category."""
        return sales_transactions_utils._pick(products.milk_sku, products.milk_price)

    def cereal_sku_and_price():
        """Return a random (sku, price) pair from the Cereal category."""
        return sales_transactions_utils._pick(products.cereal_sku, products.cereal_price)

    def baby_food_sku_and_price():
        """Return a random (sku, price) pair from the Baby Food category."""
        return sales_transactions_utils._pick(products.baby_food_sku, products.baby_food_price)

    def diapers_sku_and_price():
        """Return a random (sku, price) pair from the Diapers category."""
        return sales_transactions_utils._pick(products.diapers_sku, products.diapers_price)

    def bread_sku_and_price():
        """Return a random (sku, price) pair from the Bread category."""
        return sales_transactions_utils._pick(products.bread_sku, products.bread_price)

    def peanut_butter_sku_and_price():
        """Return a random (sku, price) pair from the Peanut Butter category."""
        return sales_transactions_utils._pick(products.peanut_butter_sku, products.peanut_butter_price)

    def jelly_jam_sku_and_price():
        """Return a random (sku, price) pair from the Jelly/Jam category."""
        return sales_transactions_utils._pick(products.jelly_jam_sku, products.jelly_jam_price)

    def get_random_item_sku_and_price():
        """Return a random (sku, price) pair drawn from the whole catalogue."""
        return sales_transactions_utils._pick(products.sku_list, products.base_price_list)


class simulate:
    """Generate synthetic sales and write them to ``transactions_sales``.

    The functions are called on the class itself; no instance is created.

    Attributes:
        num_days: number of days simulated so far.
        start_date: first simulated day, taken from ``params.simulation``.
        end_date: last simulated day (inclusive), taken from
            ``params.simulation``.
    """
    num_days = 0
    start_date = params.simulation.start_date
    end_date = params.simulation.end_date

    def run():
        """Recreate ``transactions_sales`` and simulate every day in range.

        Does nothing except print a notice when the
        'Populate Sales Transactions Data' flag in ARGS is False.
        """
        if not ARGS['Populate Sales Transactions Data']:
            print("You don't want to populate the Sales Transactions table")
            return

        db.execute_sql('DROP TABLE IF EXISTS transactions_sales')
        db.execute_sql(TABLE_DEFINITIONS['transactions_sales'])
        db.commit()

        current_date = simulate.start_date
        while current_date <= simulate.end_date:
            simulate.simulate_one_day(current_date)
            current_date += timedelta(1)

    def simulate_one_day(current_date):
        """Simulate all customers of one day, then commit.

        The customer count is drawn uniformly between the configured low and
        high bounds; on Saturday and Sunday both bounds are raised by the
        weekend increase. A progress line is printed on the first day, the
        last day, and every ``params.debug.display_daily_commits``-th day.

        Raises:
            AssertionError: if ``current_date`` is not a ``date``.
        """
        assert isinstance(current_date, date), \
            f"""Error! This function expected a date.
                Got {type(current_date)} instead"""

        simulate.num_days += 1
        # weekday() is 5 for Saturday and 6 for Sunday.
        increase = params.group.weekend_increase if current_date.weekday() >= 5 else 0

        date_str = current_date.strftime('%Y-%m-%d')

        daily_customers = random.randint(params.group.customers_low + increase,
                                         params.group.customers_high + increase)

        # Customer numbers start at 1 and restart every day.
        for customer_number in range(1, daily_customers + 1):
            simulate.simulate_one_customer(date_str, customer_number)

        if (simulate.num_days % params.debug.display_daily_commits == 0) \
                or (current_date == simulate.start_date) \
                or (current_date == simulate.end_date):
            print(f'{datetime.now()} - {date_str} - {db.commit_pending} records created and committing')
        db.commit()

    def _chance(percent):
        """Return True with a probability of ``percent`` in 100."""
        return random.randint(1, 100) <= percent

    def simulate_one_customer(date_str, customer_number):
        """Write one customer's basket for the given day.

        A target basket size is drawn between 1 and
        ``params.group.maximum_items``. The correlated purchases below are
        rolled first; random catalogue items then fill the basket up to the
        target. The correlated purchases are never trimmed, so a basket can
        end up larger than its target.

        Raises:
            AssertionError: if ``date_str`` is not a string or
                ``customer_number`` is not an int.
        """
        assert isinstance(date_str, str), \
            f"""Error! This function expected a string.
                Got {type(date_str)} instead"""
        assert isinstance(customer_number, int), \
            f"""Error! This function expected an int.
                Got {type(customer_number)} instead"""

        utils = sales_transactions_utils
        chance = simulate._chance

        current_num_items = 0
        purchased_num_items = random.randint(1, params.group.maximum_items)

        def buy(pick_sku_and_price):
            """Pick one item with the given picker and record the sale."""
            nonlocal current_num_items
            sku, price = pick_sku_and_price()
            simulate.writeSalesTransaction(date_str, customer_number, sku, price)
            current_num_items += 1

        # Milk: 70%. Cereal: 50% for milk buyers, 5% otherwise.
        if chance(70):
            buy(utils.get_milk_sku_and_price)
            if chance(50):
                buy(utils.cereal_sku_and_price)
        elif chance(5):
            buy(utils.cereal_sku_and_price)

        # Baby food: 20%. Diapers: 80% for baby-food buyers, 1% otherwise.
        if chance(20):
            buy(utils.baby_food_sku_and_price)
            if chance(80):
                buy(utils.diapers_sku_and_price)
        elif chance(1):
            buy(utils.diapers_sku_and_price)

        # Bread: 50%, independent of everything else.
        if chance(50):
            buy(utils.bread_sku_and_price)

        # Peanut butter: 10%. Jelly/jam: 90% for PB buyers, 5% otherwise.
        if chance(10):
            buy(utils.peanut_butter_sku_and_price)
            if chance(90):
                buy(utils.jelly_jam_sku_and_price)
        elif chance(5):
            buy(utils.jelly_jam_sku_and_price)

        # Fill the rest of the basket with items from the whole catalogue.
        while current_num_items < purchased_num_items:
            buy(utils.get_random_item_sku_and_price)

    def writeSalesTransaction(date: str, customerNumber: int, sku: int, salesPrice: float):
        """Insert one row into ``transactions_sales``.

        ``date`` is the day as a 'YYYY-MM-DD' string. A failing insert is
        reported on stdout and otherwise ignored so that one bad row does
        not abort the whole simulation; only successful inserts are counted
        in ``db.commit_pending``.
        """
        try:
            db.execute_sql_values('insert into transactions_sales values (?, ?, ?, ?)',
                                  (date, customerNumber, sku, salesPrice))
        except Exception as err:
            print("Error writing transactions_sales database table", err)
        else:
            db.commit_pending += 1


class customers_transactions_table():
    """Build ``transactions_customers``: one total per customer per day.

    Attributes:
        num_days: number of days processed so far; drives the progress
            output.
    """
    num_days = 0

    def build():
        """Recreate ``transactions_customers`` from ``transactions_sales``.

        Does nothing except print a notice when the
        'Populate Customers Transactions Data' flag in ARGS is False.
        Otherwise, for every day of the simulated range, the day's sales are
        summed per customer (rounded to 2 decimals), inserted, and
        committed. A progress line is printed on the first day, the last
        day, and every ``params.debug.display_daily_commits``-th day.
        """
        if not ARGS['Populate Customers Transactions Data']:
            print("You don't want to populate the customer transactions table")
            return

        db.execute_sql('DROP TABLE IF EXISTS transactions_customers')
        db.execute_sql(TABLE_DEFINITIONS['transactions_customers'])
        db.commit()

        # The statement is the same for every day; only the bound date varies.
        sql = '''
                WITH filter AS (
                    SELECT *
                    FROM transactions_sales
                    WHERE date == ?
                ),
                customer_sales_data AS (
                    SELECT date, customerNumber, ROUND(SUM(salesPrice),2)
                    FROM filter
                    GROUP BY customerNumber
                )
                SELECT *
                FROM customer_sales_data
            '''

        # Cover exactly the range the sales simulation is configured for.
        current_date = simulate.start_date
        end_date = simulate.end_date

        while current_date <= end_date:
            # Bind the date as the same 'YYYY-MM-DD' text the sales rows
            # store, rather than relying on sqlite3's implicit date adapter.
            date_str = current_date.strftime('%Y-%m-%d')

            results = db.cur.execute(sql, (date_str,)).fetchall()
            for row in results:
                db.execute_sql_values('INSERT INTO transactions_customers VALUES (?, ?, ?)',
                                      (row[0], row[1], row[2]))
            db.commit()
            num_records = len(results)

            if (customers_transactions_table.num_days % params.debug.display_daily_commits == 0) \
                    or (current_date == simulate.start_date) \
                    or (current_date == simulate.end_date):
                print(f'{datetime.now()} - {current_date} - {num_records} records committed for customer transactions')

            customers_transactions_table.num_days += 1
            current_date += timedelta(1)

        
def run():
    """Run every build stage in order on one database connection.

    Each stage consults its own ARGS flag and skips itself when disabled.
    The product arrays are always loaded. The connection is closed even
    when a stage raises, so a failed run does not leave ``store.db`` open.
    """
    db.connect()
    try:
        create_products_table()
        sales_transactions_utils.build_products_arrays()
        simulate.run()
        customers_transactions_table.build()
    finally:
        db.close()

run()

Database Successfully Connected To
Committed 2075 products
2025-02-24 11:04:18.022317 - 2024-01-01 - 48096 records created and committing
2025-02-24 11:04:20.299472 - 2024-01-14 - 51150 records created and committing
2025-02-24 11:04:22.711855 - 2024-01-28 - 51719 records created and committing
2025-02-24 11:04:25.132186 - 2024-02-11 - 50251 records created and committing
2025-02-24 11:04:27.783498 - 2024-02-25 - 49028 records created and committing
2025-02-24 11:04:30.770073 - 2024-03-10 - 50686 records created and committing
2025-02-24 11:04:33.529453 - 2024-03-24 - 50602 records created and committing
2025-02-24 11:04:36.327079 - 2024-04-07 - 50337 records created and committing
2025-02-24 11:04:38.877095 - 2024-04-21 - 51701 records created and committing
2025-02-24 11:04:41.537192 - 2024-05-05 - 49641 records created and committing
2025-02-24 11:04:43.993239 - 2024-05-19 - 51706 records created and committing
2025-02-24 11:04:46.557086 - 2024-06-02 - 53133 records created and comm