## Database Build Script v3
## Anthony Ung, Sean Jerzewski, Gideon Kipkorir

Any new modifications to the database will happen in this Notebook.

Modifications to Chelsea Cantone's Code from Week 2
1. Changed from using Postgres to SQLite3.
2. Created classes of functions to deliniate separate modules.
3. Created classes of products

Inspiration from Harini's Code from Week 4
1. Table Schemas

In [31]:
import sqlite3 as lite
import csv
from datetime import datetime, date, timedelta
from decimal import Decimal
import random


'''
    If you are building the grocery database, 
        you should only touch
        ARGS, TABLE_DEFINITIONS, and params.

    If you get an error message, 
        set all three values in ARGS to True
        and then re-run the script.

    It took about 10 minutes
        on Anthony Ung's Thinkpad P14s
        with Ryzen 7 8840HS, 32GB RAM, and 4TB 990 Pro

    The third arg was usful in a later HW when we did some profiling
'''


class ARGS:
    CREATE_PRODUCTS_TABLE = True
    POPULATE_FACTS_TABLE = True


'''
    TABLE_DEFINITIONS is a dict as follows:
        Key - the name of the table in the database
        Value - the CREATE TABLE statement for the table
'''
TABLE_DEFINITIONS = {
    'products': \
            'CREATE TABLE products(' \
                    'sku INT,' \
                    'product_name TEXT, ' \
                    'product_type TEXT, ' \
                    'manufacturer TEXT, ' \
                    'base_price REAL)',
    'sales_transactions': \
            'CREATE TABLE sales_transactions(' \
                    'date TEXT, ' \
                    'customer_number INT, ' \
                    'sku INT, ' \
                    'salesPrice REAL, ' \
                    'items_left INT, ' \
                    'cases_ordered INT)',
    'sales_customers': \
            'CREATE TABLE sales_customers(' \
                    'date VARCHAR(8), ' \
                    'customer_number INT, ' \
                    'num_items INT, ' \
                    'total FLOAT)',
    'sales_daily': \
            'CREATE TABLE sales_daily(' \
                    'date VARCHAR(8), ' \
                    'num_customers INT, ' \
                    'num_items INT, ' \
                    'total FLOAT)'
}


class params:

    class group:
        price_multiplier = 1.2
        customers_low = 1020
        customers_high = 1060
        weekend_increase = 75
        maximum_items = 90
    
    class simulation:
        start_date = date(2024, 1, 1)
        end_date = date(2024, 1, 12)
 
    class debug:
        display_daily_commits = 14

    '''
        Initial Stock Counts in Cases
        Numbers are based on profiling in HW 3
        The max stock level is set to 2 for items for which I expect
            to sell less than 2 cases worth over three days.
    '''
    class MAX_STOCK_LEVELS:
        milk = 16
        cereal = 2
        baby_food = 2
        diapers = 2
        bread = 3
        peanut_butter = 5
        jelly_jam = 10
        other = 7
  
    '''
        I am assuming that 
        Allows me to give special treatment 
            to different product categories later.
    '''
    class CASE_COUNT:
        generic = 12


class DEBUG:
    
    class globals:
        run_assert = True
        
    def assert_params(param, param_type):
        if not DEBUG.globals.run_assert:
            return
        assert isinstance(param, param_type), \
            "Error!" \
            f"Expected argument of {param_type}" "\n" \
            f"Got argument of {type(param)} instead." "\n" \
            f"This parameter's value is {param}" "\n"


'''
    This class provides one common point of interaction with my team's database.
    Everything that writes to the database uses this API.
'''
class db:
    
    class globals:
        con = None
        cur = None
        commit_pending = 0

    def connect():
        db.globals.con = lite.connect(r'store.db')
        db.globals.cur = db.globals.con.cursor()
        print('Database Successfully Connected To')

    def execute_sql(sql):
        DEBUG.assert_params(sql, str)
        
        db.globals.cur.execute(sql)

    def execute_sql_values(sql, values):
        DEBUG.assert_params(sql, str)
        DEBUG.assert_params(values, tuple)
        
        db.globals.cur.execute(sql, values)

    def commit():
        db.globals.con.commit()
        db.commit_pending = 0

    def close():
        db.globals.con.commit()
        db.globals.con.close()
        print('Database Connection Closed')


'''
    This API is for debugging purposes.
    In my testing, something was not playing nice with the
        existing database API when I tried to print because
        I was getting errors stating that the database connection is closed.
    These functions maintain their own connection and cursor.
'''
class db_debug():
    
    def execute_sql(sql):
        DEBUG.assert_params(sql, str)

        con = lite.connect(r'store.db')
        cur = con.cursor()
        
        results = cur.execute(sql).fetchall()
        for row in results:
            print(row)

        con.close()

    def execute_sql_values(sql, values):
        DEBUG.assert_params(values, tuple)

        con = lite.connect(r'store.db')
        cur = con.cursor()
        
        results = cur.execute(sql, values)
        for row in results:
            print(row)
        
        con.close()
        

def create_products_table():
    
    if not ARGS.CREATE_PRODUCTS_TABLE:
        print("You don't want to create the Products table")
        return
        
    db.execute_sql('DROP TABLE IF EXISTS products')
    db.execute_sql(TABLE_DEFINITIONS['products'])
    db.commit()

    csv.register_dialect('piper', delimiter='|', quoting=csv.QUOTE_NONE)
    
    with open('Products1.txt', 'r') as csvfile:
        count = 0
        
        for row in csv.DictReader(csvfile, dialect='piper'):
            sku = int(row.get('SKU'))
            product_name = row.get('Product Name')
            product_type = row.get('itemType')
            manufacturer = row.get('Manufacturer')
            base_price = row.get('BasePrice')
            
            db.execute_sql_values( \
                    sql='insert into products values (?, ?, ?, ?, ?)',\
                    values=(sku, product_name, product_type, manufacturer, base_price))
            
            count += 1
            if count % 10000 == 0:
                db.commit()
                print(f"Committed {count} products")
            
        db.commit()
        print(f"Committed {count} products")

        # Only do this if file read was successful
        Inventory.populate_lists()

    
class Product:

    def __init__(self, p_name, p_type, sku, price):
        DEBUG.assert_params(p_name, str)
        DEBUG.assert_params(p_type, str)
        DEBUG.assert_params(sku, int)
        DEBUG.assert_params(price, float)
        
        self.p_name = p_name
        self.p_type = p_type
        self.sku = sku
        self.price = price
        self.stock = 0
        self.total_cases_ordered = 0

    def __str__(self):
        return f'{self.p_name} - {self.p_type} - {self.sku} - {self.price}'

    def restock(self):
        match self.p_type:
            case 'Milk':
                max_limit = 12 * params.MAX_STOCK_LEVELS.milk
            case 'Cereal':
                max_limit = 12 * params.MAX_STOCK_LEVELS.cereal
            case 'Baby Food':
                max_limit = 12 * params.MAX_STOCK_LEVELS.baby_food
            case 'Diapers':
                max_limit = 12 * params.MAX_STOCK_LEVELS.diapers
            case 'Bread':
                max_limit = 12 * params.MAX_STOCK_LEVELS.bread
            case 'Peanut Butter':
                max_limit = 12 * params.MAX_STOCK_LEVELS.peanut_butter
            case 'Jelly/Jam':
                max_limit = 12 * params.MAX_STOCK_LEVELS.jelly_jam
            case _:
                max_limit = 12 * params.MAX_STOCK_LEVELS.other
        num_items_needed = max_limit - self.stock
        num_cases_needed = (num_items_needed + 11) // 12

        self.total_cases_ordered += num_cases_needed
        self.stock += 12*(num_cases_needed)


class Inventory:
    from enum import Enum
    
    products = {}
    products['milk'] = []
    products['cereal'] = []
    products['baby food'] = []
    products['diapers'] = []
    products['bread'] = []
    products['peanut butter'] = []
    products['jelly jam'] = []
    products['other'] = []

    class TYPE(Enum):
        OTHER = 'other'
        MILK = 'milk'
        CEREAL = 'cereal'
        BABY_FOOD = 'baby food'
        DIAPERS = 'diapers'
        BREAD = 'bread'
        PEANUT_BUTTER = 'peanut butter'
        JELLY_JAM = 'jelly jam'

    def select(p_type):
        DEBUG.assert_params(p_type, Inventory.TYPE)
        
        num_products_in_type = len(Inventory.products[p_type.value])
        product_index = random.randint(0, (num_products_in_type-1))
        last_index = \
            product_index - 1 if product_index != 0 \
            else (num_products_in_type-1)

        product = Inventory.products[p_type.value][product_index]

        while(product.stock <= 0 and product_index != last_index):   
            product_index += 1
            product_index %= num_products_in_type
            product = Inventory.products[p_type.value][product_index]

        '''
            If everything is out of stock, then return None and let the caller deal with it.
        '''
        if((product_index == last_index) and (product.stock <= 0)):
            return None
        
        product.stock -= 1
        return product

    '''
        Jupyter makes lists persist in memory after I run each cell.
        I delete the existing lists in order to not have the same product appear multiple times.
    '''
    def reset_lists():
        Inventory.products = {}
        Inventory.products['milk'] = []
        Inventory.products['cereal'] = []
        Inventory.products['baby food'] = []
        Inventory.products['diapers'] = []
        Inventory.products['bread'] = []
        Inventory.products['peanut butter'] = []
        Inventory.products['jelly jam'] = []
        Inventory.products['other'] = []

    def populate_lists():
        if not ARGS.CREATE_PRODUCTS_TABLE:
            print("You don't want to create the Products table")
            return
        
        Inventory.reset_lists()

        with open('Products1.txt', 'r') as csvfile:
            for row in csv.DictReader(csvfile, dialect='piper'):
                sku = int(row.get('SKU'))
                product_name = row.get('Product Name')
                product_type = row.get('itemType')

                price = row.get('BasePrice')
                price = float(Decimal(price.strip('$')))
                price = round(price * params.group.price_multiplier, 2)
                
                current_product = Product(\
                    p_name = product_name, \
                    p_type = product_type, \
                    sku = sku, 
                    price = price
                )
     
                match product_type:
                    case 'Milk':
                        Inventory.products['milk'].append(current_product)
                    case 'Cereal':
                        Inventory.products['cereal'].append(current_product)
                    case 'Baby Food':
                        Inventory.products['baby food'].append(current_product)
                    case 'Diapers':
                        Inventory.products['diapers'].append(current_product)
                    case 'Bread':
                        Inventory.products['bread'].append(current_product)
                    case 'Peanut Butter':
                        Inventory.products['peanut butter'].append(current_product)
                    case 'Jelly/Jam':
                        Inventory.products['jelly jam'].append(current_product)
                    case _:
                        Inventory.products['other'].append(current_product)

            '''
                We want the customer to randomly select another item of the same type 
                    if the item is out of stock.
                The select() method chooses the next index and this relies on having random products.
            '''
            for product_list in Inventory.products.values():
                random.shuffle(product_list)
            print('Products in memory successfully populated.')

    def restock_milk():
        for milk_product in Inventory.products['milk']:
            milk_product.restock()

    def restock_all():
        for product_list in Inventory.products.values():
            for product in product_list:
                product.restock()
        

class simulate:

    class globals:
        num_days = 0
        start_date = params.simulation.start_date
        end_date = params.simulation.end_date
    
    class DEBUG:
        def print_log(day):
            if (simulate.globals.num_days % params.debug.display_daily_commits == 0) \
                or (day.current_date == simulate.globals.start_date) \
                or (day.current_date == simulate.globals.end_date):
                
                print(f'{datetime.now()} - ' \
                      f'{day.date_str} - ' \
                      f'{db.commit_pending} records created and committing')

    class Day:
        def __init__(self, current_date):
            DEBUG.assert_params(current_date, date)
            
            self.current_date = current_date
            self.date_str = current_date.strftime('%Y-%m-%d')
            self.num_items = 0
            self.num_customers = 0
            self.daily_total = 0

    class Customer:
        def __init__(self, day):
            DEBUG.assert_params(day, simulate.Day)
            
            self.date = day.date_str
            self.customer_number = day.num_customers + 1
            self.num_items = 0
            self.max_items = random.randint(1, params.group.maximum_items)
            self.running_total = 0

    def build_tables():
        db.execute_sql('DROP TABLE IF EXISTS sales_transactions')
        db.execute_sql(TABLE_DEFINITIONS['sales_transactions'])
        db.execute_sql('DROP TABLE IF EXISTS sales_customers')
        db.execute_sql(TABLE_DEFINITIONS['sales_customers'])
        db.execute_sql('DROP TABLE IF EXISTS sales_daily')
        db.execute_sql(TABLE_DEFINITIONS['sales_daily'])
        db.commit()

    def run():
        if not ARGS.POPULATE_FACTS_TABLE:
            print("You don't want to populate the Facts table")
            return

        simulate.build_tables()
        
        current_date = simulate.globals.start_date
        while(current_date <= simulate.globals.end_date):
            date_str = current_date.strftime('%Y-%m-%d')
            simulate.simulate_one_day(current_date)
            current_date += timedelta(1)

    def simulate_one_day(current_date):
        DEBUG.assert_params(current_date, date)

        simulate.globals.num_days += 1
        if(current_date == simulate.globals.start_date):
            Inventory.restock_all()

        '''
            Milk is restocked all 7 days of the week.
            Everything else is restocked on Tuesday, Thursday, and Saturday.
        '''
        if(current_date.weekday() % 2 == 0):
            Inventory.restock_milk()
        else:
            Inventory.restock_all()
                        
        increase = 0
        if current_date.weekday() >= 5:
            increase = params.group.weekend_increase
    
        day = simulate.Day(current_date)
        
        daily_customers = random.randint(params.group.customers_low + increase, params.group.customers_high + increase)

        for customer_number in range(daily_customers):
            simulate.simulate_one_customer(day)

        simulate.write_daily_total(day)
        simulate.DEBUG.print_log(day)
        
        db.commit()
        
    def simulate_one_customer(day):

        DEBUG.assert_params(day, simulate.Day)

        customer_data = simulate.Customer(day)
        
        if random.randint(1, 100) <= 70:
            product = Inventory.select(Inventory.TYPE.MILK)
            simulate.buy(customer_data, product)

            if random.randint(1, 100) <= 50:
                product = Inventory.select(Inventory.TYPE.CEREAL)
                simulate.buy(customer_data, product)

        else:
            if random.randint(1, 100) <= 5:
                product = Inventory.select(Inventory.TYPE.CEREAL)
                simulate.buy(customer_data, product)

        if random.randint(1, 100) <= 20:
            product = Inventory.select(Inventory.TYPE.BABY_FOOD)
            simulate.buy(customer_data, product)

            if random.randint(1, 100) <= 80:
                product = Inventory.select(Inventory.TYPE.DIAPERS)
                simulate.buy(customer_data, product)
                
        else:
            if random.randint(1, 100) <= 1:
                product = Inventory.select(Inventory.TYPE.DIAPERS)
                simulate.buy(customer_data, product)

        if random.randint(1, 100) <= 50:
            product = Inventory.select(Inventory.TYPE.BREAD)
            simulate.buy(customer_data, product)

        if random.randint(1, 100) <= 10:
            product = Inventory.select(Inventory.TYPE.PEANUT_BUTTER)
            simulate.buy(customer_data, product)

            if random.randint(1, 100) <= 90:
                product = Inventory.select(Inventory.TYPE.JELLY_JAM)
                simulate.buy(customer_data, product)

        else:
            if random.randint(1, 100) <= 5:
                product = Inventory.select(Inventory.TYPE.JELLY_JAM)
                simulate.buy(customer_data, product)

        while customer_data.num_items < customer_data.max_items:
            product = Inventory.select(Inventory.TYPE.OTHER)
            simulate.buy(customer_data, product)

        simulate.write_customer_total(day, customer_data)

    def buy(customer, product):
        if product is None:
            '''
                In the rare case if 
                    there is no inventory left for a particular product,
                    the current customer stops buying
                    and nothing is written.
            '''
            customer.max_items = customer.num_items
            return
        
        customer.num_items += 1
        customer.running_total += product.price
        db.commit_pending += 1
        try:
            db.execute_sql_values('insert into sales_transactions values (?, ?, ?, ?, ?, ?)',
                                        (customer.date,customer.customer_number,product.sku,product.price, product.stock, product.total_cases_ordered))

        except Exception as err:
            print("Error writing to sales_transactions database table", err)

    def write_customer_total(day, customer):
        
        DEBUG.assert_params(day, simulate.Day)
        DEBUG.assert_params(customer, simulate.Customer)

        customer.running_total = round(customer.running_total, 2)
        
        day.num_items += customer.num_items
        day.num_customers += 1
        day.daily_total += customer.running_total
        
        db.commit_pending += 1
        
        try:
            db.execute_sql_values('INSERT INTO sales_customers VALUES (?, ?, ?, ?)', 
                (customer.date, customer.customer_number, customer.num_items, customer.running_total))
        except Exception as err:
            print("Error writing to sales_customers database table.", err)

    def write_daily_total(day):

        DEBUG.assert_params(day, simulate.Day)
        
        db.commit_pending += 1
        try:
            db.execute_sql_values('INSERT INTO sales_daily VALUES (?, ?, ?, ?)', 
                (day.date_str, day.num_items, day.num_customers, day.daily_total))
        except Exception as err:
            print("Error writing to sales_daily database table.", err)


def run():
    db.connect()
    create_products_table()
    simulate.run()
    db.close()


run()

Database Successfully Connected To
Committed 2075 products
Products in memory successfully populated.
2025-02-26 10:47:16.098554 - 2024-01-01 - 48099 records created and committing
2025-02-26 10:47:18.224868 - 2024-01-12 - 49680 records created and committing
Database Connection Closed


## Testbed
Inspect anything you want.

In [6]:
# db_debug.execute_sql('SELECT * from transactions_sales LIMIT 100')
db_debug.execute_sql("SELECT * from sales_transactions WHERE date = '2024-12-31' LIMIT 100")

('2024-12-31', 1, 42356001, 2.27, 198, 3751)
('2024-12-31', 1, 42347001, 2.63, 33, 126)
('2024-12-31', 1, 42671001, 4.79, 39, 343)
('2024-12-31', 1, 43856001, 1.38, 94, 863)
('2024-12-31', 1, 43702001, 17.03, 86, 832)
('2024-12-31', 1, 43908001, 4.78, 83, 832)
('2024-12-31', 1, 42363001, 3.95, 83, 832)
('2024-12-31', 1, 43964001, 1.55, 83, 852)
('2024-12-31', 1, 44010001, 3.16, 91, 855)
('2024-12-31', 1, 43125001, 2.87, 83, 852)
('2024-12-31', 1, 42432001, 2.0, 83, 849)
('2024-12-31', 1, 42989001, 2.34, 83, 851)
('2024-12-31', 1, 43562001, 0.61, 83, 862)
('2024-12-31', 1, 43677001, 3.59, 83, 859)
('2024-12-31', 1, 43057001, 3.31, 83, 855)
('2024-12-31', 1, 42504001, 1.54, 86, 839)
('2024-12-31', 1, 42974001, 0.6, 86, 848)
('2024-12-31', 1, 42361001, 1.19, 93, 852)
('2024-12-31', 1, 42285001, 9.59, 86, 850)
('2024-12-31', 1, 43223001, 1.75, 87, 858)
('2024-12-31', 1, 42551001, 2.63, 89, 849)
('2024-12-31', 1, 43837001, 1.57, 89, 851)
('2024-12-31', 1, 42721001, 3.77, 83, 856)
('2024-12-

In [3]:
db_debug.execute_sql("SELECT * from sales_customers WHERE date = '2024-12-31'")

('2024-12-31', 1, 51, 198.26999999999998)
('2024-12-31', 2, 9, 26.5)
('2024-12-31', 3, 28, 94.00999999999998)
('2024-12-31', 4, 56, 246.57)
('2024-12-31', 5, 69, 261.35)
('2024-12-31', 6, 87, 328.00000000000017)
('2024-12-31', 7, 38, 137.04)
('2024-12-31', 8, 75, 271.73999999999995)
('2024-12-31', 9, 34, 140.31000000000003)
('2024-12-31', 10, 57, 212.23999999999995)
('2024-12-31', 11, 85, 305.3499999999999)
('2024-12-31', 12, 55, 201.64999999999998)
('2024-12-31', 13, 40, 151.87000000000006)
('2024-12-31', 14, 76, 271.27000000000015)
('2024-12-31', 15, 38, 153.07000000000005)
('2024-12-31', 16, 77, 254.95999999999998)
('2024-12-31', 17, 58, 193.35)
('2024-12-31', 18, 29, 134.88)
('2024-12-31', 19, 81, 269.60999999999996)
('2024-12-31', 20, 86, 346.74000000000007)
('2024-12-31', 21, 85, 306.3)
('2024-12-31', 22, 11, 36.75)
('2024-12-31', 23, 24, 91.07)
('2024-12-31', 24, 63, 214.38)
('2024-12-31', 25, 46, 157.75)
('2024-12-31', 26, 24, 77.47999999999999)
('2024-12-31', 27, 20, 115.08)
(

In [4]:
db_debug.execute_sql("SELECT * from sales_daily WHERE date = '2024-12-31'")

('2024-12-31', 47237, 1047, 175651.77000000005)
