# Data Profiling
## Team 8: Anthony, Sean, Gideon
## Team 9: Rohith, Sneha
## Team 10: Anmol, Nikita, Satya

### The first deliverable is a report of the 25 top selling products.
### The first column requires us to aggregate the 3 databases and then generate the top 25.

#### Database API taken from Anthony Ung's code for HW 4.
#### This API was modified so an instance may be created.

In [1]:
import sqlite3 as lite
from enum import Enum

class db:
    '''
    This class provides one common point of interaction with my team's database.
    Everything that writes to the database uses this API.

    Typical use: create an instance with the database file name, call
    connect(), run statements through execute_sql() / execute_sql_values(),
    then commit() or close().
    '''

    class Options(Enum):
        # Bit values that execute_sql() tests with ``&``.
        RETURN_RESULTS = 1
        PRINT_RESULTS = 2

    def __init__(self, name):
        '''Remember the database file name; no connection is opened yet.'''
        self.name = str(name)

    def connect(self):
        '''Open the SQLite connection and create the cursor used by every query.'''
        self.con = lite.connect(self.name)
        self.cur = self.con.cursor()

    def build_table(self, name):
        '''
        Drop table `name` if it exists and recreate it.

        The CREATE statement is looked up in TABLE_DEFINITIONS[name].
        NOTE(review): TABLE_DEFINITIONS is not defined in this part of the
        notebook; it must exist at module level before this is called.
        '''
        self.execute_sql(f'DROP TABLE IF EXISTS {name}')
        self.execute_sql(TABLE_DEFINITIONS[name])

    def execute_sql(self, sql, options=0):
        '''
        Execute one SQL statement.

        Parameters:
            sql     - the statement text.
            options - a db.Options member, or an int bit mask built from the
                      Options values. The default 0 means "just execute".

        Returns:
            The list of fetched rows when RETURN_RESULTS is set, else None.
            When PRINT_RESULTS is set, each fetched row is printed.
        '''
        # The default is a plain int, which has no `.value`; accept both an
        # Options member and an int so calls without options no longer raise.
        flags = getattr(options, 'value', options)
        wants_return = flags & db.Options.RETURN_RESULTS.value
        wants_print = flags & db.Options.PRINT_RESULTS.value

        if not (wants_return or wants_print):
            self.cur.execute(sql)
            return None

        rows = self.cur.execute(sql).fetchall()
        if wants_print:
            for row in rows:
                print(row)
        return rows if wants_return else None

    def execute_sql_values(self, sql, values):
        '''Execute a parameterized statement, binding `values` to its placeholders.'''
        self.cur.execute(sql, values)

    def commit(self):
        '''Commit the current transaction.'''
        self.con.commit()

    def close(self):
        '''Commit any pending work, then close the connection.'''
        self.con.commit()
        self.con.close()

#### Connect to Team 9's Database and do a Demo Query

In [2]:
# Handle on Team 9's SQLite file (path is relative to this notebook);
# connect() creates the connection and cursor used by the queries below.
db_9 = db('../0_SD_Team_9/grocery_store.db')
db_9.connect()

In [3]:
# Peek at the first 10 rows of Team 9's `transactions` table.
sql = 'SELECT * FROM transactions LIMIT 10'

# Left as the cell's last expression so the notebook displays the returned list.
db_9.execute_sql(sql, options=db.Options.RETURN_RESULTS)

[('2024-01-01', 0, 42360001, 2.02, 1109, 92.5),
 ('2024-01-01', 0, 43403001, 3.2, 1151, 96),
 ('2024-01-01', 0, 44034001, 1.17, 629, 52.5),
 ('2024-01-01', 0, 43210001, 23.53, 527, 44),
 ('2024-01-01', 0, 43454001, 2.88, 74, 6.25),
 ('2024-01-01', 0, 42668001, 3.52, 74, 6.25),
 ('2024-01-01', 0, 43244001, 5.87, 74, 6.25),
 ('2024-01-01', 0, 42883001, 2.68, 74, 6.25),
 ('2024-01-01', 0, 43981001, 0.64, 74, 6.25),
 ('2024-01-01', 0, 43702001, 15.18, 74, 6.25)]

#### Connect to Team 8's Database and do a Demo Query

In [4]:
# Handle on Team 8's SQLite file (path is relative to this notebook);
# connect() creates the connection and cursor used by the queries below.
db_8 = db('../0_SD_Team_8/store_team_8.db')
db_8.connect()

In [5]:
# Peek at the first 10 rows of Team 8's `sales_transactions` table.
sql = 'SELECT * FROM sales_transactions LIMIT 10'

# Left as the cell's last expression so the notebook displays the returned list.
db_8.execute_sql(sql, options=db.Options.RETURN_RESULTS)

[('2024-01-01', 1, 43365001, 1.07, 23, 2),
 ('2024-01-01', 1, 43018001, 13.19, 23, 2),
 ('2024-01-01', 1, 43083001, 2.68, 83, 7),
 ('2024-01-01', 1, 42779001, 13.19, 83, 7),
 ('2024-01-01', 1, 42238001, 1.2, 83, 7),
 ('2024-01-01', 1, 43104001, 1.99, 83, 7),
 ('2024-01-01', 1, 43482001, 4.79, 83, 7),
 ('2024-01-01', 1, 42303001, 1.43, 83, 7),
 ('2024-01-01', 1, 42864001, 3.0, 83, 7),
 ('2024-01-01', 1, 43761001, 1.76, 83, 7)]

#### Connect to Team 10's Database and do a Demo Query

In [6]:
# Handle on Team 10's SQLite file (path is relative to this notebook);
# connect() creates the connection and cursor used by the queries below.
db_10 = db('../0_SD_Team_10/grocery_team_10.db')
db_10.connect()

In [7]:
# Peek at the first 10 rows of Team 10's `sales_transactions` table.
sql = 'SELECT * FROM sales_transactions LIMIT 10'

# Print one row per line instead of relying on the notebook's list display.
results = db_10.execute_sql(sql, options=db.Options.RETURN_RESULTS)
for row in results:
    print(row)

('20240101', 0, 43774001, 2.75, 119, 10)
('20240101', 0, 42345001, 3.78, 119, 10)
('20240101', 1, 43625001, 3.78, 119, 10)
('20240101', 1, 42217001, 0.97, 119, 10)
('20240101', 2, 42538001, 1.25, 119, 10)
('20240101', 2, 43880001, 2.98, 119, 10)
('20240101', 3, 43882001, 3.2, 119, 10)
('20240101', 3, 43517001, 2.6, 119, 10)
('20240101', 3, 42384001, 3.85, 119, 10)
('20240101', 3, 43701001, 8.04, 119, 10)


In [8]:
# Sanity check on Team 10's data: list any transaction whose `items_left`
# value is negative. Nothing is printed when no such rows exist.
sql = 'SELECT * FROM sales_transactions WHERE items_left < 0'

results = db_10.execute_sql(sql, options=db.Options.RETURN_RESULTS)
for row in results:
    print(row)

#### Now I have enough information to generate the columns for the individual stores.

Keep the table definitions handy.

Team 9:
```
'CREATE TABLE IF NOT EXISTS transactions(date1 TEXT, customerID INT, sku INT, salePrice REAL, itemsLeft INT, co INT)'
```

Team 8/10:
```
    'sales_transactions': \
            'CREATE TABLE sales_transactions(' \
                    'date TEXT, ' \
                    'customer_number INT, ' \
                    'sku INT, ' \
                    'salesPrice REAL, ' \
                    'items_left INT, ' \
                    'cases_ordered INT)'
```

### Total Sales, December 2024

In [9]:
# Sum of sale prices per store for rows dated on or after 1 December 2024.
# Team 9 uses its own table/column names (transactions, salePrice, date1);
# Team 10 stores dates without dashes, hence the '20241201' cutoff.
# NOTE(review): the filters have no upper bound, so these totals equal
# "December 2024" only if the tables hold no later rows - confirm.
sql_8 = (
    "SELECT SUM(salesPrice) "
    "FROM sales_transactions "
    "WHERE date >= '2024-12-01'"
)

sql_9 = (
    "SELECT SUM(salePrice) "
    "FROM transactions "
    "WHERE date1 >= '2024-12-01'"
)

sql_10 = (
    "SELECT SUM(salesPrice) "
    "FROM sales_transactions "
    "WHERE date >= '20241201'"
)

# Run each store's query in turn (Team 8, Team 9, Team 10) and print its rows.
for store_db, store_sql in ((db_8, sql_8), (db_9, sql_9), (db_10, sql_10)):
    results = store_db.execute_sql(store_sql, options=db.Options.RETURN_RESULTS)
    for row in results:
        print(row)

(5573872.98,)
(3928780.47,)
(400321.54,)


### Total Customers, December 2024

In [10]:
# Customer count per store for rows dated on or after 1 December 2024.
# A "customer" here is a distinct (date, customer number) pair, so the same
# customer number on two different days is counted twice.
# Team 9 uses its own names (transactions, date1, customerID); Team 10 stores
# dates without dashes, hence the '20241201' cutoff.
sql_8 = '''
        SELECT COUNT(*)
        FROM (
            SELECT DISTINCT date, customer_number
            FROM sales_transactions
            WHERE date >= '2024-12-01'
        ) AS distinct_pairs;
    '''

sql_9 = '''
        SELECT COUNT(*)
        FROM (
            SELECT DISTINCT date1, customerID
            FROM transactions
            WHERE date1 >= '2024-12-01'
        ) AS distinct_pairs;
    '''

sql_10 = '''
        SELECT COUNT(*)
        FROM (
            SELECT DISTINCT date, customer_number
            FROM sales_transactions
            WHERE date >= '20241201'
        ) AS distinct_pairs;
    '''

# Run each store's query in turn (Team 8, Team 9, Team 10) and print its rows.
for store_db, store_sql in ((db_8, sql_8), (db_9, sql_9), (db_10, sql_10)):
    results = store_db.execute_sql(store_sql, options=db.Options.RETURN_RESULTS)
    for row in results:
        print(row)

(32750,)
(33382,)
(34141,)


### Some Pseudocode for Nikita Brahmbhatt on how to combine the databases
#### We need a combined database to fill in any of the other cells.

In [11]:
# Staging database intended to hold the three stores' December records.
# As written, this cell only opens the file, commits, and closes it; the copy
# itself is still the pseudocode in the string below.
# NOTE(review): 'groucery' in the file name looks like a typo for 'grocery' -
# confirm whether other notebooks open this exact name before renaming it.
db_combined = db('groucery_etl_staging.db')
db_combined.connect()

# NOTE(review): the earlier cells filter Team 10 with '20241201'; the
# '2024-1201' written in the note below appears to be a typo for that value.
# NOTE(review): saving row values would normally go through
# execute_sql_values (which accepts bound values) rather than execute_sql.
'''
Use each of db_8's, db_9's, and db_10's execute_sql methods with the correct options
    to select all records starting from '2024-12-01' or '2024-1201'.

Then, for each row of the results that were returned,
    save the corresponding record using db_combined's execute_sql method.

I imagine this may take a few minutes ...

The format of the dates and the table definitions were established in the earlier cells.
'''

# close() commits again before closing, so the explicit commit() is redundant but harmless.
db_combined.commit()
db_combined.close()
print('Finished')

Finished
