# Data Profiling
## Team 8: Anthony, Sean, Gideon
## Team 9: Rohith, Sneha
## Team 10: Anmol, Nikita, Satya

### The first deliverable is a report of the 25 top selling products.
### The first column requires us to aggregate the 3 databases and then generate the top 25.

### Database API taken from Anthony Ung's code for HW 4.
### This API was modified so an instance may be created.

In [1]:
import sqlite3 as lite
from enum import Enum

'''
    This class provides one common point of interaction with my team's database.
    Everything that writes to the database uses this API.
'''
class db_options(Enum):
    """Option values that callers pass as ``options`` to ``db.execute_sql``."""

    # Value 0: no bit is set, so the statement is executed and nothing fetched.
    DEFAULT = 0
    # Value 1: the bit ``db.execute_sql`` tests before fetching and returning rows.
    RETURN_RESULTS = 1
    # Value 2: does not overlap the RETURN_RESULTS bit.
    PRINT_RESULTS = 2

class db:
    """Thin wrapper around one SQLite connection and its cursor.

    Typical use: build an instance from a database file path, call
    ``connect()``, run statements through ``execute_sql`` or
    ``execute_sql_values``, then call ``close()``.

    Instances can also be used as context managers, which guarantees the
    connection is released even when a statement raises::

        with db(path) as handle:
            rows = handle.execute_sql(sql, options=db.Options.RETURN_RESULTS)
    """

    class Options(Enum):
        # Execute only; nothing is fetched.
        DEFAULT = 0
        # Fetch every row and return the list to the caller.
        RETURN_RESULTS = 1
        # execute_sql does not test this bit, so it behaves like DEFAULT.
        PRINT_RESULTS = 2

    def __init__(self, name):
        """Store the database path. No connection is opened here."""
        self.name = str(name)
        # Populated by connect(); None means "not connected".
        self.con = None
        self.cur = None

    def __enter__(self):
        """Open the connection (if not already open) and return ``self``."""
        if self.con is None:
            self.connect()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the connection; roll back first if the body raised."""
        if exc_type is not None and self.con is not None:
            # close() commits, so discard the partial work of a failed block.
            self.con.rollback()
        self.close()
        return False  # never suppress the caller's exception

    def connect(self):
        """Open a connection to ``self.name`` and create a cursor on it."""
        self.con = lite.connect(self.name)
        self.cur = self.con.cursor()

    def build_table(self, name):
        """Drop table ``name`` if present and recreate it.

        The CREATE statement is looked up in a module-level
        ``TABLE_DEFINITIONS`` mapping.
        NOTE(review): that mapping is not defined in the visible part of this
        file -- confirm it exists before calling this method.
        """
        self.execute_sql(f'DROP TABLE IF EXISTS {name}')
        self.execute_sql(TABLE_DEFINITIONS[name])

    def execute_sql(self, sql, options=Options.DEFAULT):
        """Execute ``sql`` on the cursor.

        ``options`` may be a member of ``db.Options`` or of the module-level
        ``db_options`` enum; only its ``.value`` is inspected.

        Returns the list from ``fetchall()`` when the RETURN_RESULTS bit is
        set, otherwise ``None``.
        """
        cursor = self.cur.execute(sql)
        if options.value & self.Options.RETURN_RESULTS.value:
            return cursor.fetchall()
        return None

    def execute_sql_values(self, sql, values):
        """Execute ``sql`` with ``values`` bound to its placeholders."""
        self.cur.execute(sql, values)

    def commit(self):
        """Commit the current transaction."""
        self.con.commit()

    def close(self):
        """Commit and close the connection.

        Safe to call when the connection was never opened or has already been
        closed; in those cases it does nothing.
        """
        if self.con is None:
            return
        try:
            self.con.commit()
        finally:
            self.con.close()
            self.con = None
            self.cur = None

### Database Paths

In [2]:
# Maps a short key to the SQLite file path that is passed to ``db(...)``.
# All paths are relative, so they resolve against the current working directory.
# 'db_8', 'db_9' and 'db_10' point at the three teams' source databases;
# 'db_c' is the same file name the later cells open as the combined/staging
# database.
DB_PATHS = {
    'db_8' : '../0_SD_Team_8/store_team_8.db',
    'db_9' : '../0_SD_Team_9/grocery_store.db',
    'db_10' : '../0_SD_Team_10/grocery_team_10.db',
    'db_c' : 'grocery_etl_staging.db'
}

#### Connect to Team 9's Database and do a Demo Query

In [3]:
# Peek at the first ten rows of Team 9's `transactions` table to learn its
# column layout and date format.
db_9 = db(DB_PATHS['db_9'])
db_9.connect()

try:
    sql = 'SELECT * FROM transactions LIMIT 10'

    results = db_9.execute_sql(sql, options=db_options.RETURN_RESULTS)

    for row in results:
        print(row)
finally:
    # Close even when the query fails so the notebook kernel does not keep
    # the database file open.
    db_9.close()

('2024-01-01', 0, 42360001, 2.02, 1109, 92.5)
('2024-01-01', 0, 43403001, 3.2, 1151, 96)
('2024-01-01', 0, 44034001, 1.17, 629, 52.5)
('2024-01-01', 0, 43210001, 23.53, 527, 44)
('2024-01-01', 0, 43454001, 2.88, 74, 6.25)
('2024-01-01', 0, 42668001, 3.52, 74, 6.25)
('2024-01-01', 0, 43244001, 5.87, 74, 6.25)
('2024-01-01', 0, 42883001, 2.68, 74, 6.25)
('2024-01-01', 0, 43981001, 0.64, 74, 6.25)
('2024-01-01', 0, 43702001, 15.18, 74, 6.25)


#### Connect to Team 8's Database and do a Demo Query

In [4]:
# Peek at the first ten rows of Team 8's `sales_transactions` table to learn
# its column layout and date format.
db_8 = db(DB_PATHS['db_8'])
db_8.connect()

try:
    sql = 'SELECT * FROM sales_transactions LIMIT 10'

    results = db_8.execute_sql(sql, options=db_options.RETURN_RESULTS)

    for row in results:
        print(row)
finally:
    # Close even when the query fails so the notebook kernel does not keep
    # the database file open.
    db_8.close()

('2024-01-01', 1, 43365001, 1.07, 23, 2)
('2024-01-01', 1, 43018001, 13.19, 23, 2)
('2024-01-01', 1, 43083001, 2.68, 83, 7)
('2024-01-01', 1, 42779001, 13.19, 83, 7)
('2024-01-01', 1, 42238001, 1.2, 83, 7)
('2024-01-01', 1, 43104001, 1.99, 83, 7)
('2024-01-01', 1, 43482001, 4.79, 83, 7)
('2024-01-01', 1, 42303001, 1.43, 83, 7)
('2024-01-01', 1, 42864001, 3.0, 83, 7)
('2024-01-01', 1, 43761001, 1.76, 83, 7)


#### Connect to Team 10's Database and do a Demo Query

In [5]:
# Peek at the first ten rows of Team 10's `sales_transactions` table to learn
# its column layout and date format.
db_10 = db(DB_PATHS['db_10'])
db_10.connect()

try:
    sql = 'SELECT * FROM sales_transactions LIMIT 10'

    results = db_10.execute_sql(sql, options=db_options.RETURN_RESULTS)

    for row in results:
        print(row)
finally:
    # Close even when the query fails so the notebook kernel does not keep
    # the database file open.
    db_10.close()

('20240101', 0, 43774001, 2.75, 119, 10)
('20240101', 0, 42345001, 3.78, 119, 10)
('20240101', 1, 43625001, 3.78, 119, 10)
('20240101', 1, 42217001, 0.97, 119, 10)
('20240101', 2, 42538001, 1.25, 119, 10)
('20240101', 2, 43880001, 2.98, 119, 10)
('20240101', 3, 43882001, 3.2, 119, 10)
('20240101', 3, 43517001, 2.6, 119, 10)
('20240101', 3, 42384001, 3.85, 119, 10)
('20240101', 3, 43701001, 8.04, 119, 10)


#### Now I have enough information to generate the columns for the individual stores.

Keep the table definitions handy.

Team 9:
```
'CREATE TABLE IF NOT EXISTS transactions(date1 TEXT, customerID INT, sku INT, salePrice REAL, itemsLeft INT, co INT)'
```

Team 8/10:
```
    'sales_transactions': \
            'CREATE TABLE sales_transactions(' \
                    'date TEXT, ' \
                    'customer_number INT, ' \
                    'sku INT, ' \
                    'salesPrice REAL, ' \
                    'items_left INT, ' \
                    'cases_ordered INT)'
```

### Total Sales, December 2024

In [6]:
# AU is using parallel arrays here because this is a small section of code
# and nothing here will be used by any other cell.
#
# dbs[k] names the DB_PATHS entry that sqls[k] is run against; keep the two
# lists the same length and in the same order.

dbs = ['db_8', 'db_9', 'db_10']
sqls = [
    # Team 8: `sales_transactions`, dates stored as YYYY-MM-DD.
    "SELECT SUM(salesPrice) "
    "FROM sales_transactions "
    "WHERE date >= '2024-12-01'",
    # Team 9: `transactions`, date column is `date1` (YYYY-MM-DD).
    "SELECT SUM(salePrice) "
    "FROM transactions "
    "WHERE date1 >= '2024-12-01'",
    # Team 10: `sales_transactions`, dates stored as YYYYMMDD.
    "SELECT SUM(salesPrice) "
    "FROM sales_transactions "
    "WHERE date >= '20241201'",
]

# Pair each database with its query instead of indexing with a hard-coded 3.
for db_key, query in zip(dbs, sqls):
    db_temp = db(DB_PATHS[db_key])
    db_temp.connect()

    try:
        results = db_temp.execute_sql(query, options=db_options.RETURN_RESULTS)
        for row in results:
            print(row)
    finally:
        # Release the connection even if this store's query fails.
        db_temp.close()

(5573872.98,)
(3928780.47,)
(400321.54,)


### Total Customers, December 2024

In [7]:

# AU is using parallel arrays here because this is a small section of code
# and nothing here will be used by any other cell.
#
# dbs[k] names the DB_PATHS entry that sqls[k] is run against; keep the two
# lists the same length and in the same order.
#
# Each query counts distinct (date, customer number) pairs, i.e. the same
# customer number on two different dates is counted twice.

dbs = ['db_8', 'db_9', 'db_10']
sqls = [
    # Team 8: dates stored as YYYY-MM-DD.
    '''
        SELECT COUNT(*)
        FROM (
            SELECT DISTINCT date, customer_number
            FROM sales_transactions
            WHERE date >= '2024-12-01'
        ) AS distinct_pairs;
    '''
    ,
    # Team 9: different table and column names, dates stored as YYYY-MM-DD.
    '''
        SELECT COUNT(*)
        FROM (
            SELECT DISTINCT date1, customerID
            FROM transactions
            WHERE date1 >= '2024-12-01'
        ) AS distinct_pairs;
    '''
    ,
    # Team 10: dates stored as YYYYMMDD.
    '''
        SELECT COUNT(*)
        FROM (
            SELECT DISTINCT date, customer_number
            FROM sales_transactions
            WHERE date >= '20241201'
        ) AS distinct_pairs;
    '''
]


# Pair each database with its query instead of indexing with a hard-coded 3.
for db_key, query in zip(dbs, sqls):
    db_temp = db(DB_PATHS[db_key])
    db_temp.connect()

    try:
        results = db_temp.execute_sql(query, options=db_options.RETURN_RESULTS)
        for row in results:
            print(row)
    finally:
        # Release the connection even if this store's query fails.
        db_temp.close()

(32750,)
(33382,)
(34141,)


### Some Pseudocode for Nikita Brahmbhatt on how to combine the databases
#### We need a combined database to fill in any of the other cells.

In [8]:
# Open the staging database through the shared path table so this cell and
# the rest of the notebook cannot drift apart on the file name.
db_combined = db(DB_PATHS['db_c'])
db_combined.connect()

# Pseudocode (no data is copied by this cell):
#
# Use each of db_8's, db_9's, and db_10's execute_sql methods with the correct
#     options to select all records starting from '2024-12-01'
#     (Teams 8 and 9) or '20241201' (Team 10).
#
# Then, for each row of the results that were returned,
#     save the corresponding record using db_combined's execute_sql_values
#     method.
#
# I imagine this may take a few minutes ...
#
# The format of the dates and the table definitions were established in the
# earlier cells.

db_combined.commit()
db_combined.close()
print('Finished')

Finished


In [9]:
from datetime import datetime

def create_combined_schema(target_db):
    """Recreate the ``combined_sales`` table on ``target_db``.

    Sends two statements through ``target_db.execute_sql``: a DROP of any
    existing ``combined_sales`` table, then a CREATE with the six sales
    columns plus ``store_number``.
    """
    drop_stmt = 'DROP TABLE IF EXISTS combined_sales;'
    create_stmt = '''CREATE TABLE IF NOT EXISTS combined_sales(
                            date TEXT, 
                            customer_number INT, 
                            sku INT, 
                            sales_price REAL, 
                            items_left INT, 
                            cases_ordered INT, 
                            store_number INT)'''

    # Order matters: the old table has to be gone before it is rebuilt.
    for statement in (drop_stmt, create_stmt):
        target_db.execute_sql(statement)

def transform_team8(row):
    """Convert one Team 8 ``sales_transactions`` row to a ``combined_sales`` row.

    The first six fields are kept in order; price is coerced to ``float`` and
    the two count fields to ``int``. The store number 8 is appended as the
    seventh field. The date is passed through unchanged (Team 8's sample rows
    show it already in YYYY-MM-DD form).
    """
    sale_date = row[0]
    customer_number = row[1]
    sku = row[2]
    sales_price = float(row[3])
    items_left = int(row[4])
    cases_ordered = int(row[5])
    store_number = 8
    return (
        sale_date,
        customer_number,
        sku,
        sales_price,
        items_left,
        cases_ordered,
        store_number,
    )

def transform_team9(row):
    """Convert one Team 9 ``transactions`` row to a ``combined_sales`` row.

    Column mapping (Team 9 name -> combined name):
    ``date1`` -> ``date``, ``customerID`` -> ``customer_number``, ``sku``,
    ``salePrice`` -> ``sales_price``, ``itemsLeft`` -> ``items_left``,
    ``co`` -> ``cases_ordered``. The store number 9 is appended.

    NOTE(review): Team 9's sample rows show fractional values in ``co``
    (e.g. 92.5, 6.25); ``int()`` truncates them toward zero. Confirm that
    losing the fraction is acceptable.
    """
    sale_date = row[0]
    customer_number = row[1]
    sku = row[2]
    sales_price = float(row[3])
    items_left = int(row[4])
    cases_ordered = int(row[5])
    store_number = 9
    return (
        sale_date,
        customer_number,
        sku,
        sales_price,
        items_left,
        cases_ordered,
        store_number,
    )

def transform_team10(row):
    """Convert one Team 10 ``sales_transactions`` row to a ``combined_sales`` row.

    Team 10 stores dates as ``YYYYMMDD`` text; the date is re-formatted to
    ``YYYY-MM-DD`` so it matches the other two stores. The remaining fields
    keep their order, with price coerced to ``float`` and the two count
    fields to ``int``. The store number 10 is appended.

    Raises ``ValueError`` (from ``strptime``) when the date does not match
    ``%Y%m%d``.
    """
    parsed_date = datetime.strptime(row[0], "%Y%m%d")
    iso_date = parsed_date.strftime("%Y-%m-%d")
    customer_number = row[1]
    sku = row[2]
    sales_price = float(row[3])
    items_left = int(row[4])
    cases_ordered = int(row[5])
    store_number = 10
    return (
        iso_date,
        customer_number,
        sku,
        sales_price,
        items_left,
        cases_ordered,
        store_number,
    )



In [10]:
def combine_data(source_db, query, transform_func, target_db):
    """Copy the rows selected by ``query`` into ``combined_sales``.

    Runs ``query`` on ``source_db``, passes every returned row through
    ``transform_func`` and inserts the resulting 7-tuple into
    ``combined_sales`` on ``target_db``. A row whose transformation or
    insert raises is reported on stdout and skipped; the remaining rows are
    still processed. ``target_db`` is committed once at the end.
    """
    insert_stmt = '''INSERT INTO combined_sales 
                   VALUES (?, ?, ?, ?, ?, ?, ?)'''

    source_rows = source_db.execute_sql(query, options=db_options.RETURN_RESULTS)
    for source_row in source_rows:
        try:
            target_db.execute_sql_values(insert_stmt, transform_func(source_row))
        except Exception as err:
            # Best effort: one bad row must not abort the whole copy.
            print(f"Error processing row: {source_row}")
            print(f"Error: {str(err)}")

    target_db.commit()


def generate_top25_report(target_db, output_file="top_25_products.csv"):
    """Write the 25 most frequently sold SKUs to a CSV file.

    "Most frequently sold" means the number of ``combined_sales`` rows per
    SKU dated on or after 2024-12-01 -- a row count, not summed revenue.
    Ties on the count are broken by ascending SKU so that repeated runs
    over the same data produce the same 25 rows in the same order.

    Parameters:
        target_db: connected database object exposing ``execute_sql``.
        output_file: path of the CSV to (over)write. The file gets a
            ``SKU,Total Sales`` header followed by one ``sku,count`` line
            per product.
    """
    sql_combined = '''
    SELECT 
        sku, 
        COUNT(*) AS count
    FROM combined_sales 
    WHERE date >= '2024-12-01' 
    GROUP BY sku
    ORDER BY count DESC, sku ASC
    LIMIT 25;
    '''

    # Execute the query and fetch results
    results = target_db.execute_sql(sql_combined, options=db_options.RETURN_RESULTS)

    # Write the results to a CSV file
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("SKU,Total Sales\n")  # CSV header
        # One line per product: SKU followed by its row count.
        f.writelines(f"{sku},{sale_count}\n" for sku, sale_count in results)

    print(f"Report generated: {output_file}")


In [11]:
# Build the staging database from the three stores' December 2024 sales and
# write the top-25 report.

# Initialize combined database (path comes from the shared DB_PATHS table)
combined_db = db(DB_PATHS['db_c'])

# Source databases
db8 = db(DB_PATHS['db_8'])
db9 = db(DB_PATHS['db_9'])
db10 = db(DB_PATHS['db_10'])

# Only connections that were actually opened are closed in the `finally`.
open_connections = []

try:
    combined_db.connect()
    open_connections.append(combined_db)
    create_combined_schema(combined_db)

    for source in [db8, db9, db10]:
        source.connect()
        open_connections.append(source)

    print(f'{datetime.now()} - Begin')

    # Combine data from all sources
    combine_data(db8,
                "SELECT * FROM sales_transactions WHERE date >= '2024-12-01'",
                transform_team8,
                combined_db)

    print(f"{datetime.now()} - Team 8's database placed in combined database")

    # Team 9's table is `transactions` and its date column is `date1`.
    combine_data(db9,
                "SELECT * FROM transactions WHERE date1 >= '2024-12-01'",
                transform_team9,
                combined_db)

    print(f"{datetime.now()} - Team 9's database placed in combined database")

    # Team 10 stores dates as YYYYMMDD text, so the cut-off must use the same
    # format. Compared as text, every '2024MMDD' value sorts after
    # '2024-12-01' (the digit in position 5 is greater than '-'), so the
    # dashed literal would select the whole year instead of December only.
    combine_data(db10,
                "SELECT * FROM sales_transactions WHERE date >= '20241201'",
                transform_team10,
                combined_db)

    print(f"{datetime.now()} - Team 10's database placed in combined database")

    # Generate report for top 25 products
    generate_top25_report(combined_db)
finally:
    # Cleanup: sources first, the combined database last.
    for db_conn in reversed(open_connections):
        db_conn.close()

print(f'{datetime.now()} - Combined Database Created')

2025-03-17 21:27:43.299280 - Begin
2025-03-17 21:27:47.735941 - Team 8's database placed in combined database
2025-03-17 21:27:51.373291 - Team 9's database placed in combined database
2025-03-17 21:28:01.705262 - Team 10's database placed in combined database
Report generated: top_25_products.csv
2025-03-17 21:28:02.885974 - Combined Database Created
