In [41]:
import pandas as pd
import numpy as np
import os

import datetime
from prettytable import PrettyTable
import pymysql
import yaml
from sqlalchemy import create_engine
from sqlalchemy import text


In [42]:
root_dir = os.getcwd()

In [43]:
# Function to read YAML file
def read_yaml(file_path):
    with open(file_path, 'r') as file:
        try:
            # Load the YAML content into a Python dictionary
            data = yaml.safe_load(file)
            return data
        except yaml.YAMLError as exc:
            print(f"Error reading YAML file: {exc}")
            return None

In [44]:
file_path = os.path.join(root_dir,'MYSQL_user_config.yaml')
config = read_yaml(file_path)

user = config.get('MYSQL_credentials')['user']
password = config.get('MYSQL_credentials')['password']
host = config.get('MYSQL_credentials')['host']
port = config.get('MYSQL_credentials')['port']

In [45]:
def run_query(engine,query):
    try:
        with engine.connect() as connection:
            result = connection.execute(text(query))
            return result.fetchall()
    except Exception as e:
        print(f"Error in the query -> {e}")

In [46]:
db_url = f'mysql+pymysql://{user}:{password}@{host}:{port}'
db_name = "FETCH_DB"
engine = create_engine(f'{db_url}'+'/'+f'{db_name}')

### What are the top 5 brands by receipts scanned for most recent month?

Here are two ideas to make the query more flexible, to incorporate the following things.
1) Definition of Top Brand
2) Scanned Month

"Top Brand" can be defined in several ways, depending on the context. Some possible definitions include:

i) Brands appearing in the highest number of unique receipts: This measures how many distinct receipts contain items from the brand, regardless of how many items are from that brand on each receipt.

ii) Brands that have been awarded the highest total points: This tracks the total number of points awarded to a brand across all receipts, reflecting the overall value or loyalty associated with the brand.

iii) Brands with the highest number of occurrences on all receipts: This measures the total number of times items from the brand appear across all receipts, counting every instance where an item from the brand is scanned, even if multiple items from the same brand are present on a single receipt.

To offer more flexibility, the following query takes care of user defined month (also including the option of recent_month)

Note that, as discussed in the Data Quality Section about brand and BrandCodes, since the best possible way to relate receipts and brands is by brandCode which is half empty, the imputing method of extracting the first word from description is used below.

In [47]:
## Definition of TopBrand

brands_in_unique_receipts = False
brands_with_highest_points = True
total_brand_occurences_in_all_receipts = False

##Setting Recent_month

most_recent_month = False
user_defined_month = '202101'

if most_recent_month == True:
    set_month = '(SELECT DATE_FORMAT(MAX(scanned_date_time),\'%Y%m\') FROM receipts)'
else:
    set_month = user_defined_month


In [48]:
set_most_recent_month_query = f"SET @MOST_RECENT_MONTH = {set_month};"
run_query(engine,query = set_most_recent_month_query)

if brands_in_unique_receipts:
    agg_func = 'COUNT(DISTINCT RR.receipt_id)'
    table_col_name = 'Unique_Receipts'
elif brands_with_highest_points:
    agg_func = 'SUM(COALESCE(RR.pointsEarned,0))'
    table_col_name = 'Total Points Earned'
elif total_brand_occurences_in_all_receipts:
    agg_func = 'COUNT(RR.receipt_id)'
    table_col_name = 'Total Entries'

Error in the query -> This result object does not return rows. It has been closed automatically.


In [49]:
query_1 = f"""
SELECT brand,ROUND(score,4) AS score,
DENSE_RANK() OVER(ORDER BY score DESC) AS brandRank
FROM 
(
SELECT COALESCE(RR.brandCode,RR.description_first_word) AS brand,
{agg_func} AS score
FROM rewards_receipts RR
INNER JOIN receipts R
ON RR.receipt_id = R.receipt_id
WHERE DATE_FORMAT(R.scanned_date_time,'%Y%m') = @MOST_RECENT_MONTH
GROUP BY COALESCE(RR.brandCode,RR.description_first_word)
) S1
WHERE brand IS NOT NULL AND brand != 'ITEM' AND score != 0
LIMIT 5;
"""
results=run_query(engine,query_1)

columns = ["brandCode", table_col_name, "Brand_Rank"]
table = PrettyTable()
table.field_names = columns


for row in results:
    table.add_row(row)

print(table)

+----------------+---------------------+------------+
|   brandCode    | Total Points Earned | Brand_Rank |
+----------------+---------------------+------------+
|     MILLER     |       52530.0       |     1      |
|    HUGGIES     |       20618.5       |     2      |
| BEN AND JERRYS |       20548.6       |     3      |
|     KNORR      |        7003.5       |     4      |
|    KLEENEX     |        4440.3       |     5      |
+----------------+---------------------+------------+


In [50]:
#### Query 2

query_2 = f"""
SELECT S2.brand AS Brand, S2.score AS CurrentScore, S2.brandRank AS CurrentBrandRank,S3.score AS PreviousScore, S3.brandRank AS PreviousBrandRank
FROM
(
    SELECT brand,ROUND(score,4) AS score,
    DENSE_RANK() OVER(ORDER BY score DESC) AS brandRank
    FROM 
    (
    SELECT COALESCE(RR.brandCode,RR.description_first_word) AS brand,
    {agg_func} AS score
    FROM rewards_receipts RR
    INNER JOIN receipts R
    ON RR.receipt_id = R.receipt_id
    WHERE DATE_FORMAT(R.scanned_date_time,'%Y%m') = @MOST_RECENT_MONTH
    GROUP BY COALESCE(RR.brandCode,RR.description_first_word)
    ) S1
    WHERE brand IS NOT NULL AND brand != 'ITEM' AND score != 0
    LIMIT 10
) S2
LEFT JOIN
(
    SELECT brand,ROUND(score,4) AS score,
    DENSE_RANK() OVER(ORDER BY score DESC) AS brandRank
    FROM 
    (
    SELECT COALESCE(RR.brandCode,RR.description_first_word) AS brand,
    {agg_func} AS score
    FROM rewards_receipts RR
    INNER JOIN receipts R
    ON RR.receipt_id = R.receipt_id
    WHERE DATE_FORMAT(R.scanned_date_time,'%Y%m') = DATE_FORMAT(DATE_SUB(STR_TO_DATE(CONCAT(@Most_recent_month, '01'), '%Y%m%d'), INTERVAL 1 MONTH), '%Y%m')

    GROUP BY COALESCE(RR.brandCode,RR.description_first_word)
    ) S1
    WHERE brand IS NOT NULL AND brand != 'ITEM'

) S3
ON S2.brand = S3.brand;
"""
results=run_query(engine,query_2)

columns = ["brandCode", 'Current_'+table_col_name, "Current_Brand_Rank",'Previous_'+table_col_name, "Previous_Brand_Rank"]
table = PrettyTable()
table.field_names = columns


for row in results:
    table.add_row(row)

print(table)


+----------------+-----------------------------+--------------------+------------------------------+---------------------+
|   brandCode    | Current_Total Points Earned | Current_Brand_Rank | Previous_Total Points Earned | Previous_Brand_Rank |
+----------------+-----------------------------+--------------------+------------------------------+---------------------+
|     MILLER     |           52530.0           |         1          |             None             |         None        |
|    HUGGIES     |           20618.5           |         2          |             None             |         None        |
| BEN AND JERRYS |           20548.6           |         3          |             None             |         None        |
|     KNORR      |            7003.5           |         4          |             None             |         None        |
|    KLEENEX     |            4440.3           |         5          |             None             |         None        |
| CRACKER BARREL