In [82]:
## Importing Libraries
import pandas as pd
import numpy as np
import os

import datetime
from prettytable import PrettyTable
import pymysql
import yaml
from sqlalchemy import create_engine, text



In [83]:
## Root Directory to read the configuration files
root_dir = os.getcwd()

In [84]:
## Fucntions

# Function to read YAML file
def read_yaml(file_path):
    with open(file_path, 'r') as file:
        try:
            # Load the YAML content into a Python dictionary
            data = yaml.safe_load(file)
            return data
        except yaml.YAMLError as exc:
            print(f"Error reading YAML file: {exc}")
            return None

# Function to execute the query
def run_query(engine,query):
    try:
        with engine.connect() as connection:
            result = connection.execute(text(query))
            return result.fetchall()
    except Exception as e:
        print(f"Error in the query -> {e}")

In [85]:
## Reading Configuration file

file_path = os.path.join(root_dir,'MYSQL_user_config.yaml')
config = read_yaml(file_path)

user = config.get('MYSQL_credentials')['user']
password = config.get('MYSQL_credentials')['password']
host = config.get('MYSQL_credentials')['host']
port = config.get('MYSQL_credentials')['port']

In [86]:
## Engine Configuration

db_url = f'mysql+pymysql://{user}:{password}@{host}:{port}'
db_name = "FETCH_DB"
engine = create_engine(f'{db_url}'+'/'+f'{db_name}')

### What are the top 5 brands by receipts scanned for most recent month?

Brands with the highest number of occurrences on all receipts: This measures the total number of times items from the brand appear across all receipts, counting every instance where an item from the brand is scanned.

Note that, as discussed in the Data Quality Section about brand and BrandCodes, since the best possible way to relate receipts and brands is by brandCode which is half empty, the imputing method of extracting the first word from description is used below. Additionally, brandCode is still treated as the primary identifier for naming.

The most recent month is 2021-01-03, which is dynamically calculated and not hard-coded. The query below ensures this is handled appropriately.

In [87]:
#### For most recent month:
query_1 = f"""
SELECT brand,ROUND(score,4) AS score,
DENSE_RANK() OVER(ORDER BY score DESC) AS brandRank
FROM 
(
SELECT COALESCE(RR.brandCode,RR.description_first_word) AS brand,
COUNT(RR.receipt_id) AS score
FROM rewards_receipts RR
INNER JOIN receipts R
ON RR.receipt_id = R.receipt_id
WHERE DATE_FORMAT(R.scanned_date_time,'%Y%m') = (SELECT DATE_FORMAT(MAX(scanned_date_time),\'%Y%m\') FROM receipts)
GROUP BY COALESCE(RR.brandCode,RR.description_first_word)
) S1
WHERE brand IS NOT NULL AND brand != 'ITEM' AND score != 0
LIMIT 10;
"""
results=run_query(engine,query_1)

columns = ["brandCode", "Receipts Scanned", "Brand_Rank"]
table = PrettyTable()
table.field_names = columns

for row in results:
    table.add_row(row)

print(table)


+-----------+------------------+------------+
| brandCode | Receipts Scanned | Brand_Rank |
+-----------+------------------+------------+
|  MUELLER  |        11        |     1      |
|  THINDUST |        11        |     1      |
+-----------+------------------+------------+


Since there are not atleast 5 distinct Brands, We could explore the top 5 brands for previous months.

#### For month = '2021-02'

In [88]:
## Hard coded month
set_month = '202102'
set_most_recent_month_query = f"SET @MOST_RECENT_MONTH = {set_month};"
run_query(engine,query = set_most_recent_month_query)

query_1 = f"""
SELECT brand,ROUND(score,4) AS score,
DENSE_RANK() OVER(ORDER BY score DESC) AS brandRank
FROM 
(
SELECT COALESCE(RR.brandCode,RR.description_first_word) AS brand,
COUNT(RR.receipt_id) AS score
FROM rewards_receipts RR
INNER JOIN receipts R
ON RR.receipt_id = R.receipt_id
WHERE DATE_FORMAT(R.scanned_date_time,'%Y%m') = @MOST_RECENT_MONTH
GROUP BY COALESCE(RR.brandCode,RR.description_first_word)
) S1
WHERE brand IS NOT NULL AND brand != 'ITEM' AND score != 0
LIMIT 5;
"""
results=run_query(engine,query_1)

columns = ["brandCode", "Receipts Scanned", "Brand_Rank"]
table = PrettyTable()
table.field_names = columns

for row in results:
    table.add_row(row)

print(table)

Error in the query -> This result object does not return rows. It has been closed automatically.
+-----------+------------------+------------+
| brandCode | Receipts Scanned | Brand_Rank |
+-----------+------------------+------------+
|  MUELLER  |        29        |     1      |
|  THINDUST |        29        |     1      |
|  FLIPBELT |        28        |     2      |
|   HEINZ   |        10        |     3      |
|   BRAND   |        3         |     4      |
+-----------+------------------+------------+


#### For Month = '202101'

In [89]:
## Hard coded month
set_month = '202101'
set_most_recent_month_query = f"SET @MOST_RECENT_MONTH = {set_month};"
run_query(engine,query = set_most_recent_month_query)

query_1 = f"""
SELECT brand,ROUND(score,4) AS score,
DENSE_RANK() OVER(ORDER BY score DESC) AS brandRank
FROM 
(
SELECT COALESCE(RR.brandCode,RR.description_first_word) AS brand,
COUNT(RR.receipt_id) AS score
FROM rewards_receipts RR
INNER JOIN receipts R
ON RR.receipt_id = R.receipt_id
WHERE DATE_FORMAT(R.scanned_date_time,'%Y%m') = @MOST_RECENT_MONTH
GROUP BY COALESCE(RR.brandCode,RR.description_first_word)
) S1
WHERE brand IS NOT NULL AND brand != 'ITEM' AND score != 0
LIMIT 5;
"""
results=run_query(engine,query_1)

columns = ["brandCode", "Receipts Scanned", "Brand_Rank"]
table = PrettyTable()
table.field_names = columns

for row in results:
    table.add_row(row)

print(table)

Error in the query -> This result object does not return rows. It has been closed automatically.
+----------------+------------------+------------+
|   brandCode    | Receipts Scanned | Brand_Rank |
+----------------+------------------+------------+
|     HY-VEE     |       296        |     1      |
| BEN AND JERRYS |       180        |     2      |
|       PC       |       138        |     3      |
|   KLARBRUNN    |       133        |     4      |
|     PEPSI      |       103        |     5      |
+----------------+------------------+------------+


Apart from the brand 'PC', the other top 5 brands are reasonable. This is due to the limitation of the imputation method discussed earlier. In some cases, the brand codes may not be accurate when extracted from the description.

### 2) How does the ranking of the top 5 brands by receipts scanned for the recent month compare to the ranking for the previous month

#### For the most Recent month

In [95]:
#### Query 2
query_2 = f"""
SELECT S2.brand AS Brand, S2.score AS CurrentScore, S2.brandRank AS CurrentBrandRank,S3.score AS PreviousScore, S3.brandRank AS PreviousBrandRank
FROM
(
    SELECT brand,ROUND(score,4) AS score,
    DENSE_RANK() OVER(ORDER BY score DESC) AS brandRank
    FROM 
    (
    SELECT COALESCE(RR.brandCode,RR.description_first_word) AS brand,
    COUNT(RR.receipt_id) AS score
    FROM rewards_receipts RR
    INNER JOIN receipts R
    ON RR.receipt_id = R.receipt_id
    WHERE DATE_FORMAT(R.scanned_date_time,'%Y%m') = (SELECT DATE_FORMAT(MAX(scanned_date_time),\'%Y%m\') FROM receipts)
    GROUP BY COALESCE(RR.brandCode,RR.description_first_word)
    ) S1
    WHERE brand IS NOT NULL AND brand != 'ITEM' AND score != 0
    LIMIT 5
) S2
LEFT JOIN
(
    SELECT brand,ROUND(score,4) AS score,
    DENSE_RANK() OVER(ORDER BY score DESC) AS brandRank
    FROM 
    (
    SELECT COALESCE(RR.brandCode,RR.description_first_word) AS brand,
    COUNT(RR.receipt_id) AS score
    FROM rewards_receipts RR
    INNER JOIN receipts R
    ON RR.receipt_id = R.receipt_id
    WHERE DATE_FORMAT(R.scanned_date_time,'%Y%m') = DATE_FORMAT(DATE_SUB(STR_TO_DATE(CONCAT((SELECT DATE_FORMAT(MAX(scanned_date_time),\'%Y%m\') FROM receipts), '01'), '%Y%m%d'), INTERVAL 1 MONTH), '%Y%m')

    GROUP BY COALESCE(RR.brandCode,RR.description_first_word)
    ) S1
    WHERE brand IS NOT NULL AND brand != 'ITEM'

) S3
ON S2.brand = S3.brand;
"""
results=run_query(engine,query_2)

columns = ["brandCode", 'Current_Receipts_scanned', "Current_Brand_Rank",'Previous_Receipts_scanned', "Previous_Brand_Rank"]
table = PrettyTable()
table.field_names = columns


for row in results:
    table.add_row(row)

print(table)


+-----------+--------------------------+--------------------+---------------------------+---------------------+
| brandCode | Current_Receipts_scanned | Current_Brand_Rank | Previous_Receipts_scanned | Previous_Brand_Rank |
+-----------+--------------------------+--------------------+---------------------------+---------------------+
|  MUELLER  |            11            |         1          |             29            |          1          |
|  THINDUST |            11            |         1          |             29            |          1          |
+-----------+--------------------------+--------------------+---------------------------+---------------------+


#### For month = '202102'

In [96]:
#### Query 2
set_month = '202102'
set_most_recent_month_query = f"SET @MOST_RECENT_MONTH = {set_month};"
run_query(engine,query = set_most_recent_month_query)

query_2 = f"""
SELECT S2.brand AS Brand, S2.score AS CurrentScore, S2.brandRank AS CurrentBrandRank,S3.score AS PreviousScore, S3.brandRank AS PreviousBrandRank
FROM
(
    SELECT brand,ROUND(score,4) AS score,
    DENSE_RANK() OVER(ORDER BY score DESC) AS brandRank
    FROM 
    (
    SELECT COALESCE(RR.brandCode,RR.description_first_word) AS brand,
    COUNT(RR.receipt_id) AS score
    FROM rewards_receipts RR
    INNER JOIN receipts R
    ON RR.receipt_id = R.receipt_id
    WHERE DATE_FORMAT(R.scanned_date_time,'%Y%m') = @MOST_RECENT_MONTH
    GROUP BY COALESCE(RR.brandCode,RR.description_first_word)
    ) S1
    WHERE brand IS NOT NULL AND brand != 'ITEM' AND score != 0
    LIMIT 5
) S2
LEFT JOIN
(
    SELECT brand,ROUND(score,4) AS score,
    DENSE_RANK() OVER(ORDER BY score DESC) AS brandRank
    FROM 
    (
    SELECT COALESCE(RR.brandCode,RR.description_first_word) AS brand,
    COUNT(RR.receipt_id) AS score
    FROM rewards_receipts RR
    INNER JOIN receipts R
    ON RR.receipt_id = R.receipt_id
    WHERE DATE_FORMAT(R.scanned_date_time,'%Y%m') = DATE_FORMAT(DATE_SUB(STR_TO_DATE(CONCAT(@Most_recent_month, '01'), '%Y%m%d'), INTERVAL 1 MONTH), '%Y%m')

    GROUP BY COALESCE(RR.brandCode,RR.description_first_word)
    ) S1
    WHERE brand IS NOT NULL AND brand != 'ITEM'

) S3
ON S2.brand = S3.brand;
"""
results=run_query(engine,query_2)

columns = ["brandCode", 'Current_Receipts_scanned', "Current_Brand_Rank",'Previous_Receipts_scanned', "Previous_Brand_Rank"]
table = PrettyTable()
table.field_names = columns


for row in results:
    table.add_row(row)

print(table)


Error in the query -> This result object does not return rows. It has been closed automatically.
+-----------+--------------------------+--------------------+---------------------------+---------------------+
| brandCode | Current_Receipts_scanned | Current_Brand_Rank | Previous_Receipts_scanned | Previous_Brand_Rank |
+-----------+--------------------------+--------------------+---------------------------+---------------------+
|  MUELLER  |            29            |         1          |             4             |          51         |
|  THINDUST |            29            |         1          |             4             |          51         |
|  FLIPBELT |            28            |         2          |             22            |          33         |
|   HEINZ   |            10            |         3          |             22            |          33         |
|   BRAND   |            3             |         4          |             19            |          36         |
+------

In [97]:
#### For Month = '202101'

#### Query 2
set_month = '202101'
set_most_recent_month_query = f"SET @MOST_RECENT_MONTH = {set_month};"
run_query(engine,query = set_most_recent_month_query)

query_2 = f"""
SELECT S2.brand AS Brand, S2.score AS CurrentScore, S2.brandRank AS CurrentBrandRank,S3.score AS PreviousScore, S3.brandRank AS PreviousBrandRank
FROM
(
    SELECT brand,ROUND(score,4) AS score,
    DENSE_RANK() OVER(ORDER BY score DESC) AS brandRank
    FROM 
    (
    SELECT COALESCE(RR.brandCode,RR.description_first_word) AS brand,
    COUNT(RR.receipt_id) AS score
    FROM rewards_receipts RR
    INNER JOIN receipts R
    ON RR.receipt_id = R.receipt_id
    WHERE DATE_FORMAT(R.scanned_date_time,'%Y%m') = @MOST_RECENT_MONTH
    GROUP BY COALESCE(RR.brandCode,RR.description_first_word)
    ) S1
    WHERE brand IS NOT NULL AND brand != 'ITEM' AND score != 0
    LIMIT 5
) S2
LEFT JOIN
(
    SELECT brand,ROUND(score,4) AS score,
    DENSE_RANK() OVER(ORDER BY score DESC) AS brandRank
    FROM 
    (
    SELECT COALESCE(RR.brandCode,RR.description_first_word) AS brand,
    COUNT(RR.receipt_id) AS score
    FROM rewards_receipts RR
    INNER JOIN receipts R
    ON RR.receipt_id = R.receipt_id
    WHERE DATE_FORMAT(R.scanned_date_time,'%Y%m') = DATE_FORMAT(DATE_SUB(STR_TO_DATE(CONCAT(@Most_recent_month, '01'), '%Y%m%d'), INTERVAL 1 MONTH), '%Y%m')

    GROUP BY COALESCE(RR.brandCode,RR.description_first_word)
    ) S1
    WHERE brand IS NOT NULL AND brand != 'ITEM'

) S3
ON S2.brand = S3.brand;
"""
results=run_query(engine,query_2)

columns = ["brandCode", 'Current_Receipts_scanned', "Current_Brand_Rank",'Previous_Receipts_scanned', "Previous_Brand_Rank"]
table = PrettyTable()
table.field_names = columns


for row in results:
    table.add_row(row)

print(table)


Error in the query -> This result object does not return rows. It has been closed automatically.
+----------------+--------------------------+--------------------+---------------------------+---------------------+
|   brandCode    | Current_Receipts_scanned | Current_Brand_Rank | Previous_Receipts_scanned | Previous_Brand_Rank |
+----------------+--------------------------+--------------------+---------------------------+---------------------+
|     HY-VEE     |           296            |         1          |            None           |         None        |
| BEN AND JERRYS |           180            |         2          |            None           |         None        |
|       PC       |           138            |         3          |            None           |         None        |
|   KLARBRUNN    |           133            |         4          |            None           |         None        |
|     PEPSI      |           103            |         5          |            None  