In [82]:
## Importing necessary libraries
import pandas as pd
import numpy as np
import os
import json
import jsonlines
import datetime
import great_expectations as ge
from sqlalchemy import create_engine
import pymysql
import yaml
from sqlalchemy import create_engine
from sqlalchemy import text
import warnings
warnings.filterwarnings("ignore")

pd.set_option('display.max_rows', 100)  # Set maximum rows to 100
pd.set_option('display.max_columns', 100) # Set maximum columns to 20

In [83]:
## reading flattened csv files.

root_dir = os.getcwd()

brands_df = pd.read_csv(os.path.join(root_dir,'brands.csv'))
users_df = pd.read_csv(os.path.join(root_dir,'users.csv'))
receipts_df = pd.read_csv(os.path.join(root_dir,'receipts.csv'))
rewards_receipts_df = pd.read_csv(os.path.join(root_dir,'rewards_receipts.csv'))

In [84]:
## Functions:

# Function to read YAML file
def read_yaml(file_path):
    with open(file_path, 'r') as file:
        try:
            # Load the YAML content into a Python dictionary
            data = yaml.safe_load(file)
            return data
        except yaml.YAMLError as exc:
            print(f"Error reading YAML file: {exc}")
            return None

def create_database(db_url, db_name):
    try:
        # Connect to MySQL using SQLAlchemy's engine
        engine = create_engine(db_url)
        with engine.connect() as connection:
            # Execute the command to create a database
            connection.execute(text(f"CREATE DATABASE IF NOT EXISTS {db_name}"))
            print(f"Database '{db_name}' created successfully!")
    except Exception as e:
        print(f"Error while creating database: {e}")


# Function to run SQL query
def run_query(engine,query):
    try:
        with engine.connect() as connection:
            result = connection.execute(text(query))
            return result.fetchall()
    except Exception as e:
        print(f"Error in the query -> {e}")

#### Data Quality Assessment - Users table

Overview of data quality checks for the Users table:
1) Schema Verification: Ensuring the Schema of the table (possibly new data) be consistent with the already established schema for the Users table.
2) Checking for Duplicates: Ensuring no duplicate records.
3) If assigned primary key is unique: Verifying the uniqueness of the primary key.
4) Missing values: Identifying the columns with missing values.
5) Timestamp validation: Ensuring Last login date is after the created date.
6) Data Type Validaton: Ensuring the appropriate data types for all the columns.
7) Miscellaneous: Verifying the range of the columns.

In [85]:
## Overview of the contents of the Users table
users_df.head()

Unnamed: 0,active,role,signUpSource,state,user_id,created_date_time,lastLogin_date_time
0,True,consumer,Email,WI,5ff1e194b6a9d73a3a9f1052,2021-01-03 09:24:04.800,2021-01-03 09:25:37.858
1,True,consumer,Email,WI,5ff1e194b6a9d73a3a9f1052,2021-01-03 09:24:04.800,2021-01-03 09:25:37.858
2,True,consumer,Email,WI,5ff1e194b6a9d73a3a9f1052,2021-01-03 09:24:04.800,2021-01-03 09:25:37.858
3,True,consumer,Email,WI,5ff1e1eacfcf6c399c274ae6,2021-01-03 09:25:30.554,2021-01-03 09:25:30.597
4,True,consumer,Email,WI,5ff1e194b6a9d73a3a9f1052,2021-01-03 09:24:04.800,2021-01-03 09:25:37.858


#### 1) Schema Verification:


In [86]:
##user_df columns
users_df.columns

Index(['active', 'role', 'signUpSource', 'state', 'user_id',
       'created_date_time', 'lastLogin_date_time'],
      dtype='object')

Check passed. A more formal way of testing would be to write an expectation and verify which will be discussed in the end.

#### 2) Checking for Duplicates:

In [87]:
## Checking for duplicates

users_duplicates = users_df[users_df.duplicated()]
print(f'Percentage of Duplicate user records :{round(100*users_duplicates.shape[0]/users_df.shape[0],4)}')

Percentage of Duplicate user records :57.1717


#### Data Quality Issue spotted!!
57.17% of the records in the users table are duplicates, which is a serious data quality issue. The duplicate records are removed to ensure data quality.

In [88]:
## Deleting duplicate records
users_df_no_duplicates = users_df.drop_duplicates()

#### 3) If Primary_key : 'user_id' is unique()

In [89]:
## Checking if 'user_id' is unique
print(users_df_no_duplicates['user_id'].is_unique)

True


In [90]:
## Checking if user_id is unique
users_df_no_duplicates['user_id'].isna().sum()

0

This checks that if the chosen primary key : user_id is unique and non missing. Data Quality check Passed!

#### 4) Analyzing Missing values

In [91]:
## percentage of missing values 

users_df_no_duplicates.isna().sum()*100/users_df_no_duplicates.shape[0]

active                  0.000000
role                    0.000000
signUpSource            2.358491
state                   2.830189
user_id                 0.000000
created_date_time       0.000000
lastLogin_date_time    18.867925
dtype: float64

In [92]:
## Analyzing missing values in 'signUpSource'
users_df_no_duplicates[users_df_no_duplicates['signUpSource'].isna()]

Unnamed: 0,active,role,signUpSource,state,user_id,created_date_time,lastLogin_date_time
388,True,consumer,,WI,55308179e4b0eabd8f99caa2,2015-04-16 22:43:53.186,2018-05-07 12:23:40.003
395,True,fetch-staff,,WI,59c124bae4b0299e55b0f330,2017-09-19 09:07:54.302,2021-02-08 10:42:58.117
422,True,consumer,,,5a43c08fe4b014fd6b6a0612,2017-12-27 09:47:27.059,2021-02-12 10:22:37.155
462,True,fetch-staff,,IL,5964eb07e4b03efd0c0f267b,2017-07-11 10:13:11.771,2021-03-04 13:07:49.770
475,True,fetch-staff,,,54943462e4b07e684157a532,2014-12-19 08:21:22.381,2021-03-05 10:52:23.204


In [93]:
## Unique values of signUpSource
users_df_no_duplicates['signUpSource'].unique()

array(['Email', 'Google', nan], dtype=object)

3 out of 6 missing values occur for the user who is a fetch-staff. The unique values for signup source are Email and Google. It is very much possible that fetch-staff can signUp using an internal source which is not email or google. For better data quality, a signupsource exclusively for staff members could be alloted if that is the case. The resources list could be expanded including possible valid sources  such as **App Store**, **Referral**, and other relevant platforms, to guarantee that this field is complete for all users. If the field is still missing, it could be imputed as 'others'.

In [94]:
## Analyzing missing values in States
users_df_no_duplicates[users_df_no_duplicates['state'].isna()]

Unnamed: 0,active,role,signUpSource,state,user_id,created_date_time,lastLogin_date_time
344,True,consumer,Email,,60145ff384231211ce796d51,2021-01-29 13:20:19.722,
375,True,consumer,Email,,60186237c8b50e11d8454d5f,2021-02-01 14:19:03.551,
422,True,consumer,,,5a43c08fe4b014fd6b6a0612,2017-12-27 09:47:27.059,2021-02-12 10:22:37.155
432,True,fetch-staff,Email,,5fbc35711d967d1222cbfefc,2020-11-23 16:19:29.509,2021-02-25 22:25:51.057
455,True,fetch-staff,Email,,5fa41775898c7a11a6bcef3e,2020-11-05 09:17:09.396,2021-03-04 10:02:02.026
475,True,fetch-staff,,,54943462e4b07e684157a532,2014-12-19 08:21:22.381,2021-03-05 10:52:23.204


In [95]:
## Unique values of state
users_df_no_duplicates['state'].unique()

array(['WI', 'KY', 'AL', 'CO', 'IL', nan, 'OH', 'SC', 'NH'], dtype=object)

Missing values could be imputed with 'DNF' (Did not fill) or 'N/A' or 'None' approriately.

In [96]:
## Analyzing Missing Last Login Date:

missing_last_login = users_df_no_duplicates[users_df_no_duplicates['lastLogin_date_time'].isna()]
missing_last_login.head()
# missing_last_login.groupby('signUpSource')\
#     .agg({'signUpSource':'size'})\
#     .rename(columns = {'signUpSource':'count'})\
#     .sort_values(by = 'count',ascending= False)\
#     *100/(missing_last_login.shape[0])

Unnamed: 0,active,role,signUpSource,state,user_id,created_date_time,lastLogin_date_time
97,True,consumer,Email,KY,5ff616a68f142f11dd189163,2021-01-06 13:59:34.996,
143,True,consumer,Email,AL,5ffe115404929101d0aaebb2,2021-01-12 15:15:00.208,
170,True,consumer,Google,WI,5e27526d0bdb6a138c32b556,2020-01-21 13:35:09.795,
180,True,consumer,Email,WI,6002475cfb296c121a81b98d,2021-01-15 19:54:36.571,
181,True,consumer,Email,WI,60024f24e257124ec6b99a13,2021-01-15 20:27:49.345,


The missing last login data indicates that users are not immediately logging in after signing up. Of the users with missing last login dates, 95% signed up via email, while only 5% signed up through Google. This suggests a potential bottleneck in the email sign-up process, where users may be unable to log in right after signing up. One possible cause could be permission issues or other technical restrictions that prevent the app from opening or proceeding to the login page after email registration. Further investigation is needed to identify and address this issue to streamline the sign-up and login experience, especially for email users.

It would be interesting to check if the user_ids for the above table could be present in receipts_df. It is expected that the user_ids would not be present in receipts_df.

In [97]:
## Checking if the user_id present in the missing last login date if they made any orders.
missing_last_login_merge_receipts = missing_last_login\
    .merge(receipts_df, how = 'inner',left_on = 'user_id',right_on = 'userId')\
    [['user_id','created_date_time_x','lastLogin_date_time',\
      'bonusPointsEarned','receipt_id','scanned_date_time']]
missing_last_login_merge_receipts.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 57 entries, 0 to 56
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   user_id              57 non-null     object 
 1   created_date_time_x  57 non-null     object 
 2   lastLogin_date_time  0 non-null      object 
 3   bonusPointsEarned    32 non-null     float64
 4   receipt_id           57 non-null     object 
 5   scanned_date_time    57 non-null     object 
dtypes: float64(1), object(5)
memory usage: 3.1+ KB


#### Data Quality Issue spotted!!
It is interesting to see that the above data suggests that users with no recorded last login date are still able to scan receipts and earn bonus points. This is unexpected behavior, as we would typically expect a last login date to be present once a user interacts with the app to scan receipts. This anomaly needs to be investigated further, as it could indicate a logging issue or a flaw in the system where login events are not properly captured before allowing user actions such as receipt scanning.

#### 5) Timestamp validation

In [98]:
##Checking if last login date is after creation date

users_df_no_duplicates[users_df_no_duplicates['lastLogin_date_time'] \
                       < users_df_no_duplicates['created_date_time']].shape

(0, 7)

No data entries where last login date and time is before creation date and time.

#### 6) Appropriate formatting of the data

In [99]:
## Data types for the users
users_df_no_duplicates.dtypes

active                   bool
role                   object
signUpSource           object
state                  object
user_id                object
created_date_time      object
lastLogin_date_time    object
dtype: object

I would check if there are any discrepancies in the way the strings are entered. Discrepancy definition: 'GOOGLE' and 'Google'

In [100]:
## checking for discrepancies of the 'object' columns
for columns in ['active','role','signUpSource','state']:
    print(users_df_no_duplicates[columns].unique())

[ True False]
['consumer' 'fetch-staff']
['Email' 'Google' nan]
['WI' 'KY' 'AL' 'CO' 'IL' nan 'OH' 'SC' 'NH']


There are no string discrepancies.

It would be better to convert Date columns from strings to datetime objects.

In [101]:
## Objects to date conversion:
users_df_no_duplicates['created_date_time'] = pd.to_datetime(users_df_no_duplicates['created_date_time'], format='%Y-%m-%d %H:%M:%S.%f')
users_df_no_duplicates['lastLogin_date_time'] = pd.to_datetime(users_df_no_duplicates['lastLogin_date_time'], format='%Y-%m-%d %H:%M:%S.%f')


#### 7) Other Miscellaneous checks:

I would check if the state column is in correct abbrevation and would belong to the states from USA.


In [102]:
## Checking if the states from users_table are subset of the states_abbreviations
states_abbr = [
    'AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA', 'HI', 'ID', 
    'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA', 'MI', 'MN', 'MS', 
    'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 
    'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 
    'WI', 'WY', np.nan
]

[state for state in users_df_no_duplicates['state'] if state not in states_abbr]

[]

In [103]:
## Date range checks for date type columns
## choosing a 100 year valid period starting from 2000
start_date = pd.to_datetime('2000-01-01 00:00:00.000')
end_date = pd.to_datetime('2100-12-31 00:00:00.000')

for column in ['created_date_time','lastLogin_date_time']:
    if users_df_no_duplicates[\
        (users_df_no_duplicates[column] > end_date) \
            | (users_df_no_duplicates[column] < start_date)].shape[0] == 0:
        print(f'Check passed for {column}')
    else:
        print(f'Check failed for {column}')

Check passed for created_date_time
Check passed for lastLogin_date_time


Checking if the range of the created_date_time and last_login_date_time to understand if the data is missing for certain months or years

In [104]:
# Extracting year and month for easier grouping
users_df_no_duplicates['created_year_month'] = users_df_no_duplicates['created_date_time'].dt.to_period('M')

# Checking the range of created dates
created_range = users_df_no_duplicates['created_date_time'].min(), users_df_no_duplicates['created_date_time'].max()
print("Created date range:", created_range)

# Grouping by month to check if data is missing for certain periods
created_counts = users_df_no_duplicates['created_year_month'].value_counts().sort_index()

print("Created counts by month:\n",created_counts)

Created date range: (Timestamp('2014-12-19 08:21:22.381000'), Timestamp('2021-02-12 08:11:06.240000'))
Created counts by month:
 2014-12      1
2015-04      1
2017-07      1
2017-09      1
2017-12      1
2020-01      1
2020-07      1
2020-11      4
2020-12      1
2021-01    170
2021-02     30
Freq: M, Name: created_year_month, dtype: int64


#### Data Quality issue spotted!! 

The sporadic distribution of `created_date_time`, with entire years like **2016**, **2018**, and **2019** missing, seems suspicious. Unless there's a specific reason for these gaps, I would flag it as a potential data quality issue for further investigation.

#### Data Quality Assessment - Brands table

Similar to Data Quality Assessment of Users table, here is the 
Overview of data quality checks for the Brands table:

1) Schema Verification: Ensuring the Schema of the table (possibly new data) be consistent with the already established schema for the Users table.
2) Checking for Duplicates: Ensuring no duplicate records.
3) If assigned primary key is unique: Verifying the uniqueness of the primary key.
4) Missing values: Identifying the columns with missing values.
5) Data Type Validaton: Ensuring the appropriate data types for all the columns.


Additionally,
Certain entities are expected to maintain one-one relationship between them. For example: Brand Name and Brand Code are expected to maintain one-one relationship. Some of the checks are written to analyze this in detail.

In [105]:
## over view of the contents of the brands table
brands_df.head()

Unnamed: 0,barcode,category,categoryCode,name,topBrand,brand_id,cpg_id,cpg_ref,brandCode
0,511111019862,Baking,BAKING,test brand @1612366101024,False,601ac115be37ce2ead437551,601ac114be37ce2ead437550,Cogs,
1,511111519928,Beverages,BEVERAGES,Starbucks,False,601c5460be37ce2ead43755f,5332f5fbe4b03c9a25efd0ba,Cogs,STARBUCKS
2,511111819905,Baking,BAKING,test brand @1612366146176,False,601ac142be37ce2ead43755d,601ac142be37ce2ead437559,Cogs,TEST BRANDCODE @1612366146176
3,511111519874,Baking,BAKING,test brand @1612366146051,False,601ac142be37ce2ead43755a,601ac142be37ce2ead437559,Cogs,TEST BRANDCODE @1612366146051
4,511111319917,Candy & Sweets,CANDY_AND_SWEETS,test brand @1612366146827,False,601ac142be37ce2ead43755e,5332fa12e4b03c9a25efd1e7,Cogs,TEST BRANDCODE @1612366146827


#### 1) Schema Verification:


In [106]:
##brand_df columns
brands_df.columns

Index(['barcode', 'category', 'categoryCode', 'name', 'topBrand', 'brand_id',
       'cpg_id', 'cpg_ref', 'brandCode'],
      dtype='object')

Check passed. A more formal way of testing would be to write an expectation and verify which will be discussed in the end.

#### 2) Duplicates, Uniqueness of the primary key and Missing values of Primary Key check:

In [107]:
## Checking for duplicate entries in brands table
if brands_df[brands_df.duplicated()].shape[0] == 0:
    print('No Duplicate entries in Brands table')
else:
    print('Duplicate entries in Brands table')

## Checking if brand_id is unique
if brands_df['brand_id'].is_unique:
    print('brand_id is unique')
else:
    print('brand_id is not unique')

## Checking if there are any missing values in brands_id
if brands_df['brand_id'].isna().sum() == 0:
    print('brand_id is not missing')
else:
    print('brand_id is missing')

No Duplicate entries in Brands table
brand_id is unique
brand_id is not missing


3) One-One relationship between columns

In [108]:
##checking if brand name and brand code have one-one to relationship (which is expected)

inconsistent = brands_df.groupby('name')['brandCode'].nunique().reset_index()
inconsistent = inconsistent[inconsistent['brandCode'] > 1]
inconsistent

Unnamed: 0,name,brandCode
73,Baken-Ets,2
129,Caleb's Kola,2
223,Dippin Dots® Cereal,2
313,Health Magazine,2
335,I CAN'T BELIEVE IT'S NOT BUTTER!,2
504,ONE A DAY® WOMENS,2
564,Pull-Ups,2


It is expected that brand name and brand code would go hand in hand but the above table disagrees. Sampling a couple of data points from the above table would bring us closer to the issue.

In [109]:
## Deep analysis of why the name and brandCode are inconsistent

print(brands_df[brands_df['name'] == 'Baken-Ets'][['name','brandCode']])
print(brands_df[brands_df['name'] == 'Caleb\'s Kola'][['name','brandCode']])
print(brands_df[brands_df['name'] == 'Dippin Dots® Cereal'][['name','brandCode']])
print(brands_df[brands_df['name'] == 'I CAN\'T BELIEVE IT\'S NOT BUTTER!'][['name','brandCode']])

          name  brandCode
574  Baken-Ets  BAKEN ETS
848  Baken-Ets  BAKEN-ETS
             name     brandCode
140  Caleb's Kola  CALEB'S KOLA
740  Caleb's Kola   CALEBS KOLA
                     name           brandCode
1081  Dippin Dots® Cereal         DIPPIN DOTS
1163  Dippin Dots® Cereal  DIPPIN DOTS CEREAL
                                 name                         brandCode
176  I CAN'T BELIEVE IT'S NOT BUTTER!  I CAN'T BELIEVE IT'S NOT BUTTER!
846  I CAN'T BELIEVE IT'S NOT BUTTER!   I CAN'T BELIEVE IT'S NOT BUTTER


#### Data Quality Issue spotted

The spelling and punctuation mistakes in the brandCode highlight the inconsistency in the relationship between brandCode and brand name. This data quality issue needs to be addressed, as brandCode is a crucial parameter for identifying a brand. It is a serious problem when trying to understand and analyse brand related KPIs. Additionally, brandCode serves as a foreign key in the rewards_Receipts table, as discussed in the first part of this project. To ensure data integrity, it is essential to have a consistent, error-free one-to-one mapping between brandCode and brand name.

In [110]:
## Automation code to check if there are any one-one relatipnship between non primary keys in a dataframe.

df = brands_df # one of the four tables: Brands, Users, Receipts and Rewards_Receipts

columns_might_have_one_one_relationship = ['name','brandCode','topBrand','brand_id']

for columni in columns_might_have_one_one_relationship:
    for columnj in columns_might_have_one_one_relationship:

        if columni != columnj: 
            inconsistent = df.groupby(columni)[columnj].nunique().reset_index()
            inconsistent = inconsistent[inconsistent[columnj] > 1]
            if  not inconsistent.empty:
                print(f'one-one relationship does not exist between {columni} and {columnj}')

one-one relationship does not exist between name and brandCode
one-one relationship does not exist between name and topBrand
one-one relationship does not exist between name and brand_id
one-one relationship does not exist between brandCode and name
one-one relationship does not exist between brandCode and topBrand
one-one relationship does not exist between brandCode and brand_id
one-one relationship does not exist between topBrand and name
one-one relationship does not exist between topBrand and brandCode
one-one relationship does not exist between topBrand and brand_id


In [111]:
##checking if brand name and topbrand have one-one to relationship (which is expected)

inconsistent = brands_df.groupby('name')['topBrand'].nunique().reset_index()
inconsistent = inconsistent[inconsistent['topBrand'] > 1]
inconsistent

Unnamed: 0,name,topBrand
333,Huggies,2
564,Pull-Ups,2


In [112]:
## Deep analysis of why the name and topbrand are inconsistent

print(brands_df[brands_df['name'] == 'Huggies'][['name','topBrand']])

print(brands_df[brands_df['name'] == 'Pull-Ups'][['name','topBrand']])

         name topBrand
628   Huggies    False
1074  Huggies     True
         name topBrand
126  Pull-Ups    False
978  Pull-Ups     True


#### Data Quality Issue spotted

Based on the above table, it is difficult to determine whether Huggies qualifies as a top brand due to the inconsistency in the data. Such inconsistencies are undesirable and may impede further analysis, especially when evaluating key performance indicators (KPIs) related to brand performance. For meaningful insights, the classification of topBrand should be consistently defined and applied at the brand level to ensure accurate and reliable analysis.

In [113]:
##checking if brand name and brand_id have one-one to relationship (which is expected)

inconsistent = brands_df.groupby('name')['brand_id'].nunique().reset_index()
inconsistent = inconsistent[inconsistent['brand_id'] > 1]
inconsistent

Unnamed: 0,name,brand_id
73,Baken-Ets,2
129,Caleb's Kola,2
215,Diabetic Living Magazine,2
223,Dippin Dots® Cereal,2
313,Health Magazine,2
333,Huggies,2
335,I CAN'T BELIEVE IT'S NOT BUTTER!,2
504,ONE A DAY® WOMENS,2
564,Pull-Ups,2
627,Sierra Mist,2


An interesting question arises here. Can the same brand name have a different brand_uuid? The answer to this question depends on how FETCH assigns brand_uuids to the products/brands.

#### Missing Value checks

In [114]:
## Missing value percentages in brands_df
brands_df.isna().sum()*100/brands_df.shape[0]

barcode          0.000000
category        13.281919
categoryCode    55.698372
name             0.000000
topBrand        52.442159
brand_id         0.000000
cpg_id           0.000000
cpg_ref          0.000000
brandCode       23.050557
dtype: float64

#### Data Quality issue spotted (Missing category values)

13% percent of the category data is missing. Can the category field be imputed using the brand name? Maybe yes. However, there could be cases for example the brand 'Nike'; they could sell shoes and apparels, so the category could be ambiguous here. Therefore, it would not be straight forward to impute category and just by the brand name. 

Ways like automating data entry processes using reference tables that map barcode or brandCode to a specific category and categoryCode could be explored.

In [115]:
## Understanding why category is missing.

category_missing_brands = brands_df[brands_df['category'].isna()]
category_missing_brands.head()

Unnamed: 0,barcode,category,categoryCode,name,topBrand,brand_id,cpg_id,cpg_ref,brandCode
11,511111102540,,,MorningStar,,57c08106e4b0718ff5fcb02c,5332f5f2e4b03c9a25efd0aa,Cpgs,
23,511111303947,,,Bottled Starbucks,,5332f5fee4b03c9a25efd0bd,53e10d6368abd3c7065097cc,Cpgs,
24,511111802914,,,Full Throttle,,5332fa7ce4b03c9a25efd22e,5332f5ebe4b03c9a25efd0a8,Cpgs,
34,511111103653,,,Gold Medal,,5332f772e4b03c9a25efd125,5332f5f3e4b03c9a25efd0ae,Cpgs,
43,511111503699,,,Glaceau vitaminwater,,5332f765e4b03c9a25efd11f,5332f5ebe4b03c9a25efd0a8,Cpgs,


#### Data Quality issue spotted (Missing categoryCode values)

In [116]:
## Understanding why categoryCode is missing given category.
## Later checking if missing categoryCode values are missing for all the respective categories

categoryCode_missing_brands = brands_df[(~brands_df['category'].isna()) & (brands_df['categoryCode'].isna())]\
    [['category','categoryCode']].drop_duplicates().sort_values(by='category')
categoryCode_missing_brands

categoryCode_present_brands = brands_df[(~brands_df['category'].isna()) & (~brands_df['categoryCode'].isna())]\
    [['category','categoryCode']].drop_duplicates().sort_values(by='category')

categoryCode_missing_brands.merge(categoryCode_present_brands, \
                                how = 'left', suffixes=('_missing', '_present'),\
                                on = 'category').sort_values(by = 'categoryCode_present',ascending=False).head(9)

Unnamed: 0,category,categoryCode_missing,categoryCode_present
16,Personal Care,,PERSONAL_CARE
15,Magazines,,MAGAZINES
13,Health & Wellness,,HEALTHY_AND_WELLNESS
12,Grocery,,GROCERY
11,Frozen,,FROZEN
5,Beverages,,BEVERAGES
4,Beer Wine Spirits,,BEER_WINE_SPIRITS
1,Baking,,BAKING
0,Baby,,BABY


#### Data Quality Issue spotted.
The table above reveals that for certain categories, the categoryCode is missing in some instances, while it is properly filled for the same category in other records. This inconsistency highlights a data quality issue where categoryCode has not been filled appropriately, despite having clear information on how it should be completed.

#### Data Quality Issue spotted (Missing brandCodes)

23% of the brandCodes are missing. This is a huge Data Quality issue as 'brandCode' is going to play a key role in connecting tables (connecting rewards_Receipts table in this project). The missing values need to be imputed appropriately. The way the brandCode information is entered could be further investigated to ensure completeness.

In [117]:
## Understanding why brandCode is missing given brand_name.
## Later checking if missing brandCode values are missing for all the respective brand_names

brandCode_missing_brands = brands_df[(~brands_df['name'].isna()) & (brands_df['brandCode'].isna())]\
    [['brandCode','name']].drop_duplicates().sort_values(by='name')
brandCode_missing_brands

brandCode_present_brands = brands_df[(~brands_df['name'].isna()) & (~brands_df['brandCode'].isna())]\
    [['brandCode','name']].drop_duplicates().sort_values(by='name')

brandCode_missing_brands.merge(brandCode_present_brands, \
                                how = 'left', suffixes=('_missing', '_present'),\
                                on = 'name').sort_values(by = 'brandCode_present',ascending=False).head(3)

Unnamed: 0,brandCode_missing,name,brandCode_present
193,,V8 Hydrate,V8 HYDRATE
173,,Sierra Mist,SIERRA MIST
57,,Diabetic Living Magazine,511111805298


#### Data Quality Issue spotted.
The table above reveals that for certain brands, the brandCode is missing in some instances, while it is properly filled for the same brand in other records. This inconsistency highlights a data quality issue where brandCode has not been filled appropriately, despite having clear information on how it should be completed.

Missing top_brand information can be flagged as a mild Data Quality Issue. A third category, such as ‘undecided,’ could be introduced to handle these cases.

#### Consistent Data formatting

In [118]:
brands_df.dtypes

barcode          int64
category        object
categoryCode    object
name            object
topBrand        object
brand_id        object
cpg_id          object
cpg_ref         object
brandCode       object
dtype: object

For TopBrand The ideal data type would be Boolean. There is a need to update that.

In [119]:
## Converting Top brand to Boolean

brands_df['topBrand'] = brands_df['topBrand'].astype('boolean')
brands_df.dtypes

barcode           int64
category         object
categoryCode     object
name             object
topBrand        boolean
brand_id         object
cpg_id           object
cpg_ref          object
brandCode        object
dtype: object

In [120]:
## Verifying if the barcode is in consistent format -> 12 digit
if brands_df[brands_df['barcode'].apply(lambda x: len(str(x))) != 12].empty:
    print('All the barcodes are in consistent format -> 12 digits')
else:
    print('Barcodes is not in consistent format')
    print(brands_df[brands_df['barcode'].apply(lambda x: len(str(x))) != 12]['barcode'])

All the barcodes are in consistent format -> 12 digits


#### Data Quality Assessment for Receipts table

#### Schema verification:


In [121]:
receipts_df.columns

Index(['bonusPointsEarned', 'bonusPointsEarnedReason', 'pointsEarned',
       'purchasedItemCount', 'rewardsReceiptItemList', 'rewardsReceiptStatus',
       'totalSpent', 'userId', 'receipt_id', 'created_date_time',
       'scanned_date_time', 'finished_date_time', 'modify_date_time',
       'pointsAwarded_date_time', 'purchased_date_time'],
      dtype='object')

In [122]:
## Overview of the Receipts table
receipts_df.head(2)

Unnamed: 0,bonusPointsEarned,bonusPointsEarnedReason,pointsEarned,purchasedItemCount,rewardsReceiptItemList,rewardsReceiptStatus,totalSpent,userId,receipt_id,created_date_time,scanned_date_time,finished_date_time,modify_date_time,pointsAwarded_date_time,purchased_date_time
0,500.0,"Receipt number 2 completed, bonus point schedu...",500.0,5.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,26.0,5ff1e1eacfcf6c399c274ae6,5ff1e1eb0a720f0523000575,2021-01-03 09:25:31.000,2021-01-03 09:25:31.000,2021-01-03 09:25:31.000,2021-01-03 09:25:36.000,2021-01-03 09:25:31,2021-01-02 18:00:00
1,150.0,"Receipt number 5 completed, bonus point schedu...",150.0,2.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,11.0,5ff1e194b6a9d73a3a9f1052,5ff1e1bb0a720f052300056b,2021-01-03 09:24:43.000,2021-01-03 09:24:43.000,2021-01-03 09:24:43.000,2021-01-03 09:24:48.000,2021-01-03 09:24:43,2021-01-02 09:24:43


In [123]:
## Checking for duplicate entries in receipts table
if receipts_df[receipts_df.duplicated()].shape[0] == 0:
    print('No Duplicate entries in Receipts table')
else:
    print('Duplicate entries in Brands table')
## Checking if receipt_id is unique
if receipts_df['receipt_id'].is_unique:
    print('receipt_id is unique')
else:
    print('receipt_id is not unique')
## Checking if there are any missing values in brands_id
if receipts_df['receipt_id'].isna().sum() == 0:
    print('receipt_id is not missing')
else:
    print('receipt_id is missing')

No Duplicate entries in Receipts table
receipt_id is unique
receipt_id is not missing


Standard Data Quality checks are passed, concering the primary key that is receipt_id.

#### Missing Value Checks

In [124]:
## Missing value percentage for Receipts table

receipts_df.isna().sum()*100/receipts_df.shape[0]

bonusPointsEarned          51.385165
bonusPointsEarnedReason    51.385165
pointsEarned               45.576408
purchasedItemCount         43.252904
rewardsReceiptItemList     39.320822
rewardsReceiptStatus        0.000000
totalSpent                 38.873995
userId                      0.000000
receipt_id                  0.000000
created_date_time           0.000000
scanned_date_time           0.000000
finished_date_time         49.240393
modify_date_time            0.000000
pointsAwarded_date_time    52.010724
purchased_date_time        40.035746
dtype: float64

In [125]:
#### Understanding if the missing points are related to the rewards_status, as it could be that for status being not finished, the rewards points are missing.
missing_points_df = receipts_df[receipts_df['pointsEarned'].isna()]
missing_points_percent_count = missing_points_df\
    .groupby('rewardsReceiptStatus').agg({'rewardsReceiptStatus':'size'})\
    .rename(columns = {'rewardsReceiptStatus':'Missing_Percentage'})\
    .sort_values(by = 'Missing_Percentage',ascending = False)*100/missing_points_df.shape[0]
missing_points_percent_count

Unnamed: 0_level_0,Missing_Percentage
rewardsReceiptStatus,Unnamed: 1_level_1
SUBMITTED,85.098039
PENDING,9.803922
FLAGGED,2.54902
REJECTED,2.54902


In [126]:
## Unique values of the Status
receipts_df['rewardsReceiptStatus'].unique()

array(['FINISHED', 'REJECTED', 'FLAGGED', 'SUBMITTED', 'PENDING'],
      dtype=object)

The two cells above show that points are missing for entries with a status of ‘NOT FINISHED,’ which makes sense since it likely indicates the records are still awaiting points. Therefore, I don’t consider missing points/bonusPoints/bonusPointsEarnedReason a serious data quality issue.

In [127]:
##Checking rewards item list that are empty

missing_items = receipts_df[receipts_df['rewardsReceiptItemList'].isna()]
missing_items.describe()

Unnamed: 0,bonusPointsEarned,pointsEarned,purchasedItemCount,totalSpent
count,2.0,2.0,5.0,5.0
mean,375.0,375.0,0.0,0.0
std,176.776695,176.776695,0.0,0.0
min,250.0,250.0,0.0,0.0
25%,312.5,312.5,0.0,0.0
50%,375.0,375.0,0.0,0.0
75%,437.5,437.5,0.0,0.0
max,500.0,500.0,0.0,0.0


In [128]:
## Understanding how the missing values distributed across other columns when items are missing
missing_items.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 440 entries, 71 to 1118
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   bonusPointsEarned        2 non-null      float64
 1   bonusPointsEarnedReason  2 non-null      object 
 2   pointsEarned             2 non-null      float64
 3   purchasedItemCount       5 non-null      float64
 4   rewardsReceiptItemList   0 non-null      object 
 5   rewardsReceiptStatus     440 non-null    object 
 6   totalSpent               5 non-null      float64
 7   userId                   440 non-null    object 
 8   receipt_id               440 non-null    object 
 9   created_date_time        440 non-null    object 
 10  scanned_date_time        440 non-null    object 
 11  finished_date_time       3 non-null      object 
 12  modify_date_time         440 non-null    object 
 13  pointsAwarded_date_time  2 non-null      object 
 14  purchased_date_time     

From the above information, it’s clear that the receipt scanning algorithm is not capturing the contents of some receipts. This could be due to illegible receipt content or technical issues with the computer vision algorithm. While I wouldn’t flag this as a major data quality issue, I would store this information and use it to notify users that their receipts couldn’t be scanned, prompting them to try again later.

#### Chronology of the Date variables check

The expected order is as follows: first, the items are purchased (1), then the instance is created (2), followed by the contents being scanned (3), then finished (4), and finally modified (5).

In [129]:
# Check if chronology is maintained across multiple columns
chronology_issues = receipts_df[
    (receipts_df['purchased_date_time'] > receipts_df['created_date_time']) |
    (receipts_df['created_date_time'] > receipts_df['scanned_date_time']) |
    (receipts_df['scanned_date_time'] > receipts_df['finished_date_time'])
]
# Display the Data percentage where chronology is not maintained
print(f'Percentage of the Data not following the above mentioned chronological order \
= {round(100*chronology_issues.shape[0]/receipts_df.shape[0],4)}')

Percentage of the Data not following the above mentioned chronological order = 1.1618


#### Data Quality Issue spotted

There’s a need for further investigation to understand why these chronological issues are occurring.

#### Checking if the users in the Receipts table are present in the Users table

In [130]:
## checking for presence of the users present in the receipts table but not in the Users table
percentage_of_users_not_in_users_table = 100*receipts_df[~receipts_df['userId']\
    .isin(users_df_no_duplicates['user_id'])].shape[0]/len(receipts_df['userId'].unique())

print(f'Percentage of the Users not present in the Users table = {round(percentage_of_users_not_in_users_table,4)}%')


Percentage of the Users not present in the Users table = 57.3643%


In [131]:
## checking for presence of the users present in the receipts table but not in the Users table
user_id_in_receipts_df = list(receipts_df['userId'].unique())
user_id_in_users_df = list(users_df_no_duplicates['user_id'].unique())

missing_user_id = [x for x in user_id_in_receipts_df if x not in user_id_in_users_df]
percentage_of_users_not_in_users_table = 100*len(missing_user_id)/len(user_id_in_receipts_df)

print(f'Percentage of the Users not present in the Users table = {round(percentage_of_users_not_in_users_table,4)}%')

Percentage of the Users not present in the Users table = 45.3488%


#### Data Quality Issue spotted
It's expected that users who made purchases should appear in the Users table. 45.38% suggests that the Users table is not being refreshed in a timely manner, and consistency needs to be ensured.

#### Data Format Checks

In [132]:
## Objects to date conversion:
receipts_df['created_date_time'] = pd.to_datetime(receipts_df['created_date_time'], format='%Y-%m-%d %H:%M:%S.%f')
receipts_df['scanned_date_time'] = pd.to_datetime(receipts_df['scanned_date_time'], format='%Y-%m-%d %H:%M:%S.%f')
receipts_df['finished_date_time'] = pd.to_datetime(receipts_df['finished_date_time'], format='%Y-%m-%d %H:%M:%S.%f')
receipts_df['modify_date_time'] = pd.to_datetime(receipts_df['modify_date_time'], format='%Y-%m-%d %H:%M:%S.%f')
receipts_df['pointsAwarded_date_time'] = pd.to_datetime(receipts_df['pointsAwarded_date_time'], format='%Y-%m-%d %H:%M:%S.%f')
receipts_df['purchased_date_time'] = pd.to_datetime(receipts_df['purchased_date_time'], format='%Y-%m-%d %H:%M:%S.%f')

#### Data Range Checks

In [133]:
## Description of numerical variables in receipts table
receipts_df.describe()

Unnamed: 0,bonusPointsEarned,pointsEarned,purchasedItemCount,totalSpent
count,544.0,609.0,635.0,684.0
mean,238.893382,585.96289,14.75748,77.796857
std,299.091731,1357.166947,61.13424,347.110349
min,5.0,0.0,0.0,0.0
25%,5.0,5.0,1.0,1.0
50%,45.0,150.0,2.0,18.2
75%,500.0,750.0,5.0,34.96
max,750.0,10199.8,689.0,4721.95


From the table above, we can see that the median is much smaller than the average, indicating a right-skewed distribution. This suggests the presence of outliers in the data. While outliers don't necessarily signal a major data quality issue, they could be flagged as a mild concern. These outliers should be monitored closely for any signs of fraudulent scans or unusual patterns, but they don’t indicate a serious data quality problem on their own.

In [134]:
## Date range checks for date type columns
## choosing a 100 year valid period starting from 2000
start_date = pd.to_datetime('2000-01-01 00:00:00.000')
end_date = pd.to_datetime('2100-12-31 00:00:00.000')

for column in ['created_date_time','scanned_date_time','finished_date_time','modify_date_time','pointsAwarded_date_time','purchased_date_time']:
    if receipts_df[\
        (receipts_df[column] > end_date) \
            | (receipts_df[column] < start_date)].shape[0] == 0:
        print(f'Check passed for {column}')
    else:
        print(f'Check failed for {column}')

Check passed for created_date_time
Check passed for scanned_date_time
Check passed for finished_date_time
Check passed for modify_date_time
Check passed for pointsAwarded_date_time
Check passed for purchased_date_time


#### Data Quality Assessment - Rewards_Receipts table:

In [135]:
## Schema verification
rewards_receipts_df.columns

Index(['barcode', 'description', 'finalPrice', 'itemPrice', 'needsFetchReview',
       'partnerItemId', 'preventTargetGapPoints', 'quantityPurchased',
       'userFlaggedBarcode', 'userFlaggedNewItem', 'userFlaggedPrice',
       'userFlaggedQuantity', 'needsFetchReviewReason',
       'pointsNotAwardedReason', 'pointsPayerId', 'rewardsGroup',
       'rewardsProductPartnerId', 'userFlaggedDescription',
       'originalMetaBriteBarcode', 'originalMetaBriteDescription', 'brandCode',
       'competitorRewardsGroup', 'discountedItemPrice',
       'originalReceiptItemText', 'itemNumber',
       'originalMetaBriteQuantityPurchased', 'pointsEarned', 'targetPrice',
       'competitiveProduct', 'originalFinalPrice',
       'originalMetaBriteItemPrice', 'deleted', 'priceAfterCoupon',
       'metabriteCampaignId', 'receipt_id'],
      dtype='object')

#### Standard Data Quality Checks:

In [136]:
## Checking for duplicate entries in rewards_receipts table
if rewards_receipts_df[rewards_receipts_df.duplicated()].shape[0] == 0:
    print('No Duplicate entries in Rewards_Receipts table')
else:
    print('Duplicate entries in rewards_receipts table')
## Checking if receipt_id is unique
if rewards_receipts_df[['receipt_id','partnerItemId']].drop_duplicates().shape[0] == rewards_receipts_df.shape[0]:
    print('receipt_id and partnerItemId pair is unique')
else:
    print('receipt_id and partnerItemId pair is not unique')
## Checking if there are any missing values in brands_id
if rewards_receipts_df[['receipt_id','partnerItemId']].isna().sum().sum() == 0:
    print('receipt_id and partnerItemId pair is not missing')
else:
    print('receipt_id and partnerItemId pair is missing')

No Duplicate entries in Rewards_Receipts table
receipt_id and partnerItemId pair is unique
receipt_id and partnerItemId pair is not missing


#### Missing Value Checks

In [137]:
## Missing Value Checks
rewards_receipts_df.isna().sum()

barcode                               3851
description                            381
finalPrice                             174
itemPrice                              174
needsFetchReview                      6128
partnerItemId                            0
preventTargetGapPoints                6583
quantityPurchased                      174
userFlaggedBarcode                    6604
userFlaggedNewItem                    6618
userFlaggedPrice                      6642
userFlaggedQuantity                   6642
needsFetchReviewReason                6722
pointsNotAwardedReason                6601
pointsPayerId                         5674
rewardsGroup                          5210
rewardsProductPartnerId               4672
userFlaggedDescription                6787
originalMetaBriteBarcode              6917
originalMetaBriteDescription          6931
brandCode                             4341
competitorRewardsGroup                6666
discountedItemPrice                   1172
originalRec

A significant portion of the data is missing, with some occurring in groups, such as the 'userflagged_' variables, which have many missing values. A reasonable assumption is that these missing values correspond to instances where the user did not flag anything. Similarly, missing values in pointsNotAwardedReason might indicate that points were awarded. Another similar group is OriginalMetaBrite. It should be reviewed whether it's valuable to retain such largely missing data, depending on the business use case. Further investigation is needed to determine the reason for these missing values. Since the foreign key used is brandCode, we can dig deeper to explore potential improvements.

#### Missing BrandCode values and possible imputation techniques

First, we could verify if the chosen foreign key, brandCode, is more suitable than barcode for establishing the link between the rewards_receipts table and the brands table to understand the associated brands.

In [138]:
## checking for presence of the barcodes present in the rewards_receipts table and in the brands table
def convert_barcode(barcode):
    try:
        return int(barcode)
    except:
        return barcode

barcode_in_rewards_receipts_df = list(rewards_receipts_df['barcode'].apply(convert_barcode).unique())
barcode_in_brands_df = list(brands_df['barcode'].unique())

barcode_present = [x for x in barcode_in_rewards_receipts_df if x in barcode_in_brands_df]
print(f'Number of the barcodes present in both the tables = {len(barcode_present)}')

Number of the barcodes present in both the tables = 16


In [139]:
## checking for presence of the brandCodes present in the rewards_receipts table and in the brands table
brandCode_in_rewards_receipts_df = list(rewards_receipts_df['brandCode'].unique())
brandCode_in_brands_df = list(brands_df['brandCode'].unique())

brandCode_present = [x for x in brandCode_in_rewards_receipts_df if x in brandCode_in_brands_df]
print(f'Number of the brandCodes present in both the tables = {len(brandCode_present)}')

Number of the brandCodes present in both the tables = 42


Since the intersection of brandCodes is greater than that of barcodes, it reinforces the decision to use brandCode as the foreign key over barcode.

Can the current intersection number of brandCodes be improved? I propose a union of two changes:

1) The description field often contains brand information, particularly in the first word. Extracting that could serve as a potential imputation method for missing brandCodes in the rewards_receipts table.
2) Impute missing brandCodes in the brands table using the brand's name.

In [140]:
## Imputing missing brandCode information with the first word of the item description

rewards_receipts_df['description_first_word'] = rewards_receipts_df['description']\
    .apply(lambda x: x.split()[0].upper()\
            if pd.notna(x) else np.nan)
rewards_copy = rewards_receipts_df.copy()
rewards_copy['brandCode'] = rewards_copy['brandCode'].fillna(rewards_copy['description_first_word'])

## Imputing missing brandCode information with the brand name in brands table
def impute_brandcode(name):
    if pd.notna(name):

        if name.split()[0].upper() == 'TEST':
            return name.upper()
        else:
            return name.split()[0].upper()
    else:
        return np.nan

brands_df['name_first_word'] = brands_df['name']\
    .apply(impute_brandcode)
brands_copy_df = brands_df.copy()
brands_copy_df['brandCode'] = brands_copy_df['brandCode'].fillna(brands_copy_df['name_first_word'])

In [141]:
## Rationale for the above chosen imputer for missing brandCode
rewards_receipts_df[rewards_receipts_df['brandCode'].isna()][['description_first_word','description']].drop_duplicates().sample(5)

Unnamed: 0,description_first_word,description
5991,BLACK,"Black and White Easter Bunny 16 1/2"" tall by n..."
3820,1OZ,1OZ DFP
159,TASSIMO,"TASSIMO French Vanilla T-Discs, 80ct"
4214,146,146 MSLRG RLNG
1261,SASSY,SASSY COW 1% MILK


In [142]:
## Another rationale of the above chosen imputer for missing
missing_brandcode_desc_present_df = rewards_receipts_df[(rewards_receipts_df['brandCode'].isna()) & (~rewards_receipts_df['description'].isna())]
missing_brandcode_desc_present_df.groupby('description_first_word')\
    .agg({'description_first_word':'size'})\
    .rename(columns={'description_first_word':'count'})\
    .sort_values(by = ['count'],ascending= False)\
    .head(25)

Unnamed: 0_level_0,count
description_first_word,Unnamed: 1_level_1
ITEM,173
PC,138
KLARBRUNN,128
HUGGIES,93
MILLER,90
HYV,84
COMP,74
OSCAR,73
LEGO,56
PLAY,55


Because of the above imputation method, brands like KLARBRUNN, HUGGIES, MUELLER etc are coming into picture. Of course, the imputation is not perfect (for example: ITEM NOT FOUND would become ITEM). Therefore, when performing brand related queries, one needs to keep in mind about this imputation method.

In [143]:
## Rationale for the above chosen imputer for missing brandCode
brands_df[brands_df['brandCode'].isna()][['name_first_word','name']].drop_duplicates().sample(5)

Unnamed: 0,name_first_word,name
748,TEST BRAND @1595968944654,test brand @1595968944654
147,HONEST,Honest Ade
23,BOTTLED,Bottled Starbucks
441,TEST BRAND @1610039587818,test brand @1610039587818
718,SKOL,Skol


In [144]:
## checking for presence of the brandCodes present in the rewards_receipts table and in the brands table
brandCode_in_rewards_receipts_df = list(rewards_copy['brandCode'].unique())
brandCode_in_brands_df = list(brands_copy_df['brandCode'].unique())

present_brandCode_new = [x for x in brandCode_in_rewards_receipts_df if x in brandCode_in_brands_df]
print(f'Number of the brandCodes present in both the Brands table after new imputation technique = {len(present_brandCode_new)}')

Number of the brandCodes present in both the Brands table after new imputation technique = 88


In [145]:
## checking for presence of the brandCodes present in the rewards_receipts table and in the brands table
brandCode_in_rewards_receipts_df = list(rewards_receipts_df['brandCode'].unique())
brandCode_in_brands_df = list(brands_df['brandCode'].unique())

brandCode_present = [x for x in brandCode_in_rewards_receipts_df if x in brandCode_in_brands_df]
print(f'Number of the brandCodes present in both the tables before new imputation technique = {len(brandCode_present)}')

Number of the brandCodes present in both the tables before new imputation technique = 42


There has been an increase of over 200% due to the new imputation techniques. But is this the best approach? Not necessarily, though it is better than leaving the brandCodes missing. Other potential methods to explore include extracting brand information directly from the barcodes, especially if there is a pattern, such as the first four digits indicating something relevant.

The proper approach would be to map the barcodes to their respective brands, store that information, and then use it to accurately extract brand details.

#### Foreign key integrity check

Let's understand about the brandCodes which are present in rewards_receipts table but not present in the Brands table, verifying foreign key integrity. Note that the follow test is done before the proposed imputation.

In [146]:
##checking if unique brandcodes exist in receipt but not in brands

brandcodes_rewards_receipt = list(rewards_receipts_df['brandCode'].unique())
brandcodes_brands = list(brands_df['brandCode'].unique())
len(brandcodes_rewards_receipt)
missing_bar_codes = [x for x in brandcodes_rewards_receipt if x not in brandcodes_brands]
sorted(missing_bar_codes)[:8]

['7UP',
 'ADVIL',
 'AMERICAN BEAUTY',
 'ARROWHEAD',
 'AZTECA',
 'BANZA',
 'BEAR CREEK COUNTRY KITCHENS',
 'BEN AND JERRYS']

Are the brandCodes actually missing?
Answer: No.

After some initial eyeball checks, a few observations were made:

1) '7UP' in the rewards_receipts table and '7 up' in brands_df refer to the same brand, but lack consistency in structure.
2) "BEN AND JERRYS" in rewards_receipts_df and "BEN & JERRY'S" in brands_df are also inconsistent in spelling and punctuation.
3) These are just a few examples of errors, whether in spelling or punctuation, which impact the consistency of brandCodes. 

Given that brandCode is used as a foreign key, this poses a significant issue.

#### Data Quality Issue spotted.
Creating a data pipeline to directly replace eyeball checks for grammatical errors would be challenging. The core issue, as mentioned earlier, lies in the lack of a proper mapping between barcode and brand information. Establishing this mapping would greatly improve data quality.

#### Data format consistency check.

Given the presence of barcodes, it is essential that they follow a consistent format: 1) numeric and 2) 12 digits long.

In [147]:
## Checking if Barcodes are numeric type variables.
rewards_receipt_df_bc_present = rewards_receipts_df[~rewards_receipts_df['barcode'].isna()]
rewards_receipt_df_bc_present['barcode'].dtype

dtype('O')

#### Data Quality Issue Spotted
It is an object datatype.

In [148]:
## Verifying if the barcode is in consistent format -> 12 digit
if rewards_receipt_df_bc_present[rewards_receipt_df_bc_present['barcode'].apply(lambda x: len(str(x))) != 12].empty:
    print('All the barcodes are in consistent format -> 12 digits')
else:
    print('Barcodes is not in consistent format \n Examples:\n')
    print(rewards_receipt_df_bc_present[rewards_receipt_df_bc_present['barcode'].apply(lambda x: len(str(x))) != 12]['barcode'].sample(5))

Barcodes is not in consistent format 
 Examples:

6878    B07BRRLSVC
2608          4011
171           4011
657           4011
59            4011
Name: barcode, dtype: object


#### Data Quality Issue spotted
Some of the values contain letters and some of them are not 12 digits, indicating the data quality issue with the barcodes.

In [149]:
#### Boolean Variables consistency check
bool_cols = ['needsFetchReview','preventTargetGapPoints','userFlaggedNewItem','competitiveProduct','deleted']
for col in bool_cols:
    print(f'Datatype of {col} column is {rewards_receipts_df[col].dtype}')


Datatype of needsFetchReview column is object
Datatype of preventTargetGapPoints column is object
Datatype of userFlaggedNewItem column is object
Datatype of competitiveProduct column is object
Datatype of deleted column is object


#### Data Quality Issue Spotted

There is a need to convert the above variables into boolean type variables.

In [150]:
#### Making sure Boolean variables in Rewards_receipts_df to be Boolean values
for col in bool_cols:
    rewards_receipts_df[col] = rewards_receipts_df[col].astype('boolean')

#### Finalizing Data Quality Assessment of Rewards_Receipts table

In [151]:
#### Implementing the above brandCode imputation methods that were discussed to finalize Data Quality Assessment

brands_df['brandCode'] = brands_df['brandCode'].fillna(brands_df['name_first_word'])
rewards_receipts_df['brandCode'] = rewards_receipts_df['brandCode'].fillna(rewards_receipts_df['description_first_word'])

#### Sample Great Expectations framework

In [152]:
## Unique test:

context = ge.get_context()
data_source = context.data_sources.add_pandas("pandas")
data_asset = data_source.add_dataframe_asset(name="pd dataframe asset")

batch_definition = data_asset.add_batch_definition_whole_dataframe("batch definition")
batch = batch_definition.get_batch(batch_parameters={"dataframe": users_df_no_duplicates})

#expect user_id to exist:
unique_user_expectation = ge.expectations.ExpectColumnValuesToBeUnique(column = 'user_id')
validation_result = batch.validate(unique_user_expectation)
print(validation_result)

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 212,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_counts": [],
    "partial_unexpected_index_list": []
  },
  "expectation_config": {
    "kwargs": {
      "batch_id": "pandas-pd dataframe asset",
      "column": "user_id"
    },
    "type": "expect_column_values_to_be_unique",
    "meta": {}
  },
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "meta": {}
}


#### Data loading to MYSQL

Now the data is ready to be loaded in the MYSQL database.

In [153]:
## dropping the items list from receipts_df since we already have extracted the table Rewards_Receipts:

new_receipts_df = receipts_df.drop(columns = 'rewardsReceiptItemList')
new_receipts_df.head()

Unnamed: 0,bonusPointsEarned,bonusPointsEarnedReason,pointsEarned,purchasedItemCount,rewardsReceiptStatus,totalSpent,userId,receipt_id,created_date_time,scanned_date_time,finished_date_time,modify_date_time,pointsAwarded_date_time,purchased_date_time
0,500.0,"Receipt number 2 completed, bonus point schedu...",500.0,5.0,FINISHED,26.0,5ff1e1eacfcf6c399c274ae6,5ff1e1eb0a720f0523000575,2021-01-03 09:25:31,2021-01-03 09:25:31,2021-01-03 09:25:31,2021-01-03 09:25:36,2021-01-03 09:25:31,2021-01-02 18:00:00
1,150.0,"Receipt number 5 completed, bonus point schedu...",150.0,2.0,FINISHED,11.0,5ff1e194b6a9d73a3a9f1052,5ff1e1bb0a720f052300056b,2021-01-03 09:24:43,2021-01-03 09:24:43,2021-01-03 09:24:43,2021-01-03 09:24:48,2021-01-03 09:24:43,2021-01-02 09:24:43
2,5.0,All-receipts receipt bonus,5.0,1.0,REJECTED,10.0,5ff1e1f1cfcf6c399c274b0b,5ff1e1f10a720f052300057a,2021-01-03 09:25:37,2021-01-03 09:25:37,NaT,2021-01-03 09:25:42,NaT,2021-01-02 18:00:00
3,5.0,All-receipts receipt bonus,5.0,4.0,FINISHED,28.0,5ff1e1eacfcf6c399c274ae6,5ff1e1ee0a7214ada100056f,2021-01-03 09:25:34,2021-01-03 09:25:34,2021-01-03 09:25:34,2021-01-03 09:25:39,2021-01-03 09:25:34,2021-01-02 18:00:00
4,5.0,All-receipts receipt bonus,5.0,2.0,FINISHED,1.0,5ff1e194b6a9d73a3a9f1052,5ff1e1d20a7214ada1000561,2021-01-03 09:25:06,2021-01-03 09:25:06,2021-01-03 09:25:11,2021-01-03 09:25:11,2021-01-03 09:25:06,2021-01-02 09:25:06


In [154]:
## Not Including Description first word in Rewards_receipts table

rewards_receipts_df = rewards_receipts_df[[x for x in rewards_receipts_df.columns if x != 'description_first_word']]

In [155]:
#### Extracting MYSQL configuration files

file_path = os.path.join(root_dir,'MYSQL_user_config.yaml')
config = read_yaml(file_path)

user = config.get('MYSQL_credentials')['user']
password = config.get('MYSQL_credentials')['password']
host = config.get('MYSQL_credentials')['host']
port = config.get('MYSQL_credentials')['port']

##### Make sure that MYSQL 'services' is running locally. In windows OS,one can always check witn Win + R and searching for services.msc) and starting MYSQL

In [156]:
## Creating the Database

db_url = f'mysql+pymysql://{user}:{password}@{host}:{port}'
db_name = "FETCH_DB_Amarthya"

create_database(db_url, db_name)

Database 'FETCH_DB_Amarthya' created successfully!


In [157]:
##Engine creation for data loading
engine = create_engine(f'{db_url}'+'/'+f'{db_name}')
run_query(engine, f'USE {db_name}')


Error in the query -> This result object does not return rows. It has been closed automatically.


In [158]:
## Creating Users table
create_user_df_query = """
CREATE TABLE users (
    user_id VARCHAR(24) PRIMARY KEY,
    created_date_time DATETIME(3),
    lastLogin_date_time DATETIME(3),
    state VARCHAR(2),
    signUpSource VARCHAR(15),
    role VARCHAR(15),
    active BOOLEAN
)
"""
run_query(engine,'DROP TABLE IF EXISTS users')
run_query(engine,create_user_df_query)

Error in the query -> This result object does not return rows. It has been closed automatically.
Error in the query -> This result object does not return rows. It has been closed automatically.


In [159]:
## creating Brands table

create_brands_df_query = """
CREATE TABLE brands (
    brand_id VARCHAR(24) PRIMARY KEY,
    name VARCHAR(25),
    category VARCHAR(25),
    categoryCode VARCHAR(25),
    barcode VARCHAR(20),
    cpg_id VARCHAR(6),
    cpg_ref VARCHAR(6),
    brandCode VARCHAR(50),
    topBrand BOOLEAN
);
"""
run_query(engine,'DROP TABLE IF EXISTS brands')
run_query(engine,create_brands_df_query)

Error in the query -> This result object does not return rows. It has been closed automatically.
Error in the query -> This result object does not return rows. It has been closed automatically.


In [160]:
## Creating Receipts table

create_receipts_df_query = """
CREATE TABLE receipts (
    receipt_id VARCHAR(25) PRIMARY KEY,
    created_date_time DATETIME(3),
    scanned_date_time DATETIME(3),
    finished_date_time DATETIME(3),
    modify_date_time DATETIME(3),
    purchased_date_time DATETIME(3),
    pointsAwarded_date_time DATETIME(3),
    bonusPointsEarned DECIMAL(7,3),
    bonusPointsEarnedReason VARCHAR(35),
    pointsEarned DECIMAL(7,3),
    purchasedItemCount INT(7),
    rewardsReceiptStatus VARCHAR(10),
    totalSpent DECIMAL(7,3),
    userId VARCHAR(24)
);
"""
run_query(engine,'DROP TABLE IF EXISTS receipts')
run_query(engine,create_receipts_df_query)

Error in the query -> This result object does not return rows. It has been closed automatically.
Error in the query -> This result object does not return rows. It has been closed automatically.


In [161]:
## Creating rewards_receipts table

create_rewards_receipts_df_query = """
CREATE TABLE rewards_receipts (
    barcode VARCHAR(20),
    description VARCHAR(50),
    finalPrice DECIMAL(8,3),
    itemPrice DECIMAL(8,3),
    needsFetchReview Boolean,
    partnerItemId INT,
    preventTargetGapPoints Boolean,
    quantityPurchased DECIMAL(8,3),
    userFlaggedBarcode VARCHAR(20),
    userFlaggedNewItem BOOLEAN,
    userFlaggedPrice DECIMAL(8,3),
    userFlaggedQuantity DECIMAL(8,3),
    needsFetchReviewReason VARCHAR(20),
    pointsNotAwardedReason VARCHAR(50),
    pointsPayerId VARCHAR(25),
    rewardsGroup VARCHAR(20),
    rewardsProductPartnerId VARCHAR(25),
    userFlaggedDescription VARCHAR(50),
    originalMetaBriteBarcode VARCHAR(20),
    originalMetaBriteDescription VARCHAR(50),
    brandCode VARCHAR(20),
    competitorRewardsGroup VARCHAR(30),
    discountedItemPrice DECIMAL(8,3),
    originalReceiptItemText VARCHAR(50),
    itemNumber INT,
    originalMetaBriteQuantityPurchased DECIMAL(8,3),
    pointsEarned DECIMAL(8,3),
    targetPrice DECIMAL(8,3),
    competitiveProduct BOOLEAN,
    originalFinalPrice DECIMAL(8,3),
    originalMetaBriteItemPrice DECIMAL(8,3),
    deleted BOOLEAN,
    priceAfterCoupon DECIMAL(8,3),
    metabriteCampaignId VARCHAR(50),
    receipt_id VARCHAR(25),
    PRIMARY KEY (receipt_id,partnerItemId)    
);
"""
run_query(engine,'DROP TABLE IF EXISTS rewards_receipts')
run_query(engine,create_rewards_receipts_df_query)

Error in the query -> This result object does not return rows. It has been closed automatically.
Error in the query -> This result object does not return rows. It has been closed automatically.


In [162]:
## Loading tables:

### Time to load tables

#user_df
users_df_no_duplicates.to_sql('users',con = engine, if_exists = 'replace',index = False)

#brands_df
brands_df.to_sql('brands',con = engine, if_exists = 'replace',index = False)

#receipts_df
new_receipts_df.to_sql('receipts',con = engine, if_exists = 'replace',index = False)

#rewards_receipts_df
rewards_receipts_df.to_sql('rewards_receipts',con = engine, if_exists = 'replace',index = False)

6941