In [66]:
## Importing necessary libraries
import pandas as pd
import numpy as np
import os
import json
import jsonlines
import datetime
import great_expectations as ge
from sqlalchemy import create_engine
import pymysql
import yaml
from sqlalchemy import create_engine
from sqlalchemy import text
import warnings
warnings.filterwarnings("ignore")

pd.set_option('display.max_rows', 100)  # Set maximum rows to 100
pd.set_option('display.max_columns', 100) # Set maximum columns to 100

In [67]:
## Loading the flattened CSV files (one per table) from the current working directory.

root_dir = os.getcwd()

# The file order below matches the order of the target names on the left-hand side.
brands_df, users_df, receipts_df, rewards_receipts_df = (
    pd.read_csv(os.path.join(root_dir, csv_name))
    for csv_name in ('brands.csv', 'users.csv', 'receipts.csv', 'rewards_receipts.csv')
)

In [68]:
## Functions:
# Function to read YAML file
def read_yaml(file_path):
    """Parse the YAML file at ``file_path`` with ``yaml.safe_load``.

    Parameters:
        file_path: Path of the YAML file; it is opened in text read mode.

    Returns:
        The object produced by ``yaml.safe_load``, or ``None`` when the
        content cannot be parsed (the parser error is printed, not raised).
    """
    with open(file_path, 'r') as handle:
        try:
            return yaml.safe_load(handle)
        except yaml.YAMLError as parse_error:
            # Best-effort behaviour: report the problem and fall through to None.
            print(f"Error reading YAML file: {parse_error}")
    return None

#### Data Quality Assessment - Users table

Overview of data quality checks for the Users table:
1) Schema Verification: Ensuring the schema of the table (possibly with new data) is consistent with the already established schema for the Users table.
2) Checking for Duplicates: Ensuring there are no duplicate records.
3) Primary Key Uniqueness: Verifying that the assigned primary key is unique.
4) Missing Values: Identifying the columns with missing values.
5) Timestamp Validation: Ensuring the last login date is after the created date.
6) Data Type Validation: Ensuring appropriate data types for all the columns.
7) Miscellaneous: Verifying the range of the columns.

In [69]:
## Overview of the contents of the Users table (first five rows)
users_df.head()

Unnamed: 0,active,role,signUpSource,state,user_id,created_date_time,lastLogin_date_time
0,True,consumer,Email,WI,5ff1e194b6a9d73a3a9f1052,2021-01-03 09:24:04.800,2021-01-03 09:25:37.858
1,True,consumer,Email,WI,5ff1e194b6a9d73a3a9f1052,2021-01-03 09:24:04.800,2021-01-03 09:25:37.858
2,True,consumer,Email,WI,5ff1e194b6a9d73a3a9f1052,2021-01-03 09:24:04.800,2021-01-03 09:25:37.858
3,True,consumer,Email,WI,5ff1e1eacfcf6c399c274ae6,2021-01-03 09:25:30.554,2021-01-03 09:25:30.597
4,True,consumer,Email,WI,5ff1e194b6a9d73a3a9f1052,2021-01-03 09:24:04.800,2021-01-03 09:25:37.858


#### 1) Schema Verification:


In [70]:
## Listing the column names of the Users table for the schema check
users_df.columns

Index(['active', 'role', 'signUpSource', 'state', 'user_id',
       'created_date_time', 'lastLogin_date_time'],
      dtype='object')

Check passed. A more formal approach would be to write an expectation and verify the schema against it, which is discussed at the end.

#### 2) Checking for Duplicates:

In [71]:
## Flagging fully duplicated rows and reporting their share of the table

is_duplicate_row = users_df.duplicated()
users_duplicates = users_df.loc[is_duplicate_row]
duplicate_pct = round(100 * len(users_duplicates) / len(users_df), 4)
print(f'Percentage of Duplicate user records :{duplicate_pct}')

Percentage of Duplicate user records :57.1717


#### Data Quality Issue spotted!!
57.17% of the records in the users table are duplicates, which is a serious data quality issue. The duplicate records are removed to ensure data quality.

In [72]:
## Deleting duplicate records
# .copy() makes the de-duplicated frame fully independent of users_df, so the
# column assignments made on users_df_no_duplicates later in this notebook do not
# raise pandas' SettingWithCopyWarning (which warnings.filterwarnings("ignore")
# would otherwise silently hide).
users_df_no_duplicates = users_df.drop_duplicates().copy()

#### 3) Checking if the primary key 'user_id' is unique

In [73]:
## Checking if 'user_id' is unique once exact duplicate rows are removed
print(users_df_no_duplicates['user_id'].is_unique)

True


In [74]:
## Checking if user_id has any missing values (counts the nulls)
users_df_no_duplicates['user_id'].isna().sum()

0

These checks confirm that the chosen primary key, user_id, is unique and has no missing values. Data quality check passed!

#### 4) Analyzing Missing values

In [75]:
## Share of missing values per column, in percent

users_df_no_duplicates.isna().sum().mul(100).div(users_df_no_duplicates.shape[0])

active                  0.000000
role                    0.000000
signUpSource            2.358491
state                   2.830189
user_id                 0.000000
created_date_time       0.000000
lastLogin_date_time    18.867925
dtype: float64

In [76]:
## Rows where 'signUpSource' is missing
users_df_no_duplicates.loc[users_df_no_duplicates['signUpSource'].isnull()]

Unnamed: 0,active,role,signUpSource,state,user_id,created_date_time,lastLogin_date_time
388,True,consumer,,WI,55308179e4b0eabd8f99caa2,2015-04-16 22:43:53.186,2018-05-07 12:23:40.003
395,True,fetch-staff,,WI,59c124bae4b0299e55b0f330,2017-09-19 09:07:54.302,2021-02-08 10:42:58.117
422,True,consumer,,,5a43c08fe4b014fd6b6a0612,2017-12-27 09:47:27.059,2021-02-12 10:22:37.155
462,True,fetch-staff,,IL,5964eb07e4b03efd0c0f267b,2017-07-11 10:13:11.771,2021-03-04 13:07:49.770
475,True,fetch-staff,,,54943462e4b07e684157a532,2014-12-19 08:21:22.381,2021-03-05 10:52:23.204


In [77]:
## Unique values of signUpSource (missing values show up as nan)
users_df_no_duplicates['signUpSource'].unique()

array(['Email', 'Google', nan], dtype=object)

3 out of the 5 missing values occur for users whose role is fetch-staff. The unique values for the sign-up source are Email and Google. It is quite possible that fetch-staff sign up through an internal source that is neither Email nor Google. For better data quality, a sign-up source exclusively for staff members could be allotted if that is the case. The list of sources could also be expanded to include other valid sources such as **App Store**, **Referral**, and other relevant platforms, to guarantee that this field is complete for all users. If the field is still missing, it could be imputed as 'others'.

In [78]:
## Rows where 'state' is missing
users_df_no_duplicates.loc[users_df_no_duplicates['state'].isnull()]

Unnamed: 0,active,role,signUpSource,state,user_id,created_date_time,lastLogin_date_time
344,True,consumer,Email,,60145ff384231211ce796d51,2021-01-29 13:20:19.722,
375,True,consumer,Email,,60186237c8b50e11d8454d5f,2021-02-01 14:19:03.551,
422,True,consumer,,,5a43c08fe4b014fd6b6a0612,2017-12-27 09:47:27.059,2021-02-12 10:22:37.155
432,True,fetch-staff,Email,,5fbc35711d967d1222cbfefc,2020-11-23 16:19:29.509,2021-02-25 22:25:51.057
455,True,fetch-staff,Email,,5fa41775898c7a11a6bcef3e,2020-11-05 09:17:09.396,2021-03-04 10:02:02.026
475,True,fetch-staff,,,54943462e4b07e684157a532,2014-12-19 08:21:22.381,2021-03-05 10:52:23.204


In [79]:
## Unique values of state (missing values show up as nan)
users_df_no_duplicates['state'].unique()

array(['WI', 'KY', 'AL', 'CO', 'IL', nan, 'OH', 'SC', 'NH'], dtype=object)

Missing values could be imputed with 'DNF' (Did not fill), 'N/A', or 'None', as appropriate.

In [80]:
## Users whose last login timestamp is missing

missing_last_login = users_df_no_duplicates.loc[users_df_no_duplicates['lastLogin_date_time'].isnull()]
missing_last_login.head()
# NOTE(review): disabled snippet kept for reference -- it computes the percentage
# of these users per signUpSource; re-enable it in its own cell to display the split.
# missing_last_login.groupby('signUpSource')\
#     .agg({'signUpSource':'size'})\
#     .rename(columns = {'signUpSource':'count'})\
#     .sort_values(by = 'count',ascending= False)\
#     *100/(missing_last_login.shape[0])

Unnamed: 0,active,role,signUpSource,state,user_id,created_date_time,lastLogin_date_time
97,True,consumer,Email,KY,5ff616a68f142f11dd189163,2021-01-06 13:59:34.996,
143,True,consumer,Email,AL,5ffe115404929101d0aaebb2,2021-01-12 15:15:00.208,
170,True,consumer,Google,WI,5e27526d0bdb6a138c32b556,2020-01-21 13:35:09.795,
180,True,consumer,Email,WI,6002475cfb296c121a81b98d,2021-01-15 19:54:36.571,
181,True,consumer,Email,WI,60024f24e257124ec6b99a13,2021-01-15 20:27:49.345,


The missing last login data indicates that users are not immediately logging in after signing up. Of the users with missing last login dates, 95% signed up via email, while only 5% signed up through Google. This suggests a potential bottleneck in the email sign-up process, where users may be unable to log in right after signing up. One possible cause could be permission issues or other technical restrictions that prevent the app from opening or proceeding to the login page after email registration. Further investigation is needed to identify and address this issue to streamline the sign-up and login experience, especially for email users.

It would be interesting to check whether the user_ids from the above table are present in receipts_df. The expectation is that these user_ids would not be present in receipts_df.

In [81]:
## Checking whether users with a missing last login date appear in the receipts table.
# Inner join keeps only receipts whose userId matches one of those users.
receipt_view_columns = ['user_id','created_date_time_x','lastLogin_date_time',
                        'bonusPointsEarned','receipt_id','scanned_date_time']
missing_last_login_merge_receipts = pd.merge(
    missing_last_login,
    receipts_df,
    how = 'inner',
    left_on = 'user_id',
    right_on = 'userId',
)[receipt_view_columns]
missing_last_login_merge_receipts.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 57 entries, 0 to 56
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   user_id              57 non-null     object 
 1   created_date_time_x  57 non-null     object 
 2   lastLogin_date_time  0 non-null      object 
 3   bonusPointsEarned    32 non-null     float64
 4   receipt_id           57 non-null     object 
 5   scanned_date_time    57 non-null     object 
dtypes: float64(1), object(5)
memory usage: 3.1+ KB


#### Data Quality Issue spotted!!
It is interesting to see that the above data suggests that users with no recorded last login date are still able to scan receipts and earn bonus points. This is unexpected behavior, as we would typically expect a last login date to be present once a user interacts with the app to scan receipts. This anomaly needs to be investigated further, as it could indicate a logging issue or a flaw in the system where login events are not properly captured before allowing user actions such as receipt scanning.

#### 5) Timestamp validation

In [82]:
## Checking if last login date is after creation date
# The timestamp columns are still plain strings at this point, so comparing them
# directly would be a lexicographic comparison that is only correct for
# fixed-width ISO text. Parse them first so the check is a real chronological
# comparison regardless of cell execution order. Missing last logins become NaT
# and never satisfy the "<" condition, so they are not reported as violations.
timestamp_format = '%Y-%m-%d %H:%M:%S.%f'
created_parsed = pd.to_datetime(users_df_no_duplicates['created_date_time'], format=timestamp_format)
last_login_parsed = pd.to_datetime(users_df_no_duplicates['lastLogin_date_time'], format=timestamp_format)

users_df_no_duplicates[last_login_parsed < created_parsed].shape

(0, 7)

No data entries where last login date and time is before creation date and time.

#### 6) Appropriate formatting of the data

In [83]:
## Data types of the columns in the de-duplicated Users table
users_df_no_duplicates.dtypes

active                   bool
role                   object
signUpSource           object
state                  object
user_id                object
created_date_time      object
lastLogin_date_time    object
dtype: object

I would check if there are any discrepancies in the way the strings are entered. Discrepancy definition: 'GOOGLE' and 'Google'

In [84]:
## Listing the distinct values of the categorical columns to spot inconsistent spellings
for col_name in ('active', 'role', 'signUpSource', 'state'):
    distinct_values = users_df_no_duplicates[col_name].unique()
    print(distinct_values)

[ True False]
['consumer' 'fetch-staff']
['Email' 'Google' nan]
['WI' 'KY' 'AL' 'CO' 'IL' nan 'OH' 'SC' 'NH']


There are no string discrepancies.

It would be better to convert Date columns from strings to datetime objects.

In [85]:
## Converting the timestamp columns from strings to datetime objects
for ts_column in ('created_date_time', 'lastLogin_date_time'):
    users_df_no_duplicates[ts_column] = pd.to_datetime(
        users_df_no_duplicates[ts_column],
        format='%Y-%m-%d %H:%M:%S.%f',
    )


#### 7) Other Miscellaneous checks:

I would check whether the values in the state column are valid abbreviations of US states.


In [86]:
## Checking if the states from users_table are a subset of the states_abbreviations
# NOTE(review): the list holds the 50 state codes only; 'DC' and territory codes
# would be reported as invalid -- confirm that this is intended.
states_abbr = [
    'AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA', 'HI', 'ID', 
    'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA', 'MI', 'MN', 'MS', 
    'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 
    'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 
    'WI', 'WY', np.nan
]

# Missing states are excluded explicitly instead of relying on `nan in list`,
# which only matches when the value is the very same object as np.nan (NaN never
# compares equal to itself). Missing values are analysed separately above.
states_present = users_df_no_duplicates['state'].dropna()

# One entry per offending row; an empty list means every state is valid.
states_present[~states_present.isin(states_abbr)].tolist()

[]

In [87]:
## Date range checks for the datetime columns
## Accepted window: 2000-01-01 through 2100-12-31
start_date = pd.Timestamp('2000-01-01 00:00:00.000')
end_date = pd.Timestamp('2100-12-31 00:00:00.000')

for column in ('created_date_time', 'lastLogin_date_time'):
    column_values = users_df_no_duplicates[column]
    # Missing timestamps compare as False on both sides, so they never fail the check.
    outside_window = column_values.gt(end_date) | column_values.lt(start_date)
    if outside_window.any():
        print(f'Check failed for {column}')
    else:
        print(f'Check passed for {column}')

Check passed for created_date_time
Check passed for lastLogin_date_time


Checking the range of created_date_time and last_login_date_time to understand whether data is missing for certain months or years.

In [88]:
# Deriving a year-month period column so sign-ups can be counted per month
created_column = users_df_no_duplicates['created_date_time']
users_df_no_duplicates['created_year_month'] = created_column.dt.to_period('M')

# Earliest and latest creation timestamps
created_range = (created_column.min(), created_column.max())
print("Created date range:", created_range)

# Number of users created in each month, in chronological order; months with
# no sign-ups simply do not appear in the result
created_counts = (
    users_df_no_duplicates['created_year_month']
    .value_counts()
    .sort_index()
)

print("Created counts by month:\n",created_counts)

Created date range: (Timestamp('2014-12-19 08:21:22.381000'), Timestamp('2021-02-12 08:11:06.240000'))
Created counts by month:
 2014-12      1
2015-04      1
2017-07      1
2017-09      1
2017-12      1
2020-01      1
2020-07      1
2020-11      4
2020-12      1
2021-01    170
2021-02     30
Freq: M, Name: created_year_month, dtype: int64


#### Data Quality issue spotted!! 

The sporadic distribution of `created_date_time`, with entire years like **2016**, **2018**, and **2019** missing, seems suspicious. Unless there's a specific reason for these gaps, I would flag it as a potential data quality issue for further investigation.

#### Data Quality Assessment - Brands table

In [90]:
## Overview of the contents of the Brands table (first five rows)
brands_df.head()

Unnamed: 0,barcode,category,categoryCode,name,topBrand,brand_id,cpg_id,cpg_ref,brandCode
0,511111019862,Baking,BAKING,test brand @1612366101024,False,601ac115be37ce2ead437551,601ac114be37ce2ead437550,Cogs,
1,511111519928,Beverages,BEVERAGES,Starbucks,False,601c5460be37ce2ead43755f,5332f5fbe4b03c9a25efd0ba,Cogs,STARBUCKS
2,511111819905,Baking,BAKING,test brand @1612366146176,False,601ac142be37ce2ead43755d,601ac142be37ce2ead437559,Cogs,TEST BRANDCODE @1612366146176
3,511111519874,Baking,BAKING,test brand @1612366146051,False,601ac142be37ce2ead43755a,601ac142be37ce2ead437559,Cogs,TEST BRANDCODE @1612366146051
4,511111319917,Candy & Sweets,CANDY_AND_SWEETS,test brand @1612366146827,False,601ac142be37ce2ead43755e,5332fa12e4b03c9a25efd1e7,Cogs,TEST BRANDCODE @1612366146827
