In [305]:
import pandas as pd
import numpy as np
import os
import json
import jsonlines
import datetime
import great_expectations as ge
from sqlalchemy import create_engine
import pymysql
import yaml
from sqlalchemy import create_engine
from sqlalchemy import text



pd.set_option('display.max_rows', 100)  # Set maximum rows to 100
pd.set_option('display.max_columns', 100) # Set maximum columns to 20

In [7]:
# pip install great-expectations

In [8]:
root_dir = os.getcwd()

In [9]:
##reading csvs for detail data quality checks.

brands_df = pd.read_csv(os.path.join(root_dir,'brands.csv'))
users_df = pd.read_csv(os.path.join(root_dir,'users.csv'))
receipts_df = pd.read_csv(os.path.join(root_dir,'receipts.csv'))
rewards_receipts_df = pd.read_csv(os.path.join(root_dir,'rewards_receipts.csv'))

## Data Quality issues

1) Some of the purchase dates are after the scanning date. (TBD: check how data behaves in this scenario)
2) Barcodes don't have specific shape (for example some barcodes are 4 digits) - this is maybe due to the fact that the items dont have barcodes. (TBD: check how data behanves in this scenario)
3) Not an issue but remember this: Same brandcode can have different barcode.
4) Bar code is present but BrandCode is missing in the rewards_receipts_df. But in their description, the first word of the description. This word can be used to subsititute (curious to know if this has any impact in the brand related questions)
5) Foreign key between rewards_receipts df and brands could be brand code
6) Not all brandcodes (non nans) in rewards_receipts_df are present in brands_df. (example difference: 'Ben & jerry's' and 'BEN AND JERRY') - This is a data quality issue. Ed2: '7 UP' and '7UP'
7) Duplicates in user_df and brands_df
8) Empty rewards_receipt_df_list are present.
9) Letters in Barcodes

In [10]:
#modify the code
receipts_df[receipts_df['rewardsReceiptItemList'].isna()]

Unnamed: 0,bonusPointsEarned,bonusPointsEarnedReason,pointsEarned,purchasedItemCount,rewardsReceiptItemList,rewardsReceiptStatus,totalSpent,userId,receipt_id,created_date_time,scanned_date_time,finished_date_time,modify_date_time,pointsAwarded_date_time,purchased_date_time
71,,,,,,SUBMITTED,,5a43c08fe4b014fd6b6a0612,5ff475820a7214ada10005cf,2021-01-05 08:19:46.000,2021-01-05 08:19:46.000,,2021-01-05 08:19:46.000,,
93,,,,,,SUBMITTED,,5a43c08fe4b014fd6b6a0612,5ff5ecb90a7214ada10005f9,2021-01-06 11:00:40.000,2021-01-06 11:00:40.000,,2021-01-06 11:00:40.000,,
149,,,,,,SUBMITTED,,5ff7264e8f142f11dd189504,5ff726860a720f05230005ec,2021-01-07 09:19:34.000,2021-01-07 09:19:34.000,,2021-01-07 09:19:34.000,,
175,,,,0.0,,REJECTED,0.0,5ff8da28b3348b11c9337ac6,5ff8da570a720f05c5000015,2021-01-08 16:19:03.000,2021-01-08 16:19:03.000,,2021-01-08 16:19:04.000,,
212,,,,,,SUBMITTED,,59c124bae4b0299e55b0f330,5ffce8570a7214ad4e003e6f,2021-01-11 18:07:51.000,2021-01-11 18:07:51.000,,2021-01-11 18:07:51.000,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1110,,,,,,SUBMITTED,,5fc961c3b8cfca11a077dd33,603c6adf0a720fde1000039a,2021-02-28 22:17:35.736,2021-02-28 22:17:35.736,,2021-02-28 22:17:35.736,,
1111,,,,,,SUBMITTED,,5fc961c3b8cfca11a077dd33,603c9e6e0a720fde100003c7,2021-03-01 01:57:34.307,2021-03-01 01:57:34.307,,2021-03-01 01:57:34.307,,
1115,,,,,,SUBMITTED,,5fc961c3b8cfca11a077dd33,603d0b710a720fde1000042a,2021-03-01 09:42:41.873,2021-03-01 09:42:41.873,,2021-03-01 09:42:41.873,,
1116,,,,,,SUBMITTED,,5fc961c3b8cfca11a077dd33,603cf5290a720fde10000413,2021-03-01 08:07:37.664,2021-03-01 08:07:37.664,,2021-03-01 08:07:37.664,,


In [11]:
rewards_receipts_df['barcode'].fillna(0)

0               4011
1               4011
2       028400642255
3                  0
4               4011
            ...     
6936      B07BRRLSVC
6937      B076FJ92M4
6938      B07BRRLSVC
6939      B076FJ92M4
6940      B07BRRLSVC
Name: barcode, Length: 6941, dtype: object

In [12]:
rewards_receipts_df[(rewards_receipts_df['brandCode'] == 'KLEENEX') & (rewards_receipts_df['barcode'] != "036000391718")]

Unnamed: 0,barcode,description,finalPrice,itemPrice,needsFetchReview,partnerItemId,preventTargetGapPoints,quantityPurchased,userFlaggedBarcode,userFlaggedNewItem,...,originalMetaBriteQuantityPurchased,pointsEarned,targetPrice,competitiveProduct,originalFinalPrice,originalMetaBriteItemPrice,deleted,priceAfterCoupon,metabriteCampaignId,receipt_id
2862,36000119749,KLEENEX POCKET RCH IN WRP FACIAL TISSUE 2 PLY ...,5.64,5.64,,1202,,2.0,,,...,,56.4,,,,,,,KLEENEX TRUSTED CARE FACIAL TISSUES 1 - 59 COU...,60099c3c0a7214ad89000135


In [13]:
rewards_receipts_df[rewards_receipts_df['brandCode'] == 'KLEENEX'].groupby('description').agg({'description':'size'}).rename(columns = {'description':'count'}).sort_values(by = 'count',ascending = False)

Unnamed: 0_level_0,count
description,Unnamed: 1_level_1
KLEENEX POP UP RECTANGLE BOX FACIAL TISSUE 2 PLY 8PK 160 CT,87
KLEENEX POCKET RCH IN WRP FACIAL TISSUE 2 PLY 8PK 15 CT,1


In [14]:
rewards_receipts_df.groupby('brandCode').agg({'brandCode':'size'}).rename(columns = {'brandCode':'count'}).sort_values(by = 'count',ascending = False)

Unnamed: 0_level_0,count
brandCode,Unnamed: 1_level_1
HY-VEE,291
BEN AND JERRYS,180
PEPSI,93
KROGER,89
KLEENEX,88
...,...
GERM-X,1
PURINA ONE,1
GREY POUPON,1
GRIMMWAY FARMS,1


In [15]:
##checking if unique brandcodes exist in receipt but not in brands

brandcodes_rewards_receipt = list(rewards_receipts_df['brandCode'].unique())
brandcodes_brands = list(brands_df['brandCode'].unique())
len(brandcodes_rewards_receipt)
missing_bar_codes = [x for x in brandcodes_rewards_receipt if x not in brandcodes_brands]
sorted(missing_bar_codes)

['7UP',
 'ADVIL',
 'AMERICAN BEAUTY',
 'ARROWHEAD',
 'AZTECA',
 'BANZA',
 'BEAR CREEK COUNTRY KITCHENS',
 'BEN AND JERRYS',
 'BETTY CROCKER',
 'BIC',
 'BIGELOW',
 'BLUE DIAMOND',
 "BOAR'S HEAD",
 'BORDEN',
 'BOTA BOX',
 'BRAND',
 "BRASWELL'S",
 'BUNNY',
 "BUSH'S BEST",
 'C&H',
 'CADBURY',
 'CAL-ORGANIC FARMS',
 'CALIFIA FARMS',
 "CAMPBELL'S",
 'CARAMELLO',
 'CHEERIOS',
 'CHEESE',
 'CHEEZ-IT',
 'CHEX',
 'CHICKEN OF THE SEA',
 'CHIQUITA',
 'CINNAMON TOAST CRUNCH',
 'COKE',
 'COLEMAN NATURAL',
 "CONNIE'S PIZZA",
 'CREST 3D WHITE',
 'CRISPIX',
 'DANNON',
 'DARE',
 'DELI',
 'DIET COKE',
 'DIGIORNO',
 'DOLE',
 'DR PEPPER',
 'EDWARDS',
 "EGGLAND'S BEST",
 'EGGO',
 'EL MONTEREY',
 'ENERGIZER MAX',
 'ESSENTIAL EVERYDAY',
 'FAGE',
 "FAMOUS DAVE'S",
 "FLORIDA'S NATURAL",
 'FOLGERS',
 'FORTUNE YAKISOBA',
 'FRANZ',
 "FRENCH'S",
 'FRESH EXPRESS',
 'FRESH STEP',
 'FRONTERA',
 'GALLO FAMILY VINEYARDS',
 'GENERAL MILLS',
 'GERBER',
 'GERM-X',
 'GREEN GIANT',
 'GRIMMWAY FARMS',
 'HANOVER',
 'HARVEST SNA

In [19]:
##checking missing brandcodes

rewards_receipts_df['brandCode'].isna().sum()

4341

In [20]:
rewards_receipts_df.shape

(6941, 35)

In [21]:
rewards_receipts_df[rewards_receipts_df['brandCode'].isna()]

Unnamed: 0,barcode,description,finalPrice,itemPrice,needsFetchReview,partnerItemId,preventTargetGapPoints,quantityPurchased,userFlaggedBarcode,userFlaggedNewItem,...,originalMetaBriteQuantityPurchased,pointsEarned,targetPrice,competitiveProduct,originalFinalPrice,originalMetaBriteItemPrice,deleted,priceAfterCoupon,metabriteCampaignId,receipt_id
0,4011,ITEM NOT FOUND,26.00,26.00,False,1,True,5.0,4.011000e+03,True,...,,,,,,,,,,5ff1e1eb0a720f0523000575
1,4011,ITEM NOT FOUND,1.00,1.00,,1,,1.0,,,...,,,,,,,,,,5ff1e1bb0a720f052300056b
2,028400642255,DORITOS TORTILLA CHIP SPICY SWEET CHILI REDUCE...,10.00,10.00,True,2,True,1.0,2.840064e+10,True,...,,,,,,,,,,5ff1e1bb0a720f052300056b
3,,,,,False,1,True,,4.011000e+03,True,...,,,,,,,,,,5ff1e1f10a720f052300057a
4,4011,ITEM NOT FOUND,28.00,28.00,False,1,True,4.0,4.011000e+03,True,...,,,,,,,,,,5ff1e1ee0a7214ada100056f
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6936,B07BRRLSVC,thindust summer face mask - sun protection nec...,11.99,11.99,,1,,1.0,,,...,,,,,,,,11.99,,603cc2bc0a720fde100003e9
6937,B076FJ92M4,mueller austria hypergrind precision electric ...,22.97,22.97,,0,,1.0,,,...,,,,,,,,22.97,,603cc0630a720fde100003e6
6938,B07BRRLSVC,thindust summer face mask - sun protection nec...,11.99,11.99,,1,,1.0,,,...,,,,,,,,11.99,,603cc0630a720fde100003e6
6939,B076FJ92M4,mueller austria hypergrind precision electric ...,22.97,22.97,,0,,1.0,,,...,,,,,,,,22.97,,603ce7100a7217c72c000405


In [22]:
rewards_receipts_df[rewards_receipts_df['brandCode'].isna()].groupby('description').agg({'description':'size'}).rename(columns = {'description':'count'}).sort_values(by = 'count',ascending = False)

Unnamed: 0_level_0,count
description,Unnamed: 1_level_1
ITEM NOT FOUND,173
KLARBRUNN 12PK 12 FL OZ,120
HUGGIES SIMPLY CLEAN PREMOISTENED WIPE FRAGRANCE FREE BAG 216 COUNT,92
MILLER LITE 24 PACK 12OZ CAN,90
COMP BOOK,73
...,...
CRAFIS,1
COTTONELLE ULTRA COMFORT CARE MEGA ROLL 2 PLY 284 COTTON TOILET TISSUE 12 CT,1
COS SARDINES WATER,1
"CORN NUTS Ranch - 1.7 oz. bag, 216 per case",1


#### Concept of Data Quality.

Rules: 
1) Consistency
2) Validity
3) Uniqueness
4) Threshold
5) Completeness

Data Quality that come off my head:

1) Duplications
2) Possible outliers?
3) Checking if certain columns are actually unique: especially the proposed Primary Key
4) Missing values
5) Consistency in data types.

Great expectations?

#### Data Quality checks using great expectations.

#### Primary key Tests:

Here are the desginated primary keys for the 4 tables.

1) users_df : 'user_id'
2) brands_df: 'brand_id'
3) receipts_df: 'receipts_id'
4) rewards_receipts_df: ('receipt_id','partnerItemId')

#### For User_id table

In [43]:
#Existence of the primary key

context = ge.get_context()
data_source = context.data_sources.add_pandas("pandas")
data_asset = data_source.add_dataframe_asset(name="pd dataframe asset")

batch_definition = data_asset.add_batch_definition_whole_dataframe("batch definition")
batch = batch_definition.get_batch(batch_parameters={"dataframe": users_df})

#expect user_id to exist:
pk_user_expectation = ge.expectations.ExpectColumnToExist(column = 'user_id')
validation_result = batch.validate(pk_user_expectation)
print(validation_result['success'])

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

True


In [45]:
users_df.head()

Unnamed: 0,active,role,signUpSource,state,user_id,created_date_time,lastLogin_date_time
0,True,consumer,Email,WI,5ff1e194b6a9d73a3a9f1052,2021-01-03 09:24:04.800,2021-01-03 09:25:37.858
1,True,consumer,Email,WI,5ff1e194b6a9d73a3a9f1052,2021-01-03 09:24:04.800,2021-01-03 09:25:37.858
2,True,consumer,Email,WI,5ff1e194b6a9d73a3a9f1052,2021-01-03 09:24:04.800,2021-01-03 09:25:37.858
3,True,consumer,Email,WI,5ff1e1eacfcf6c399c274ae6,2021-01-03 09:25:30.554,2021-01-03 09:25:30.597
4,True,consumer,Email,WI,5ff1e194b6a9d73a3a9f1052,2021-01-03 09:24:04.800,2021-01-03 09:25:37.858


### Digging deep into user_df
Data Quality on top of my head

1) user_id: exists? (PK test)
2) user_id: unique? (PK test)
3) if not unique: is it carried over other columns as well (in short if the whole data row is duplicate)
4) Missing values
5) last login date_time is after creation date_time
6) state is valid
7) Checking if defined Schema for role : consists only 'consumers'

In [47]:
### 1) user_id: exists?

if 'user_id' in users_df.columns:
    print('Test Passed - user_id exists in user_df')
else:
    print('Test Failed - user_id does not exist in user_df')

Test Passed - user_id exists in user_df


In [49]:
## Unique test:

context = ge.get_context()
data_source = context.data_sources.add_pandas("pandas")
data_asset = data_source.add_dataframe_asset(name="pd dataframe asset")

batch_definition = data_asset.add_batch_definition_whole_dataframe("batch definition")
batch = batch_definition.get_batch(batch_parameters={"dataframe": users_df})

#expect user_id to exist:
unique_user_expectation = ge.expectations.ExpectColumnValuesToBeUnique(column = 'user_id')
validation_result = batch.validate(unique_user_expectation)
print(validation_result)

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

{
  "result": {
    "element_count": 495,
    "unexpected_count": 353,
    "unexpected_percent": 71.31313131313132,
    "partial_unexpected_list": [
      "5ff1e194b6a9d73a3a9f1052",
      "5ff1e194b6a9d73a3a9f1052",
      "5ff1e194b6a9d73a3a9f1052",
      "5ff1e1eacfcf6c399c274ae6",
      "5ff1e194b6a9d73a3a9f1052",
      "5ff1e194b6a9d73a3a9f1052",
      "5ff1e194b6a9d73a3a9f1052",
      "5ff1e194b6a9d73a3a9f1052",
      "5ff1e1eacfcf6c399c274ae6",
      "5ff1e194b6a9d73a3a9f1052",
      "5ff1e1eacfcf6c399c274ae6",
      "5ff1e194b6a9d73a3a9f1052",
      "5ff1e194b6a9d73a3a9f1052",
      "5ff1e1eacfcf6c399c274ae6",
      "5ff1e194b6a9d73a3a9f1052",
      "5ff370c562fde912123a5e0e",
      "5ff36d0362fde912123a5535",
      "5ff370c562fde912123a5e0e",
      "5ff36be7135e7011bcb856d3",
      "5ff36d0362fde912123a5535"
    ],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 71.31313131313132,
    "unexpected_percent_nonmissing": 71.31313131313132,
    "p

This is suspicious behaviour. There is a need to check if there are duplicate entries present in the table or just that duplicate user_ids exist

In [54]:
## Unique test:

context = ge.get_context()
data_source = context.data_sources.add_pandas("pandas")
data_asset = data_source.add_dataframe_asset(name="pd dataframe asset")

batch_definition = data_asset.add_batch_definition_whole_dataframe("batch definition")
batch = batch_definition.get_batch(batch_parameters={"dataframe": users_df})

#expect user_id to exist:
compound_unique_user_expectation = ge.expectations.ExpectCompoundColumnsToBeUnique(column_list = list(users_df.columns))
validation_result = batch.validate(compound_unique_user_expectation)
print(validation_result['result'])

Calculating Metrics:   0%|          | 0/9 [00:00<?, ?it/s]

{'element_count': 495, 'unexpected_count': 353, 'unexpected_percent': 71.31313131313132, 'partial_unexpected_list': [{'active': True, 'role': 'consumer', 'signUpSource': 'Email', 'state': 'WI', 'user_id': '5ff1e194b6a9d73a3a9f1052', 'created_date_time': '2021-01-03 09:24:04.800', 'lastLogin_date_time': '2021-01-03 09:25:37.858'}, {'active': True, 'role': 'consumer', 'signUpSource': 'Email', 'state': 'WI', 'user_id': '5ff1e194b6a9d73a3a9f1052', 'created_date_time': '2021-01-03 09:24:04.800', 'lastLogin_date_time': '2021-01-03 09:25:37.858'}, {'active': True, 'role': 'consumer', 'signUpSource': 'Email', 'state': 'WI', 'user_id': '5ff1e194b6a9d73a3a9f1052', 'created_date_time': '2021-01-03 09:24:04.800', 'lastLogin_date_time': '2021-01-03 09:25:37.858'}, {'active': True, 'role': 'consumer', 'signUpSource': 'Email', 'state': 'WI', 'user_id': '5ff1e1eacfcf6c399c274ae6', 'created_date_time': '2021-01-03 09:25:30.554', 'lastLogin_date_time': '2021-01-03 09:25:30.597'}, {'active': True, 'role'

In [63]:
### To quantify the above impact even deeper:

duplicate_count_user_df = users_df.groupby(list(users_df.columns)).size().reset_index(name='count').sort_values(by='count', ascending=False)
duplicate_count_user_df[duplicate_count_user_df['count']>1].head()

Unnamed: 0,active,role,signUpSource,state,user_id,created_date_time,lastLogin_date_time,count
162,True,fetch-staff,Email,NH,5fc961c3b8cfca11a077dd33,2020-12-03 16:08:03.936,2021-02-26 16:39:16.799,20
23,True,consumer,Email,WI,5ff5d15aeb7c7d12096d91a2,2021-01-06 09:03:54.680,2021-01-06 09:08:10.009,18
104,True,consumer,Email,WI,600fb1ac73c60b12049027bb,2021-01-26 00:07:40.879,2021-01-26 00:11:23.950,16
1,True,consumer,Email,WI,5ff1e194b6a9d73a3a9f1052,2021-01-03 09:24:04.800,2021-01-03 09:25:37.858,11
95,True,consumer,Email,WI,600987d77d983a11f63cfa92,2021-01-21 07:55:35.327,2021-01-21 07:59:21.063,9


there is a need to remove the duplicate entries and check for unqiur user_ids again, to make sure that all the remainign user_ids a re indeed unique.

In [65]:
# Drop duplicate rows (considering all columns)
users_df_no_duplicates = users_df.drop_duplicates()


In [67]:
## Unique test:

context = ge.get_context()
data_source = context.data_sources.add_pandas("pandas")
data_asset = data_source.add_dataframe_asset(name="pd dataframe asset")

batch_definition = data_asset.add_batch_definition_whole_dataframe("batch definition")
batch = batch_definition.get_batch(batch_parameters={"dataframe": users_df_no_duplicates})

#expect user_id to exist:
unique_user_expectation = ge.expectations.ExpectColumnValuesToBeUnique(column = 'user_id')
validation_result = batch.validate(unique_user_expectation)
print(validation_result)

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

{
  "result": {
    "element_count": 212,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_counts": [],
    "partial_unexpected_index_list": []
  },
  "meta": {},
  "expectation_config": {
    "meta": {},
    "type": "expect_column_values_to_be_unique",
    "kwargs": {
      "batch_id": "pandas-pd dataframe asset",
      "column": "user_id"
    }
  },
  "success": true,
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}


Success!

In [68]:
### Missing Value check:

users_df_no_duplicates.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 212 entries, 0 to 475
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   active               212 non-null    bool  
 1   role                 212 non-null    object
 2   signUpSource         207 non-null    object
 3   state                206 non-null    object
 4   user_id              212 non-null    object
 5   created_date_time    212 non-null    object
 6   lastLogin_date_time  172 non-null    object
dtypes: bool(1), object(6)
memory usage: 11.8+ KB


In [70]:
##signup source null check:

users_df_no_duplicates[users_df_no_duplicates['signUpSource'].isna()]

Unnamed: 0,active,role,signUpSource,state,user_id,created_date_time,lastLogin_date_time
388,True,consumer,,WI,55308179e4b0eabd8f99caa2,2015-04-16 22:43:53.186,2018-05-07 12:23:40.003
395,True,fetch-staff,,WI,59c124bae4b0299e55b0f330,2017-09-19 09:07:54.302,2021-02-08 10:42:58.117
422,True,consumer,,,5a43c08fe4b014fd6b6a0612,2017-12-27 09:47:27.059,2021-02-12 10:22:37.155
462,True,fetch-staff,,IL,5964eb07e4b03efd0c0f267b,2017-07-11 10:13:11.771,2021-03-04 13:07:49.770
475,True,fetch-staff,,,54943462e4b07e684157a532,2014-12-19 08:21:22.381,2021-03-05 10:52:23.204


In [72]:
users_df_no_duplicates['signUpSource'].unique()

array(['Email', 'Google', nan], dtype=object)

In [71]:
users_df_no_duplicates[users_df_no_duplicates['role'] == 'fetch-staff']

Unnamed: 0,active,role,signUpSource,state,user_id,created_date_time,lastLogin_date_time
395,True,fetch-staff,,WI,59c124bae4b0299e55b0f330,2017-09-19 09:07:54.302,2021-02-08 10:42:58.117
408,True,fetch-staff,Email,WI,5f2068904928021530f8fc34,2020-07-28 13:04:00.905,2021-02-04 09:30:05.375
432,True,fetch-staff,Email,,5fbc35711d967d1222cbfefc,2020-11-23 16:19:29.509,2021-02-25 22:25:51.057
435,True,fetch-staff,Email,NH,5fc961c3b8cfca11a077dd33,2020-12-03 16:08:03.936,2021-02-26 16:39:16.799
455,True,fetch-staff,Email,,5fa41775898c7a11a6bcef3e,2020-11-05 09:17:09.396,2021-03-04 10:02:02.026
456,True,fetch-staff,Google,AL,5fa32b4d898c7a11a6bcebce,2020-11-04 16:29:33.309,2021-03-04 01:21:58.047
462,True,fetch-staff,,IL,5964eb07e4b03efd0c0f267b,2017-07-11 10:13:11.771,2021-03-04 13:07:49.770
475,True,fetch-staff,,,54943462e4b07e684157a532,2014-12-19 08:21:22.381,2021-03-05 10:52:23.204


The unique values for signup source are Email and Google. It is very miuch possible that fetch-staff can signUp using an internal source which is not email or google. For better data quality, a signupsource exclusively for staff members can be alloted.

### Checking the Missing State.
And also checking if the states are valid.



In [73]:
users_df_no_duplicates[users_df_no_duplicates['state'].isna()]

Unnamed: 0,active,role,signUpSource,state,user_id,created_date_time,lastLogin_date_time
344,True,consumer,Email,,60145ff384231211ce796d51,2021-01-29 13:20:19.722,
375,True,consumer,Email,,60186237c8b50e11d8454d5f,2021-02-01 14:19:03.551,
422,True,consumer,,,5a43c08fe4b014fd6b6a0612,2017-12-27 09:47:27.059,2021-02-12 10:22:37.155
432,True,fetch-staff,Email,,5fbc35711d967d1222cbfefc,2020-11-23 16:19:29.509,2021-02-25 22:25:51.057
455,True,fetch-staff,Email,,5fa41775898c7a11a6bcef3e,2020-11-05 09:17:09.396,2021-03-04 10:02:02.026
475,True,fetch-staff,,,54943462e4b07e684157a532,2014-12-19 08:21:22.381,2021-03-05 10:52:23.204


In [77]:
sorted(users_df_no_duplicates[~users_df_no_duplicates['state'].isna()]['state'].unique())

['AL', 'CO', 'IL', 'KY', 'NH', 'OH', 'SC', 'WI']

From basic research, I understand thay Fetch rewards also has customers from puerto Rico, So it could be that NaN values might correspond to Puerto Rico. Again this is not a significant data Quality issue

In [82]:
missing_last_login = users_df_no_duplicates[users_df_no_duplicates['lastLogin_date_time'].isna()]

In [93]:
missing_last_login.groupby('signUpSource').agg({'signUpSource':'size'}).rename(columns = {'signUpSource':'count'}).sort_values(by = 'count',ascending= False)*100/(missing_last_login.shape[0])

Unnamed: 0_level_0,count
signUpSource,Unnamed: 1_level_1
Email,95.0
Google,5.0


Digging deep, it looks like, for all the lsat login date going missing, 95% of them are from email. This appears to be a bottle neck in the email signing up procedure where the users are not allowed to login immediately.

In [98]:
##Checking if last login date is after creation date

users_df_no_duplicates[users_df_no_duplicates['lastLogin_date_time'] < users_df_no_duplicates['created_date_time']]

Unnamed: 0,active,role,signUpSource,state,user_id,created_date_time,lastLogin_date_time


In [102]:
users_df_no_duplicates.groupby('role').agg({'role':'size'}).rename(columns = {'role':'count'}).sort_values(by = 'count',ascending= False)*100/(users_df_no_duplicates.shape[0])

Unnamed: 0_level_0,count
role,Unnamed: 1_level_1
consumer,96.226415
fetch-staff,3.773585


Summary of users_df Data Quality checks:

1) Duplicates of data points exists. (Solution - to be removed).
2) Missing Values in SignUpSource. (probably missing due to sources that are not email or google (example: Internal for fetch-staff))
3) Missing State values (Probably due to Fetch International like Puerto Rico)
4) Missing Last_login_date (Noticed out of missing last_login_date, 95% of them are from Email, this appears to be a bottleneck where Email might not allow direct login after the account creation)
5) It is mentioned in the schema that the default value for 'role' column is 'consumer' which is not the case since 3.7% of the accounts are fetch-staff's

### Brands Data Quality issues

In [104]:
brands_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1167 entries, 0 to 1166
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   barcode       1167 non-null   int64 
 1   category      1012 non-null   object
 2   categoryCode  517 non-null    object
 3   name          1167 non-null   object
 4   topBrand      555 non-null    object
 5   brand_id      1167 non-null   object
 6   cpg_id        1167 non-null   object
 7   cpg_ref       1167 non-null   object
 8   brandCode     898 non-null    object
dtypes: int64(1), object(8)
memory usage: 82.2+ KB


In [105]:
### To quantify the above impact even deeper:

duplicate_count_brand_df = brands_df.groupby(list(brands_df.columns)).size().reset_index(name='count').sort_values(by='count', ascending=False)
duplicate_count_brand_df[duplicate_count_brand_df['count']>1].head()

Unnamed: 0,barcode,category,categoryCode,name,topBrand,brand_id,cpg_id,cpg_ref,brandCode,count


In [106]:
## Unique test:

context = ge.get_context()
data_source = context.data_sources.add_pandas("pandas")
data_asset = data_source.add_dataframe_asset(name="pd dataframe asset")

batch_definition = data_asset.add_batch_definition_whole_dataframe("batch definition")
batch = batch_definition.get_batch(batch_parameters={"dataframe": brands_df})

#expect user_id to exist:
unique_brand_expectation = ge.expectations.ExpectColumnValuesToBeUnique(column = 'brand_id')
validation_result = batch.validate(unique_brand_expectation)
print(validation_result)

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

{
  "result": {
    "element_count": 1167,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_counts": [],
    "partial_unexpected_index_list": []
  },
  "meta": {},
  "expectation_config": {
    "meta": {},
    "type": "expect_column_values_to_be_unique",
    "kwargs": {
      "batch_id": "pandas-pd dataframe asset",
      "column": "brand_id"
    }
  },
  "success": true,
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}


## Every brand ID is unique!!!

In [154]:
##Checking if brand name and brand code go in synch

brands_df.groupby('name').agg({'brandCode':'size'}).sort_values(by = 'brandCode',ascending= False)

Unnamed: 0_level_0,brandCode
name,Unnamed: 1_level_1
ONE A DAY® WOMENS,2
Sierra Mist,2
Caleb's Kola,2
Health Magazine,2
Huggies,2
...,...
LOVE HOME AND PLANET,1
Lance,1
LandShark,1
Late July,1


In [151]:
### Checking if brand names are unique?
brand_count = brands_df.groupby('name').size().reset_index(name='count').sort_values(by='count', ascending=False)
brand_count = brand_count[brand_count['count']>1]
brand_count

Unnamed: 0,name,count
504,ONE A DAY® WOMENS,2
627,Sierra Mist,2
129,Caleb's Kola,2
313,Health Magazine,2
333,Huggies,2
564,Pull-Ups,2
699,V8 Hydrate,2
223,Dippin Dots® Cereal,2
335,I CAN'T BELIEVE IT'S NOT BUTTER!,2
215,Diabetic Living Magazine,2


#### No they are not: Data Quality issue spotted!

#### For the above brands, checking if the entities are unique are same. (We already established that brand ids are unique.)

In [157]:
##checking if brand name and brand code have one-one to relationship (which is expected)

inconsistent = brands_df.groupby('name')['brandCode'].nunique().reset_index()
inconsistent = inconsistent[inconsistent['brandCode'] > 1]
inconsistent

Unnamed: 0,name,brandCode
73,Baken-Ets,2
129,Caleb's Kola,2
223,Dippin Dots® Cereal,2
313,Health Magazine,2
335,I CAN'T BELIEVE IT'S NOT BUTTER!,2
504,ONE A DAY® WOMENS,2
564,Pull-Ups,2


### data Quality issue_spotted; understanding how there are one to two relationship:

In [152]:
brands_df[brands_df['name'].isin(list(brand_count['name']))].sort_values(by = 'name')

Unnamed: 0,barcode,category,categoryCode,name,topBrand,brand_id,cpg_id,cpg_ref,brandCode
848,511111701781,Snacks,,Baken-Ets,True,585a961fe4b03e62d1ce0e76,5332f5fbe4b03c9a25efd0ba,Cogs,BAKEN-ETS
574,511111605546,Snacks,,Baken-Ets,,5d9d08d1a60b87376833e348,5332f5fbe4b03c9a25efd0ba,Cogs,BAKEN ETS
140,511111000518,Beverages,,Caleb's Kola,False,5a4d23dae4b0bcb2c74ea77e,5332f5fbe4b03c9a25efd0ba,Cogs,CALEB'S KOLA
740,511111004912,Snacks,,Caleb's Kola,,5d601d74a3a018514994f422,53e10d6368abd3c7065097cc,Cogs,CALEBS KOLA
1007,511111205227,,,Diabetic Living Magazine,,5d658ffa6d5f3b23d1bc7914,53e10d6368abd3c7065097cc,Cogs,
1006,511111805298,Magazines,,Diabetic Living Magazine,,5d66d597a3a018093ab34726,5d5d4fd16d5f3b23d1bc7905,Cogs,511111805298
1163,511111706328,Breakfast & Cereal,,Dippin Dots® Cereal,,5dc1fca91dda2c0ad7da64ae,53e10d6368abd3c7065097cc,Cogs,DIPPIN DOTS CEREAL
1081,511111206330,Breakfast & Cereal,,Dippin Dots® Cereal,,5dc2d9d4a60b873d6b0666d2,5332f5f3e4b03c9a25efd0ae,Cogs,DIPPIN DOTS
194,511111605058,Magazines,,Health Magazine,,5d6415d5a3a018514994f429,5d5d4fd16d5f3b23d1bc7905,Cogs,511111605058
596,511111915287,Magazines,MAGAZINES,Health Magazine,,5f298852be37ce7958c5952d,5d66b9dcee7f2d201c7281cd,Cogs,HEALTH


#### Some discrepancies can be noticed between brand names and brand codes. Like for example: 'BAKEN-ETS' and 'BAKEN ETS'; 'CALEB'S KOLA' and 'CALEBS KOLA'. This causes a serious problem when trying to understand how well the brands are performing. This issue needs to be resolved for better data quality in Brands. The data entry needs to be done in a consistent way.


In [124]:
## Checking for missing values

brands_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1167 entries, 0 to 1166
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   barcode       1167 non-null   int64 
 1   category      1012 non-null   object
 2   categoryCode  517 non-null    object
 3   name          1167 non-null   object
 4   topBrand      555 non-null    object
 5   brand_id      1167 non-null   object
 6   cpg_id        1167 non-null   object
 7   cpg_ref       1167 non-null   object
 8   brandCode     898 non-null    object
dtypes: int64(1), object(8)
memory usage: 82.2+ KB


In [114]:
brands_df['category'].unique()

array(['Baking', 'Beverages', 'Candy & Sweets', 'Condiments & Sauces',
       'Canned Goods & Soups', nan, 'Magazines', 'Breakfast & Cereal',
       'Beer Wine Spirits', 'Health & Wellness', 'Beauty', 'Baby',
       'Frozen', 'Grocery', 'Snacks', 'Household', 'Personal Care',
       'Dairy', 'Cleaning & Home Improvement', 'Deli',
       'Beauty & Personal Care', 'Bread & Bakery', 'Outdoor',
       'Dairy & Refrigerated'], dtype=object)

In [116]:
#Analyzing missing category

brands_df[brands_df['category'].isna()]

Unnamed: 0,barcode,category,categoryCode,name,topBrand,brand_id,cpg_id,cpg_ref,brandCode
11,511111102540,,,MorningStar,,57c08106e4b0718ff5fcb02c,5332f5f2e4b03c9a25efd0aa,Cpgs,
23,511111303947,,,Bottled Starbucks,,5332f5fee4b03c9a25efd0bd,53e10d6368abd3c7065097cc,Cpgs,
24,511111802914,,,Full Throttle,,5332fa7ce4b03c9a25efd22e,5332f5ebe4b03c9a25efd0a8,Cpgs,
34,511111103653,,,Gold Medal,,5332f772e4b03c9a25efd125,5332f5f3e4b03c9a25efd0ae,Cpgs,
43,511111503699,,,Glaceau vitaminwater,,5332f765e4b03c9a25efd11f,5332f5ebe4b03c9a25efd0a8,Cpgs,
...,...,...,...,...,...,...,...,...,...
1123,511111403562,,,Bud Light,,5332f7a8e4b03c9a25efd135,5332f7a7e4b03c9a25efd134,Cpgs,
1135,511111405184,,,Do It Yourself,,5d658fca6d5f3b23d1bc7912,53e10d6368abd3c7065097cc,Cogs,
1144,511111202516,,,Corona,,57c08242e4b0718ff5fcb032,5332f7a7e4b03c9a25efd134,Cpgs,
1146,511111703105,,,Bellatoria,,5332fa12e4b03c9a25efd1e6,5332fa12e4b03c9a25efd1e7,Cpgs,


### Data Issue spotted!

For missing category values, I can observe that the mapping of brands to category has been incomplete. For example, brands like Bud Light, Corona etc can be mapped to 'Beers & Wine Spirits'. This Data quality issue needs to be resolved to better understand how each category is performing. The solution could be going back to the data entry source and making sure that the category is entered correctly. If the category is not present, then it makes sense to leave it blank.

This also gives an idea of creating a new table (entity) dedicated to categories. Whenever a new category comes, the table can be updated and the brands table and category table could be linked by category_id

In [155]:
### Checking if Category values have a unique pair of category code
### checking inconsistent

inconsistent = brands_df.groupby('category')['categoryCode'].nunique().reset_index()
inconsistent = inconsistent[inconsistent['categoryCode'] > 1]
inconsistent

Unnamed: 0,category,categoryCode


category and category Code maintain one-one relationships for non-missing values

In [159]:
brands_df[(~brands_df['category'].isna()) & (brands_df['categoryCode'].isna())].head()

Unnamed: 0,barcode,category,categoryCode,name,topBrand,brand_id,cpg_id,cpg_ref,brandCode
7,511111104810,Condiments & Sauces,,J.L. Kraft,,5cdad0f5166eb33eb7ce0faa,559c2234e4b06aca36af13c6,Cogs,J.L. KRAFT
8,511111504412,Canned Goods & Soups,,Campbell's Home Style,False,5ab15636e4b0be0a89bb0b07,5a734034e4b0d58f376be874,Cogs,CAMPBELLS HOME STYLE
9,511111504788,Baking,,test,,5c408e8bcd244a1fdb47aee7,59ba6f1ce4b092b29c167346,Cogs,TEST
12,511111201076,Baking,,Calumet,False,588ba07be4b02187f85cdadd,559c2234e4b06aca36af13c6,Cogs,CALUMET
13,511111205012,Magazines,,Entertainment Weekly,,5d6413156d5f3b23d1bc790a,5d5d4fd16d5f3b23d1bc7905,Cogs,511111205012


### Data Quality Issue spotted!

The data entry/ qulaity should be in such a way that each category, categoryCode should be filled automatically

In [161]:
brands_df[(brands_df['category'].isna()) & (brands_df['categoryCode'].isna())]

Unnamed: 0,barcode,category,categoryCode,name,topBrand,brand_id,cpg_id,cpg_ref,brandCode
11,511111102540,,,MorningStar,,57c08106e4b0718ff5fcb02c,5332f5f2e4b03c9a25efd0aa,Cpgs,
23,511111303947,,,Bottled Starbucks,,5332f5fee4b03c9a25efd0bd,53e10d6368abd3c7065097cc,Cpgs,
24,511111802914,,,Full Throttle,,5332fa7ce4b03c9a25efd22e,5332f5ebe4b03c9a25efd0a8,Cpgs,
34,511111103653,,,Gold Medal,,5332f772e4b03c9a25efd125,5332f5f3e4b03c9a25efd0ae,Cpgs,
43,511111503699,,,Glaceau vitaminwater,,5332f765e4b03c9a25efd11f,5332f5ebe4b03c9a25efd0a8,Cpgs,
...,...,...,...,...,...,...,...,...,...
1123,511111403562,,,Bud Light,,5332f7a8e4b03c9a25efd135,5332f7a7e4b03c9a25efd134,Cpgs,
1135,511111405184,,,Do It Yourself,,5d658fca6d5f3b23d1bc7912,53e10d6368abd3c7065097cc,Cogs,
1144,511111202516,,,Corona,,57c08242e4b0718ff5fcb032,5332f7a7e4b03c9a25efd134,Cpgs,
1146,511111703105,,,Bellatoria,,5332fa12e4b03c9a25efd1e6,5332fa12e4b03c9a25efd1e7,Cpgs,


In [163]:
## Missing brandCode:

brands_df[brands_df['brandCode'].isna()]

## It should be made sure that brand name and brand code maintain a one-one relationship as well. Once the brand name is entered, appropriate brandcode should be filled.

Unnamed: 0,barcode,category,categoryCode,name,topBrand,brand_id,cpg_id,cpg_ref,brandCode
0,511111019862,Baking,BAKING,test brand @1612366101024,False,601ac115be37ce2ead437551,601ac114be37ce2ead437550,Cogs,
11,511111102540,,,MorningStar,,57c08106e4b0718ff5fcb02c,5332f5f2e4b03c9a25efd0aa,Cpgs,
18,511111317364,Baking,BAKING,test brand @1605535049181,False,5fb28549be37ce522e165cb5,5fb28549be37ce522e165cb4,Cogs,
23,511111303947,,,Bottled Starbucks,,5332f5fee4b03c9a25efd0bd,53e10d6368abd3c7065097cc,Cpgs,
24,511111802914,,,Full Throttle,,5332fa7ce4b03c9a25efd22e,5332f5ebe4b03c9a25efd0a8,Cpgs,
...,...,...,...,...,...,...,...,...,...
1144,511111202516,,,Corona,,57c08242e4b0718ff5fcb032,5332f7a7e4b03c9a25efd134,Cpgs,
1146,511111703105,,,Bellatoria,,5332fa12e4b03c9a25efd1e6,5332fa12e4b03c9a25efd1e7,Cpgs,
1157,511111303015,,,DASANI,,5332fa75e4b03c9a25efd221,5332f5ebe4b03c9a25efd0a8,Cpgs,
1159,511111501619,Beverages,,Pepsi Max,False,585a96cbe4b03e62d1ce0e88,5332f5fbe4b03c9a25efd0ba,Cogs,


### Missing values in TopBrand

This is not necessarily a data quality problem since, 'topBrand' could be subjective. But if there is classification that is useful for business. I woud suggest that we incorporate the 'time' factor into the column 'topBrand' since any Brand can become top in future, or the current topbrands might not be top, so there is a  need for version table or a history table which contians columns brand_id,TopBrand, time

#### Receipts Data Quality issues:

In [165]:
receipts_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1119 entries, 0 to 1118
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   bonusPointsEarned        544 non-null    float64
 1   bonusPointsEarnedReason  544 non-null    object 
 2   pointsEarned             609 non-null    float64
 3   purchasedItemCount       635 non-null    float64
 4   rewardsReceiptItemList   679 non-null    object 
 5   rewardsReceiptStatus     1119 non-null   object 
 6   totalSpent               684 non-null    float64
 7   userId                   1119 non-null   object 
 8   receipt_id               1119 non-null   object 
 9   created_date_time        1119 non-null   object 
 10  scanned_date_time        1119 non-null   object 
 11  finished_date_time       568 non-null    object 
 12  modify_date_time         1119 non-null   object 
 13  pointsAwarded_date_time  537 non-null    object 
 14  purchased_date_time     

In [167]:
## Unique test:

context = ge.get_context()
data_source = context.data_sources.add_pandas("pandas")
data_asset = data_source.add_dataframe_asset(name="pd dataframe asset")

batch_definition = data_asset.add_batch_definition_whole_dataframe("batch definition")
batch = batch_definition.get_batch(batch_parameters={"dataframe": receipts_df})

#expect user_id to exist:
unique_receipt_expectation = ge.expectations.ExpectColumnValuesToBeUnique(column = 'receipt_id')
validation_result = batch.validate(unique_receipt_expectation)
print(validation_result)

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

{
  "result": {
    "element_count": 1119,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_counts": [],
    "partial_unexpected_index_list": []
  },
  "meta": {},
  "expectation_config": {
    "meta": {},
    "type": "expect_column_values_to_be_unique",
    "kwargs": {
      "batch_id": "pandas-pd dataframe asset",
      "column": "receipt_id"
    }
  },
  "success": true,
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}


In [169]:
receipts_df.head()

Unnamed: 0,bonusPointsEarned,bonusPointsEarnedReason,pointsEarned,purchasedItemCount,rewardsReceiptItemList,rewardsReceiptStatus,totalSpent,userId,receipt_id,created_date_time,scanned_date_time,finished_date_time,modify_date_time,pointsAwarded_date_time,purchased_date_time
0,500.0,"Receipt number 2 completed, bonus point schedu...",500.0,5.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,26.0,5ff1e1eacfcf6c399c274ae6,5ff1e1eb0a720f0523000575,2021-01-03 09:25:31.000,2021-01-03 09:25:31.000,2021-01-03 09:25:31.000,2021-01-03 09:25:36.000,2021-01-03 09:25:31,2021-01-02 18:00:00
1,150.0,"Receipt number 5 completed, bonus point schedu...",150.0,2.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,11.0,5ff1e194b6a9d73a3a9f1052,5ff1e1bb0a720f052300056b,2021-01-03 09:24:43.000,2021-01-03 09:24:43.000,2021-01-03 09:24:43.000,2021-01-03 09:24:48.000,2021-01-03 09:24:43,2021-01-02 09:24:43
2,5.0,All-receipts receipt bonus,5.0,1.0,"[{'needsFetchReview': False, 'partnerItemId': ...",REJECTED,10.0,5ff1e1f1cfcf6c399c274b0b,5ff1e1f10a720f052300057a,2021-01-03 09:25:37.000,2021-01-03 09:25:37.000,,2021-01-03 09:25:42.000,,2021-01-02 18:00:00
3,5.0,All-receipts receipt bonus,5.0,4.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,28.0,5ff1e1eacfcf6c399c274ae6,5ff1e1ee0a7214ada100056f,2021-01-03 09:25:34.000,2021-01-03 09:25:34.000,2021-01-03 09:25:34.000,2021-01-03 09:25:39.000,2021-01-03 09:25:34,2021-01-02 18:00:00
4,5.0,All-receipts receipt bonus,5.0,2.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,1.0,5ff1e194b6a9d73a3a9f1052,5ff1e1d20a7214ada1000561,2021-01-03 09:25:06.000,2021-01-03 09:25:06.000,2021-01-03 09:25:11.000,2021-01-03 09:25:11.000,2021-01-03 09:25:06,2021-01-02 09:25:06


### understanding bonuspoints missing values

In [198]:
receipts_df[receipts_df['bonusPointsEarned'].isna()].head()


Unnamed: 0,bonusPointsEarned,bonusPointsEarnedReason,pointsEarned,purchasedItemCount,rewardsReceiptItemList,rewardsReceiptStatus,totalSpent,userId,receipt_id,created_date_time,scanned_date_time,finished_date_time,modify_date_time,pointsAwarded_date_time,purchased_date_time
15,,,,0.0,"[{'needsFetchReview': True, 'needsFetchReviewR...",FLAGGED,0.0,5ff1e1e9b6a9d73a3a9f10f6,5ff1e1e90a7214ada1000569,2021-01-03 09:25:29.000,2021-01-03 09:25:29.000,,2021-01-03 09:25:29.000,,
28,,,,3.0,"[{'deleted': True, 'description': 'DELETED ITE...",REJECTED,3.0,5ff1e194b6a9d73a3a9f1052,5ff1e1d40a7214ada1000562,2021-01-03 09:25:08.000,2021-01-03 09:25:08.000,,2021-01-03 09:25:08.000,,2017-10-29 19:00:00
32,,,500.0,9.0,"[{'barcode': '029000079236', 'description': 'P...",FINISHED,89.91,5ff36be7135e7011bcb856d3,5ff36c750a7214ada100058f,2021-01-04 13:28:53.000,2021-01-04 13:28:53.000,2021-01-04 13:28:54.000,2021-01-04 13:28:54.000,2021-01-04 13:28:54,2021-01-03 13:28:53
35,,,250.0,5.0,"[{'barcode': '044700009888', 'description': 'O...",FINISHED,49.95,5ff36a3862fde912123a4460,5ff36adb0a720f0523000590,2021-01-04 13:22:03.000,2021-01-04 13:22:03.000,2021-01-04 13:22:04.000,2021-01-04 13:22:04.000,2021-01-04 13:22:04,2020-12-30 00:00:00
52,,,350.0,1.0,"[{'barcode': '044700019917', 'description': 'O...",FINISHED,10.0,5ff370c562fde912123a5e0e,5ff3713c0a7214ada10005b6,2021-01-04 13:49:16.000,2021-01-04 13:49:16.000,2021-01-04 13:49:16.000,2021-01-04 13:49:16.000,2021-01-04 13:49:16,2021-01-03 13:49:16


### Understanding missing points earned

In [204]:
missing_points_df = receipts_df[receipts_df['pointsEarned'].isna()]


missing values in bonuspoints earned could be imputed with 0 and bonus points earned reason can be filled with bonus points not awarded

In [205]:
missing_points_percent_count = missing_points_df\
    .groupby('rewardsReceiptStatus').agg({'rewardsReceiptStatus':'size'})\
    .rename(columns = {'rewardsReceiptStatus':'count'})\
    .sort_values(by = 'count',ascending = False)*100/missing_points_df.shape[0]
missing_points_percent_count

Unnamed: 0_level_0,count
rewardsReceiptStatus,Unnamed: 1_level_1
SUBMITTED,85.098039
PENDING,9.803922
FLAGGED,2.54902
REJECTED,2.54902


#### It can be observed that for the points that are missing, 96% of them are submitted,pending,flagged and awaiting points; while other categories like rejected; it can be automatically filled with 0 - a mild not a very serious data quality issue

In [216]:
##Checking rewards item list that are empty

missing_items = receipts_df[receipts_df['rewardsReceiptItemList'].isna()]
missing_items.head()

Unnamed: 0,bonusPointsEarned,bonusPointsEarnedReason,pointsEarned,purchasedItemCount,rewardsReceiptItemList,rewardsReceiptStatus,totalSpent,userId,receipt_id,created_date_time,scanned_date_time,finished_date_time,modify_date_time,pointsAwarded_date_time,purchased_date_time
71,,,,,,SUBMITTED,,5a43c08fe4b014fd6b6a0612,5ff475820a7214ada10005cf,2021-01-05 08:19:46.000,2021-01-05 08:19:46.000,,2021-01-05 08:19:46.000,,
93,,,,,,SUBMITTED,,5a43c08fe4b014fd6b6a0612,5ff5ecb90a7214ada10005f9,2021-01-06 11:00:40.000,2021-01-06 11:00:40.000,,2021-01-06 11:00:40.000,,
149,,,,,,SUBMITTED,,5ff7264e8f142f11dd189504,5ff726860a720f05230005ec,2021-01-07 09:19:34.000,2021-01-07 09:19:34.000,,2021-01-07 09:19:34.000,,
175,,,,0.0,,REJECTED,0.0,5ff8da28b3348b11c9337ac6,5ff8da570a720f05c5000015,2021-01-08 16:19:03.000,2021-01-08 16:19:03.000,,2021-01-08 16:19:04.000,,
212,,,,,,SUBMITTED,,59c124bae4b0299e55b0f330,5ffce8570a7214ad4e003e6f,2021-01-11 18:07:51.000,2021-01-11 18:07:51.000,,2021-01-11 18:07:51.000,,


In [220]:
missing_items.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 440 entries, 71 to 1118
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   bonusPointsEarned        2 non-null      float64
 1   bonusPointsEarnedReason  2 non-null      object 
 2   pointsEarned             2 non-null      float64
 3   purchasedItemCount       5 non-null      float64
 4   rewardsReceiptItemList   0 non-null      object 
 5   rewardsReceiptStatus     440 non-null    object 
 6   totalSpent               5 non-null      float64
 7   userId                   440 non-null    object 
 8   receipt_id               440 non-null    object 
 9   created_date_time        440 non-null    object 
 10  scanned_date_time        440 non-null    object 
 11  finished_date_time       3 non-null      object 
 12  modify_date_time         440 non-null    object 
 13  pointsAwarded_date_time  2 non-null      object 
 14  purchased_date_time     

In [219]:

missing_items_percent_count = missing_items\
    .groupby('rewardsReceiptStatus').agg({'rewardsReceiptStatus':'size'})\
    .rename(columns = {'rewardsReceiptStatus':'count'})\
    .sort_values(by = 'count',ascending = False)*100/missing_items.shape[0]
missing_items_percent_count

Unnamed: 0_level_0,count
rewardsReceiptStatus,Unnamed: 1_level_1
SUBMITTED,98.636364
REJECTED,0.681818
FINISHED,0.454545
PENDING,0.227273


#### could be the reason where the scanning algorithm couldnt detect the items in the list. So we can conclude that it is not serious big data quality issue. It would have been a serious issue when there is significant contribution from 'finished' status, and from the bottom table, it is shown that bonus points are awarded for some valid reason and hence finished status

In [211]:
## Checking missing rewards item list:

receipts_df[(receipts_df['rewardsReceiptItemList'].isna()) & (~receipts_df['pointsEarned'].isna())]

Unnamed: 0,bonusPointsEarned,bonusPointsEarnedReason,pointsEarned,purchasedItemCount,rewardsReceiptItemList,rewardsReceiptStatus,totalSpent,userId,receipt_id,created_date_time,scanned_date_time,finished_date_time,modify_date_time,pointsAwarded_date_time,purchased_date_time
396,250.0,"Receipt number 3 completed, bonus point schedu...",250.0,0.0,,FINISHED,0.0,6009e60450b3311194385009,6009eb000a7214ada2000003,2021-01-21 14:58:40.000,2021-01-21 14:58:40.000,2021-01-21 14:59:06.000,2021-01-21 14:59:15.000,2021-01-21 14:59:06,2021-01-20 18:00:00
424,500.0,"Receipt number 2 completed, bonus point schedu...",500.0,0.0,,FINISHED,0.0,600afb2a7d983a124e9aded0,600aff160a720f053500000c,2021-01-22 10:36:38.000,2021-01-22 10:36:38.000,2021-01-22 10:37:01.000,2021-01-22 10:37:13.000,2021-01-22 10:37:01,2021-01-21 18:00:00


In [233]:
##understanding the potential reasons why purchased_date_time could be missing:

receipts_df[receipts_df['purchased_date_time'].isna()]

Unnamed: 0,bonusPointsEarned,bonusPointsEarnedReason,pointsEarned,purchasedItemCount,rewardsReceiptItemList,rewardsReceiptStatus,totalSpent,userId,receipt_id,created_date_time,scanned_date_time,finished_date_time,modify_date_time,pointsAwarded_date_time,purchased_date_time
15,,,,0.0,"[{'needsFetchReview': True, 'needsFetchReviewR...",FLAGGED,0.0,5ff1e1e9b6a9d73a3a9f10f6,5ff1e1e90a7214ada1000569,2021-01-03 09:25:29.000,2021-01-03 09:25:29.000,,2021-01-03 09:25:29.000,,
71,,,,,,SUBMITTED,,5a43c08fe4b014fd6b6a0612,5ff475820a7214ada10005cf,2021-01-05 08:19:46.000,2021-01-05 08:19:46.000,,2021-01-05 08:19:46.000,,
81,,,,0.0,"[{'needsFetchReview': True, 'needsFetchReviewR...",FLAGGED,0.0,5ff4ce3cc1e2d0121a9b2fba,5ff4ce3c0a720f05230005c4,2021-01-05 14:38:20.000,2021-01-05 14:38:20.000,,2021-01-05 14:38:20.000,,
93,,,,,,SUBMITTED,,5a43c08fe4b014fd6b6a0612,5ff5ecb90a7214ada10005f9,2021-01-06 11:00:40.000,2021-01-06 11:00:40.000,,2021-01-06 11:00:40.000,,
141,,,,0.0,"[{'needsFetchReview': True, 'needsFetchReviewR...",FLAGGED,0.0,5ff73be9eb7c7d31ca8a45bc,5ff73be90a720f052300060a,2021-01-07 10:50:49.000,2021-01-07 10:50:49.000,,2021-01-07 10:50:49.000,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1110,,,,,,SUBMITTED,,5fc961c3b8cfca11a077dd33,603c6adf0a720fde1000039a,2021-02-28 22:17:35.736,2021-02-28 22:17:35.736,,2021-02-28 22:17:35.736,,
1111,,,,,,SUBMITTED,,5fc961c3b8cfca11a077dd33,603c9e6e0a720fde100003c7,2021-03-01 01:57:34.307,2021-03-01 01:57:34.307,,2021-03-01 01:57:34.307,,
1115,,,,,,SUBMITTED,,5fc961c3b8cfca11a077dd33,603d0b710a720fde1000042a,2021-03-01 09:42:41.873,2021-03-01 09:42:41.873,,2021-03-01 09:42:41.873,,
1116,,,,,,SUBMITTED,,5fc961c3b8cfca11a077dd33,603cf5290a720fde10000413,2021-03-01 08:07:37.664,2021-03-01 08:07:37.664,,2021-03-01 08:07:37.664,,


In [226]:
### checking if purchase_date is before created_date_time:


receipts_df[receipts_df['purchased_date_time']>receipts_df['scanned_date_time']]

Unnamed: 0,bonusPointsEarned,bonusPointsEarnedReason,pointsEarned,purchasedItemCount,rewardsReceiptItemList,rewardsReceiptStatus,totalSpent,userId,receipt_id,created_date_time,scanned_date_time,finished_date_time,modify_date_time,pointsAwarded_date_time,purchased_date_time
12,150.0,"Receipt number 5 completed, bonus point schedu...",8850.0,10.0,"[{'barcode': '034100573065', 'description': 'M...",FLAGGED,290.0,5ff1e194b6a9d73a3a9f1052,5ff1e1b60a7214ada100055c,2021-01-03 09:24:38.000,2021-01-03 09:24:38.000,,2021-01-03 09:24:38.000,,2021-02-03 09:24:38
14,300.0,"Receipt number 4 completed, bonus point schedu...",300.0,1.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,1.0,5ff1e194b6a9d73a3a9f1052,5ff1e1b20a7214ada100055a,2021-01-03 09:24:34.000,2021-01-03 09:24:34.000,2021-01-03 09:24:35.000,2021-01-03 09:24:35.000,2021-01-03 09:24:35,2021-02-03 09:24:35
85,25.0,COMPLETE_NONPARTNER_RECEIPT,25.0,1.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,1.0,5ff4ce33c3d63511e2a484b6,5ff4ce640a7214ada10005e0,2021-01-05 14:39:00.000,2021-01-05 14:39:00.000,2021-01-05 14:39:00.000,2021-01-05 14:39:00.000,2021-01-05 14:39:00,2021-02-05 14:39:00
139,,,8700.0,10.0,"[{'barcode': '034100573065', 'description': 'M...",FLAGGED,290.0,5ff73b90eb7c7d31ca8a452b,5ff73be10a7214ada1000619,2021-01-07 10:50:41.000,2021-01-07 10:50:41.000,,2021-01-07 10:50:41.000,,2021-02-07 10:50:41
158,500.0,"Receipt number 2 completed, bonus point schedu...",9200.0,10.0,"[{'barcode': '034100573065', 'description': 'M...",FLAGGED,290.0,5ff873d1b3348b11c9337716,5ff873f10a720f052300064f,2021-01-08 09:02:09.000,2021-01-08 09:02:09.000,,2021-01-08 09:02:10.000,,2021-02-08 09:02:10
190,250.0,"Receipt number 3 completed, bonus point schedu...",8950.0,10.0,"[{'barcode': '034100573065', 'description': 'M...",FLAGGED,290.0,5ffcb47d04929111f6e9256c,5ffcb4900a720f0515000002,2021-01-11 14:26:56.000,2021-01-11 14:26:56.000,,2021-01-11 14:26:57.000,,2021-02-11 14:26:56
244,25.0,COMPLETE_NONPARTNER_RECEIPT,25.0,1.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,1.0,5fff2698b3348b03eb45bb10,5fff26ee0a720f05f300001a,2021-01-13 10:59:26.000,2021-01-13 10:59:26.000,2021-01-13 10:59:26.000,2021-01-13 10:59:26.000,2021-01-13 10:59:26,2021-02-13 10:59:26
265,,,8700.0,10.0,"[{'barcode': '034100573065', 'description': 'M...",FLAGGED,290.0,5fff2698b3348b03eb45bb10,5fff26f10a7214ad4c000018,2021-01-13 10:59:29.000,2021-01-13 10:59:29.000,,2021-01-13 10:59:29.000,,2021-02-13 10:59:29
294,,,8700.0,10.0,"[{'barcode': '034100573065', 'description': 'M...",FLAGGED,290.0,6000d46cfb296c121a81b20c,6000d4bc0a7214ad4c000070,2021-01-14 17:33:16.000,2021-01-14 17:33:16.000,,2021-01-14 17:33:17.000,,2021-02-14 17:33:17
362,250.0,"Receipt number 3 completed, bonus point schedu...",250.0,1.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,1.0,6008873eb6310511daa4e8eb,600887560a720f05fa000098,2021-01-20 13:41:10.000,2021-01-20 13:41:10.000,2021-01-20 13:41:11.000,2021-01-20 13:41:11.000,2021-01-20 13:41:11,2021-02-20 13:41:10


### data Quality issue spotted!

There are some data points whose purchase date is ahead of the receipt created date. As far I understand the business equation, the purchased date should be before the creation/scanned date time. This could be data issue that needs to be flagged.

In [228]:
### Checking if the all the userids present in the receipt are present in the users_df.

receipts_df[~receipts_df['userId'].isin(users_df_no_duplicates['user_id'])]

Unnamed: 0,bonusPointsEarned,bonusPointsEarnedReason,pointsEarned,purchasedItemCount,rewardsReceiptItemList,rewardsReceiptStatus,totalSpent,userId,receipt_id,created_date_time,scanned_date_time,finished_date_time,modify_date_time,pointsAwarded_date_time,purchased_date_time
13,750.0,"Receipt number 1 completed, bonus point schedu...",750.0,11.0,"[{'barcode': '075925306254', 'competitiveProdu...",REJECTED,1.00,5f9c74f7c88c1415cbddb839,5f9c74f70a7214ad07000037,2020-10-30 15:17:59.000,2020-10-30 15:17:59.000,,2021-01-03 09:24:54.000,2020-10-30 15:18:00,2020-10-29 15:17:59
15,,,,0.0,"[{'needsFetchReview': True, 'needsFetchReviewR...",FLAGGED,0.00,5ff1e1e9b6a9d73a3a9f10f6,5ff1e1e90a7214ada1000569,2021-01-03 09:25:29.000,2021-01-03 09:25:29.000,,2021-01-03 09:25:29.000,,
16,750.0,"Receipt number 1 completed, bonus point schedu...",750.0,2.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,11.00,5ff1e1dfcfcf6c399c274ab3,5ff1e1df0a7214ada1000564,2021-01-03 09:25:19.000,2021-01-03 09:25:19.000,2021-01-03 09:25:20.000,2021-01-03 09:25:25.000,2021-01-03 09:25:19,2021-01-02 09:25:19
20,300.0,"Receipt number 4 completed, bonus point schedu...",389.2,6.0,"[{'barcode': '075925306254', 'competitiveProdu...",FINISHED,14.00,5f9c74e3f1937815bd2c1d73,5f9c74f90a7214ad07000038,2020-10-30 15:18:01.000,2020-10-30 15:18:01.000,2021-01-03 09:39:55.000,2021-01-03 09:39:55.000,2021-01-03 09:39:55,2020-10-29 15:18:01
21,750.0,"Receipt number 1 completed, bonus point schedu...",750.0,1.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,1.00,5ff1e196cfcf6c399c274a38,5ff1e1960a720f0523000567,2021-01-03 09:24:06.000,2021-01-03 09:24:06.000,2021-01-03 09:24:10.000,2021-01-03 09:24:10.000,2021-01-03 09:24:10,2021-01-02 09:24:06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
955,5.0,All-receipts receipt bonus,5.0,1.0,"[{'brandCode': 'MISSION', 'competitorRewardsGr...",FINISHED,2.23,60253861efa6017a44dc6b50,602538740a7214d8e9000246,2021-02-11 08:00:20.000,2021-02-11 08:00:20.000,2021-02-11 08:00:22.000,2021-02-11 08:00:23.000,2021-02-11 08:00:22,2021-02-11 08:00:20
956,500.0,"Receipt number 2 completed, bonus point schedu...",3500.0,5.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,20.00,60253891b54593795bf69242,602538920a720f05a8000216,2021-02-11 08:00:50.000,2021-02-11 08:00:50.000,2021-02-11 08:00:50.000,2021-02-11 08:00:55.000,2021-02-11 08:00:50,2021-02-10 18:00:00
966,5.0,All-receipts receipt bonus,5.0,2.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,23.00,60253891b54593795bf69242,602538960a720f05a800021b,2021-02-11 08:00:53.000,2021-02-11 08:00:53.000,2021-02-11 08:00:54.000,2021-02-11 08:00:59.000,2021-02-11 08:00:54,2021-02-10 18:00:00
985,100.0,"Receipt number 6 completed, bonus point schedu...",100.0,3.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,29.00,60268c7bb545931ac63683af,60268c7e0a7214d8e9000309,2021-02-12 08:11:10.000,2021-02-12 08:11:10.000,2021-02-12 08:11:11.000,2021-02-12 08:11:16.000,2021-02-12 08:11:11,2021-02-11 18:00:00


### data Quality issue Spotted!! there are users ids that are present in the receipts but not present in users_df.

#### Data Quality issues in rewards_receipts_df


In [234]:
rewards_receipts_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6941 entries, 0 to 6940
Data columns (total 35 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   barcode                             3090 non-null   object 
 1   description                         6560 non-null   object 
 2   finalPrice                          6767 non-null   float64
 3   itemPrice                           6767 non-null   float64
 4   needsFetchReview                    813 non-null    object 
 5   partnerItemId                       6941 non-null   int64  
 6   preventTargetGapPoints              358 non-null    object 
 7   quantityPurchased                   6767 non-null   float64
 8   userFlaggedBarcode                  337 non-null    float64
 9   userFlaggedNewItem                  323 non-null    object 
 10  userFlaggedPrice                    299 non-null    float64
 11  userFlaggedQuantity                 299 non

In [240]:
## Unique test:

context = ge.get_context()
data_source = context.data_sources.add_pandas("pandas")
data_asset = data_source.add_dataframe_asset(name="pd dataframe asset")

batch_definition = data_asset.add_batch_definition_whole_dataframe("batch definition")

batch = batch_definition.get_batch(batch_parameters={"dataframe": rewards_receipts_df})

#expect user_id to exist:
compound_unique_rr_expectation = ge.expectations.ExpectCompoundColumnsToBeUnique(column_list = ['receipt_id','partnerItemId'])
validation_result = batch.validate(compound_unique_rr_expectation)
print(validation_result['result'])

Calculating Metrics:   0%|          | 0/9 [00:00<?, ?it/s]

{'element_count': 6941, 'unexpected_count': 0, 'unexpected_percent': 0.0, 'partial_unexpected_list': [], 'missing_count': 0, 'missing_percent': 0.0, 'unexpected_percent_total': 0.0, 'unexpected_percent_nonmissing': 0.0, 'partial_unexpected_counts': [], 'partial_unexpected_index_list': []}


In [241]:
# passed unique test! 

Digging into missing values:

For everyitem that is scanned is expected to have a barcode.

In [245]:
rewards_receipts_df[rewards_receipts_df['barcode'].isna()].head()

Unnamed: 0,barcode,description,finalPrice,itemPrice,needsFetchReview,partnerItemId,preventTargetGapPoints,quantityPurchased,userFlaggedBarcode,userFlaggedNewItem,userFlaggedPrice,userFlaggedQuantity,needsFetchReviewReason,pointsNotAwardedReason,pointsPayerId,rewardsGroup,rewardsProductPartnerId,userFlaggedDescription,originalMetaBriteBarcode,originalMetaBriteDescription,brandCode,competitorRewardsGroup,discountedItemPrice,originalReceiptItemText,itemNumber,originalMetaBriteQuantityPurchased,pointsEarned,targetPrice,competitiveProduct,originalFinalPrice,originalMetaBriteItemPrice,deleted,priceAfterCoupon,metabriteCampaignId,receipt_id
3,,,,,False,1,True,,4011.0,True,26.0,3.0,,,,,,,,,,,,,,,,,,,,,,,5ff1e1f10a720f052300057a
8,,MSSN TORTLLA,2.23,2.23,,1009,,1.0,,,,,,,,,,,,,MISSION,TACO BELL TACO SHELLS,2.23,MSSN TORTLLA,,,,,,,,,,,5ff1e1cd0a720f052300056f
25,,,,,True,2,True,,34100570000.0,True,29.0,1.0,USER_FLAGGED,,,,,MILLER LITE 24 PACK 12OZ CAN,,,,,,,,,,,,,,,,,5f9c74f70a7214ad07000037
26,,,,,True,3,True,,34100570000.0,True,29.0,1.0,USER_FLAGGED,,,,,MILLER LITE 24 PACK 12OZ CAN,,,,,,,,,,,,,,,,,5f9c74f70a7214ad07000037
27,,,,,True,4,True,,34100570000.0,True,29.0,1.0,USER_FLAGGED,,,,,MILLER LITE 24 PACK 12OZ CAN,,,,,,,,,,,,,,,,,5f9c74f70a7214ad07000037


#### data Quality issue spotted!

In [252]:
rewards_receipts_df[rewards_receipts_df['barcode'].isna()].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3851 entries, 3 to 6847
Data columns (total 35 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   barcode                             0 non-null      object 
 1   description                         3701 non-null   object 
 2   finalPrice                          3701 non-null   float64
 3   itemPrice                           3701 non-null   float64
 4   needsFetchReview                    453 non-null    object 
 5   partnerItemId                       3851 non-null   int64  
 6   preventTargetGapPoints              150 non-null    object 
 7   quantityPurchased                   3701 non-null   float64
 8   userFlaggedBarcode                  150 non-null    float64
 9   userFlaggedNewItem                  150 non-null    object 
 10  userFlaggedPrice                    150 non-null    float64
 11  userFlaggedQuantity                 150 non

#### Below table suggests that orders that have descriptions but no barcode could suggest potential data quality issue in terms of scanning.

In [275]:
missing_brandcode_desc_present_df = rewards_receipts_df[(rewards_receipts_df['brandCode'].isna()) & (~rewards_receipts_df['description'].isna())]

In [276]:
missing_brandcode_desc_present_df['first_word_desc'] = missing_brandcode_desc_present_df['description'].apply(lambda x: x.split()[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_brandcode_desc_present_df['first_word_desc'] = missing_brandcode_desc_present_df['description'].apply(lambda x: x.split()[0])


In [277]:
missing_brandcode_desc_present_df.groupby('first_word_desc').agg({'first_word_desc':'size'}).rename(columns={'first_word_desc':'count'}).sort_values(by = ['count'],ascending= False).head(25)


Unnamed: 0_level_0,count
first_word_desc,Unnamed: 1_level_1
ITEM,173
PC,138
KLARBRUNN,128
HUGGIES,93
MILLER,90
HYV,84
COMP,74
OSCAR,73
LEGO,56
EMIL',55


In [278]:
missing_brandcode_desc_present_df[missing_brandcode_desc_present_df['first_word_desc'] == 'BORDEN']['description'].head()

1000    BORDEN 2% MILK, 1/2 GAL
1003    BORDEN 2% MILK, 1/2 GAL
1012    BORDEN 2% MILK, 1/2 GAL
1015    BORDEN 2% MILK, 1/2 GAL
1132    BORDEN 2% MILK, 1/2 GAL
Name: description, dtype: object

In [279]:
missing_brandcode_desc_present_df[missing_brandcode_desc_present_df['first_word_desc'] == 'KLARBRUNN'][['description','brandCode']].head()

Unnamed: 0,description,brandCode
305,KLARBRUNN 12PK 12 FL OZ,
307,KLARBRUNN 12PK 12 FL OZ,
624,KLARBRUNN 12PK 12 FL OZ,
648,KLARBRUNN 12PK 12 SL OZ,
884,KLARBRUNN 12PK 12 FL OZ,


##### The above tables are examples where barcodes are present but one would make sense out of the brands based on description. By 'estimating' the brand from description can help better the analysis made over brands especially the number of scanned items. A better way of imputing brandCodes should be devised in order to improve the understanding of the scanning and receipt flow.

Example: If not for better imputing methods, the 138 scanned receipts of 'Klarbrunn' brand would not counted for let's say which brand is mostly contributing to water etc

### Checking if the brandCodes present in the rewards_receipts_table are actually present in the brands table.

In [283]:
##checking if unique brandcodes exist in receipt but not in brands

brandcodes_rewards_receipt = list(rewards_receipts_df['brandCode'].unique())
brandcodes_brands = list(brands_df['brandCode'].unique())
len(brandcodes_rewards_receipt)
missing_bar_codes = [x for x in brandcodes_rewards_receipt if x not in brandcodes_brands]
sorted(missing_bar_codes)

['7UP',
 'ADVIL',
 'AMERICAN BEAUTY',
 'ARROWHEAD',
 'AZTECA',
 'BANZA',
 'BEAR CREEK COUNTRY KITCHENS',
 'BEN AND JERRYS',
 'BETTY CROCKER',
 'BIC',
 'BIGELOW',
 'BLUE DIAMOND',
 "BOAR'S HEAD",
 'BORDEN',
 'BOTA BOX',
 'BRAND',
 "BRASWELL'S",
 'BUNNY',
 "BUSH'S BEST",
 'C&H',
 'CADBURY',
 'CAL-ORGANIC FARMS',
 'CALIFIA FARMS',
 "CAMPBELL'S",
 'CARAMELLO',
 'CHEERIOS',
 'CHEESE',
 'CHEEZ-IT',
 'CHEX',
 'CHICKEN OF THE SEA',
 'CHIQUITA',
 'CINNAMON TOAST CRUNCH',
 'COKE',
 'COLEMAN NATURAL',
 "CONNIE'S PIZZA",
 'CREST 3D WHITE',
 'CRISPIX',
 'DANNON',
 'DARE',
 'DELI',
 'DIET COKE',
 'DIGIORNO',
 'DOLE',
 'DR PEPPER',
 'EDWARDS',
 "EGGLAND'S BEST",
 'EGGO',
 'EL MONTEREY',
 'ENERGIZER MAX',
 'ESSENTIAL EVERYDAY',
 'FAGE',
 "FAMOUS DAVE'S",
 "FLORIDA'S NATURAL",
 'FOLGERS',
 'FORTUNE YAKISOBA',
 'FRANZ',
 "FRENCH'S",
 'FRESH EXPRESS',
 'FRESH STEP',
 'FRONTERA',
 'GALLO FAMILY VINEYARDS',
 'GENERAL MILLS',
 'GERBER',
 'GERM-X',
 'GREEN GIANT',
 'GRIMMWAY FARMS',
 'HANOVER',
 'HARVEST SNA

### Are they actually missing??? 
Ans: No

Example:
1) '7UP' present in receipt rewards table; and '7 up' is present in brands_df => which are actually the same brands. There is a need to maintain a consistent structure in the brandCodes. 
2) "BEN AND JERRYS" and "BEN & JERRY'S" in rewards_receipts_df and brands_df respectively.

There are multiple examples like these which proves the lack of consistency in determining BrandCodes

### Loading the dfs into MYSQL

In [281]:
users_df_no_duplicates.head()

Unnamed: 0,active,role,signUpSource,state,user_id,created_date_time,lastLogin_date_time
0,True,consumer,Email,WI,5ff1e194b6a9d73a3a9f1052,2021-01-03 09:24:04.800,2021-01-03 09:25:37.858
3,True,consumer,Email,WI,5ff1e1eacfcf6c399c274ae6,2021-01-03 09:25:30.554,2021-01-03 09:25:30.597
6,True,consumer,Email,WI,5ff1e1e8cfcf6c399c274ad9,2021-01-03 09:25:28.354,2021-01-03 09:25:28.392
7,True,consumer,Email,WI,5ff1e1b7cfcf6c399c274a5a,2021-01-03 09:24:39.626,2021-01-03 09:24:39.665
9,True,consumer,Email,WI,5ff1e1f1cfcf6c399c274b0b,2021-01-03 09:25:37.564,2021-01-03 09:25:37.599


In [282]:
brands_df.head()

Unnamed: 0,barcode,category,categoryCode,name,topBrand,brand_id,cpg_id,cpg_ref,brandCode
0,511111019862,Baking,BAKING,test brand @1612366101024,False,601ac115be37ce2ead437551,601ac114be37ce2ead437550,Cogs,
1,511111519928,Beverages,BEVERAGES,Starbucks,False,601c5460be37ce2ead43755f,5332f5fbe4b03c9a25efd0ba,Cogs,STARBUCKS
2,511111819905,Baking,BAKING,test brand @1612366146176,False,601ac142be37ce2ead43755d,601ac142be37ce2ead437559,Cogs,TEST BRANDCODE @1612366146176
3,511111519874,Baking,BAKING,test brand @1612366146051,False,601ac142be37ce2ead43755a,601ac142be37ce2ead437559,Cogs,TEST BRANDCODE @1612366146051
4,511111319917,Candy & Sweets,CANDY_AND_SWEETS,test brand @1612366146827,False,601ac142be37ce2ead43755e,5332fa12e4b03c9a25efd1e7,Cogs,TEST BRANDCODE @1612366146827


In [288]:
new_receipts_df = receipts_df.drop(columns = 'rewardsReceiptItemList')
new_receipts_df.head()

Unnamed: 0,bonusPointsEarned,bonusPointsEarnedReason,pointsEarned,purchasedItemCount,rewardsReceiptStatus,totalSpent,userId,receipt_id,created_date_time,scanned_date_time,finished_date_time,modify_date_time,pointsAwarded_date_time,purchased_date_time
0,500.0,"Receipt number 2 completed, bonus point schedu...",500.0,5.0,FINISHED,26.0,5ff1e1eacfcf6c399c274ae6,5ff1e1eb0a720f0523000575,2021-01-03 09:25:31.000,2021-01-03 09:25:31.000,2021-01-03 09:25:31.000,2021-01-03 09:25:36.000,2021-01-03 09:25:31,2021-01-02 18:00:00
1,150.0,"Receipt number 5 completed, bonus point schedu...",150.0,2.0,FINISHED,11.0,5ff1e194b6a9d73a3a9f1052,5ff1e1bb0a720f052300056b,2021-01-03 09:24:43.000,2021-01-03 09:24:43.000,2021-01-03 09:24:43.000,2021-01-03 09:24:48.000,2021-01-03 09:24:43,2021-01-02 09:24:43
2,5.0,All-receipts receipt bonus,5.0,1.0,REJECTED,10.0,5ff1e1f1cfcf6c399c274b0b,5ff1e1f10a720f052300057a,2021-01-03 09:25:37.000,2021-01-03 09:25:37.000,,2021-01-03 09:25:42.000,,2021-01-02 18:00:00
3,5.0,All-receipts receipt bonus,5.0,4.0,FINISHED,28.0,5ff1e1eacfcf6c399c274ae6,5ff1e1ee0a7214ada100056f,2021-01-03 09:25:34.000,2021-01-03 09:25:34.000,2021-01-03 09:25:34.000,2021-01-03 09:25:39.000,2021-01-03 09:25:34,2021-01-02 18:00:00
4,5.0,All-receipts receipt bonus,5.0,2.0,FINISHED,1.0,5ff1e194b6a9d73a3a9f1052,5ff1e1d20a7214ada1000561,2021-01-03 09:25:06.000,2021-01-03 09:25:06.000,2021-01-03 09:25:11.000,2021-01-03 09:25:11.000,2021-01-03 09:25:06,2021-01-02 09:25:06


In [289]:
rewards_receipts_df.head()

Unnamed: 0,barcode,description,finalPrice,itemPrice,needsFetchReview,partnerItemId,preventTargetGapPoints,quantityPurchased,userFlaggedBarcode,userFlaggedNewItem,userFlaggedPrice,userFlaggedQuantity,needsFetchReviewReason,pointsNotAwardedReason,pointsPayerId,rewardsGroup,rewardsProductPartnerId,userFlaggedDescription,originalMetaBriteBarcode,originalMetaBriteDescription,brandCode,competitorRewardsGroup,discountedItemPrice,originalReceiptItemText,itemNumber,originalMetaBriteQuantityPurchased,pointsEarned,targetPrice,competitiveProduct,originalFinalPrice,originalMetaBriteItemPrice,deleted,priceAfterCoupon,metabriteCampaignId,receipt_id
0,4011.0,ITEM NOT FOUND,26.0,26.0,False,1,True,5.0,4011.0,True,26.0,5.0,,,,,,,,,,,,,,,,,,,,,,,5ff1e1eb0a720f0523000575
1,4011.0,ITEM NOT FOUND,1.0,1.0,,1,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,5ff1e1bb0a720f052300056b
2,28400642255.0,DORITOS TORTILLA CHIP SPICY SWEET CHILI REDUCE...,10.0,10.0,True,2,True,1.0,28400640000.0,True,10.0,1.0,USER_FLAGGED,Action not allowed for user and CPG,5332f5fbe4b03c9a25efd0ba,DORITOS SPICY SWEET CHILI SINGLE SERVE,5332f5fbe4b03c9a25efd0ba,DORITOS TORTILLA CHIP SPICY SWEET CHILI REDUCE...,,,,,,,,,,,,,,,,,5ff1e1bb0a720f052300056b
3,,,,,False,1,True,,4011.0,True,26.0,3.0,,,,,,,,,,,,,,,,,,,,,,,5ff1e1f10a720f052300057a
4,4011.0,ITEM NOT FOUND,28.0,28.0,False,1,True,4.0,4011.0,True,28.0,4.0,,,,,,,,,,,,,,,,,,,,,,,5ff1e1ee0a7214ada100056f


In [292]:
# Function to read YAML file
def read_yaml(file_path):
    with open(file_path, 'r') as file:
        try:
            # Load the YAML content into a Python dictionary
            data = yaml.safe_load(file)
            return data
        except yaml.YAMLError as exc:
            print(f"Error reading YAML file: {exc}")
            return None

In [301]:
file_path = os.path.join(root_dir,'MYSQL_user_config.yaml')
config = read_yaml(file_path)

user = config.get('MYSQL_credentials')['user']
password = config.get('MYSQL_credentials')['password']
host = config.get('MYSQL_credentials')['host']
port = config.get('MYSQL_credentials')['port']

##### Make sure that MYSQL 'services' is running locally (Can always check witn Win + R and searching for services.msc) and starting MYSQL

In [308]:
def create_database(db_url, db_name):
    try:
        # Connect to MySQL using SQLAlchemy's engine
        engine = create_engine(db_url)
        with engine.connect() as connection:
            # Execute the command to create a database
            connection.execute(text(f"CREATE DATABASE IF NOT EXISTS {db_name}"))
            print(f"Database '{db_name}' created successfully!")
    except Exception as e:
        print(f"Error while creating database: {e}")

db_url = f'mysql+pymysql://{user}:{password}@{host}:{port}'
db_name = "FETCH_DB"


create_database(db_url, db_name)

Database 'FETCH_DB' created successfully!


In [309]:
def run_query(query):
    try:
        with engine.connect() as connection:
            result = connection.execute(text(query))
            return result.fetchall()
    except Exception as e:
        print(f"Error in the query -> {e}")

In [311]:
##Engine creation for data loading
engine = create_engine(f'{db_url}'+'/'+f'{db_name}')

In [314]:
users_df_no_duplicates.head()

Unnamed: 0,active,role,signUpSource,state,user_id,created_date_time,lastLogin_date_time
0,True,consumer,Email,WI,5ff1e194b6a9d73a3a9f1052,2021-01-03 09:24:04.800,2021-01-03 09:25:37.858
3,True,consumer,Email,WI,5ff1e1eacfcf6c399c274ae6,2021-01-03 09:25:30.554,2021-01-03 09:25:30.597
6,True,consumer,Email,WI,5ff1e1e8cfcf6c399c274ad9,2021-01-03 09:25:28.354,2021-01-03 09:25:28.392
7,True,consumer,Email,WI,5ff1e1b7cfcf6c399c274a5a,2021-01-03 09:24:39.626,2021-01-03 09:24:39.665
9,True,consumer,Email,WI,5ff1e1f1cfcf6c399c274b0b,2021-01-03 09:25:37.564,2021-01-03 09:25:37.599


In [323]:
users_df_no_duplicates[~users_df_no_duplicates['role'].isna()]['role'].apply(lambda x:len(x)).max()

11

In [360]:
users_df_no_duplicates['active'] = users_df_no_duplicates['active'].apply(lambda x:str(x).upper())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  users_df_no_duplicates['active'] = users_df_no_duplicates['active'].apply(lambda x:str(x).upper())


In [325]:
create_user_df_query = """
CREATE TABLE users (
    user_id VARCHAR(24) PRIMARY KEY,
    created_date_time DATETIME(3),
    lastLogin_date_time DATETIME(3),
    state VARCHAR(2),
    signUpSource VARCHAR(15),
    role VARCHAR(15),
    active BOOLEAN
)
"""
run_query('DROP TABLE IF EXISTS users')
run_query(create_user_df_query)

Error in the query -> This result object does not return rows. It has been closed automatically.


In [326]:
brands_df.head()

Unnamed: 0,barcode,category,categoryCode,name,topBrand,brand_id,cpg_id,cpg_ref,brandCode
0,511111019862,Baking,BAKING,test brand @1612366101024,False,601ac115be37ce2ead437551,601ac114be37ce2ead437550,Cogs,
1,511111519928,Beverages,BEVERAGES,Starbucks,False,601c5460be37ce2ead43755f,5332f5fbe4b03c9a25efd0ba,Cogs,STARBUCKS
2,511111819905,Baking,BAKING,test brand @1612366146176,False,601ac142be37ce2ead43755d,601ac142be37ce2ead437559,Cogs,TEST BRANDCODE @1612366146176
3,511111519874,Baking,BAKING,test brand @1612366146051,False,601ac142be37ce2ead43755a,601ac142be37ce2ead437559,Cogs,TEST BRANDCODE @1612366146051
4,511111319917,Candy & Sweets,CANDY_AND_SWEETS,test brand @1612366146827,False,601ac142be37ce2ead43755e,5332fa12e4b03c9a25efd1e7,Cogs,TEST BRANDCODE @1612366146827


In [331]:
brands_df[~brands_df['brandCode'].isna()]['brandCode'].apply(lambda x:len(x)).max()

46

In [362]:
brands_df['topBrand'] = brands_df['topBrand'].apply(lambda x:str(x).upper())

In [336]:
create_brands_df_query = """
CREATE TABLE brands (
    brand_id VARCHAR(24) PRIMARY KEY,
    name VARCHAR(25),
    category VARCHAR(25),
    categoryCode VARCHAR(25),
    barcode VARCHAR(20),
    cpg_id VARCHAR(6),
    cpg_ref VARCHAR(6),
    brandCode VARCHAR(50),
    topBrand BOOLEAN
);
"""
run_query('DROP TABLE IF EXISTS brands')
run_query(create_brands_df_query)

Error in the query -> This result object does not return rows. It has been closed automatically.
Error in the query -> This result object does not return rows. It has been closed automatically.


In [337]:
new_receipts_df.head()

Unnamed: 0,bonusPointsEarned,bonusPointsEarnedReason,pointsEarned,purchasedItemCount,rewardsReceiptStatus,totalSpent,userId,receipt_id,created_date_time,scanned_date_time,finished_date_time,modify_date_time,pointsAwarded_date_time,purchased_date_time
0,500.0,"Receipt number 2 completed, bonus point schedu...",500.0,5.0,FINISHED,26.0,5ff1e1eacfcf6c399c274ae6,5ff1e1eb0a720f0523000575,2021-01-03 09:25:31.000,2021-01-03 09:25:31.000,2021-01-03 09:25:31.000,2021-01-03 09:25:36.000,2021-01-03 09:25:31,2021-01-02 18:00:00
1,150.0,"Receipt number 5 completed, bonus point schedu...",150.0,2.0,FINISHED,11.0,5ff1e194b6a9d73a3a9f1052,5ff1e1bb0a720f052300056b,2021-01-03 09:24:43.000,2021-01-03 09:24:43.000,2021-01-03 09:24:43.000,2021-01-03 09:24:48.000,2021-01-03 09:24:43,2021-01-02 09:24:43
2,5.0,All-receipts receipt bonus,5.0,1.0,REJECTED,10.0,5ff1e1f1cfcf6c399c274b0b,5ff1e1f10a720f052300057a,2021-01-03 09:25:37.000,2021-01-03 09:25:37.000,,2021-01-03 09:25:42.000,,2021-01-02 18:00:00
3,5.0,All-receipts receipt bonus,5.0,4.0,FINISHED,28.0,5ff1e1eacfcf6c399c274ae6,5ff1e1ee0a7214ada100056f,2021-01-03 09:25:34.000,2021-01-03 09:25:34.000,2021-01-03 09:25:34.000,2021-01-03 09:25:39.000,2021-01-03 09:25:34,2021-01-02 18:00:00
4,5.0,All-receipts receipt bonus,5.0,2.0,FINISHED,1.0,5ff1e194b6a9d73a3a9f1052,5ff1e1d20a7214ada1000561,2021-01-03 09:25:06.000,2021-01-03 09:25:06.000,2021-01-03 09:25:11.000,2021-01-03 09:25:11.000,2021-01-03 09:25:06,2021-01-02 09:25:06


In [341]:
create_receipts_df_query = """
CREATE TABLE receipts (
    receipt_id VARCHAR(25) PRIMARY KEY,
    created_date_time DATETIME(3),
    scanned_date_time DATETIME(3),
    finished_date_time DATETIME(3),
    modify_date_time DATETIME(3),
    purchased_date_time DATETIME(3),
    pointsAwarded_date_time DATETIME(3),
    bonusPointsEarned DECIMAL(7,3),
    bonusPointsEarnedReason VARCHAR(35),
    pointsEarned DECIMAL(7,3),
    purchasedItemCount INT(7),
    rewardsReceiptStatus VARCHAR(10),
    totalSpent DECIMAL(7,3)
);
"""
run_query('DROP TABLE IF EXISTS receipts')
run_query(create_receipts_df_query)

Error in the query -> This result object does not return rows. It has been closed automatically.
Error in the query -> This result object does not return rows. It has been closed automatically.


In [342]:
rewards_receipts_df.columns

Index(['barcode', 'description', 'finalPrice', 'itemPrice', 'needsFetchReview',
       'partnerItemId', 'preventTargetGapPoints', 'quantityPurchased',
       'userFlaggedBarcode', 'userFlaggedNewItem', 'userFlaggedPrice',
       'userFlaggedQuantity', 'needsFetchReviewReason',
       'pointsNotAwardedReason', 'pointsPayerId', 'rewardsGroup',
       'rewardsProductPartnerId', 'userFlaggedDescription',
       'originalMetaBriteBarcode', 'originalMetaBriteDescription', 'brandCode',
       'competitorRewardsGroup', 'discountedItemPrice',
       'originalReceiptItemText', 'itemNumber',
       'originalMetaBriteQuantityPurchased', 'pointsEarned', 'targetPrice',
       'competitiveProduct', 'originalFinalPrice',
       'originalMetaBriteItemPrice', 'deleted', 'priceAfterCoupon',
       'metabriteCampaignId', 'receipt_id'],
      dtype='object')

In [363]:
rewards_receipts_df['needsFetchReview'] = rewards_receipts_df['needsFetchReview'].apply(lambda x:str(x).upper())
rewards_receipts_df['preventTargetGapPoints'] = rewards_receipts_df['preventTargetGapPoints'].apply(lambda x:str(x).upper())
rewards_receipts_df['userFlaggedNewItem'] = rewards_receipts_df['userFlaggedNewItem'].apply(lambda x:str(x).upper())
rewards_receipts_df['competitiveProduct'] = rewards_receipts_df['competitiveProduct'].apply(lambda x:str(x).upper())
rewards_receipts_df['deleted'] = rewards_receipts_df['deleted'].apply(lambda x:str(x).upper())



In [366]:
rewards_receipts_df['description_first_word'] = rewards_receipts_df['description'].apply(lambda x: x.split()[0] if pd.notna(x) else np.nan)

In [358]:
create_rewards_receipts_df_query = """
CREATE TABLE rewards_receipts (
    barcode VARCHAR(20),
    description VARCHAR(50),
    finalPrice DECIMAL(8,3),
    itemPrice DECIMAL(8,3),
    needsFetchReview Boolean,
    partnerItemId INT,
    preventTargetGapPoints Boolean,
    quantityPurchased DECIMAL(8,3),
    userFlaggedBarcode VARCHAR(20),
    userFlaggedNewItem BOOLEAN,
    userFlaggedPrice DECIMAL(8,3),
    userFlaggedQuantity DECIMAL(8,3),
    needsFetchReviewReason VARCHAR(20),
    pointsNotAwardedReason VARCHAR(50),
    pointsPayerId VARCHAR(25),
    rewardsGroup VARCHAR(20),
    rewardsProductPartnerId VARCHAR(25),
    userFlaggedDescription VARCHAR(50),
    originalMetaBriteBarcode VARCHAR(20),
    originalMetaBriteDescription VARCHAR(50),
    brandCode VARCHAR(20),
    competitorRewardsGroup VARCHAR(30),
    discountedItemPrice DECIMAL(8,3),
    originalReceiptItemText VARCHAR(50),
    itemNumber INT,
    originalMetaBriteQuantityPurchased DECIMAL(8,3),
    pointsEarned DECIMAL(8,3),
    targetPrice DECIMAL(8,3),
    competitiveProduct BOOLEAN,
    originalFinalPrice DECIMAL(8,3),
    originalMetaBriteItemPrice DECIMAL(8,3),
    deleted BOOLEAN,
    priceAfterCoupon DECIMAL(8,3),
    metabriteCampaignId VARCHAR(50),
    receipt_id VARCHAR(25),
    description_first_word VARCHAR(25),
    PRIMARY KEY (receipt_id,partnerItemId)    
);
"""
run_query('DROP TABLE IF EXISTS rewards_receipts')
run_query(create_rewards_receipts_df_query)

Error in the query -> This result object does not return rows. It has been closed automatically.
Error in the query -> This result object does not return rows. It has been closed automatically.


In [373]:
### Time to load tables

#user_df
users_df_no_duplicates.to_sql('users',con = engine, if_exists = 'replace',index = False)

#brands_df
brands_df.to_sql('brands',con = engine, if_exists = 'replace',index = False)

#receipts_df
new_receipts_df.to_sql('receipts',con = engine, if_exists = 'replace',index = False)

#rewards_receipts_df
rewards_receipts_df.to_sql('rewards_receipts',con = engine, if_exists = 'replace',index = False)

6941

In [381]:
rewards_receipts_df[(rewards_receipts_df['pointsEarned'] < 870)&(rewards_receipts_df['pointsEarned'] > 100)].head()

Unnamed: 0,barcode,description,finalPrice,itemPrice,needsFetchReview,partnerItemId,preventTargetGapPoints,quantityPurchased,userFlaggedBarcode,userFlaggedNewItem,userFlaggedPrice,userFlaggedQuantity,needsFetchReviewReason,pointsNotAwardedReason,pointsPayerId,rewardsGroup,rewardsProductPartnerId,userFlaggedDescription,originalMetaBriteBarcode,originalMetaBriteDescription,brandCode,competitorRewardsGroup,discountedItemPrice,originalReceiptItemText,itemNumber,originalMetaBriteQuantityPurchased,pointsEarned,targetPrice,competitiveProduct,originalFinalPrice,originalMetaBriteItemPrice,deleted,priceAfterCoupon,metabriteCampaignId,receipt_id,description_first_word
623,76840100354,BEN & JERRYS FROZEN CHUNKY MONKEY ICE CREAM RE...,16.6,16.6,NAN,1011,NAN,4.0,,NAN,,,,,5332f5f6e4b03c9a25efd0b4,BEN AND JERRYS ICE CREAM,5332f5f6e4b03c9a25efd0b4,,,,BEN AND JERRYS,,16.6,BEN & JERRY' S CHUNKY MONKEY PINT,,,166.0,,NAN,,,NAN,,BEN AND JERRYS ICE CREAM,6000b2be0a7214ad4c00004d,BEN
630,76840100354,BEN & JERRYS FROZEN CHUNKY MONKEY ICE CREAM RE...,21.3,21.3,NAN,1030,NAN,5.0,,NAN,,,,,5332f5f6e4b03c9a25efd0b4,BEN AND JERRYS ICE CREAM,5332f5f6e4b03c9a25efd0b4,,,,BEN AND JERRYS,,21.3,BEN 6 JERRY.S CHUN*Y MONKEY PINT,,,213.0,,NAN,,,NAN,,BEN AND JERRYS ICE CREAM,6000b2be0a7214ad4c00004d,BEN
634,76840580750,Ben & Jerry's Chunky Monkey Non-Dairy Frozen D...,15.3,15.3,NAN,1017,NAN,3.0,,NAN,,,,,5332f5f6e4b03c9a25efd0b4,BEN AND JERRYS ICE CREAM,5332f5f6e4b03c9a25efd0b4,,,,BEN AND JERRYS,,15.3,BEN & JERRY' S CHUNKY MONKEY PINT,,,153.0,,NAN,,,NAN,,BEN AND JERRYS ICE CREAM,6000c8460a720f05f300006f,Ben
635,76840580750,Ben & Jerry's Chunky Monkey Non-Dairy Frozen D...,28.7,28.7,NAN,1020,NAN,5.0,,NAN,,,,,5332f5f6e4b03c9a25efd0b4,BEN AND JERRYS ICE CREAM,5332f5f6e4b03c9a25efd0b4,,,,BEN AND JERRYS,,28.7,BEN & JERRY' S CHUNKY MONKEY PINT,,,287.0,,NAN,,,NAN,,BEN AND JERRYS ICE CREAM,6000c8460a720f05f300006f,Ben
636,76840100354,BEN & JERRYS FROZEN CHUNKY MONKEY ICE CREAM RE...,21.44,21.44,NAN,1023,NAN,4.0,,NAN,,,,,5332f5f6e4b03c9a25efd0b4,BEN AND JERRYS ICE CREAM,5332f5f6e4b03c9a25efd0b4,,,,BEN AND JERRYS,,21.44,BEN & JERRY' S CHUNKY MONKEY PINT,,,214.4,,NAN,,,NAN,,BEN AND JERRYS ICE CREAM,6000c8460a720f05f300006f,BEN


In [386]:
receipts_df['scanned_date_time'].dt.strftime('%Y-%m')

AttributeError: Can only use .dt accessor with datetimelike values

In [396]:
receipts_df['scan_check'] = pd.to_datetime(receipts_df['scanned_date_time'], format='%Y-%m')
receipts_df['scan_check'] = receipts_df['scan_check'].dt.strftime('%Y-%m')
receipts_df['scan_check'].head()

0    2021-01
1    2021-01
2    2021-01
3    2021-01
4    2021-01
Name: scan_check, dtype: object

In [407]:
rewards_receipts_df[rewards_receipts_df['receipt_id']=='603cbbb50a720fde100003e3']

Unnamed: 0,barcode,description,finalPrice,itemPrice,needsFetchReview,partnerItemId,preventTargetGapPoints,quantityPurchased,userFlaggedBarcode,userFlaggedNewItem,userFlaggedPrice,userFlaggedQuantity,needsFetchReviewReason,pointsNotAwardedReason,pointsPayerId,rewardsGroup,rewardsProductPartnerId,userFlaggedDescription,originalMetaBriteBarcode,originalMetaBriteDescription,brandCode,competitorRewardsGroup,discountedItemPrice,originalReceiptItemText,itemNumber,originalMetaBriteQuantityPurchased,pointsEarned,targetPrice,competitiveProduct,originalFinalPrice,originalMetaBriteItemPrice,deleted,priceAfterCoupon,metabriteCampaignId,receipt_id,description_first_word
6915,B076FJ92M4,mueller austria hypergrind precision electric ...,22.97,22.97,NAN,0,NAN,1.0,,NAN,,,,,,,,,,,,,22.97,mueller austria hypergrind precision electric ...,,,,,NAN,,,NAN,22.97,,603cbbb50a720fde100003e3,mueller
6916,B07BRRLSVC,thindust summer face mask - sun protection nec...,11.99,11.99,NAN,1,NAN,1.0,,NAN,,,,,,,,,,,,,11.99,thindust summer face mask - sun protection nec...,,,,,NAN,,,NAN,11.99,,603cbbb50a720fde100003e3,thindust


In [406]:
receipts_df[receipts_df['scan_check'] == '2021-03'].head()

Unnamed: 0,bonusPointsEarned,bonusPointsEarnedReason,pointsEarned,purchasedItemCount,rewardsReceiptItemList,rewardsReceiptStatus,totalSpent,userId,receipt_id,created_date_time,scanned_date_time,finished_date_time,modify_date_time,pointsAwarded_date_time,purchased_date_time,scan_check
1089,25.0,COMPLETE_NONPARTNER_RECEIPT,25.0,2.0,"[{'barcode': 'B076FJ92M4', 'description': 'mue...",REJECTED,34.96,5fc961c3b8cfca11a077dd33,603cbbb50a720fde100003e3,2021-03-01 04:02:29.000,2021-03-01 04:02:29.000,,2021-03-01 04:02:30.000,,2020-08-16 19:00:00,2021-03
1090,,,,,,SUBMITTED,,5fc961c3b8cfca11a077dd33,603cadab0a720fde100003d6,2021-03-01 03:02:35.175,2021-03-01 03:02:35.175,,2021-03-01 03:02:35.175,,,2021-03
1091,,,,,,SUBMITTED,,5fc961c3b8cfca11a077dd33,603c8cdf0a7217c72c0003bb,2021-03-01 00:42:39.190,2021-03-01 00:42:39.190,,2021-03-01 00:42:39.190,,,2021-03
1092,25.0,COMPLETE_NONPARTNER_RECEIPT,25.0,2.0,"[{'barcode': 'B076FJ92M4', 'description': 'mue...",REJECTED,34.96,5fc961c3b8cfca11a077dd33,603d30e60a7217c72c00043f,2021-03-01 12:22:30.000,2021-03-01 12:22:30.000,,2021-03-01 12:22:31.000,,2020-08-16 19:00:00,2021-03
1094,25.0,COMPLETE_NONPARTNER_RECEIPT,25.0,2.0,"[{'barcode': 'B076FJ92M4', 'description': 'mue...",REJECTED,34.96,5fc961c3b8cfca11a077dd33,603d5d6c0a7217c72c000463,2021-03-01 15:32:28.000,2021-03-01 15:32:28.000,,2021-03-01 15:32:29.000,,2020-08-16 19:00:00,2021-03
