In [308]:
import pandas as pd
import numpy as np
import os
import json
import jsonlines
import datetime

pd.set_option('display.max_rows', 100)  # Set maximum rows to 100
pd.set_option('display.max_columns', 20) # Set maximum columns to 20

In [309]:
#pip install jsonlines

In [310]:
root_dir = os.getcwd()
data_src_dir = os.path.join(root_dir, 'data_sources')

We have three data sources in JSON format:
Brands, Receipts, and Users.

Each JSON file contains multiple JSON objects (dictionaries), one per line, rather than a single comma-separated JSON array (the JSON Lines format), so the standard `json` package cannot parse a whole file in one call. To address this, we use the `jsonlines` library, which reads such files one object at a time.

#### Parsing Brands

In [311]:
brand_json_file = os.path.join(data_src_dir, 'brands.json')

# The reader yields one decoded JSON object per line; collect them all into a list.
with jsonlines.open(brand_json_file) as reader:
    brands_raw_data = list(reader)

# Peek at the first two brand records.
brands_raw_data[:2]

[{'_id': {'$oid': '601ac115be37ce2ead437551'},
  'barcode': '511111019862',
  'category': 'Baking',
  'categoryCode': 'BAKING',
  'cpg': {'$id': {'$oid': '601ac114be37ce2ead437550'}, '$ref': 'Cogs'},
  'name': 'test brand @1612366101024',
  'topBrand': False},
 {'_id': {'$oid': '601c5460be37ce2ead43755f'},
  'barcode': '511111519928',
  'brandCode': 'STARBUCKS',
  'category': 'Beverages',
  'categoryCode': 'BEVERAGES',
  'cpg': {'$id': {'$oid': '5332f5fbe4b03c9a25efd0ba'}, '$ref': 'Cogs'},
  'name': 'Starbucks',
  'topBrand': False}]

#### Parsing Receipts

In [312]:
receipts_json_file = os.path.join(data_src_dir, 'receipts.json')

# The reader yields one decoded JSON object per line; collect them all into a list.
with jsonlines.open(receipts_json_file) as reader:
    receipts_raw_data = list(reader)

# Peek at the first receipt record.
receipts_raw_data[:1]

[{'_id': {'$oid': '5ff1e1eb0a720f0523000575'},
  'bonusPointsEarned': 500,
  'bonusPointsEarnedReason': 'Receipt number 2 completed, bonus point schedule DEFAULT (5cefdcacf3693e0b50e83a36)',
  'createDate': {'$date': 1609687531000},
  'dateScanned': {'$date': 1609687531000},
  'finishedDate': {'$date': 1609687531000},
  'modifyDate': {'$date': 1609687536000},
  'pointsAwardedDate': {'$date': 1609687531000},
  'pointsEarned': '500.0',
  'purchaseDate': {'$date': 1609632000000},
  'purchasedItemCount': 5,
  'rewardsReceiptItemList': [{'barcode': '4011',
    'description': 'ITEM NOT FOUND',
    'finalPrice': '26.00',
    'itemPrice': '26.00',
    'needsFetchReview': False,
    'partnerItemId': '1',
    'preventTargetGapPoints': True,
    'quantityPurchased': 5,
    'userFlaggedBarcode': '4011',
    'userFlaggedNewItem': True,
    'userFlaggedPrice': '26.00',
    'userFlaggedQuantity': 5}],
  'rewardsReceiptStatus': 'FINISHED',
  'totalSpent': '26.00',
  'userId': '5ff1e1eacfcf6c399c274ae6

#### Parsing Users

In [313]:
users_json_file = os.path.join(data_src_dir, 'users.json')

# The reader yields one decoded JSON object per line; collect them all into a list.
with jsonlines.open(users_json_file) as reader:
    users_raw_data = list(reader)

# Peek at the first two user records.
users_raw_data[:2]

[{'_id': {'$oid': '5ff1e194b6a9d73a3a9f1052'},
  'active': True,
  'createdDate': {'$date': 1609687444800},
  'lastLogin': {'$date': 1609687537858},
  'role': 'consumer',
  'signUpSource': 'Email',
  'state': 'WI'},
 {'_id': {'$oid': '5ff1e194b6a9d73a3a9f1052'},
  'active': True,
  'createdDate': {'$date': 1609687444800},
  'lastLogin': {'$date': 1609687537858},
  'role': 'consumer',
  'signUpSource': 'Email',
  'state': 'WI'}]

#### Normalizing Users table

In [314]:
users_df = pd.json_normalize(users_raw_data, sep = '_')
users_df.head(5)

Unnamed: 0,active,role,signUpSource,state,id_$oid,createdDate_$date,lastLogin_$date
0,True,consumer,Email,WI,5ff1e194b6a9d73a3a9f1052,1609687444800,1609688000000.0
1,True,consumer,Email,WI,5ff1e194b6a9d73a3a9f1052,1609687444800,1609688000000.0
2,True,consumer,Email,WI,5ff1e194b6a9d73a3a9f1052,1609687444800,1609688000000.0
3,True,consumer,Email,WI,5ff1e1eacfcf6c399c274ae6,1609687530554,1609688000000.0
4,True,consumer,Email,WI,5ff1e194b6a9d73a3a9f1052,1609687444800,1609688000000.0


In [315]:
users_df = users_df.rename(columns = {'id_$oid':'user_id','createdDate_$date':'created_date_time','lastLogin_$date':'lastLogin_date_time'})
users_df.head(5)

Unnamed: 0,active,role,signUpSource,state,user_id,created_date_time,lastLogin_date_time
0,True,consumer,Email,WI,5ff1e194b6a9d73a3a9f1052,1609687444800,1609688000000.0
1,True,consumer,Email,WI,5ff1e194b6a9d73a3a9f1052,1609687444800,1609688000000.0
2,True,consumer,Email,WI,5ff1e194b6a9d73a3a9f1052,1609687444800,1609688000000.0
3,True,consumer,Email,WI,5ff1e1eacfcf6c399c274ae6,1609687530554,1609688000000.0
4,True,consumer,Email,WI,5ff1e194b6a9d73a3a9f1052,1609687444800,1609688000000.0


An observation about the date format: the date fields are Unix timestamps in milliseconds, i.e. the number of milliseconds that have passed since Jan-01-1970 00:00:00 (UTC). We therefore convert them to `datetime` values for better readability. Note that `datetime.fromtimestamp` returns local time by default, so the displayed values depend on the time zone of the machine running the notebook.

Some of the `lastLogin_date_time` values are missing (NaN). To keep the data type consistent within the column, the conversion maps NaN values to NaT (Not a Time), which is the missing-value marker for datetime columns. This ensures that the entire column remains in a proper datetime format.

In [316]:
def unix_to_date_convert(unix_ts, tz=None):
    """Convert a Unix timestamp given in milliseconds to a ``datetime``.

    Parameters
    ----------
    unix_ts : int or float
        Milliseconds elapsed since 1970-01-01 00:00:00 UTC.
    tz : datetime.tzinfo, optional
        Time zone of the returned value. With the default ``None`` the result
        is a naive datetime expressed in the local time zone of the machine
        running the code. Pass ``datetime.timezone.utc`` to get an aware
        datetime that does not depend on the machine.

    Returns
    -------
    datetime.datetime or pandas.NaT
        The converted timestamp, or ``pd.NaT`` when ``unix_ts`` is missing
        (None / NaN), not numeric, or outside the range the platform supports.
    """
    try:
        seconds = unix_ts / 1000
        return datetime.datetime.fromtimestamp(seconds, tz)
    except (TypeError, ValueError, OverflowError, OSError):
        # Only conversion failures mean "no usable date". Anything else
        # (e.g. KeyboardInterrupt) must propagate instead of becoming NaT.
        return pd.NaT

In [317]:
# Convert the epoch-millisecond columns to datetimes.
# A column is converted only while it still holds raw numbers: applying
# unix_to_date_convert to already-converted values fails on every row, and the
# function's error handling would then silently turn the whole column into NaT
# if this cell were run a second time.
for ts_column in ('created_date_time', 'lastLogin_date_time'):
    if not pd.api.types.is_datetime64_any_dtype(users_df[ts_column]):
        users_df[ts_column] = users_df[ts_column].apply(unix_to_date_convert)
users_df.head()

Unnamed: 0,active,role,signUpSource,state,user_id,created_date_time,lastLogin_date_time
0,True,consumer,Email,WI,5ff1e194b6a9d73a3a9f1052,2021-01-03 09:24:04.800,2021-01-03 09:25:37.858
1,True,consumer,Email,WI,5ff1e194b6a9d73a3a9f1052,2021-01-03 09:24:04.800,2021-01-03 09:25:37.858
2,True,consumer,Email,WI,5ff1e194b6a9d73a3a9f1052,2021-01-03 09:24:04.800,2021-01-03 09:25:37.858
3,True,consumer,Email,WI,5ff1e1eacfcf6c399c274ae6,2021-01-03 09:25:30.554,2021-01-03 09:25:30.597
4,True,consumer,Email,WI,5ff1e194b6a9d73a3a9f1052,2021-01-03 09:24:04.800,2021-01-03 09:25:37.858


In [318]:
users_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 495 entries, 0 to 494
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   active               495 non-null    bool          
 1   role                 495 non-null    object        
 2   signUpSource         447 non-null    object        
 3   state                439 non-null    object        
 4   user_id              495 non-null    object        
 5   created_date_time    495 non-null    datetime64[ns]
 6   lastLogin_date_time  433 non-null    datetime64[ns]
dtypes: bool(1), datetime64[ns](2), object(4)
memory usage: 23.8+ KB


#### Normalizing Brands data

In [319]:
brands_df = pd.json_normalize(brands_raw_data, sep = '_')
brands_df.head()

Unnamed: 0,barcode,category,categoryCode,name,topBrand,id_$oid,cpg_$id_$oid,cpg_$ref,brandCode
0,511111019862,Baking,BAKING,test brand @1612366101024,False,601ac115be37ce2ead437551,601ac114be37ce2ead437550,Cogs,
1,511111519928,Beverages,BEVERAGES,Starbucks,False,601c5460be37ce2ead43755f,5332f5fbe4b03c9a25efd0ba,Cogs,STARBUCKS
2,511111819905,Baking,BAKING,test brand @1612366146176,False,601ac142be37ce2ead43755d,601ac142be37ce2ead437559,Cogs,TEST BRANDCODE @1612366146176
3,511111519874,Baking,BAKING,test brand @1612366146051,False,601ac142be37ce2ead43755a,601ac142be37ce2ead437559,Cogs,TEST BRANDCODE @1612366146051
4,511111319917,Candy & Sweets,CANDY_AND_SWEETS,test brand @1612366146827,False,601ac142be37ce2ead43755e,5332fa12e4b03c9a25efd1e7,Cogs,TEST BRANDCODE @1612366146827


In [320]:
brands_df = brands_df.rename(columns = {'id_$oid':'brand_id','cpg_$id_$oid' : 'cpg_id','cpg_$ref' : 'cpg_ref' })
brands_df.head(5)

Unnamed: 0,barcode,category,categoryCode,name,topBrand,brand_id,cpg_id,cpg_ref,brandCode
0,511111019862,Baking,BAKING,test brand @1612366101024,False,601ac115be37ce2ead437551,601ac114be37ce2ead437550,Cogs,
1,511111519928,Beverages,BEVERAGES,Starbucks,False,601c5460be37ce2ead43755f,5332f5fbe4b03c9a25efd0ba,Cogs,STARBUCKS
2,511111819905,Baking,BAKING,test brand @1612366146176,False,601ac142be37ce2ead43755d,601ac142be37ce2ead437559,Cogs,TEST BRANDCODE @1612366146176
3,511111519874,Baking,BAKING,test brand @1612366146051,False,601ac142be37ce2ead43755a,601ac142be37ce2ead437559,Cogs,TEST BRANDCODE @1612366146051
4,511111319917,Candy & Sweets,CANDY_AND_SWEETS,test brand @1612366146827,False,601ac142be37ce2ead43755e,5332fa12e4b03c9a25efd1e7,Cogs,TEST BRANDCODE @1612366146827


In [321]:
brands_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1167 entries, 0 to 1166
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   barcode       1167 non-null   object
 1   category      1012 non-null   object
 2   categoryCode  517 non-null    object
 3   name          1167 non-null   object
 4   topBrand      555 non-null    object
 5   brand_id      1167 non-null   object
 6   cpg_id        1167 non-null   object
 7   cpg_ref       1167 non-null   object
 8   brandCode     933 non-null    object
dtypes: object(9)
memory usage: 82.2+ KB


#### Normalizing receipts

In [322]:
receipts_df = pd.json_normalize(receipts_raw_data, sep = '_',max_level = 1)
receipts_df.head()

Unnamed: 0,bonusPointsEarned,bonusPointsEarnedReason,pointsEarned,purchasedItemCount,rewardsReceiptItemList,rewardsReceiptStatus,totalSpent,userId,_id_$oid,createDate_$date,dateScanned_$date,finishedDate_$date,modifyDate_$date,pointsAwardedDate_$date,purchaseDate_$date
0,500.0,"Receipt number 2 completed, bonus point schedu...",500.0,5.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,26.0,5ff1e1eacfcf6c399c274ae6,5ff1e1eb0a720f0523000575,1609687531000,1609687531000,1609688000000.0,1609687536000,1609688000000.0,1609632000000.0
1,150.0,"Receipt number 5 completed, bonus point schedu...",150.0,2.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,11.0,5ff1e194b6a9d73a3a9f1052,5ff1e1bb0a720f052300056b,1609687483000,1609687483000,1609687000000.0,1609687488000,1609687000000.0,1609601000000.0
2,5.0,All-receipts receipt bonus,5.0,1.0,"[{'needsFetchReview': False, 'partnerItemId': ...",REJECTED,10.0,5ff1e1f1cfcf6c399c274b0b,5ff1e1f10a720f052300057a,1609687537000,1609687537000,,1609687542000,,1609632000000.0
3,5.0,All-receipts receipt bonus,5.0,4.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,28.0,5ff1e1eacfcf6c399c274ae6,5ff1e1ee0a7214ada100056f,1609687534000,1609687534000,1609688000000.0,1609687539000,1609688000000.0,1609632000000.0
4,5.0,All-receipts receipt bonus,5.0,2.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,1.0,5ff1e194b6a9d73a3a9f1052,5ff1e1d20a7214ada1000561,1609687506000,1609687506000,1609688000000.0,1609687511000,1609688000000.0,1609601000000.0


In [323]:
receipts_df.columns

Index(['bonusPointsEarned', 'bonusPointsEarnedReason', 'pointsEarned',
       'purchasedItemCount', 'rewardsReceiptItemList', 'rewardsReceiptStatus',
       'totalSpent', 'userId', '_id_$oid', 'createDate_$date',
       'dateScanned_$date', 'finishedDate_$date', 'modifyDate_$date',
       'pointsAwardedDate_$date', 'purchaseDate_$date'],
      dtype='object')

In [324]:
# Replace the flattened Mongo-style column names (e.g. 'createDate_$date') with
# readable ones. Every timestamp column is given a '_date_time' suffix.
receipts_df = receipts_df.rename(
    columns={
        '_id_$oid': 'receipt_id',
        'createDate_$date': 'created_date_time',
        'dateScanned_$date': 'scanned_date_time',
        'finishedDate_$date': 'finished_date_time',
        'modifyDate_$date': 'modify_date_time',
        'pointsAwardedDate_$date': 'pointsAwarded_date_time',
        'purchaseDate_$date': 'purchased_date_time',
    }
)
receipts_df.head(5)

Unnamed: 0,bonusPointsEarned,bonusPointsEarnedReason,pointsEarned,purchasedItemCount,rewardsReceiptItemList,rewardsReceiptStatus,totalSpent,userId,receipt_id,created_date_time,scanned_date_time,finished_date_time,modify_date_time,pointsAwarded_date_time,purchased_date_time
0,500.0,"Receipt number 2 completed, bonus point schedu...",500.0,5.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,26.0,5ff1e1eacfcf6c399c274ae6,5ff1e1eb0a720f0523000575,1609687531000,1609687531000,1609688000000.0,1609687536000,1609688000000.0,1609632000000.0
1,150.0,"Receipt number 5 completed, bonus point schedu...",150.0,2.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,11.0,5ff1e194b6a9d73a3a9f1052,5ff1e1bb0a720f052300056b,1609687483000,1609687483000,1609687000000.0,1609687488000,1609687000000.0,1609601000000.0
2,5.0,All-receipts receipt bonus,5.0,1.0,"[{'needsFetchReview': False, 'partnerItemId': ...",REJECTED,10.0,5ff1e1f1cfcf6c399c274b0b,5ff1e1f10a720f052300057a,1609687537000,1609687537000,,1609687542000,,1609632000000.0
3,5.0,All-receipts receipt bonus,5.0,4.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,28.0,5ff1e1eacfcf6c399c274ae6,5ff1e1ee0a7214ada100056f,1609687534000,1609687534000,1609688000000.0,1609687539000,1609688000000.0,1609632000000.0
4,5.0,All-receipts receipt bonus,5.0,2.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,1.0,5ff1e194b6a9d73a3a9f1052,5ff1e1d20a7214ada1000561,1609687506000,1609687506000,1609688000000.0,1609687511000,1609688000000.0,1609601000000.0


In [325]:
## applying time conversion to the receipts dataframe
# Only columns named '*_date_time' that still hold raw epoch numbers are
# converted. Skipping columns that are already datetimes keeps the cell safe to
# re-run: a second pass over converted values would fail on every row and be
# silently replaced by NaT.
for column in receipts_df.columns:
    already_converted = pd.api.types.is_datetime64_any_dtype(receipts_df[column])
    if column.endswith('_date_time') and not already_converted:
        receipts_df[column] = receipts_df[column].apply(unix_to_date_convert)
receipts_df.head()

Unnamed: 0,bonusPointsEarned,bonusPointsEarnedReason,pointsEarned,purchasedItemCount,rewardsReceiptItemList,rewardsReceiptStatus,totalSpent,userId,receipt_id,created_date_time,scanned_date_time,finished_date_time,modify_date_time,pointsAwarded_date_time,purchased_date_time
0,500.0,"Receipt number 2 completed, bonus point schedu...",500.0,5.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,26.0,5ff1e1eacfcf6c399c274ae6,5ff1e1eb0a720f0523000575,2021-01-03 09:25:31,2021-01-03 09:25:31,2021-01-03 09:25:31,2021-01-03 09:25:36,2021-01-03 09:25:31,2021-01-02 18:00:00
1,150.0,"Receipt number 5 completed, bonus point schedu...",150.0,2.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,11.0,5ff1e194b6a9d73a3a9f1052,5ff1e1bb0a720f052300056b,2021-01-03 09:24:43,2021-01-03 09:24:43,2021-01-03 09:24:43,2021-01-03 09:24:48,2021-01-03 09:24:43,2021-01-02 09:24:43
2,5.0,All-receipts receipt bonus,5.0,1.0,"[{'needsFetchReview': False, 'partnerItemId': ...",REJECTED,10.0,5ff1e1f1cfcf6c399c274b0b,5ff1e1f10a720f052300057a,2021-01-03 09:25:37,2021-01-03 09:25:37,NaT,2021-01-03 09:25:42,NaT,2021-01-02 18:00:00
3,5.0,All-receipts receipt bonus,5.0,4.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,28.0,5ff1e1eacfcf6c399c274ae6,5ff1e1ee0a7214ada100056f,2021-01-03 09:25:34,2021-01-03 09:25:34,2021-01-03 09:25:34,2021-01-03 09:25:39,2021-01-03 09:25:34,2021-01-02 18:00:00
4,5.0,All-receipts receipt bonus,5.0,2.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,1.0,5ff1e194b6a9d73a3a9f1052,5ff1e1d20a7214ada1000561,2021-01-03 09:25:06,2021-01-03 09:25:06,2021-01-03 09:25:11,2021-01-03 09:25:11,2021-01-03 09:25:06,2021-01-02 09:25:06


In [326]:
receipts_df[receipts_df['rewardsReceiptItemList'].isna()].shape

(440, 15)

### 440 receipts have no `rewardsReceiptItemList` (the field is missing/NaN rather than an empty list)

In [327]:
#### Rewards receipts df

# json_normalize(record_path=...) needs the key on every record, so receipts
# without an item list get an empty one. This is done on shallow copies rather
# than on receipts_raw_data itself, so the raw records stay exactly as loaded
# and cells that normalise receipts_raw_data give the same result when re-run.
# `or []` also covers a key that is present but null.
receipts_with_items = [
    {**record, 'rewardsReceiptItemList': record.get('rewardsReceiptItemList') or []}
    for record in receipts_raw_data
]

# One row per purchased item, tagged with the id of the receipt it came from.
rewards_receipts_df = pd.json_normalize(
    receipts_with_items,
    record_path='rewardsReceiptItemList',
    meta=[['_id', '$oid']],
    errors='ignore',
    sep='_',
)
rewards_receipts_df = rewards_receipts_df.rename(columns={'_id_$oid': 'receipt_id'})

In [328]:
rewards_receipts_df.head()

Unnamed: 0,barcode,description,finalPrice,itemPrice,needsFetchReview,partnerItemId,preventTargetGapPoints,quantityPurchased,userFlaggedBarcode,userFlaggedNewItem,...,originalMetaBriteQuantityPurchased,pointsEarned,targetPrice,competitiveProduct,originalFinalPrice,originalMetaBriteItemPrice,deleted,priceAfterCoupon,metabriteCampaignId,receipt_id
0,4011.0,ITEM NOT FOUND,26.0,26.0,False,1,True,5.0,4011.0,True,...,,,,,,,,,,5ff1e1eb0a720f0523000575
1,4011.0,ITEM NOT FOUND,1.0,1.0,,1,,1.0,,,...,,,,,,,,,,5ff1e1bb0a720f052300056b
2,28400642255.0,DORITOS TORTILLA CHIP SPICY SWEET CHILI REDUCE...,10.0,10.0,True,2,True,1.0,28400642255.0,True,...,,,,,,,,,,5ff1e1bb0a720f052300056b
3,,,,,False,1,True,,4011.0,True,...,,,,,,,,,,5ff1e1f10a720f052300057a
4,4011.0,ITEM NOT FOUND,28.0,28.0,False,1,True,4.0,4011.0,True,...,,,,,,,,,,5ff1e1ee0a7214ada100056f


In [329]:
# Write each table to a CSV file in the current working directory,
# leaving out the row index.
for csv_name, table in (
    ('brands.csv', brands_df),
    ('users.csv', users_df),
    ('receipts.csv', receipts_df),
    ('rewards_receipts.csv', rewards_receipts_df),
):
    table.to_csv(csv_name, index=False)

In [330]:
#### EDA starts
receipts_df['rewardsReceiptItemList'].isna().sum()

440

In [331]:
#EDA

receipts_df[receipts_df['rewardsReceiptItemList'].isna()]

Unnamed: 0,bonusPointsEarned,bonusPointsEarnedReason,pointsEarned,purchasedItemCount,rewardsReceiptItemList,rewardsReceiptStatus,totalSpent,userId,receipt_id,created_date_time,scanned_date_time,finished_date_time,modify_date_time,pointsAwarded_date_time,purchased_date_time
71,,,,,,SUBMITTED,,5a43c08fe4b014fd6b6a0612,5ff475820a7214ada10005cf,2021-01-05 08:19:46.000,2021-01-05 08:19:46.000,NaT,2021-01-05 08:19:46.000,NaT,NaT
93,,,,,,SUBMITTED,,5a43c08fe4b014fd6b6a0612,5ff5ecb90a7214ada10005f9,2021-01-06 11:00:40.000,2021-01-06 11:00:40.000,NaT,2021-01-06 11:00:40.000,NaT,NaT
149,,,,,,SUBMITTED,,5ff7264e8f142f11dd189504,5ff726860a720f05230005ec,2021-01-07 09:19:34.000,2021-01-07 09:19:34.000,NaT,2021-01-07 09:19:34.000,NaT,NaT
175,,,,0.0,,REJECTED,0.00,5ff8da28b3348b11c9337ac6,5ff8da570a720f05c5000015,2021-01-08 16:19:03.000,2021-01-08 16:19:03.000,NaT,2021-01-08 16:19:04.000,NaT,NaT
212,,,,,,SUBMITTED,,59c124bae4b0299e55b0f330,5ffce8570a7214ad4e003e6f,2021-01-11 18:07:51.000,2021-01-11 18:07:51.000,NaT,2021-01-11 18:07:51.000,NaT,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1110,,,,,,SUBMITTED,,5fc961c3b8cfca11a077dd33,603c6adf0a720fde1000039a,2021-02-28 22:17:35.736,2021-02-28 22:17:35.736,NaT,2021-02-28 22:17:35.736,NaT,NaT
1111,,,,,,SUBMITTED,,5fc961c3b8cfca11a077dd33,603c9e6e0a720fde100003c7,2021-03-01 01:57:34.307,2021-03-01 01:57:34.307,NaT,2021-03-01 01:57:34.307,NaT,NaT
1115,,,,,,SUBMITTED,,5fc961c3b8cfca11a077dd33,603d0b710a720fde1000042a,2021-03-01 09:42:41.873,2021-03-01 09:42:41.873,NaT,2021-03-01 09:42:41.873,NaT,NaT
1116,,,,,,SUBMITTED,,5fc961c3b8cfca11a077dd33,603cf5290a720fde10000413,2021-03-01 08:07:37.664,2021-03-01 08:07:37.664,NaT,2021-03-01 08:07:37.664,NaT,NaT


In [332]:
rewards_receipts_df.columns

Index(['barcode', 'description', 'finalPrice', 'itemPrice', 'needsFetchReview',
       'partnerItemId', 'preventTargetGapPoints', 'quantityPurchased',
       'userFlaggedBarcode', 'userFlaggedNewItem', 'userFlaggedPrice',
       'userFlaggedQuantity', 'needsFetchReviewReason',
       'pointsNotAwardedReason', 'pointsPayerId', 'rewardsGroup',
       'rewardsProductPartnerId', 'userFlaggedDescription',
       'originalMetaBriteBarcode', 'originalMetaBriteDescription', 'brandCode',
       'competitorRewardsGroup', 'discountedItemPrice',
       'originalReceiptItemText', 'itemNumber',
       'originalMetaBriteQuantityPurchased', 'pointsEarned', 'targetPrice',
       'competitiveProduct', 'originalFinalPrice',
       'originalMetaBriteItemPrice', 'deleted', 'priceAfterCoupon',
       'metabriteCampaignId', 'receipt_id'],
      dtype='object')

In [333]:
rewards_receipts_df['barcode'].fillna(0)

0               4011
1               4011
2       028400642255
3                  0
4               4011
            ...     
6936      B07BRRLSVC
6937      B076FJ92M4
6938      B07BRRLSVC
6939      B076FJ92M4
6940      B07BRRLSVC
Name: barcode, Length: 6941, dtype: object

In [334]:
rewards_receipts_df.shape

(6941, 35)

In [335]:
rewards_receipts_df[(rewards_receipts_df['brandCode'] == 'KLEENEX') & (rewards_receipts_df['barcode'] != "036000391718")]

Unnamed: 0,barcode,description,finalPrice,itemPrice,needsFetchReview,partnerItemId,preventTargetGapPoints,quantityPurchased,userFlaggedBarcode,userFlaggedNewItem,...,originalMetaBriteQuantityPurchased,pointsEarned,targetPrice,competitiveProduct,originalFinalPrice,originalMetaBriteItemPrice,deleted,priceAfterCoupon,metabriteCampaignId,receipt_id
2862,36000119749,KLEENEX POCKET RCH IN WRP FACIAL TISSUE 2 PLY ...,5.64,5.64,,1202,,2.0,,,...,,56.4,,,,,,,KLEENEX TRUSTED CARE FACIAL TISSUES 1 - 59 COU...,60099c3c0a7214ad89000135


In [336]:
# Number of KLEENEX line items per description, most frequent first.
(
    rewards_receipts_df
    .loc[lambda items: items['brandCode'] == 'KLEENEX']
    .groupby('description')
    .size()
    .to_frame('count')
    .sort_values('count', ascending=False)
)

Unnamed: 0_level_0,count
description,Unnamed: 1_level_1
KLEENEX POP UP RECTANGLE BOX FACIAL TISSUE 2 PLY 8PK 160 CT,87
KLEENEX POCKET RCH IN WRP FACIAL TISSUE 2 PLY 8PK 15 CT,1


In [337]:
# Number of line items per brand code, most frequent first.
(
    rewards_receipts_df
    .groupby('brandCode')
    .size()
    .to_frame('count')
    .sort_values('count', ascending=False)
)

Unnamed: 0_level_0,count
brandCode,Unnamed: 1_level_1
HY-VEE,291
BEN AND JERRYS,180
PEPSI,93
KROGER,89
KLEENEX,88
...,...
GERM-X,1
PURINA ONE,1
GREY POUPON,1
GRIMMWAY FARMS,1


In [338]:
## Collect the distinct brand codes on each side, to check whether any code
## used on receipt items is absent from the brands table.

brandcodes_rewards_receipt = rewards_receipts_df['brandCode'].unique().tolist()
brandcodes_brands = brands_df['brandCode'].unique().tolist()

In [339]:
len(brandcodes_rewards_receipt)

228

In [340]:
# Brand codes seen on receipt items that have no entry in the brands table.
# (The variable keeps its historical name; the values are brand codes, not barcodes.)
# Missing codes (NaN) are dropped explicitly: NaN never compares equal to
# itself, so leaving its exclusion to the membership test is fragile, and a
# stray NaN would make sorting this list of strings fail.
known_brand_codes = set(brandcodes_brands)
missing_bar_codes = [
    code
    for code in brandcodes_rewards_receipt
    if pd.notna(code) and code not in known_brand_codes
]

In [341]:
sorted(missing_bar_codes)

['7UP',
 'ADVIL',
 'AMERICAN BEAUTY',
 'ARROWHEAD',
 'AZTECA',
 'BANZA',
 'BEAR CREEK COUNTRY KITCHENS',
 'BEN AND JERRYS',
 'BETTY CROCKER',
 'BIC',
 'BIGELOW',
 'BLUE DIAMOND',
 "BOAR'S HEAD",
 'BORDEN',
 'BOTA BOX',
 'BRAND',
 "BRASWELL'S",
 'BUNNY',
 "BUSH'S BEST",
 'C&H',
 'CADBURY',
 'CAL-ORGANIC FARMS',
 'CALIFIA FARMS',
 "CAMPBELL'S",
 'CARAMELLO',
 'CHEERIOS',
 'CHEESE',
 'CHEEZ-IT',
 'CHEX',
 'CHICKEN OF THE SEA',
 'CHIQUITA',
 'CINNAMON TOAST CRUNCH',
 'COKE',
 'COLEMAN NATURAL',
 "CONNIE'S PIZZA",
 'CREST 3D WHITE',
 'CRISPIX',
 'DANNON',
 'DARE',
 'DELI',
 'DIET COKE',
 'DIGIORNO',
 'DOLE',
 'DR PEPPER',
 'EDWARDS',
 "EGGLAND'S BEST",
 'EGGO',
 'EL MONTEREY',
 'ENERGIZER MAX',
 'ESSENTIAL EVERYDAY',
 'FAGE',
 "FAMOUS DAVE'S",
 "FLORIDA'S NATURAL",
 'FOLGERS',
 'FORTUNE YAKISOBA',
 'FRANZ',
 "FRENCH'S",
 'FRESH EXPRESS',
 'FRESH STEP',
 'FRONTERA',
 'GALLO FAMILY VINEYARDS',
 'GENERAL MILLS',
 'GERBER',
 'GERM-X',
 'GREEN GIANT',
 'GRIMMWAY FARMS',
 'HANOVER',
 'HARVEST SNA

In [342]:
##checking missing brandcodes

rewards_receipts_df['brandCode'].isna().sum()

4341

In [343]:
rewards_receipts_df.shape

(6941, 35)

In [344]:
rewards_receipts_df[rewards_receipts_df['brandCode'].isna()]

Unnamed: 0,barcode,description,finalPrice,itemPrice,needsFetchReview,partnerItemId,preventTargetGapPoints,quantityPurchased,userFlaggedBarcode,userFlaggedNewItem,...,originalMetaBriteQuantityPurchased,pointsEarned,targetPrice,competitiveProduct,originalFinalPrice,originalMetaBriteItemPrice,deleted,priceAfterCoupon,metabriteCampaignId,receipt_id
0,4011,ITEM NOT FOUND,26.00,26.00,False,1,True,5.0,4011,True,...,,,,,,,,,,5ff1e1eb0a720f0523000575
1,4011,ITEM NOT FOUND,1,1,,1,,1.0,,,...,,,,,,,,,,5ff1e1bb0a720f052300056b
2,028400642255,DORITOS TORTILLA CHIP SPICY SWEET CHILI REDUCE...,10.00,10.00,True,2,True,1.0,028400642255,True,...,,,,,,,,,,5ff1e1bb0a720f052300056b
3,,,,,False,1,True,,4011,True,...,,,,,,,,,,5ff1e1f10a720f052300057a
4,4011,ITEM NOT FOUND,28.00,28.00,False,1,True,4.0,4011,True,...,,,,,,,,,,5ff1e1ee0a7214ada100056f
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6936,B07BRRLSVC,thindust summer face mask - sun protection nec...,11.99,11.99,,1,,1.0,,,...,,,,,,,,11.99,,603cc2bc0a720fde100003e9
6937,B076FJ92M4,mueller austria hypergrind precision electric ...,22.97,22.97,,0,,1.0,,,...,,,,,,,,22.97,,603cc0630a720fde100003e6
6938,B07BRRLSVC,thindust summer face mask - sun protection nec...,11.99,11.99,,1,,1.0,,,...,,,,,,,,11.99,,603cc0630a720fde100003e6
6939,B076FJ92M4,mueller austria hypergrind precision electric ...,22.97,22.97,,0,,1.0,,,...,,,,,,,,22.97,,603ce7100a7217c72c000405


In [345]:
# For line items without a brand code: number of items per description,
# most frequent first.
(
    rewards_receipts_df
    .loc[lambda items: items['brandCode'].isna()]
    .groupby('description')
    .size()
    .to_frame('count')
    .sort_values('count', ascending=False)
)

Unnamed: 0_level_0,count
description,Unnamed: 1_level_1
ITEM NOT FOUND,173
KLARBRUNN 12PK 12 FL OZ,120
HUGGIES SIMPLY CLEAN PREMOISTENED WIPE FRAGRANCE FREE BAG 216 COUNT,92
MILLER LITE 24 PACK 12OZ CAN,90
COMP BOOK,73
...,...
CRAFIS,1
COTTONELLE ULTRA COMFORT CARE MEGA ROLL 2 PLY 284 COTTON TOILET TISSUE 12 CT,1
COS SARDINES WATER,1
"CORN NUTS Ranch - 1.7 oz. bag, 216 per case",1


#### every df together:


In [346]:
brands_df.shape

(1167, 9)

In [347]:
users_df.shape

(495, 7)

In [348]:
receipts_df.shape

(1119, 15)

In [349]:
rewards_receipts_df.shape

(6941, 35)