In [1]:
## Import libraries

import pandas as pd
import numpy as np
import os
import json
import jsonlines
import datetime

# Limit how much of a frame is rendered in cell outputs:
# at most 100 rows and 20 columns.
pd.options.display.max_rows = 100
pd.options.display.max_columns = 20

In [2]:
# Locate the data_sources folder (the home of the raw JSON files).
# The path is resolved relative to the kernel's current working directory.
root_dir = os.getcwd()
data_src_dir = os.path.join(root_dir, 'data_sources')

We have three data sources in JSON format:
Brands, Receipts and Users.

Each JSON file contains multiple JSON objects (dictionaries), one per line, rather than a single comma-separated JSON array. To address this, we use the jsonlines library, which parses such newline-delimited files record by record; the standard json package cannot load them in a single call.

#### Parsing Brands

In [3]:
## Load every record of brands.json into a list of dicts
brand_json_file = os.path.join(data_src_dir, 'brands.json')

with jsonlines.open(brand_json_file) as reader:
    brands_raw_data = [record for record in reader]

# Peek at the first two brand records
brands_raw_data[:2]

[{'_id': {'$oid': '601ac115be37ce2ead437551'},
  'barcode': '511111019862',
  'category': 'Baking',
  'categoryCode': 'BAKING',
  'cpg': {'$id': {'$oid': '601ac114be37ce2ead437550'}, '$ref': 'Cogs'},
  'name': 'test brand @1612366101024',
  'topBrand': False},
 {'_id': {'$oid': '601c5460be37ce2ead43755f'},
  'barcode': '511111519928',
  'brandCode': 'STARBUCKS',
  'category': 'Beverages',
  'categoryCode': 'BEVERAGES',
  'cpg': {'$id': {'$oid': '5332f5fbe4b03c9a25efd0ba'}, '$ref': 'Cogs'},
  'name': 'Starbucks',
  'topBrand': False}]

#### Parsing Receipts

In [4]:
## Load every record of receipts.json into a list of dicts
receipts_json_file = os.path.join(data_src_dir, 'receipts.json')

with jsonlines.open(receipts_json_file) as reader:
    receipts_raw_data = list(reader)

# Peek at the first receipt record
receipts_raw_data[:1]

[{'_id': {'$oid': '5ff1e1eb0a720f0523000575'},
  'bonusPointsEarned': 500,
  'bonusPointsEarnedReason': 'Receipt number 2 completed, bonus point schedule DEFAULT (5cefdcacf3693e0b50e83a36)',
  'createDate': {'$date': 1609687531000},
  'dateScanned': {'$date': 1609687531000},
  'finishedDate': {'$date': 1609687531000},
  'modifyDate': {'$date': 1609687536000},
  'pointsAwardedDate': {'$date': 1609687531000},
  'pointsEarned': '500.0',
  'purchaseDate': {'$date': 1609632000000},
  'purchasedItemCount': 5,
  'rewardsReceiptItemList': [{'barcode': '4011',
    'description': 'ITEM NOT FOUND',
    'finalPrice': '26.00',
    'itemPrice': '26.00',
    'needsFetchReview': False,
    'partnerItemId': '1',
    'preventTargetGapPoints': True,
    'quantityPurchased': 5,
    'userFlaggedBarcode': '4011',
    'userFlaggedNewItem': True,
    'userFlaggedPrice': '26.00',
    'userFlaggedQuantity': 5}],
  'rewardsReceiptStatus': 'FINISHED',
  'totalSpent': '26.00',
  'userId': '5ff1e1eacfcf6c399c274ae6'}]

#### Parsing Users

In [5]:
## Load every record of users.json into a list of dicts
users_json_file = os.path.join(data_src_dir, 'users.json')

with jsonlines.open(users_json_file) as reader:
    users_raw_data = [record for record in reader]

# Peek at the first two user records
users_raw_data[:2]

[{'_id': {'$oid': '5ff1e194b6a9d73a3a9f1052'},
  'active': True,
  'createdDate': {'$date': 1609687444800},
  'lastLogin': {'$date': 1609687537858},
  'role': 'consumer',
  'signUpSource': 'Email',
  'state': 'WI'},
 {'_id': {'$oid': '5ff1e194b6a9d73a3a9f1052'},
  'active': True,
  'createdDate': {'$date': 1609687444800},
  'lastLogin': {'$date': 1609687537858},
  'role': 'consumer',
  'signUpSource': 'Email',
  'state': 'WI'}]

#### Normalizing Users table

In [6]:
## Flatten the nested user records into a tabular frame.
## Nested keys are joined with '_' to form the column names.
users_df = pd.json_normalize(data=users_raw_data, sep='_')
users_df.head()

Unnamed: 0,active,role,signUpSource,state,id_$oid,createdDate_$date,lastLogin_$date
0,True,consumer,Email,WI,5ff1e194b6a9d73a3a9f1052,1609687444800,1609688000000.0
1,True,consumer,Email,WI,5ff1e194b6a9d73a3a9f1052,1609687444800,1609688000000.0
2,True,consumer,Email,WI,5ff1e194b6a9d73a3a9f1052,1609687444800,1609688000000.0
3,True,consumer,Email,WI,5ff1e1eacfcf6c399c274ae6,1609687530554,1609688000000.0
4,True,consumer,Email,WI,5ff1e194b6a9d73a3a9f1052,1609687444800,1609688000000.0


In [7]:
## Renaming the columns for better readability

# The outputs in this notebook show two spellings for the flattened Mongo '_id'
# key: 'id_$oid' (users, brands) and '_id_$oid' (receipts). Which one
# json_normalize(sep='_') produces here presumably depends on the pandas
# version, so both are mapped. rename() skips labels that are absent, so the
# extra entry is harmless when the other spelling is present.
users_df = users_df.rename(columns={
    'id_$oid': 'user_id',
    '_id_$oid': 'user_id',
    'createdDate_$date': 'created_date_time',
    'lastLogin_$date': 'lastLogin_date_time',
})
users_df.head(5)

Unnamed: 0,active,role,signUpSource,state,user_id,created_date_time,lastLogin_date_time
0,True,consumer,Email,WI,5ff1e194b6a9d73a3a9f1052,1609687444800,1609688000000.0
1,True,consumer,Email,WI,5ff1e194b6a9d73a3a9f1052,1609687444800,1609688000000.0
2,True,consumer,Email,WI,5ff1e194b6a9d73a3a9f1052,1609687444800,1609688000000.0
3,True,consumer,Email,WI,5ff1e1eacfcf6c399c274ae6,1609687530554,1609688000000.0
4,True,consumer,Email,WI,5ff1e194b6a9d73a3a9f1052,1609687444800,1609688000000.0


An observation can be made about the date format: it is a Unix timestamp in milliseconds, i.e. the number of milliseconds that have passed since Jan-01-1970 (UTC). The Unix timestamps therefore need to be converted to a readable date-time format (YYYY-MM-DD HH:MM:SS).

Some of the `lastLogin_date_time` values are missing (NaN). To keep the data type consistent within the column, I converted the NaN values to NaT (Not a Time), which is the marker pandas uses for missing values in datetime columns. This ensures that the entire column remains in a proper datetime format.

In [8]:
## data format conversion
def unix_to_date_convert(unix_ts):
    """Convert a Unix timestamp in milliseconds to a ``datetime.datetime``.

    Parameters
    ----------
    unix_ts : int or float
        Milliseconds elapsed since the Unix epoch. Missing or malformed
        values (``NaN``, ``None``, non-numeric objects) are tolerated.

    Returns
    -------
    datetime.datetime or pandas.NaT
        A naive datetime for valid input; ``pd.NaT`` when the value cannot
        be converted.

    Notes
    -----
    ``datetime.fromtimestamp`` is called without a ``tz`` argument, so the
    result is expressed in the local timezone of the machine running the
    notebook, not in UTC.
    """
    try:
        seconds = unix_ts / 1000
        return datetime.datetime.fromtimestamp(seconds)
    except (TypeError, ValueError, OverflowError, OSError):
        # Only genuine conversion failures become NaT. Anything else
        # (e.g. KeyboardInterrupt) propagates instead of being swallowed.
        return pd.NaT

# Convert both epoch-millisecond columns of the users frame to datetimes
for ts_column in ('created_date_time', 'lastLogin_date_time'):
    users_df[ts_column] = users_df[ts_column].apply(unix_to_date_convert)
users_df.head(5)

Unnamed: 0,active,role,signUpSource,state,user_id,created_date_time,lastLogin_date_time
0,True,consumer,Email,WI,5ff1e194b6a9d73a3a9f1052,2021-01-03 09:24:04.800,2021-01-03 09:25:37.858
1,True,consumer,Email,WI,5ff1e194b6a9d73a3a9f1052,2021-01-03 09:24:04.800,2021-01-03 09:25:37.858
2,True,consumer,Email,WI,5ff1e194b6a9d73a3a9f1052,2021-01-03 09:24:04.800,2021-01-03 09:25:37.858
3,True,consumer,Email,WI,5ff1e1eacfcf6c399c274ae6,2021-01-03 09:25:30.554,2021-01-03 09:25:30.597
4,True,consumer,Email,WI,5ff1e194b6a9d73a3a9f1052,2021-01-03 09:24:04.800,2021-01-03 09:25:37.858


In [9]:
## Understanding the data formats: column dtypes and non-null counts
users_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 495 entries, 0 to 494
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   active               495 non-null    bool          
 1   role                 495 non-null    object        
 2   signUpSource         447 non-null    object        
 3   state                439 non-null    object        
 4   user_id              495 non-null    object        
 5   created_date_time    495 non-null    datetime64[ns]
 6   lastLogin_date_time  433 non-null    datetime64[ns]
dtypes: bool(1), datetime64[ns](2), object(4)
memory usage: 23.8+ KB


#### Normalizing Brands data

In [10]:
## Flatten the nested brand records into a tabular frame.
## Nested keys are joined with '_' to form the column names.
brands_df = pd.json_normalize(data=brands_raw_data, sep='_')
brands_df.head(5)

Unnamed: 0,barcode,category,categoryCode,name,topBrand,id_$oid,cpg_$id_$oid,cpg_$ref,brandCode
0,511111019862,Baking,BAKING,test brand @1612366101024,False,601ac115be37ce2ead437551,601ac114be37ce2ead437550,Cogs,
1,511111519928,Beverages,BEVERAGES,Starbucks,False,601c5460be37ce2ead43755f,5332f5fbe4b03c9a25efd0ba,Cogs,STARBUCKS
2,511111819905,Baking,BAKING,test brand @1612366146176,False,601ac142be37ce2ead43755d,601ac142be37ce2ead437559,Cogs,TEST BRANDCODE @1612366146176
3,511111519874,Baking,BAKING,test brand @1612366146051,False,601ac142be37ce2ead43755a,601ac142be37ce2ead437559,Cogs,TEST BRANDCODE @1612366146051
4,511111319917,Candy & Sweets,CANDY_AND_SWEETS,test brand @1612366146827,False,601ac142be37ce2ead43755e,5332fa12e4b03c9a25efd1e7,Cogs,TEST BRANDCODE @1612366146827


In [11]:
## Renaming Brands columns for better readability

# The outputs in this notebook show two spellings for the flattened Mongo '_id'
# key: 'id_$oid' (users, brands) and '_id_$oid' (receipts). Which one
# json_normalize(sep='_') produces here presumably depends on the pandas
# version, so both are mapped. rename() skips labels that are absent, so the
# extra entry is harmless when the other spelling is present.
brands_df = brands_df.rename(columns={
    'id_$oid': 'brand_id',
    '_id_$oid': 'brand_id',
    'cpg_$id_$oid': 'cpg_id',
    'cpg_$ref': 'cpg_ref',
})
brands_df.head(5)

Unnamed: 0,barcode,category,categoryCode,name,topBrand,brand_id,cpg_id,cpg_ref,brandCode
0,511111019862,Baking,BAKING,test brand @1612366101024,False,601ac115be37ce2ead437551,601ac114be37ce2ead437550,Cogs,
1,511111519928,Beverages,BEVERAGES,Starbucks,False,601c5460be37ce2ead43755f,5332f5fbe4b03c9a25efd0ba,Cogs,STARBUCKS
2,511111819905,Baking,BAKING,test brand @1612366146176,False,601ac142be37ce2ead43755d,601ac142be37ce2ead437559,Cogs,TEST BRANDCODE @1612366146176
3,511111519874,Baking,BAKING,test brand @1612366146051,False,601ac142be37ce2ead43755a,601ac142be37ce2ead437559,Cogs,TEST BRANDCODE @1612366146051
4,511111319917,Candy & Sweets,CANDY_AND_SWEETS,test brand @1612366146827,False,601ac142be37ce2ead43755e,5332fa12e4b03c9a25efd1e7,Cogs,TEST BRANDCODE @1612366146827


#### Normalizing receipts

In [12]:
## Flatten the receipt records, expanding only the first level of nesting (max_level=1).
## Nested keys are joined with '_' to form the column names.
receipts_df = pd.json_normalize(data=receipts_raw_data, max_level=1, sep='_')
receipts_df.head(5)

Unnamed: 0,bonusPointsEarned,bonusPointsEarnedReason,pointsEarned,purchasedItemCount,rewardsReceiptItemList,rewardsReceiptStatus,totalSpent,userId,_id_$oid,createDate_$date,dateScanned_$date,finishedDate_$date,modifyDate_$date,pointsAwardedDate_$date,purchaseDate_$date
0,500.0,"Receipt number 2 completed, bonus point schedu...",500.0,5.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,26.0,5ff1e1eacfcf6c399c274ae6,5ff1e1eb0a720f0523000575,1609687531000,1609687531000,1609688000000.0,1609687536000,1609688000000.0,1609632000000.0
1,150.0,"Receipt number 5 completed, bonus point schedu...",150.0,2.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,11.0,5ff1e194b6a9d73a3a9f1052,5ff1e1bb0a720f052300056b,1609687483000,1609687483000,1609687000000.0,1609687488000,1609687000000.0,1609601000000.0
2,5.0,All-receipts receipt bonus,5.0,1.0,"[{'needsFetchReview': False, 'partnerItemId': ...",REJECTED,10.0,5ff1e1f1cfcf6c399c274b0b,5ff1e1f10a720f052300057a,1609687537000,1609687537000,,1609687542000,,1609632000000.0
3,5.0,All-receipts receipt bonus,5.0,4.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,28.0,5ff1e1eacfcf6c399c274ae6,5ff1e1ee0a7214ada100056f,1609687534000,1609687534000,1609688000000.0,1609687539000,1609688000000.0,1609632000000.0
4,5.0,All-receipts receipt bonus,5.0,2.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,1.0,5ff1e194b6a9d73a3a9f1052,5ff1e1d20a7214ada1000561,1609687506000,1609687506000,1609688000000.0,1609687511000,1609688000000.0,1609601000000.0


In [13]:
## Give the flattened receipt columns readable names

receipt_column_names = {
    '_id_$oid': 'receipt_id',
    'createDate_$date': 'created_date_time',
    'dateScanned_$date': 'scanned_date_time',
    'finishedDate_$date': 'finished_date_time',
    'modifyDate_$date': 'modify_date_time',
    'pointsAwardedDate_$date': 'pointsAwarded_date_time',
    'purchaseDate_$date': 'purchased_date_time',
}
receipts_df = receipts_df.rename(columns=receipt_column_names)
receipts_df.head()

Unnamed: 0,bonusPointsEarned,bonusPointsEarnedReason,pointsEarned,purchasedItemCount,rewardsReceiptItemList,rewardsReceiptStatus,totalSpent,userId,receipt_id,created_date_time,scanned_date_time,finished_date_time,modify_date_time,pointsAwarded_date_time,purchased_date_time
0,500.0,"Receipt number 2 completed, bonus point schedu...",500.0,5.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,26.0,5ff1e1eacfcf6c399c274ae6,5ff1e1eb0a720f0523000575,1609687531000,1609687531000,1609688000000.0,1609687536000,1609688000000.0,1609632000000.0
1,150.0,"Receipt number 5 completed, bonus point schedu...",150.0,2.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,11.0,5ff1e194b6a9d73a3a9f1052,5ff1e1bb0a720f052300056b,1609687483000,1609687483000,1609687000000.0,1609687488000,1609687000000.0,1609601000000.0
2,5.0,All-receipts receipt bonus,5.0,1.0,"[{'needsFetchReview': False, 'partnerItemId': ...",REJECTED,10.0,5ff1e1f1cfcf6c399c274b0b,5ff1e1f10a720f052300057a,1609687537000,1609687537000,,1609687542000,,1609632000000.0
3,5.0,All-receipts receipt bonus,5.0,4.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,28.0,5ff1e1eacfcf6c399c274ae6,5ff1e1ee0a7214ada100056f,1609687534000,1609687534000,1609688000000.0,1609687539000,1609688000000.0,1609632000000.0
4,5.0,All-receipts receipt bonus,5.0,2.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,1.0,5ff1e194b6a9d73a3a9f1052,5ff1e1d20a7214ada1000561,1609687506000,1609687506000,1609688000000.0,1609687511000,1609688000000.0,1609601000000.0


In [14]:
## Convert every '*_date_time' column of the receipts frame from epoch milliseconds to datetimes

date_time_columns = [name for name in receipts_df.columns if name.endswith('_date_time')]
for name in date_time_columns:
    receipts_df[name] = receipts_df[name].apply(unix_to_date_convert)
receipts_df.head(5)

Unnamed: 0,bonusPointsEarned,bonusPointsEarnedReason,pointsEarned,purchasedItemCount,rewardsReceiptItemList,rewardsReceiptStatus,totalSpent,userId,receipt_id,created_date_time,scanned_date_time,finished_date_time,modify_date_time,pointsAwarded_date_time,purchased_date_time
0,500.0,"Receipt number 2 completed, bonus point schedu...",500.0,5.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,26.0,5ff1e1eacfcf6c399c274ae6,5ff1e1eb0a720f0523000575,2021-01-03 09:25:31,2021-01-03 09:25:31,2021-01-03 09:25:31,2021-01-03 09:25:36,2021-01-03 09:25:31,2021-01-02 18:00:00
1,150.0,"Receipt number 5 completed, bonus point schedu...",150.0,2.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,11.0,5ff1e194b6a9d73a3a9f1052,5ff1e1bb0a720f052300056b,2021-01-03 09:24:43,2021-01-03 09:24:43,2021-01-03 09:24:43,2021-01-03 09:24:48,2021-01-03 09:24:43,2021-01-02 09:24:43
2,5.0,All-receipts receipt bonus,5.0,1.0,"[{'needsFetchReview': False, 'partnerItemId': ...",REJECTED,10.0,5ff1e1f1cfcf6c399c274b0b,5ff1e1f10a720f052300057a,2021-01-03 09:25:37,2021-01-03 09:25:37,NaT,2021-01-03 09:25:42,NaT,2021-01-02 18:00:00
3,5.0,All-receipts receipt bonus,5.0,4.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,28.0,5ff1e1eacfcf6c399c274ae6,5ff1e1ee0a7214ada100056f,2021-01-03 09:25:34,2021-01-03 09:25:34,2021-01-03 09:25:34,2021-01-03 09:25:39,2021-01-03 09:25:34,2021-01-02 18:00:00
4,5.0,All-receipts receipt bonus,5.0,2.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,1.0,5ff1e194b6a9d73a3a9f1052,5ff1e1d20a7214ada1000561,2021-01-03 09:25:06,2021-01-03 09:25:06,2021-01-03 09:25:11,2021-01-03 09:25:11,2021-01-03 09:25:06,2021-01-02 09:25:06


In [15]:
## Build one row per purchased item, keyed by the parent receipt id

# Receipts without an item list get an empty one, so that every record
# carries the 'rewardsReceiptItemList' key used as record_path below.
for receipt in receipts_raw_data:
    receipt.setdefault('rewardsReceiptItemList', [])

rewards_receipts_df = pd.json_normalize(
    receipts_raw_data,
    record_path='rewardsReceiptItemList',
    meta=[['_id', '$oid']],
    errors='ignore',
    sep='_',
).rename(columns={'_id_$oid': 'receipt_id'})

In [16]:
## Write each dataframe to a CSV file in the working directory (index column omitted)

csv_outputs = (
    ('brands.csv', brands_df),
    ('users.csv', users_df),
    ('receipts.csv', receipts_df),
    ('rewards_receipts.csv', rewards_receipts_df),
)
for csv_name, frame in csv_outputs:
    frame.to_csv(csv_name, index=False)

#### The next step is to perform data quality checks (‘DateQualityIssues_Loading_Data.ipynb’) on the above dataframes before loading the data onto the MySQL server.