In [1]:
import pandas as pd
import numpy as np
import pyperclip as pc
import re

bs = pd.read_json('./branch_service_transaction_info.json')
ct = pd.read_json('./customer_transaction_info.json')


# Data Profiling

In [2]:
ct.columns

Index(['txn_id', 'avail_date', 'last_name', 'first_name', 'birthday'], dtype='object')

## Customer Transaction Columns

In [3]:
for i in ct.columns:
    x = ct[i]
    print(f'{i}: {x.dtype}')
    print(f'{x.shape[0]} elements with {x.nunique()} unique elements')
    print(x.unique())
    print('--------------------------------------------------')
    print()

txn_id: object
130653 elements with 62354 unique elements
['TXN-24546' 'TXN-14642' 'TXN-60295' ... 'TXN-18396' 'TXN-43076'
 'TXN-43065']
--------------------------------------------------

avail_date: object
130653 elements with 9489 unique elements
['2030-09-08' '2026-05-26' '2006-09-25' ... '2018-10-06' '2024-11-22'
 '2024-01-06']
--------------------------------------------------

last_name: object
130653 elements with 2345 unique elements
['ORTIZ' 'NIENOW' 'LESCH' ... 'Hamill' 'Stroman' 'Macejkovic']
--------------------------------------------------

first_name: object
130653 elements with 14740 unique elements
['EDUARDO' 'LEA' 'FLETA' ... 'Brandi' 'Jared' 'Santos']
--------------------------------------------------

birthday: object
130653 elements with 7668 unique elements
['1990-07-08' '2000-11-26' '1993-05-22' ... '2002-02-14' '1996-03-28'
 '1997-11-09']
--------------------------------------------------



## Error Scanning

```
TODO: find a way to apply a function to each series in a dataframe and to each element in a series
```
- That way we can actually do regex checking or type checking for possible incorrect
- we can actually use the `apply()` method for `S`erie or ``DataFrame` object


#### Profiling inputs in `ct.txn_id`

In [4]:
test = ct.copy()

In [5]:
test

Unnamed: 0,txn_id,avail_date,last_name,first_name,birthday
0,TXN-24546,2030-09-08,ORTIZ,EDUARDO,1990-07-08
1,TXN-14642,2026-05-26,NIENOW,LEA,2000-11-26
2,TXN-60295,2006-09-25,LESCH,FLETA,1993-05-22
3,TXN-60295,2006-09-25,LESCH,FLETA,1993-05-22
4,TXN-60295,2006-09-25,LESCH,FLETA,1993-05-22
...,...,...,...,...,...
130648,TXN-65468,2012-06-16,Cummings,Henry,2005-08-14
130649,TXN-60822,2020-09-28,Feil,Jermey,2010-06-15
130650,TXN-60822,2020-09-28,Feil,Jermey,2010-06-15
130651,TXN-01784,2015-01-11,Schmidt,Emilie,1996-05-21


In [6]:
nulls = test[test.txn_id.isna()].shape[0]
print(f'Amount of null values in ct.txn_id: {nulls}')

Amount of null values in ct.txn_id: 0


In [7]:
if nulls == 0:
    valids = test[test.txn_id.str.contains('TXN-[0-9]{5}')].shape[0]
    print(f'Valid token IDs: {test.shape[0]}/{valids}')

Valid token IDs: 130653/130653


#### Profiling inputs in `ct.avail_date`

In [8]:
test = ct.copy()
test

Unnamed: 0,txn_id,avail_date,last_name,first_name,birthday
0,TXN-24546,2030-09-08,ORTIZ,EDUARDO,1990-07-08
1,TXN-14642,2026-05-26,NIENOW,LEA,2000-11-26
2,TXN-60295,2006-09-25,LESCH,FLETA,1993-05-22
3,TXN-60295,2006-09-25,LESCH,FLETA,1993-05-22
4,TXN-60295,2006-09-25,LESCH,FLETA,1993-05-22
...,...,...,...,...,...
130648,TXN-65468,2012-06-16,Cummings,Henry,2005-08-14
130649,TXN-60822,2020-09-28,Feil,Jermey,2010-06-15
130650,TXN-60822,2020-09-28,Feil,Jermey,2010-06-15
130651,TXN-01784,2015-01-11,Schmidt,Emilie,1996-05-21


In [9]:
nulls = test[test.avail_date.isna()].shape[0]
print(f'Null values in ct.avail_date: {nulls}')

Null values in ct.avail_date: 0


In [10]:
if nulls == 0:
    valids = test[test.avail_date.str.fullmatch('[0-9]{4}-(0[0-9]|1[0-2])-([0-2][0-9]|[3][0-1])')].shape[0]
    print(f'Valid date format: {test.shape[0]}/{valids}')

Valid date format: 130653/130653


#### Profiling inputs in `ct.last_name`

In [11]:
test = ct.copy()

In [12]:
test.last_name.shape

(130653,)

In [13]:
nulls = test[test.last_name.isna()].shape[0]
print(f'Null values in ct.last_name: {nulls}')

Null values in ct.last_name: 0


In [14]:
test[test.last_name.str.contains('[^a-zA-Z]+')]

Unnamed: 0,txn_id,avail_date,last_name,first_name,birthday
40000,TXN-49846,2026-06-24,Harvey,Jaydon,2009-03-04
40001,TXN-49846,2026-06-24,Harvey,Jaydon,2009-03-04
40002,TXN-09756,2006-06-21,Murray,Gisselle,2006-03-10
40003,TXN-09756,2006-06-21,Murray,Gisselle,2006-03-10
40004,TXN-09756,2006-06-21,Murray,Gisselle,2006-03-10
...,...,...,...,...,...
79995,TXN-54391,2028-10-31,"Johns,,,,,,,,",Turner........,2010-04-16
79996,TXN-54391,2028-10-31,"Johns,,,,,,,,",Turner........,2010-04-16
79997,TXN-54391,2028-10-31,"Johns,,,,,,,,",Turner........,2010-04-16
79998,TXN-54391,2028-10-31,"Johns,,,,,,,,",Turner........,2010-04-16


#### Profiling inputs in `ct.first_name`

In [15]:
test = ct.copy()

In [16]:
test.first_name.shape

(130653,)

In [17]:
nulls = test[test.first_name.isna()].shape[0]
print(f'Null values in ct.first_name: {nulls}')

Null values in ct.first_name: 0


In [18]:
test[test.first_name.str.contains('[a-zA-Z]+')]

Unnamed: 0,txn_id,avail_date,last_name,first_name,birthday
0,TXN-24546,2030-09-08,ORTIZ,EDUARDO,1990-07-08
1,TXN-14642,2026-05-26,NIENOW,LEA,2000-11-26
2,TXN-60295,2006-09-25,LESCH,FLETA,1993-05-22
3,TXN-60295,2006-09-25,LESCH,FLETA,1993-05-22
4,TXN-60295,2006-09-25,LESCH,FLETA,1993-05-22
...,...,...,...,...,...
130648,TXN-65468,2012-06-16,Cummings,Henry,2005-08-14
130649,TXN-60822,2020-09-28,Feil,Jermey,2010-06-15
130650,TXN-60822,2020-09-28,Feil,Jermey,2010-06-15
130651,TXN-01784,2015-01-11,Schmidt,Emilie,1996-05-21


#### Profiling inputs in `ct.birthday`

In [19]:
test = ct.copy()

In [20]:
nulls = test[test.birthday.isna()].shape[0]
print(f'Null values in ct.birthday: {nulls}')

Null values in ct.birthday: 0


In [21]:
if nulls == 0:
    valids = test[test.birthday.str.fullmatch('[0-9]{4}-(0[0-9]|1[0-2])-([0-2][0-9]|[3][0-1])')].shape[0]
    print(f'Valid date format: {valids}/{test.shape[0]}')

Valid date format: 130653/130653


In [22]:
test.birthday = pd.to_datetime(test.birthday)

In [23]:
print(f'invalid birthdays: {test[test.birthday > pd.Timestamp.today()].shape[0]}')

invalid birthdays: 0


## Branch Service Transaction Columns

In [24]:
for i in bs.columns:
    x = bs[i]
    print(f'{i}: {x.dtype}')
    print(f'{x.shape[0]} elements with {x.nunique()} unique elements')
    print(x.unique())
    print('--------------------------------------------------')
    print()

txn_id: object
130653 elements with 62354 unique elements
['TXN-24546' 'TXN-14642' 'TXN-60295' ... 'TXN-18396' 'TXN-43076'
 'TXN-43065']
--------------------------------------------------

branch_name: object
130653 elements with 9 unique elements
['MallOfAsia' 'Starmall' 'SmallMall' 'MayMall' 'FrankMall' 'Megamall'
 'RobinsonsMall' '' None 'N/A']
--------------------------------------------------

service: object
130653 elements with 7 unique elements
['Manicure' 'HairColor' 'FootSpa' 'Rebond' 'Haircut' 'NailColor'
 'Pedicure']
--------------------------------------------------

price: float64
130653 elements with 8 unique elements
[         nan   0.          30.1237897   66.12345678  77.987989
 100.12123    400.23123     55.2324      88.09393   ]
--------------------------------------------------



#### Profiling `bs.txn_id`

In [25]:
test = bs.copy()

In [26]:
test

Unnamed: 0,txn_id,branch_name,service,price
0,TXN-24546,MallOfAsia,Manicure,
1,TXN-14642,Starmall,HairColor,
2,TXN-60295,SmallMall,FootSpa,
3,TXN-60295,Starmall,FootSpa,
4,TXN-60295,MayMall,FootSpa,
...,...,...,...,...
130648,TXN-65468,MallOfAsia,Haircut,66.123457
130649,TXN-60822,MallOfAsia,Rebond,400.231230
130650,TXN-60822,FrankMall,Rebond,400.231230
130651,TXN-01784,RobinsonsMall,HairColor,88.093930


In [27]:
nulls = test[test.txn_id.isna()].shape[0]
print(f'Amount of null values in bs.txn_id: {nulls}')

Amount of null values in bs.txn_id: 0


In [28]:
if nulls == 0:
    valids = test[test.txn_id.str.fullmatch('TXN-[0-9]{5}')].shape[0]
    print(f'Valid token IDs: {valids}/{test.shape[0]}')

Valid token IDs: 130653/130653
