In [140]:
import pandas as pd
import numpy as np
import pyperclip as pc
import re

# Load the two raw JSON exports, resolved relative to the notebook's working directory.
# `bs`: branch/service transaction records; `ct`: customer transaction records.
bs = pd.read_json('./branch_service_transaction_info.json')
ct = pd.read_json('./customer_transaction_info.json')


# Data Profiling

In [141]:
# Show the column labels of the customer transaction table.
ct.columns

Index(['txn_id', 'avail_date', 'last_name', 'first_name', 'birthday'], dtype='object')

## Customer Transaction Columns

In [142]:
# Profile each customer-transaction column: dtype, row count,
# number of distinct values, and the distinct values themselves.
for col_name, column in ct.items():
    n_rows = len(column)
    # nunique() skips missing values by default, so this count can be
    # lower than the number of entries printed by unique() below.
    n_distinct = column.nunique()
    print(f'{col_name}: {column.dtype}')
    print(f'{n_rows} elements with {n_distinct} unique elements')
    print(column.unique())
    print('--------------------------------------------------')
    print()

txn_id: object
130653 elements with 62354 unique elements
['TXN-24546' 'TXN-14642' 'TXN-60295' ... 'TXN-18396' 'TXN-43076'
 'TXN-43065']
--------------------------------------------------

avail_date: object
130653 elements with 9489 unique elements
['2030-09-08' '2026-05-26' '2006-09-25' ... '2018-10-06' '2024-11-22'
 '2024-01-06']
--------------------------------------------------

last_name: object
130653 elements with 2345 unique elements
['ORTIZ' 'NIENOW' 'LESCH' ... 'Hamill' 'Stroman' 'Macejkovic']
--------------------------------------------------

first_name: object
130653 elements with 14740 unique elements
['EDUARDO' 'LEA' 'FLETA' ... 'Brandi' 'Jared' 'Santos']
--------------------------------------------------

birthday: object
130653 elements with 7668 unique elements
['1990-07-08' '2000-11-26' '1993-05-22' ... '2002-02-14' '1996-03-28'
 '1997-11-09']
--------------------------------------------------



## Branch Service Transaction Columns

In [143]:
# Profile each branch/service column: dtype, row count, number of
# distinct values, and the distinct values themselves.
for label in bs.columns:
    series = bs[label]
    # nunique() skips missing values by default, whereas unique() lists
    # them; that is why a column containing None/NaN reports one fewer
    # "unique elements" than the number of values printed underneath.
    print(
        f'{label}: {series.dtype}',
        f'{len(series)} elements with {series.nunique()} unique elements',
        series.unique(),
        '--------------------------------------------------',
        '',
        sep='\n',
    )

txn_id: object
130653 elements with 62354 unique elements
['TXN-24546' 'TXN-14642' 'TXN-60295' ... 'TXN-18396' 'TXN-43076'
 'TXN-43065']
--------------------------------------------------

branch_name: object
130653 elements with 9 unique elements
['MallOfAsia' 'Starmall' 'SmallMall' 'MayMall' 'FrankMall' 'Megamall'
 'RobinsonsMall' '' None 'N/A']
--------------------------------------------------

service: object
130653 elements with 7 unique elements
['Manicure' 'HairColor' 'FootSpa' 'Rebond' 'Haircut' 'NailColor'
 'Pedicure']
--------------------------------------------------

price: float64
130653 elements with 8 unique elements
[         nan   0.          30.1237897   66.12345678  77.987989
 100.12123    400.23123     55.2324      88.09393   ]
--------------------------------------------------



## Error Scanning

```
TODO: find a way to apply a function to each series in a dataframe and to each element in a series
```
- That way we can do regex checking or type checking to detect possibly incorrect values.
- We can use the `apply()` method of a `Series` or `DataFrame` object for this.


#### Query for invalid inputs in `ct.txn_id`

In [231]:
# Work on a copy so the txn_id validation below cannot modify `ct` itself.
test = ct.copy()
test

Unnamed: 0,txn_id,avail_date,last_name,first_name,birthday
0,TXN-24546,2030-09-08,ORTIZ,EDUARDO,1990-07-08
1,TXN-14642,2026-05-26,NIENOW,LEA,2000-11-26
2,TXN-60295,2006-09-25,LESCH,FLETA,1993-05-22
3,TXN-60295,2006-09-25,LESCH,FLETA,1993-05-22
4,TXN-60295,2006-09-25,LESCH,FLETA,1993-05-22
...,...,...,...,...,...
130648,TXN-65468,2012-06-16,Cummings,Henry,2005-08-14
130649,TXN-60822,2020-09-28,Feil,Jermey,2010-06-15
130650,TXN-60822,2020-09-28,Feil,Jermey,2010-06-15
130651,TXN-01784,2015-01-11,Schmidt,Emilie,1996-05-21


In [232]:
def validate(x):
    """Return ``x`` if it is a well-formed transaction id, otherwise ``None``.

    A well-formed id is the literal prefix ``TXN-`` followed by exactly five
    ASCII digits and nothing else (for example ``'TXN-24546'``).

    Parameters
    ----------
    x : object
        Candidate value. Anything that is not a ``str`` is rejected.

    Returns
    -------
    str or None
        The unchanged input when it is valid, ``None`` otherwise.
    """
    if not isinstance(x, str):
        return None
    # fullmatch anchors both ends of the string. re.match only anchors the
    # start, so it would accept over-long ids such as 'TXN-123456' or ids
    # followed by trailing junk.
    return x if re.fullmatch(r'TXN-[0-9]{5}', x) else None

# Replace every txn_id that fails validation with None so the invalid
# rows can be selected with isna() afterwards.
test['txn_id'] = test['txn_id'].apply(validate)

In [234]:
# Shape of the subset whose txn_id is missing after validation;
# the first entry is the number of such rows.
test.loc[test['txn_id'].isna()].shape

(0, 5)

#### Query for invalid inputs in `ct.avail_date`

In [235]:
# Start again from a fresh copy of `ct` for the avail_date checks in this section.
test = ct.copy()
test

Unnamed: 0,txn_id,avail_date,last_name,first_name,birthday
0,TXN-24546,2030-09-08,ORTIZ,EDUARDO,1990-07-08
1,TXN-14642,2026-05-26,NIENOW,LEA,2000-11-26
2,TXN-60295,2006-09-25,LESCH,FLETA,1993-05-22
3,TXN-60295,2006-09-25,LESCH,FLETA,1993-05-22
4,TXN-60295,2006-09-25,LESCH,FLETA,1993-05-22
...,...,...,...,...,...
130648,TXN-65468,2012-06-16,Cummings,Henry,2005-08-14
130649,TXN-60822,2020-09-28,Feil,Jermey,2010-06-15
130650,TXN-60822,2020-09-28,Feil,Jermey,2010-06-15
130651,TXN-01784,2015-01-11,Schmidt,Emilie,1996-05-21
