This file has utility classes for exploring our data to assist in the development and data-cleaning processes. Beyond basic type checking and exploring tables on the RDS database, the write_raw option is extremely valuable: it writes the data to an SQL database, where queries can then be used to explore it.

In [70]:
import pandas as pd
from data_cleaning import DataCleaning
from database_utils import DatabaseConnector
from data_extraction import DataExtractor

def inspect_source_table_names():
    """Return the table listing reported for the source database.

    Uses the module-level ``db_connector``: reads its credentials, builds
    an engine from them, and returns whatever ``list_db_tables`` gives back
    for that engine.
    """
    credentials = db_connector.read_db_creds()
    engine = db_connector.init_db_engine(credentials)
    return db_connector.list_db_tables(engine)

def inspect_source_table_data(table_name):
    """Read one source table and print an overview of its contents.

    Args:
        table_name: Name of the table to read through the module-level
            ``data_extractor`` (which is handed ``db_connector``).
    """
    table_frame = data_extractor.read_rds_table(
        db_connector,
        table_name,
    )
    # Delegate all printing to the shared overview helper.
    explore_data_frame(table_frame)

def explore_data_frame(data_frame):
    """Print a quick overview of a DataFrame for manual inspection.

    Emits, in order: the first rows (``head``), the structural summary
    (``info``), the descriptive statistics (``describe``) and the column
    index.

    Args:
        data_frame: The pandas DataFrame to summarise.
    """
    print(data_frame.head())
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() would append a stray "None" line.
    data_frame.info()
    print(data_frame.describe())
    print(data_frame.columns)

def explore_numeric_column(column):
    """Print every cell of ``column`` that ``int()`` cannot convert.

    ``None`` cells are skipped. Every other cell is passed to ``int()``;
    cells for which the conversion fails are printed so they can be
    reviewed by hand.

    Args:
        column: Iterable of cell values (for example a DataFrame column).
    """
    for cell_value in column:
        if cell_value is None:
            continue
        try:
            int(cell_value)
        except (TypeError, ValueError, OverflowError):
            # Besides malformed strings (ValueError), unsupported types
            # raise TypeError and float infinity raises OverflowError;
            # report those too instead of aborting the scan.
            print(cell_value)

def explore_date_column(column):
    """Print every cell of ``column`` that ``pd.to_datetime`` rejects.

    ``None`` cells are skipped. Every other cell is passed to
    ``pd.to_datetime`` on its own; cells for which the conversion fails
    are printed so they can be reviewed by hand.

    Args:
        column: Iterable of cell values (for example a DataFrame column).
    """
    for cell_value in column:
        if cell_value is None:
            continue
        try:
            pd.to_datetime(cell_value)
        except (TypeError, ValueError, OverflowError):
            # Unparseable strings raise ValueError; objects of a type that
            # cannot be converted raise TypeError. Report either kind
            # instead of aborting the scan part-way through the column.
            print(cell_value)

# Shared helper instances. The inspection functions above read
# data_extractor and db_connector as module-level globals, so these must be
# created before any of those functions is called.
data_extractor = DataExtractor()
db_connector = DatabaseConnector()
data_cleaner = DataCleaning()
# API settings loaded once through the extractor. Each value below is looked
# up by key, so a missing entry fails here rather than at first use.
api_config = data_extractor.read_api_creds()
# Mapping holding the stores API key under 'x-api-key'
# (presumably sent as a request header -- confirm against the extractor).
api_header_dict = {'x-api-key': api_config['stores_api_key']}
number_stores_url = api_config['number_stores_url']
store_data_template = api_config['store_data_template']
# Passed to data_extractor.retrieve_pdf_data() in the card-details cell below.
card_data_url = api_config['card_data_url']
products_csv_uri = api_config['products_csv_uri']
date_details_url = api_config['date_details_url']

table inspection

In [51]:
# Fetch the table listing of the source database and display it.
table_names = inspect_source_table_names()
print(table_names)


['legacy_store_details', 'legacy_users', 'orders_table']


In [52]:
# Inspect the second listed table ('legacy_users' in the listing printed
# above; this relies on the order of the returned list).
inspect_source_table_data(table_names[1])

   index first_name last_name date_of_birth                    company  \
0      0   Sigfried     Noack    1990-09-30         Heydrich Junitz KG   
1      1        Guy     Allen    1940-12-01                    Fox Ltd   
2      2      Harry  Lawrence    1995-08-02  Johnson, Jones and Harris   
3      3     Darren   Hussain    1972-09-23                Wheeler LLC   
4      4      Garry     Stone    1952-12-20                 Warner Inc   

                  email_address  \
0             rudi79@winkler.de   
1  rhodesclifford@henderson.com   
2  glen98@bryant-marshall.co.uk   
3    daniellebryan@thompson.org   
4       billy14@long-warren.com   

                                             address         country  \
0                       Zimmerstr. 1/0\n59015 Gießen         Germany   
1  Studio 22a\nLynne terrace\nMcCarthymouth\nTF0 9GH  United Kingdom   
2                 92 Ann drive\nJoanborough\nSK0 6LR  United Kingdom   
3             19 Robinson meadow\nNew Tracy\nW22 2QG  Un

PDF card details inspection

In [57]:
# Retrieve the card details from the PDF at card_data_url; the result is
# indexed by column name in the cell below.
card_data_frame = data_extractor.retrieve_pdf_data(card_data_url)

find and fix card payment issues

In [59]:
# Report payment-date cells that fail to parse before any cleaning.
explore_date_column(card_data_frame['date_payment_confirmed'])
# errors='coerce' replaces unparseable values with NaT instead of raising.
card_data_frame['date_payment_confirmed'] = pd.to_datetime(card_data_frame['date_payment_confirmed'], errors='coerce')
# Separator between the "before" and "after" reports.
print("---")
# Re-run the check: nothing should be reported once the column is coerced.
explore_date_column(card_data_frame['date_payment_confirmed'])


---
