In [58]:
from pathlib import Path

import pandas as pd

from data_cleaning import DataCleaning
from data_extraction import DataExtraction
from database_utils import DatabaseConnector

# Project helpers used throughout the notebook: database inspection,
# raw-data extraction, and the cleaning routines.
database_connector = DatabaseConnector()
data_extractor = DataExtraction()
data_cleaner = DataCleaning()

# Create the CSV output folders up front. pandas' `to_csv` does not create
# missing parent directories, so without this a fresh checkout fails on the
# first save when the notebook is run top to bottom.
Path("original_dfs").mkdir(parents=True, exist_ok=True)
Path("cleaned_dfs").mkdir(parents=True, exist_ok=True)

# RDS Database Tables Cleaning

In [59]:
# Lists the tables available in the source database; the result is shown as the cell output.
database_connector.list_db_tables()

['legacy_store_details', 'legacy_users', 'orders_table']


## Legacy Store Details Cleaning

In [60]:
# Reads the "legacy_store_details" database table into "legacy_store_details_df"
# and saves an unmodified snapshot of it as a csv (the DataFrame index is written
# as well, because `to_csv` is called with its defaults).
legacy_store_details_df = data_extractor.read_rds_table("legacy_store_details")
legacy_store_details_df.to_csv("original_dfs/original_legacy_store_details_df.csv")

In [61]:
# Prints details of the "legacy_store_details_df":
# `.info()` prints the dtype and non-null count of every column, and the trailing
# `.columns` expression is displayed as the cell's output.
legacy_store_details_df.info()
legacy_store_details_df.columns

<class 'pandas.core.frame.DataFrame'>
Index: 451 entries, 1 to 450
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   address        451 non-null    object
 1   longitude      451 non-null    object
 2   lat            11 non-null     object
 3   locality       451 non-null    object
 4   store_code     451 non-null    object
 5   staff_numbers  451 non-null    object
 6   opening_date   451 non-null    object
 7   store_type     451 non-null    object
 8   latitude       450 non-null    object
 9   country_code   451 non-null    object
 10  continent      451 non-null    object
dtypes: object(11)
memory usage: 42.3+ KB


Index(['address', 'longitude', 'lat', 'locality', 'store_code',
       'staff_numbers', 'opening_date', 'store_type', 'latitude',
       'country_code', 'continent'],
      dtype='object')

In [62]:
# Cleans the legacy store details in a single pipeline.
# `.pipe(f, *args)` calls `f(frame, *args)`, so every helper receives the result
# of the previous step, in the same order as a sequence of reassignments would.
cleaned_legacy_store_details_df = (
    legacy_store_details_df
    .drop(columns="lat")  # `lat` is removed; `latitude` is the coordinate column that is kept
    .pipe(data_cleaner.clean_null)
    # Re-order the columns so that `latitude` sits right after `longitude`.
    [[
        'address', 'longitude', 'latitude', 'locality', 'store_code',
        'staff_numbers', 'opening_date', 'store_type', 'country_code', 'continent',
    ]]
    .pipe(data_cleaner.clean_rows_by_length_condition, "country_code", 3)
    .pipe(data_cleaner.clean_convert_date_column, "opening_date")
)

# Summarise the cleaned frame, then persist it as a csv.
cleaned_legacy_store_details_df.info()
cleaned_legacy_store_details_df.to_csv("cleaned_dfs/cleaned_legacy_store_details_df.csv")

<class 'pandas.core.frame.DataFrame'>
Index: 440 entries, 1 to 450
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   address        440 non-null    object        
 1   longitude      440 non-null    object        
 2   latitude       440 non-null    object        
 3   locality       440 non-null    object        
 4   store_code     440 non-null    object        
 5   staff_numbers  440 non-null    object        
 6   opening_date   440 non-null    datetime64[ns]
 7   store_type     440 non-null    object        
 8   country_code   440 non-null    object        
 9   continent      440 non-null    object        
dtypes: datetime64[ns](1), object(9)
memory usage: 37.8+ KB


## Legacy Users Cleaning

In [63]:
# Reads the "legacy_users" database table into "legacy_users_df"
# and saves an unmodified snapshot of it as a csv (the DataFrame index is written
# as well, because `to_csv` is called with its defaults).
legacy_users_df = data_extractor.read_rds_table("legacy_users")
legacy_users_df.to_csv("original_dfs/original_legacy_users_df.csv")

In [64]:
# Prints the dtype and non-null count of every column in "legacy_users_df"
legacy_users_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15320 entries, 0 to 1249
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   first_name     15320 non-null  object
 1   last_name      15320 non-null  object
 2   date_of_birth  15320 non-null  object
 3   company        15320 non-null  object
 4   email_address  15320 non-null  object
 5   address        15320 non-null  object
 6   country        15320 non-null  object
 7   country_code   15320 non-null  object
 8   phone_number   15320 non-null  object
 9   join_date      15320 non-null  object
 10  user_uuid      15320 non-null  object
dtypes: object(11)
memory usage: 1.4+ MB


In [65]:
# Cleans the legacy users data in a single pipeline.
# `.pipe(f, *args)` calls `f(frame, *args)`, so every helper receives the result
# of the previous step, in the same order as a sequence of reassignments would.
cleaned_legacy_users_df = (
    legacy_users_df
    .pipe(data_cleaner.clean_rows_by_length_condition, "country_code", 3)
    .pipe(data_cleaner.clean_null)
    .pipe(data_cleaner.clean_replace_value_in_column, "country_code", "GGB", "GB")
    .pipe(data_cleaner.clean_convert_date_column, "date_of_birth")
    .pipe(data_cleaner.clean_convert_date_column, "join_date")
)
# The extracted index is kept as-is; append `.reset_index(drop=True)` to the
# chain above if the rows should be renumbered.

cleaned_legacy_users_df.info()

# Overwrites the saved cleaned csv with the current cleaned DataFrame
cleaned_legacy_users_df.to_csv("cleaned_dfs/cleaned_legacy_users_df.csv")

<class 'pandas.core.frame.DataFrame'>
Index: 15284 entries, 0 to 1249
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   first_name     15284 non-null  object        
 1   last_name      15284 non-null  object        
 2   date_of_birth  15284 non-null  datetime64[ns]
 3   company        15284 non-null  object        
 4   email_address  15284 non-null  object        
 5   address        15284 non-null  object        
 6   country        15284 non-null  object        
 7   country_code   15284 non-null  object        
 8   phone_number   15284 non-null  object        
 9   join_date      15284 non-null  datetime64[ns]
 10  user_uuid      15284 non-null  object        
dtypes: datetime64[ns](2), object(9)
memory usage: 1.4+ MB


## Orders Table Cleaning

In [66]:
# Reads the "orders_table" database table into "orders_table_df"
# and saves an unmodified snapshot of it as a csv (the DataFrame index is written
# as well, because `to_csv` is called with its defaults).
orders_table_df = data_extractor.read_rds_table("orders_table")
orders_table_df.to_csv("original_dfs/original_orders_table_df.csv")

In [67]:
# Prints details of the "orders_table_df":
# `.info()` prints the dtype and non-null count of every column, and the trailing
# `.columns` expression is displayed as the cell's output.
orders_table_df.info()
orders_table_df.columns

<class 'pandas.core.frame.DataFrame'>
Index: 120123 entries, 0 to 118804
Data columns (total 10 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   level_0           120123 non-null  int64 
 1   date_uuid         120123 non-null  object
 2   first_name        15284 non-null   object
 3   last_name         15284 non-null   object
 4   user_uuid         120123 non-null  object
 5   card_number       120123 non-null  int64 
 6   store_code        120123 non-null  object
 7   product_code      120123 non-null  object
 8   1                 0 non-null       object
 9   product_quantity  120123 non-null  int64 
dtypes: int64(3), object(7)
memory usage: 10.1+ MB


Index(['level_0', 'date_uuid', 'first_name', 'last_name', 'user_uuid',
       'card_number', 'store_code', 'product_code', '1', 'product_quantity'],
      dtype='object')

In [68]:
# Cleans the orders table by dropping `level_0` and `1`
# (`1` has no non-null values according to the raw-table summary).
cleaned_orders_table_df = orders_table_df.drop(columns=["level_0", "1"])
# NOTE(review): `first_name` / `last_name` are populated on only 15284 of the
# 120123 rows in the summary below - confirm whether they should be dropped too.

cleaned_orders_table_df.info()

# Overwrites the saved cleaned csv with the current cleaned DataFrame
cleaned_orders_table_df.to_csv("cleaned_dfs/cleaned_orders_table_df.csv")

<class 'pandas.core.frame.DataFrame'>
Index: 120123 entries, 0 to 118804
Data columns (total 8 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   date_uuid         120123 non-null  object
 1   first_name        15284 non-null   object
 2   last_name         15284 non-null   object
 3   user_uuid         120123 non-null  object
 4   card_number       120123 non-null  int64 
 5   store_code        120123 non-null  object
 6   product_code      120123 non-null  object
 7   product_quantity  120123 non-null  int64 
dtypes: int64(2), object(6)
memory usage: 8.2+ MB


# Card Details Cleaning

In [69]:
# Retrieve the card details via the PDF extractor and save the raw extract as a csv

card_details_df = data_extractor.retrieve_pdf_data()
card_details_df.to_csv("original_dfs/original_card_details.csv")

In [72]:
# Prints details of the "card_details_df":
# `.info()` prints the dtype and non-null count of every column, and the trailing
# `.columns` expression is displayed as the cell's output.
card_details_df.info()
card_details_df.columns


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15309 entries, 0 to 15308
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   card_number             15309 non-null  object
 1   expiry_date             15309 non-null  object
 2   card_provider           15309 non-null  object
 3   date_payment_confirmed  15309 non-null  object
dtypes: object(4)
memory usage: 478.5+ KB


Index(['card_number', 'expiry_date', 'card_provider',
       'date_payment_confirmed'],
      dtype='object')