In [1]:
from database_utils import DatabaseConnector
from data_extraction import DataExtraction
from data_cleaning import DataCleaning
import pandas as pd

# Shared helper instances reused by every cell below.
# NOTE(review): the constructors take no arguments here; whether they open a
# connection or read credentials at this point is not visible from this notebook.
database_connector = DatabaseConnector()
data_extractor = DataExtraction()
data_cleaner = DataCleaning()

# User Data Cleaning

In [None]:
# Shows the return value of list_db_tables() as the cell output
# (presumably the table names available in the source database - confirm in database_utils).
database_connector.list_db_tables()

## Legacy Store Details Cleaning

In [None]:
# Reads the "legacy_store_details" table as "legacy_store_details_df" and saves the
# untouched frame as a csv.
# Note: to_csv writes the DataFrame index as the first csv column by default, and the
# relative "original_dfs/" directory has to exist before this cell runs.
legacy_store_details_df = data_extractor.read_rds_table("legacy_store_details")
legacy_store_details_df.to_csv("original_dfs/original_legacy_store_details_df.csv")

In [None]:
# Prints a summary of "legacy_store_details_df" via .info(); the bare expression on the
# last line makes the column index the displayed output of the cell.
legacy_store_details_df.info()
legacy_store_details_df.columns

In [None]:
# Cleans "legacy_store_details_df" and writes the cleaned frame to csv.
# Every DataCleaning helper is assumed to hand back a DataFrame - confirm in data_cleaning.

# Stage 1: discard the "lat" column, then run the null-cleaning helper.
cleaned_legacy_store_details_df = (
    legacy_store_details_df
    .drop(columns="lat")
    .pipe(data_cleaner.clean_null)
)

# Stage 2: keep only the listed columns (in this order), apply the length-condition
# helper to "country_code" with limit 3, and convert "opening_date" with the date helper.
cleaned_legacy_store_details_df = (
    cleaned_legacy_store_details_df[[
        'address', 'longitude', 'latitude', 'locality', 'store_code',
        'staff_numbers', 'opening_date', 'store_type', 'country_code', 'continent',
    ]]
    .pipe(data_cleaner.clean_rows_by_length_condition, "country_code", 3)
    .pipe(data_cleaner.clean_convert_date_column, "opening_date")
)

cleaned_legacy_store_details_df.info()
cleaned_legacy_store_details_df.to_csv("cleaned_dfs/cleaned_legacy_store_details_df.csv")

## Legacy Users Cleaning

In [None]:
# Reads the "legacy_users" table as "legacy_users_df" and saves the untouched frame as a csv.
# Note: to_csv writes the DataFrame index as the first csv column by default, and the
# relative "original_dfs/" directory has to exist before this cell runs.
legacy_users_df = data_extractor.read_rds_table("legacy_users")
legacy_users_df.to_csv("original_dfs/original_legacy_users_df.csv")

In [None]:
# Prints a summary of "legacy_users_df" (columns, non-null counts, dtypes) before cleaning
legacy_users_df.info()

In [None]:
# Cleans "legacy_users_df" and writes the cleaned frame to csv.
# Every DataCleaning helper is assumed to hand back a DataFrame - confirm in data_cleaning.
# Steps, in order: length-condition helper on "country_code" (limit 3), null cleaning,
# "GGB" -> "GB" replacement in "country_code", then date conversion of both date columns.
# The index is left as-is afterwards (no reset_index is applied in this cell).

cleaned_legacy_users_df = (
    legacy_users_df
    .pipe(data_cleaner.clean_rows_by_length_condition, "country_code", 3)
    .pipe(data_cleaner.clean_null)
    .pipe(data_cleaner.clean_replace_value_in_column, "country_code", "GGB", "GB")
    .pipe(data_cleaner.clean_convert_date_column, "date_of_birth")
    .pipe(data_cleaner.clean_convert_date_column, "join_date")
)

cleaned_legacy_users_df.info()

# Overwrites the cleaned csv with the current cleaned frame
cleaned_legacy_users_df.to_csv("cleaned_dfs/cleaned_legacy_users_df.csv")

## Orders Table Cleaning

In [None]:
# Reads the "orders_table" table as "orders_table_df" and saves the untouched frame as a csv.
# Note: to_csv writes the DataFrame index as the first csv column by default, and the
# relative "original_dfs/" directory has to exist before this cell runs.
orders_table_df = data_extractor.read_rds_table("orders_table")
orders_table_df.to_csv("original_dfs/original_orders_table_df.csv")

In [None]:
# Prints a summary of "orders_table_df" via .info(); the bare expression on the
# last line makes the column index the displayed output of the cell.
orders_table_df.info()
orders_table_df.columns

In [None]:
# Cleans "orders_table_df" by discarding the "level_0" and "1" columns,
# then writes the cleaned frame to csv.

cleaned_orders_table_df = orders_table_df.drop(columns=["level_0", "1"])

cleaned_orders_table_df.info()

# Overwrites the cleaned csv with the current cleaned frame
cleaned_orders_table_df.to_csv("cleaned_dfs/cleaned_orders_table_df.csv")

# Card Details Cleaning

In [None]:
# Retrieve data from pdf and save it as a csv
# (retrieve_pdf_data() is called without arguments, so the pdf location is presumably
# fixed inside DataExtraction - confirm in data_extraction).

card_details_df = data_extractor.retrieve_pdf_data()
card_details_df.to_csv("original_dfs/original_card_details.csv")

In [None]:
# Prints a summary of "card_details_df" via .info(); the bare expression on the
# last line makes the column index the displayed output of the cell.
card_details_df.info()
card_details_df.columns


In [None]:
# Cleans the "card_details_df" and writes the cleaned frame to csv.
# Every DataCleaning helper is assumed to hand back a DataFrame - confirm in data_cleaning.
# Steps, in order: null cleaning, length-condition helper on "expiry_date" (limit 5),
# parsing "expiry_date" as month/year, then date conversion of "date_payment_confirmed".

cleaned_card_details_df = (
    card_details_df
    .pipe(data_cleaner.clean_null)
    .pipe(data_cleaner.clean_rows_by_length_condition, "expiry_date", 5)
    # assign() returns a new frame instead of writing the column in place, so the
    # frame handed back by the helper (possibly a filtered slice, or the raw
    # card_details_df itself) is never mutated and no SettingWithCopyWarning is raised.
    .assign(
        expiry_date=lambda frame: pd.to_datetime(frame["expiry_date"], format="%m/%y")
    )
    .pipe(data_cleaner.clean_convert_date_column, "date_payment_confirmed")
)

cleaned_card_details_df.info()

# Overwrites the cleaned csv with the current cleaned frame
cleaned_card_details_df.to_csv("cleaned_dfs/cleaned_card_details_df.csv")

## Store Details Cleaning

In [None]:
# Shows the return value of list_number_of_stores() as the cell output
# (presumably the store count reported by the API - confirm in data_extraction).
data_extractor.list_number_of_stores()


In [10]:
# Fetch the store data through the extractor; the returned object exposes .json()
# (presumably a requests.Response - confirm in data_extraction).
store_data = data_extractor.retrieve_stores_data()

# Decode the JSON body into plain Python objects.
store_data_json = store_data.json()

# Flatten the decoded JSON into a tabular DataFrame.
store_data_df = pd.json_normalize(store_data_json)
# Alternative that was tried and left disabled:
# store_data_df = pd.DataFrame.from_dict(store_data_json, orient='columns')

In [17]:
import store_header
import requests

def retrieve_stores_data_2(header=store_header.header_dict):
    """Fetch store details from the store_details endpoint as a flat DataFrame.

    Args:
        header: HTTP headers sent with the request. Defaults to
            ``store_header.header_dict``; the default is bound once, when this
            cell defines the function.

    Returns:
        pandas.DataFrame produced by ``pd.json_normalize`` from the JSON body.

    Raises:
        requests.HTTPError: if the endpoint answers with a 4xx/5xx status.
        requests.Timeout: if no response arrives within 30 seconds.
    """
    # NOTE(review): the query is pinned to index=1. An earlier draft built an unused
    # "0&1&...&450" string, presumably intending to request every store - TODO decide
    # how the full range should be fetched.
    url = "https://aqj7u5id95.execute-api.eu-west-1.amazonaws.com/prod/store_details?index=1"

    # A timeout keeps a stalled request from hanging the kernel indefinitely.
    response = requests.get(url=url, headers=header, timeout=30)
    # Fail loudly on an error status instead of normalising an error payload.
    response.raise_for_status()

    return pd.json_normalize(response.json())

# Runs the request with the default headers; the returned DataFrame is the cell output.
# NOTE(review): the output saved with this cell is a NotImplementedError, so the call
# did not complete in the recorded run.
retrieve_stores_data_2()

NotImplementedError: 