# Orders Data ETL
## Extract, clean and load orders data to SQL Sales database

In [8]:
import sys
sys.path.append('/Users/ronakasher/Deepa/multinational_retail_data_centralisation')

In [9]:
from source_code.database_utils import DatabaseConnector
from source_code.data_extraction import DataExtractor
from source_code.data_cleaning import DataCleaning
import pandas as pd

## Extract data from SQLAlchemy DB using yaml credentials file

In [10]:
# Read credentials from yaml file and return dictionary of credentials
connector = DatabaseConnector('../db_creds.yaml')
db_creds = connector.read_db_creds()

In [11]:
#Initialise and return sqlalchemy db engine
db_engine = connector.init_db_engine()

In [12]:
#List all tables in DB
table_list = connector.list_db_tables()
print(table_list)

['legacy_store_details', 'legacy_users', 'orders_table']


In [13]:
#Extract user database table to pandas DataFrame
extractor = DataExtractor()
orders_data = extractor.read_rds_table('orders_table')
orders_data.sample(100)

Unnamed: 0_level_0,level_0,date_uuid,first_name,last_name,user_uuid,card_number,store_code,product_code,1,product_quantity
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
44194,44195,d0e9ba4c-d042-400b-a78b-5e2a4cfc3b1f,,,7552fc75-8225-4574-a4b9-3b1919877439,4934395476359340,WA-A34BF06C,H3-2127985Z,,3
14852,14855,93f59ae4-73b2-410d-80c4-40dc6ce5ffe8,,,d4a46d93-f2a2-43c9-9a3f-577a6d584d7b,180030667993421,CH-7638016B,s8-877456Y,,6
32938,32938,5b436b6a-e90a-45a4-b61f-4358767cb22b,Jeffrey,Summers,367fc06f-1c81-4da8-a4bd-60787eaffbb3,340533893910728,WEB-1388012W,M6-0249525F,,2
90200,90200,2b568e0a-382a-49ee-8cae-6f34fb9068b3,,,ea736703-8199-46b1-9671-8a1b57b8bce7,3573569511985860,CO-D819CC5E,v0-0322422o,,1
86078,86078,a1d2a3dc-7551-4d47-a058-1ff04b8afd17,,,19dfc15d-4d78-41c7-9841-852538234927,502037895435,RU-F0666E4B,n7-0111416g,,4
...,...,...,...,...,...,...,...,...,...,...
85285,85285,24404085-206c-4850-935e-9626b8d6d5a5,,,773d977a-a55b-4a23-b958-ee0231d62c56,3542045162976040,SC-ADA59883,I2-9044234M,,3
33426,33426,3deb6447-3b02-48e0-9e28-1f4e9695701a,,,b350f474-13ab-4cfb-b6cb-56d07d04336d,2581120450492200,ST-10F19C97,Q9-2687893d,,7
115342,115342,58546578-c68d-4741-931b-0df2959f980e,,,dfa1884a-bc09-4d4c-a1a0-52cc92ffe3bc,3582852121965810,IN-E47115F0,m0-090471z,,1
112479,112479,13cf7a5c-72dd-476f-8676-3b4c59dfcb0b,,,4c87ab1e-1dbf-46ac-9b39-d2711b4eb746,639051773773,FA-6A7ABBAD,V0-463637X,,6


## Clean orders data using DataCleaning class from source code - includings dropping columns first_name, last_name, 1 and level_0

In [14]:
#Clean user data
clean_orders_data = DataCleaning().clean_orders_data(orders_data)

In [15]:
#Remove Null rows in cleaned data
clean_orders_data.dropna(how='all')
clean_orders_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 120123 entries, 0 to 118804
Data columns (total 6 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   date_uuid         120123 non-null  object
 1   user_uuid         120123 non-null  object
 2   card_number       120123 non-null  string
 3   store_code        120123 non-null  string
 4   product_code      120123 non-null  string
 5   product_quantity  120123 non-null  int8  
dtypes: int8(1), object(2), string(3)
memory usage: 5.6+ MB


## Upload to Sales DB in SQL

In [16]:
#Upload to sales_data DB (SQL)
upload = DatabaseConnector('../db_creds.yaml')
upload.upload_to_db(clean_orders_data,'orders_table')


123