# Orders Data ETL
## Extract, clean and load orders data to SQL Sales database

In [1]:
import sys
from pathlib import Path

# Make the project's `source_code` package importable without relying on a
# machine-specific absolute path: walk upward from the notebook's working
# directory and use the first directory that contains `source_code`.
_cwd = Path.cwd().resolve()
PROJECT_ROOT = next(
    (candidate for candidate in (_cwd, *_cwd.parents)
     if (candidate / 'source_code').is_dir()),
    # Fallback: the parent directory, which is also where this notebook
    # looks for '../db_creds.yaml'.
    _cwd.parent,
)

# Guard against duplicate entries when the cell is re-run in the same kernel.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

In [2]:
from source_code.database_utils import DatabaseConnector
from source_code.data_extraction import DataExtractor
from source_code.data_cleaning import DataCleaning
import pandas as pd

## Extract data from the source database (via SQLAlchemy) using a YAML credentials file

In [3]:
# Point a DatabaseConnector at the YAML credentials file, then load the
# credentials it describes (per the original note, returned as a dictionary).
CREDS_FILE = '../db_creds.yaml'
connector = DatabaseConnector(CREDS_FILE)
db_creds = connector.read_db_creds()

In [4]:
# Initialise the SQLAlchemy engine through the connector created earlier.
# NOTE(review): `db_engine` is not referenced again in the visible cells --
# confirm whether later steps rely on this call's side effects or whether it
# only serves as a connectivity check.
db_engine = connector.init_db_engine()

In [5]:
# List the tables available in the source database so the orders table can be
# identified by name before it is extracted.
table_list = connector.list_db_tables()
print(table_list)

['legacy_store_details', 'legacy_users', 'orders_table']


In [6]:
# Extract the orders table from the source database into a pandas DataFrame.
extractor = DataExtractor()
orders_data = extractor.read_rds_table('orders_table')

# Preview a random sample of rows; the fixed random_state keeps the displayed
# rows identical across re-runs so the saved output stays reproducible.
# NOTE(review): the raw preview includes customer names and card numbers --
# clear this cell's output before sharing or committing the notebook.
orders_data.sample(100, random_state=42)

Unnamed: 0_level_0,level_0,date_uuid,first_name,last_name,user_uuid,card_number,store_code,product_code,1,product_quantity
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
43450,43450,8ceffc57-313e-4bda-ae40-d6e867d5fd80,,,dcd3b53a-de5c-443f-8b66-793ec3ffcb2c,4378345582062300,BR-CACF7508,t1-9172271B,,3
12490,12490,88a2ced4-768b-4354-9c69-cbf32430be14,,,75f2009e-e0fb-4457-9b89-153713ea4aca,4130185890853200,TR-4E1AF636,g4-9992446T,,14
86060,86060,184c7fa7-c16d-4d35-9f91-d2d84e8a01dd,Janko,Kreusel,908e2bd4-a3be-4f50-a92d-79cb75b568b2,180081273797946,WEB-1388012W,T0-7447903S,,1
12491,12491,b60ffbe7-f2e4-4d1a-9740-0ee7b365dc06,,,a0f62626-1352-4f03-b9f8-1d23168c3ed6,3552101791030870,RA-2A22B217,o2-3674957Q,,3
883,883,b3979741-dad0-471d-bc2b-f00613328ce3,,,ec4705ed-42e3-4e9f-9692-8904d9004e91,4749056042863770000,KI-DD9663C2,v1-8704395l,,2
...,...,...,...,...,...,...,...,...,...,...
29035,29035,866af21b-7228-41fc-a9f5-15c2802460d3,Charles,Davies,14d68a7e-0cba-44be-93cf-351a64c07c19,4837450567392,WEB-1388012W,e6-6572733P,,7
112912,112912,c1fb3e8b-573f-458e-9693-ef56ff249b85,,,e0bbd3d0-78aa-47a3-a904-9e6aa5bc59fc,3574693572783210,WEB-1388012W,r2-7003477Y,,4
13231,13231,0dcf705b-0883-4607-a3bd-0814e12ddc1e,,,e70e7653-cd37-4ab6-bc5e-3f315ec7900a,639059677117,OB-7A4A9012,F1-6037791p,,2
117197,117197,1323ef55-dc79-49c7-857f-ddbafbbce9ff,Abigail,Davies,62a0a4f5-7ef6-443e-8029-7ff4bbf8cc5f,30417356961334,WEB-1388012W,B1-5575220J,,2


## Clean orders data using the DataCleaning class from the source code, including dropping the columns `first_name`, `last_name`, `1` and `level_0`

In [7]:
# Clean the extracted orders data (this is the orders table, not user data).
# The cleaning rules themselves live in DataCleaning.clean_orders_data; the
# result is kept under a new name so the raw `orders_data` frame is still
# available for comparison.
clean_orders_data = DataCleaning().clean_orders_data(orders_data)

In [8]:
# Remove rows in which every column is null. DataFrame.dropna returns a new
# frame and leaves the caller untouched, so the result has to be assigned
# back -- a bare `clean_orders_data.dropna(...)` call would be a no-op.
clean_orders_data = clean_orders_data.dropna(how='all')

# Summarise columns, dtypes and non-null counts to confirm the cleaned shape.
clean_orders_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 120123 entries, 0 to 118804
Data columns (total 6 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   date_uuid         120123 non-null  object
 1   user_uuid         120123 non-null  object
 2   card_number       120123 non-null  string
 3   store_code        120123 non-null  string
 4   product_code      120123 non-null  string
 5   product_quantity  120123 non-null  int8  
dtypes: int8(1), object(2), string(3)
memory usage: 5.6+ MB


## Upload to Sales DB in SQL

In [9]:
# Upload the cleaned orders to the sales_data database as table 'orders_table'.
# A second DatabaseConnector is built from the same credentials file that was
# used for extraction.
# NOTE(review): presumably upload_to_db connects to the local sales database
# through its own settings rather than the source credentials -- confirm in
# source_code.database_utils, and whether the existing `connector` could be reused.
upload = DatabaseConnector('../db_creds.yaml')
upload.upload_to_db(clean_orders_data,'orders_table')


123