# User Data ETL
## Extract, clean and load User data to SQL Sales database

In [13]:
import sys
from pathlib import Path

# Make the project's `source_code` package importable without a user-specific
# absolute path. The notebook reads '../db_creds.yaml', so the project root is
# taken to be the parent of the working directory.
# NOTE(review): assumes the kernel's working directory is this notebook's
# folder and that it sits directly under the project root — confirm.
PROJECT_ROOT = Path.cwd().parent
if str(PROJECT_ROOT) not in sys.path:  # keep re-runs of this cell from stacking duplicates
    sys.path.append(str(PROJECT_ROOT))

In [14]:
from source_code.database_utils import DatabaseConnector
from source_code.data_extraction import DataExtractor
from source_code.data_cleaning import DataCleaning
import pandas as pd

## Extract data from the source database (via SQLAlchemy) using a YAML credentials file

In [15]:
# Point a connector at the YAML credentials file, then load the credentials
# it describes (a dictionary, per the original note — verify in database_utils).
creds_file = '../db_creds.yaml'
connector = DatabaseConnector(creds_file)
db_creds = connector.read_db_creds()

In [16]:
# Initialise the SQLAlchemy engine through the connector and keep a handle to it.
# NOTE(review): db_engine is not referenced again in the cells visible here —
# confirm whether later steps rely on this call having been made.
db_engine = connector.init_db_engine()

In [17]:
# Ask the connector for the names of all tables in the database and print them;
# the user table extracted below ('legacy_users') should appear in this list.
table_list = connector.list_db_tables()
print(table_list)

['legacy_store_details', 'legacy_users', 'orders_table']


In [18]:
# Extract the 'legacy_users' table into a pandas DataFrame.
extractor = DataExtractor()
user_data = extractor.read_rds_table('legacy_users')
# Preview a random sample of rows. random_state pins the selection so that
# Restart & Run All reproduces the same preview instead of new rows each run.
user_data.sample(100, random_state=42)

Unnamed: 0_level_0,first_name,last_name,date_of_birth,company,email_address,address,country,country_code,phone_number,join_date,user_uuid
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
11600,Curt,Müller,1992-03-11,Dörr AG,patriciadrewes@krein.com,Schäferstraße 6\n79077 Mellrichstadt,Germany,DE,(08391) 628670,2011-12-18,e5cc973d-fb45-4a0d-ab5c-57f39b67cdf4
2551,Josefine,Gute,1949-05-03,Mälzer GmbH,vdobes@neureuther.de,Jonas-Römer-Straße 0\n02085 Büsingen am Hochrhein,Germany,DE,(09929) 52301,2010-03-06,742759d0-280f-4662-b0c8-f3ec8caa312b
3847,Käthi,Christoph,1980-10-01,Boucsein KG,friederike43@gertz.de,Bolandergasse 299\n21578 Wanzleben,Germany,DE,01830571068,2014-12-08,44a4b33f-91d2-49b8-a8c5-d3ad108fcd75
9291,Terry,Butler,1998-12-26,White-Thomas,allan18@gilbert-morris.com,0 Richardson turnpike\nWest Conor\nPE4E 9FZ,United Kingdom,GB,+44(0)161 4960323,2001-04-01,855281e2-5a1e-47c3-95ea-a7314f3fa956
1938,Ben,Ward,1938-12-05,Ashton-Herbert,oiqbal@dawson.org,28 Phillip extension\nChandlerberg\nTN4A 1PN,United Kingdom,GB,01214960580,2000-02-15,3c99da63-0884-4510-831e-a27a57ab83a1
...,...,...,...,...,...,...,...,...,...,...,...
9998,Michael,Hunter,1951-08-15,Koch-White,clarkjudith@carter-norman.com,Unit 3541 Box 2045\nDPO AP 12344,United States,US,308.048.7992x66671,2015-01-01,2ce832e0-80d0-4840-830e-7cc5c58d6494
13819,Roselinde,Niemeier,1978-08-06,Gorlitz AG,muehledajana@stadelmann.org,Gertraud-Scheibe-Ring 43\n87422 Nabburg,Germany,DE,+49(0)9413 489828,1995-03-21,46c642f6-178b-4d77-9393-186f5e223cd3
15310,Gary,Clark,1990-06-26,Winter Inc,qmills@watson-lewis.com,6 Wong parks\nTurnerville\nS17 6TF,United Kingdom,GB,(0141) 4960179,2004-11-13,02c92839-d725-4bce-beeb-16c7b3534c5c
4973,Sonya,Ramirez,1995-04-10,Wheeler-Beard,jamie27@morton-williamson.com,"846 Matthews Fall Suite 116\nAngelatown, CT 13430",United States,US,(397)841-7725x344,2016-11-26,1dc761f1-7f8a-4b0e-b672-d6a9fb68b130


## Clean User data using DataCleaning class from source code

In [19]:
# Run the raw user table through the project's user-data cleaning routine.
cleaner = DataCleaning()
clean_user_data = cleaner.clean_user_data(user_data)

In [20]:
# Remove rows in which every column is null.
# DataFrame.dropna returns a new frame rather than modifying in place, so the
# result has to be assigned back; otherwise no rows are actually removed.
# NOTE(review): the recorded .info() output shows phone_number non-null on all
# rows, so how='all' may still keep rows that are null in every other column —
# confirm whether a `subset=` of key columns is what is really wanted.
clean_user_data = clean_user_data.dropna(how='all')
clean_user_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15300 entries, 0 to 1249
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   first_name     15284 non-null  object
 1   last_name      15285 non-null  object
 2   date_of_birth  15284 non-null  object
 3   company        15284 non-null  object
 4   email_address  15241 non-null  string
 5   address        15284 non-null  object
 6   country        15284 non-null  object
 7   country_code   15284 non-null  string
 8   phone_number   15300 non-null  object
 9   join_date      15284 non-null  object
 10  user_uuid      15284 non-null  object
dtypes: object(9), string(2)
memory usage: 1.4+ MB


In [21]:
# Load the cleaned user table into the sales_data SQL database as 'dim_users'.
# The upload call stays as the cell's last expression so its return value is displayed.
target_table = 'dim_users'
upload = DatabaseConnector('../db_creds.yaml')
upload.upload_to_db(clean_user_data, target_table)


300