# ETL phase 2: Export DB to .csv


## 1. Import necessary modules

In [1]:
#==============================================
#=== Method 1: Install and import necessary modules
#==============================================
# `import_neccessary_modules` is not defined in this notebook; it is presumably
# provided by this star import from the project-local `tool_import_modules`
# module — TODO confirm.
from tool_import_modules import *
# Names of the modules handed to the helper, one call per name.
modules = ['os', 'pandas', 'pyodbc', 'numpy', 'glob', 'seaborn', 'matplotlib', 'logging', 'time', 'openpyxl']
for module in modules:
    # NOTE(review): judging by the section title, the helper installs the
    # module if it is missing and then imports it — confirm against
    # tool_import_modules before relying on it.
    import_neccessary_modules(module)

In [2]:
#==============================================
#=== Method 2: Import modules directly
#==============================================
import os
import pandas
import pyodbc
import numpy as np
import glob
import seaborn as sns
import matplotlib.pyplot as plt
import logging
import time
from openpyxl import Workbook
from openpyxl import load_workbook

## 2. Set path, config, and connection

In [9]:
# Configure the working folders, the log file, the audit workbook and the
# SQL Server connection used by the rest of the notebook.
#
# Names defined here for later cells: my_dbName, my_path, my_path_DB,
# my_path_DW, my_path_cleaned, directors, log_fileName, audit_fileName,
# audit_fullPath, logger, auditExcel, sheet1, sheet2, adtExcSh1Row,
# adtExcSh2Row, sql_conn.

# Set path
# NOTE(review): absolute, machine-specific root folder; adjust per environment.
my_dbName = '0179Orders_Org'
my_path = r"C:\MyDataFiles\Data_JayCoop_202109"
# os.path.join avoids the invalid "\D" / "\c" escape sequences that plain
# string concatenation with "\DB", "\DW" and "\cleaned" relied on.
my_path_DB = os.path.join(my_path, "DB")
my_path_DW = os.path.join(my_path, "DW")
my_path_cleaned = os.path.join(my_path, "cleaned")
directors = [my_path_DB, my_path_DW, my_path_cleaned]

# Set file names (one timestamp so both names always carry the same date)
run_date = time.strftime("%Y%m%d")
log_fileName = run_date + '_DB.log'
audit_fileName = run_date + '_DB_audit.xlsx'
audit_fullPath = os.path.join(my_path, audit_fileName)

# Check path BEFORE touching it: the log file lives inside my_path, so the
# folder has to exist before os.chdir() and the FileHandler are set up.
my_path_created = not os.path.exists(my_path)
if my_path_created:
    os.makedirs(my_path)
os.chdir(my_path)

# Set log file
logger = logging.getLogger()
# Remove AND close handlers left over from a previous run of this cell;
# an unclosed FileHandler keeps the old log file open.
for old_handler in list(logger.handlers):
    logger.removeHandler(old_handler)
    old_handler.close()
# The root logger defaults to WARNING, which would silently drop every
# logger.info() call below on a fresh kernel.
logger.setLevel(logging.INFO)
# Create file handler ('w'-overwrite; 'a'-append)
fh = logging.FileHandler(os.path.join(my_path, log_fileName), 'w')
fh.setLevel(logging.INFO)
# Create console handler
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
# Create formatter and add it to the handlers
formatter = logging.Formatter('%(asctime)s : [%(levelname)s] %(message)s')
fh.setFormatter(formatter)
ch.setFormatter(formatter)
# Add the handlers to the logger
logger.addHandler(fh)
logger.addHandler(ch)

if my_path_created:
    logger.info("Directory created: " + my_path)
else:
    # Clean log/audit files from earlier runs, keeping today's files.
    # NOTE(review): os.walk is recursive, so *.log and *.xlsx files in every
    # sub-folder of my_path are deleted as well — confirm this is intended.
    logExtension = ".log"
    auditExtension = '.xlsx'
    keep_by_extension = {logExtension: log_fileName, auditExtension: audit_fileName}
    for root_folder, folders, files in os.walk(my_path):
        for file in files:
            file_extension = os.path.splitext(file)[1]
            if file_extension not in keep_by_extension:
                continue
            if file == keep_by_extension[file_extension]:
                continue
            file_path = os.path.join(root_folder, file)
            # os.remove() returns None and raises OSError on failure, so the
            # outcome has to be detected with try/except, not a truth test.
            try:
                os.remove(file_path)
            except OSError as exc:
                logger.warning("Unable to delete the " + file_path + " (" + str(exc) + ")")
            else:
                logger.info("File deleted successfully: " + file_path)

# Check directors
for director in directors:
    if not os.path.exists(director):
        os.makedirs(director)
        logger.debug('\nDirectory created: ' + director)

# Check auditExcel: create today's workbook with its two header rows, or
# reopen the existing one.
if not os.path.isfile(audit_fullPath):
    auditExcel = Workbook()
    sheet1 = auditExcel.active
    sheet1.title = 'CreatedFiles'
    sheet1.append(["File", "CreatedTime", "Path"])
    sheet2 = auditExcel.create_sheet(title="Cleansing")
    sheet2.append(["Database", "Table", "Column", "Value", "Issue"])
    auditExcel.save(audit_fullPath)
else:
    auditExcel = load_workbook(filename = audit_fullPath)
    sheet1 = auditExcel["CreatedFiles"]
    sheet2 = auditExcel["Cleansing"]
# Last used row per sheet; set for both branches, each from its own sheet.
adtExcSh1Row = sheet1.max_row
adtExcSh2Row = sheet2.max_row

# Print log header
logger.info('==== Extract: from DB(SQL Server) ====')
# Set up SQL Server connector (database name comes from my_dbName)
os.chdir(my_path_DB)
# FIXME: credentials should not live in the notebook. They can be supplied
# through the SQLSERVER_UID / SQLSERVER_PWD environment variables; the literal
# fallbacks only keep existing setups working and should be removed.
sql_uid = os.environ.get('SQLSERVER_UID', 'sa')
sql_pwd = os.environ.get('SQLSERVER_PWD', 'SQLServer2019')
sql_conn = pyodbc.connect(
    'DRIVER={SQL Server}; SERVER=localhost; DATABASE=' + my_dbName
    + '; UID=' + sql_uid + '; PWD=' + sql_pwd
)

2021-09-28 00:18:15,317 : [INFO] ==== Extract: from DB(SQL Server) ====


## 3. Get table name list

In [10]:
# Fetch the names of all base tables (except 'sysdiagrams') from the
# database catalog and show them.
logger.info('The tables are creating in ' + my_path_DB)
tables = (
    "SELECT TABLE_NAME "
    "FROM INFORMATION_SCHEMA.TABLES "
    "WHERE TABLE_TYPE = 'BASE TABLE' "
    "AND TABLE_NAME != 'sysdiagrams'"
)
tbls = pandas.read_sql(sql=tables, con=sql_conn)
print(tbls)

2021-09-28 00:18:21,221 : [INFO] The tables are creating in C:\MyDataFiles\Data_JayCoop_202109\DB


              TABLE_NAME
0                 Orders
1               Products
2          Order Details
3   CustomerCustomerDemo
4   CustomerDemographics
5                 Region
6            Territories
7    EmployeeTerritories
8                   test
9              Employees
10            Categories
11             Customers
12              Shippers
13             Suppliers


## 4. Extract DB tables from Docker to `DB\*.csv`

In [8]:
# Export every table listed in `tbls` to <my_path_DB>\<table>.csv and record
# each file that was written on the "CreatedFiles" sheet of the audit workbook.
# A table whose CSV cannot be written is logged and skipped; the loop goes on.
auditExcel = load_workbook(filename = audit_fullPath)
sheet1 = auditExcel["CreatedFiles"]
adtExcSh1Row = sheet1.max_row
created_count = 0      # files written during this run
failed_tables = []     # tables whose CSV could not be written
for index, row in tbls.iterrows():
    # Read the table from SQL Server
    tableName = row['TABLE_NAME']
    fileName = tableName + '.csv'
    db_csv_fullPath = os.path.join(my_path_DB, fileName)
    # NOTE(review): the schema is hard-coded to [dbo]. "]" is doubled so an
    # unusual table name cannot break out of the bracket quoting.
    query = "SELECT * FROM [dbo].[" + tableName.replace("]", "]]") + "]"
    # Call the method; assigning to logger.debug would replace it with a string.
    logger.debug(query)
    logger.info('--' + str(index+1) + '. ' + fileName)
    df = pandas.read_sql(query, sql_conn)

    # Write to DB\*.csv
    try:
        df.to_csv(db_csv_fullPath, index=False)
    except Exception as exc:
        # Typical cause: the CSV is open in another program or not writable.
        logger.warning('**** File did NOT update successfully. Please try again after make sure file is not opened and have pomission to write. - ' + db_csv_fullPath + ' (' + str(exc) + ')')
        failed_tables.append(tableName)
        continue

    # Audit only the files that were actually written.
    adtExcSh1Row = adtExcSh1Row + 1
    sheet1.cell(row=adtExcSh1Row, column=1).value = fileName
    sheet1.cell(row=adtExcSh1Row, column=2).value = time.asctime()
    sheet1.cell(row=adtExcSh1Row, column=3).value = my_path_DB
    created_count += 1

auditExcel.save(audit_fullPath)
# Report the number of files written in this run instead of counting whatever
# happens to be in the current working directory.
if failed_tables:
    logger.warning('Extract Completed With Errors - ' + str(created_count) + ' files created in ' + my_path_DB + '; failed tables: ' + ', '.join(failed_tables))
else:
    logger.info('Extract Completed Successfully - ' + str(created_count) + ' files created in ' + my_path_DB)

2021-09-28 00:10:35,674 : [INFO] --1. Orders.csv
2021-09-28 00:10:43,333 : [INFO] --2. Products.csv
2021-09-28 00:10:44,959 : [INFO] --3. Order Details.csv
2021-09-28 00:11:01,797 : [INFO] --4. CustomerCustomerDemo.csv
2021-09-28 00:11:01,809 : [INFO] --5. CustomerDemographics.csv
2021-09-28 00:11:01,816 : [INFO] --6. Region.csv
2021-09-28 00:11:01,823 : [INFO] --7. Territories.csv
2021-09-28 00:11:01,832 : [INFO] --8. EmployeeTerritories.csv
2021-09-28 00:11:01,841 : [INFO] --9. test.csv
2021-09-28 00:11:39,883 : [INFO] --10. Employees.csv
2021-09-28 00:11:40,570 : [INFO] --11. Categories.csv
2021-09-28 00:11:40,597 : [INFO] --12. Customers.csv
2021-09-28 00:11:41,007 : [INFO] --13. Shippers.csv
2021-09-28 00:11:41,018 : [INFO] --14. Suppliers.csv
2021-09-28 00:11:41,661 : [INFO] Extract Completed Successfully - 14 files created in C:\MyDataFiles\Data_JayCoop_202109\DB
