# ETL phase 2: Export DB to .csv


## 1. Import necessary modules

In [1]:
#==============================================
#=== Method 1: Install and import necessary modules
#==============================================
# The helper `import_neccessary_modules` is provided by the project-local
# module `tool_import_modules`; it is called once per module name below.
# NOTE(review): judging by the section title it presumably installs a missing
# package before importing it -- confirm against tool_import_modules.
from tool_import_modules import *

modules = [
    'os',
    'pandas',
    'pyodbc',
    'numpy',
    'glob',
    'seaborn',
    'matplotlib',
    'logging',
    'time',
    'openpyxl',
]
for module in modules:
    import_neccessary_modules(module)

In [2]:
#==============================================
#=== Method 2: Import modules directly
#==============================================
import getpass
import glob
import logging
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas
import pyodbc
import seaborn as sns
from openpyxl import Workbook
from openpyxl import load_workbook

## 2. Set path, config, and connection

In [3]:
# Configuration cell: builds the working-folder layout, (re)configures the root
# logger, removes log/audit files left over from earlier days, opens (or
# creates) today's audit workbook and connects to SQL Server.

# Set path
my_dbName = '0179Orders_Org'
my_path = r"C:\MyDataFiles\Data_JayCoop_202109"
# os.path.join instead of string concatenation: "\D" and "\c" are invalid
# escape sequences in a normal string literal and trigger syntax warnings.
my_path_DB = os.path.join(my_path, "DB")
my_path_DW = os.path.join(my_path, "DW")
my_path_cleaned = os.path.join(my_path, "cleaned")
directors = [my_path_DB, my_path_DW, my_path_cleaned]

# Set file names (both derive from the same date stamp)
run_date = time.strftime("%Y%m%d")
log_fileName = run_date + '_DB.log'
audit_fileName = run_date + '_DB_audit.xlsx'
audit_fullPath = os.path.join(my_path, audit_fileName)

# The working folder must exist BEFORE we chdir into it and open the log file
# there; remember whether we created it so it can be logged once logging is up.
my_path_created = not os.path.exists(my_path)
os.makedirs(my_path, exist_ok=True)
os.chdir(my_path)

# Set log file
logger = logging.getLogger()
# Drop handlers from a previous run of this cell and close them so the old
# log file handle is released (otherwise the file cannot be deleted on Windows).
for old_handler in list(logger.handlers):
    logger.removeHandler(old_handler)
    old_handler.close()
# The root logger defaults to WARNING; without this every logger.info() call
# in the notebook is silently dropped on a fresh kernel.
# Lower to logging.DEBUG to also see the debug messages on the console.
logger.setLevel(logging.INFO)
# Create file handler
fh = logging.FileHandler(log_fileName, 'w') # 'w'-overwrite; 'a'-append
fh.setLevel(logging.INFO)
# Create console handler (passes everything the logger level lets through)
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
# Create formatter and add it to the handlers
formatter = logging.Formatter('%(asctime)s : [%(levelname)s] %(message)s')
fh.setFormatter(formatter)
ch.setFormatter(formatter)
# Add the handlers to the logger
logger.addHandler(fh)
logger.addHandler(ch)

# Check path
if my_path_created:
    logger.info("Directory created: " + my_path)
# Clean log files
else:
    logExtension = ".log"
    auditExtension = '.xlsx'
    # Extension -> the one file name of that type that must be kept (today's).
    # NOTE: this walks the whole tree, so ANY other .log/.xlsx file below
    # my_path (including in DB, DW and cleaned) is deleted.
    files_to_keep = {logExtension: log_fileName, auditExtension: audit_fileName}
    for root_folder, folders, files in os.walk(my_path):
        for file in files:
            file_extension = os.path.splitext(file)[1]
            if file_extension not in files_to_keep:
                continue
            if file == files_to_keep[file_extension]:
                continue
            file_path = os.path.join(root_folder, file)
            # os.remove() returns None and raises OSError on failure, so the
            # outcome has to be detected with try/except, not a truth test.
            try:
                os.remove(file_path)
            except OSError as exc:
                logger.warning("Unable to delete the " + file_path + " (" + str(exc) + ")")
            else:
                logger.info("File deleted successfully: " + file_path)

# Check directors
for director in directors:
    if not os.path.exists(director):
        os.makedirs(director)
        logger.debug('\nDirectory created: ' + director)

# Check auditExcel: create today's workbook with its two header rows, or
# re-open the existing one.
if not os.path.isfile(audit_fullPath):
    auditExcel = Workbook()
    sheet1 = auditExcel.active
    sheet1.title = 'CreatedFiles'
    sheet1.append(["File", "CreatedTime", "Path"])
    sheet2 = auditExcel.create_sheet(title="Cleansing")
    sheet2.append(["Database", "Table", "Column", "Value", "Issue"])
    auditExcel.save(audit_fullPath)
else:
    auditExcel = load_workbook(filename = audit_fullPath)
    sheet1 = auditExcel["CreatedFiles"]
    sheet2 = auditExcel["Cleansing"]
# Last used row of each sheet; defined for both branches, and sheet2's
# counter is read from sheet2 (it was previously read from sheet1).
adtExcSh1Row = sheet1.max_row
adtExcSh2Row = sheet2.max_row

# Print log header
logger.info('==== Extract: from DB(SQL Server) ====')
# Set up SQL Server connector
os.chdir(my_path_DB)
# Credentials are taken from the environment instead of being stored in the
# notebook; if SQLSERVER_PWD is not set the password is prompted for.
db_user = os.environ.get('SQLSERVER_UID', 'sa')
db_password = os.environ.get('SQLSERVER_PWD') or getpass.getpass('SQL Server password for ' + db_user + ': ')
sql_conn = pyodbc.connect(
    'DRIVER={SQL Server}; SERVER=localhost; DATABASE=' + my_dbName
    + '; UID=' + db_user + '; PWD=' + db_password
)

2021-09-28 14:41:44,494 : [INFO] ==== Extract: from DB(SQL Server) ====


## 3. Get table name list

In [16]:
# List the user tables of the connected database and, per table, how many
# PRIMARY KEY / UNIQUE / FOREIGN KEY constraints it has.
# Get table name list
logger.info('The tables are creating in ' + my_path_DB)
tables = "SELECT TABLE_NAME FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_TYPE = 'BASE TABLE' AND TABLE_NAME != 'sysdiagrams'"
tbls = pandas.read_sql(tables, sql_conn)
print(tbls)
# Get table info (constraint counts per table)
tables_info = "SELECT t.TABLE_NAME, SUM(CASE WHEN c.CONSTRAINT_TYPE = 'PRIMARY KEY' THEN 1 ELSE 0 END) AS pk, SUM(CASE WHEN c.CONSTRAINT_TYPE = 'UNIQUE' THEN 1 ELSE 0 END) AS uni, SUM(CASE WHEN c.CONSTRAINT_TYPE = 'FOREIGN KEY' THEN 1 ELSE 0 END) AS fk FROM INFORMATION_SCHEMA.TABLES as t LEFT JOIN INFORMATION_SCHEMA.TABLE_CONSTRAINTS as c ON t.TABLE_NAME = c.TABLE_NAME WHERE (t.TABLE_TYPE = 'BASE TABLE' AND t.TABLE_NAME != 'sysdiagrams') GROUP BY t.TABLE_NAME ORDER BY t.TABLE_NAME ASC;"
# Run the query defined just above; the previous code passed `tables2`, a
# name that is never defined in this notebook (NameError on a fresh kernel).
tbls_info = pandas.read_sql(tables_info, sql_conn)
print('===============')
print(tbls_info)

2021-09-29 01:27:23,165 : [INFO] The tables are creating in C:\MyDataFiles\Data_JayCoop_202109\DB


              TABLE_NAME
0                 Orders
1               Products
2          Order Details
3   CustomerCustomerDemo
4   CustomerDemographics
5                 Region
6            Territories
7    EmployeeTerritories
8                   test
9              Employees
10            Categories
11             Customers
12              Shippers
13             Suppliers
              TABLE_NAME  pk  uni  fk
0             Categories   1    0   0
1   CustomerCustomerDemo   1    0   2
2   CustomerDemographics   1    0   0
3              Customers   1    0   0
4              Employees   1    0   0
5    EmployeeTerritories   0    0   2
6          Order Details   1    0   2
7                 Orders   1    0   3
8               Products   1    0   2
9                 Region   1    0   0
10              Shippers   1    0   0
11             Suppliers   1    0   0
12           Territories   1    0   1
13                  test   0    0   0


## 4. Extract DB from Docker to DB\*.csv

In [18]:
# Export every table listed in `tbls` to <my_path_DB>\<table>.csv and record
# each file that was written in the "CreatedFiles" sheet of the audit workbook.
# Re-open the audit workbook so this cell can be re-run on its own.
auditExcel = load_workbook(filename = audit_fullPath)
sheet1 = auditExcel["CreatedFiles"]
adtExcSh1Row = sheet1.max_row
exported_count = 0  # number of CSV files actually written in this run
for index, row in tbls.iterrows():
    # Read from SQL Server
    tableName = row['TABLE_NAME']
    fileName = tableName + '.csv'
    db_csv_fullPath = os.path.join(my_path_DB, fileName)
    # Identifiers cannot be bound as parameters; bracket-quote the name and
    # double any ']' so unusual table names cannot break the statement.
    query = "SELECT * FROM [dbo].[" + tableName.replace(']', ']]') + "]"
    # Call debug(); assigning to logger.debug would replace the method.
    logger.debug(query)
    logger.info('--' + str(index+1) + '. ' + tableName + '.csv')
    df = pandas.read_sql(query, sql_conn)
    # Data overview
    df.info()
    # Column metadata for this table; the table name is passed as a bound
    # parameter ('?' placeholder) rather than concatenated into the SQL text.
    tables_info2 = "SELECT COLUMN_NAME, DATA_TYPE, CHARACTER_MAXIMUM_LENGTH, NUMERIC_PRECISION, IS_NULLABLE, COLUMN_DEFAULT FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = ?"
    tbls_info2 = pandas.read_sql(tables_info2, sql_conn, params=[tableName])
    print('===============')
    print(tbls_info2)
    # Write to DB\*.csv
    # NOTE(review): the to_csv call was commented out in the original cell, so
    # nothing was exported although the audit sheet and the final log line
    # reported the files as created. It is restored here.
    try:
        df.to_csv(db_csv_fullPath, index=False)
    except OSError as exc:
        # Typically the target file is open elsewhere or not writable.
        logger.warning('**** File did NOT update successfully. Please try again after make sure file is not opened and have pomission to write. - ' + db_csv_fullPath + ' (' + str(exc) + ')')
        continue
    # Only files that were really written get an audit row.
    adtExcSh1Row = adtExcSh1Row + 1
    sheet1.cell(row=adtExcSh1Row, column=1).value = str(tableName + '.csv')
    sheet1.cell(row=adtExcSh1Row, column=2).value = time.asctime()
    sheet1.cell(row=adtExcSh1Row, column=3).value = my_path_DB
    exported_count += 1
auditExcel.save(audit_fullPath)
# Report the files written by this run instead of counting whatever happens to
# be in the current working directory.
logger.info('Extract Completed Successfully - ' + str(exported_count) + ' files created in ' + my_path_DB)

2021-09-29 01:49:51,853 : [INFO] --1. Orders.csv


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 111077 entries, 0 to 111076
Data columns (total 15 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   OrderID         111077 non-null  int64         
 1   CustomerID      100785 non-null  object        
 2   EmployeeID      111077 non-null  int64         
 3   OrderDate       100785 non-null  datetime64[ns]
 4   RequiredDate    100785 non-null  datetime64[ns]
 5   ShippedDate     100765 non-null  datetime64[ns]
 6   ShipVia         100785 non-null  float64       
 7   Freight         100785 non-null  float64       
 8   ShipName        100785 non-null  object        
 9   ShipAddress     100785 non-null  object        
 10  ShipCity        100785 non-null  object        
 11  ShipRegion      100323 non-null  object        
 12  ShipPostalCode  100766 non-null  object        
 13  ShipCountry     100785 non-null  object        
 14  CreditCard      111077 non-null  obj

2021-09-29 01:49:54,544 : [INFO] --2. Products.csv


       COLUMN_NAME DATA_TYPE  CHARACTER_MAXIMUM_LENGTH  NUMERIC_PRECISION  \
0          OrderID       int                       NaN               10.0   
1       CustomerID     nchar                       5.0                NaN   
2       EmployeeID       int                       NaN               10.0   
3        OrderDate  datetime                       NaN                NaN   
4     RequiredDate  datetime                       NaN                NaN   
5      ShippedDate  datetime                       NaN                NaN   
6          ShipVia       int                       NaN               10.0   
7          Freight     money                       NaN               19.0   
8         ShipName  nvarchar                      40.0                NaN   
9      ShipAddress  nvarchar                      60.0                NaN   
10        ShipCity  nvarchar                      15.0                NaN   
11      ShipRegion  nvarchar                      15.0                NaN   

2021-09-29 01:49:55,983 : [INFO] --3. Order Details.csv


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80616 entries, 0 to 80615
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ProductID        80616 non-null  int64  
 1   ProductName      80616 non-null  object 
 2   SupplierID       80616 non-null  int64  
 3   CategoryID       80616 non-null  int64  
 4   QuantityPerUnit  80616 non-null  object 
 5   UnitPrice        80616 non-null  float64
 6   UnitsInStock     80616 non-null  int64  
 7   UnitsOnOrder     80616 non-null  int64  
 8   ReorderLevel     80616 non-null  int64  
 9   Discontinued     80616 non-null  bool   
dtypes: bool(1), float64(1), int64(6), object(2)
memory usage: 5.0+ MB
       COLUMN_NAME DATA_TYPE  CHARACTER_MAXIMUM_LENGTH  NUMERIC_PRECISION  \
0        ProductID       int                       NaN               10.0   
1      ProductName  nvarchar                      40.0                NaN   
2       SupplierID       int         

2021-09-29 01:50:07,586 : [INFO] --4. CustomerCustomerDemo.csv
2021-09-29 01:50:07,628 : [INFO] --5. CustomerDemographics.csv
2021-09-29 01:50:07,665 : [INFO] --6. Region.csv
2021-09-29 01:50:07,703 : [INFO] --7. Territories.csv


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1203498 entries, 0 to 1203497
Data columns (total 5 columns):
 #   Column     Non-Null Count    Dtype  
---  ------     --------------    -----  
 0   OrderID    1203498 non-null  int64  
 1   ProductID  1203498 non-null  int64  
 2   UnitPrice  1203498 non-null  float64
 3   Quantity   1203498 non-null  int64  
 4   Discount   1203498 non-null  float64
dtypes: float64(2), int64(3)
memory usage: 45.9 MB
  COLUMN_NAME DATA_TYPE CHARACTER_MAXIMUM_LENGTH  NUMERIC_PRECISION  \
0     OrderID       int                     None                 10   
1   ProductID       int                     None                 10   
2   UnitPrice     money                     None                 19   
3    Quantity  smallint                     None                  5   
4    Discount      real                     None                 24   

  IS_NULLABLE COLUMN_DEFAULT  
0          NO           None  
1          NO           None  
2          NO          

2021-09-29 01:50:07,745 : [INFO] --8. EmployeeTerritories.csv
2021-09-29 01:50:07,788 : [INFO] --9. test.csv


            COLUMN_NAME DATA_TYPE  CHARACTER_MAXIMUM_LENGTH  \
0           TerritoryID  nvarchar                      20.0   
1  TerritoryDescription      char                      50.0   
2              RegionID       int                       NaN   

   NUMERIC_PRECISION IS_NULLABLE COLUMN_DEFAULT  
0                NaN          NO           None  
1                NaN          NO           None  
2               10.0          NO           None  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49 entries, 0 to 48
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   EmployeeID   49 non-null     int64 
 1   TerritoryID  49 non-null     object
dtypes: int64(1), object(1)
memory usage: 652.0+ bytes
   COLUMN_NAME DATA_TYPE  CHARACTER_MAXIMUM_LENGTH  NUMERIC_PRECISION  \
0   EmployeeID       int                       NaN               10.0   
1  TerritoryID  nvarchar                      20.0                NaN   

  IS_N

2021-09-29 01:50:27,056 : [INFO] --10. Employees.csv


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89999 entries, 0 to 89998
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   stuff   89999 non-null  object
dtypes: object(1)
memory usage: 351.6+ KB
  COLUMN_NAME DATA_TYPE  CHARACTER_MAXIMUM_LENGTH NUMERIC_PRECISION  \
0       stuff      char                      8000              None   

  IS_NULLABLE COLUMN_DEFAULT  
0         YES           None  


2021-09-29 01:50:27,472 : [INFO] --11. Categories.csv
2021-09-29 01:50:27,545 : [INFO] --12. Customers.csv


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10009 entries, 0 to 10008
Data columns (total 18 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   EmployeeID       10009 non-null  int64         
 1   LastName         10009 non-null  object        
 2   FirstName        10009 non-null  object        
 3   Title            10009 non-null  object        
 4   TitleOfCourtesy  10009 non-null  object        
 5   BirthDate        10009 non-null  datetime64[ns]
 6   HireDate         10009 non-null  datetime64[ns]
 7   Address          10009 non-null  object        
 8   City             10009 non-null  object        
 9   Region           7463 non-null   object        
 10  PostalCode       10009 non-null  object        
 11  Country          10009 non-null  object        
 12  HomePhone        10009 non-null  object        
 13  Extension        10009 non-null  object        
 14  Photo            9 non-null      objec

2021-09-29 01:50:27,930 : [INFO] --13. Shippers.csv
2021-09-29 01:50:28,044 : [INFO] --14. Suppliers.csv


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13007 entries, 0 to 13006
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   CustomerID    13007 non-null  object
 1   CompanyName   13007 non-null  object
 2   ContactName   13007 non-null  object
 3   ContactTitle  13007 non-null  object
 4   Address       13007 non-null  object
 5   City          13007 non-null  object
 6   Region        12955 non-null  object
 7   PostalCode    13006 non-null  object
 8   Country       13007 non-null  object
 9   Phone         13007 non-null  object
 10  Fax           12989 non-null  object
dtypes: object(11)
memory usage: 559.0+ KB
     COLUMN_NAME DATA_TYPE  CHARACTER_MAXIMUM_LENGTH NUMERIC_PRECISION  \
0     CustomerID     nchar                         5              None   
1    CompanyName     nchar                        40              None   
2    ContactName     nchar                        30              None   
3   Co

2021-09-29 01:50:28,150 : [INFO] Extract Completed Successfully - 14 files created in C:\MyDataFiles\Data_JayCoop_202109\DB


     COLUMN_NAME DATA_TYPE  CHARACTER_MAXIMUM_LENGTH  NUMERIC_PRECISION  \
0     SupplierID       int                       NaN               10.0   
1    CompanyName  nvarchar              4.000000e+01                NaN   
2    ContactName  nvarchar              3.000000e+01                NaN   
3   ContactTitle  nvarchar              3.000000e+01                NaN   
4        Address  nvarchar              6.000000e+01                NaN   
5           City  nvarchar              1.500000e+01                NaN   
6         Region  nvarchar              1.500000e+01                NaN   
7     PostalCode  nvarchar              1.000000e+01                NaN   
8        Country  nvarchar              1.500000e+01                NaN   
9          Phone  nvarchar              2.400000e+01                NaN   
10           Fax  nvarchar              2.400000e+01                NaN   
11      HomePage     ntext              1.073742e+09                NaN   

   IS_NULLABLE COLUMN_DE