# ETL phase 3: Data Cleansing
<img src="data-prep-kdd-process-crisp-dm.jpg" width="1000" height="600">
<h2><center>Data preparation in both the KDD Process (left) and the CRISP-DM model (right).</center></h2>
<img src="kdd.gif" width="800" height="500">
<h2><center>KDD - Knowledge Discovery in Databases</center></h2>
<img src="CRISP-DM_Process_Diagram.png" width="500" height="300">
<h2><center>CRISP - Cross-industry standard process for data mining</center></h2>

Data cleansing consists of following 3 processes usally<br> 
* Missing Values
* Outlier Values
* Duplidated values

## 1. import necessary modules（Optional）

In [8]:
#==============================================
#=== Mothod 1: Install and import necessary modules
#==============================================
from tool_import_modules import *
modules = ['os', 'pandas', 'pyodbc', 'numpy', 'glob', 'seaborn', 'matplotlib', 'logging', 'time', 'xlwt', 'xlrd', 'openpyxl']
for module in modules:
    import_neccessary_modules(module)

In [9]:
#==============================================
#=== Mothod 2: Import modules directly
#==============================================
import os
import pandas
import pyodbc
import numpy as np
import glob
import seaborn as sns
import matplotlib.pyplot as plt
import logging
import time
import xlwt
import xlrd
import openpyxl as xl

## 2. Set path, config, and connection（Optional）

In [11]:
# Set path
my_path = r"C:\MyDataFiles\Data_JayCoop_202109"
my_path_DB = my_path + "\DB"
my_path_DW = my_path + "\DW"
my_path_cleaned = my_path + "\cleaned"
directors =  [my_path_DB, my_path_DW, my_path_cleaned]
# Set file names
log_fileName = time.strftime("%Y%m%d") + '_DB.log'
audit_fileName = time.strftime("%Y%m%d") + '_DB_audit.xls'
audit_fullPath = os.path.join(my_path, audit_fileName)
# Set log file
os.chdir(my_path)
LOG = logging.getLogger(log_fileName)
LOG.setLevel(logging.DEBUG)
# Create file handler which logs even debug messages
fh = logging.FileHandler(log_fileName, 'w') # 'w'-overwrite; 'a'-append
fh.setLevel(logging.INFO)
# Create console handler with a higher log level
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
# Create formatter and add it to the handlers
formatter = logging.Formatter('%(asctime)s : [%(levelname)s] %(message)s')
fh.setFormatter(formatter)
ch.setFormatter(formatter)
# Add the handlers to the logger
LOG.addHandler(fh)
LOG.addHandler(ch)
# Check path
if not os.path.exists(my_path):
    os.makedirs(my_path)
    LOG.info("Directory created: " + my_path)
# Check directors
for director in directors:
    if not os.path.exists(director):
        os.makedirs(director)
        LOG.debug('\nDirectory created: ' + director)
# Check auditExcel
if not os.path.isfile(audit_fullPath):
    auditExcel = xlwt.Workbook()
    sheet1 = auditExcel.add_sheet('Files')
    adtExcSh1Row = 0
    sheet1.write(0, 0, 'File')
    sheet1.write(0, 1, 'CreatedTime')
    sheet1.write(0, 2, 'Path')
    sheet2 = auditExcel.add_sheet('Cleansing')
    adtExcSh2Row = 0
    sheet2.write(0, 0, 'Database')
    sheet2.write(0, 1, 'Table')
    sheet2.write(0, 2, 'Column')
    sheet2.write(0, 3, 'Value')
    sheet2.write(0, 4, 'issue')
else:
    open(audit_fullPath, "a")
    #auditExcel = pandas.ExcelFile(audit_fileName)
    # auditExcel = xl.load_workbook(audit_fullPath)
    #auditExcel = pandas.ExcelFile(audit_fullPath)
    auditExcel = xlrd.open_workbook(audit_fileName)
    sheet2 = auditExcel.sheet_by_name('Cleansing')
    #row_count = sheet2.max_row
    #adtExcSh2Row = sheet2.max_row - 1
    adtExcSh2Row = sheet2.nrows
# Print log header
LOG.info('==== Cleaning Start ====')
# Set up SQL Server connector (DATABASE:'0179Orders_Org')
os.chdir(my_path_DB)
sql_conn = pyodbc.connect('DRIVER={SQL Server}; SERVER=localhost; DATABASE=0179Orders_Org; UID=sa; PWD=SQLServer2019')

2021-09-27 17:40:03,700 : [INFO] ==== Cleaning Start ====
2021-09-27 17:40:03,700 : [INFO] ==== Cleaning Start ====
2021-09-27 17:40:03,700 : [INFO] ==== Cleaning Start ====
2021-09-27 17:40:03,700 : [INFO] ==== Cleaning Start ====
2021-09-27 17:40:03,700 : [INFO] ==== Cleaning Start ====
[2021-09-27 17:40:03,700] [INFO] [20210927_DB.log] - ==== Cleaning Start ====


## 3. Data overview

In [12]:
# Get table name list
os.chdir(my_path_DB)
DB_files=os.listdir()

#LOG.debug(DB_files)
createVar = locals()
#print(createVar)
for i in DB_files:
    if i.endswith("csv"):
        tableName = i.split('.')[0]
        #print(createVar[tableName])
        createVar[tableName] = pandas.read_csv(i)
        #print(createVar[tableName])

  exec(code_obj, self.user_global_ns, self.user_ns)


ParserError: Error tokenizing data. C error: out of memory

In [12]:
# Cleaning
os.chdir(my_path_DB)
#colours = ['#000099', '#ffff00'] # specify the colours - yellow is missing. blue is not missing.
LOG.info("==== Cleaning Start ====")
for file in glob.glob("*.csv"):
    tableName = str(file)[:-4]    
    cleanFile = tableName + "_clean.csv"   

    # Get PK
    pkNameQuery = "SELECT Col.Column_Name as PkName from INFORMATION_SCHEMA.TABLE_CONSTRAINTS Tab, INFORMATION_SCHEMA.CONSTRAINT_COLUMN_USAGE Col WHERE Col.Constraint_Name = Tab.Constraint_Name AND Col.Table_Name = Tab.Table_Name AND Constraint_Type = 'PRIMARY KEY' AND Col.Table_Name = '" + tableName +"'"
    pkList = list(pandas.read_sql(pkNameQuery, sql_conn)["PkName"])

    # Get data
    df = pandas.read_csv(file, index_col = pkList)
    size_org = df.shape[0]
    cols = df.columns
    LOG.info('From: ' + file + ' - size' + str(df.shape))

    # Drop duplicate
    df.drop_duplicates(keep="first", inplace=True)

    # Print duplication info
    size_cleaned = df.shape[0]
    LOG.info('To    : ' + cleanFile + ' - size' + str(df.shape))
    num_duplication = size_org - size_cleaned
    if num_duplication > 0:
        LOG.info('------ [Duplication] ' + str(num_duplication) + ' records dropped from ' + file)
        adtExcSh2Row = adtExcSh2Row + 1
        sheet2.write(adtExcSh2Row, 0, 'Database')
        sheet2.write(adtExcSh2Row, 1, tableName)
        sheet2.write(adtExcSh2Row, 2, '-')
        sheet2.write(adtExcSh2Row, 3, int(num_duplication))
        sheet2.write(adtExcSh2Row, 4, 'Duplication') 
    # Set numeric columns
    df_numeric = df.select_dtypes(include=[np.number])
    numeric_cols = df_numeric.columns.values
   
    # Set non numeric columns
    df_non_numeric = df.select_dtypes(exclude=[np.number])
    non_numeric_cols = df_non_numeric.columns.values

    for col in df.columns:
        # cleaning missing
        missing = df[col].isnull()
        num_missing = np.sum(missing)
        pct_missing = np.mean(missing)
             
        if num_missing > 0: 

            # Print Missing Data Percentage List - % of missing.
            df['{}_ismissing'.format(col)] = missing


            # When numeric, fill with midian value 
            if col in numeric_cols:
                med = df[col].median()
                if col == 'NPS':
                    med = int(med)
                df[col] = df[col].fillna(med)
                adtExcSh2Row = adtExcSh2Row + 1
                sheet2.write(adtExcSh2Row, 0, 'CCBIS')
                sheet2.write(adtExcSh2Row, 1, tableName)
                sheet2.write(adtExcSh2Row, 2, col)
                sheet2.write(adtExcSh2Row, 3, int(num_missing))
                sheet2.write(adtExcSh2Row, 4, 'Missing')                 
                LOG.info('------ [Missing] ' + file + ' - "{}" - {}%'.format(col, round(pct_missing*100)) + ', ' + str(num_missing) + ' records missed - filling with ' + str(med))
            # When not numeric, fill with most frequent value     
            else:
                top = df[col].describe()['top'] # impute with the most frequent value.
                df[col] = df[col].fillna(top)
                adtExcSh2Row = adtExcSh2Row + 1
                sheet2.write(adtExcSh2Row, 0, 'CCBIS')
                sheet2.write(adtExcSh2Row, 1, tableName)
                sheet2.write(adtExcSh2Row, 2, col)
                sheet2.write(adtExcSh2Row, 3, int(num_missing))
                sheet2.write(adtExcSh2Row, 4, 'Missing')   
                LOG.info('------ [Missing] ' + file + ' - "{}" - {}%'.format(col, round(pct_missing*100)) + ', ' + str(num_missing) + ' records missed - filling with "' + top + '"')

        # cleaning outliner
        #df.boxplot(column=col)

    # write to the new csf in 'cleaned' director
    try:
        df.to_csv(my_path_cleaned + "/" + cleanFile)
        adtExcSh1Row = adtExcSh1Row + 1
        sheet1.write(adtExcSh1Row, 0, str(cleanFile))
        sheet1.write(adtExcSh1Row, 1, time.asctime())
        sheet1.write(adtExcSh1Row, 2, my_path_cleaned)
    except:
        tb = sys.exc_info()[2]
        LOG.warn('**** File did NOT update successfully. Please try again after make sure file is not opened and have pomission to write. - ' + my_path_cleaned + "/" + cleanFile)
        continue
    
    os.chdir(my_path_DB)

auditExcel.save(audit_fullPath)
LOG.info('Cleaning Completed Successfully - ' + str(len(os.listdir(my_path_cleaned))) + ' files created in ' + my_path_cleaned)    

2021-09-27 17:40:18,289 : [INFO] ==== Cleaning Start ====
2021-09-27 17:40:18,289 : [INFO] ==== Cleaning Start ====
2021-09-27 17:40:18,289 : [INFO] ==== Cleaning Start ====
2021-09-27 17:40:18,289 : [INFO] ==== Cleaning Start ====
2021-09-27 17:40:18,289 : [INFO] ==== Cleaning Start ====
[2021-09-27 17:40:18,289] [INFO] [20210927_DB.log] - ==== Cleaning Start ====
2021-09-27 17:40:18,837 : [INFO] From: Categories.csv - size(1009, 3)
2021-09-27 17:40:18,837 : [INFO] From: Categories.csv - size(1009, 3)
2021-09-27 17:40:18,837 : [INFO] From: Categories.csv - size(1009, 3)
2021-09-27 17:40:18,837 : [INFO] From: Categories.csv - size(1009, 3)
2021-09-27 17:40:18,837 : [INFO] From: Categories.csv - size(1009, 3)
[2021-09-27 17:40:18,837] [INFO] [20210927_DB.log] - From: Categories.csv - size(1009, 3)
2021-09-27 17:40:18,911 : [INFO] To    : Categories_clean.csv - size(906, 3)
2021-09-27 17:40:18,911 : [INFO] To    : Categories_clean.csv - size(906, 3)
2021-09-27 17:40:18,911 : [INFO] To   

AttributeError: 'Sheet' object has no attribute 'write'