# ETL phase 2, Data Cleaning - Missing Data

In [1]:
#!pip install pandas
#!pip install os
#!pip install seaborn
import pandas as pd
import numpy as np
import os
import glob
import seaborn as sns
import matplotlib.pyplot as plt

In [16]:
#set path
my_path = r"C:\MyDataFiles\Data_CCBIS_202107"
my_path_cleaned = my_path + "\cleaned"
if not os.path.exists(my_path_cleaned):
    os.makedirs(my_path_cleaned)
    print('Directory created: ' + my_path_cleaned)
os.chdir(my_path)

print('Files in ' + my_path)
for file in glob.glob("*.csv"):
    print(file)



Directory created: C:\MyDataFiles\Data_CCBIS_202107\cleaned
Files in C:\MyDataFiles\Data_CCBIS_202107
CDR.csv
DimAgent.csv
DimCustomer.csv
DimGeography.csv
DimHandleType.csv
DimProduct.csv
DimProductGroup.csv
DimServiceType.csv
DimSeverifyType.csv


## --Cleaning Duplicate Data

In [43]:
# Cleaning Duplicate Data
os.chdir(my_path)
print("==== Cleaning Duplicate Data ====")
for file in glob.glob("*.csv"):

    # get file
    df = pd.read_csv(file)
    print('\nFrom: ' + file + str(df.shape))

    # drop duplicate
    df.drop_duplicates(keep="first", inplace=True)

    # set new file path
    fileName = file
    newFileName = str(fileName)[:-4] + "_clean.csv"
    os.chdir(my_path_cleaned)
    file = newFileName

    # write to the csf if need
    df.to_csv(file, index=False) 
    print('To:   ' + file + str(df.shape))

    os.chdir(my_path)

==== Cleaning Duplicate Data ====

From: CDR.csv(16001, 18)
To:   CDR_clean.csv(16000, 18)

From: DimAgent.csv(102, 5)
To:   DimAgent_clean.csv(102, 5)

From: DimCustomer.csv(18484, 29)
To:   DimCustomer_clean.csv(18484, 29)

From: DimGeography.csv(655, 10)
To:   DimGeography_clean.csv(655, 10)

From: DimHandleType.csv(3, 2)
To:   DimHandleType_clean.csv(3, 2)

From: DimProduct.csv(25, 3)
To:   DimProduct_clean.csv(25, 3)

From: DimProductGroup.csv(5, 2)
To:   DimProductGroup_clean.csv(5, 2)

From: DimServiceType.csv(16, 2)
To:   DimServiceType_clean.csv(16, 2)

From: DimSeverifyType.csv(3, 2)
To:   DimSeverifyType_clean.csv(3, 2)


## -- Cleaning Missing Data

In [18]:
# Cleaning Missing Date

os.chdir(my_path)
print("==== Cleaning Missing Data ====")
colours = ['#000099', '#ffff00'] # specify the colours - yellow is missing. blue is not missing.

# Set loop to clean all csv data files
for file in glob.glob("*.csv"):

    # read the data
    df = pd.read_csv(file)
    cols = df.columns
    print('\nFrom: ' + file + str(df.shape))

    # set numeric columns
    df_numeric = df.select_dtypes(include=[np.number])
    numeric_cols = df_numeric.columns.values
   
    # set non numeric columns
    df_non_numeric = df.select_dtypes(exclude=[np.number])
    non_numeric_cols = df_non_numeric.columns.values
     
    for col in df.columns:
        missing = df[col].isnull()
        num_missing = np.sum(missing)
        pct_missing = np.mean(missing)
       
        
        if num_missing > 0: 

            # Print Missing Data Percentage List - % of missing.
            print('--' + file + ' {} - {}%'.format(col, round(pct_missing*100)) + ', ' + str(num_missing))
            df['{}_ismissing'.format(col)] = missing

            # When numeric, fill with midian value 
            if col in numeric_cols:
                med = df[col].median()
                df[col] = df[col].fillna(med)
                print('  filled with ' + str(med))
            # When not numeric, fill with most frequent value     
            else:
                top = df[col].describe()['top'] # impute with the most frequent value.
                df[col] = df[col].fillna(top)
                print('  filled with "' + top + '"')


    # set new file path
    fileName = file
    newFileName = str(fileName)[:-4] + "_clean.csv"
    os.chdir(my_path_cleaned)
    file = newFileName

    # write to the csf if need
    df.to_csv(file, index=False) 
    print('To:   ' + file + str(df.shape))

    os.chdir(my_path)


            #print('created missing indicator for: {}'.format(col))
            



    
    
    #sns.heatmap(df[cols].isnull(), cmap=sns.color_palette(colours))

    


    # then based on the indicator, plot the histogram of missing values
    #ismissing_cols = [col for col in df.columns if 'missing' in col]
    #print(ismissing_cols)
    #df['pct_missing'] = df[ismissing_cols].sum(axis=1)
    #print(df['pct_missing'])

    #df['pct_missing'].value_counts().reset_index().sort_values(by='index').plot.bar(x='index', y='pct_missing')


==== Cleaning Missing Data ====

From: CDR.csv(16001, 18)
--CDR.csv NPS - 0%, 8
  filled with 6.0
To:   CDR_clean.csv(16001, 19)

From: DimAgent.csv(102, 5)
To:   DimAgent_clean.csv(102, 5)

From: DimCustomer.csv(18484, 29)
--DimCustomer.csv Title - 99%, 18383
  filled with "Mr."
--DimCustomer.csv MiddleName - 42%, 7830
  filled with "A"
--DimCustomer.csv LastName - 0%, 1
  filled with "Diaz"
--DimCustomer.csv Suffix - 100%, 18481
  filled with "Jr."
--DimCustomer.csv AddressLine2 - 98%, 18172
  filled with "Verkaufsabteilung"
To:   DimCustomer_clean.csv(18484, 34)

From: DimGeography.csv(655, 10)
To:   DimGeography_clean.csv(655, 10)

From: DimHandleType.csv(3, 2)
To:   DimHandleType_clean.csv(3, 2)

From: DimProduct.csv(25, 3)
To:   DimProduct_clean.csv(25, 3)

From: DimProductGroup.csv(5, 2)
To:   DimProductGroup_clean.csv(5, 2)

From: DimServiceType.csv(16, 2)
To:   DimServiceType_clean.csv(16, 2)

From: DimSeverifyType.csv(3, 2)
To:   DimSeverifyType_clean.csv(3, 2)
