At this point the individual data cleaning pipelines for the four CSV files have been completed and can be found in `data_cleaning_pipelines.ipynb`.
<br><br>
The `products.csv`, `orders.csv` and `orderlines.csv` CSV files have been transformed and stored as `products_clean.csv`, `orders_clean.csv` and `orderlines_clean.csv`.
<br><br>
Many of the values in `orderlines.unit_price`, `products.price` and `products.promo_price` are corrupted, and the correct values can only be determined by comparing the values across the tables.
`orders.total_paid` appears to be uncorrupted.
<br><br>
Here we will use test-driven development to create a pipeline to clean the values and then add the pipeline to `data_cleaning_pipelines.ipynb`.

In [1]:
import re
import numpy as np
import pandas as pd
import sys
import os

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

import data_utils

## Import data

In [2]:
# Run each table through its cleaning pipeline in data_utils. The cleaners are
# called in the order orders, orderlines, products, brands.
orders, orderlines, products, brands = (
    run_cleaner(data_path="../../data/")
    for run_cleaner in (
        data_utils.clean_orders,
        data_utils.clean_orderlines,
        data_utils.clean_products,
        data_utils.clean_brands,
    )
)

# Merge the cleaned tables
completed_sales = data_utils.merge_data(orders, orderlines, products, brands)

5 missing values were removed from orders.
This represents 0.0022% of the data.


0 missing values were removed from orderlines.
This represents 0.00% of the data.


8792 missing values were removed from products
This represents 45.49% of the data.


0 missing values were removed from brands.
This represents 0.00% of the data.




## Merge data

In [4]:
# Preview the first rows of the cleaned orders table
orders.head()

Unnamed: 0,order_id,created_date,total_paid,state
0,241319,2017-01-02 13:35:40,44.99,Cancelled
1,241423,2017-11-06 13:10:02,136.15,Completed
2,242832,2017-12-31 17:40:03,15.76,Completed
3,243330,2017-02-16 10:59:38,84.98,Completed
4,243784,2017-11-24 13:35:19,157.86,Cancelled


In [9]:
# Column layout applied to the merged sales frame as the last pipeline step
col_order = [
    # order / order-line identifiers and date
    'order_id', 'orderline_id', 'date',
    # product descriptors
    'name', 'product_description', 'brand', 'sku', 'category',
    # amounts and quantities
    'total_paid', 'product_quantity', 'regular_price', 'sale_price',
]

def reorder_columns(df, col_list):
    '''Return `df` restricted to the columns in `col_list`, in that order.

    Columns of `df` that are not listed are left out of the result; the
    selection is done with plain `df[col_list]` indexing.
    '''
    reordered = df[col_list]
    return reordered

def start_pipeline(df):
    '''Return a deep copy of `df` so later pipeline steps cannot corrupt the original data'''
    working_copy = df.copy(deep=True)
    return working_copy
    
def drop_deprecated_columns(df, col_list):
    '''Return a new frame without the columns named in `col_list`.

    `df` itself is not modified.
    '''
    return df.drop(columns=col_list)

def rename_columns(df, col_dict):
    '''Return a new frame with columns renamed via the `{old: new}` mapping `col_dict`.

    Columns that do not appear as keys in `col_dict` keep their names.
    '''
    renamed = df.rename(col_dict, axis='columns')
    return renamed
    
def _text_matches(df, pattern):
    '''Flag rows whose `product_description` or `name` contains `pattern`.

    `pattern` is a regular-expression string, searched case-insensitively.
    Missing text counts as "no match" (`na=False`), so both operands are
    plain boolean Series and a match in one column is never masked by a
    NaN in the other.
    '''
    in_description = df['product_description'].str.contains(
        pattern, flags=re.IGNORECASE, regex=True, na=False)
    in_name = df['name'].str.contains(
        pattern, flags=re.IGNORECASE, regex=True, na=False)
    return in_description | in_name


def assign_product_categories(df):
    '''Return a copy of `df` with a `category` column derived from product text.

    Every row starts as 'unknown'. The patterns are then tried in order and
    a row takes the label of the first pattern found in either its
    `product_description` or its `name`; rows that already have a category
    are never relabelled. The Apple patterns are tried first and only apply
    to rows whose `brand` is exactly 'Apple'.

    Parameters
    ----------
    df : DataFrame with `product_description`, `name` and `brand` columns.

    Returns
    -------
    A new DataFrame; the input frame is not modified.
    '''
    apple_regexp_dict = {
        'iPod': '^.{0,7}apple ipod',
        'iPhone':  'apple iphone',
        'iPad':  'apple ipad',
        'Mac':  'apple macbook|apple iMac|apple Mac mini|desktop computer',
    }

    # NOTE: the alternatives below are unanchored substrings (no word
    # boundaries), so short ones such as 'pen' or 'nas' also match inside
    # longer words. Dict order is the matching priority.
    other_regexp_dict = {
        'Smartwatch':'withings|watch|fitbit|apple watch|smartwatch|smart watch',
        'Accessories': 'kit|strap|armband|belt|bracelet|stylus|pen|Bamboo Wacom Intuos|pencil|pen|rubber pointers|screwdriver|case|funda|housing|casing|folder|bag|backpack|cable|connector|Lightning to USB|Wall socket|power strip|adapter|battery|headset|headphones|mouse|trackpad|stand|support|protect|cover|sleeve|Screensaver|shellhub|dock|microphone|keyboard|keypad',
        'Hardware': 'Philips Hue|temperature sensor|display|monitor|camera|charger|speaker|router|repeater|Synology|nas|server|Parrot FPV Glasses|Command Pack 2 Skycontroller|Apple TV',
        'Software':  'adobe|Office 365|Office Home and Student|software|parallels',
        'Memory': 'hard disk|hard drive|flash drive|USB 2.0 key|USB 2.0 pen|SSD|pendrive|raid|SDHC|sata|memory card|Portable Hard Thunderbolt',
        'Repairs & warranties': 'repair|parts and labor|warranty|applecare|license|protection|installation',
    }

    df = df.assign(category='unknown')
    is_apple = df['brand'] == 'Apple'

    # (patterns, extra row filter) pairs in priority order: the main Apple
    # items first (Apple-branded rows only), then everything else.
    passes = [(apple_regexp_dict, is_apple), (other_regexp_dict, None)]
    for regexp_dict, row_filter in passes:
        for label, val in regexp_dict.items():
            # Only rows that are still uncategorised may receive a label
            hit = _text_matches(df, val) & (df['category'] == 'unknown')
            if row_filter is not None:
                hit = hit & row_filter
            df = df.assign(category=np.where(hit, label, df['category']))

    return df

def merge_dataframes(df, merge_df, col):
    '''Inner-join `merge_df` onto `df` using the shared key column(s) `col`.

    Because the join is an inner join, rows without a matching key in the
    other frame are not present in the result.
    '''
    return df.merge(right=merge_df, on=col, how='inner')

def drop_uncompleted_orders(df):
    '''Keep only the rows whose `state` is exactly 'Completed'.'''
    is_completed = df['state'] == 'Completed'
    return df.loc[is_completed]

completed_sales = (
    orders
    # work on a copy so `orders` itself is left untouched
    .pipe(start_pipeline)
    # keep only orders whose state is 'Completed'
    .pipe(drop_uncompleted_orders)
    # join the other tables on their shared key columns
    .pipe(merge_dataframes, orderlines, 'order_id')
    .pipe(merge_dataframes, products, 'sku')
    .pipe(merge_dataframes, brands, 'short')
    # switch to the column names used in `col_order`
    .pipe(
        rename_columns,
        col_dict={
            'long': 'brand',
            'desc': 'product_description',
            'unit_price': 'sale_price',
            'price': 'regular_price',
            'id': 'orderline_id',
        },
    )
    # these columns are not needed once the joins and the filter are done
    .pipe(drop_deprecated_columns, col_list=['short', 'created_date', 'state'])
    # derive `category` from the product text and brand
    .pipe(assign_product_categories)
    # final column layout
    .pipe(reorder_columns, col_order)
)

completed_sales.head()

Unnamed: 0,order_id,orderline_id,date,name,product_description,brand,sku,category,total_paid,product_quantity,regular_price,sale_price
0,241423,1398738,2017-11-06 12:47:20,LaCie Porsche Design Desktop Drive 4TB USB 3.0...,External Hard Drive 4TB 35-inch USB 3.0 for Ma...,LaCie,LAC0212,Memory,136.15,1,139.99,129.16
1,242832,1529178,2017-12-31 17:26:40,Parrot 550mAh battery for MiniDrones,550mAh rechargeable battery for Parrot minidrones,Parrot,PAR0074,Accessories,15.76,1,17.99,10.77
2,243330,1181923,2017-02-15 17:07:44,Mac OWC Memory 8GB 1066MHZ DDR3 SO-DIMM,8GB RAM Mac mini iMac MacBook and MacBook Pro ...,OWC,OWC0074,unknown,84.98,1,99.99,77.99
3,245275,1276706,2017-06-28 11:12:30,Tado Smart Climate Control Intelligent AC,intelligent control air conditioning works wit...,Tado,TAD0007,Accessories,149.0,1,179.0,149.0
4,245595,1154394,2017-01-21 12:49:00,"Macally External Hard Drive 1TB 35 ""USB 3.0 SA...",Aluminum External Hard Drive 1TB capacity form...,Pack,PAC1561,Memory,112.97,2,103.95,52.99
