At this point the individual data cleaning pipelines for the four CSV files have been completed and can be found in `data_cleaning_pipelines.ipynb`.
<br><br>
The `products.csv`, `orders.csv` and `orderlines.csv` CSV files have been transformed and stored as `products_clean.csv`, `orders_clean.csv` and `orderlines_clean.csv`.
<br><br>
Many of the values in the `order_lines.unit_price`, `products.price` and `products.promo_price` values are corrupted and the correct values can only be determined by comparing the values across the tables.
`orders.total_paid` appears to be uncorrupted.
<br><br>
Here we will use test driven developement to create a pipeline to clean the values and then add the the pipeline to `data_cleaning_pipelines.ipynb`.

## Import data

In [3]:
import data_utils

orderlines_clean = data_utils.clean_orderlines()
orders_clean = data_utils.clean_orders()
brands_clean = data_utils.clean_brands()
products_clean = data_utils.clean_products()

0 missing values were removed from orderlines.
This represents 0.00% of the data.


5 missing values were removed from orders.
This represents 0.0022% of the data.


0 missing values were removed from brands.
This represents 0.00% of the data.


8792 missing values were removed from products
This represents 45.49% of the data.




## Merge data

In [2]:
orderlines_clean.head()

Unnamed: 0,id,order_id,product_quantity,sku,unit_price,date,short
0,1119109,299539,1,OTT0133,18.99,2017-01-01 00:07:19,OTT
1,1119110,299540,1,LGE0043,399.0,2017-01-01 00:19:45,LGE
2,1119111,299541,1,PAR0071,474.05,2017-01-01 00:20:57,PAR
3,1119112,299542,1,WDT0315,68.39,2017-01-01 00:51:40,WDT
4,1119113,299543,1,JBL0104,23.74,2017-01-01 01:06:38,JBL


In [3]:
col_order = [
    'order_id',
    'orderline_id',
    'date',
    'name',
    'desc',
    'brand',
    'sku',
    'category',
    'total_paid',
    'product_quantity',
    'regular_price',
    'promo_price',
    'sale_price'
]

def reorder_columns(df, col_list):
    return df[col_order]

def start_pipeline(df):
    '''Make a copy of the pipeline to prevent corrupting the original data'''
    return df.copy()
    
def drop_deprecated_columns(df, col_list):
    return (df
            .drop(col_list, axis=1)
           )

def rename_columns(df, col_dict):
    return (df
            .rename(columns=col_dict)
           )
    
def assign_product_categories(df):
    apple_regexp_dict = {
        'iPod': '^.{0,7}apple ipod',
        'iPhone':  'apple iphone',
        'iPad':  'apple ipad',
        'Mac':  'apple macbook|apple iMac|apple Mac mini|desktop computer',
    }
    
    other_regexp_dict = {        
        'Smartwatch':'withings|watch|fitbit|apple watch|smartwatch|smart watch',
        'Accessories': 'kit|strap|armband|belt|bracelet|stylus|pen|Bamboo Wacom Intuos|pencil|pen|rubber pointers|screwdriver|case|funda|housing|casing|folder|bag|backpack|cable|connector|Lightning to USB|Wall socket|power strip|adapter|battery|headset|headphones|mouse|trackpad|stand|support|protect|cover|sleeve|Screensaver|shellhub|dock|microphone|keyboard|keypad',
        'Hardware': 'Philips Hue|temperature sensor|display|monitor|camera|charger|speaker|router|repeater|Synology|nas|server|Parrot FPV Glasses|Command Pack 2 Skycontroller|Apple TV',
        'Software':  'adobe|Office 365|Office Home and Student|software|parallels',
        'Memory': 'hard disk|hard drive|flash drive|USB 2.0 key|USB 2.0 pen|SSD|pendrive|raid|SDHC|sata|memory card|Portable Hard Thunderbolt',
        'Repairs & warranties': 'repair|parts and labor|warranty|applecare|license|protection|installation',
    }
    
    df = df.assign(category = 'unknown')
    
    # Find main apple items
    for label, val in apple_regexp_dict.items(): 
        regexp = re.compile(val, flags=re.IGNORECASE)
        df = (
            df
            .assign(
                category = lambda x: np.where(
                    ((x['desc'].str.contains(regexp, regex=True))|(x['name'].str.contains(regexp, regex=True))) &
                    (x['category'] == 'unknown') & (x['brand'] == 'Apple'), 
                    label, x['category'])
            )
        )
    
    # Find other items
    for label, val in other_regexp_dict.items(): 
        regexp = re.compile(val, flags=re.IGNORECASE)
        df = (
            df
            .assign(
                category = lambda x: np.where(
                    ((x['desc'].str.contains(regexp, regex=True))|(x['name'].str.contains(regexp, regex=True))) &
                    (x['category'] == 'unknown'), label, x['category'])
            )
        )
    
    return df

def merge_dataframes(df, merge_df, col):
    return df.merge(merge_df, on=col)

def drop_uncompleted_orders(df):
    return df[df.state=='Completed']

completed_sales = (orders_clean
                   .pipe(start_pipeline)
                   .pipe(drop_uncompleted_orders)
                   .pipe(merge_dataframes, orderlines_clean, 'order_id')
                   .pipe(merge_dataframes, products_clean, 'sku')
                   .pipe(merge_dataframes, brands, 'short')
                   .pipe(rename_columns, col_dict={'long': 'brand', 'unit_price': 'sale_price', 'price': 'regular_price', 'id': 'orderline_id'})
                   .pipe(drop_deprecated_columns, col_list=['short', 'created_date', 'state'])
                   .pipe(assign_product_categories)
                   .pipe(reorder_columns, col_order)
             )

completed_sales.head()

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,promo_price,sale_price
0,241423,1398738,2017-11-06 12:47:20,LaCie Porsche Design Desktop Drive 4TB USB 3.0...,External Hard Drive 4TB 35-inch USB 3.0 for Ma...,LaCie,LAC0212,Memory,136.15,1,139.99,1.149.948,129.16
1,242832,1529178,2017-12-31 17:26:40,Parrot 550mAh battery for MiniDrones,550mAh rechargeable battery for Parrot minidrones,Parrot,PAR0074,Accessories,15.76,1,17.99,109.904,10.77
2,243330,1181923,2017-02-15 17:07:44,Mac OWC Memory 8GB 1066MHZ DDR3 SO-DIMM,8GB RAM Mac mini iMac MacBook and MacBook Pro ...,OWC,OWC0074,unknown,84.98,1,99.99,999.896,77.99
3,245275,1276706,2017-06-28 11:12:30,Tado Smart Climate Control Intelligent AC,intelligent control air conditioning works wit...,Tado,TAD0007,Accessories,149.0,1,179.0,1.489.994,149.0
4,245595,1154394,2017-01-21 12:49:00,"Macally External Hard Drive 1TB 35 ""USB 3.0 SA...",Aluminum External Hard Drive 1TB capacity form...,Pack,PAC1561,Memory,112.97,2,103.95,59.584,52.99


## Explore corrupted values
<div class="alert alert-block alert-info">
    Count the number of decimal places in each price value to identify corrupted values.
</div>

In [68]:
prices = completed_sales[['orderline_id', 'total_paid', 'regular_price', 'promo_price', 'sale_price']].copy()

prices['regular_price_decimal_count'] = prices['regular_price'].str.count(r'\.')
prices['promo_price_decimal_count'] = prices['promo_price'].str.count(r'\.')
prices['sale_price_decimal_count'] = prices['sale_price'].str.count(r'\.')

prices

Unnamed: 0,orderline_id,total_paid,regular_price,promo_price,sale_price,regular_price_decimal_count,promo_price_decimal_count,sale_price_decimal_count
0,1398738,136.15,139.99,1.149.948,129.16,1,2,1
1,1529178,15.76,17.99,109.904,10.77,1,1,1
2,1181923,84.98,99.99,999.896,77.99,1,1,1
3,1276706,149.00,179,1.489.994,149.0,0,2,1
4,1154394,112.97,103.95,59.584,52.99,1,1,1
...,...,...,...,...,...,...,...,...
61667,1649446,18.98,35,13.99,13.99,0,1,1
61668,1649512,24.97,25,99.898,9.99,0,1,1
61669,1649522,24.97,25,99.898,9.99,0,1,1
61670,1649565,34.96,25,99.898,9.99,0,1,1


In [69]:
prices[['regular_price_decimal_count', 'promo_price_decimal_count', 'sale_price_decimal_count']].apply(pd.Series.value_counts)

Unnamed: 0,regular_price_decimal_count,promo_price_decimal_count,sale_price_decimal_count
0,18611,877,
1,42264,40066,61672.0
2,797,20729,


<div class="alert alert-block alert-info">
    sale_price only ever has one decimal so it could be uncorrupted. Let's check the number of values after the decimal to be sure.
</div>

### Check sale_price

In [70]:
temp = prices.copy()
temp[temp.sale_price_decimal_count!=1]

Unnamed: 0,orderline_id,total_paid,regular_price,promo_price,sale_price,regular_price_decimal_count,promo_price_decimal_count,sale_price_decimal_count


In [71]:
temp = (
    temp.assign(
        sale_price_split = lambda x: x['sale_price'].str.split(r'\.'),
        decimal_places = lambda x: len(x['sale_price_split'][1]))
)
temp.head(5)

Unnamed: 0,orderline_id,total_paid,regular_price,promo_price,sale_price,regular_price_decimal_count,promo_price_decimal_count,sale_price_decimal_count,sale_price_split,decimal_places
0,1398738,136.15,139.99,1.149.948,129.16,1,2,1,"[129, 16]",2
1,1529178,15.76,17.99,109.904,10.77,1,1,1,"[10, 77]",2
2,1181923,84.98,99.99,999.896,77.99,1,1,1,"[77, 99]",2
3,1276706,149.0,179.0,1.489.994,149.0,0,2,1,"[149, 0]",2
4,1154394,112.97,103.95,59.584,52.99,1,1,1,"[52, 99]",2


In [72]:
temp[temp.decimal_places != 2]

Unnamed: 0,orderline_id,total_paid,regular_price,promo_price,sale_price,regular_price_decimal_count,promo_price_decimal_count,sale_price_decimal_count,sale_price_split,decimal_places


<div class="alert alert-block alert-success">
        It appears that the <b>sale_price</b> values are not corrupted. All values have one decimal point and two values after the decimal point
</div>

In [94]:
# orderline_id	total_paid	regular_price	promo_price	sale_price	regular_price_decimal_count	promo_price_decimal_count	sale_price_decimal_count

test_data_ids = prices[(prices.regular_price_decimal_count == 1) & (prices.promo_price_decimal_count == 1) & (prices.sale_price_decimal_count == 1)].orderline_id
test_data = completed_sales[completed_sales.orderline_id.isin(test_data_ids)].copy()

test_data.regular_price = test_data.regular_price.astype(float)
test_data.promo_price = test_data.promo_price.astype(float)
test_data.sale_price = test_data.sale_price.astype(float)

test_data.head(5)

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,promo_price,sale_price
1,242832,1529178,2017-12-31 17:26:40,Parrot 550mAh battery for MiniDrones,550mAh rechargeable battery for Parrot minidrones,Parrot,PAR0074,Accessories,15.76,1,17.99,109.904,10.77
2,243330,1181923,2017-02-15 17:07:44,Mac OWC Memory 8GB 1066MHZ DDR3 SO-DIMM,8GB RAM Mac mini iMac MacBook and MacBook Pro ...,OWC,OWC0074,unknown,84.98,1,99.99,999.896,77.99
4,245595,1154394,2017-01-21 12:49:00,"Macally External Hard Drive 1TB 35 ""USB 3.0 SA...",Aluminum External Hard Drive 1TB capacity form...,Pack,PAC1561,Memory,112.97,2,103.95,59.584,52.99
5,246018,1179702,2017-02-13 22:34:47,"iFixit battery Macbook Pro 13 ""OEM (Mid 2009 /...",OEM battery for MacBook Pro 13,iFixit,IFX0055,Accessories,211.95,1,99.95,999.896,93.99
6,246018,1179711,2017-02-13 22:39:20,"Tucano Nido Hard-Shell Case MacBook Pro 13 ""(L...",Protective cover with slip rubber feet 13 inch...,Tucano,TUC0308,Accessories,211.95,1,29.9,249.901,24.99


In [99]:
test_data.shape

(31291, 13)

### Check regular_price > promo_price

In [118]:
def test_col_vals_are_greater_than_other(df, greater_col, lesser_col):
    num_incorrect_vals = df[df[greater_col] < test_data[lesser_col]].shape[0]
    if num_incorrect_vals == 0:
        print(f"All of the {greater_col} values are greater than the corresponding {lesser_col} values.\n")
        return pd.DataFrame()
    else:
        print(f"There are corrupted values in {greater_col} which are less than their corresponding {lesser_col} values.")
        print(f"This respresents {num_incorrect_vals/df.shape[0]*100}% of the data.\n")
        corrupted_price_orderline_ids = df[df[greater_col] < test_data[lesser_col]].orderline_id
        return corrupted_price_orderline_ids

# test_data[test_data.regular_price < test_data.promo_price].shape[0]

def test_regular_greater_than_promo(df):
    incorrect_val_ids = test_col_vals_are_greater_than_other(df, 'regular_price', 'promo_price')
    return incorrect_val_ids
    
def test_regular_greater_than_sale(df):
    incorrect_val_ids = test_col_vals_are_greater_than_other(df, 'regular_price', 'sale_price')
    return incorrect_val_ids

def test_promo_greater_than_sale(df):
    incorrect_val_ids = test_col_vals_are_greater_than_other(df, 'promo_price', 'sale_price')
    return incorrect_val_ids

regular_less_than_promo_orderline_ids = test_regular_greater_than_promo(test_data)
regular_less_than_sale_orderline_ids = test_regular_greater_than_sale(test_data)
promo_less_than_sale_orderline_ids = test_promo_greater_than_sale(test_data)

There are corrupted values in regular_price which are less than their corresponding promo_price values.
This respresents 80.00063916142022% of the data.

There are corrupted values in regular_price which are less than their corresponding sale_price values.
This respresents 3.30126873541913% of the data.

There are corrupted values in promo_price which are less than their corresponding sale_price values.
This respresents 6.826243967914097% of the data.



In [113]:
test_data[test_data['regular_price'] < test_data['sale_price']].head(5)

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,promo_price,sale_price
82,267375,1140087,2017-01-10 22:48:08,"Pure Nude Ultraslim 03 ""7/8 Transparent iPhone...",transparent and flexible cover with 03mm thick...,Puro,PUR0150,Accessories,17.98,1,12.95,129.906,12.99
110,281222,1232865,2017-04-21 19:19:07,iHealth box 50 Reagent Strips glucímetros,Blood glucose test strips in the iHealth glucí...,iHealth,IHE0026,unknown,24.98,1,19.95,199.904,19.99
129,286842,1122531,2017-01-02 22:39:55,Philips Hue Go Portable Light White,Portable light with natural dynamic effects an...,Philips,PHI0056,Hardware,79.99,1,79.95,789.888,79.99
204,299404,1120191,2017-01-01 22:55:53,Mac memory Kingston 4GB SO-DIMM DDR3åÊ1333MhzåÊ,4GB RAM iMac (2011) Mac mini and MacBook Pro (...,Kingston,KIN0156,unknown,415.11,1,35.53,429.913,37.99
272,299829,1119702,2017-01-01 17:50:48,LaCie Porsche Design Desktop Drive 8TB USB 3.0...,External Hard Drive 8TB 35-inch USB 3.0 for Ma...,LaCie,LAC0171,Memory,282.48,1,249.99,205.994,275.49


<div class="alert alert-block alert-info">
    These values could be correct but have been saved as ints instead of floats. <br>
    Let's change them to floats and check if they are alway >= sale_price.
</div>

In [66]:
temp[temp.regular_price_decimal_count==0].regular_price = temp[temp.regular_price_decimal_count==0].regular_price.astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp[temp.regular_price_decimal_count==0].regular_price = temp[temp.regular_price_decimal_count==0].regular_price.astype(float)


In [67]:
type(temp[temp.regular_price_decimal_count==0].regular_price.iloc[1])

str

In [59]:
temp[temp.regular_price_decimal_count==2].head(5)

Unnamed: 0,total_paid,regular_price,promo_price,sale_price,regular_price_decimal_count,promo_price_decimal_count,sale_price_decimal_count
21,505.76,1.990.002,194,152.95,2,0,1
75,127.02,1.728.001,17.28,14.53,2,1,1
101,795.85,2.499.013,249.901,17.56,2,1,1
281,457.89,5.608.689,4.849.898,457.89,2,2,1
295,33.73,2.499.013,249.901,23.74,2,1,1


# Data integrity tests
## Test promo_price
The promo_price must be greater or equal to the sale_price

## Test regular_price
The regular_price must be greater or equal to the promo_price

## Test total_paid 
The total_paid value per order must be equal to the sum of sale_price*product_quantity of each orderline with the order

In [4]:
completed_sales['regular_price'].str.count(r'\.')

0        1
1        1
2        1
3        0
4        1
        ..
61667    0
61668    0
61669    0
61670    0
61671    0
Name: regular_price, Length: 61672, dtype: int64

In [None]:
def split_str_on_dots_and_append_decimal(df, col):
    '''Remove the decimal points from the strings and append .00'''
    return df.assign(promo_price=df[col]
                     .str.split('.')
                     .str.join('')
                     .apply(lambda x: x+'.00')
                    )

def _insert_decimal_at_string_position(s, pos):
    '''Insert a decimal point at a given position in a string'''
    s = s.split('.')
    s = s[0] + s[1]
    s = s[:pos]+'.'+s[pos:]
    return s 
    
def _insert_decimal_in_regular_price(row):
    '''
    Keep moving the decimal point towards the end of the regular_price 
    string until the sale_price is lower or equal to the price.
    Then transform the regular_price string to a float and round it to two decimal places.
    '''
    decimal_position = 1
    row.regular_price = _insert_decimal_at_string_position(row.regular_price, decimal_position)
    
    while float(row.regular_price) < row.sale_price:
        if round(float(row.regular_price), 0) == round(row.sale_price, 0):
            row.sale_price = round(float(row.price), 2)
            return round(float(row.regular_price), 2)
        else:
            row.regular_price = _insert_decimal_at_string_position(row.regular_price, decimal_position)
            decimal_position += 1

    return round(float(row.price), 2)

def transform_regular_price_to_float(df):
    df.regular_price = [_insert_decimal_in_regular_price(row) for index, row in df.iterrows()]
    return df