At this point the individual data cleaning pipelines for the four CSV files have been completed and can be found in `data_cleaning_pipelines.ipynb`.
<br><br>
The `products.csv`, `orders.csv` and `orderlines.csv` CSV files have been transformed and stored as `products_clean.csv`, `orders_clean.csv` and `orderlines_clean.csv`.
<br><br>
Many of the values in the `order_lines.unit_price`, `products.price` and `products.promo_price` values are corrupted and the correct values can only be determined by comparing the values across the tables.
`orders.total_paid` appears to be uncorrupted.
<br><br>
Here we will use test driven developement to create a pipeline to clean the values and then add the the pipeline to `data_cleaning_pipelines.ipynb`.

In [147]:
import re
import numpy as np
import pandas as pd

import data_utils

## Import data

In [148]:
orderlines_clean = data_utils.clean_orderlines()
orders_clean = data_utils.clean_orders()
brands_clean = data_utils.clean_brands()
products_clean = data_utils.clean_products()

0 missing values were removed from orderlines.
This represents 0.00% of the data.


5 missing values were removed from orders.
This represents 0.0022% of the data.


0 missing values were removed from brands.
This represents 0.00% of the data.


8792 missing values were removed from products
This represents 45.49% of the data.




## Merge data

In [3]:
col_order = [
    'order_id',
    'orderline_id',
    'date',
    'name',
    'desc',
    'brand',
    'sku',
    'category',
    'total_paid',
    'product_quantity',
    'regular_price',
    'promo_price',
    'sale_price'
]

def reorder_columns(df, col_list):
    return df[col_list]

def start_pipeline(df):
    '''Make a copy of the pipeline to prevent corrupting the original data'''
    return df.copy()
    
def drop_deprecated_columns(df, col_list):
    return (df
            .drop(col_list, axis=1)
           )

def rename_columns(df, col_dict):
    return (df
            .rename(columns=col_dict)
           )
    
def assign_product_categories(df):
    apple_regexp_dict = {
        'iPod': '^.{0,7}apple ipod',
        'iPhone':  'apple iphone',
        'iPad':  'apple ipad',
        'Mac':  'apple macbook|apple iMac|apple Mac mini|desktop computer',
    }
    
    other_regexp_dict = {        
        'Smartwatch':'withings|watch|fitbit|apple watch|smartwatch|smart watch',
        'Accessories': 'kit|strap|armband|belt|bracelet|stylus|pen|Bamboo Wacom Intuos|pencil|pen|rubber pointers|screwdriver|case|funda|housing|casing|folder|bag|backpack|cable|connector|Lightning to USB|Wall socket|power strip|adapter|battery|headset|headphones|mouse|trackpad|stand|support|protect|cover|sleeve|Screensaver|shellhub|dock|microphone|keyboard|keypad',
        'Hardware': 'Philips Hue|temperature sensor|display|monitor|camera|charger|speaker|router|repeater|Synology|nas|server|Parrot FPV Glasses|Command Pack 2 Skycontroller|Apple TV',
        'Software':  'adobe|Office 365|Office Home and Student|software|parallels',
        'Memory': 'hard disk|hard drive|flash drive|USB 2.0 key|USB 2.0 pen|SSD|pendrive|raid|SDHC|sata|memory card|Portable Hard Thunderbolt',
        'Repairs & warranties': 'repair|parts and labor|warranty|applecare|license|protection|installation',
    }
    
    df = df.assign(category = 'unknown')
    
    # Find main apple items
    for label, val in apple_regexp_dict.items(): 
        regexp = re.compile(val, flags=re.IGNORECASE)
        df = (
            df
            .assign(
                category = lambda x: np.where(
                    ((x['desc'].str.contains(regexp, regex=True))|(x['name'].str.contains(regexp, regex=True))) &
                    (x['category'] == 'unknown') & (x['brand'] == 'Apple'), 
                    label, x['category'])
            )
        )
    
    # Find other items
    for label, val in other_regexp_dict.items(): 
        regexp = re.compile(val, flags=re.IGNORECASE)
        df = (
            df
            .assign(
                category = lambda x: np.where(
                    ((x['desc'].str.contains(regexp, regex=True))|(x['name'].str.contains(regexp, regex=True))) &
                    (x['category'] == 'unknown'), label, x['category'])
            )
        )
    
    return df

def merge_dataframes(df, merge_df, col):
    return df.merge(merge_df, on=col)

def drop_uncompleted_orders(df):
    return df[df.state=='Completed']

completed_sales = (orders_clean
                   .pipe(start_pipeline)
                   .pipe(drop_uncompleted_orders)
                   .pipe(merge_dataframes, orderlines_clean, 'order_id')
                   .pipe(merge_dataframes, products_clean, 'sku')
                   .pipe(merge_dataframes, brands_clean, 'short')
                   .pipe(rename_columns, col_dict={'long': 'brand', 'unit_price': 'sale_price', 'price': 'regular_price', 'id': 'orderline_id'})
                   .pipe(drop_deprecated_columns, col_list=['short', 'created_date', 'state'])
                   .pipe(assign_product_categories)
                   .pipe(reorder_columns, col_order)
             )

completed_sales.head()

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,promo_price,sale_price
0,241423,1398738,2017-11-06 12:47:20,LaCie Porsche Design Desktop Drive 4TB USB 3.0...,External Hard Drive 4TB 35-inch USB 3.0 for Ma...,LaCie,LAC0212,Memory,136.15,1,139.99,1.149.948,129.16
1,242832,1529178,2017-12-31 17:26:40,Parrot 550mAh battery for MiniDrones,550mAh rechargeable battery for Parrot minidrones,Parrot,PAR0074,Accessories,15.76,1,17.99,109.904,10.77
2,243330,1181923,2017-02-15 17:07:44,Mac OWC Memory 8GB 1066MHZ DDR3 SO-DIMM,8GB RAM Mac mini iMac MacBook and MacBook Pro ...,OWC,OWC0074,unknown,84.98,1,99.99,999.896,77.99
3,245275,1276706,2017-06-28 11:12:30,Tado Smart Climate Control Intelligent AC,intelligent control air conditioning works wit...,Tado,TAD0007,Accessories,149.0,1,179.0,1.489.994,149.0
4,245595,1154394,2017-01-21 12:49:00,"Macally External Hard Drive 1TB 35 ""USB 3.0 SA...",Aluminum External Hard Drive 1TB capacity form...,Pack,PAC1561,Memory,112.97,2,103.95,59.584,52.99


## Explore corrupted values
### Count decimal points in price values

In [4]:
prices = completed_sales[['orderline_id', 'total_paid', 'regular_price', 'promo_price', 'sale_price']].copy()

prices['regular_price_decimal_count'] = prices['regular_price'].str.count(r'\.')
prices['promo_price_decimal_count'] = prices['promo_price'].str.count(r'\.')
prices['sale_price_decimal_count'] = prices['sale_price'].str.count(r'\.')

prices

Unnamed: 0,orderline_id,total_paid,regular_price,promo_price,sale_price,regular_price_decimal_count,promo_price_decimal_count,sale_price_decimal_count
0,1398738,136.15,139.99,1.149.948,129.16,1,2,1
1,1529178,15.76,17.99,109.904,10.77,1,1,1
2,1181923,84.98,99.99,999.896,77.99,1,1,1
3,1276706,149.00,179,1.489.994,149.00,0,2,1
4,1154394,112.97,103.95,59.584,52.99,1,1,1
...,...,...,...,...,...,...,...,...
61667,1649446,18.98,35,13.99,13.99,0,1,1
61668,1649512,24.97,25,99.898,9.99,0,1,1
61669,1649522,24.97,25,99.898,9.99,0,1,1
61670,1649565,34.96,25,99.898,9.99,0,1,1


In [69]:
prices[['regular_price_decimal_count', 'promo_price_decimal_count', 'sale_price_decimal_count']].apply(pd.Series.value_counts)

Unnamed: 0,regular_price_decimal_count,promo_price_decimal_count,sale_price_decimal_count
0,18611,877,
1,42264,40066,61672.0
2,797,20729,


<div class="alert alert-block alert-info">
    total_paid is the only price column which does not have corrupted data. 
    <br>
    It may be that we can use this column to determine the correct positions of the decimal points in the other price columns.
</div>

# Data integrity tests
## Create test data

In [139]:
# Extract a subset of the data with only one decimal point so we can transform the str values to floats and check our test logic.
test_data_ids = prices[(prices.regular_price_decimal_count == 1) & (prices.promo_price_decimal_count == 1) & (prices.sale_price_decimal_count == 1)].orderline_id
test_data = completed_sales[completed_sales.orderline_id.isin(test_data_ids)].copy()

# Transform the price values to float.
test_data.regular_price = test_data.regular_price.astype(float)
test_data.promo_price = test_data.promo_price.astype(float)
test_data.sale_price = test_data.sale_price.astype(float)

# This data is corrupted and should fail the tests
failing_test_data = test_data
# This subset of the data has regular_price >= promo_price and promo_price >= sale_price and should therefore (hopefully) be uncorrupted
passing_test_data = test_data[(test_data.regular_price >= test_data.promo_price) & (test_data.promo_price >= test_data.sale_price)]

## Tests
We will utilise tests that return data to help debug the logic for cleaning the price values.

In [140]:
def test_col_vals_are_greater_than_other(df, greater_col, lesser_col):
    num_incorrect_vals = df[df[greater_col] < df[lesser_col]].shape[0]
    if num_incorrect_vals == 0:
        print(f"All of the {greater_col} values are greater than the corresponding {lesser_col} values.\n")
        return pd.DataFrame()
    else:
        print(f"There are corrupted values in {greater_col} which are less than their corresponding {lesser_col} values.")
        print(f"This respresents {num_incorrect_vals/df.shape[0]*100:.2f}% of the data.\n")
        corrupted_price_orderline_ids = df[df[greater_col] < df[lesser_col]].orderline_id
        return corrupted_price_orderline_ids

# test_data[test_data.regular_price < test_data.promo_price].shape[0]

def test_regular_greater_than_promo(df):
    incorrect_val_ids = test_col_vals_are_greater_than_other(df, 'regular_price', 'promo_price')
    return incorrect_val_ids
    
def test_regular_greater_than_sale(df):
    incorrect_val_ids = test_col_vals_are_greater_than_other(df, 'regular_price', 'sale_price')
    return incorrect_val_ids

def test_promo_greater_than_sale(df):
    incorrect_val_ids = test_col_vals_are_greater_than_other(df, 'promo_price', 'sale_price')
    return incorrect_val_ids

def test_order_total_paid_equal_sum_of_orderlines(df):
    # Group by order_id and calculate the sum of sale_price*product_quantity for all orderline_ids
    grouped_orderlines = df.groupby('order_id').apply(
        lambda x: pd.Series({
            'calculated_total': (x['product_quantity'] * x['sale_price']).sum(),
            'total_paid': x['total_paid'].iloc[0]  # Total paid is the same for all rows in the group
        }),
        include_groups=False
    )

    # Compare calculated_total with total_paid
    incorrect_orders = grouped_orderlines[grouped_orderlines['calculated_total'] != grouped_orderlines['total_paid']]
    
    if incorrect_orders.empty:
        print("All orders have total_paid values equal to the sum of product_quantity * sale_price.")
        return pd.DataFrame()
    else:
        print(f"There are {incorrect_orders.shape[0]} orders where total_paid does not match the sum of product_quantity * sale_price.")
        print(f"This respresents {incorrect_orders.shape[0]/df.shape[0]*100:.2f}% of the data.\n")
        return incorrect_orders



In [141]:
regular_less_than_promo_orderline_ids = test_regular_greater_than_promo(failing_test_data)
regular_less_than_sale_orderline_ids = test_regular_greater_than_sale(failing_test_data)
promo_less_than_sale_orderline_ids = test_promo_greater_than_sale(failing_test_data)
incorrect_total_price = test_order_total_paid_equal_sum_of_orderlines(failing_test_data)

There are corrupted values in regular_price which are less than their corresponding promo_price values.
This respresents 80.03% of the data.

There are corrupted values in regular_price which are less than their corresponding sale_price values.
This respresents 3.30% of the data.

There are corrupted values in promo_price which are less than their corresponding sale_price values.
This respresents 6.81% of the data.

There are 21455 orders where total_paid does not match the sum of product_quantity * sale_price.
This respresents 68.59% of the data.



In [142]:
regular_less_than_promo_orderline_ids = test_regular_greater_than_promo(passing_test_data)
regular_less_than_sale_orderline_ids = test_regular_greater_than_sale(passing_test_data)
promo_less_than_sale_orderline_ids = test_promo_greater_than_sale(passing_test_data)
incorrect_total_price = test_order_total_paid_equal_sum_of_orderlines(passing_test_data)

All of the regular_price values are greater than the corresponding promo_price values.

All of the regular_price values are greater than the corresponding sale_price values.

All of the promo_price values are greater than the corresponding sale_price values.

There are 3507 orders where total_paid does not match the sum of product_quantity * sale_price.
This respresents 85.20% of the data.



<div class="alert alert-block alert-danger">
    Even when regular_price >= promo_price >= sale_price the sale price values do not match the total_paid per order.
</div>

# Price cleaning munge
## Total paid
Let's first multiple regular_price, promo_price and sale_price values by the product quantity then group them by order and determine if perhaps the correct value is in the wrong column.

In [144]:
total_paid_test_data = passing_test_data[passing_test_data.order_id.isin(incorrect_total_price.index)].copy()
total_paid_test_data.head()

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,promo_price,sale_price
4,245595,1154394,2017-01-21 12:49:00,"Macally External Hard Drive 1TB 35 ""USB 3.0 SA...",Aluminum External Hard Drive 1TB capacity form...,Pack,PAC1561,Memory,112.97,2,103.95,59.584,52.99
59,261656,1321933,2017-08-21 19:01:56,Support Satechi Slim MacBook or iMac Aluminum ...,Stand with aluminum finish and practical desig...,Satechi,SAT0034,Accessories,44.98,1,49.99,44.99,39.99
68,263738,1319085,2017-08-16 17:01:54,Elago Airpod Charging stand Charging Stand Bla...,Silicone holder for positioning and loading th...,Elago,ELA0027,Accessories,80.96,1,19.95,13.99,13.99
70,264244,1587509,2018-01-29 15:18:31,"Battery iFixit MacBook Pro 13 ""(Mid 2009 to Mi...",Internal Battery for MacBook Pro 13-inch (Mid ...,iFixit,IFX0181,Accessories,141.97,1,89.95,79.99,69.99
81,267194,1505973,2017-12-19 21:19:08,"Battery iFixit MacBook Pro 13 ""(Mid 2009 to Mi...",Internal Battery for MacBook Pro 13-inch (Mid ...,iFixit,IFX0181,Accessories,75.98,1,89.95,79.99,72.99


In [145]:
total_paid_test_data.regular_price = total_paid_test_data.regular_price * total_paid_test_data.product_quantity
total_paid_test_data.promo_price = total_paid_test_data.promo_price * total_paid_test_data.product_quantity
total_paid_test_data.sale_price = total_paid_test_data.sale_price * total_paid_test_data.product_quantity
grouped_data = total_paid_test_data.groupby('order_id')
group_sizes = grouped_data.size()
sorted_groups = group_sizes.sort_values(ascending=False)

sorted_groups

order_id
444722    3
445458    3
318935    3
449404    2
454854    2
         ..
414075    1
414094    1
414157    1
414219    1
527017    1
Length: 3507, dtype: int64

In [135]:
grouped_data.get_group(245595)#[['total_paid', 'product_quantity', 'regular_price', 'promo_price', 'sale_price']]

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,promo_price,sale_price
4,245595,1154394,2017-01-21 12:49:00,"Macally External Hard Drive 1TB 35 ""USB 3.0 SA...",Aluminum External Hard Drive 1TB capacity form...,Pack,PAC1561,Memory,112.97,2,207.9,119.168,105.98


In [138]:
total_paid_test_data[total_paid_test_data.sku=='PAC1561']

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,promo_price,sale_price
4,245595,1154394,2017-01-21 12:49:00,"Macally External Hard Drive 1TB 35 ""USB 3.0 SA...",Aluminum External Hard Drive 1TB capacity form...,Pack,PAC1561,Memory,112.97,2,207.9,119.168,105.98
1572,304087,1133263,2017-01-07 09:52:01,"Macally External Hard Drive 1TB 35 ""USB 3.0 SA...",Aluminum External Hard Drive 1TB capacity form...,Pack,PAC1561,Memory,52.98,1,103.95,59.584,52.99
1695,304508,1134340,2017-01-07 21:53:10,"Macally External Hard Drive 1TB 35 ""USB 3.0 SA...",Aluminum External Hard Drive 1TB capacity form...,Pack,PAC1561,Memory,56.98,1,103.95,59.584,52.99
1761,304710,1134849,2017-01-08 12:07:16,"Macally External Hard Drive 1TB 35 ""USB 3.0 SA...",Aluminum External Hard Drive 1TB capacity form...,Pack,PAC1561,Memory,59.98,1,103.95,59.584,52.99
2457,306758,1139777,2017-01-10 19:50:19,"Macally External Hard Drive 1TB 35 ""USB 3.0 SA...",Aluminum External Hard Drive 1TB capacity form...,Pack,PAC1561,Memory,56.98,1,103.95,59.584,52.99
4678,312748,1154120,2017-01-20 23:44:05,"Macally External Hard Drive 1TB 35 ""USB 3.0 SA...",Aluminum External Hard Drive 1TB capacity form...,Pack,PAC1561,Memory,54.98,1,103.95,59.584,52.99
5413,314606,1158703,2017-01-25 10:10:43,"Macally External Hard Drive 1TB 35 ""USB 3.0 SA...",Aluminum External Hard Drive 1TB capacity form...,Pack,PAC1561,Memory,57.98,1,103.95,59.584,52.99
6595,317709,1166133,2017-01-30 18:52:20,"Macally External Hard Drive 1TB 35 ""USB 3.0 SA...",Aluminum External Hard Drive 1TB capacity form...,Pack,PAC1561,Memory,127.97,1,103.95,59.584,52.99
7600,320593,1172676,2017-02-06 17:43:09,"Macally External Hard Drive 1TB 35 ""USB 3.0 SA...",Aluminum External Hard Drive 1TB capacity form...,Pack,PAC1561,Memory,112.97,1,103.95,59.584,52.99
7934,321652,1175005,2017-02-09 09:13:05,"Macally External Hard Drive 1TB 35 ""USB 3.0 SA...",Aluminum External Hard Drive 1TB capacity form...,Pack,PAC1561,Memory,338.97,1,103.95,59.584,52.99


In [131]:
completed_sales[completed_sales.order_id==302857]

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,promo_price,sale_price
1257,302857,1130622,2017-01-05 00:57:59,"Macally External Hard Drive 1TB 35 ""USB 3.0 SA...",Aluminum External Hard Drive 1TB capacity form...,Pack,PAC1561,Memory,66.98,1,103.95,59.584,62.99


In [129]:
completed_sales[completed_sales.sku=='PAC1561']

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,promo_price,sale_price
4,245595,1154394,2017-01-21 12:49:00,"Macally External Hard Drive 1TB 35 ""USB 3.0 SA...",Aluminum External Hard Drive 1TB capacity form...,Pack,PAC1561,Memory,112.97,2,103.95,59.584,52.99
1077,302106,1127954,2017-01-03 22:50:20,"Macally External Hard Drive 1TB 35 ""USB 3.0 SA...",Aluminum External Hard Drive 1TB capacity form...,Pack,PAC1561,Memory,62.99,1,103.95,59.584,62.99
1257,302857,1130622,2017-01-05 00:57:59,"Macally External Hard Drive 1TB 35 ""USB 3.0 SA...",Aluminum External Hard Drive 1TB capacity form...,Pack,PAC1561,Memory,66.98,1,103.95,59.584,62.99
1572,304087,1133263,2017-01-07 09:52:01,"Macally External Hard Drive 1TB 35 ""USB 3.0 SA...",Aluminum External Hard Drive 1TB capacity form...,Pack,PAC1561,Memory,52.98,1,103.95,59.584,52.99
1695,304508,1134340,2017-01-07 21:53:10,"Macally External Hard Drive 1TB 35 ""USB 3.0 SA...",Aluminum External Hard Drive 1TB capacity form...,Pack,PAC1561,Memory,56.98,1,103.95,59.584,52.99
1761,304710,1134849,2017-01-08 12:07:16,"Macally External Hard Drive 1TB 35 ""USB 3.0 SA...",Aluminum External Hard Drive 1TB capacity form...,Pack,PAC1561,Memory,59.98,1,103.95,59.584,52.99
2457,306758,1139777,2017-01-10 19:50:19,"Macally External Hard Drive 1TB 35 ""USB 3.0 SA...",Aluminum External Hard Drive 1TB capacity form...,Pack,PAC1561,Memory,56.98,1,103.95,59.584,52.99
4678,312748,1154120,2017-01-20 23:44:05,"Macally External Hard Drive 1TB 35 ""USB 3.0 SA...",Aluminum External Hard Drive 1TB capacity form...,Pack,PAC1561,Memory,54.98,1,103.95,59.584,52.99
5413,314606,1158703,2017-01-25 10:10:43,"Macally External Hard Drive 1TB 35 ""USB 3.0 SA...",Aluminum External Hard Drive 1TB capacity form...,Pack,PAC1561,Memory,57.98,1,103.95,59.584,52.99
6595,317709,1166133,2017-01-30 18:52:20,"Macally External Hard Drive 1TB 35 ""USB 3.0 SA...",Aluminum External Hard Drive 1TB capacity form...,Pack,PAC1561,Memory,127.97,1,103.95,59.584,52.99


In [108]:
grouped_data.get_group(318935)[['regular_price', 'promo_price', 'sale_price']].sum(axis=0)

regular_price    159.970
promo_price      122.972
sale_price       116.970
dtype: float64

In [121]:
import itertools

def check_price_combinations(df):
    total_paid = df['total_paid'].iloc[0]  # total_paid is the same for all rows in the group

    # Create a list of lists with prices and quantities
    price_quantity_pairs = [
        [(row['regular_price'], row['product_quantity']), 
         (row['promo_price'], row['product_quantity']), 
         (row['sale_price'], row['product_quantity'])] 
        for idx, row in df.iterrows()
    ]
    
    # Check combinations with product_quantity = 1
    for combination in itertools.product(*price_quantity_pairs):
        total = sum(price * qty for price, qty in combination)
        if total_paid - 2 <= total <= total_paid + 2:
            print(f"Match found: {combination, total}")
        else:
            print(combination, total)
    print('\n')
    
    # Check combinations with product_quantity varying (1 or 2)
    # This creates combinations where product_quantity can be 1 or 2 for each row
    for qty_combination in itertools.product([1, 2], repeat=len(df)):
        # Apply the varying quantities to each row's price choices
        varying_price_quantity_pairs = [
            [(row.regular_price, qty_combination[i]), 
             (row.promo_price, qty_combination[i]), 
             (row.sale_price, qty_combination[i])]
            for i, row in enumerate(df.itertuples())
        ]
        
        # Now test each price combination with the varying quantities
        for combination in itertools.product(*varying_price_quantity_pairs):
            total = sum(price * qty for price, qty in combination)
            if total_paid - 2 <= total <= total_paid + 2:
                print(f"Match found: {combination, total}")
            else:
                print(combination, total)

# Example usage
df = grouped_data.get_group(318935)[['total_paid', 'product_quantity', 'regular_price', 'promo_price', 'sale_price']]
check_price_combinations(df)

((79.99, 1.0), (49.99, 1.0), (29.99, 1.0)) 159.97
((79.99, 1.0), (49.99, 1.0), (16.992, 1.0)) 146.97199999999998
((79.99, 1.0), (49.99, 1.0), (16.99, 1.0)) 146.97
((79.99, 1.0), (35.99, 1.0), (29.99, 1.0)) 145.97
((79.99, 1.0), (35.99, 1.0), (16.992, 1.0)) 132.97199999999998
((79.99, 1.0), (35.99, 1.0), (16.99, 1.0)) 132.97
((79.99, 1.0), (29.99, 1.0), (29.99, 1.0)) 139.97
((79.99, 1.0), (29.99, 1.0), (16.992, 1.0)) 126.972
((79.99, 1.0), (29.99, 1.0), (16.99, 1.0)) 126.96999999999998
((69.99, 1.0), (49.99, 1.0), (29.99, 1.0)) 149.97
((69.99, 1.0), (49.99, 1.0), (16.992, 1.0)) 136.97199999999998
((69.99, 1.0), (49.99, 1.0), (16.99, 1.0)) 136.97
((69.99, 1.0), (35.99, 1.0), (29.99, 1.0)) 135.97
((69.99, 1.0), (35.99, 1.0), (16.992, 1.0)) 122.972
((69.99, 1.0), (35.99, 1.0), (16.99, 1.0)) 122.96999999999998
((69.99, 1.0), (29.99, 1.0), (29.99, 1.0)) 129.97
((69.99, 1.0), (29.99, 1.0), (16.992, 1.0)) 116.972
((69.99, 1.0), (29.99, 1.0), (16.99, 1.0)) 116.96999999999998
((69.99, 1.0), (49.

In [None]:
<div class="alert alert-block alert-danger">
    Even when regular_price >= promo_price >= sale_price the sale price values do not match the total_paid per order.
</div>

### Check sale_price

In [70]:
temp = prices.copy()
temp[temp.sale_price_decimal_count!=1]

Unnamed: 0,orderline_id,total_paid,regular_price,promo_price,sale_price,regular_price_decimal_count,promo_price_decimal_count,sale_price_decimal_count


In [71]:
temp = (
    temp.assign(
        sale_price_split = lambda x: x['sale_price'].str.split(r'\.'),
        decimal_places = lambda x: len(x['sale_price_split'][1]))
)
temp.head(5)

Unnamed: 0,orderline_id,total_paid,regular_price,promo_price,sale_price,regular_price_decimal_count,promo_price_decimal_count,sale_price_decimal_count,sale_price_split,decimal_places
0,1398738,136.15,139.99,1.149.948,129.16,1,2,1,"[129, 16]",2
1,1529178,15.76,17.99,109.904,10.77,1,1,1,"[10, 77]",2
2,1181923,84.98,99.99,999.896,77.99,1,1,1,"[77, 99]",2
3,1276706,149.0,179.0,1.489.994,149.0,0,2,1,"[149, 0]",2
4,1154394,112.97,103.95,59.584,52.99,1,1,1,"[52, 99]",2


In [72]:
temp[temp.decimal_places != 2]

Unnamed: 0,orderline_id,total_paid,regular_price,promo_price,sale_price,regular_price_decimal_count,promo_price_decimal_count,sale_price_decimal_count,sale_price_split,decimal_places


<div class="alert alert-block alert-success">
        It appears that the <b>sale_price</b> values are not corrupted. All values have one decimal point and two values after the decimal point
</div>

### Check total_paid

### Check regular_price

### Check promo_price

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,promo_price,sale_price
82,267375,1140087,2017-01-10 22:48:08,"Pure Nude Ultraslim 03 ""7/8 Transparent iPhone...",transparent and flexible cover with 03mm thick...,Puro,PUR0150,Accessories,17.98,1,12.95,129.906,12.99
110,281222,1232865,2017-04-21 19:19:07,iHealth box 50 Reagent Strips glucímetros,Blood glucose test strips in the iHealth glucí...,iHealth,IHE0026,unknown,24.98,1,19.95,199.904,19.99
129,286842,1122531,2017-01-02 22:39:55,Philips Hue Go Portable Light White,Portable light with natural dynamic effects an...,Philips,PHI0056,Hardware,79.99,1,79.95,789.888,79.99
204,299404,1120191,2017-01-01 22:55:53,Mac memory Kingston 4GB SO-DIMM DDR3åÊ1333MhzåÊ,4GB RAM iMac (2011) Mac mini and MacBook Pro (...,Kingston,KIN0156,unknown,415.11,1,35.53,429.913,37.99
272,299829,1119702,2017-01-01 17:50:48,LaCie Porsche Design Desktop Drive 8TB USB 3.0...,External Hard Drive 8TB 35-inch USB 3.0 for Ma...,LaCie,LAC0171,Memory,282.48,1,249.99,205.994,275.49


<div class="alert alert-block alert-info">
    These values could be correct but have been saved as ints instead of floats. <br>
    Let's change them to floats and check if they are alway >= sale_price.
</div>

In [66]:
temp[temp.regular_price_decimal_count==0].regular_price = temp[temp.regular_price_decimal_count==0].regular_price.astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp[temp.regular_price_decimal_count==0].regular_price = temp[temp.regular_price_decimal_count==0].regular_price.astype(float)


In [67]:
type(temp[temp.regular_price_decimal_count==0].regular_price.iloc[1])

str

In [59]:
temp[temp.regular_price_decimal_count==2].head(5)

Unnamed: 0,total_paid,regular_price,promo_price,sale_price,regular_price_decimal_count,promo_price_decimal_count,sale_price_decimal_count
21,505.76,1.990.002,194,152.95,2,0,1
75,127.02,1.728.001,17.28,14.53,2,1,1
101,795.85,2.499.013,249.901,17.56,2,1,1
281,457.89,5.608.689,4.849.898,457.89,2,2,1
295,33.73,2.499.013,249.901,23.74,2,1,1


### Transform orderlines.unit_price to floats

In [None]:
def transform_unit_price_to_floats(df):
    """
    Transform the orderlines.unit_price price column to floats.
    Some of the values have two decimal points. 
    For these values we will remove the leftmost decimal and transform all values to floats.
    The correct position of the decimal point will be determined by merging orderlines, 
    products, orders and brands, and comparing the price values.
    
    Args:
        df (pd.DataFrame): The orderlines data
    
    Returns:
        pd.DataFrame: The orderlines data with the unit_price column transformed from str to float values.
    """
    return (
        df.assign(unit_price = df.unit_price.str.split('.')
                  .apply(lambda x : x[0]+x[1]+'.'+x[2] if len(x)==3 else x[0]+'.'+ x[1])
                  .astype(float)
        )
    )
    

In [4]:
completed_sales['regular_price'].str.count(r'\.')

0        1
1        1
2        1
3        0
4        1
        ..
61667    0
61668    0
61669    0
61670    0
61671    0
Name: regular_price, Length: 61672, dtype: int64

In [None]:
def split_str_on_dots_and_append_decimal(df, col):
    '''Remove the decimal points from the strings and append .00'''
    return df.assign(promo_price=df[col]
                     .str.split('.')
                     .str.join('')
                     .apply(lambda x: x+'.00')
                    )

def _insert_decimal_at_string_position(s, pos):
    '''Insert a decimal point at a given position in a string'''
    s = s.split('.')
    s = s[0] + s[1]
    s = s[:pos]+'.'+s[pos:]
    return s 
    
def _insert_decimal_in_regular_price(row):
    '''
    Keep moving the decimal point towards the end of the regular_price 
    string until the sale_price is lower or equal to the price.
    Then transform the regular_price string to a float and round it to two decimal places.
    '''
    decimal_position = 1
    row.regular_price = _insert_decimal_at_string_position(row.regular_price, decimal_position)
    
    while float(row.regular_price) < row.sale_price:
        if round(float(row.regular_price), 0) == round(row.sale_price, 0):
            row.sale_price = round(float(row.price), 2)
            return round(float(row.regular_price), 2)
        else:
            row.regular_price = _insert_decimal_at_string_position(row.regular_price, decimal_position)
            decimal_position += 1

    return round(float(row.price), 2)

def transform_regular_price_to_float(df):
    df.regular_price = [_insert_decimal_in_regular_price(row) for index, row in df.iterrows()]
    return df