# Assumptions

* The total_paid must be greater or equal to the sum of product_quantity*sale_price for all orderlines in an order
* regular_price must be greater or equal to sale_price
* regular_price must be greater or equal to promo_price
* promo_price could be less than sale_price if the product was sold at the regular_price
* It is possible that the difference between the total_paid value and sum(product_quantity*sale_price) is the shipping cost

In [1]:
import re
import numpy as np
import pandas as pd
import sys
import os

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

import data_utils
import price_debugging_tests as pdt

# Set maximum number of rows to display
pd.set_option('display.max_rows', 1000)

# Import and merge data

In [2]:
# Import the data
orders = data_utils.clean_orders(data_path="../../data/")
orderlines = data_utils.clean_orderlines(data_path="../../data/")
products = data_utils.clean_products(data_path="../../data/")
brands = data_utils.clean_brands(data_path="../../data/")

# Merge the data
completed_sales =  data_utils.merge_data(orders, orderlines, products, brands)

5 missing values were removed from orders.
This represents 0.0022% of the data.


0 missing values were removed from orderlines.
This represents 0.00% of the data.


8792 missing values were removed from products
This represents 45.49% of the data.


0 missing values were removed from brands.
This represents 0.00% of the data.




# Price data exploration
## Exclude orders with multiple orderlines and multiple products

In [57]:
# Get the count of orderlines per order from completed_sales
completed_sales_orderline_counts = completed_sales.groupby('order_id')['orderline_id'].count().reset_index()
completed_sales_orderline_counts.rename(columns={'orderline_id': 'orderline_count'}, inplace=True)

# Exclude orders with more than one orderline.
orders_with_one_orderline = completed_sales_orderline_counts[completed_sales_orderline_counts.orderline_count==1]
single_orderline_orders = completed_sales[completed_sales.order_id.isin(orders_with_one_orderline.order_id)]

# Exclude orders with product_quantity > 1
single_orderline_single_product = single_orderline_orders[single_orderline_orders.product_quantity==1].copy()
single_orderline_single_product.shape[0]/total_orders*100

print(f"We still have {np.round(single_orderline_single_product.shape[0]/total_orders*100)}% of the orders")

We still have 72.0% of the orders


## Explore corrupted decimal values

### Helper functions

In [6]:
def _calculate_percentage_difference(df, reference_col, comparison_col):
    return ((df[reference_col] - df[comparison_col])/df[reference_col]*100).round(2)

def add_discount_percentage_col(df):
    df['discount_percentage'] = _calculate_percentage_difference(df, 'regular_price', 'sale_price')
    return df

def check_for_outliers(df, reference_col='regular_price', comparison_col='sale_price'):
    ''' 
    Check for outliers to see if the decimal value is in the wrong spot. 
    
    We define an outlier as having a percentage difference between the two price 
    values greater than 3 standard deviations from the mean percentage difference.
    '''
    diff_col_name = comparison_col + '_vs_' + reference_col + '_percentage_diff'
    df[diff_col_name] = _calculate_percentage_difference(df, reference_col, comparison_col)
    
    mean_diff = df[diff_col_name].mean()
    std_diff = df[diff_col_name].std()
    threshold = mean_diff + 3 * std_diff
    
    outliers = df[abs(df[diff_col_name]) > threshold]
    
    return outliers

## Evaluate potentially uncorrupted values

### Decimal count = 1 = 1 = 1
<div class="alert alert-box alert-info">
    Since 67% of the total completed_sales data has total_paid_decimal_count = regular_price_decimal_count == sale_price_decimal_count == 1, cleaning this data will be a good first step.
</div>

In [11]:
orderline_ids = single_orderline_single_product[
    (single_orderline_single_product.total_paid_decimal_count == 1) &
    (single_orderline_single_product.regular_price_decimal_count == 1) &
    (single_orderline_single_product.sale_price_decimal_count == 1)
].orderline_id

one_decimal_data = single_orderline_single_product[single_orderline_single_product.orderline_id.isin(orderline_ids)].copy()
one_decimal_data[['regular_price', 'sale_price']] = one_decimal_data[['regular_price', 'sale_price']].astype('float')
# drop the decimal count cols to clean up the output
one_decimal_data.drop(['total_paid_decimal_count', 'regular_price_decimal_count', 'sale_price_decimal_count'], axis=1, inplace=True)

# Add a discount percentage col to detect incorrect values and outliers
one_decimal_data = add_discount_percentage_col(one_decimal_data)

In [12]:
one_decimal_data[one_decimal_data.order_id==376743]

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,sale_price,discount_percentage
19074,376743,1291409,2017-07-13 08:39:01,5TB Lacie Rugged Hard Disk Thunderbolt USB-C,"5TB hard disk rugged, compact Thunderbolt and ...",LaCie,LAC0227,Memory,301.99,1,299.99,301.99,-0.67


#### Problem 1: sale_price < total_paid

<div class="alert alert-box alert-danger">
    There are cases where the sale_price is greater than the total_paid value.
</div>

In [13]:
one_decimal_data[one_decimal_data.order_id.isin([301495, 281302, 274043, 287311, 300950, 296010, 297572])]

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,sale_price,discount_percentage
92,274043,1121663,2017-01-02 16:30:49,Griffin Magnetic BreakSafe-C USB charging cabl...,Charging cable and connector magnetic fast lib...,Griffin,GRT0425,Accessories,3.98,1,34.99,24.99,28.58
111,281302,1142023,2017-01-12 01:00:05,Startech USB Adapter VGA-C Blanco,Adapter with reversible USB connection VGA-C (...,Startech,STA0036,Accessories,16.98,1,53.99,29.99,44.45
131,287311,1164035,2017-01-29 22:54:50,Sandisk iXpand Lightning to USB 3.0 64GB,64GB storage unit for iPhone and iPad,SanDisk,SAN0134,Accessories,54.99,1,69.99,59.99,14.29
160,296010,1138342,2017-01-09 23:41:57,"Tucano Nido Hard-Shell Case MacBook Air 13 ""Black",rigid and slicked rubber feet MacBook Air 13 i...,Tucano,TUC0252,Accessories,0.0,1,29.9,24.99,16.42
182,297572,1162207,2017-01-27 23:53:16,Twelve South BookArc support for MacBook Pro /...,Aluminum support in an arc Retina MacBook Air.,Twelve South,TWS0093,Accessories,51.99,1,59.99,56.99,5.0
684,300950,1122232,2017-01-02 20:20:08,Apple Mac mini Core i5 14GHz | 8GB RAM | 500GB...,Mac mini desktop computer 8GB RAM 500GB SSD (M...,Pack,PAC0594,Memory,777.99,1,1135.59,782.99,31.05
866,301495,1123362,2017-01-03 12:04:16,Svolta Tucano MacBook Pro Sleeve bag / Retina ...,compact case for MacBook / Air 13 and 13 inche...,Tucano,TUC0277,Accessories,0.0,1,29.9,24.99,16.42


<div class="alert alert-box alert-danger">
    The first case above seems to have the decimal point in the wrong place in total_paid.
    <br><br>
    Some of the prices are relatively similar, for example, the 3rd, 5th and 6th rows above, and could possibly be corrected by setting the total_paid equal to the sale_price.
    <br><br>
    Some of the data is too corrupted to be corrected: The 2nd, 4th and 7th rows.
</div>

In [14]:
''' What percentage of the single_orderline_single_product data has total_paid values lower than the sale_price? '''
total_orders = one_decimal_data.shape[0]
sale_price_too_high = one_decimal_data[one_decimal_data.total_paid < one_decimal_data.sale_price].shape[0]
print(f"The sale_price is greater than the total_paid for {sale_price_too_high/total_orders*100:.2f}% of the orders. ")

The sale_price is greater than the total_paid for 0.68% of the orders. 


<div class="alert alert-box alert-success">
    This problem appears to apply to a very small percentage of the data and, for now, we will assume that the simplest way to deal with this problem is to delete these rows.
</div>

#### Problem 2: regular_price < sale_price

<div class="alert alert-box alert-danger">
    There are orderlines where the regular_price is lower than the sale_price.
</div>

In [15]:
one_decimal_data[one_decimal_data.order_id.isin([267375])]

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,sale_price,discount_percentage
82,267375,1140087,2017-01-10 22:48:08,"Pure Nude Ultraslim 03 ""7/8 Transparent iPhone...",transparent and flexible cover with 03mm thick...,Puro,PUR0150,Accessories,17.98,1,12.95,12.99,-0.31


In [16]:
''' What percentage of the data has regular_price values lower than the sale_price? '''
total_orders = one_decimal_data.shape[0]
sale_price_too_high = one_decimal_data[one_decimal_data.regular_price < one_decimal_data.sale_price].shape[0]
print(f"The sale_price is greater than the regular_price for {sale_price_too_high/total_orders*100:.2f}% of the orders. ")

The sale_price is greater than the regular_price for 3.30% of the orders. 


In [17]:
''' 
It appears that for many of these cases the difference is minor, as seen above. 
Let's round the values and see if it makes a difference. 
'''

sale_price_greater_than_regular = one_decimal_data[one_decimal_data.regular_price < one_decimal_data.sale_price].copy()

sale_price_greater_than_regular.sale_price = sale_price_greater_than_regular.sale_price.round()
sale_price_greater_than_regular.regular_price = sale_price_greater_than_regular.regular_price.round()
sale_price_greater_than_regular.total_paid = sale_price_greater_than_regular.total_paid.round()

rounded_sale_price_too_high = sale_price_greater_than_regular[sale_price_greater_than_regular.regular_price < sale_price_greater_than_regular.sale_price].shape[0]
print(f"The sale_price is greater than the regular_price for {rounded_sale_price_too_high/total_orders*100:.2f}% of the orders. ")

The sale_price is greater than the regular_price for 1.18% of the orders. 


In [18]:
'''
There are orderlines where there is a significant difference between the regular_price and sale_price.
It seems that the regular_price saved in products.csv has changed over time.
'''
sale_price_greater_than_regular[sale_price_greater_than_regular.regular_price < sale_price_greater_than_regular.sale_price].head()

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,sale_price,discount_percentage
272,299829,1119702,2017-01-01 17:50:48,LaCie Porsche Design Desktop Drive 8TB USB 3.0...,External Hard Drive 8TB 35-inch USB 3.0 for Ma...,LaCie,LAC0171,Memory,282.0,1,250.0,275.0,-10.2
296,299909,1119872,2017-01-01 19:50:09,LaCie Porsche Design Desktop Drive 5TB USB 3.0...,External Hard Drive 5TB 35-inch USB 3.0 for Ma...,LaCie,LAC0159,Memory,184.0,1,175.0,177.0,-0.97
490,300464,1121064,2017-01-02 12:09:42,Withings Activite Steel Black Dial White,Smart Clock for iPhone and iPad,Withings,WIT0034,Smartwatch,153.0,1,130.0,153.0,-17.73
506,300497,1121164,2017-01-02 12:44:12,Full screen repair iPhone 5,Repair service including parts and labor for i...,Repair,REP0134,Repairs & warranties,100.0,1,60.0,90.0,-50.01
679,300941,1122210,2017-01-02 20:08:58,Withings Activite Steel Black Dial White,Smart Clock for iPhone and iPad,Withings,WIT0034,Smartwatch,153.0,1,130.0,153.0,-17.73


<div class="alert alert-box alert-success">
    This solution seems to have worked relatively well.
    <br><br>
    It appears that for the remaining values the regular price has changed since the sale took place.
    <br>
    We will tackle that as the next problem.
</div>

#### Problem 3: regular_price changing over time

<div class="alert alert-box alert-info">
    We still have to solve the problem of the regular_price being less than the sale_price in some instances, so that the discount percentage can be calculated correctly.
    <br>
    Let's check the values above and see if the regular prices have changed over time.
</div>

In [19]:
'''
LAC0159 - 16 rows, only the first value is incorrect. The sale_price is €1.70 greater than the regular_price
WIT0034 - 2 rows, total_paid = sale_price
REP0134 - 3 rows, total_paid = sale_price + 9.99 (shipping fee)
WIT0034 - 2 rows, total_paid = sale_price

LAC0171 - 108 rows, the first 6 rows have sale_price > regular_price. After that all the sale_prices are below the regular price, €249.99.
          However, the sale_price values in the first 6 rows (shown below) vary between €302.99 and €269.79.
          How do we know if €302.9 was the regular_price at this time and the other 5 values represent discount sales?
          The 4th, 5th and 6th rows have a sale_price of €283.99. 
          This could signify that the data is not corrupted but rather that this was a discount or a price change.
'''
sku = 'LAC0171'
completed_sales[completed_sales.sku.isin([sku])].sort_values('date').head(10)

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,sale_price
272,299829,1119702,2017-01-01 17:50:48,LaCie Porsche Design Desktop Drive 8TB USB 3.0...,External Hard Drive 8TB 35-inch USB 3.0 for Ma...,LaCie,LAC0171,Memory,282.48,1,249.99,275.49
3085,308551,1143957,2017-01-13 10:00:40,LaCie Porsche Design Desktop Drive 8TB USB 3.0...,External Hard Drive 8TB 35-inch USB 3.0 for Ma...,LaCie,LAC0171,Memory,309.98,1,249.99,302.99
6920,318532,1168266,2017-01-31 23:10:16,LaCie Porsche Design Desktop Drive 8TB USB 3.0...,External Hard Drive 8TB 35-inch USB 3.0 for Ma...,LaCie,LAC0171,Memory,276.78,1,249.99,269.79
6970,318707,1168710,2017-02-01 10:38:02,LaCie Porsche Design Desktop Drive 8TB USB 3.0...,External Hard Drive 8TB 35-inch USB 3.0 for Ma...,LaCie,LAC0171,Memory,290.98,1,249.99,283.99
7215,319407,1170172,2017-02-02 23:49:53,LaCie Porsche Design Desktop Drive 8TB USB 3.0...,External Hard Drive 8TB 35-inch USB 3.0 for Ma...,LaCie,LAC0171,Memory,325.97,1,249.99,283.99
7218,319416,1170189,2017-02-03 00:34:15,LaCie Porsche Design Desktop Drive 8TB USB 3.0...,External Hard Drive 8TB 35-inch USB 3.0 for Ma...,LaCie,LAC0171,Memory,295.98,1,249.99,283.99
12453,351854,1239836,2017-05-02 14:15:50,LaCie Porsche Design Desktop Drive 8TB USB 3.0...,External Hard Drive 8TB 35-inch USB 3.0 for Ma...,LaCie,LAC0171,Memory,242.98,1,249.99,235.99
12476,351938,1240025,2017-05-02 18:17:42,LaCie Porsche Design Desktop Drive 8TB USB 3.0...,External Hard Drive 8TB 35-inch USB 3.0 for Ma...,LaCie,LAC0171,Memory,235.99,1,249.99,235.99
12493,351987,1240131,2017-05-02 20:07:28,LaCie Porsche Design Desktop Drive 8TB USB 3.0...,External Hard Drive 8TB 35-inch USB 3.0 for Ma...,LaCie,LAC0171,Memory,235.99,1,249.99,235.99
12682,352663,1241533,2017-05-04 14:00:16,LaCie Porsche Design Desktop Drive 8TB USB 3.0...,External Hard Drive 8TB 35-inch USB 3.0 for Ma...,LaCie,LAC0171,Memory,235.99,1,249.99,235.99


<div class="alert alert-box alert-danger">
    We will have to examine the amount of data this problem affects after fixing as many of the prices as we can.
    <br>
    We can then decide if we wish to adjust the values or delete the orders entirely.
</div>

#### Problem 4: Incorrectly placed decimal points

In [20]:
'''
There are values where the regular_price has been corrupted.
'''
one_decimal_data[one_decimal_data.order_id.isin([496339])]

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,sale_price,discount_percentage
53601,496339,1574664,2018-01-22 20:18:56,Fibaro Single Switch embedded module,embedded module to control your consumption ou...,Fibaro,FIB0011,Accessories,55.98,1,599.918,56.99,90.5


#### TODO: Check how rounding the values affects the data
We see in the case above that we will need to round to whole numbers for this solution to have the desired affect.


problem 1: sale_price > total_paid - check how many rows, ignore for now?

problem 2: regular_price < sale_price 
- Round all values
- For the other ones, check if the regular price changed over time and assess whether the correct prices can be deduced from the prices at dates preceeding and following.
- Calculate the discount percentage after rounding and use negative discounts to identify these?

In [30]:
''' Let's refresh the data and round the values so that we can detect outliers using the discount_percentage '''

orderline_ids = single_orderline_single_product[
    (single_orderline_single_product.total_paid_decimal_count == 1) &
    (single_orderline_single_product.regular_price_decimal_count == 1) &
    (single_orderline_single_product.sale_price_decimal_count == 1)
].orderline_id

one_decimal_data = single_orderline_single_product[single_orderline_single_product.orderline_id.isin(orderline_ids)].copy()
one_decimal_data[['regular_price', 'sale_price']] = one_decimal_data[['regular_price', 'sale_price']].astype('float')

one_decimal_data.drop(['total_paid_decimal_count', 'regular_price_decimal_count', 'sale_price_decimal_count'], axis=1, inplace=True)

# Tests
sale_greater_than_regular_orderline_ids = pdt.test_regular_greater_or_equal_to_sale(one_decimal_data)
sale_greater_than_total_orderline_ids = pdt.test_total_greater_or_equal_to_sale(one_decimal_data)

# Outliers
outliers = check_for_outliers(one_decimal_data)
print(f"There are {outliers.shape[0]} outliers")

There are corrupted values in regular_price which are less than their corresponding sale_price values.
This represents 3.30% of the data.

There are corrupted values in total_paid which are less than their corresponding sale_price values.
This represents 0.68% of the data.

There are 305 outliers


In [22]:
outliers.sort_values('sale_price_vs_regular_price_percentage_diff', ascending=False).head()

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,sale_price,sale_price_vs_regular_price_percentage_diff
53630,496421,1574841,2018-01-22 22:35:46,Open - Piece Internal Battery iPhone 6,Replacement internal battery for Apple iPhone 6,Replacement,PIE0034-A,Accessories,15.78,1,349.896,11.79,96.63
19152,376976,1291907,2017-07-13 18:00:53,Crucial memory Mac 8GB (2x4GB) SO-DIMM DDR3 16...,8GB RAM (2x4GB) SO-DIMM 1600MHZ iMac and Macbo...,Crucial,CRU0025-2,unknown,85.27,1,979.785,65.28,93.34
2474,306824,1139922,2017-01-10 21:16:45,Crucial memory Mac 8GB (2x4GB) SO-DIMM DDR3 16...,8GB RAM (2x4GB) SO-DIMM 1600MHZ iMac and Macbo...,Crucial,CRU0025-2,unknown,77.98,1,979.785,72.99,92.55
23999,392903,1326538,2017-08-28 17:49:16,Open - Elgato Eve and Energy Plug Power Wirele...,Control on and off and sensor power consumptio...,Elgato,ELG0033-A,Smartwatch,37.25,1,499.899,37.25,92.55
24270,393603,1326648,2017-08-28 19:19:14,Open - Elgato Eve and Energy Plug Power Wirele...,Control on and off and sensor power consumptio...,Elgato,ELG0033-A,Smartwatch,37.24,1,499.899,37.24,92.55


In [23]:
''' Now round the prices and see if it makes a difference. '''

one_decimal_data[['total_paid', 'regular_price', 'sale_price']] = one_decimal_data[['total_paid', 'regular_price', 'sale_price']].round()

# Tests
sale_greater_than_regular_orderline_ids = pdt.test_regular_greater_or_equal_to_sale(one_decimal_data)
sale_greater_than_total_orderline_ids = pdt.test_total_greater_or_equal_to_sale(one_decimal_data)

# Outliers
outliers = check_for_outliers(one_decimal_data)
print(f"There are {outliers.shape[0]} outliers")

There are corrupted values in regular_price which are less than their corresponding sale_price values.
This represents 1.18% of the data.

There are corrupted values in total_paid which are less than their corresponding sale_price values.
This represents 0.47% of the data.

There are 305 outliers


In [24]:
outliers.sort_values('sale_price_vs_regular_price_percentage_diff', ascending=False).head()

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,sale_price,sale_price_vs_regular_price_percentage_diff
53630,496421,1574841,2018-01-22 22:35:46,Open - Piece Internal Battery iPhone 6,Replacement internal battery for Apple iPhone 6,Replacement,PIE0034-A,Accessories,16.0,1,350.0,12.0,96.57
19152,376976,1291907,2017-07-13 18:00:53,Crucial memory Mac 8GB (2x4GB) SO-DIMM DDR3 16...,8GB RAM (2x4GB) SO-DIMM 1600MHZ iMac and Macbo...,Crucial,CRU0025-2,unknown,85.0,1,980.0,65.0,93.37
24293,393659,1326788,2017-08-28 22:31:30,Open - Elgato Eve and Energy Plug Power Wirele...,Control on and off and sensor power consumptio...,Elgato,ELG0033-A,Smartwatch,37.0,1,500.0,37.0,92.6
23999,392903,1326538,2017-08-28 17:49:16,Open - Elgato Eve and Energy Plug Power Wirele...,Control on and off and sensor power consumptio...,Elgato,ELG0033-A,Smartwatch,37.0,1,500.0,37.0,92.6
24453,394043,1327664,2017-08-29 21:04:10,Open - Elgato Eve and Energy Plug Power Wirele...,Control on and off and sensor power consumptio...,Elgato,ELG0033-A,Smartwatch,37.0,1,500.0,37.0,92.6


<div class="alert alert-block alert-danger">
    After visually examining all the values in the above dataframe (shown truncated), it is clear that the decimal point in most of these regular_price values has been placed one point too far to the right.
</div>

In [25]:
def _move_decimal_right(row, decimal_col, comparison_col):
    denominator = 10
    while row[decimal_col] >= row[comparison_col]:
        if row[decimal_col] > row[comparison_col]:
            row[decimal_col] /= denominator
        else:
            return row[decimal_col]
    return row[decimal_col] * denominator

def move_regular_price_decimal_point_right_wrt_sale_price(df):
    df.regular_price = [_move_decimal_right(row, 'regular_price', 'sale_price') for index, row in df.iterrows()]
    return df


test_data = outliers.sort_values('sale_price_vs_regular_price_percentage_diff', ascending=False).head(5).copy()
test_data = move_regular_price_decimal_point_right_wrt_sale_price(test_data)
test_data

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,sale_price,sale_price_vs_regular_price_percentage_diff
53630,496421,1574841,2018-01-22 22:35:46,Open - Piece Internal Battery iPhone 6,Replacement internal battery for Apple iPhone 6,Replacement,PIE0034-A,Accessories,16.0,1,35.0,12.0,96.57
19152,376976,1291907,2017-07-13 18:00:53,Crucial memory Mac 8GB (2x4GB) SO-DIMM DDR3 16...,8GB RAM (2x4GB) SO-DIMM 1600MHZ iMac and Macbo...,Crucial,CRU0025-2,unknown,85.0,1,98.0,65.0,93.37
24293,393659,1326788,2017-08-28 22:31:30,Open - Elgato Eve and Energy Plug Power Wirele...,Control on and off and sensor power consumptio...,Elgato,ELG0033-A,Smartwatch,37.0,1,50.0,37.0,92.6
23999,392903,1326538,2017-08-28 17:49:16,Open - Elgato Eve and Energy Plug Power Wirele...,Control on and off and sensor power consumptio...,Elgato,ELG0033-A,Smartwatch,37.0,1,50.0,37.0,92.6
24453,394043,1327664,2017-08-29 21:04:10,Open - Elgato Eve and Energy Plug Power Wirele...,Control on and off and sensor power consumptio...,Elgato,ELG0033-A,Smartwatch,37.0,1,50.0,37.0,92.6


In [26]:
test_data = outliers[outliers.order_id.isin([512762, 512594, 512579, 512662, 512523, 513175, 301102])].head(5).copy()
test_data = move_regular_price_decimal_point_right_wrt_sale_price(test_data)
test_data

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,sale_price,sale_price_vs_regular_price_percentage_diff
723,301102,1131193,2017-01-05 12:48:03,Crucial memory Mac 8GB (2x4GB) SO-DIMM DDR3 16...,8GB RAM (2x4GB) SO-DIMM 1600MHZ iMac and Macbo...,Crucial,CRU0025-2,unknown,137.0,1,980.0,137.0,86.02
57981,512523,1615968,2018-02-16 09:03:37,Mophie Powerstation 4000mAh Battery Plus Mini ...,external battery capacity 4000mAh output volta...,Mophie,MOP0107,Accessories,13.0,1,70.0,8.0,88.57
57997,512579,1616105,2018-02-16 09:26:33,Mophie Powerstation 6000mAh battery Universal ...,external battery capacity 6000mAh output volta...,Mophie,MOP0105,Accessories,14.0,1,90.0,10.0,88.89
58006,512594,1616149,2018-02-16 09:36:22,Mophie Powerstation 6000mAh battery Universal ...,external battery capacity 6000mAh output volta...,Mophie,MOP0105,Accessories,14.0,1,90.0,10.0,88.89
58033,512662,1616282,2018-02-16 09:58:57,Mophie Powerstation 6000mAh battery Universal ...,external battery capacity 6000mAh output volta...,Mophie,MOP0105,Accessories,14.0,1,90.0,10.0,88.89


<div class="alert alert-box alert-danger">
    Oops! In lines above we can see that the regular_price has been corrupted and should clearly be slightly lower than the sale_price rather than much greater.
    <br><br>
    Let's edit the logic above to keep moving the decimal point until the smallest possible discount_percentage is found.
    <br><br>
    Later we can examine the data and decide whether it makes sense to set the regular_price equal to the sale_price or delete the rows.
</div>

In [46]:
a = 0.15
a.round(2)

AttributeError: 'float' object has no attribute 'round'

In [45]:
def _calculate_percentage_difference(df, reference_col, comparison_col):
    return ((df[reference_col] - df[comparison_col])/df[reference_col]*100).round(2)

def _move_decimal_right(row, decimal_col, comparison_col):
    denominator = 10

    while True:
        new_row = row.copy()
        new_row[decimal_col] /= denominator
             
        # Check if the new percentage difference is closer to 0
        if abs(_calculate_percentage_difference(new_row, decimal_col, comparison_col)) > abs(_calculate_percentage_difference(row, decimal_col, comparison_col)):
            return row
        else:
            row = new_row
    
def move_regular_price_decimal_point_right_wrt_sale_price(df):
    df = df.apply(lambda row: _move_decimal_right(row, 'regular_price', 'sale_price'), axis=1)
    return df


''' Check the new logic on the one decimal data and rerun the tests '''

one_decimal_data = single_orderline_single_product[single_orderline_single_product.orderline_id.isin(orderline_ids)].copy()
one_decimal_data[['regular_price', 'sale_price']] = one_decimal_data[['regular_price', 'sale_price']].astype('float')
one_decimal_data.drop(['total_paid_decimal_count', 'regular_price_decimal_count', 'sale_price_decimal_count'], axis=1, inplace=True)

# Fix the decimal points
# one_decimal_data = move_regular_price_decimal_point_right_wrt_sale_price(one_decimal_data)

one_decimal_data

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,sale_price
0,241423,1398738,2017-11-06 12:47:20,LaCie Porsche Design Desktop Drive 4TB USB 3.0...,External Hard Drive 4TB 35-inch USB 3.0 for Ma...,LaCie,LAC0212,Memory,136.15,1,139.99,129.16
1,242832,1529178,2017-12-31 17:26:40,Parrot 550mAh battery for MiniDrones,550mAh rechargeable battery for Parrot minidrones,Parrot,PAR0074,Accessories,15.76,1,17.99,10.77
2,243330,1181923,2017-02-15 17:07:44,Mac OWC Memory 8GB 1066MHZ DDR3 SO-DIMM,8GB RAM Mac mini iMac MacBook and MacBook Pro ...,OWC,OWC0074,unknown,84.98,1,99.99,77.99
13,247524,1547886,2018-01-08 21:21:14,Crucial memory Mac 16GB (2x8GB) SO-DIMM DDR3 1...,RAM 16GB (2x8GB) Mac mini (2011) iMac (2010/11...,Crucial,CRU0026-2,unknown,167.98,1,163.98,162.99
14,247643,1255453,2017-05-26 11:44:27,Open - Samsung 850 EVO SSD Disk 500GB,Open hard disk with SSD Mac and PC packaging 2...,Samsung,SAM0068-A,Accessories,153.54,1,179.99,153.54
...,...,...,...,...,...,...,...,...,...,...,...,...
61624,526816,1649042,2018-03-14 09:26:27,Macally Lampcharge Lamp with 4 USB ports,Table lamp with touch control and 4 USB charge...,Macally,MAC0144,Hardware,29.98,1,39.95,24.99
61625,526843,1649106,2018-03-14 10:39:18,Open - LaCie Porsche Design Desktop Lightgrey ...,External Hard Drive Refurbished 35 inches 4TB ...,LaCie,LAC0206-A,Accessories,113.22,1,139.99,108.23
61626,526858,1649135,2018-03-14 11:04:14,"OWC SSD installation kit for iMac 27 ""2011",SSD installation on iMac 27-inch Mid 2011 Kit,OWC,OWC0027,Accessories,47.98,1,60.99,42.99
61627,526863,1649141,2018-03-14 11:13:35,Mac memory OWC 8GB (2x4GB) SO-DIMM DDR3 1333MHZ,RAM 8GB (2x4GB) Mac mini (2011) iMac (2010-201...,OWC,OWC0036-2,unknown,96.98,1,93.98,92.99


In [None]:
# Round the prices
one_decimal_data[['total_paid', 'regular_price', 'sale_price']] = one_decimal_data[['total_paid', 'regular_price', 'sale_price']].round()

# Calculate the discount percentage
one_decimal_data = add_discount_percentage_col(one_decimal_data)

# Tests
sale_greater_than_regular_orderline_ids = pdt.test_regular_greater_or_equal_to_sale(one_decimal_data)
sale_greater_than_total_orderline_ids = pdt.test_total_greater_or_equal_to_sale(one_decimal_data)

# Outliers
outliers = check_for_outliers(one_decimal_data)
print(f"There are {outliers.shape[0]} outliers")

<div class="alert alert-block alert-success">
    Success!
</div>

In [31]:
completed_sales[completed_sales.sku=='GRT0462']

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,sale_price
50,258530,1328354,2017-08-30 10:42:53,Griffin Survivor iPhone Case Clear 8 Plus / 7 ...,"Case, Green / Transparent iPhone resistant pol...",Griffin,GRT0462,Accessories,7.78,1,24.99,3.79
10057,339409,1299797,2017-07-22 20:16:21,Griffin Survivor iPhone Case Clear 8 Plus / 7 ...,"Case, Green / Transparent iPhone resistant pol...",Griffin,GRT0462,Accessories,14.97,1,24.99,4.99
13250,354697,1245634,2017-05-11 08:50:29,Griffin Survivor iPhone Case Clear 8 Plus / 7 ...,"Case, Green / Transparent iPhone resistant pol...",Griffin,GRT0462,Accessories,13.98,1,24.99,9.99
13334,354980,1246820,2017-05-12 18:37:33,Griffin Survivor iPhone Case Clear 8 Plus / 7 ...,"Case, Green / Transparent iPhone resistant pol...",Griffin,GRT0462,Accessories,19.98,1,24.99,9.99
18853,376157,1290942,2017-07-12 17:23:01,Griffin Survivor iPhone Case Clear 8 Plus / 7 ...,"Case, Green / Transparent iPhone resistant pol...",Griffin,GRT0462,Accessories,1034.98,1,24.99,6.99
19249,377329,1292597,2017-07-14 09:53:19,Griffin Survivor iPhone Case Clear 8 Plus / 7 ...,"Case, Green / Transparent iPhone resistant pol...",Griffin,GRT0462,Accessories,28.97,1,24.99,3.99
19937,379563,1297129,2017-07-19 14:24:10,Griffin Survivor iPhone Case Clear 8 Plus / 7 ...,"Case, Green / Transparent iPhone resistant pol...",Griffin,GRT0462,Accessories,62.96,1,24.99,4.99
21238,383929,1310134,2017-08-02 23:20:29,Griffin Survivor iPhone Case Clear 8 Plus / 7 ...,"Case, Green / Transparent iPhone resistant pol...",Griffin,GRT0462,Accessories,43.96,1,24.99,4.99
21461,384403,1307917,2017-07-31 16:22:42,Griffin Survivor iPhone Case Clear 8 Plus / 7 ...,"Case, Green / Transparent iPhone resistant pol...",Griffin,GRT0462,Accessories,21.97,1,24.99,4.99
22435,387595,1314187,2017-08-08 17:43:26,Griffin Survivor iPhone Case Clear 8 Plus / 7 ...,"Case, Green / Transparent iPhone resistant pol...",Griffin,GRT0462,Accessories,877.32,1,24.99,3.99


In [32]:
outliers#.head() # 301752, 302431

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,sale_price,sale_price_vs_regular_price_percentage_diff
50,258530,1328354,2017-08-30 10:42:53,Griffin Survivor iPhone Case Clear 8 Plus / 7 ...,"Case, Green / Transparent iPhone resistant pol...",Griffin,GRT0462,Accessories,7.78,1,24.99,3.79,84.83
704,301043,1122427,2017-01-02 21:46:01,Samsung EVO SSD 750 250 GB,SSD Hard Drive SATA 6 Gb / s 25-inch NAND tech...,Samsung,SAM0095,Memory,86.99,1,999.896,86.99,91.3
723,301102,1131193,2017-01-05 12:48:03,Crucial memory Mac 8GB (2x4GB) SO-DIMM DDR3 16...,8GB RAM (2x4GB) SO-DIMM 1600MHZ iMac and Macbo...,Crucial,CRU0025-2,unknown,136.87,1,979.785,136.87,86.03
882,301528,1123450,2017-01-03 12:40:06,Elgato Eve and Energy Plug Power Wireless Sens...,Control on / off and power / consumer remote s...,Pack,PAC1400,Smartwatch,96.99,1,999.799,96.99,90.3
1066,302071,1127175,2017-01-03 22:18:08,Samsung EVO SSD 750 250 GB,SSD Hard Drive SATA 6 Gb / s 25-inch NAND tech...,Samsung,SAM0095,Memory,86.99,1,999.896,86.99,91.3
1683,304477,1134261,2017-01-07 21:01:25,"Red 750GB WD 25 ""Mac PC hard drive and NAS",Western Digital hard drive designed for NAS 75...,Western Digital,WDT0185,Hardware,53.98,1,639.896,49.99,92.19
1915,305257,1136025,2017-01-08 23:48:37,Pure UltraSlim Case 03 + Protector iPhone 5 / ...,03mm thin cover with screen protector included...,Puro,PUR0142,Accessories,16.98,1,129.906,12.99,90.0
1981,305501,1137439,2017-01-09 15:57:06,Service installation RAM + HDD + SSD iMac,installation RAM HDD + SSD + on your iMac + Da...,Service,SEV0025,Memory,54.99,1,599.918,59.99,90.0
2474,306824,1139922,2017-01-10 21:16:45,Crucial memory Mac 8GB (2x4GB) SO-DIMM DDR3 16...,8GB RAM (2x4GB) SO-DIMM 1600MHZ iMac and Macbo...,Crucial,CRU0025-2,unknown,77.98,1,979.785,72.99,92.55
2584,307156,1140724,2017-01-11 11:16:20,Travelstar Kingston Pen Drive 3C Duo micro USB...,64GB pendrive reversible USB-C to USB for Mac ...,Kingston,KIN0123,Accessories,2.98,1,302.803,23.99,92.08


<div class="alert alert-block alert-success">
    After visually evaluating the outliers they appear to just be products with very large discounts.
</div>

In [33]:
outliers[outliers.order_id.isin([301752, 302431])]

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,sale_price,sale_price_vs_regular_price_percentage_diff


<div class="alert alert-block alert-danger">
    Except the values above where the total_paid decimal has been corrupted.
    <br><br>
    Let's handle this in the next section where we will be cleaning the total_paid values anyway.
</div>

#### Problem 5: total_paid < sale_price & total_paid >> sale_price

<div class="alert alert-block alert-info">
    Let's first take a look at the current state of the total_paid data.
</div>

In [34]:
completed_sales[completed_sales.total_paid==0]

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,sale_price
160,296010,1138342,2017-01-09 23:41:57,"Tucano Nido Hard-Shell Case MacBook Air 13 ""Black",rigid and slicked rubber feet MacBook Air 13 i...,Tucano,TUC0252,Accessories,0.0,1,29.9,24.99
866,301495,1123362,2017-01-03 12:04:16,Svolta Tucano MacBook Pro Sleeve bag / Retina ...,compact case for MacBook / Air 13 and 13 inche...,Tucano,TUC0277,Accessories,0.0,1,29.9,24.99


In [35]:
# The 2nd and 3rd rows below have additional orderlines in their orders, the sum of which is equal to the total paid. 
completed_sales[completed_sales.sku=='TUC0252'].head()

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,sale_price
160,296010,1138342,2017-01-09 23:41:57,"Tucano Nido Hard-Shell Case MacBook Air 13 ""Black",rigid and slicked rubber feet MacBook Air 13 i...,Tucano,TUC0252,Accessories,0.0,1,29.9,24.99
5596,315031,1159793,2017-01-26 09:56:46,"Tucano Nido Hard-Shell Case MacBook Air 13 ""Black",rigid and slicked rubber feet MacBook Air 13 i...,Tucano,TUC0252,Accessories,1296.96,1,29.9,24.99
9679,327408,1187814,2017-02-22 00:30:18,"Tucano Nido Hard-Shell Case MacBook Air 13 ""Black",rigid and slicked rubber feet MacBook Air 13 i...,Tucano,TUC0252,Accessories,53.97,1,29.9,24.99
11327,348074,1231937,2017-04-21 08:44:40,"Tucano Nido Hard-Shell Case MacBook Air 13 ""Black",rigid and slicked rubber feet MacBook Air 13 i...,Tucano,TUC0252,Accessories,29.98,1,29.9,24.99
11538,348607,1233041,2017-04-22 05:41:48,"Tucano Nido Hard-Shell Case MacBook Air 13 ""Black",rigid and slicked rubber feet MacBook Air 13 i...,Tucano,TUC0252,Accessories,29.98,1,29.9,24.99


<div class="alert alert-block alert-danger">
    While investigating the total_paid < sale_price values below it was found that two total_paid values were equal to zero. 
    <br><br>
    Since the regular_price and sale_price values for these two orders are consistent with the others, for which the total_paid = sale_price + €4.99 (shipping fee), for now we will simply set the total_paid equal to €29.98, to get the outliers function to run, and keep these values in mind as we try to solve the problem.
</div>

In [36]:
#one_decimal_data[one_decimal_data.total_paid==0].total_paid = 29.98
one_decimal_data.loc[one_decimal_data.order_id==296010, 'total_paid'] = 29.98
one_decimal_data.loc[one_decimal_data.order_id==301495, 'total_paid'] = 29.98
one_decimal_data[one_decimal_data.order_id.isin([296010, 301495])]

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,sale_price,sale_price_vs_regular_price_percentage_diff
160,296010,1138342,2017-01-09 23:41:57,"Tucano Nido Hard-Shell Case MacBook Air 13 ""Black",rigid and slicked rubber feet MacBook Air 13 i...,Tucano,TUC0252,Accessories,29.98,1,29.9,24.99,16.42
866,301495,1123362,2017-01-03 12:04:16,Svolta Tucano MacBook Pro Sleeve bag / Retina ...,compact case for MacBook / Air 13 and 13 inche...,Tucano,TUC0277,Accessories,29.98,1,29.9,24.99,16.42


In [37]:
def check_for_outliers2(df, reference_col='regular_price', comparison_col='sale_price'):
    ''' 
    Check for outliers to see if the decimal value is in the wrong spot. 
    
    We define an outlier as having a percentage difference between the two price 
    values greater than 3 standard deviations from the mean percentage difference.
    '''
    diff_col_name = reference_col + '_' + comparison_col + '_%diff'
    df[diff_col_name] = _calculate_percentage_difference(df, reference_col, comparison_col)
    
    mean_diff = df[diff_col_name].mean()
    std_diff = df[diff_col_name].std()
    threshold = mean_diff + 3 * std_diff
    
    outliers = df[abs(df[diff_col_name]) > threshold]
    
    return outliers


main_col='total_paid'
comparison_col='sale_price'
outliers2 = check_for_outliers2(one_decimal_data, main_col, comparison_col)

print(f"There are {outliers2.shape[0]} outliers")

outliers2.sort_values('total_paid_sale_price_%diff')

There are 212 outliers


Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,sale_price,sale_price_vs_regular_price_percentage_diff,total_paid_sale_price_%diff
2584,307156,1140724,2017-01-11 11:16:20,Travelstar Kingston Pen Drive 3C Duo micro USB...,64GB pendrive reversible USB-C to USB for Mac ...,Kingston,KIN0123,Accessories,2.98,1,302.803,23.99,92.08,-705.03
92,274043,1121663,2017-01-02 16:30:49,Griffin Magnetic BreakSafe-C USB charging cabl...,Charging cable and connector magnetic fast lib...,Griffin,GRT0425,Accessories,3.98,1,34.99,24.99,28.58,-527.89
3357,309296,1145769,2017-01-14 11:58:50,Travelstar Kingston Pen Drive 3C Duo micro USB...,64GB pendrive reversible USB-C to USB for Mac ...,Kingston,KIN0123,Accessories,3.99,1,302.803,23.99,92.08,-501.25
1197,302589,1130123,2017-01-04 16:51:31,Be.ez LArobe MacBook Pro Retina ONE sleeve 13 ...,Neoprene Sleeve for MacBook Pro Retina 13 inches,Be.ez,BEZ0178,Accessories,4.98,1,29.95,24.99,16.56,-401.81
949,301709,1123836,2017-01-03 15:09:35,Svolta Tucano MacBook Pro Sleeve bag / Retina ...,compact case for MacBook / Air 13 and 13 inche...,Tucano,TUC0277,Accessories,6.98,1,29.9,24.99,16.42,-258.02
3396,309426,1146043,2017-01-14 16:56:51,Bravo Locator Black Trackr,Bluetooth locator objects APP for iPhone,TrackR,TRK0007,unknown,8.98,1,29.99,29.99,0.0,-233.96
1421,303444,1131913,2017-01-06 01:01:47,Samsung Pro + SDHC UHS Class 3 | 32GB,SDHC Memory Card U3 / UHS-I speed of 95MB / 90MB,Samsung,SAM0105,Memory,8.98,1,52.01,28.99,44.26,-222.83
1556,304005,1133051,2017-01-07 01:04:49,"Be.ez LArobe Case for MacBook Pro Retina 15 ""g...",Retina MacBook Pro sleeve 15 inch polyurethane.,Be.ez,BEZ0159,Accessories,8.98,1,34.99,28.99,17.15,-222.83
1574,304106,1133741,2017-01-07 15:15:34,Startech USB adapter to HDMI-C White,Adapter with reversible USB-C connection HDMI ...,Startech,STA0038,Accessories,9.98,1,39.99,29.99,25.01,-200.5
4012,311055,1151189,2017-01-18 12:25:38,Startech USB-C HDMI Adapter Black,Adapter with USB-C reversible HDMI connection ...,Startech,STA0037,Accessories,9.98,1,39.99,29.99,25.01,-200.5


In [38]:
outliers2[outliers2.order_id.isin([307156, 274043, 309296])]

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,sale_price,sale_price_vs_regular_price_percentage_diff,total_paid_sale_price_%diff
92,274043,1121663,2017-01-02 16:30:49,Griffin Magnetic BreakSafe-C USB charging cabl...,Charging cable and connector magnetic fast lib...,Griffin,GRT0425,Accessories,3.98,1,34.99,24.99,28.58,-527.89
2584,307156,1140724,2017-01-11 11:16:20,Travelstar Kingston Pen Drive 3C Duo micro USB...,64GB pendrive reversible USB-C to USB for Mac ...,Kingston,KIN0123,Accessories,2.98,1,302.803,23.99,92.08,-705.03
3357,309296,1145769,2017-01-14 11:58:50,Travelstar Kingston Pen Drive 3C Duo micro USB...,64GB pendrive reversible USB-C to USB for Mac ...,Kingston,KIN0123,Accessories,3.99,1,302.803,23.99,92.08,-501.25


<div class="alert alert-box alert-danger">
    There are rows (above) which must be fixed by moving the decimal right.
</div>

In [39]:
completed_sales[completed_sales.sku=='MAC0121']

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,sale_price
6011,316116,1162133,2017-01-27 21:59:12,Macally Snap-on Case iPhone 6 / 6S metallic red,Case rigid and resistant metallic tones iPhone...,Macally,MAC0121,Accessories,33.41,1,9.95,4.95
16825,370351,1278732,2017-06-30 14:15:57,Macally Snap-on Case iPhone 6 / 6S metallic red,Case rigid and resistant metallic tones iPhone...,Macally,MAC0121,Accessories,847.98,1,9.95,7.99
42609,457539,1483664,2017-12-08 21:41:21,Macally Snap-on Case iPhone 6 / 6S metallic red,Case rigid and resistant metallic tones iPhone...,Macally,MAC0121,Accessories,38.96,1,9.95,4.99
48594,477933,1528572,2017-12-31 10:41:38,Macally Snap-on Case iPhone 6 / 6S metallic red,Case rigid and resistant metallic tones iPhone...,Macally,MAC0121,Accessories,18.92,1,9.95,4.74


<div class="alert alert-box alert-danger">
    There are rows (above) which must be fixed by moving the decimal left.
</div>

In [40]:
completed_sales[completed_sales.order_id.isin([301709, 309426, 303444, 311055, 300572])]

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,sale_price
535,300572,1123325,2017-01-03 11:50:07,Kanex Thunderbolt Cable 2m,Thunderbolt Cable 2m.,Kanex,KAN0025,Accessories,21.99,1,47.99,46.99
949,301709,1123836,2017-01-03 15:09:35,Svolta Tucano MacBook Pro Sleeve bag / Retina ...,compact case for MacBook / Air 13 and 13 inche...,Tucano,TUC0277,Accessories,6.98,1,29.9,24.99
1421,303444,1131913,2017-01-06 01:01:47,Samsung Pro + SDHC UHS Class 3 | 32GB,SDHC Memory Card U3 / UHS-I speed of 95MB / 90MB,Samsung,SAM0105,Memory,8.98,1,52.01,28.99
3396,309426,1146043,2017-01-14 16:56:51,Bravo Locator Black Trackr,Bluetooth locator objects APP for iPhone,TrackR,TRK0007,unknown,8.98,1,29.99,29.99
4012,311055,1151189,2017-01-18 12:25:38,Startech USB-C HDMI Adapter Black,Adapter with USB-C reversible HDMI connection ...,Startech,STA0037,Accessories,9.98,1,39.99,29.99


In [41]:
''' There are no other single-orderline orders to compare this order to so we cannot fix it.'''
single_orderline_single_product[single_orderline_single_product.sku=='KAN0025']

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,sale_price,regular_price_decimal_count,sale_price_decimal_count,total_paid_decimal_count
535,300572,1123325,2017-01-03 11:50:07,Kanex Thunderbolt Cable 2m,Thunderbolt Cable 2m.,Kanex,KAN0025,Accessories,21.99,1,47.99,46.99,1,1,1


In [42]:
''' There are no other single-orderline orders with the same sale_price so we cannot fix it. '''
single_orderline_single_product[single_orderline_single_product.sku=='SAM0105']

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,sale_price,regular_price_decimal_count,sale_price_decimal_count,total_paid_decimal_count
780,301253,1122867,2017-01-03 01:34:02,Samsung Pro + SDHC UHS Class 3 | 32GB,SDHC Memory Card U3 / UHS-I speed of 95MB / 90MB,Samsung,SAM0105,Memory,35.98,1,52.01,31.99,1,1,1
1421,303444,1131913,2017-01-06 01:01:47,Samsung Pro + SDHC UHS Class 3 | 32GB,SDHC Memory Card U3 / UHS-I speed of 95MB / 90MB,Samsung,SAM0105,Memory,8.98,1,52.01,28.99,1,1,1


In [43]:
single_orderline_single_product[single_orderline_single_product.sku=='TRK0007']

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,sale_price,regular_price_decimal_count,sale_price_decimal_count,total_paid_decimal_count
652,300873,1122059,2017-01-02 19:06:03,Bravo Locator Black Trackr,Bluetooth locator objects APP for iPhone,TrackR,TRK0007,unknown,34.98,1,29.99,29.99,1,1,1
1339,303174,1131257,2017-01-05 13:31:53,Bravo Locator Black Trackr,Bluetooth locator objects APP for iPhone,TrackR,TRK0007,unknown,39.98,1,29.99,29.99,1,1,1
2245,306172,1138338,2017-01-09 23:40:17,Bravo Locator Black Trackr,Bluetooth locator objects APP for iPhone,TrackR,TRK0007,unknown,34.98,1,29.99,29.99,1,1,1
2393,306585,1139370,2017-01-10 16:30:41,Bravo Locator Black Trackr,Bluetooth locator objects APP for iPhone,TrackR,TRK0007,unknown,34.98,1,29.99,29.99,1,1,1
3396,309426,1146043,2017-01-14 16:56:51,Bravo Locator Black Trackr,Bluetooth locator objects APP for iPhone,TrackR,TRK0007,unknown,8.98,1,29.99,29.99,1,1,1
3432,309540,1146288,2017-01-14 21:53:54,Bravo Locator Black Trackr,Bluetooth locator objects APP for iPhone,TrackR,TRK0007,unknown,39.98,1,29.99,29.99,1,1,1
3436,309546,1146305,2017-01-14 22:02:01,Bravo Locator Black Trackr,Bluetooth locator objects APP for iPhone,TrackR,TRK0007,unknown,34.98,1,29.99,29.99,1,1,1
8490,323517,1178966,2017-02-13 12:52:16,Bravo Locator Black Trackr,Bluetooth locator objects APP for iPhone,TrackR,TRK0007,unknown,34.98,1,29.99,29.99,1,1,1
8845,324522,1181473,2017-02-15 12:23:41,Bravo Locator Black Trackr,Bluetooth locator objects APP for iPhone,TrackR,TRK0007,unknown,39.98,1,29.99,29.99,1,1,1
9275,325912,1184656,2017-02-18 11:03:48,Bravo Locator Black Trackr,Bluetooth locator objects APP for iPhone,TrackR,TRK0007,unknown,34.98,1,29.99,29.99,1,1,1


<div class="alert alert-box alert-danger">
    There are rows (above) which simply have the wrong total_paid value and cannot be fixed by moving the decimal. 
    <br><br>
    Perhaps orderlines from these orders are missing in the data dump?
    <br><br>
    To fix these we will have to:
    <br>
    - Identify them as outliers
    <br>
    - Delete rows which do not have other orders with the same sale_price. The cannot be fixed.
    <br>
    - Get the mode (most common value) total_paid of other single-orderline orders of the same product, which are not outliers and have the same sale_price.
    <br>
    - Assign this value as the total_paid for the corrupted row
</div>

In [None]:
# def _calculate_percentage_difference(df, reference_col, comparison_col):
#     return ((df[reference_col] - df[comparison_col])/df[reference_col]*100).round(2)

# def _move_decimal_right(row, decimal_col, comparison_col):
#     denominator = 10

#     while True:
#         new_row = row.copy()
#         new_row[decimal_col] /= denominator
             
#         # Check if the new percentage difference is closer to 0
#         if abs(_calculate_percentage_difference(new_row, decimal_col, comparison_col)) > abs(_calculate_percentage_difference(row, decimal_col, comparison_col)):
#             return row
#         else:
#             row = new_row
    
# def move_regular_price_decimal_point_right_wrt_sale_price(df):
#     df = df.apply(lambda row: _move_decimal_right(row, 'regular_price', 'sale_price'), axis=1)
#     return df


# ''' Check the new logic on the one decimal data and rerun the tests '''

# one_decimal_data = single_orderline_single_product[single_orderline_single_product.orderline_id.isin(orderline_ids)].copy()
# one_decimal_data[['regular_price', 'sale_price']] = one_decimal_data[['regular_price', 'sale_price']].astype('float')
# one_decimal_data.drop(['total_paid_decimal_count', 'regular_price_decimal_count', 'sale_price_decimal_count'], axis=1, inplace=True)

# # Fix the decimal points
# one_decimal_data = move_regular_price_decimal_point_right_wrt_sale_price2(one_decimal_data)

# # Round the prices
# one_decimal_data[['total_paid', 'regular_price', 'sale_price']] = one_decimal_data[['total_paid', 'regular_price', 'sale_price']].round()

# # Calculate the discount percentage
# one_decimal_data = add_discount_percentage_col(one_decimal_data)

# # Tests
# sale_greater_than_regular_orderline_ids = pdt.test_regular_greater_or_equal_to_sale(one_decimal_data)
# sale_greater_than_total_orderline_ids = pdt.test_total_greater_or_equal_to_sale(one_decimal_data)

# # Outliers
# outliers = check_for_outliers(one_decimal_data)
# print(f"There are {outliers.shape[0]} outliers")

In [None]:
one_decimal_data[one_decimal_data.order_id.isin([304373, 311465, 281302, 303444, 310031, 274043, 307156, 309296])]

In [None]:
one_decimal_data[one_decimal_data.orderline_id.isin(sale_greater_than_total_orderline_ids)].sort_values('discount_percentage', ascending=False).head()

In [None]:
sale_price_greater_than_regular[(sale_price_greater_than_regular.regular_price < sale_price_greater_than_regular.sale_price) & (sale_price_greater_than_regular.total_paid < sale_price_greater_than_regular.sale_price)]

<div class="alert alert-box alert-info">
    Let's split the data at the decimal point and examine whether we can determine any patterns by the number of digits after the decimal point.
</div>

## Decimal count = 1 = 0 = 1

<div class="alert alert-box alert-info">
    total_paid_decimal_count==1  & regular_price_decimal_count==0 & sale_price_decimal_count==1
</div>

In [None]:
1. Turn them into floats

In [None]:
order_ids = prices[
((prices.regular_price_decimal_count == 0 )| (prices.regular_price_decimal_count == 1)) &
((prices.promo_price_decimal_count == 0) | (prices.promo_price_decimal_count == 1)) &
((prices.sale_price_decimal_count == 0) | (prices.sale_price_decimal_count == 1)) 
].orderline_id

zero_one_decimal_data = completed_sales[completed_sales.orderline_id.isin(order_ids)].copy()
zero_one_decimal_data

## Decimal count == 0 

<div class="alert alert-box alert-info">
    regular_price_decimal_count == promo_price_decimal_count == sale_price_decimal_count == 0
</div>
<div class="alert alert-box alert-warning">
    There are no sale_price values with zero decimal points.
    <br>
    The regular_price and promo_price values with no decimal points appear to have been accidentally saved as ints instead of floats.
    <br>
    Let's select only these values and see if this logic is correct
</div>


In [None]:
order_ids = prices[
((prices.regular_price_decimal_count == 0)) &
((prices.promo_price_decimal_count == 0)) &
((prices.sale_price_decimal_count == 0) | (prices.sale_price_decimal_count == 1)) 
].orderline_id

zero_decimal_count = completed_sales[completed_sales.orderline_id.isin(order_ids)].copy()
zero_decimal_count

In [None]:
# Transform the three columns to floats
zero_decimal_count[['regular_price', 'promo_price', 'sale_price']] = zero_decimal_count[['regular_price', 'promo_price', 'sale_price']].astype('float')

# Run the tests
incorrect_reg_pro_ids = pdt.test_regular_greater_or_equal_to_promo(zero_decimal_count)

In [None]:
display(zero_decimal_count[zero_decimal_count.regular_price >= zero_decimal_count.promo_price].head())

<div class="alert alert-box alert-success">
    After viewing the entire df (shown above in truncated form) we can conclude that our assumption is basically correct.
</div>

<div class="alert alert-box alert-danger">
    However, some of the sale_price values still do not match the regular_price or the promo_price.
    <br>
    <br>
    Furthermore, even though the total_price values only ever had one decimal point, for some of them it is clearly in the wrong place.
</div>
<div class="alert alert-box alert-info">
    To clean the sale_price values we must first clean the total_price values as it is sometimes equal to neither regular_price nor promo_price but is close to total_paid.
    <br>
    <br>
    To this end we must select a subset where all the orderlines in the order are within the zero_decimal_count df. 
    <br>
    If the order we are examining has other otherlines which are in completed_sales but not in zero_decimal_count then we cannot be sure we have all the data to confirm the total_paid value is correct.
    <br>
    <br>
    We must check the distribution of total_price values wrt to sale_price, then promo_price and regular_price to determine whether the difference can be explained by shipping costs or whether the decimal point is in the wrong place.
    <br>
    <br>
    We can then analyse the distribution of sale_price values wrt to regular_price and wrt promo_price and examine the outliers.
</div>

In [None]:
''' Filter out orders which do not have all their orderlines in zero_decimal_count'''

# Get the count of orderlines per order from completed_sales
completed_sales_orderline_counts = completed_sales.groupby('order_id')['orderline_id'].count().reset_index()
completed_sales_orderline_counts.rename(columns={'orderline_id': 'completed_count'}, inplace=True)

# Do the same for zero_decimal_count
zero_dec_orderline_counts = zero_decimal_count.groupby('order_id')['orderline_id'].count().reset_index()
zero_dec_orderline_counts.rename(columns={'orderline_id': 'zero_count'}, inplace=True)

# Merge to find orders with matching counts
valid_orders = pd.merge(completed_sales_orderline_counts, zero_dec_orderline_counts, on='order_id')
valid_orders = valid_orders[valid_orders['completed_count'] == valid_orders['zero_count']]

# Filter zero_decimal_count for these valid order_ids
zero_decimal_count__complete_orders = zero_decimal_count[zero_decimal_count['order_id'].isin(valid_orders['order_id'])].copy()

print("zero_decimal_count.shape:", zero_decimal_count.shape)
print("zero_decimal_count_subset.shape:", zero_decimal_count__complete_orders.shape)

In [None]:
valid_orders[valid_orders]

In [None]:
''' Multiply regular_price, promo_price and sale_price by the product_quantity to compare them to the total_paid. '''

zero_decimal_count__complete_orders['regular_price_total'] = zero_decimal_count__complete_orders.regular_price*zero_decimal_count__complete_orders.product_quantity
zero_decimal_count__complete_orders['promo_price_total'] = zero_decimal_count__complete_orders.promo_price*zero_decimal_count__complete_orders.product_quantity
zero_decimal_count__complete_orders['sale_price_total'] = zero_decimal_count__complete_orders.sale_price*zero_decimal_count__complete_orders.product_quantity

''' First check the orders with only one orderline for simplicity '''
zero_decimal_count__complete_orders__single_orderline

In [None]:

incorrect_pro_sale_ids = pdt.test_promo_greater_or_equal_to_sale(zero_decimal_count)

## Evaluate corrupted data with multiple decimal points

### regular_price_decimal_count==1	promo_price_decimal_count==0	sale_price_decimal_count==2

In [None]:
order_ids = prices[
(prices.regular_price_decimal_count == 1) &
(prices.promo_price_decimal_count == 0) &
(prices.sale_price_decimal_count == 2) 
].orderline_id

data = completed_sales[completed_sales.orderline_id.isin(order_ids)].copy()
data

### regular_price_decimal_count==1	promo_price_decimal_count==1	sale_price_decimal_count==2

In [None]:
order_ids = prices[
(prices.regular_price_decimal_count == 1) &
(prices.promo_price_decimal_count == 1) &
(prices.sale_price_decimal_count == 2) 
].orderline_id

data = completed_sales[completed_sales.orderline_id.isin(order_ids)].copy()
data

### sale_price_decimal_count == 2

In [None]:
order_ids = prices[
(prices.sale_price_decimal_count == 2) 
].orderline_id

data = completed_sales[completed_sales.orderline_id.isin(order_ids)].copy()
data

<div class="alert alert-block alert-danger">
    FOR SALE_PRICE_DEC_COUNT: YOU NEED TO SPLIT THE SALE PRICE COLUMN AND FIGURE OUT IF THERE ARE ALWAYS 2 DIGITS AFTER THE DEC AND 2 BETWEEN THE TWO. THEN YOU MIGHT BE ABLE TO FIX THEM ALL IN ONE GO.

</div>

### regular_price_decimal_count == 0

In [None]:
# Let's take a look at the values with zero decimal points. It is possible that these have been incorrectly stored as ints.
reg_price_zero_dec_ids = prices[(prices.regular_price_decimal_count == 0)].orderline_id

zero_dec_test_data = completed_sales[completed_sales.orderline_id.isin(reg_price_zero_dec_ids)].copy()
zero_dec_test_data

In [None]:
pro_price_zero_dec_ids = prices[(prices.promo_price_decimal_count == 0)].orderline_id

zero_dec_test_data = completed_sales[completed_sales.orderline_id.isin(pro_price_zero_dec_ids)].copy()
zero_dec_test_data

In [None]:
reg_pro_price_zero_dec_ids = prices[(prices.regular_price_decimal_count == 0) & (prices.promo_price_decimal_count == 0)].orderline_id

zero_dec_test_data = completed_sales[completed_sales.orderline_id.isin(zero_dec_ids)].copy()
zero_dec_test_data

<div class="alert alert-block alert-success">
    The
</div>

### Check sale_price

In [None]:
temp = prices.copy()
temp[temp.sale_price_decimal_count!=1]

In [None]:
temp = (
    temp.assign(
        sale_price_split = lambda x: x['sale_price'].str.split(r'\.'),
        decimal_places = lambda x: len(x['sale_price_split'][1]))
)
temp.head(5)

In [None]:
temp[temp.decimal_places != 2]

### Transform orderlines.unit_price to floats

In [None]:
def transform_unit_price_to_floats(df):
    """
    Transform the orderlines.unit_price price column to floats.
    Some of the values have two decimal points. 
    For these values we will remove the leftmost decimal and transform all values to floats.
    The correct position of the decimal point will be determined by merging orderlines, 
    products, orders and brands, and comparing the price values.
    
    Args:
        df (pd.DataFrame): The orderlines data
    
    Returns:
        pd.DataFrame: The orderlines data with the unit_price column transformed from str to float values.
    """
    return (
        df.assign(unit_price = df.unit_price.str.split('.')
                  .apply(lambda x : x[0]+x[1]+'.'+x[2] if len(x)==3 else x[0]+'.'+ x[1])
                  .astype(float)
        )
    )
    

In [None]:
completed_sales['regular_price'].str.count(r'\.')

In [None]:
def split_str_on_dots_and_append_decimal(df, col):
    '''Remove the decimal points from the strings and append .00'''
    return df.assign(promo_price=df[col]
                     .str.split('.')
                     .str.join('')
                     .apply(lambda x: x+'.00')
                    )

def _insert_decimal_at_string_position(s, pos):
    '''Insert a decimal point at a given position in a string'''
    s = s.split('.')
    s = s[0] + s[1]
    s = s[:pos]+'.'+s[pos:]
    return s 
    
def _insert_decimal_in_regular_price(row):
    '''
    Keep moving the decimal point towards the end of the regular_price 
    string until the sale_price is lower or equal to the price.
    Then transform the regular_price string to a float and round it to two decimal places.
    '''
    decimal_position = 1
    row.regular_price = _insert_decimal_at_string_position(row.regular_price, decimal_position)
    
    while float(row.regular_price) < row.sale_price:
        if round(float(row.regular_price), 0) == round(row.sale_price, 0):
            row.sale_price = round(float(row.price), 2)
            return round(float(row.regular_price), 2)
        else:
            row.regular_price = _insert_decimal_at_string_position(row.regular_price, decimal_position)
            decimal_position += 1

    return round(float(row.price), 2)

def transform_regular_price_to_float(df):
    df.regular_price = [_insert_decimal_in_regular_price(row) for index, row in df.iterrows()]
    return df

# Solution attempt 1
Move the decimal points until reg >= promo >= sale and run the tests to see if it works

In [None]:
def start_pipeline(df):
    '''Make a copy of the pipeline to prevent corrupting the original data'''
    return df.copy()

def split_str_on_dots_and_append_decimal(df, col):
    '''Remove the decimal points from the strings and append .00'''
    return df.assign(promo_price=df[col]
                     .str.split('.')
                     .str.join('')
                     .apply(lambda x: x+'.00')
                    )

def _insert_decimal_at_string_position(s, pos):
    '''Insert a decimal point at a given position in a string'''
    s = s.split('.')
    s = s[0] + s[1]
    s = s[:pos]+'.'+s[pos:]
    return s 
    
def _insert_decimal_in_regular_price(row):
    '''
    Keep moving the decimal point towards the end of the regular_price 
    string until the sale_price is lower or equal to the price.
    Then transform the regular_price string to a float and round it to two decimal places.
    '''
    decimal_position = 1
    row.regular_price = _insert_decimal_at_string_position(row.regular_price, decimal_position)
    
    while float(row.regular_price) < row.sale_price:
        if round(float(row.regular_price), 0) == round(row.sale_price, 0):
            row.sale_price = round(float(row.price), 2)
            return round(float(row.regular_price), 2)
        else:
            row.regular_price = _insert_decimal_at_string_position(row.regular_price, decimal_position)
            decimal_position += 1

    return round(float(row.price), 2)

def transform_regular_price_to_float(df):
    df.regular_price = [_insert_decimal_in_regular_price(row) for index, row in df.iterrows()]
    return df

def _insert_decimal_in_promo_price(row, decimal_position = -2):
    '''
    If the euro value of the regular_price is equal to the euro value of the promo_price, 
    set the promo_price equal to regular_price and return it.
    This is because some promo_prices are slightly larger than their equivalent prices, e.g. 12.95 - 12.99
    
    Otherwise, keep moving the decimal point towards the start of the string until the promo_price is lower than the price.
    Then transform the string to a float and round it to two decimal places.
    '''
    
    while float(row.promo_price) > row.regular_price:
        if round(float(row.promo_price), 0) == round(row.regular_price, 0):
            row.promo_price = row.regular_price
            return row.promo_price
        else:
            row.promo_price = _insert_decimal_at_string_position(row.promo_price, decimal_position)
            decimal_position += -1
    
    return round(float(row.promo_price), 2)

def transform_promo_price_to_floats(df):
    df.promo_price = [_insert_decimal_in_promo_price(row) for index, row in df.iterrows()]
    return df
    
def calculate_products_discounts(df):
    return df.assign(
        discount=round(df.price - df.promo_price, 2),
        discount_pc = round((df.price - df.promo_price)/df.price * 100, 2)
    )

def calculate_sales_discounts(df):
    return (df
            .assign(
                sales_discount=round(df.price - df.sale_price, 2),
                sales_discount_pc = round((df.price - df.sale_price)/df.price * 100, 2)
            )
           )

temp = completed_sales.copy()

temp = (temp
        .pipe(start_pipeline)
        .pipe(split_str_on_dots_and_append_decimal, 'regular_price')
        #.pipe(split_and_join_regular_prices)
        .pipe(transform_regular_price_to_float)
        .pipe(split_str_on_dots_and_append_decimal, 'promo_price')
        #.pipe(split_and_join_promo_prices)
        .pipe(transform_promo_price_to_floats)
        .pipe(calculate_products_discounts)
        .pipe(calculate_sales_discounts)
)

temp