# Assumptions

* The total_paid must be greater or equal to the sum of product_quantity*sale_price for all orderlines in an order
* regular_price must be greater or equal to sale_price
* regular_price must be greater or equal to promo_price
* promo_price could be less than sale_price if the product was sold at the regular_price
* It is possible that the difference between the total_paid value and sum(product_quantity*sale_price) is the shipping cost

In [1]:
import re
import numpy as np
import pandas as pd
import sys
import os

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

import data_utils
import price_debugging_tests as pdt

# Set maximum number of rows to display
pd.set_option('display.max_rows', 1000)

# Import and merge data

In [2]:
# Import the data
orders = data_utils.clean_orders(data_path="../../data/")
orderlines = data_utils.clean_orderlines(data_path="../../data/")
products = data_utils.clean_products(data_path="../../data/")
brands = data_utils.clean_brands(data_path="../../data/")

# Merge the data
completed_sales =  data_utils.merge_data(orders, orderlines, products, brands)

5 missing values were removed from orders.
This represents 0.0022% of the data.


0 missing values were removed from orderlines.
This represents 0.00% of the data.


8792 missing values were removed from products
This represents 45.49% of the data.


0 missing values were removed from brands.
This represents 0.00% of the data.




# Price data exploration
## Exclude orders with multiple orderlines and multiple products
<div class="alert alert-box alert-info">
    For simplicity's sake we will first exclude orders with multiple orderlines and product_quantity >= 1 so we can easily compare the prices the total_paid.
    <br>
    Hopefully we will be able to determine patterns which can later be applied to all the data. 
</div>

In [3]:
# Get the count of orderlines per order from completed_sales
completed_sales_orderline_counts = completed_sales.groupby('order_id')['orderline_id'].count().reset_index()
completed_sales_orderline_counts.rename(columns={'orderline_id': 'orderline_count'}, inplace=True)

''' Examine the distribution of orderlines per order. '''
# Group by completed_count and count how many orders have the same number of orderlines
orderline_distribution = completed_sales_orderline_counts.groupby('orderline_count')['order_id'].count().reset_index()
orderline_distribution.rename(columns={'order_id': 'order_count'}, inplace=True)

# Calculate the percentage of each orderline count relative to the total number of orders
total_orders = completed_sales_orderline_counts['order_id'].nunique()
orderline_distribution['percentage'] = (orderline_distribution['order_count'] / total_orders) * 100

orderline_distribution

Unnamed: 0,orderline_count,order_count,percentage
0,1,36055,77.770108
1,2,7100,15.314596
2,3,2107,4.544768
3,4,688,1.484006
4,5,261,0.562973
5,6,82,0.176873
6,7,30,0.06471
7,8,18,0.038826
8,9,12,0.025884
9,10,5,0.010785


In [4]:
''' Exclude orders with more than one orderline. '''
orders_with_one_orderline = completed_sales_orderline_counts[completed_sales_orderline_counts.orderline_count==1]
single_orderline_orders = completed_sales[completed_sales.order_id.isin(orders_with_one_orderline.order_id)]
single_orderline_orders.shape

(36055, 12)

<div class="alert alert-box alert-success">
    Good. We still have 78% of the data.
</div>

In [5]:
''' Exclude orders with product_quantity > 1 so we can directly compare the prices with total_paid. '''

single_orderline_single_product = single_orderline_orders[single_orderline_orders.product_quantity==1].copy()
single_orderline_single_product.shape[0]/total_orders*100

72.17273139060849

<div class="alert alert-box alert-success">
    We still have 72% of the data.
</div>

## Explore corrupted decimal values

In [6]:
''' Helper functions '''

def add_discount_percentage_col(df):
    df['discount_percentage'] = (df.regular_price - df.sale_price)/df.regular_price*100
    return df

def check_for_outliers(df):
    ''' 
    Check for outliers to see if the decimal value was in the wrong spot. 
    
    We define an outlier as having a discount percentage in regular_price and sale_price 
    greater than 3 standard deviations from the mean discount percentage
    '''

    mean_diff = df['discount_percentage'].mean()
    std_diff = df['discount_percentage'].std()
    threshold = mean_diff + 3 * std_diff
    
    outliers = df[df['discount_percentage'] > threshold]
    
    return outliers

### Count decimal points in price values - completed_sales

In [7]:
price_data = completed_sales.copy()

price_data['regular_price_decimal_count'] = price_data['regular_price'].str.count(r'\.')
price_data['sale_price_decimal_count'] = price_data['sale_price'].str.count(r'\.')
price_data['total_paid_decimal_count'] = price_data['sale_price'].str.count(r'\.')

price_data[['total_paid_decimal_count', 'regular_price_decimal_count', 'sale_price_decimal_count']].apply(pd.Series.value_counts)

Unnamed: 0,total_paid_decimal_count,regular_price_decimal_count,sale_price_decimal_count
0,,18611,
1,58120.0,42264,58120.0
2,3552.0,797,3552.0


### Check the distribution of decimal points across the price data - completed_sales

In [8]:
# Group by the three decimal count columns and get the count & percentage for each group
grouped_counts = price_data.groupby(
    ['total_paid_decimal_count', 'regular_price_decimal_count', 'sale_price_decimal_count']
).size().reset_index(name='count')

# Calculate the percentage of each group relative to the total number of rows
total_rows = len(price_data)
grouped_counts['percentage'] = (grouped_counts['count'] / total_rows) * 100

# Display the resulting dataframe
grouped_counts.sort_values('percentage', ascending=False)

Unnamed: 0,total_paid_decimal_count,regular_price_decimal_count,sale_price_decimal_count,count,percentage
1,1,1,1,41135,66.699637
0,1,0,1,16412,26.611752
3,2,0,2,2199,3.565638
4,2,1,2,1129,1.830652
2,1,2,1,573,0.929109
5,2,2,2,224,0.363212


### Count decimal points in price values - single_orderline_single_product

In [9]:
single_orderline_single_product['regular_price_decimal_count'] = single_orderline_single_product['regular_price'].str.count(r'\.')
single_orderline_single_product['sale_price_decimal_count'] = single_orderline_single_product['sale_price'].str.count(r'\.')
single_orderline_single_product['total_paid_decimal_count'] = single_orderline_single_product['sale_price'].str.count(r'\.')

single_orderline_single_product[['total_paid_decimal_count', 'regular_price_decimal_count', 'sale_price_decimal_count']].apply(pd.Series.value_counts)

Unnamed: 0,total_paid_decimal_count,regular_price_decimal_count,sale_price_decimal_count
0,,10233,
1,31050.0,22760,31050.0
2,2410.0,467,2410.0


### Check the distribution of decimal points across the price data - single_orderline_single_product

In [10]:
# Group by the three decimal count columns and get the count & percentage for each group
grouped_counts = single_orderline_single_product.groupby(
    ['total_paid_decimal_count', 'regular_price_decimal_count', 'sale_price_decimal_count']
).size().reset_index(name='count')

# Calculate the percentage of each group relative to the total number of rows
total_rows = len(price_data)
grouped_counts['percentage'] = (grouped_counts['count'] / total_rows) * 100

# Display the resulting dataframe
grouped_counts.sort_values('percentage', ascending=False)

Unnamed: 0,total_paid_decimal_count,regular_price_decimal_count,sale_price_decimal_count,count,percentage
1,1,1,1,21971,35.625568
0,1,0,1,8767,14.215527
3,2,0,2,1466,2.377092
4,2,1,2,789,1.279349
2,1,2,1,312,0.505902
5,2,2,2,155,0.25133


## Evaluate potentially uncorrupted values

### Decimal count = 1 = 1 = 1
<div class="alert alert-box alert-info">
    Since 67% of the completed_sales data has total_paid_decimal_count = regular_price_decimal_count == sale_price_decimal_count == 1, cleaning this data will be a good first step.
</div>

In [29]:
orderline_ids = single_orderline_single_product[
    (single_orderline_single_product.total_paid_decimal_count == 1) &
    (single_orderline_single_product.regular_price_decimal_count == 1) &
    (single_orderline_single_product.sale_price_decimal_count == 1)
].orderline_id

one_decimal_data = single_orderline_single_product[single_orderline_single_product.orderline_id.isin(orderline_ids)].copy()
one_decimal_data[['regular_price', 'sale_price']] = one_decimal_data[['regular_price', 'sale_price']].astype('float')
# drop the decimal count cols to clean up the output
one_decimal_data.drop(['total_paid_decimal_count', 'regular_price_decimal_count', 'sale_price_decimal_count'], axis=1, inplace=True)

# Add a discount percentage col to detect incorrect values and outliers
one_decimal_data = add_discount_percentage_col(one_decimal_data)

#### Problem 1: sale_price < total_paid

<div class="alert alert-box alert-danger">
    There are cases where the sale_price is greater than the total_paid value.
</div>

In [30]:
one_decimal_data[one_decimal_data.order_id.isin([301495, 281302, 274043, 287311, 300950, 296010, 297572])]

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,sale_price,discount_percentage
92,274043,1121663,2017-01-02 16:30:49,Griffin Magnetic BreakSafe-C USB charging cabl...,Charging cable and connector magnetic fast lib...,Griffin,GRT0425,Accessories,3.98,1,34.99,24.99,28.579594
111,281302,1142023,2017-01-12 01:00:05,Startech USB Adapter VGA-C Blanco,Adapter with reversible USB connection VGA-C (...,Startech,STA0036,Accessories,16.98,1,53.99,29.99,44.452676
131,287311,1164035,2017-01-29 22:54:50,Sandisk iXpand Lightning to USB 3.0 64GB,64GB storage unit for iPhone and iPad,SanDisk,SAN0134,Accessories,54.99,1,69.99,59.99,14.287755
160,296010,1138342,2017-01-09 23:41:57,"Tucano Nido Hard-Shell Case MacBook Air 13 ""Black",rigid and slicked rubber feet MacBook Air 13 i...,Tucano,TUC0252,Accessories,0.0,1,29.9,24.99,16.421405
182,297572,1162207,2017-01-27 23:53:16,Twelve South BookArc support for MacBook Pro /...,Aluminum support in an arc Retina MacBook Air.,Twelve South,TWS0093,Accessories,51.99,1,59.99,56.99,5.000833
684,300950,1122232,2017-01-02 20:20:08,Apple Mac mini Core i5 14GHz | 8GB RAM | 500GB...,Mac mini desktop computer 8GB RAM 500GB SSD (M...,Pack,PAC0594,Memory,777.99,1,1135.59,782.99,31.049939
866,301495,1123362,2017-01-03 12:04:16,Svolta Tucano MacBook Pro Sleeve bag / Retina ...,compact case for MacBook / Air 13 and 13 inche...,Tucano,TUC0277,Accessories,0.0,1,29.9,24.99,16.421405


<div class="alert alert-box alert-danger">
    The first case above seems to have the decimal point in the wrong place in total_paid.
    <br><br>
    Some of the prices are relatively similar, for example, the 3rd, 5th and 6th rows above, and could possibly be corrected by setting the total_paid equal to the sale_price.
    <br><br>
    Some of the data is too corrupted to be corrected: The 2nd, 4th and 7th rows.
</div>

In [31]:
''' What percentage of the data has total_paid values lower than the sale_price? '''
total_orders = one_decimal_data.shape[0]
sale_price_too_high = one_decimal_data[one_decimal_data.total_paid < one_decimal_data.sale_price].shape[0]
print(f"The sale_price is greater than the total_paid for {sale_price_too_high/total_orders*100:.2f}% of the orders. ")

The sale_price is greater than the total_paid for 0.68% of the orders. 


<div class="alert alert-box alert-success">
    This problem appears to apply to a very small percentage of the data and, for now, we will assume that the simplicest way to deal with this problem is to delete these rows.
</div>

#### Problem 2: regular_price < sale_price

<div class="alert alert-box alert-danger">
    There are orderlines where the regular_price is lower than the sale_price.
</div>

In [32]:
one_decimal_data[one_decimal_data.order_id.isin([267375])]

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,sale_price,discount_percentage
82,267375,1140087,2017-01-10 22:48:08,"Pure Nude Ultraslim 03 ""7/8 Transparent iPhone...",transparent and flexible cover with 03mm thick...,Puro,PUR0150,Accessories,17.98,1,12.95,12.99,-0.30888


In [33]:
''' What percentage of the data has regular_price values lower than the sale_price? '''
total_orders = one_decimal_data.shape[0]
sale_price_too_high = one_decimal_data[one_decimal_data.regular_price < one_decimal_data.sale_price].shape[0]
print(f"The sale_price is greater than the regular_price for {sale_price_too_high/total_orders*100:.2f}% of the orders. ")

The sale_price is greater than the regular_price for 3.30% of the orders. 


In [34]:
''' 
It appears that for many of these cases the difference is minor, as seen above. 
Let's round the values and see if it makes a difference. 
'''

sale_price_greater_than_regular = one_decimal_data[one_decimal_data.regular_price < one_decimal_data.sale_price].copy()

sale_price_greater_than_regular.sale_price = sale_price_greater_than_regular.sale_price.round()
sale_price_greater_than_regular.regular_price = sale_price_greater_than_regular.regular_price.round()
sale_price_greater_than_regular.total_paid = sale_price_greater_than_regular.total_paid.round()

rounded_sale_price_too_high = sale_price_greater_than_regular[sale_price_greater_than_regular.regular_price < sale_price_greater_than_regular.sale_price].shape[0]
print(f"The sale_price is greater than the regular_price for {rounded_sale_price_too_high/total_orders*100:.2f}% of the orders. ")

The sale_price is greater than the regular_price for 1.18% of the orders. 


In [35]:
'''
There are orderlines where there is a significant difference between the regular_price and sale_price.
It seems that the regular_price saved in products.csv has changed over time.
'''
sale_price_greater_than_regular[sale_price_greater_than_regular.regular_price < sale_price_greater_than_regular.sale_price].head()

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,sale_price,discount_percentage
272,299829,1119702,2017-01-01 17:50:48,LaCie Porsche Design Desktop Drive 8TB USB 3.0...,External Hard Drive 8TB 35-inch USB 3.0 for Ma...,LaCie,LAC0171,Memory,282.0,1,250.0,275.0,-10.200408
296,299909,1119872,2017-01-01 19:50:09,LaCie Porsche Design Desktop Drive 5TB USB 3.0...,External Hard Drive 5TB 35-inch USB 3.0 for Ma...,LaCie,LAC0159,Memory,184.0,1,175.0,177.0,-0.971484
490,300464,1121064,2017-01-02 12:09:42,Withings Activite Steel Black Dial White,Smart Clock for iPhone and iPad,Withings,WIT0034,Smartwatch,153.0,1,130.0,153.0,-17.729896
506,300497,1121164,2017-01-02 12:44:12,Full screen repair iPhone 5,Repair service including parts and labor for i...,Repair,REP0134,Repairs & warranties,100.0,1,60.0,90.0,-50.008335
679,300941,1122210,2017-01-02 20:08:58,Withings Activite Steel Black Dial White,Smart Clock for iPhone and iPad,Withings,WIT0034,Smartwatch,153.0,1,130.0,153.0,-17.729896


<div class="alert alert-box alert-success">
    This solution seems to have worked relatively well.
    <br><br>
    It appears that for the remaining values the regular price has changed since the sale took place.
</div>

#### Problem 3: regular_price changing over time

<div class="alert alert-box alert-info">
    We still have to solve the problem of the regular_price being less than the sale_price in some instances, so that the discount percentage can be calculated correctly.
    <br>
    Let's check the values above and see if the regular prices have changed over time.
</div>

In [36]:
'''
LAC0159 - 16 rows, only the first value is incorrect. The sale_price is €1.70 greater than the regular_price
WIT0034 - 2 rows, total_paid = sale_price
REP0134 - 3 rows, total_paid = sale_price + 9.99 (shipping fee)
WIT0034 - 2 rows, total_paid = sale_price

LAC0171 - 108 rows, the first 6 rows have sale_price > regular_price. After that all the sale_prices are below the regular price, €249.99.
          However, the sale_price values in the first 6 rows (shown below) vary between €302.99 and €269.79.
          How do we know if €302.9 was the regular_price at this time and the other 5 values represent discount sales?
          The 4th, 5th and 6th rows have a sale_price of €283.99. 
          This could signify that the data is not corrupted but rather that this was a discount or a price change.
'''

completed_sales[completed_sales.sku.isin(['LAC0171'])].sort_values('date').head(10)

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,sale_price
272,299829,1119702,2017-01-01 17:50:48,LaCie Porsche Design Desktop Drive 8TB USB 3.0...,External Hard Drive 8TB 35-inch USB 3.0 for Ma...,LaCie,LAC0171,Memory,282.48,1,249.99,275.49
3085,308551,1143957,2017-01-13 10:00:40,LaCie Porsche Design Desktop Drive 8TB USB 3.0...,External Hard Drive 8TB 35-inch USB 3.0 for Ma...,LaCie,LAC0171,Memory,309.98,1,249.99,302.99
6920,318532,1168266,2017-01-31 23:10:16,LaCie Porsche Design Desktop Drive 8TB USB 3.0...,External Hard Drive 8TB 35-inch USB 3.0 for Ma...,LaCie,LAC0171,Memory,276.78,1,249.99,269.79
6970,318707,1168710,2017-02-01 10:38:02,LaCie Porsche Design Desktop Drive 8TB USB 3.0...,External Hard Drive 8TB 35-inch USB 3.0 for Ma...,LaCie,LAC0171,Memory,290.98,1,249.99,283.99
7215,319407,1170172,2017-02-02 23:49:53,LaCie Porsche Design Desktop Drive 8TB USB 3.0...,External Hard Drive 8TB 35-inch USB 3.0 for Ma...,LaCie,LAC0171,Memory,325.97,1,249.99,283.99
7218,319416,1170189,2017-02-03 00:34:15,LaCie Porsche Design Desktop Drive 8TB USB 3.0...,External Hard Drive 8TB 35-inch USB 3.0 for Ma...,LaCie,LAC0171,Memory,295.98,1,249.99,283.99
12453,351854,1239836,2017-05-02 14:15:50,LaCie Porsche Design Desktop Drive 8TB USB 3.0...,External Hard Drive 8TB 35-inch USB 3.0 for Ma...,LaCie,LAC0171,Memory,242.98,1,249.99,235.99
12476,351938,1240025,2017-05-02 18:17:42,LaCie Porsche Design Desktop Drive 8TB USB 3.0...,External Hard Drive 8TB 35-inch USB 3.0 for Ma...,LaCie,LAC0171,Memory,235.99,1,249.99,235.99
12493,351987,1240131,2017-05-02 20:07:28,LaCie Porsche Design Desktop Drive 8TB USB 3.0...,External Hard Drive 8TB 35-inch USB 3.0 for Ma...,LaCie,LAC0171,Memory,235.99,1,249.99,235.99
12682,352663,1241533,2017-05-04 14:00:16,LaCie Porsche Design Desktop Drive 8TB USB 3.0...,External Hard Drive 8TB 35-inch USB 3.0 for Ma...,LaCie,LAC0171,Memory,235.99,1,249.99,235.99


<div class="alert alert-box alert-danger">
    We will have to examine the amount of data this problem affects after fixing as many of the prices as we can.
    <br>
    We can then decide if we wish to adjust the values or delete the orders entirely.
</div>

#### Problem 4: Incorrectly placed decimal points

In [37]:
'''
There are values where the regular_price has been corrupted.
'''
one_decimal_data[one_decimal_data.order_id.isin([496339])]

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,sale_price,discount_percentage
53601,496339,1574664,2018-01-22 20:18:56,Fibaro Single Switch embedded module,embedded module to control your consumption ou...,Fibaro,FIB0011,Accessories,55.98,1,599.918,56.99,90.500368


In [63]:
''' Let's refresh the data and round the values so that we can detect outliers using the discount_percentage '''

orderline_ids = single_orderline_single_product[
    (single_orderline_single_product.total_paid_decimal_count == 1) &
    (single_orderline_single_product.regular_price_decimal_count == 1) &
    (single_orderline_single_product.sale_price_decimal_count == 1)
].orderline_id

one_decimal_data = single_orderline_single_product[single_orderline_single_product.orderline_id.isin(orderline_ids)].copy()
one_decimal_data[['regular_price', 'sale_price']] = one_decimal_data[['regular_price', 'sale_price']].astype('float')
one_decimal_data[['total_paid', 'regular_price', 'sale_price']] = one_decimal_data[['total_paid', 'regular_price', 'sale_price']].round()
one_decimal_data.drop(['total_paid_decimal_count', 'regular_price_decimal_count', 'sale_price_decimal_count'], axis=1, inplace=True)

# Add a discount percentage col to detect incorrect values and outliers
one_decimal_data = add_discount_percentage_col(one_decimal_data)

# Tests
sale_greater_than_regular_orderline_ids = pdt.test_regular_greater_or_equal_to_sale(one_decimal_data)
sale_greater_than_total_orderline_ids = pdt.test_total_greater_or_equal_to_sale(one_decimal_data)

# Outliers
outliers = check_for_outliers(one_decimal_data)
print(f"There are {outliers.shape[0]} outliers")

There are corrupted values in regular_price which are less than their corresponding sale_price values.
This respresents 1.18% of the data.

There are corrupted values in total_paid which are less than their corresponding sale_price values.
This respresents 0.47% of the data.

There are 304 outliers


In [64]:
''' A LOT OF THESE VALUES CLEARLY HAVE THE DECIMAL POINT ONE PLACE TOO FAR TO THE LEFT IN REGULAR_PRICE'''
outliers.sort_values('discount_percentage', ascending=False).head()

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,sale_price,discount_percentage
53630,496421,1574841,2018-01-22 22:35:46,Open - Piece Internal Battery iPhone 6,Replacement internal battery for Apple iPhone 6,Replacement,PIE0034-A,Accessories,16.0,1,350.0,12.0,96.571429
19152,376976,1291907,2017-07-13 18:00:53,Crucial memory Mac 8GB (2x4GB) SO-DIMM DDR3 16...,8GB RAM (2x4GB) SO-DIMM 1600MHZ iMac and Macbo...,Crucial,CRU0025-2,unknown,85.0,1,980.0,65.0,93.367347
24270,393603,1326648,2017-08-28 19:19:14,Open - Elgato Eve and Energy Plug Power Wirele...,Control on and off and sensor power consumptio...,Elgato,ELG0033-A,Smartwatch,37.0,1,500.0,37.0,92.6
24453,394043,1327664,2017-08-29 21:04:10,Open - Elgato Eve and Energy Plug Power Wirele...,Control on and off and sensor power consumptio...,Elgato,ELG0033-A,Smartwatch,37.0,1,500.0,37.0,92.6
24293,393659,1326788,2017-08-28 22:31:30,Open - Elgato Eve and Energy Plug Power Wirele...,Control on and off and sensor power consumptio...,Elgato,ELG0033-A,Smartwatch,37.0,1,500.0,37.0,92.6


<div class="alert alert-block alert-danger">
    After visually examining all the values in the above dataframe (shown truncated), it is clear that the decimal point in these regular_price values has been placed one point too far to the right.
    <br><br>
    This is fixable.
</div>

In [65]:
def _move_decimal_right(row, decimal_col, comparison_col):
    denominator = 10
    while row[decimal_col] >= row[comparison_col]:
        if row[decimal_col] > row[comparison_col]:
            row[decimal_col] /= denominator
    return row[decimal_col] * denominator


def move_regular_price_decimal_point_right_wrt_sale_price(df):
    df.regular_price = [_move_decimal_right(row, 'regular_price', 'sale_price') for index, row in df.iterrows()]
    return df

test_data = outliers.sort_values('discount_percentage', ascending=False).head(5).copy()
test_data = move_regular_price_decimal_point_right_wrt_sale_price(test_data)
test_data

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,sale_price,discount_percentage
53630,496421,1574841,2018-01-22 22:35:46,Open - Piece Internal Battery iPhone 6,Replacement internal battery for Apple iPhone 6,Replacement,PIE0034-A,Accessories,16.0,1,35.0,12.0,96.571429
19152,376976,1291907,2017-07-13 18:00:53,Crucial memory Mac 8GB (2x4GB) SO-DIMM DDR3 16...,8GB RAM (2x4GB) SO-DIMM 1600MHZ iMac and Macbo...,Crucial,CRU0025-2,unknown,85.0,1,98.0,65.0,93.367347
24270,393603,1326648,2017-08-28 19:19:14,Open - Elgato Eve and Energy Plug Power Wirele...,Control on and off and sensor power consumptio...,Elgato,ELG0033-A,Smartwatch,37.0,1,50.0,37.0,92.6
24453,394043,1327664,2017-08-29 21:04:10,Open - Elgato Eve and Energy Plug Power Wirele...,Control on and off and sensor power consumptio...,Elgato,ELG0033-A,Smartwatch,37.0,1,50.0,37.0,92.6
24293,393659,1326788,2017-08-28 22:31:30,Open - Elgato Eve and Energy Plug Power Wirele...,Control on and off and sensor power consumptio...,Elgato,ELG0033-A,Smartwatch,37.0,1,50.0,37.0,92.6


In [None]:
''' Add the new logic, recalculate the discount percentages and rerun the tests '''

one_decimal_data = single_orderline_single_product[single_orderline_single_product.orderline_id.isin(orderline_ids)].copy()
one_decimal_data[['regular_price', 'sale_price']] = one_decimal_data[['regular_price', 'sale_price']].astype('float')
one_decimal_data[['total_paid', 'regular_price', 'sale_price']] = one_decimal_data[['total_paid', 'regular_price', 'sale_price']].round()
one_decimal_data.drop(['total_paid_decimal_count', 'regular_price_decimal_count', 'sale_price_decimal_count'], axis=1, inplace=True)

# Fix the decimal points
one_decimal_data = move_regular_price_decimal_point_right_wrt_sale_price(one_decimal_data)

# Add a discount percentage col to detect incorrect values and outliers
one_decimal_data = add_discount_percentage_col(one_decimal_data)

# Tests
sale_greater_than_regular_orderline_ids = pdt.test_regular_greater_or_equal_to_sale(one_decimal_data)
sale_greater_than_total_orderline_ids = pdt.test_total_greater_or_equal_to_sale(one_decimal_data)

# Outliers
outliers = check_for_outliers(one_decimal_data)
print(f"There are {outliers.shape[0]} outliers")

<div class="alert alert-block alert-success">
    Success!
</div>

<div class="alert alert-block alert-info">
    Now let's see if we can fix the total_paid < sale_price values
</div>

In [None]:
one_decimal_data[one_decimal_data.orderline_id.isin(sale_greater_than_total_orderline_ids)].sort_values('discount_percentage', ascending=False).head()

In [None]:
one_decimal_data[one_decimal_data.order_id.isin([313597])]

#### Problem 5: total_paid < sale_price

In [88]:
sale_price_greater_than_regular[(sale_price_greater_than_regular.regular_price < sale_price_greater_than_regular.sale_price) & (sale_price_greater_than_regular.total_paid < sale_price_greater_than_regular.sale_price)]

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,sale_price,discount_percentage
6535,317549,1165690,2017-01-30 14:59:19,Synology RT2600AC Wifi Router AC2600,Wifi Wireless Router AC2600 17GHz dual-core an...,Synology,SYN0163,Hardware,240.0,1,230.0,241.0,-4.954328


# Check how rounding the values affects the data
problem 1: sale_price > total_paid - check how many rows, ignore for now?

problem 2: regular_price < sale_price 
- Round all values
- For the other ones, check if the regular price changed over time and then probably set regular_price equal to sale_price
- Calculate the discount percentage after rounding and use negative discounts to identify these?

<div class="alert alert-box alert-info">
    Let's split the data at the decimal point and examine whether we can determine any patterns by the number of digits after the decimal point.
</div>

## Decimal count = 1 = 0 = 1

<div class="alert alert-box alert-info">
    total_paid_decimal_count==1  & regular_price_decimal_count==0 & sale_price_decimal_count==1
</div>

In [None]:
1. Turn them into floats

In [47]:
order_ids = prices[
((prices.regular_price_decimal_count == 0 )| (prices.regular_price_decimal_count == 1)) &
((prices.promo_price_decimal_count == 0) | (prices.promo_price_decimal_count == 1)) &
((prices.sale_price_decimal_count == 0) | (prices.sale_price_decimal_count == 1)) 
].orderline_id

zero_one_decimal_data = completed_sales[completed_sales.orderline_id.isin(order_ids)].copy()
zero_one_decimal_data

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,promo_price,sale_price
1,242832,1529178,2017-12-31 17:26:40,Parrot 550mAh battery for MiniDrones,550mAh rechargeable battery for Parrot minidrones,Parrot,PAR0074,Accessories,15.76,1,17.99,109.904,10.77
2,243330,1181923,2017-02-15 17:07:44,Mac OWC Memory 8GB 1066MHZ DDR3 SO-DIMM,8GB RAM Mac mini iMac MacBook and MacBook Pro ...,OWC,OWC0074,unknown,84.98,1,99.99,999.896,77.99
4,245595,1154394,2017-01-21 12:49:00,"Macally External Hard Drive 1TB 35 ""USB 3.0 SA...",Aluminum External Hard Drive 1TB capacity form...,Pack,PAC1561,Memory,112.97,2,103.95,59.584,52.99
5,246018,1179702,2017-02-13 22:34:47,"iFixit battery Macbook Pro 13 ""OEM (Mid 2009 /...",OEM battery for MacBook Pro 13,iFixit,IFX0055,Accessories,211.95,1,99.95,999.896,93.99
6,246018,1179711,2017-02-13 22:39:20,"Tucano Nido Hard-Shell Case MacBook Pro 13 ""(L...",Protective cover with slip rubber feet 13 inch...,Tucano,TUC0308,Accessories,211.95,1,29.9,249.901,24.99
...,...,...,...,...,...,...,...,...,...,...,...,...,...
61667,527042,1649446,2018-03-14 11:42:38,EarPods Apple Headphones with Remote and Mic (...,EarPods headphones Apple iPhone iPad and iPod ...,Apple,APP0927,iPhone,18.98,1,35,13.99,13.99
61668,527070,1649512,2018-03-14 11:49:01,Apple Lightning Cable Connector to USB 1m Whit...,Apple Lightning USB Cable 1 meter to charge an...,Apple,APP0698,Accessories,24.97,2,25,99.898,9.99
61669,527074,1649522,2018-03-14 11:49:36,Apple Lightning Cable Connector to USB 1m Whit...,Apple Lightning USB Cable 1 meter to charge an...,Apple,APP0698,Accessories,24.97,2,25,99.898,9.99
61670,527096,1649565,2018-03-14 11:54:35,Apple Lightning Cable Connector to USB 1m Whit...,Apple Lightning USB Cable 1 meter to charge an...,Apple,APP0698,Accessories,34.96,3,25,99.898,9.99


## Decimal count == 0 

<div class="alert alert-box alert-info">
    regular_price_decimal_count == promo_price_decimal_count == sale_price_decimal_count == 0
</div>
<div class="alert alert-box alert-warning">
    There are no sale_price values with zero decimal points.
    <br>
    The regular_price and promo_price values with no decimal points appear to have been accidentally saved as ints instead of floats.
    <br>
    Let's select only these values and see if this logic is correct
</div>


In [29]:
order_ids = prices[
((prices.regular_price_decimal_count == 0)) &
((prices.promo_price_decimal_count == 0)) &
((prices.sale_price_decimal_count == 0) | (prices.sale_price_decimal_count == 1)) 
].orderline_id

zero_decimal_count = completed_sales[completed_sales.orderline_id.isin(order_ids)].copy()
zero_decimal_count

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,promo_price,sale_price
22,251688,1432433,2017-11-24 02:26:02,Apple iPhone Leather Case X Case Blue Night,Leather case Apple official,Apple,APP2558,iPhone,505.76,1,59,56,41.65
516,300528,1121267,2017-01-02 13:11:30,Adapter Apple Lightning to headphone jack 3.5 ...,Lightning adapter for headphones or earphones ...,Apple,APP1685,Accessories,14.98,1,9,9,7.99
592,300728,1122002,2017-01-02 18:45:38,Apple Pencil,Pencil Apple iPad Pro.,Apple,APP1221,iPad,94.59,1,109,106,87.60
613,300778,1121866,2017-01-02 17:43:32,Apple Pencil,Pencil Apple iPad Pro.,Apple,APP1221,iPad,1234.97,1,109,106,106.00
643,300848,1122008,2017-01-02 18:48:57,Silicone Case Apple iPad Case mini 4 Blue Ocean,Protective cover ultrafine silicone cases mini 4,Apple,APP1764,iPad,114.00,1,69,59,69.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
61393,525823,1647481,2018-03-12 18:25:34,Apple Thunderbolt Adapter 3 / USB-C Thunderbol...,Thunderbolt adapter 3 / USB-C Thunderbolt 2 Ma...,Apple,APP1919,Accessories,390.56,1,59,56,56.00
61430,525935,1647595,2018-03-12 20:35:37,Bose SoundTouch 10 Wi-Fi Bluetooth Speaker White,Wireless music system compatible with iOS and ...,Bose,BOS0040,Hardware,201.99,1,229,195,195.00
61496,526149,1647616,2018-03-12 20:59:39,Apple Thunderbolt Adapter 3 / USB-C Thunderbol...,Thunderbolt adapter 3 / USB-C Thunderbolt 2 Ma...,Apple,APP1919,Accessories,566.98,1,59,56,56.00
61544,526344,1648095,2018-03-13 11:11:42,Apple Thunderbolt Adapter 3 / USB-C Thunderbol...,Thunderbolt adapter 3 / USB-C Thunderbolt 2 Ma...,Apple,APP1919,Accessories,263.98,1,59,56,56.00


In [35]:
# Transform the three columns to floats
zero_decimal_count[['regular_price', 'promo_price', 'sale_price']] = zero_decimal_count[['regular_price', 'promo_price', 'sale_price']].astype('float')

# Run the tests
incorrect_reg_pro_ids = pdt.test_regular_greater_or_equal_to_promo(zero_decimal_count)

All of the regular_price values are greater or equal to the corresponding promo_price values.



In [54]:
display(zero_decimal_count[zero_decimal_count.regular_price >= zero_decimal_count.promo_price].head())

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,promo_price,sale_price
22,251688,1432433,2017-11-24 02:26:02,Apple iPhone Leather Case X Case Blue Night,Leather case Apple official,Apple,APP2558,iPhone,505.76,1,59.0,56.0,41.65
516,300528,1121267,2017-01-02 13:11:30,Adapter Apple Lightning to headphone jack 3.5 ...,Lightning adapter for headphones or earphones ...,Apple,APP1685,Accessories,14.98,1,9.0,9.0,7.99
592,300728,1122002,2017-01-02 18:45:38,Apple Pencil,Pencil Apple iPad Pro.,Apple,APP1221,iPad,94.59,1,109.0,106.0,87.6
613,300778,1121866,2017-01-02 17:43:32,Apple Pencil,Pencil Apple iPad Pro.,Apple,APP1221,iPad,1234.97,1,109.0,106.0,106.0
643,300848,1122008,2017-01-02 18:48:57,Silicone Case Apple iPad Case mini 4 Blue Ocean,Protective cover ultrafine silicone cases mini 4,Apple,APP1764,iPad,114.0,1,69.0,59.0,69.0


<div class="alert alert-box alert-success">
    After viewing the entire df (shown above in truncated form) we can conclude that our assumption is basically correct.
</div>

<div class="alert alert-box alert-danger">
    However, some of the sale_price values still do not match the regular_price or the promo_price.
    <br>
    <br>
    Furthermore, even though the total_price values only ever had one decimal point, for some of them it is clearly in the wrong place.
</div>
<div class="alert alert-box alert-info">
    To clean the sale_price values we must first clean the total_price values as it is sometimes equal to neither regular_price nor promo_price but is close to total_paid.
    <br>
    <br>
    To this end we must select a subset where all the orderlines in the order are within the zero_decimal_count df. 
    <br>
    If the order we are examining has other otherlines which are in completed_sales but not in zero_decimal_count then we cannot be sure we have all the data to confirm the total_paid value is correct.
    <br>
    <br>
    We must check the distribution of total_price values wrt to sale_price, then promo_price and regular_price to determine whether the difference can be explained by shipping costs or whether the decimal point is in the wrong place.
    <br>
    <br>
    We can then analyse the distribution of sale_price values wrt to regular_price and wrt promo_price and examine the outliers.
</div>

In [70]:
''' Filter out orders which do not have all their orderlines in zero_decimal_count'''

# Get the count of orderlines per order from completed_sales
completed_sales_orderline_counts = completed_sales.groupby('order_id')['orderline_id'].count().reset_index()
completed_sales_orderline_counts.rename(columns={'orderline_id': 'completed_count'}, inplace=True)

# Do the same for zero_decimal_count
zero_dec_orderline_counts = zero_decimal_count.groupby('order_id')['orderline_id'].count().reset_index()
zero_dec_orderline_counts.rename(columns={'orderline_id': 'zero_count'}, inplace=True)

# Merge to find orders with matching counts
valid_orders = pd.merge(completed_sales_orderline_counts, zero_dec_orderline_counts, on='order_id')
valid_orders = valid_orders[valid_orders['completed_count'] == valid_orders['zero_count']]

# Filter zero_decimal_count for these valid order_ids
zero_decimal_count__complete_orders = zero_decimal_count[zero_decimal_count['order_id'].isin(valid_orders['order_id'])].copy()

print("zero_decimal_count.shape:", zero_decimal_count.shape)
print("zero_decimal_count_subset.shape:", zero_decimal_count__complete_orders.shape)

zero_decimal_count.shape: (738, 13)
zero_decimal_count_subset.shape: (238, 13)


In [69]:
valid_orders[valid_orders]

(233, 3)

In [71]:
''' Multiply regular_price, promo_price and sale_price by the product_quantity to compare them to the total_paid. '''

zero_decimal_count__complete_orders['regular_price_total'] = zero_decimal_count__complete_orders.regular_price*zero_decimal_count__complete_orders.product_quantity
zero_decimal_count__complete_orders['promo_price_total'] = zero_decimal_count__complete_orders.promo_price*zero_decimal_count__complete_orders.product_quantity
zero_decimal_count__complete_orders['sale_price_total'] = zero_decimal_count__complete_orders.sale_price*zero_decimal_count__complete_orders.product_quantity

''' First check the orders with only one orderline for simplicity '''
zero_decimal_count__complete_orders__single_orderline

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,promo_price,sale_price,regular_price_total,promo_price_total,sale_price_total
516,300528,1121267,2017-01-02 13:11:30,Adapter Apple Lightning to headphone jack 3.5 ...,Lightning adapter for headphones or earphones ...,Apple,APP1685,Accessories,14.98,1,9.0,9.0,7.99,9.0,9.0,7.99
592,300728,1122002,2017-01-02 18:45:38,Apple Pencil,Pencil Apple iPad Pro.,Apple,APP1221,iPad,94.59,1,109.0,106.0,87.6,109.0,106.0,87.6
709,301074,1122484,2017-01-02 22:14:46,Apple Pencil,Pencil Apple iPad Pro.,Apple,APP1221,iPad,106.0,1,109.0,106.0,106.0,109.0,106.0,106.0
1075,302097,1127765,2017-01-03 22:42:18,Apple Thunderbolt Adapter 3 / USB-C Thunderbol...,Thunderbolt adapter 3 / USB-C Thunderbolt 2 Ma...,Apple,APP1919,Accessories,64.84,2,59.0,56.0,28.93,118.0,112.0,57.86
2114,305836,1137489,2017-01-09 16:23:41,Adapter Apple Lightning to headphone jack 3.5 ...,Lightning adapter for headphones or earphones ...,Apple,APP1685,Accessories,13.99,1,9.0,9.0,9.0,9.0,9.0,9.0
2167,305987,1137849,2017-01-09 19:14:00,Adapter Apple Lightning to headphone jack 3.5 ...,Lightning adapter for headphones or earphones ...,Apple,APP1685,Accessories,15.99,1,9.0,9.0,9.0,9.0,9.0,9.0
3102,308593,1144044,2017-01-13 10:28:23,Apple Thunderbolt Adapter 3 / USB-C Thunderbol...,Thunderbolt adapter 3 / USB-C Thunderbolt 2 Ma...,Apple,APP1919,Accessories,39.99,1,59.0,56.0,35.0,59.0,56.0,35.0
4330,311764,1151689,2017-01-18 18:56:40,Adapter Apple Lightning to headphone jack 3.5 ...,Lightning adapter for headphones or earphones ...,Apple,APP1685,Accessories,27.0,3,9.0,9.0,9.0,27.0,27.0,27.0
4558,312437,1153318,2017-01-20 11:15:47,Apple Thunderbolt Adapter 3 / USB-C Thunderbol...,Thunderbolt adapter 3 / USB-C Thunderbolt 2 Ma...,Apple,APP1919,Accessories,38.24,1,59.0,56.0,33.25,59.0,56.0,33.25
4880,313423,1155621,2017-01-22 22:39:09,Apple Magic Trackpad 2,Apple Wireless Bluetooth Trackpad.,Apple,APP1216,Accessories,138.99,1,149.0,139.0,138.99,149.0,139.0,138.99


In [None]:

incorrect_pro_sale_ids = pdt.test_promo_greater_or_equal_to_sale(zero_decimal_count)

## Evaluate corrupted data with multiple decimal points

### regular_price_decimal_count==1	promo_price_decimal_count==0	sale_price_decimal_count==2

In [16]:
order_ids = prices[
(prices.regular_price_decimal_count == 1) &
(prices.promo_price_decimal_count == 0) &
(prices.sale_price_decimal_count == 2) 
].orderline_id

data = completed_sales[completed_sales.orderline_id.isin(order_ids)].copy()
data

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,promo_price,sale_price
41002,449341,1460602,2017-11-27 20:19:43,Sonnet Echo Express III-R 2U Chassis Rack Thun...,PCIe expansion chassis for Mac via Thunderbolt.,Sonnet,SNN0032,unknown,5418.13,1,1208.79,1089,1.012.77


### regular_price_decimal_count==1	promo_price_decimal_count==1	sale_price_decimal_count==2

In [17]:
order_ids = prices[
(prices.regular_price_decimal_count == 1) &
(prices.promo_price_decimal_count == 1) &
(prices.sale_price_decimal_count == 2) 
].orderline_id

data = completed_sales[completed_sales.orderline_id.isin(order_ids)].copy()
data

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,promo_price,sale_price
7595,320581,1172651,2017-02-06 17:16:09,"Dell Monitor P4317Q 425 """" IPS 4K DP mDP 1.2",425 inch monitor for Mac,Dell,DLL0044,Hardware,1098.99,1,1008.99,779.989,1.098.99
7887,321497,1174680,2017-02-08 22:08:52,"Cintiq Wacom Graphics Tablet Pro 13 ""FHD",graphics tablet 13-inch FHD resolution (1920x1...,Wacom,WAC0228,Accessories,1049.99,1,1099.9,918.995,1.049.99
8465,323467,1178850,2017-02-13 11:43:09,Apple iPhone 7 Plus 256GB Rose Gold,New Apple iPhone 7 Plus 256GB Rose Gold free,Apple,APP1638,iPhone,1194.39,1,1130.33,1130.33,1.106.42
9487,326748,1186306,2017-02-20 15:40:50,"Dell Monitor P4317Q 425 """" IPS 4K DP mDP 1.2",425 inch monitor for Mac,Dell,DLL0044,Hardware,1098.99,1,1008.99,779.989,1.098.99
10404,345195,1226053,2017-04-10 20:34:13,Apple iPhone 7 Plus 256GB Rose Gold,New Apple iPhone 7 Plus 256GB Rose Gold free,Apple,APP1638,iPhone,1201.97,1,1130.33,1130.33,1.129.00
13980,357513,1251439,2017-05-20 10:28:21,"Wacom MobileStudio Pro 13 ""i5 128GB",Professional graphics tablet with 13-inch 128G...,Wacom,WAC0215,Accessories,1709.0,1,1899.9,1786.99,1.709.00
14422,359444,1255666,2017-05-26 16:15:21,Apple iPhone 7 Plus 256GB Rose Gold,New Apple iPhone 7 Plus 256GB Rose Gold free,Apple,APP1638,iPhone,5335.0,1,1130.33,1130.33,1.089.00
23143,389983,1319096,2017-08-16 17:23:42,Apple iPhone 7 Plus 256GB Rose Gold,New Apple iPhone 7 Plus 256GB Rose Gold free,Apple,APP1638,iPhone,1127.32,1,1130.33,1130.33,1.086.33
38153,440988,1441346,2017-11-24 19:22:13,"Wacom MobileStudio Pro 13 ""i5 128GB",Professional graphics tablet with 13-inch 128G...,Wacom,WAC0215,Accessories,1396.09,1,1899.9,1786.99,1.396.09
45092,466769,1504953,2017-12-19 13:48:53,DS418play Synology NAS Server | 10GB RAM | 12T...,4-bay NAS server to accommodate 4K Ultra HD files,Pack,PAC2252,Hardware,1036.37,1,1117.35,999.368,1.036.37


### sale_price_decimal_count == 2

In [19]:
order_ids = prices[
(prices.sale_price_decimal_count == 2) 
].orderline_id

data = completed_sales[completed_sales.orderline_id.isin(order_ids)].copy()
data

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,promo_price,sale_price
47,257847,1431400,2017-11-23 23:22:04,"Benq Monitor SW320 315 ""4K UHD Professional Ph...",315 inch monitor with 99% sRGB 4K height adjus...,BenQ,BNQ0054,Hardware,1367.11,1,1489,14.859.889,1.367.11
51,258985,1307573,2017-07-31 12:40:26,"Apple iMac 27 ""Core i5 3.4GHz Retina 5K | 16GB...",IMac desktop computer 27 inch Retina 5K RAM 16...,Pack,PAC2076,Memory,2264.60,1,2459,22.580.045,2.258.01
55,259668,1372137,2017-10-06 21:57:49,Apple iPhone 8 Plus 256GB Silver,Apple iPhone 8 Plus 256GB in Silver,Apple,APP2487,iPhone,1132.33,1,1089,10.669.998,1.087.33
63,262016,1319852,2017-08-18 00:54:24,"Apple MacBook Pro 13 ""with Touch Bar 33GHz Cor...",New MacBook Pro 13 inch Touch Bar 33 GHz Core ...,Apple,APP1865,Mac,3109.57,1,3279,29.055.936,2.905.59
105,279755,1133499,2017-01-07 12:50:57,"Apple iMac 27 ""Core i7 Retina 5K 4GHz | 8GB | ...",IMac desktop computer 27 inch 8GB RAM 256GB Re...,Apple,APP1345,Mac,2616.99,1,2809,26.755.847,2.621.99
...,...,...,...,...,...,...,...,...,...,...,...,...,...
61548,526357,1648114,2018-03-13 11:33:52,"Apple MacBook Pro 13 ""Core i5 23GHz | 8GB RAM ...",New MacBook Pro 13-inch Core i5 23GHz with 8GB...,Apple,APP2078,Mac,1405.99,1,1505.59,13.990.044,1.399.00
61556,526380,1648173,2018-03-13 12:14:52,"Apple iMac 215 ""Core i5 30GHz 4K Retina | 8GB ...","IMac desktop computer 215 inch 8GB RAM 1TB 5,4...",Apple,APP2087,Mac,1436.99,1,1505.59,14.300.046,1.430.00
61588,526505,1648439,2018-03-13 15:50:13,"Apple MacBook Retina 12 ""Core m3 12GHz | 8GB R...",New MacBook Retina Display 12-inch Core m3 SSD...,Apple,APP2064,Mac,1421.99,1,1505.59,14.150.043,1.415.00
61589,526507,1648443,2018-03-13 15:51:57,Apple iPhone 64GB Space Gray X,New Free iPhone 64GB X,Apple,APP2489,iPhone,1137.97,1,1159,11.240.005,1.099.00


<div class="alert alert-block alert-danger">
    FOR SALE_PRICE_DEC_COUNT: YOU NEED TO SPLIT THE SALE PRICE COLUMN AND FIGURE OUT IF THERE ARE ALWAYS 2 DIGITS AFTER THE DEC AND 2 BETWEEN THE TWO. THEN YOU MIGHT BE ABLE TO FIX THEM ALL IN ONE GO.

</div>

### regular_price_decimal_count == 0

In [8]:
# Let's take a look at the values with zero decimal points. It is possible that these have been incorrectly stored as ints.
reg_price_zero_dec_ids = prices[(prices.regular_price_decimal_count == 0)].orderline_id

zero_dec_test_data = completed_sales[completed_sales.orderline_id.isin(reg_price_zero_dec_ids)].copy()
zero_dec_test_data

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,promo_price,sale_price
3,245275,1276706,2017-06-28 11:12:30,Tado Smart Climate Control Intelligent AC,intelligent control air conditioning works wit...,Tado,TAD0007,Accessories,149.00,1,179,1.489.994,149.00
11,246405,1434356,2017-11-24 09:53:54,"Seagate Barracuda 3TB 35 ""SATA hard drive Mac ...",internal hard drive Mac and PC 3TB (ST3000DM008).,Seagate,SEA0044,Memory,407.96,1,112,849.045,89.35
12,246405,1434434,2017-11-24 09:57:55,Case Apple iPhone 6 / 6S Leather Case Blue Night,ultrathin leather case and microfiber premium ...,Apple,APP1136,iPhone,407.96,1,55,510.003,45.90
15,250275,1413938,2017-11-19 22:32:48,IPhone AppleCare Protection Plan,Apple Care extended warranty iPhone.,Apple,APP1190,Accessories,54.99,1,70,609.997,51.00
16,251302,1245100,2017-05-10 12:25:57,Apple Keyboard Keypad International English Mac,USB ultrathin keyboard Apple Mac English.,Apple,APP0411,Accessories,140.99,1,59,589.996,59.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
61667,527042,1649446,2018-03-14 11:42:38,EarPods Apple Headphones with Remote and Mic (...,EarPods headphones Apple iPhone iPad and iPod ...,Apple,APP0927,iPhone,18.98,1,35,13.99,13.99
61668,527070,1649512,2018-03-14 11:49:01,Apple Lightning Cable Connector to USB 1m Whit...,Apple Lightning USB Cable 1 meter to charge an...,Apple,APP0698,Accessories,24.97,2,25,99.898,9.99
61669,527074,1649522,2018-03-14 11:49:36,Apple Lightning Cable Connector to USB 1m Whit...,Apple Lightning USB Cable 1 meter to charge an...,Apple,APP0698,Accessories,24.97,2,25,99.898,9.99
61670,527096,1649565,2018-03-14 11:54:35,Apple Lightning Cable Connector to USB 1m Whit...,Apple Lightning USB Cable 1 meter to charge an...,Apple,APP0698,Accessories,34.96,3,25,99.898,9.99


In [None]:
pro_price_zero_dec_ids = prices[(prices.promo_price_decimal_count == 0)].orderline_id

zero_dec_test_data = completed_sales[completed_sales.orderline_id.isin(pro_price_zero_dec_ids)].copy()
zero_dec_test_data

In [None]:
reg_pro_price_zero_dec_ids = prices[(prices.regular_price_decimal_count == 0) & (prices.promo_price_decimal_count == 0)].orderline_id

zero_dec_test_data = completed_sales[completed_sales.orderline_id.isin(zero_dec_ids)].copy()
zero_dec_test_data

<div class="alert alert-block alert-success">
    The
</div>

### Check sale_price

In [1]:
temp = prices.copy()
temp[temp.sale_price_decimal_count!=1]

NameError: name 'prices' is not defined

In [2]:
temp = (
    temp.assign(
        sale_price_split = lambda x: x['sale_price'].str.split(r'\.'),
        decimal_places = lambda x: len(x['sale_price_split'][1]))
)
temp.head(5)

NameError: name 'temp' is not defined

In [3]:
temp[temp.decimal_places != 2]

NameError: name 'temp' is not defined

### Transform orderlines.unit_price to floats

In [None]:
def transform_unit_price_to_floats(df):
    """
    Transform the orderlines.unit_price price column to floats.
    Some of the values have two decimal points. 
    For these values we will remove the leftmost decimal and transform all values to floats.
    The correct position of the decimal point will be determined by merging orderlines, 
    products, orders and brands, and comparing the price values.
    
    Args:
        df (pd.DataFrame): The orderlines data
    
    Returns:
        pd.DataFrame: The orderlines data with the unit_price column transformed from str to float values.
    """
    return (
        df.assign(unit_price = df.unit_price.str.split('.')
                  .apply(lambda x : x[0]+x[1]+'.'+x[2] if len(x)==3 else x[0]+'.'+ x[1])
                  .astype(float)
        )
    )
    

In [None]:
completed_sales['regular_price'].str.count(r'\.')

In [None]:
def split_str_on_dots_and_append_decimal(df, col):
    '''Remove the decimal points from the strings and append .00'''
    return df.assign(promo_price=df[col]
                     .str.split('.')
                     .str.join('')
                     .apply(lambda x: x+'.00')
                    )

def _insert_decimal_at_string_position(s, pos):
    '''Insert a decimal point at a given position in a string'''
    s = s.split('.')
    s = s[0] + s[1]
    s = s[:pos]+'.'+s[pos:]
    return s 
    
def _insert_decimal_in_regular_price(row):
    '''
    Keep moving the decimal point towards the end of the regular_price 
    string until the sale_price is lower or equal to the price.
    Then transform the regular_price string to a float and round it to two decimal places.
    '''
    decimal_position = 1
    row.regular_price = _insert_decimal_at_string_position(row.regular_price, decimal_position)
    
    while float(row.regular_price) < row.sale_price:
        if round(float(row.regular_price), 0) == round(row.sale_price, 0):
            row.sale_price = round(float(row.price), 2)
            return round(float(row.regular_price), 2)
        else:
            row.regular_price = _insert_decimal_at_string_position(row.regular_price, decimal_position)
            decimal_position += 1

    return round(float(row.price), 2)

def transform_regular_price_to_float(df):
    df.regular_price = [_insert_decimal_in_regular_price(row) for index, row in df.iterrows()]
    return df

# Solution attempt 1
Move the decimal points until reg >= promo >= sale and run the tests to see if it works

In [21]:
def start_pipeline(df):
    '''Make a copy of the pipeline to prevent corrupting the original data'''
    return df.copy()

def split_str_on_dots_and_append_decimal(df, col):
    '''Remove the decimal points from the strings and append .00'''
    return df.assign(promo_price=df[col]
                     .str.split('.')
                     .str.join('')
                     .apply(lambda x: x+'.00')
                    )

def _insert_decimal_at_string_position(s, pos):
    '''Insert a decimal point at a given position in a string'''
    s = s.split('.')
    s = s[0] + s[1]
    s = s[:pos]+'.'+s[pos:]
    return s 
    
def _insert_decimal_in_regular_price(row):
    '''
    Keep moving the decimal point towards the end of the regular_price 
    string until the sale_price is lower or equal to the price.
    Then transform the regular_price string to a float and round it to two decimal places.
    '''
    decimal_position = 1
    row.regular_price = _insert_decimal_at_string_position(row.regular_price, decimal_position)
    
    while float(row.regular_price) < row.sale_price:
        if round(float(row.regular_price), 0) == round(row.sale_price, 0):
            row.sale_price = round(float(row.price), 2)
            return round(float(row.regular_price), 2)
        else:
            row.regular_price = _insert_decimal_at_string_position(row.regular_price, decimal_position)
            decimal_position += 1

    return round(float(row.price), 2)

def transform_regular_price_to_float(df):
    df.regular_price = [_insert_decimal_in_regular_price(row) for index, row in df.iterrows()]
    return df

def _insert_decimal_in_promo_price(row, decimal_position = -2):
    '''
    If the euro value of the regular_price is equal to the euro value of the promo_price, 
    set the promo_price equal to regular_price and return it.
    This is because some promo_prices are slightly larger than their equivalent prices, e.g. 12.95 - 12.99
    
    Otherwise, keep moving the decimal point towards the start of the string until the promo_price is lower than the price.
    Then transform the string to a float and round it to two decimal places.
    '''
    
    while float(row.promo_price) > row.regular_price:
        if round(float(row.promo_price), 0) == round(row.regular_price, 0):
            row.promo_price = row.regular_price
            return row.promo_price
        else:
            row.promo_price = _insert_decimal_at_string_position(row.promo_price, decimal_position)
            decimal_position += -1
    
    return round(float(row.promo_price), 2)

def transform_promo_price_to_floats(df):
    df.promo_price = [_insert_decimal_in_promo_price(row) for index, row in df.iterrows()]
    return df
    
def calculate_products_discounts(df):
    return df.assign(
        discount=round(df.price - df.promo_price, 2),
        discount_pc = round((df.price - df.promo_price)/df.price * 100, 2)
    )

def calculate_sales_discounts(df):
    return (df
            .assign(
                sales_discount=round(df.price - df.sale_price, 2),
                sales_discount_pc = round((df.price - df.sale_price)/df.price * 100, 2)
            )
           )

temp = completed_sales.copy()

temp = (temp
        .pipe(start_pipeline)
        .pipe(split_str_on_dots_and_append_decimal, 'regular_price')
        #.pipe(split_and_join_regular_prices)
        .pipe(transform_regular_price_to_float)
        .pipe(split_str_on_dots_and_append_decimal, 'promo_price')
        #.pipe(split_and_join_promo_prices)
        .pipe(transform_promo_price_to_floats)
        .pipe(calculate_products_discounts)
        .pipe(calculate_sales_discounts)
)

temp

TypeError: '<' not supported between instances of 'float' and 'str'