# Assumptions

* The total_paid must be greater or equal to the sum of product_quantity*sale_price for all orderlines in an order
* regular_price must be greater or equal to sale_price
* regular_price must be greater or equal to promo_price
* promo_price could be less than sale_price if the product was sold at the regular_price
* It is possible that the difference between the total_paid value and sum(product_quantity*sale_price) is the shipping cost

In [1]:
import re
import numpy as np
import pandas as pd
import sys
import os

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

import data_utils
import price_debugging_tests as pdt

# Set maximum number of rows to display
pd.set_option('display.max_rows', 1000)

# Import and merge data

In [2]:
# Import the data
orders = data_utils.clean_orders(data_path="../../data/")
orderlines = data_utils.clean_orderlines(data_path="../../data/")
products = data_utils.clean_products(data_path="../../data/")
brands = data_utils.clean_brands(data_path="../../data/")

# Merge the data
completed_sales =  data_utils.merge_data(orders, orderlines, products, brands)

5 missing values were removed from orders.
This represents 0.0022% of the data.


0 missing values were removed from orderlines.
This represents 0.00% of the data.


8792 missing values were removed from products
This represents 45.49% of the data.


0 missing values were removed from brands.
This represents 0.00% of the data.




# Price data exploration
## Exclude orders with multiple orderlines and multiple products
<div class="alert alert-box alert-info">
    For simplicity's sake we will first exclude orders with multiple orderlines and product_quantity >= 1 so we can easily compare the prices the total_paid.
    <br>
    Hopefully we will be able to determine patterns which can later be applied to all the data. 
</div>

In [3]:
# Get the count of orderlines per order from completed_sales
completed_sales_orderline_counts = completed_sales.groupby('order_id')['orderline_id'].count().reset_index()
completed_sales_orderline_counts.rename(columns={'orderline_id': 'orderline_count'}, inplace=True)

''' Examine the distribution of orderlines per order. '''
# Group by completed_count and count how many orders have the same number of orderlines
orderline_distribution = completed_sales_orderline_counts.groupby('orderline_count')['order_id'].count().reset_index()
orderline_distribution.rename(columns={'order_id': 'order_count'}, inplace=True)

# Calculate the percentage of each orderline count relative to the total number of orders
total_orders = completed_sales_orderline_counts['order_id'].nunique()
orderline_distribution['percentage'] = (orderline_distribution['order_count'] / total_orders) * 100

orderline_distribution

Unnamed: 0,orderline_count,order_count,percentage
0,1,36055,77.770108
1,2,7100,15.314596
2,3,2107,4.544768
3,4,688,1.484006
4,5,261,0.562973
5,6,82,0.176873
6,7,30,0.06471
7,8,18,0.038826
8,9,12,0.025884
9,10,5,0.010785


In [4]:
''' Exclude orders with more than one orderline. '''
orders_with_one_orderline = completed_sales_orderline_counts[completed_sales_orderline_counts.orderline_count==1]
single_orderline_orders = completed_sales[completed_sales.order_id.isin(orders_with_one_orderline.order_id)]
single_orderline_orders.shape

(36055, 12)

<div class="alert alert-box alert-success">
    Good. We still have 78% of the data.
</div>

In [5]:
''' Exclude orders with product_quantity > 1 so we can directly compare the prices with total_paid. '''

single_orderline_single_product = single_orderline_orders[single_orderline_orders.product_quantity==1].copy()
single_orderline_single_product.shape[0]/total_orders*100

72.17273139060849

<div class="alert alert-box alert-success">
    We still have 72% of the data.
</div>

## Explore corrupted decimal values

### Helper functions

In [59]:
def add_discount_percentage_col(df):
    df['discount_percentage'] = (df.regular_price - df.sale_price)/df.regular_price*100
    return df

def check_for_outliers(df):
    ''' 
    Check for outliers to see if the decimal value was in the wrong spot. 
    
    We define an outlier as having a discount percentage in regular_price and sale_price 
    greater than 3 standard deviations from the mean discount percentage
    '''

    mean_diff = df['discount_percentage'].mean()
    std_diff = df['discount_percentage'].std()
    threshold = mean_diff + 3 * std_diff

    outliers = df[df['discount_percentage'] > threshold]

    print("mean_diff: ", mean_diff)
    print("std_diff: ", std_diff)
    print("threshold: ", threshold)
    print("outliers.shape[0]: ", outliers.shape[0])
    
    return outliers

### Count decimal points in price values - completed_sales

In [7]:
price_data = completed_sales.copy()

price_data['regular_price_decimal_count'] = price_data['regular_price'].str.count(r'\.')
price_data['sale_price_decimal_count'] = price_data['sale_price'].str.count(r'\.')
price_data['total_paid_decimal_count'] = price_data['sale_price'].str.count(r'\.')

price_data[['total_paid_decimal_count', 'regular_price_decimal_count', 'sale_price_decimal_count']].apply(pd.Series.value_counts)

Unnamed: 0,total_paid_decimal_count,regular_price_decimal_count,sale_price_decimal_count
0,,18611,
1,58120.0,42264,58120.0
2,3552.0,797,3552.0


### Check the distribution of decimal points across the price data - completed_sales

In [8]:
# Group by the three decimal count columns and get the count & percentage for each group
grouped_counts = price_data.groupby(
    ['total_paid_decimal_count', 'regular_price_decimal_count', 'sale_price_decimal_count']
).size().reset_index(name='count')

# Calculate the percentage of each group relative to the total number of rows
total_rows = len(price_data)
grouped_counts['percentage'] = (grouped_counts['count'] / total_rows) * 100

# Display the resulting dataframe
grouped_counts.sort_values('percentage', ascending=False)

Unnamed: 0,total_paid_decimal_count,regular_price_decimal_count,sale_price_decimal_count,count,percentage
1,1,1,1,41135,66.699637
0,1,0,1,16412,26.611752
3,2,0,2,2199,3.565638
4,2,1,2,1129,1.830652
2,1,2,1,573,0.929109
5,2,2,2,224,0.363212


### Count decimal points in price values - single_orderline_single_product

In [9]:
single_orderline_single_product['regular_price_decimal_count'] = single_orderline_single_product['regular_price'].str.count(r'\.')
single_orderline_single_product['sale_price_decimal_count'] = single_orderline_single_product['sale_price'].str.count(r'\.')
single_orderline_single_product['total_paid_decimal_count'] = single_orderline_single_product['sale_price'].str.count(r'\.')

single_orderline_single_product[['total_paid_decimal_count', 'regular_price_decimal_count', 'sale_price_decimal_count']].apply(pd.Series.value_counts)

Unnamed: 0,total_paid_decimal_count,regular_price_decimal_count,sale_price_decimal_count
0,,10233,
1,31050.0,22760,31050.0
2,2410.0,467,2410.0


### Check the distribution of decimal points across the price data - single_orderline_single_product

In [10]:
# Group by the three decimal count columns and get the count & percentage for each group
grouped_counts = single_orderline_single_product.groupby(
    ['total_paid_decimal_count', 'regular_price_decimal_count', 'sale_price_decimal_count']
).size().reset_index(name='count')

# Calculate the percentage of each group relative to the total number of rows
total_rows = len(price_data)
grouped_counts['percentage'] = (grouped_counts['count'] / total_rows) * 100

# Display the resulting dataframe
grouped_counts.sort_values('percentage', ascending=False)

Unnamed: 0,total_paid_decimal_count,regular_price_decimal_count,sale_price_decimal_count,count,percentage
1,1,1,1,21971,35.625568
0,1,0,1,8767,14.215527
3,2,0,2,1466,2.377092
4,2,1,2,789,1.279349
2,1,2,1,312,0.505902
5,2,2,2,155,0.25133


## Evaluate potentially uncorrupted values

### Decimal count = 1 = 1 = 1
<div class="alert alert-box alert-info">
    Since 67% of the completed_sales data has total_paid_decimal_count = regular_price_decimal_count == sale_price_decimal_count == 1, cleaning this data will be a good first step.
</div>

In [11]:
orderline_ids = single_orderline_single_product[
    (single_orderline_single_product.total_paid_decimal_count == 1) &
    (single_orderline_single_product.regular_price_decimal_count == 1) &
    (single_orderline_single_product.sale_price_decimal_count == 1)
].orderline_id

one_decimal_data = single_orderline_single_product[single_orderline_single_product.orderline_id.isin(orderline_ids)].copy()
one_decimal_data[['regular_price', 'sale_price']] = one_decimal_data[['regular_price', 'sale_price']].astype('float')
# drop the decimal count cols to clean up the output
one_decimal_data.drop(['total_paid_decimal_count', 'regular_price_decimal_count', 'sale_price_decimal_count'], axis=1, inplace=True)

# Add a discount percentage col to detect incorrect values and outliers
one_decimal_data = add_discount_percentage_col(one_decimal_data)

In [12]:
one_decimal_data[one_decimal_data.order_id==376743]

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,sale_price,discount_percentage
19074,376743,1291409,2017-07-13 08:39:01,5TB Lacie Rugged Hard Disk Thunderbolt USB-C,"5TB hard disk rugged, compact Thunderbolt and ...",LaCie,LAC0227,Memory,301.99,1,299.99,301.99,-0.666689


#### Problem 1: sale_price < total_paid

<div class="alert alert-box alert-danger">
    There are cases where the sale_price is greater than the total_paid value.
</div>

In [13]:
one_decimal_data[one_decimal_data.order_id.isin([301495, 281302, 274043, 287311, 300950, 296010, 297572])]

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,sale_price,discount_percentage
92,274043,1121663,2017-01-02 16:30:49,Griffin Magnetic BreakSafe-C USB charging cabl...,Charging cable and connector magnetic fast lib...,Griffin,GRT0425,Accessories,3.98,1,34.99,24.99,28.579594
111,281302,1142023,2017-01-12 01:00:05,Startech USB Adapter VGA-C Blanco,Adapter with reversible USB connection VGA-C (...,Startech,STA0036,Accessories,16.98,1,53.99,29.99,44.452676
131,287311,1164035,2017-01-29 22:54:50,Sandisk iXpand Lightning to USB 3.0 64GB,64GB storage unit for iPhone and iPad,SanDisk,SAN0134,Accessories,54.99,1,69.99,59.99,14.287755
160,296010,1138342,2017-01-09 23:41:57,"Tucano Nido Hard-Shell Case MacBook Air 13 ""Black",rigid and slicked rubber feet MacBook Air 13 i...,Tucano,TUC0252,Accessories,0.0,1,29.9,24.99,16.421405
182,297572,1162207,2017-01-27 23:53:16,Twelve South BookArc support for MacBook Pro /...,Aluminum support in an arc Retina MacBook Air.,Twelve South,TWS0093,Accessories,51.99,1,59.99,56.99,5.000833
684,300950,1122232,2017-01-02 20:20:08,Apple Mac mini Core i5 14GHz | 8GB RAM | 500GB...,Mac mini desktop computer 8GB RAM 500GB SSD (M...,Pack,PAC0594,Memory,777.99,1,1135.59,782.99,31.049939
866,301495,1123362,2017-01-03 12:04:16,Svolta Tucano MacBook Pro Sleeve bag / Retina ...,compact case for MacBook / Air 13 and 13 inche...,Tucano,TUC0277,Accessories,0.0,1,29.9,24.99,16.421405


<div class="alert alert-box alert-danger">
    The first case above seems to have the decimal point in the wrong place in total_paid.
    <br><br>
    Some of the prices are relatively similar, for example, the 3rd, 5th and 6th rows above, and could possibly be corrected by setting the total_paid equal to the sale_price.
    <br><br>
    Some of the data is too corrupted to be corrected: The 2nd, 4th and 7th rows.
</div>

In [14]:
''' What percentage of the data has total_paid values lower than the sale_price? '''
total_orders = one_decimal_data.shape[0]
sale_price_too_high = one_decimal_data[one_decimal_data.total_paid < one_decimal_data.sale_price].shape[0]
print(f"The sale_price is greater than the total_paid for {sale_price_too_high/total_orders*100:.2f}% of the orders. ")

The sale_price is greater than the total_paid for 0.68% of the orders. 


<div class="alert alert-box alert-success">
    This problem appears to apply to a very small percentage of the data and, for now, we will assume that the simplicest way to deal with this problem is to delete these rows.
</div>

#### Problem 2: regular_price < sale_price

<div class="alert alert-box alert-danger">
    There are orderlines where the regular_price is lower than the sale_price.
</div>

In [15]:
one_decimal_data[one_decimal_data.order_id.isin([267375])]

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,sale_price,discount_percentage
82,267375,1140087,2017-01-10 22:48:08,"Pure Nude Ultraslim 03 ""7/8 Transparent iPhone...",transparent and flexible cover with 03mm thick...,Puro,PUR0150,Accessories,17.98,1,12.95,12.99,-0.30888


In [16]:
''' What percentage of the data has regular_price values lower than the sale_price? '''
total_orders = one_decimal_data.shape[0]
sale_price_too_high = one_decimal_data[one_decimal_data.regular_price < one_decimal_data.sale_price].shape[0]
print(f"The sale_price is greater than the regular_price for {sale_price_too_high/total_orders*100:.2f}% of the orders. ")

The sale_price is greater than the regular_price for 3.30% of the orders. 


In [17]:
''' 
It appears that for many of these cases the difference is minor, as seen above. 
Let's round the values and see if it makes a difference. 
'''

sale_price_greater_than_regular = one_decimal_data[one_decimal_data.regular_price < one_decimal_data.sale_price].copy()

sale_price_greater_than_regular.sale_price = sale_price_greater_than_regular.sale_price.round()
sale_price_greater_than_regular.regular_price = sale_price_greater_than_regular.regular_price.round()
sale_price_greater_than_regular.total_paid = sale_price_greater_than_regular.total_paid.round()

rounded_sale_price_too_high = sale_price_greater_than_regular[sale_price_greater_than_regular.regular_price < sale_price_greater_than_regular.sale_price].shape[0]
print(f"The sale_price is greater than the regular_price for {rounded_sale_price_too_high/total_orders*100:.2f}% of the orders. ")

The sale_price is greater than the regular_price for 1.18% of the orders. 


In [18]:
'''
There are orderlines where there is a significant difference between the regular_price and sale_price.
It seems that the regular_price saved in products.csv has changed over time.
'''
sale_price_greater_than_regular[sale_price_greater_than_regular.regular_price < sale_price_greater_than_regular.sale_price].head()

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,sale_price,discount_percentage
272,299829,1119702,2017-01-01 17:50:48,LaCie Porsche Design Desktop Drive 8TB USB 3.0...,External Hard Drive 8TB 35-inch USB 3.0 for Ma...,LaCie,LAC0171,Memory,282.0,1,250.0,275.0,-10.200408
296,299909,1119872,2017-01-01 19:50:09,LaCie Porsche Design Desktop Drive 5TB USB 3.0...,External Hard Drive 5TB 35-inch USB 3.0 for Ma...,LaCie,LAC0159,Memory,184.0,1,175.0,177.0,-0.971484
490,300464,1121064,2017-01-02 12:09:42,Withings Activite Steel Black Dial White,Smart Clock for iPhone and iPad,Withings,WIT0034,Smartwatch,153.0,1,130.0,153.0,-17.729896
506,300497,1121164,2017-01-02 12:44:12,Full screen repair iPhone 5,Repair service including parts and labor for i...,Repair,REP0134,Repairs & warranties,100.0,1,60.0,90.0,-50.008335
679,300941,1122210,2017-01-02 20:08:58,Withings Activite Steel Black Dial White,Smart Clock for iPhone and iPad,Withings,WIT0034,Smartwatch,153.0,1,130.0,153.0,-17.729896


<div class="alert alert-box alert-success">
    This solution seems to have worked relatively well.
    <br><br>
    It appears that for the remaining values the regular price has changed since the sale took place.
</div>

#### Problem 3: regular_price changing over time

<div class="alert alert-box alert-info">
    We still have to solve the problem of the regular_price being less than the sale_price in some instances, so that the discount percentage can be calculated correctly.
    <br>
    Let's check the values above and see if the regular prices have changed over time.
</div>

In [19]:
'''
LAC0159 - 16 rows, only the first value is incorrect. The sale_price is €1.70 greater than the regular_price
WIT0034 - 2 rows, total_paid = sale_price
REP0134 - 3 rows, total_paid = sale_price + 9.99 (shipping fee)
WIT0034 - 2 rows, total_paid = sale_price

LAC0171 - 108 rows, the first 6 rows have sale_price > regular_price. After that all the sale_prices are below the regular price, €249.99.
          However, the sale_price values in the first 6 rows (shown below) vary between €302.99 and €269.79.
          How do we know if €302.9 was the regular_price at this time and the other 5 values represent discount sales?
          The 4th, 5th and 6th rows have a sale_price of €283.99. 
          This could signify that the data is not corrupted but rather that this was a discount or a price change.
'''

completed_sales[completed_sales.sku.isin(['LAC0171'])].sort_values('date').head(10)

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,sale_price
272,299829,1119702,2017-01-01 17:50:48,LaCie Porsche Design Desktop Drive 8TB USB 3.0...,External Hard Drive 8TB 35-inch USB 3.0 for Ma...,LaCie,LAC0171,Memory,282.48,1,249.99,275.49
3085,308551,1143957,2017-01-13 10:00:40,LaCie Porsche Design Desktop Drive 8TB USB 3.0...,External Hard Drive 8TB 35-inch USB 3.0 for Ma...,LaCie,LAC0171,Memory,309.98,1,249.99,302.99
6920,318532,1168266,2017-01-31 23:10:16,LaCie Porsche Design Desktop Drive 8TB USB 3.0...,External Hard Drive 8TB 35-inch USB 3.0 for Ma...,LaCie,LAC0171,Memory,276.78,1,249.99,269.79
6970,318707,1168710,2017-02-01 10:38:02,LaCie Porsche Design Desktop Drive 8TB USB 3.0...,External Hard Drive 8TB 35-inch USB 3.0 for Ma...,LaCie,LAC0171,Memory,290.98,1,249.99,283.99
7215,319407,1170172,2017-02-02 23:49:53,LaCie Porsche Design Desktop Drive 8TB USB 3.0...,External Hard Drive 8TB 35-inch USB 3.0 for Ma...,LaCie,LAC0171,Memory,325.97,1,249.99,283.99
7218,319416,1170189,2017-02-03 00:34:15,LaCie Porsche Design Desktop Drive 8TB USB 3.0...,External Hard Drive 8TB 35-inch USB 3.0 for Ma...,LaCie,LAC0171,Memory,295.98,1,249.99,283.99
12453,351854,1239836,2017-05-02 14:15:50,LaCie Porsche Design Desktop Drive 8TB USB 3.0...,External Hard Drive 8TB 35-inch USB 3.0 for Ma...,LaCie,LAC0171,Memory,242.98,1,249.99,235.99
12476,351938,1240025,2017-05-02 18:17:42,LaCie Porsche Design Desktop Drive 8TB USB 3.0...,External Hard Drive 8TB 35-inch USB 3.0 for Ma...,LaCie,LAC0171,Memory,235.99,1,249.99,235.99
12493,351987,1240131,2017-05-02 20:07:28,LaCie Porsche Design Desktop Drive 8TB USB 3.0...,External Hard Drive 8TB 35-inch USB 3.0 for Ma...,LaCie,LAC0171,Memory,235.99,1,249.99,235.99
12682,352663,1241533,2017-05-04 14:00:16,LaCie Porsche Design Desktop Drive 8TB USB 3.0...,External Hard Drive 8TB 35-inch USB 3.0 for Ma...,LaCie,LAC0171,Memory,235.99,1,249.99,235.99


<div class="alert alert-box alert-danger">
    We will have to examine the amount of data this problem affects after fixing as many of the prices as we can.
    <br>
    We can then decide if we wish to adjust the values or delete the orders entirely.
</div>

#### Problem 4: Incorrectly placed decimal points

In [20]:
'''
There are values where the regular_price has been corrupted.
'''
one_decimal_data[one_decimal_data.order_id.isin([496339])]

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,sale_price,discount_percentage
53601,496339,1574664,2018-01-22 20:18:56,Fibaro Single Switch embedded module,embedded module to control your consumption ou...,Fibaro,FIB0011,Accessories,55.98,1,599.918,56.99,90.500368


In [21]:
''' Let's refresh the data and round the values so that we can detect outliers using the discount_percentage '''

orderline_ids = single_orderline_single_product[
    (single_orderline_single_product.total_paid_decimal_count == 1) &
    (single_orderline_single_product.regular_price_decimal_count == 1) &
    (single_orderline_single_product.sale_price_decimal_count == 1)
].orderline_id

one_decimal_data = single_orderline_single_product[single_orderline_single_product.orderline_id.isin(orderline_ids)].copy()
one_decimal_data[['regular_price', 'sale_price']] = one_decimal_data[['regular_price', 'sale_price']].astype('float')
one_decimal_data[['total_paid', 'regular_price', 'sale_price']] = one_decimal_data[['total_paid', 'regular_price', 'sale_price']].round()
one_decimal_data.drop(['total_paid_decimal_count', 'regular_price_decimal_count', 'sale_price_decimal_count'], axis=1, inplace=True)

# Add a discount percentage col to detect incorrect values and outliers
one_decimal_data = add_discount_percentage_col(one_decimal_data)

# Tests
sale_greater_than_regular_orderline_ids = pdt.test_regular_greater_or_equal_to_sale(one_decimal_data)
sale_greater_than_total_orderline_ids = pdt.test_total_greater_or_equal_to_sale(one_decimal_data)

# Outliers
outliers = check_for_outliers(one_decimal_data)
print(f"There are {outliers.shape[0]} outliers")

There are corrupted values in regular_price which are less than their corresponding sale_price values.
This respresents 1.18% of the data.

There are corrupted values in total_paid which are less than their corresponding sale_price values.
This respresents 0.47% of the data.

There are 304 outliers


In [22]:
''' A lot of these values clearly have the decimal point one place too far to the left in regular_price.'''
outliers.sort_values('discount_percentage', ascending=False).head()

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,sale_price,discount_percentage
53630,496421,1574841,2018-01-22 22:35:46,Open - Piece Internal Battery iPhone 6,Replacement internal battery for Apple iPhone 6,Replacement,PIE0034-A,Accessories,16.0,1,350.0,12.0,96.571429
19152,376976,1291907,2017-07-13 18:00:53,Crucial memory Mac 8GB (2x4GB) SO-DIMM DDR3 16...,8GB RAM (2x4GB) SO-DIMM 1600MHZ iMac and Macbo...,Crucial,CRU0025-2,unknown,85.0,1,980.0,65.0,93.367347
24270,393603,1326648,2017-08-28 19:19:14,Open - Elgato Eve and Energy Plug Power Wirele...,Control on and off and sensor power consumptio...,Elgato,ELG0033-A,Smartwatch,37.0,1,500.0,37.0,92.6
24453,394043,1327664,2017-08-29 21:04:10,Open - Elgato Eve and Energy Plug Power Wirele...,Control on and off and sensor power consumptio...,Elgato,ELG0033-A,Smartwatch,37.0,1,500.0,37.0,92.6
24293,393659,1326788,2017-08-28 22:31:30,Open - Elgato Eve and Energy Plug Power Wirele...,Control on and off and sensor power consumptio...,Elgato,ELG0033-A,Smartwatch,37.0,1,500.0,37.0,92.6


<div class="alert alert-block alert-danger">
    After visually examining all the values in the above dataframe (shown truncated), it is clear that the decimal point in these regular_price values has been placed one point too far to the right.
    <br><br>
    This is fixable.
</div>

In [55]:
def _move_decimal_right(row, decimal_col, comparison_col):
    denominator = 10
    while row[decimal_col] >= row[comparison_col]:
        if row[decimal_col] > row[comparison_col]:
            row[decimal_col] /= denominator
        else:
            return row[decimal_col]
    return row[decimal_col] * denominator

def move_regular_price_decimal_point_right_wrt_sale_price(df):
    df.regular_price = [_move_decimal_right(row, 'regular_price', 'sale_price') for index, row in df.iterrows()]
    return df

test_data = outliers.sort_values('discount_percentage', ascending=False).head(5).copy()
test_data = move_regular_price_decimal_point_right_wrt_sale_price(test_data)
test_data

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,sale_price,discount_percentage
10498,345593,1226805,2017-04-12 09:29:58,Griffin Survivor Case All-Terrain iPad mini 1/...,An ultra resistant with 360 degree protection ...,Griffin,GRT0315,Accessories,14.0,1,50.0,10.0,80.016003
10486,345552,1226737,2017-04-12 08:51:20,Griffin Survivor Case All-Terrain iPad mini 1/...,An ultra resistant with 360 degree protection ...,Griffin,GRT0315,Accessories,15.0,1,50.0,10.0,80.016003
10489,345561,1226751,2017-04-12 09:00:00,Griffin Survivor Case All-Terrain iPad mini 1/...,An ultra resistant with 360 degree protection ...,Griffin,GRT0315,Accessories,14.0,1,50.0,10.0,80.016003
10490,345562,1226752,2017-04-12 09:02:15,Griffin Survivor Case All-Terrain iPad mini 1/...,An ultra resistant with 360 degree protection ...,Griffin,GRT0315,Accessories,14.0,1,50.0,10.0,80.016003
10491,345563,1226754,2017-04-12 09:03:02,Griffin Survivor Case All-Terrain iPad mini 1/...,An ultra resistant with 360 degree protection ...,Griffin,GRT0315,Accessories,14.0,1,50.0,10.0,80.016003


<div class="alert alert-box alert-danger">
    Oops! In lines above we can see that the regular_price has been corrupted and should clearly be slightly lower than the sale_price rather than much greater.
    <br><br>
    Let's edit out logic above to keep moving the decimal point until the smallest possible discount_percentage is found.
    <br><br>
    Later we can examine the data and decide whether it makes sense to set the regular_price equal to the sale_price or delete the rows.
</div>

In [56]:
def _calculate_percentage_difference(row, decimal_col, comparison_col):
    return (row[decimal_col] - row[comparison_col])/row[decimal_col]*100

def _move_decimal_right(row, decimal_col, comparison_col):
    denominator = 10

    while True:
        new_row = row.copy()
        new_row[decimal_col] /= denominator
        new_row.discount_percentage = _calculate_percentage_difference(new_row, decimal_col, comparison_col)
             
        # Check if the new percentage difference is closer to 0
        if abs(new_row.discount_percentage) > abs(row.discount_percentage):
            return row
        else:
            row = new_row
    
def move_regular_price_decimal_point_right_wrt_sale_price(df):
    df = df.apply(lambda row: _move_decimal_right(row, 'regular_price', 'sale_price'), axis=1)
    return df

test_data = outliers.sort_values('discount_percentage', ascending=False).head(5).copy()
test_data = move_regular_price_decimal_point_right_wrt_sale_price(test_data)
test_data

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,sale_price,discount_percentage
10498,345593,1226805,2017-04-12 09:29:58,Griffin Survivor Case All-Terrain iPad mini 1/...,An ultra resistant with 360 degree protection ...,Griffin,GRT0315,Accessories,14.0,1,50.0,10.0,80.016003
10486,345552,1226737,2017-04-12 08:51:20,Griffin Survivor Case All-Terrain iPad mini 1/...,An ultra resistant with 360 degree protection ...,Griffin,GRT0315,Accessories,15.0,1,50.0,10.0,80.016003
10489,345561,1226751,2017-04-12 09:00:00,Griffin Survivor Case All-Terrain iPad mini 1/...,An ultra resistant with 360 degree protection ...,Griffin,GRT0315,Accessories,14.0,1,50.0,10.0,80.016003
10490,345562,1226752,2017-04-12 09:02:15,Griffin Survivor Case All-Terrain iPad mini 1/...,An ultra resistant with 360 degree protection ...,Griffin,GRT0315,Accessories,14.0,1,50.0,10.0,80.016003
10491,345563,1226754,2017-04-12 09:03:02,Griffin Survivor Case All-Terrain iPad mini 1/...,An ultra resistant with 360 degree protection ...,Griffin,GRT0315,Accessories,14.0,1,50.0,10.0,80.016003


<div class="alert alert-block alert-success">
    Problem solved.
</div>

In [57]:
one_decimal_data.head()

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,sale_price,discount_percentage
0,241423,1398738,2017-11-06 12:47:20,LaCie Porsche Design Desktop Drive 4TB USB 3.0...,External Hard Drive 4TB 35-inch USB 3.0 for Ma...,LaCie,LAC0212,Memory,136.0,1,140.0,129.0,7.736267
1,242832,1529178,2017-12-31 17:26:40,Parrot 550mAh battery for MiniDrones,550mAh rechargeable battery for Parrot minidrones,Parrot,PAR0074,Accessories,16.0,1,18.0,11.0,40.133407
2,243330,1181923,2017-02-15 17:07:44,Mac OWC Memory 8GB 1066MHZ DDR3 SO-DIMM,8GB RAM Mac mini iMac MacBook and MacBook Pro ...,OWC,OWC0074,unknown,85.0,1,100.0,78.0,22.0022
13,247524,1547886,2018-01-08 21:21:14,Crucial memory Mac 16GB (2x8GB) SO-DIMM DDR3 1...,RAM 16GB (2x8GB) Mac mini (2011) iMac (2010/11...,Crucial,CRU0026-2,unknown,168.0,1,164.0,163.0,0.603732
14,247643,1255453,2017-05-26 11:44:27,Open - Samsung 850 EVO SSD Disk 500GB,Open hard disk with SSD Mac and PC packaging 2...,Samsung,SAM0068-A,Accessories,154.0,1,180.0,154.0,14.695261


In [60]:
''' Check the new logic on the one decimal data and rerun the tests '''

one_decimal_data = single_orderline_single_product[single_orderline_single_product.orderline_id.isin(orderline_ids)].copy()
one_decimal_data[['regular_price', 'sale_price']] = one_decimal_data[['regular_price', 'sale_price']].astype('float')
one_decimal_data.drop(['total_paid_decimal_count', 'regular_price_decimal_count', 'sale_price_decimal_count'], axis=1, inplace=True)

# Calculate the discount percentage
one_decimal_data = add_discount_percentage_col(one_decimal_data)

# Fix the decimal points
one_decimal_data = move_regular_price_decimal_point_right_wrt_sale_price(one_decimal_data)

# Round the prices
one_decimal_data[['total_paid', 'regular_price', 'sale_price']] = one_decimal_data[['total_paid', 'regular_price', 'sale_price']].round()


# Tests
sale_greater_than_regular_orderline_ids = pdt.test_regular_greater_or_equal_to_sale(one_decimal_data)
sale_greater_than_total_orderline_ids = pdt.test_total_greater_or_equal_to_sale(one_decimal_data)

# Outliers
outliers = check_for_outliers(one_decimal_data)
print(f"There are {outliers.shape[0]} outliers")

There are corrupted values in regular_price which are less than their corresponding sale_price values.
This respresents 1.67% of the data.

There are corrupted values in total_paid which are less than their corresponding sale_price values.
This respresents 0.47% of the data.

mean_diff:  21.932714901702933
std_diff:  17.828869114058083
threshold:  75.41932224387719
outliers.shape[0]:  116
There are 116 outliers


In [27]:
outliers.head()

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,sale_price,discount_percentage
268,299815,1119666,2017-01-01 17:30:57,Logitech Ultrathin Keyboard Cover Keyboard Cov...,Ultrathin cover and cover with Bluetooth keybo...,Logitech,LOG0084,Accessories,24.0,1,90.0,19.0,78.897655
271,299825,1119691,2017-01-01 17:44:45,Logitech Ultrathin Keyboard Cover Keyboard Cov...,Ultrathin cover and cover with Bluetooth keybo...,Logitech,LOG0084,Accessories,24.0,1,90.0,19.0,78.897655
315,299991,1120018,2017-01-01 21:18:11,Logitech Ultrathin Keyboard Cover Keyboard Cov...,Ultrathin cover and cover with Bluetooth keybo...,Logitech,LOG0084,Accessories,26.0,1,90.0,19.0,78.897655
489,300463,1121063,2017-01-02 12:09:40,Logitech Ultrathin Keyboard Cover Keyboard Cov...,Ultrathin cover and cover with Bluetooth keybo...,Logitech,LOG0084,Accessories,24.0,1,90.0,20.0,77.786421
571,300682,1121610,2017-01-02 16:07:24,Logitech Ultrathin Keyboard Cover Keyboard Cov...,Ultrathin cover and cover with Bluetooth keybo...,Logitech,LOG0084,Accessories,27.0,1,90.0,20.0,77.786421


In [61]:
def _calculate_percentage_difference2(row, decimal_col, comparison_col):
    return (row[decimal_col] - row[comparison_col])/row[decimal_col]*100

def _move_decimal_right2(row, decimal_col, comparison_col):
    denominator = 10

    while True:
        new_row = row.copy()
        new_row[decimal_col] /= denominator
             
        # Check if the new percentage difference is closer to 0
        if abs(_calculate_percentage_difference2(new_row, decimal_col, comparison_col)) > abs(_calculate_percentage_difference2(row, decimal_col, comparison_col)):
            return row
        else:
            row = new_row
    
def move_regular_price_decimal_point_right_wrt_sale_price2(df):
    df = df.apply(lambda row: _move_decimal_right2(row, 'regular_price', 'sale_price'), axis=1)
    return df


''' Check the new logic on the one decimal data and rerun the tests '''

one_decimal_data2 = single_orderline_single_product[single_orderline_single_product.orderline_id.isin(orderline_ids)].copy()
one_decimal_data2[['regular_price', 'sale_price']] = one_decimal_data2[['regular_price', 'sale_price']].astype('float')
one_decimal_data2.drop(['total_paid_decimal_count', 'regular_price_decimal_count', 'sale_price_decimal_count'], axis=1, inplace=True)

# Fix the decimal points
one_decimal_data2 = move_regular_price_decimal_point_right_wrt_sale_price2(one_decimal_data2)

# Round the prices
one_decimal_data2[['total_paid', 'regular_price', 'sale_price']] = one_decimal_data2[['total_paid', 'regular_price', 'sale_price']].round()

# Calculate the discount percentage
one_decimal_data2 = add_discount_percentage_col(one_decimal_data2)


# Tests
sale_greater_than_regular_orderline_ids = pdt.test_regular_greater_or_equal_to_sale(one_decimal_data2)
sale_greater_than_total_orderline_ids = pdt.test_total_greater_or_equal_to_sale(one_decimal_data2)

# Outliers
outliers2 = check_for_outliers(one_decimal_data2)
print(f"There are {outliers2.shape[0]} outliers")

There are corrupted values in regular_price which are less than their corresponding sale_price values.
This respresents 1.67% of the data.

There are corrupted values in total_paid which are less than their corresponding sale_price values.
This respresents 0.47% of the data.

mean_diff:  21.968633244382833
std_diff:  17.814879786815826
threshold:  75.41327260483031
outliers.shape[0]:  93
There are 93 outliers


<div class="alert alert-block alert-success">
    Success!
    <br><br>
    After visually evaluating the outliers they appear to just be products with very large discounts.
</div>

In [63]:
one_decimal_data.discount_percentage = one_decimal_data.discount_percentage.round(2)
one_decimal_data2.discount_percentage = one_decimal_data2.discount_percentage.round(2)

one_decimal_data[one_decimal_data.discount_percentage != one_decimal_data2.discount_percentage]

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,sale_price,discount_percentage
0,241423,1398738,2017-11-06 12:47:20,LaCie Porsche Design Desktop Drive 4TB USB 3.0...,External Hard Drive 4TB 35-inch USB 3.0 for Ma...,LaCie,LAC0212,Memory,136.0,1,140.0,129.0,7.74
1,242832,1529178,2017-12-31 17:26:40,Parrot 550mAh battery for MiniDrones,550mAh rechargeable battery for Parrot minidrones,Parrot,PAR0074,Accessories,16.0,1,18.0,11.0,40.13
13,247524,1547886,2018-01-08 21:21:14,Crucial memory Mac 16GB (2x8GB) SO-DIMM DDR3 1...,RAM 16GB (2x8GB) Mac mini (2011) iMac (2010/11...,Crucial,CRU0026-2,unknown,168.0,1,164.0,163.0,0.60
14,247643,1255453,2017-05-26 11:44:27,Open - Samsung 850 EVO SSD Disk 500GB,Open hard disk with SSD Mac and PC packaging 2...,Samsung,SAM0068-A,Accessories,154.0,1,180.0,154.0,14.70
43,256158,1311954,2017-08-05 19:46:32,My Cloud EX2 Ultra Pack | WD 6TB Network,WD My Cloud EX2 Ultra + 6TB (2x3TB) Network WD...,Pack,PAC1450,Memory,386.0,1,428.0,386.0,9.77
...,...,...,...,...,...,...,...,...,...,...,...,...,...
61621,526777,1648980,2018-03-14 01:53:54,Mac memory OWC 8GB (2x4GB) SO-DIMM DDR3 1066MHZ,RAM 8GB (2x4GB) Mac mini iMac MacBook and MacB...,OWC,OWC0035-2,unknown,91.0,1,88.0,87.0,1.13
61623,526804,1649025,2018-03-14 08:39:35,Apple iPad Wi-Fi 32GB Silver,New iPad Wi-Fi 32GB Silver (MP2G2TY / A),Apple,APP1974,iPad,391.0,1,403.0,386.0,4.17
61624,526816,1649042,2018-03-14 09:26:27,Macally Lampcharge Lamp with 4 USB ports,Table lamp with touch control and 4 USB charge...,Macally,MAC0144,Hardware,30.0,1,40.0,25.0,37.45
61625,526843,1649106,2018-03-14 10:39:18,Open - LaCie Porsche Design Desktop Lightgrey ...,External Hard Drive Refurbished 35 inches 4TB ...,LaCie,LAC0206-A,Accessories,113.0,1,140.0,108.0,22.69


In [64]:
one_decimal_data2[one_decimal_data2.order_id.isin([241423])]

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,sale_price,discount_percentage
0,241423,1398738,2017-11-06 12:47:20,LaCie Porsche Design Desktop Drive 4TB USB 3.0...,External Hard Drive 4TB 35-inch USB 3.0 for Ma...,LaCie,LAC0212,Memory,136.0,1,140.0,129.0,7.86


In [65]:
completed_sales[completed_sales.order_id.isin([241423])]

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,sale_price
0,241423,1398738,2017-11-06 12:47:20,LaCie Porsche Design Desktop Drive 4TB USB 3.0...,External Hard Drive 4TB 35-inch USB 3.0 for Ma...,LaCie,LAC0212,Memory,136.15,1,139.99,129.16


In [66]:
(139.99-129.16)/140*100

7.857142857142857

In [52]:
'''

mean_diff = df['discount_percentage'].mean()
std_diff = df['discount_percentage'].std()
threshold = mean_diff + 3 * std_diff

outliers = df[df['discount_percentage'] > threshold]

'''

#one_decimal_data = add_discount_percentage_col(one_decimal_data)
outliers = check_for_outliers(one_decimal_data)
print(f"There are {outliers.shape[0]} outliers")
#one_decimal_data['discount_percentage'].mean(), one_decimal_data2['discount_percentage'].mean()

#one_decimal_data['discount_percentage'].std(), one_decimal_data2['discount_percentage'].std()

one_decimal_data['discount_percentage'].mean() + 3 * one_decimal_data['discount_percentage'].std(), one_decimal_data2['discount_percentage'].mean() + 3 * one_decimal_data2['discount_percentage'].std()

There are 116 outliers


(75.41932224387719, 75.41327260483031)

In [30]:
one_decimal_data.head()

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,sale_price,discount_percentage
0,241423,1398738,2017-11-06 12:47:20,LaCie Porsche Design Desktop Drive 4TB USB 3.0...,External Hard Drive 4TB 35-inch USB 3.0 for Ma...,LaCie,LAC0212,Memory,136.0,1,140.0,129.0,7.857143
1,242832,1529178,2017-12-31 17:26:40,Parrot 550mAh battery for MiniDrones,550mAh rechargeable battery for Parrot minidrones,Parrot,PAR0074,Accessories,16.0,1,18.0,11.0,38.888889
2,243330,1181923,2017-02-15 17:07:44,Mac OWC Memory 8GB 1066MHZ DDR3 SO-DIMM,8GB RAM Mac mini iMac MacBook and MacBook Pro ...,OWC,OWC0074,unknown,85.0,1,100.0,78.0,22.0
13,247524,1547886,2018-01-08 21:21:14,Crucial memory Mac 16GB (2x8GB) SO-DIMM DDR3 1...,RAM 16GB (2x8GB) Mac mini (2011) iMac (2010/11...,Crucial,CRU0026-2,unknown,168.0,1,164.0,163.0,0.609756
14,247643,1255453,2017-05-26 11:44:27,Open - Samsung 850 EVO SSD Disk 500GB,Open hard disk with SSD Mac and PC packaging 2...,Samsung,SAM0068-A,Accessories,154.0,1,180.0,154.0,14.444444


In [53]:
# Drop 'discount_percentage' from both dataframes for comparison
df1 = one_decimal_data.drop(columns=['discount_percentage'])
df2 = one_decimal_data2.drop(columns=['discount_percentage'])

# Compare the two dataframes and find rows that are different
# This will return a boolean dataframe indicating which rows are different
diff = (df1 != df2)

# Get the rows where any of the values differ (using .any(axis=1) to check row-wise)
rows_with_diff = diff.any(axis=1)

# Extract the rows from the original dataframes that are different
different_rows_in_df1 = one_decimal_data[rows_with_diff]
different_rows_in_df2 = one_decimal_data2[rows_with_diff]

# If you want to view them side-by-side:
different_rows = pd.concat([different_rows_in_df1, different_rows_in_df2], axis=1, keys=['df1', 'df2'])

different_rows


Unnamed: 0_level_0,df1,df1,df1,df1,df1,df1,df1,df1,df1,df1,...,df2,df2,df2,df2,df2,df2,df2,df2,df2,df2
Unnamed: 0_level_1,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,...,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,sale_price,discount_percentage


In [43]:
unique_in_outliers = outliers[~outliers['order_id'].isin(outliers2['order_id'])]
unique_in_outliers.sort_values('discount_percentage', ascending=False).head()

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,sale_price,discount_percentage
35919,434733,1430389,2017-11-23 20:55:10,Zagg Folio Case Color Keyboard Folio iPad Air ...,Case with Bluetooth Keyboard for iPad Air 2 Sp...,ZaggKeys,ZAG0019,Accessories,24.0,1,80.0,20.0,75.571946
36286,435925,1430551,2017-11-23 21:24:38,NewerTech NuGuard KX iPad Air Case Black,IPad case ultra resistant to extreme condition...,NewerTech,NTE0054,Accessories,25.0,1,80.0,20.0,75.571946
26703,403407,1359312,2017-09-22 20:31:18,NewerTech NuGuard KX iPhone Case SE / 5s / 5 B...,IPhone Case SE / 5s / 5 ultra resistant for ex...,NewerTech,NTE0055,Accessories,19.0,1,61.0,15.0,75.4222
42038,454645,1475132,2017-12-03 21:48:16,NewerTech NuGuard KX iPhone Case SE / 5s / 5 B...,IPhone Case SE / 5s / 5 ultra resistant for ex...,NewerTech,NTE0055,Accessories,22.0,1,61.0,15.0,75.4222
57823,511963,1614792,2018-02-15 11:47:24,NewerTech NuGuard KX iPhone Case SE / 5s / 5 B...,IPhone Case SE / 5s / 5 ultra resistant for ex...,NewerTech,NTE0055,Accessories,20.0,1,61.0,15.0,75.4222


In [44]:
completed_sales[completed_sales.order_id.isin([434733])]

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,sale_price
35919,434733,1430389,2017-11-23 20:55:10,Zagg Folio Case Color Keyboard Folio iPad Air ...,Case with Bluetooth Keyboard for iPad Air 2 Sp...,ZaggKeys,ZAG0019,Accessories,23.53,1,79.99,19.54


In [45]:
one_decimal_data[one_decimal_data.order_id.isin([434733])]

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,sale_price,discount_percentage
35919,434733,1430389,2017-11-23 20:55:10,Zagg Folio Case Color Keyboard Folio iPad Air ...,Case with Bluetooth Keyboard for iPad Air 2 Sp...,ZaggKeys,ZAG0019,Accessories,24.0,1,80.0,20.0,75.571946


In [46]:
one_decimal_data2[one_decimal_data2.order_id.isin([434733])]

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,sale_price,discount_percentage
35919,434733,1430389,2017-11-23 20:55:10,Zagg Folio Case Color Keyboard Folio iPad Air ...,Case with Bluetooth Keyboard for iPad Air 2 Sp...,ZaggKeys,ZAG0019,Accessories,24.0,1,80.0,20.0,75.0


In [48]:
outliers[outliers.order_id.isin([434733])]

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,sale_price,discount_percentage
35919,434733,1430389,2017-11-23 20:55:10,Zagg Folio Case Color Keyboard Folio iPad Air ...,Case with Bluetooth Keyboard for iPad Air 2 Sp...,ZaggKeys,ZAG0019,Accessories,24.0,1,80.0,20.0,75.571946


<div class="alert alert-block alert-danger">
    Except the values above where the total_paid decimal point position has been corrupted.
</div>

#### Problem 5: total_paid < sale_price

<div class="alert alert-block alert-info">
    Now let's see if we can fix the total_paid < sale_price values
</div>

In [35]:
one_decimal_data[one_decimal_data.order_id.isin([304373, 311465, 281302, 303444, 310031, 274043, 307156, 309296])]

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,sale_price,discount_percentage
92,274043,1121663,2017-01-02 16:30:49,Griffin Magnetic BreakSafe-C USB charging cabl...,Charging cable and connector magnetic fast lib...,Griffin,GRT0425,Accessories,4.0,1,35.0,25.0,28.571429
111,281302,1142023,2017-01-12 01:00:05,Startech USB Adapter VGA-C Blanco,Adapter with reversible USB connection VGA-C (...,Startech,STA0036,Accessories,17.0,1,54.0,30.0,44.444444
1421,303444,1131913,2017-01-06 01:01:47,Samsung Pro + SDHC UHS Class 3 | 32GB,SDHC Memory Card U3 / UHS-I speed of 95MB / 90MB,Samsung,SAM0105,Memory,9.0,1,52.0,29.0,44.230769
1652,304373,1134013,2017-01-07 18:27:45,Jawbone UP3 Activity Monitor Black,Bluetooth activity monitor recorded sleep data...,Jawbone,JAW0040,Hardware,49.0,1,180.0,50.0,72.222222
2584,307156,1140724,2017-01-11 11:16:20,Travelstar Kingston Pen Drive 3C Duo micro USB...,64GB pendrive reversible USB-C to USB for Mac ...,Kingston,KIN0123,Accessories,3.0,1,30.0,24.0,20.0
3357,309296,1145769,2017-01-14 11:58:50,Travelstar Kingston Pen Drive 3C Duo micro USB...,64GB pendrive reversible USB-C to USB for Mac ...,Kingston,KIN0123,Accessories,4.0,1,30.0,24.0,20.0
3586,310031,1147316,2017-01-15 23:02:54,"21Mochila Thule Crossover MacBook Pro 15 ""Black",Lightweight waterproof backpack with several c...,Thule,THU0025,Accessories,45.0,1,100.0,70.0,30.0
4201,311465,1150949,2017-01-18 10:34:51,Pebble Smartwatch Time Steel Black,Bluetooth Smart Watch with steel case leather ...,Pebble,PEB0015,Smartwatch,121.0,1,300.0,126.0,58.0


In [36]:
one_decimal_data[one_decimal_data.orderline_id.isin(sale_greater_than_total_orderline_ids)].sort_values('discount_percentage', ascending=False)

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,sale_price,discount_percentage
1652,304373,1134013,2017-01-07 18:27:45,Jawbone UP3 Activity Monitor Black,Bluetooth activity monitor recorded sleep data...,Jawbone,JAW0040,Hardware,49.0,1,180.0,50.0,72.222222
4201,311465,1150949,2017-01-18 10:34:51,Pebble Smartwatch Time Steel Black,Bluetooth Smart Watch with steel case leather ...,Pebble,PEB0015,Smartwatch,121.0,1,300.0,126.0,58.0
5634,315121,1159985,2017-01-26 12:55:19,Crucial Mac Memory 32GB (2x16GB) DIMM DDR3 186...,32GB RAM (2x16GB) Mac Pro 2013.,Crucial,CRU0038-2,unknown,254.0,1,470.0,255.0,45.744681
111,281302,1142023,2017-01-12 01:00:05,Startech USB Adapter VGA-C Blanco,Adapter with reversible USB connection VGA-C (...,Startech,STA0036,Accessories,17.0,1,54.0,30.0,44.444444
6442,317342,1165077,2017-01-30 11:18:12,Lexar JumpDrive Lightning USB 3.0 M20i Pendriv...,Pendrive USB 3.0 connector lightning and 64GB ...,Lexar,LEX0028,Accessories,49.0,1,90.0,50.0,44.444444
1421,303444,1131913,2017-01-06 01:01:47,Samsung Pro + SDHC UHS Class 3 | 32GB,SDHC Memory Card U3 / UHS-I speed of 95MB / 90MB,Samsung,SAM0105,Memory,9.0,1,52.0,29.0,44.230769
2888,308018,1142744,2017-01-12 15:05:33,"Macally External Hard Drive 1TB 25 ""USB 3.0",External Hard Drive 1TB 5400rpm aluminum housi...,Pack,PAC1913,Accessories,58.0,1,99.0,59.0,40.40404
2789,307736,1142070,2017-01-12 02:58:33,"Open - Dell P2415Q 238 ""4K IPS miniDP DP DP HD...",238 inch 4K monitor IPS optimal resolution for...,Dell,DLL0021-A,Accessories,455.0,1,702.0,460.0,34.472934
6281,316828,1163703,2017-01-29 22:10:49,Open - IK Multimedia iRig Keys USB controller ...,Mini keyboard with 25 keys for Mac iPhone iPad...,IK Multimedia,IKM0045-A,Accessories,47.0,1,73.0,48.0,34.246575
5167,313954,1157089,2017-01-23 18:44:47,JBL Flip 3 Bluetooth Speaker Black,Bluetooth wireless speaker for iPhone iPad and...,JBL,JBL0107,Hardware,82.0,1,130.0,87.0,33.076923


In [37]:
''' Add the new logic, recalculate the discount percentages and rerun the tests, again. '''




one_decimal_data = single_orderline_single_product[single_orderline_single_product.orderline_id.isin(orderline_ids)].copy()
one_decimal_data[['regular_price', 'sale_price']] = one_decimal_data[['regular_price', 'sale_price']].astype('float')
one_decimal_data[['total_paid', 'regular_price', 'sale_price']] = one_decimal_data[['total_paid', 'regular_price', 'sale_price']].round()
one_decimal_data.drop(['total_paid_decimal_count', 'regular_price_decimal_count', 'sale_price_decimal_count'], axis=1, inplace=True)

# Fix the decimal points
one_decimal_data = move_regular_price_decimal_point_right_wrt_sale_price(one_decimal_data)

# Add a discount percentage col to detect incorrect values and outliers
one_decimal_data = add_discount_percentage_col(one_decimal_data)

# Tests
sale_greater_than_regular_orderline_ids = pdt.test_regular_greater_or_equal_to_sale(one_decimal_data)
sale_greater_than_total_orderline_ids = pdt.test_total_greater_or_equal_to_sale(one_decimal_data)

# Outliers
outliers = check_for_outliers(one_decimal_data)
print(f"There are {outliers.shape[0]} outliers")

AttributeError: 'Series' object has no attribute 'discount_percentage'

In [None]:
sale_price_greater_than_regular[(sale_price_greater_than_regular.regular_price < sale_price_greater_than_regular.sale_price) & (sale_price_greater_than_regular.total_paid < sale_price_greater_than_regular.sale_price)]

# Check how rounding the values affects the data
problem 1: sale_price > total_paid - check how many rows, ignore for now?

problem 2: regular_price < sale_price 
- Round all values
- For the other ones, check if the regular price changed over time and then probably set regular_price equal to sale_price
- Calculate the discount percentage after rounding and use negative discounts to identify these?

<div class="alert alert-box alert-info">
    Let's split the data at the decimal point and examine whether we can determine any patterns by the number of digits after the decimal point.
</div>

## Decimal count = 1 = 0 = 1

<div class="alert alert-box alert-info">
    total_paid_decimal_count==1  & regular_price_decimal_count==0 & sale_price_decimal_count==1
</div>

In [None]:
1. Turn them into floats

In [None]:
order_ids = prices[
((prices.regular_price_decimal_count == 0 )| (prices.regular_price_decimal_count == 1)) &
((prices.promo_price_decimal_count == 0) | (prices.promo_price_decimal_count == 1)) &
((prices.sale_price_decimal_count == 0) | (prices.sale_price_decimal_count == 1)) 
].orderline_id

zero_one_decimal_data = completed_sales[completed_sales.orderline_id.isin(order_ids)].copy()
zero_one_decimal_data

## Decimal count == 0 

<div class="alert alert-box alert-info">
    regular_price_decimal_count == promo_price_decimal_count == sale_price_decimal_count == 0
</div>
<div class="alert alert-box alert-warning">
    There are no sale_price values with zero decimal points.
    <br>
    The regular_price and promo_price values with no decimal points appear to have been accidentally saved as ints instead of floats.
    <br>
    Let's select only these values and see if this logic is correct
</div>


In [None]:
order_ids = prices[
((prices.regular_price_decimal_count == 0)) &
((prices.promo_price_decimal_count == 0)) &
((prices.sale_price_decimal_count == 0) | (prices.sale_price_decimal_count == 1)) 
].orderline_id

zero_decimal_count = completed_sales[completed_sales.orderline_id.isin(order_ids)].copy()
zero_decimal_count

In [None]:
# Transform the three columns to floats
zero_decimal_count[['regular_price', 'promo_price', 'sale_price']] = zero_decimal_count[['regular_price', 'promo_price', 'sale_price']].astype('float')

# Run the tests
incorrect_reg_pro_ids = pdt.test_regular_greater_or_equal_to_promo(zero_decimal_count)

In [None]:
display(zero_decimal_count[zero_decimal_count.regular_price >= zero_decimal_count.promo_price].head())

<div class="alert alert-box alert-success">
    After viewing the entire df (shown above in truncated form) we can conclude that our assumption is basically correct.
</div>

<div class="alert alert-box alert-danger">
    However, some of the sale_price values still do not match the regular_price or the promo_price.
    <br>
    <br>
    Furthermore, even though the total_price values only ever had one decimal point, for some of them it is clearly in the wrong place.
</div>
<div class="alert alert-box alert-info">
    To clean the sale_price values we must first clean the total_price values as it is sometimes equal to neither regular_price nor promo_price but is close to total_paid.
    <br>
    <br>
    To this end we must select a subset where all the orderlines in the order are within the zero_decimal_count df. 
    <br>
    If the order we are examining has other otherlines which are in completed_sales but not in zero_decimal_count then we cannot be sure we have all the data to confirm the total_paid value is correct.
    <br>
    <br>
    We must check the distribution of total_price values wrt to sale_price, then promo_price and regular_price to determine whether the difference can be explained by shipping costs or whether the decimal point is in the wrong place.
    <br>
    <br>
    We can then analyse the distribution of sale_price values wrt to regular_price and wrt promo_price and examine the outliers.
</div>

In [None]:
''' Filter out orders which do not have all their orderlines in zero_decimal_count'''

# Get the count of orderlines per order from completed_sales
completed_sales_orderline_counts = completed_sales.groupby('order_id')['orderline_id'].count().reset_index()
completed_sales_orderline_counts.rename(columns={'orderline_id': 'completed_count'}, inplace=True)

# Do the same for zero_decimal_count
zero_dec_orderline_counts = zero_decimal_count.groupby('order_id')['orderline_id'].count().reset_index()
zero_dec_orderline_counts.rename(columns={'orderline_id': 'zero_count'}, inplace=True)

# Merge to find orders with matching counts
valid_orders = pd.merge(completed_sales_orderline_counts, zero_dec_orderline_counts, on='order_id')
valid_orders = valid_orders[valid_orders['completed_count'] == valid_orders['zero_count']]

# Filter zero_decimal_count for these valid order_ids
zero_decimal_count__complete_orders = zero_decimal_count[zero_decimal_count['order_id'].isin(valid_orders['order_id'])].copy()

print("zero_decimal_count.shape:", zero_decimal_count.shape)
print("zero_decimal_count_subset.shape:", zero_decimal_count__complete_orders.shape)

In [None]:
valid_orders[valid_orders]

In [None]:
''' Multiply regular_price, promo_price and sale_price by the product_quantity to compare them to the total_paid. '''

zero_decimal_count__complete_orders['regular_price_total'] = zero_decimal_count__complete_orders.regular_price*zero_decimal_count__complete_orders.product_quantity
zero_decimal_count__complete_orders['promo_price_total'] = zero_decimal_count__complete_orders.promo_price*zero_decimal_count__complete_orders.product_quantity
zero_decimal_count__complete_orders['sale_price_total'] = zero_decimal_count__complete_orders.sale_price*zero_decimal_count__complete_orders.product_quantity

''' First check the orders with only one orderline for simplicity '''
zero_decimal_count__complete_orders__single_orderline

In [None]:

incorrect_pro_sale_ids = pdt.test_promo_greater_or_equal_to_sale(zero_decimal_count)

## Evaluate corrupted data with multiple decimal points

### regular_price_decimal_count==1	promo_price_decimal_count==0	sale_price_decimal_count==2

In [None]:
order_ids = prices[
(prices.regular_price_decimal_count == 1) &
(prices.promo_price_decimal_count == 0) &
(prices.sale_price_decimal_count == 2) 
].orderline_id

data = completed_sales[completed_sales.orderline_id.isin(order_ids)].copy()
data

### regular_price_decimal_count==1	promo_price_decimal_count==1	sale_price_decimal_count==2

In [None]:
order_ids = prices[
(prices.regular_price_decimal_count == 1) &
(prices.promo_price_decimal_count == 1) &
(prices.sale_price_decimal_count == 2) 
].orderline_id

data = completed_sales[completed_sales.orderline_id.isin(order_ids)].copy()
data

### sale_price_decimal_count == 2

In [None]:
order_ids = prices[
(prices.sale_price_decimal_count == 2) 
].orderline_id

data = completed_sales[completed_sales.orderline_id.isin(order_ids)].copy()
data

<div class="alert alert-block alert-danger">
    FOR SALE_PRICE_DEC_COUNT: YOU NEED TO SPLIT THE SALE PRICE COLUMN AND FIGURE OUT IF THERE ARE ALWAYS 2 DIGITS AFTER THE DEC AND 2 BETWEEN THE TWO. THEN YOU MIGHT BE ABLE TO FIX THEM ALL IN ONE GO.

</div>

### regular_price_decimal_count == 0

In [None]:
# Let's take a look at the values with zero decimal points. It is possible that these have been incorrectly stored as ints.
reg_price_zero_dec_ids = prices[(prices.regular_price_decimal_count == 0)].orderline_id

zero_dec_test_data = completed_sales[completed_sales.orderline_id.isin(reg_price_zero_dec_ids)].copy()
zero_dec_test_data

In [None]:
pro_price_zero_dec_ids = prices[(prices.promo_price_decimal_count == 0)].orderline_id

zero_dec_test_data = completed_sales[completed_sales.orderline_id.isin(pro_price_zero_dec_ids)].copy()
zero_dec_test_data

In [None]:
reg_pro_price_zero_dec_ids = prices[(prices.regular_price_decimal_count == 0) & (prices.promo_price_decimal_count == 0)].orderline_id

zero_dec_test_data = completed_sales[completed_sales.orderline_id.isin(zero_dec_ids)].copy()
zero_dec_test_data

<div class="alert alert-block alert-success">
    The
</div>

### Check sale_price

In [None]:
temp = prices.copy()
temp[temp.sale_price_decimal_count!=1]

In [None]:
temp = (
    temp.assign(
        sale_price_split = lambda x: x['sale_price'].str.split(r'\.'),
        decimal_places = lambda x: len(x['sale_price_split'][1]))
)
temp.head(5)

In [None]:
temp[temp.decimal_places != 2]

### Transform orderlines.unit_price to floats

In [None]:
def transform_unit_price_to_floats(df):
    """
    Transform the orderlines.unit_price price column to floats.
    Some of the values have two decimal points. 
    For these values we will remove the leftmost decimal and transform all values to floats.
    The correct position of the decimal point will be determined by merging orderlines, 
    products, orders and brands, and comparing the price values.
    
    Args:
        df (pd.DataFrame): The orderlines data
    
    Returns:
        pd.DataFrame: The orderlines data with the unit_price column transformed from str to float values.
    """
    return (
        df.assign(unit_price = df.unit_price.str.split('.')
                  .apply(lambda x : x[0]+x[1]+'.'+x[2] if len(x)==3 else x[0]+'.'+ x[1])
                  .astype(float)
        )
    )
    

In [None]:
completed_sales['regular_price'].str.count(r'\.')

In [None]:
def split_str_on_dots_and_append_decimal(df, col):
    '''Remove the decimal points from the strings and append .00'''
    return df.assign(promo_price=df[col]
                     .str.split('.')
                     .str.join('')
                     .apply(lambda x: x+'.00')
                    )

def _insert_decimal_at_string_position(s, pos):
    '''Insert a decimal point at a given position in a string'''
    s = s.split('.')
    s = s[0] + s[1]
    s = s[:pos]+'.'+s[pos:]
    return s 
    
def _insert_decimal_in_regular_price(row):
    '''
    Keep moving the decimal point towards the end of the regular_price 
    string until the sale_price is lower or equal to the price.
    Then transform the regular_price string to a float and round it to two decimal places.
    '''
    decimal_position = 1
    row.regular_price = _insert_decimal_at_string_position(row.regular_price, decimal_position)
    
    while float(row.regular_price) < row.sale_price:
        if round(float(row.regular_price), 0) == round(row.sale_price, 0):
            row.sale_price = round(float(row.price), 2)
            return round(float(row.regular_price), 2)
        else:
            row.regular_price = _insert_decimal_at_string_position(row.regular_price, decimal_position)
            decimal_position += 1

    return round(float(row.price), 2)

def transform_regular_price_to_float(df):
    df.regular_price = [_insert_decimal_in_regular_price(row) for index, row in df.iterrows()]
    return df

# Solution attempt 1
Move the decimal points until reg >= promo >= sale and run the tests to see if it works

In [None]:
def start_pipeline(df):
    '''Make a copy of the pipeline to prevent corrupting the original data'''
    return df.copy()

def split_str_on_dots_and_append_decimal(df, col):
    '''Remove the decimal points from the strings and append .00'''
    return df.assign(promo_price=df[col]
                     .str.split('.')
                     .str.join('')
                     .apply(lambda x: x+'.00')
                    )

def _insert_decimal_at_string_position(s, pos):
    '''Insert a decimal point at a given position in a string'''
    s = s.split('.')
    s = s[0] + s[1]
    s = s[:pos]+'.'+s[pos:]
    return s 
    
def _insert_decimal_in_regular_price(row):
    '''
    Keep moving the decimal point towards the end of the regular_price 
    string until the sale_price is lower or equal to the price.
    Then transform the regular_price string to a float and round it to two decimal places.
    '''
    decimal_position = 1
    row.regular_price = _insert_decimal_at_string_position(row.regular_price, decimal_position)
    
    while float(row.regular_price) < row.sale_price:
        if round(float(row.regular_price), 0) == round(row.sale_price, 0):
            row.sale_price = round(float(row.price), 2)
            return round(float(row.regular_price), 2)
        else:
            row.regular_price = _insert_decimal_at_string_position(row.regular_price, decimal_position)
            decimal_position += 1

    return round(float(row.price), 2)

def transform_regular_price_to_float(df):
    df.regular_price = [_insert_decimal_in_regular_price(row) for index, row in df.iterrows()]
    return df

def _insert_decimal_in_promo_price(row, decimal_position = -2):
    '''
    If the euro value of the regular_price is equal to the euro value of the promo_price, 
    set the promo_price equal to regular_price and return it.
    This is because some promo_prices are slightly larger than their equivalent prices, e.g. 12.95 - 12.99
    
    Otherwise, keep moving the decimal point towards the start of the string until the promo_price is lower than the price.
    Then transform the string to a float and round it to two decimal places.
    '''
    
    while float(row.promo_price) > row.regular_price:
        if round(float(row.promo_price), 0) == round(row.regular_price, 0):
            row.promo_price = row.regular_price
            return row.promo_price
        else:
            row.promo_price = _insert_decimal_at_string_position(row.promo_price, decimal_position)
            decimal_position += -1
    
    return round(float(row.promo_price), 2)

def transform_promo_price_to_floats(df):
    df.promo_price = [_insert_decimal_in_promo_price(row) for index, row in df.iterrows()]
    return df
    
def calculate_products_discounts(df):
    return df.assign(
        discount=round(df.price - df.promo_price, 2),
        discount_pc = round((df.price - df.promo_price)/df.price * 100, 2)
    )

def calculate_sales_discounts(df):
    return (df
            .assign(
                sales_discount=round(df.price - df.sale_price, 2),
                sales_discount_pc = round((df.price - df.sale_price)/df.price * 100, 2)
            )
           )

temp = completed_sales.copy()

temp = (temp
        .pipe(start_pipeline)
        .pipe(split_str_on_dots_and_append_decimal, 'regular_price')
        #.pipe(split_and_join_regular_prices)
        .pipe(transform_regular_price_to_float)
        .pipe(split_str_on_dots_and_append_decimal, 'promo_price')
        #.pipe(split_and_join_promo_prices)
        .pipe(transform_promo_price_to_floats)
        .pipe(calculate_products_discounts)
        .pipe(calculate_sales_discounts)
)

temp