In [18]:
import re
import numpy as np
import pandas as pd
import sys
import os

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

import data_utils
import price_debugging_tests as pdt

# Import and merge data

In [19]:
# Import the data
orders = data_utils.clean_orders(data_path="../../data/")
orderlines = data_utils.clean_orderlines(data_path="../../data/")
products = data_utils.clean_products(data_path="../../data/")
brands = data_utils.clean_brands(data_path="../../data/")

# Merge the data
completed_sales =  data_utils.merge_data(orders, orderlines, products, brands)

5 missing values were removed from orders.
This represents 0.0022% of the data.


0 missing values were removed from orderlines.
This represents 0.00% of the data.


8792 missing values were removed from products
This represents 45.49% of the data.


0 missing values were removed from brands.
This represents 0.00% of the data.




# Generate test data

In [20]:
failing_test_data, passing_test_data = pdt.generate_test_data(completed_sales)

print(failing_test_data.shape, passing_test_data.shape)

(31280, 13) (4116, 13)


### Find the orders where the total_price != sum(product_quantity*sale_price)

In [21]:
incorrect_total_price = pdt.test_order_total_paid_equal_sum_of_orderlines(passing_test_data)

There are 3507 orders where total_paid does not match the sum of product_quantity * sale_price.
This respresents 85.20% of the data.



# Explore the total_paid values
Let's first multiple regular_price, promo_price and sale_price values by the product quantity then group them by order and determine if perhaps the correct value is in the wrong column.

In [22]:
total_paid_test_data = passing_test_data[passing_test_data.order_id.isin(incorrect_total_price.index)].copy()
total_paid_test_data.head()

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,promo_price,sale_price
4,245595,1154394,2017-01-21 12:49:00,"Macally External Hard Drive 1TB 35 ""USB 3.0 SA...",Aluminum External Hard Drive 1TB capacity form...,Pack,PAC1561,Memory,112.97,2,103.95,59.584,52.99
59,261656,1321933,2017-08-21 19:01:56,Support Satechi Slim MacBook or iMac Aluminum ...,Stand with aluminum finish and practical desig...,Satechi,SAT0034,Accessories,44.98,1,49.99,44.99,39.99
68,263738,1319085,2017-08-16 17:01:54,Elago Airpod Charging stand Charging Stand Bla...,Silicone holder for positioning and loading th...,Elago,ELA0027,Accessories,80.96,1,19.95,13.99,13.99
70,264244,1587509,2018-01-29 15:18:31,"Battery iFixit MacBook Pro 13 ""(Mid 2009 to Mi...",Internal Battery for MacBook Pro 13-inch (Mid ...,iFixit,IFX0181,Accessories,141.97,1,89.95,79.99,69.99
81,267194,1505973,2017-12-19 21:19:08,"Battery iFixit MacBook Pro 13 ""(Mid 2009 to Mi...",Internal Battery for MacBook Pro 13-inch (Mid ...,iFixit,IFX0181,Accessories,75.98,1,89.95,79.99,72.99


In [8]:
total_paid_test_data.regular_price = total_paid_test_data.regular_price * total_paid_test_data.product_quantity
total_paid_test_data.promo_price = total_paid_test_data.promo_price * total_paid_test_data.product_quantity
total_paid_test_data.sale_price = total_paid_test_data.sale_price * total_paid_test_data.product_quantity
grouped_data = total_paid_test_data.groupby('order_id')
group_sizes = grouped_data.size()
sorted_groups = group_sizes.sort_values(ascending=False)

sorted_groups

order_id
444722    3
445458    3
318935    3
449404    2
454854    2
         ..
414075    1
414094    1
414157    1
414219    1
527017    1
Length: 3507, dtype: int64

In [9]:
grouped_data.get_group(245595)#[['total_paid', 'product_quantity', 'regular_price', 'promo_price', 'sale_price']]

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,promo_price,sale_price
4,245595,1154394,2017-01-21 12:49:00,"Macally External Hard Drive 1TB 35 ""USB 3.0 SA...",Aluminum External Hard Drive 1TB capacity form...,Pack,PAC1561,Memory,112.97,2,207.9,119.168,105.98


In [10]:
total_paid_test_data[total_paid_test_data.sku=='PAC1561']

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,promo_price,sale_price
4,245595,1154394,2017-01-21 12:49:00,"Macally External Hard Drive 1TB 35 ""USB 3.0 SA...",Aluminum External Hard Drive 1TB capacity form...,Pack,PAC1561,Memory,112.97,2,207.9,119.168,105.98
1572,304087,1133263,2017-01-07 09:52:01,"Macally External Hard Drive 1TB 35 ""USB 3.0 SA...",Aluminum External Hard Drive 1TB capacity form...,Pack,PAC1561,Memory,52.98,1,103.95,59.584,52.99
1695,304508,1134340,2017-01-07 21:53:10,"Macally External Hard Drive 1TB 35 ""USB 3.0 SA...",Aluminum External Hard Drive 1TB capacity form...,Pack,PAC1561,Memory,56.98,1,103.95,59.584,52.99
1761,304710,1134849,2017-01-08 12:07:16,"Macally External Hard Drive 1TB 35 ""USB 3.0 SA...",Aluminum External Hard Drive 1TB capacity form...,Pack,PAC1561,Memory,59.98,1,103.95,59.584,52.99
2457,306758,1139777,2017-01-10 19:50:19,"Macally External Hard Drive 1TB 35 ""USB 3.0 SA...",Aluminum External Hard Drive 1TB capacity form...,Pack,PAC1561,Memory,56.98,1,103.95,59.584,52.99
4678,312748,1154120,2017-01-20 23:44:05,"Macally External Hard Drive 1TB 35 ""USB 3.0 SA...",Aluminum External Hard Drive 1TB capacity form...,Pack,PAC1561,Memory,54.98,1,103.95,59.584,52.99
5413,314606,1158703,2017-01-25 10:10:43,"Macally External Hard Drive 1TB 35 ""USB 3.0 SA...",Aluminum External Hard Drive 1TB capacity form...,Pack,PAC1561,Memory,57.98,1,103.95,59.584,52.99
6595,317709,1166133,2017-01-30 18:52:20,"Macally External Hard Drive 1TB 35 ""USB 3.0 SA...",Aluminum External Hard Drive 1TB capacity form...,Pack,PAC1561,Memory,127.97,1,103.95,59.584,52.99
7600,320593,1172676,2017-02-06 17:43:09,"Macally External Hard Drive 1TB 35 ""USB 3.0 SA...",Aluminum External Hard Drive 1TB capacity form...,Pack,PAC1561,Memory,112.97,1,103.95,59.584,52.99
7934,321652,1175005,2017-02-09 09:13:05,"Macally External Hard Drive 1TB 35 ""USB 3.0 SA...",Aluminum External Hard Drive 1TB capacity form...,Pack,PAC1561,Memory,338.97,1,103.95,59.584,52.99


In [11]:
completed_sales[completed_sales.order_id==302857]

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,promo_price,sale_price
1257,302857,1130622,2017-01-05 00:57:59,"Macally External Hard Drive 1TB 35 ""USB 3.0 SA...",Aluminum External Hard Drive 1TB capacity form...,Pack,PAC1561,Memory,66.98,1,103.95,59.584,62.99


In [12]:
completed_sales[completed_sales.sku=='PAC1561']

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,promo_price,sale_price
4,245595,1154394,2017-01-21 12:49:00,"Macally External Hard Drive 1TB 35 ""USB 3.0 SA...",Aluminum External Hard Drive 1TB capacity form...,Pack,PAC1561,Memory,112.97,2,103.95,59.584,52.99
1077,302106,1127954,2017-01-03 22:50:20,"Macally External Hard Drive 1TB 35 ""USB 3.0 SA...",Aluminum External Hard Drive 1TB capacity form...,Pack,PAC1561,Memory,62.99,1,103.95,59.584,62.99
1257,302857,1130622,2017-01-05 00:57:59,"Macally External Hard Drive 1TB 35 ""USB 3.0 SA...",Aluminum External Hard Drive 1TB capacity form...,Pack,PAC1561,Memory,66.98,1,103.95,59.584,62.99
1572,304087,1133263,2017-01-07 09:52:01,"Macally External Hard Drive 1TB 35 ""USB 3.0 SA...",Aluminum External Hard Drive 1TB capacity form...,Pack,PAC1561,Memory,52.98,1,103.95,59.584,52.99
1695,304508,1134340,2017-01-07 21:53:10,"Macally External Hard Drive 1TB 35 ""USB 3.0 SA...",Aluminum External Hard Drive 1TB capacity form...,Pack,PAC1561,Memory,56.98,1,103.95,59.584,52.99
1761,304710,1134849,2017-01-08 12:07:16,"Macally External Hard Drive 1TB 35 ""USB 3.0 SA...",Aluminum External Hard Drive 1TB capacity form...,Pack,PAC1561,Memory,59.98,1,103.95,59.584,52.99
2457,306758,1139777,2017-01-10 19:50:19,"Macally External Hard Drive 1TB 35 ""USB 3.0 SA...",Aluminum External Hard Drive 1TB capacity form...,Pack,PAC1561,Memory,56.98,1,103.95,59.584,52.99
4678,312748,1154120,2017-01-20 23:44:05,"Macally External Hard Drive 1TB 35 ""USB 3.0 SA...",Aluminum External Hard Drive 1TB capacity form...,Pack,PAC1561,Memory,54.98,1,103.95,59.584,52.99
5413,314606,1158703,2017-01-25 10:10:43,"Macally External Hard Drive 1TB 35 ""USB 3.0 SA...",Aluminum External Hard Drive 1TB capacity form...,Pack,PAC1561,Memory,57.98,1,103.95,59.584,52.99
6595,317709,1166133,2017-01-30 18:52:20,"Macally External Hard Drive 1TB 35 ""USB 3.0 SA...",Aluminum External Hard Drive 1TB capacity form...,Pack,PAC1561,Memory,127.97,1,103.95,59.584,52.99


In [13]:
grouped_data.get_group(318935)[['regular_price', 'promo_price', 'sale_price']].sum(axis=0)

regular_price    159.970
promo_price      122.972
sale_price       116.970
dtype: float64

In [17]:
import itertools

def check_price_combinations(df):
    total_paid = df['total_paid'].iloc[0]  # total_paid is the same for all rows in the group

    # Create a list of lists with prices and quantities
    price_quantity_pairs = [
        [(row['regular_price'], row['product_quantity']), 
         (row['promo_price'], row['product_quantity']), 
         (row['sale_price'], row['product_quantity'])] 
        for idx, row in df.iterrows()
    ]
    
    # Check combinations with product_quantity = 1
    for combination in itertools.product(*price_quantity_pairs):
        total = sum(price * qty for price, qty in combination)
        if total_paid - 2 <= total <= total_paid + 2:
            print(f"Match found 1: {combination, total}")
    print('\n')
    
    # Check combinations with product_quantity varying (1 or 2)
    # This creates combinations where product_quantity can be 1 or 2 for each row
    for qty_combination in itertools.product([1, 2], repeat=len(df)):
        # Apply the varying quantities to each row's price choices
        varying_price_quantity_pairs = [
            [(row.regular_price, qty_combination[i]), 
             (row.promo_price, qty_combination[i]), 
             (row.sale_price, qty_combination[i])]
            for i, row in enumerate(df.itertuples())
        ]
        
        # Now test each price combination with the varying quantities
        for combination in itertools.product(*varying_price_quantity_pairs):
            total = sum(price * qty for price, qty in combination)
            if total_paid - 2 <= total <= total_paid + 2:
                print(f"Match found: {combination, total}")

# Example usage
df = grouped_data.get_group(318935)[['total_paid', 'product_quantity', 'regular_price', 'promo_price', 'sale_price']]
check_price_combinations(df)



Match found: (((79.99, 1), (49.99, 2), (16.992, 1)), 196.962)
Match found: (((79.99, 1), (49.99, 2), (16.99, 1)), 196.96)
Match found: (((69.99, 1), (49.99, 2), (29.99, 1)), 199.96)
Match found: (((69.99, 1), (49.99, 2), (29.99, 1)), 199.96)
Match found: (((79.99, 1), (29.99, 2), (29.99, 2)), 199.95)
Match found: (((69.99, 2), (29.99, 1), (29.99, 1)), 199.95999999999998)
Match found: (((69.99, 2), (29.99, 1), (29.99, 1)), 199.95999999999998)
