In [1]:
import re
import numpy as np
import pandas as pd
import sys
import os

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

import data_utils
import price_debugging_tests as pdt

# Import and merge data

In [3]:
# Import the data
# Every cleaned table is read from the same directory, so the path is defined
# once here instead of being repeated in each call.
DATA_PATH = "../../data/"

orders = data_utils.clean_orders(data_path=DATA_PATH)
orderlines = data_utils.clean_orderlines(data_path=DATA_PATH)
products = data_utils.clean_products(data_path=DATA_PATH)
brands = data_utils.clean_brands(data_path=DATA_PATH)

# Merge the data
completed_sales = data_utils.merge_data(orders, orderlines, products, brands)

5 missing values were removed from orders.
This represents 0.0022% of the data.


0 missing values were removed from orderlines.
This represents 0.00% of the data.


8792 missing values were removed from products
This represents 45.49% of the data.


0 missing values were removed from brands.
This represents 0.00% of the data.




## Create test data
Extract the rows whose price strings contain exactly one decimal point so they can be converted to floats, then split them into passing test data and failing test data to check that the tests work correctly.

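Passing test data: regular_price >= promo_price >= sale_price (the code below only checks regular_price >= sale_price, plus sale_price <= total_paid in the second version of `generate_test_data`; promo_price is not used)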

Failing test data: all of the extracted rows, without any further filtering

In [9]:
def count_decimal_points(df):
    """
    Count the '.' characters in each price string.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns orderline_id, total_paid, regular_price and
        sale_price. The two price columns are accessed through ``.str``, so
        they are expected to hold strings.

    Returns
    -------
    pd.DataFrame
        A new frame with the four columns above plus
        regular_price_decimal_count and sale_price_decimal_count.
        The input frame is not modified.
    """
    kept_columns = ['orderline_id', 'total_paid', 'regular_price', 'sale_price']
    dot_pattern = r'\.'

    # Selecting with a list of columns and then calling assign() yields a new
    # frame, so the caller's data is left untouched.
    return df[kept_columns].assign(
        regular_price_decimal_count=lambda frame: frame['regular_price'].str.count(dot_pattern),
        sale_price_decimal_count=lambda frame: frame['sale_price'].str.count(dot_pattern),
    )


def generate_test_data(sales_df):
    """
    Build the failing and passing datasets used to exercise the price tests.

    Only rows whose regular_price and sale_price strings each contain exactly
    one '.' are kept, so both columns can be converted to float.

    Parameters
    ----------
    sales_df : pd.DataFrame
        Sales data with string-valued regular_price and sale_price columns
        and an orderline_id column.

    Returns
    -------
    tuple of (pd.DataFrame, pd.DataFrame)
        ``(failing_test_data, passing_test_data)``: every convertible row,
        and the subset of those rows where regular_price >= sale_price.
    """
    dot_counts = count_decimal_points(sales_df)

    # Rows where both price strings have exactly one decimal point.
    single_dot_rows = (
        (dot_counts.regular_price_decimal_count == 1)
        & (dot_counts.sale_price_decimal_count == 1)
    )
    convertible_ids = dot_counts[single_dot_rows].orderline_id
    test_data = sales_df[sales_df.orderline_id.isin(convertible_ids)].copy()

    # Convert both price columns from str to float.
    for price_col in ('regular_price', 'sale_price'):
        test_data[price_col] = test_data[price_col].astype(float)

    # The unfiltered data still contains corrupted prices and should fail the tests.
    failing_test_data = test_data

    # Rows with regular_price >= sale_price should (hopefully) be uncorrupted.
    passing_test_data = test_data[test_data.regular_price >= test_data.sale_price]

    return failing_test_data, passing_test_data

# Build both datasets from the merged sales data and compare their sizes.
failing_test_data, passing_test_data = generate_test_data(completed_sales)

print(failing_test_data.shape, passing_test_data.shape)

(41135, 12) (39858, 12)


In [None]:
# TODO: unresolved design note, kept as a bare string in an unexecuted cell.
# It concerns the total_paid condition used by the generate_test_data
# redefinition in the following cell.
''' INCLUDING TOTAL PRICE HERE
NEED TO DECIDE IF I SHOULD MERGE ORDERLINES IN TEST DATA TO INCLUDE ORDERS WITH MULTIPLE PRODUCTS'''

In [12]:
def generate_test_data(sales_df):
    """
    Build the failing and passing datasets used to exercise the price tests.

    Only rows whose regular_price and sale_price strings each contain exactly
    one '.' are kept, so both columns can be converted to float.

    Parameters
    ----------
    sales_df : pd.DataFrame
        Sales data with string-valued regular_price and sale_price columns,
        plus orderline_id and total_paid columns.

    Returns
    -------
    tuple of (pd.DataFrame, pd.DataFrame)
        ``(failing_test_data, passing_test_data)``.
        failing_test_data holds every convertible row. passing_test_data is an
        independent copy of the rows where regular_price >= sale_price and
        sale_price <= total_paid.
    """
    # NOTE(review): this redefines generate_test_data from an earlier cell.
    # The only functional difference there is the extra
    # sale_price <= total_paid condition on the passing subset.
    decimal_points_per_price = count_decimal_points(sales_df)

    # Extract the rows with exactly one decimal point in both price strings so
    # the str values can be converted to floats and the test logic checked.
    has_single_decimal_point = (
        (decimal_points_per_price.regular_price_decimal_count == 1)
        & (decimal_points_per_price.sale_price_decimal_count == 1)
    )
    test_data_ids = decimal_points_per_price.loc[has_single_decimal_point, 'orderline_id']
    test_data = sales_df[sales_df.orderline_id.isin(test_data_ids)].copy()

    # Convert the price values to float. Bracket assignment is used rather than
    # attribute assignment, which silently sets an attribute instead of a
    # column if the name is ever misspelled.
    test_data['regular_price'] = test_data['regular_price'].astype(float)
    test_data['sale_price'] = test_data['sale_price'].astype(float)

    # The unfiltered data still contains corrupted prices and should fail the tests.
    failing_test_data = test_data

    # Rows with regular_price >= sale_price AND sale_price <= total_paid should
    # (hopefully) be uncorrupted.
    is_plausible = (
        (test_data['regular_price'] >= test_data['sale_price'])
        & (test_data['sale_price'] <= test_data['total_paid'])
    )
    # .copy() makes the passing subset independent, so later column assignments
    # on it do not raise SettingWithCopyWarning.
    passing_test_data = test_data[is_plausible].copy()

    return failing_test_data, passing_test_data

# Rebuild both datasets with the redefined generate_test_data; this overwrites
# the variables assigned by the earlier cell.
failing_test_data, passing_test_data = generate_test_data(completed_sales)

print(failing_test_data.shape, passing_test_data.shape)

(41135, 12) (39703, 12)


## Tests
We will utilise tests that return data to help debug the logic for cleaning the price values.

### Test prices greater or equal

In [13]:
def test_col_vals_are_greater_or_equal_to_other(df, greater_col, lesser_col):
    num_incorrect_vals = df[df[greater_col] < df[lesser_col]].shape[0]
    if num_incorrect_vals == 0:
        print(f"All of the {greater_col} values are greater or equal to the corresponding {lesser_col} values.\n")
        return pd.DataFrame()
    else:
        print(f"There are corrupted values in {greater_col} which are less than their corresponding {lesser_col} values.")
        print(f"This respresents {num_incorrect_vals/df.shape[0]*100:.2f}% of the data.\n")
        corrupted_price_orderline_ids = df[df[greater_col] < df[lesser_col]].orderline_id
        return corrupted_price_orderline_ids

def test_regular_greater_or_equal_to_sale(df):
    """
    Check that regular_price >= sale_price on every row of ``df``.

    Thin wrapper around test_col_vals_are_greater_or_equal_to_other with the
    two column names fixed; its return value is passed through unchanged.
    """
    return test_col_vals_are_greater_or_equal_to_other(
        df,
        greater_col='regular_price',
        lesser_col='sale_price',
    )

In [14]:
# Run the check on the unfiltered test data, which is expected to contain violations.
regular_less_than_sale_orderline_ids = test_regular_greater_or_equal_to_sale(failing_test_data)

There are corrupted values in regular_price which are less than their corresponding sale_price values.
This respresents 3.10% of the data.



In [15]:
# Run the same check on the filtered test data, where no violations are expected.
# This overwrites the ids collected in the previous cell.
regular_less_than_sale_orderline_ids = test_regular_greater_or_equal_to_sale(passing_test_data)

All of the regular_price values are greater or equal to the corresponding sale_price values.



### Test total_paid

In [16]:
def test_order_id_has_single_total_paid_value(df):
    """
    A single order_id can have multiple lines corresponding to orderlines.
    Each of these lines should have the same total_paid value
    """
    
    # Group by order_id and check the number of unique total_paid values for each group
    inconsistent_orders = df.groupby('order_id')['total_paid'].nunique()
    
    # Filter for orders where there is more than 1 unique total_paid value
    inconsistent_orders = inconsistent_orders[inconsistent_orders > 1]
    
    # Check if there are any inconsistencies
    if not inconsistent_orders.empty:
        print("Inconsistent 'total_paid' values found for the following order_ids:")
        print(inconsistent_orders)
    else:
        print("All 'total_paid' values are consistent for each 'order_id'.")

# Run the consistency check on the full merged dataset.
test_order_id_has_single_total_paid_value(completed_sales)

All 'total_paid' values are consistent for each 'order_id'.


In [17]:
def test_order_total_paid_equal_sum_of_orderlines(df):
    """
    Group by order_id and calculate the sum of sale_price*product_quantity for all orderline_ids
    """
    grouped_orderlines = df.groupby('order_id').apply(
        lambda x: pd.Series({
            'calculated_total': (x['product_quantity'] * x['sale_price']).sum(),
            'total_paid': x['total_paid'].iloc[0]  # Total paid is the same for all rows in the group
        }),
        include_groups=False
    )

    # Compare calculated_total with total_paid
    incorrect_orders = grouped_orderlines[grouped_orderlines['calculated_total'] != grouped_orderlines['total_paid']]
    
    if incorrect_orders.empty:
        print("All orders have total_paid values equal to the sum of product_quantity * sale_price.")
        return pd.DataFrame()
    else:
        print(f"There are {incorrect_orders.shape[0]} orders where total_paid does not match the sum of product_quantity * sale_price.")
        print(f"This respresents {incorrect_orders.shape[0]/df.shape[0]*100:.2f}% of the data.\n")
        return incorrect_orders

In [18]:
# Compare order totals with summed orderlines on the unfiltered test data.
incorrect_total_price = test_order_total_paid_equal_sum_of_orderlines(failing_test_data)

There are 26590 orders where total_paid does not match the sum of product_quantity * sale_price.
This respresents 64.64% of the data.



In [19]:
# Repeat the comparison on the filtered test data.
# This overwrites incorrect_total_price from the previous cell.
incorrect_total_price = test_order_total_paid_equal_sum_of_orderlines(passing_test_data)

There are 25806 orders where total_paid does not match the sum of product_quantity * sale_price.
This respresents 65.00% of the data.



<div class="alert alert-block alert-danger">
    Even when regular_price >= sale_price, the sale_price values (multiplied by product_quantity and summed per order) do not match the total_paid per order.
    <br><br>
    It is possible that the difference in price is due to shipping costs, which are not included in the dataset.
    <br><br>
    Even though the 'passing' test data is clearly not as clean as we had hoped, we will leave it as is for now so it can be used to debug why the prices do not match total_paid.
</div>