In [1]:
import re
import numpy as np
import pandas as pd
import sys
import os

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

import data_utils
import price_debugging_tests as pdt

# Import and merge data

In [2]:
# Every loader below is pointed at the same data directory
DATA_PATH = "../../data/"

# Load the four source tables through their data_utils cleaners
orders = data_utils.clean_orders(data_path=DATA_PATH)
orderlines = data_utils.clean_orderlines(data_path=DATA_PATH)
products = data_utils.clean_products(data_path=DATA_PATH)
brands = data_utils.clean_brands(data_path=DATA_PATH)

# Combine the four tables into the frame used by the rest of the notebook
completed_sales = data_utils.merge_data(orders, orderlines, products, brands)

5 missing values were removed from orders.
This represents 0.0022% of the data.


0 missing values were removed from orderlines.
This represents 0.00% of the data.


8792 missing values were removed from products
This represents 45.49% of the data.


0 missing values were removed from brands.
This represents 0.00% of the data.




# Price decimal point exploration

## How corrupted is the data?
### Count the number of decimal points in each price value to identify corrupted values.

In [30]:
# Work on a copy so completed_sales itself gains no helper columns
price_data = completed_sales.copy()

# Count the literal '.' characters in every price string
dot_pattern = r'\.'
price_data['regular_price_decimal_count'] = price_data['regular_price'].str.count(dot_pattern)
price_data['sale_price_decimal_count'] = price_data['sale_price'].str.count(dot_pattern)

# Tabulate how often each decimal-point count occurs in each price column
decimal_count_cols = ['regular_price_decimal_count', 'sale_price_decimal_count']
price_data[decimal_count_cols].apply(pd.Series.value_counts)

Unnamed: 0,regular_price_decimal_count,sale_price_decimal_count
0,18611,
1,42264,58120.0
2,797,3552.0


### Check the distribution of decimal points across the completed_sales price data 

In [31]:
# Count the rows for every (regular_price, sale_price) decimal-count combination
decimal_count_cols = ['regular_price_decimal_count', 'sale_price_decimal_count']
grouped_counts = (
    price_data
    .groupby(decimal_count_cols)
    .size()
    .reset_index(name='count')
)

# Express each combination as a share of all rows in price_data
total_rows = len(price_data)
grouped_counts['percentage'] = (grouped_counts['count'] / total_rows) * 100

# Most common combinations first
grouped_counts.sort_values('percentage', ascending=False)

Unnamed: 0,regular_price_decimal_count,sale_price_decimal_count,count,percentage
2,1,1,41135,66.699637
0,0,1,16412,26.611752
1,0,2,2199,3.565638
3,1,2,1129,1.830652
4,2,1,573,0.929109
5,2,2,224,0.363212


<div class="alert alert-box alert-info">
    Hopefully the prices with 1 or 0 decimal points will not be corrupted. This would give us 93% of the data.
</div>

## Exclude orders with multiple orderlines and multiple products
<div class="alert alert-box alert-info">
    The data is extremely corrupted. It is made clear in the cells below that the position of the decimal points cannot be trusted and that the total_paid, regular_price and sale_price all have corrupted values.
    <br><br>
    Therefore, for simplicity's sake, we will first exclude orders with multiple orderlines or with product_quantity > 1, so that we can compare the prices directly with total_paid.
    <br><br>
    Hopefully we will be able to determine patterns which can later be applied to all the data. 
</div>

In [7]:
# Number of distinct order_id values in the merged sales data
total_orders = completed_sales.order_id.nunique()
print(f"There are {total_orders} orders in total.")

There are 46361 orders in total.


In [32]:
# Count the orderline_id entries recorded for each order_id.
# The rename is chained rather than done with inplace=True, so every name is
# bound exactly once to a fully built frame.
completed_sales_orderline_counts = (
    completed_sales
    .groupby('order_id')['orderline_id']
    .count()
    .reset_index()
    .rename(columns={'orderline_id': 'orderline_count'})
)

# Distribution of orderlines per order: for each orderline_count value,
# count how many orders have exactly that many orderlines.
orderline_distribution = (
    completed_sales_orderline_counts
    .groupby('orderline_count')['order_id']
    .count()
    .reset_index()
    .rename(columns={'order_id': 'order_count'})
)

# Express each orderline_count bucket as a share of all orders
total_orders = completed_sales_orderline_counts['order_id'].nunique()
orderline_distribution['percentage'] = (orderline_distribution['order_count'] / total_orders) * 100

orderline_distribution

Unnamed: 0,orderline_count,order_count,percentage
0,1,36055,77.770108
1,2,7100,15.314596
2,3,2107,4.544768
3,4,688,1.484006
4,5,261,0.562973
5,6,82,0.176873
6,7,30,0.06471
7,8,18,0.038826
8,9,12,0.025884
9,10,5,0.010785


<div class="alert alert-box alert-success">
    Good. We can ignore orders containing multiple orderlines (i.e. multiple types of products) and still retain 78% of the orders.
</div>

In [43]:
# Keep only the orders that consist of exactly one orderline
orders_with_one_orderline = completed_sales_orderline_counts[
    completed_sales_orderline_counts.orderline_count == 1
]
single_orderline_orders = completed_sales[
    completed_sales.order_id.isin(orders_with_one_orderline.order_id)
]

# Of those, keep only rows with product_quantity == 1 so the prices can be
# compared directly with total_paid. copy() yields an independent frame that
# columns can be added to without touching completed_sales.
single_orderline_single_product = single_orderline_orders[
    single_orderline_orders.product_quantity == 1
].copy()

# Remaining rows as a percentage of total_orders (set in an earlier cell).
# Computed once and reused in the message; a bare expression in the middle of
# a cell is never displayed, so nothing else is evaluated here.
retained_pct = single_orderline_single_product.shape[0] / total_orders * 100
print(f"We still have {np.round(retained_pct)}% of the completed_sales orders")

We still have 72.0% of the completed_sales orders


<div class="alert alert-box alert-success">
    We can also remove orders with multiple identical products and retain 72% of the data
</div>

### Count decimal points in price values - single_orderline_single_product

In [45]:
# Count the literal '.' characters in every price string of the filtered frame
dot_pattern = r'\.'
single_orderline_single_product['regular_price_decimal_count'] = (
    single_orderline_single_product['regular_price'].str.count(dot_pattern)
)
single_orderline_single_product['sale_price_decimal_count'] = (
    single_orderline_single_product['sale_price'].str.count(dot_pattern)
)

# Tabulate how often each decimal-point count occurs in each price column
decimal_count_cols = ['regular_price_decimal_count', 'sale_price_decimal_count']
single_orderline_single_product[decimal_count_cols].apply(pd.Series.value_counts)

Unnamed: 0,regular_price_decimal_count,sale_price_decimal_count
0,10233,
1,22760,31050.0
2,467,2410.0


### Check the distribution of decimal points across the single_orderline_single_product price data

In [46]:
# Count the rows for every (regular_price, sale_price) decimal-count combination
decimal_count_cols = ['regular_price_decimal_count', 'sale_price_decimal_count']
grouped_counts = (
    single_orderline_single_product
    .groupby(decimal_count_cols)
    .size()
    .reset_index(name='count')
)

# Express each combination as a share of all rows in the filtered frame
total_rows = len(single_orderline_single_product)
grouped_counts['percentage'] = (grouped_counts['count'] / total_rows) * 100

# Most common combinations first
grouped_counts.sort_values('percentage', ascending=False)

Unnamed: 0,regular_price_decimal_count,sale_price_decimal_count,count,percentage
2,1,1,21971,65.663479
0,0,1,8767,26.201435
1,0,2,1466,4.381351
3,1,2,789,2.358039
4,2,1,312,0.932457
5,2,2,155,0.46324


<div class="alert alert-box alert-success">
    The distribution of decimal points is similar to the completed_sales data. 92% of the data has 0 or 1 decimal points.
</div>

## Exclude orders containing prices with more than one decimal point.

In [47]:
# Rows where either price string holds more than one decimal point
has_multiple_decimals = (
    (single_orderline_single_product.regular_price_decimal_count > 1)
    | (single_orderline_single_product.sale_price_decimal_count > 1)
)
orders_with_multiple_decimals = single_orderline_single_product[has_multiple_decimals]

# Drop every row whose order_id appears among the flagged rows
in_flagged_order = single_orderline_single_product.order_id.isin(orders_with_multiple_decimals.order_id)
single_orderline_single_product_single_decimal = single_orderline_single_product[~in_flagged_order].copy()

# Remaining rows as a percentage of total_orders
remaining_rows = single_orderline_single_product_single_decimal.shape[0]
print(f"We still have {np.round(remaining_rows/total_orders*100)}% of the completed_sales orders")

We still have 66.0% of the completed_sales orders


<div class="alert alert-box alert-success">
    66% of the data remains.
</div>

# Price data exploration

## Transform regular_price and sale_price to floats and compare the values

In [50]:
def _calculate_percentage_difference(df, reference_col, comparison_col):
    """Return how far comparison_col lies below reference_col, in percent.

    Computed row by row as (reference - comparison) / reference * 100 and
    rounded to two decimal places.
    """
    reference = df[reference_col]
    gap = reference - df[comparison_col]
    return (gap / reference * 100).round(2)


def add_discount_percentage_col(df):
    """Add a 'discount_percentage' column to df and return the same frame.

    The column holds the percentage difference between 'total_paid' (the
    reference) and 'sale_price'. df is modified in place.
    """
    discount = _calculate_percentage_difference(
        df, reference_col='total_paid', comparison_col='sale_price'
    )
    df['discount_percentage'] = discount
    return df


# Shorter alias for the filtered frame. This is the same object, not a copy,
# so the in-place edits below also apply to
# single_orderline_single_product_single_decimal.
base_data = single_orderline_single_product_single_decimal

# Convert the price columns to floats
price_cols = ['regular_price', 'sale_price']
base_data[price_cols] = base_data[price_cols].astype('float')

# Drop the helper decimal-count columns to clean up the output.
# errors='ignore' keeps the cell re-runnable: on a second run the columns are
# already gone and a strict drop would raise KeyError.
base_data.drop(
    columns=['regular_price_decimal_count', 'sale_price_decimal_count'],
    errors='ignore',
    inplace=True,
)

# Add a discount percentage column to detect incorrect values and outliers
base_data = add_discount_percentage_col(base_data)

base_data

Unnamed: 0,order_id,orderline_id,date,name,desc,brand,sku,category,total_paid,product_quantity,regular_price,sale_price,discount_percentage
0,241423,1398738,2017-11-06 12:47:20,LaCie Porsche Design Desktop Drive 4TB USB 3.0...,External Hard Drive 4TB 35-inch USB 3.0 for Ma...,LaCie,LAC0212,Memory,136.15,1,139.99,129.16,5.13
1,242832,1529178,2017-12-31 17:26:40,Parrot 550mAh battery for MiniDrones,550mAh rechargeable battery for Parrot minidrones,Parrot,PAR0074,Accessories,15.76,1,17.99,10.77,31.66
2,243330,1181923,2017-02-15 17:07:44,Mac OWC Memory 8GB 1066MHZ DDR3 SO-DIMM,8GB RAM Mac mini iMac MacBook and MacBook Pro ...,OWC,OWC0074,unknown,84.98,1,99.99,77.99,8.23
3,245275,1276706,2017-06-28 11:12:30,Tado Smart Climate Control Intelligent AC,intelligent control air conditioning works wit...,Tado,TAD0007,Accessories,149.00,1,179.00,149.00,0.00
13,247524,1547886,2018-01-08 21:21:14,Crucial memory Mac 16GB (2x8GB) SO-DIMM DDR3 1...,RAM 16GB (2x8GB) Mac mini (2011) iMac (2010/11...,Crucial,CRU0026-2,unknown,167.98,1,163.98,162.99,2.97
...,...,...,...,...,...,...,...,...,...,...,...,...,...
61657,527024,1649397,2018-03-14 11:36:24,EarPods Apple Headphones with Remote and Mic (...,EarPods headphones Apple iPhone iPad and iPod ...,Apple,APP0927,iPhone,17.98,1,35.00,13.99,22.19
61658,527027,1649417,2018-03-14 11:38:31,Apple Lightning Cable Connector to USB 1m Whit...,Apple Lightning USB Cable 1 meter to charge an...,Apple,APP0698,Accessories,14.98,1,25.00,9.99,33.31
61659,527033,1649429,2018-03-14 11:40:54,EarPods Apple Headphones with Remote and Mic (...,EarPods headphones Apple iPhone iPad and iPod ...,Apple,APP0927,iPhone,18.98,1,35.00,13.99,26.29
61667,527042,1649446,2018-03-14 11:42:38,EarPods Apple Headphones with Remote and Mic (...,EarPods headphones Apple iPhone iPad and iPod ...,Apple,APP0927,iPhone,18.98,1,35.00,13.99,26.29


In [None]:
# Working notes and open questions from the exploration above, kept as a bare
# string literal (nothing is assigned to a name).
# NOTE(review): these are unverified hypotheses, not findings — consider
# moving them into a markdown cell.
"""
THE TOTAL PAID - THE SALE PRICE IS ALWAYS JUST GOING TO BE THE POSTAGE PRICE

IF THE REGULAR PRICE AND THE PROMO PRICE ARE JUST A SNAPSHOT OF THE CURRENT PRICE THEN THEY WILL NEVER ACTUALLY REFLECT THE DISCOUNTS

SHOULD I CHECK THE CHANGE IN PRICES OVER TIME

"""