In [1]:
import re
import numpy as np
import pandas as pd
import sys
import os

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

import data_utils
import price_debugging_tests as pdt

## Import and merge data

In [8]:
# Import the data
# Keep the data directory in one place so it only has to be changed once.
DATA_PATH = "../../data/"

orders = data_utils.clean_orders(data_path=DATA_PATH)
orderlines = data_utils.clean_orderlines(data_path=DATA_PATH)
products = data_utils.clean_products(data_path=DATA_PATH)
brands = data_utils.clean_brands(data_path=DATA_PATH)

# Merge the data
completed_sales = data_utils.merge_data(orders, orderlines, products, brands)

5 missing values were removed from orders.
This represents 0.0022% of the data.


0 missing values were removed from orderlines.
This represents 0.00% of the data.


8792 missing values were removed from products
This represents 45.49% of the data.


0 missing values were removed from brands.
This represents 0.00% of the data.




## Explore corrupted decimal values
### Count decimal points in price values

In [4]:
# Keep only the price-related columns and record, for each string-valued price
# column, how many literal dots every value contains.
prices = (
    completed_sales
    .loc[:, ['orderline_id', 'total_paid', 'regular_price', 'promo_price', 'sale_price']]
    .copy()
    .assign(
        regular_price_decimal_count=lambda frame: frame['regular_price'].str.count(r'\.'),
        promo_price_decimal_count=lambda frame: frame['promo_price'].str.count(r'\.'),
        sale_price_decimal_count=lambda frame: frame['sale_price'].str.count(r'\.'),
    )
)

prices

Unnamed: 0,orderline_id,total_paid,regular_price,promo_price,sale_price,regular_price_decimal_count,promo_price_decimal_count,sale_price_decimal_count
0,1398738,136.15,139.99,1.149.948,129.16,1,2,1
1,1529178,15.76,17.99,109.904,10.77,1,1,1
2,1181923,84.98,99.99,999.896,77.99,1,1,1
3,1276706,149.00,179,1.489.994,149.00,0,2,1
4,1154394,112.97,103.95,59.584,52.99,1,1,1
...,...,...,...,...,...,...,...,...
61667,1649446,18.98,35,13.99,13.99,0,1,1
61668,1649512,24.97,25,99.898,9.99,0,1,1
61669,1649522,24.97,25,99.898,9.99,0,1,1
61670,1649565,34.96,25,99.898,9.99,0,1,1


In [5]:
# Tabulate, per price column, how many rows have 0, 1 or 2 dots.
(
    prices
    .loc[:, ['regular_price_decimal_count', 'promo_price_decimal_count', 'sale_price_decimal_count']]
    .apply(pd.Series.value_counts)
)

Unnamed: 0,regular_price_decimal_count,promo_price_decimal_count,sale_price_decimal_count
0,18611,877,
1,42264,40066,58120.0
2,797,20729,3552.0


<div class="alert alert-block alert-info">
    total_paid is the only price column which does not have corrupted data. 
    <br>
    It may be that we can use this column to determine the correct positions of the decimal points in the other price columns.
</div>

### Check sale_price

In [1]:
# Work on a copy and show the rows whose sale_price does not contain exactly one dot.
temp = prices.copy()
temp.loc[temp['sale_price_decimal_count'] != 1]

NameError: name 'prices' is not defined

In [2]:
# Split each sale_price on its dots and count the digits after the last dot.
temp = (
    temp.assign(
        sale_price_split=lambda x: x['sale_price'].str.split(r'\.'),
        # Computed per row: indexing the column with [1] would select the single
        # row labelled 1 and broadcast its length to every row.
        # Values without any dot have no fractional part, so they count as 0;
        # missing values are left as missing.
        decimal_places=lambda x: x['sale_price_split'].map(
            lambda parts: len(parts[-1]) if len(parts) > 1 else 0,
            na_action='ignore',
        ),
    )
)
temp.head(5)

NameError: name 'temp' is not defined

In [3]:
# Rows whose sale_price does not have exactly two digits after the dot.
temp.loc[temp['decimal_places'].ne(2)]

NameError: name 'temp' is not defined

### Transform orderlines.unit_price to floats

In [None]:
def transform_unit_price_to_floats(df):
    """
    Transform the orderlines.unit_price price column to floats.

    Some of the values contain more than one dot (e.g. '1.149.94').
    Every dot except the last one is removed, so the last dot is kept as the
    decimal separator, and the result is cast to float:

        '1.149.94' -> 1149.94
        '12.99'    -> 12.99
        '179'      -> 179.0

    Values without any dot and missing values are handled as well (missing
    values stay NaN). The input frame is not modified.

    Args:
        df (pd.DataFrame): The orderlines data, with a string `unit_price` column.

    Returns:
        pd.DataFrame: A copy of the data with the unit_price column
        transformed from str to float values.

    Raises:
        ValueError: If a value is still not parseable as a float after the
            surplus dots are removed.
    """
    # The lookahead matches a dot only when another dot follows later in the
    # string, i.e. every dot but the last one.
    cleaned = df.unit_price.str.replace(r'\.(?=.*\.)', '', regex=True)
    return df.assign(unit_price=cleaned.astype(float))
    

In [None]:
# Number of literal dots in each regular_price string.
completed_sales.loc[:, 'regular_price'].str.count(r'\.')

In [None]:
def split_str_on_dots_and_append_decimal(df, col):
    '''
    Remove every dot from the strings in `col` and append '.00'.

    The result is written back to `col` itself (not to a fixed column), so the
    function can be used for any string price column. The input frame is not
    modified; a new frame is returned.

    Args:
        df (pd.DataFrame): Frame containing the string column `col`.
        col (str): Name of the column to clean.

    Returns:
        pd.DataFrame: Copy of `df` with `col` replaced by the cleaned strings,
        e.g. '1.149.948' -> '1149948.00'.
    '''
    # regex=False: '.' must be treated as a literal dot, not "any character".
    cleaned = df[col].str.replace('.', '', regex=False) + '.00'
    return df.assign(**{col: cleaned})

def _insert_decimal_at_string_position(s, pos):
    '''Insert a decimal point at a given position in a string'''
    s = s.split('.')
    s = s[0] + s[1]
    s = s[:pos]+'.'+s[pos:]
    return s 
    
def _insert_decimal_in_regular_price(row):
    '''
    Search for a decimal-point position in row.regular_price by comparing
    the resulting number with row.sale_price.

    The decimal point is first placed after the first character of the
    regular_price string. While the resulting number is smaller than
    row.sale_price, the point is re-inserted at `decimal_position`, which is
    incremented after each re-insertion.

    Side effects: row.regular_price is reassigned on the passed-in row, and
    row.sale_price is reassigned in the early-return branch.

    Args:
        row: Object exposing `regular_price` (str), `sale_price` (used as a
            number: it is passed to round() and compared with a float) and
            `price` attributes.
            NOTE(review): no `price` column appears in the `prices` frame
            built in this notebook - confirm which column is meant.

    Returns:
        float: round(float(row.regular_price), 2) when the early-return branch
        is taken, otherwise round(float(row.price), 2).
        NOTE(review): the fall-through returns row.price, not the adjusted
        row.regular_price - confirm this is intended.
    '''
    decimal_position = 1
    # Start with the decimal point right after the first character.
    row.regular_price = _insert_decimal_at_string_position(row.regular_price, decimal_position)
    
    # NOTE(review): nothing bounds decimal_position; if no placement makes
    # regular_price reach sale_price this loop looks like it would not
    # terminate - confirm against the data.
    while float(row.regular_price) < row.sale_price:
        # Both prices agree once rounded to whole units: stop here.
        if round(float(row.regular_price), 0) == round(row.sale_price, 0):
            row.sale_price = round(float(row.price), 2)
            return round(float(row.regular_price), 2)
        else:
            # decimal_position is incremented only after the insert, so the
            # first pass through this branch re-inserts at position 1 again.
            row.regular_price = _insert_decimal_at_string_position(row.regular_price, decimal_position)
            decimal_position += 1

    return round(float(row.price), 2)

def transform_regular_price_to_float(df):
    '''
    Replace df.regular_price with the value computed for each row by
    _insert_decimal_in_regular_price.

    The frame is modified in place and also returned.
    '''
    converted = []
    for _, record in df.iterrows():
        converted.append(_insert_decimal_in_regular_price(record))
    df.regular_price = converted
    return df