## Import the data

In [2]:
import pandas as pd
import numpy as np
import re

# Allow up to 500 rows to be shown when a frame is displayed.
pd.set_option('display.max_rows', 500)

# Folder holding the raw CSV exports; the cleaned files are written back here too.
path = '../data/eniac/'

# Load the four raw tables. The timestamp columns of orderlines and orders are
# converted to datetimes straight away.
orderlines_original = pd.read_csv(path + 'orderlines.csv').assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)
orders_original = pd.read_csv(path + 'orders.csv').assign(
    created_date=lambda frame: pd.to_datetime(frame['created_date'])
)
brands_original = pd.read_csv(path + 'brands.csv')
products_original = pd.read_csv(path + 'products.csv')

## Clean orders

In [4]:
orders = orders_original.copy()

# Step 1: drop every row that has a missing value in any column
orders_before = len(orders)
orders = orders.dropna()
orders_after = len(orders)

rows_removed_from_orders = orders_before - orders_after
print(f'{rows_removed_from_orders} rows were removed from orders after removing NaN values')
print(f'This represents {round(rows_removed_from_orders/orders_before*100, 2)}% of the data')


# Step 2: keep only the orders whose order_id also appears in orderlines
orders_before = len(orders)
has_orderlines = orders['order_id'].isin(orderlines_original['id_order'])
orders = orders.loc[has_orderlines]
orders_after = len(orders)
rows_removed_from_orders = orders_before - orders_after

# Author's notes on the orders dropped in step 2 (22213 rows on the original data):
# 12304 of them have state = Place Order and 9810 have state = Shopping Basket,
# for a total of 22114.
# Overall there are 40883 orders with state = Place Order and 117809 with state = Shopping Basket.
print('\n')
print(f'{rows_removed_from_orders} rows were removed from orders after removing order_ids which do not exist in orderlines')
print(f'This represents {round(rows_removed_from_orders/orders_before*100, 2)}% of the data')

# Persist the cleaned orders next to the raw files
orders.to_csv(path + 'orders_clean.csv', index=False)

5 rows were removed from orders after removing NaN values
This represents 0.0% of the data


22213 rows were removed from orders after removing order_ids which do not exist in orderlines
This represents 9.79% of the data


## Clean products

In [5]:
products = products_original.copy()

# The type and in_stock columns are dropped; a column that is already absent is ignored
products = products.drop(['type', 'in_stock'], axis=1, errors='ignore')
# Keep a single row per sku
products = products.drop_duplicates(subset='sku')
# Names of the products that have no description
names_of_products_without_descriptions = products.loc[products['desc'].isna(), 'name'].tolist()
# Hand-written replacement descriptions.
# NOTE(review): these are paired with the names above purely by position, so the list
# must stay in the same order (and length) as the products found without a description.
missing_descriptions = [
    '2TB Mac hard drive and Nas',
    'Apple keyboard for iPad 9.7',
    'NAS server with 10GB RAM',
    'Ethernet adapter for Macbook 12',
    'Luxury power bank combined with powder, 2 mirrors - normal and 3x magnification, Illuminated under mirror with LED, Low weight and compact dimensions',
    'Battery capacity: 20,000 mAh; ultra-stable: outer shell made of durable synthetic rubber (military standard, withstands drops from up to 2 metres) ; protection: dust and splash proof: military standard iP54; battery level indicator and super fast charging; USB port can be connected to charger and other devices',
    'Smart thermostat designed to provide automatic time and temperature control of heating systems in homes and apartments. '
]
# Every row carrying one of those names receives the description at the same position
for i, product_name in enumerate(names_of_products_without_descriptions):
    products.loc[products['name'] == product_name, 'desc'] = missing_descriptions[i]

def start_pipeline(df):
    '''Return an independent copy of df so that later pipeline steps cannot alter the caller's frame'''
    working_copy = df.copy()
    return working_copy

def remove_missing_prices(df, col):
    '''Return only the rows of df whose value in column col is not missing'''
    has_value = df[col].notna()
    return df[has_value]

# Row count before filtering, used for the report below
products_before = len(products)

# Work on a copy and keep only the products that have a price
products = remove_missing_prices(start_pipeline(products), col='price')

products_after = len(products)

print(f"{products_before-products_after} missing values were removed from products")
print(f"{(products_before-products_after)/products_before * 100}%")

products

45 missing values were removed from products
0.4253710180546365%


Unnamed: 0,sku,name,desc,price,promo_price
0,RAI0007,Silver Rain Design mStand Support,Aluminum support compatible with all MacBook,59.99,499.899
1,APP0023,Apple Mac Keyboard Keypad Spanish,USB ultrathin keyboard Apple Mac Spanish.,59,589.996
2,APP0025,Mighty Mouse Apple Mouse for Mac,mouse Apple USB cable.,59,569.898
3,APP0072,Apple Dock to USB Cable iPhone and iPod white,IPhone dock and USB Cable Apple iPod.,25,229.997
4,KIN0007,Mac Memory Kingston 2GB 667MHz DDR2 SO-DIMM,2GB RAM Mac mini and iMac (2006/07) MacBook Pr...,34.99,31.99
...,...,...,...,...,...
19321,BEL0376,Belkin Travel Support Apple Watch Black,compact and portable stand vertically or horiz...,29.99,269.903
19322,THU0060,"Enroute Thule 14L Backpack MacBook 13 ""Black",Backpack with capacity of 14 liter compartment...,69.95,649.903
19323,THU0061,"Enroute Thule 14L Backpack MacBook 13 ""Blue",Backpack with capacity of 14 liter compartment...,69.95,649.903
19324,THU0062,"Enroute Thule 14L Backpack MacBook 13 ""Red",Backpack with capacity of 14 liter compartment...,69.95,649.903


## Clean orderlines

In [6]:
orderlines = orderlines_original.copy()

# Keep only the order lines whose order is still present in the cleaned orders
orderlines_before = len(orderlines)
belongs_to_known_order = orderlines['id_order'].isin(orders['order_id'])
orderlines = orderlines.loc[belongs_to_known_order].copy()
orderlines_after = len(orderlines)
rows_removed_from_orderlines = orderlines_before - orderlines_after

print(f'{rows_removed_from_orderlines} rows were removed from orderlines after removing unmatched order_ids')
print(f'This represents {round(rows_removed_from_orderlines/orderlines_before*100, 2)}% of the data')

# product_id contains no info (author's note), so it is dropped when present
orderlines = orderlines.drop(columns='product_id', errors='ignore')

# Transform the unit_price price column to floats
def transform_unit_price_to_floats(df):
    '''
    Return a copy of df whose unit_price column is converted from text to float.

    The raw strings use a dot both as thousands separator and as decimal point
    (e.g. '1.234.56'). Every dot except the last one is removed, so the last dot is
    read as the decimal point. This covers values with any number of thousands
    separators as well as values without a dot at all; missing values stay missing.
    '''
    # The lookahead makes the pattern match a dot only when another dot follows it
    # later in the same string, i.e. every dot but the last.
    cleaned = df.unit_price.str.replace(r'\.(?=.*\.)', '', regex=True)
    return df.assign(unit_price=cleaned.astype(float))

# Convert unit_price to floats on a fresh copy of the frame
orderlines = transform_unit_price_to_floats(start_pipeline(orderlines))

# Rows priced at exactly 6.59 are treated as corrupted: report their share, then drop them
print(f'{round(orderlines.loc[orderlines.unit_price==6.59].shape[0]/orderlines_after*100, 2)}% of the data have incorrect €6.59 values')
orderlines = orderlines.loc[orderlines.unit_price != 6.59]

# Persist the cleaned order lines next to the raw files
orderlines.to_csv(path + 'orderlines_clean.csv', index=False)

orderlines

240 rows were removed from orderlines after removing unmatched order_ids
This represents 0.08% of the data
0.07% of the data have incorrect €6.59 values


Unnamed: 0,id,id_order,product_quantity,sku,unit_price,date
0,1119109,299539,1,OTT0133,18.99,2017-01-01 00:07:19
1,1119110,299540,1,LGE0043,399.00,2017-01-01 00:19:45
2,1119111,299541,1,PAR0071,474.05,2017-01-01 00:20:57
3,1119112,299542,1,WDT0315,68.39,2017-01-01 00:51:40
4,1119113,299543,1,JBL0104,23.74,2017-01-01 01:06:38
...,...,...,...,...,...,...
293978,1650199,527398,1,JBL0122,42.99,2018-03-14 13:57:25
293979,1650200,527399,1,PAC0653,141.58,2018-03-14 13:57:34
293980,1650201,527400,2,APP0698,9.99,2018-03-14 13:57:41
293981,1650202,527388,1,BEZ0204,19.99,2018-03-14 13:58:01


## Merge with products to compare discounts

In [7]:
def merge_brands(df):
    '''
    Attach the brand table to df.

    The first three characters of sku are stored in a new short column, which is
    then the key for a merge with brands_original (pandas' default inner join).
    '''
    with_prefix = df.assign(short=df['sku'].str[:3])
    return pd.merge(with_prefix, brands_original, on='short')

def rename_columns(df):
    '''Rename long to brand, unit_price to sale_price and id_order to order_id'''
    new_names = {'long': 'brand', 'unit_price': 'sale_price', 'id_order': 'order_id'}
    return df.rename(columns=new_names)

def merge_orders(df):
    '''Join df with the orders table on order_id (pandas' default inner join)'''
    return pd.merge(df, orders, on='order_id')

def drop_deprecated_columns(df):
    '''Return df without the short and created_date columns'''
    no_longer_needed = ['short', 'created_date']
    return df.drop(columns=no_longer_needed)

def assign_product_categories(df):
    '''
    Return a copy of df with a new category column.

    Every row starts as 'unknown' and receives the label of the first pattern that is
    found (case-insensitively) in its desc or its name. The Apple device patterns are
    tried first and only apply to rows whose brand is 'Apple'; the general patterns
    are then tried on every row that is still 'unknown'. Patterns are tried in
    dictionary order and a row that already has a category is never relabelled.
    A missing desc or name simply counts as "no match".

    df must contain the desc, name and brand columns.
    '''
    apple_regexp_dict = {
        'iPod': '^.{0,7}apple ipod',
        'iPhone':  'apple iphone',
        'iPad':  'apple ipad',
        'Mac':  'apple macbook|apple iMac|apple Mac mini|desktop computer',
    }

    other_regexp_dict = {
        'Smartwatch':'withings|watch|fitbit|apple watch|smartwatch|smart watch',
        'Accessories': 'kit|strap|armband|belt|bracelet|stylus|pen|Bamboo Wacom Intuos|pencil|pen|rubber pointers|screwdriver|case|funda|housing|casing|folder|bag|backpack|cable|connector|Lightning to USB|Wall socket|power strip|adapter|battery|headset|headphones|mouse|trackpad|stand|support|protect|cover|sleeve|Screensaver|shellhub|dock|microphone|keyboard|keypad',
        'Hardware': 'Philips Hue|temperature sensor|display|monitor|camera|charger|speaker|router|repeater|Synology|nas|server|Parrot FPV Glasses|Command Pack 2 Skycontroller|Apple TV',
        'Software':  'adobe|Office 365|Office Home and Student|software|parallels',
        'Memory': 'hard disk|hard drive|flash drive|USB 2.0 key|USB 2.0 pen|SSD|pendrive|raid|SDHC|sata|memory card|Portable Hard Thunderbolt',
        'Repairs & warranties': 'repair|parts and labor|warranty|applecare|license|protection|installation',
    }

    def _label_unknown_rows(frame, patterns, eligible=True):
        '''Give each eligible row that is still 'unknown' the label of the first matching pattern'''
        for label, pattern in patterns.items():
            # na=False: a missing desc/name is an explicit non-match instead of a NaN
            # that has to be coerced when the two masks are combined.
            found = (
                frame['desc'].str.contains(pattern, flags=re.IGNORECASE, regex=True, na=False)
                | frame['name'].str.contains(pattern, flags=re.IGNORECASE, regex=True, na=False)
            )
            claim = found & (frame['category'] == 'unknown') & eligible
            frame = frame.assign(category=np.where(claim, label, frame['category']))
        return frame

    df = df.assign(category='unknown')

    # Main Apple devices first, restricted to Apple-branded rows
    df = _label_unknown_rows(df, apple_regexp_dict, eligible=(df['brand'] == 'Apple'))

    # Then everything else, whatever the brand
    df = _label_unknown_rows(df, other_regexp_dict)

    return df

# Join every order line with its product (inner join on sku), then add the brand,
# rename the columns, add the order data, drop the helper columns and categorise.
sales_info = (
    pd.merge(orderlines, products, how='inner', on='sku')
    .pipe(start_pipeline)
    .pipe(merge_brands)
    .pipe(rename_columns)
    .pipe(merge_orders)
    .pipe(drop_deprecated_columns)
    .pipe(assign_product_categories)
)

# Number of order lines per category
sales_info['category'].value_counts()

Accessories             130862
Memory                   30752
unknown                  26621
Hardware                 25742
iPhone                   23895
Mac                      23686
Smartwatch               11521
iPad                     10279
Software                  6479
Repairs & warranties      1538
iPod                       392
Name: category, dtype: int64

In [20]:
def split_and_join_prices(df):
    '''Return a copy of df whose price strings have every decimal point removed and .00 appended'''
    digits_only = df['price'].str.replace('.', '', regex=False)
    return df.assign(price=digits_only.map(lambda digits: digits + '.00'))

def split_and_join_promo_prices(df):
    '''Return a copy of df whose promo_price strings have every decimal point removed and .00 appended'''
    digits_only = df['promo_price'].str.replace('.', '', regex=False)
    return df.assign(promo_price=digits_only.map(lambda digits: digits + '.00'))

def _insert_decimal_at_string_position(s, pos):
    '''Insert a decimal point at a given position in a string'''
    s = s.split('.')
    s = s[0] + s[1]
    s = s[:pos]+'.'+s[pos:]
    return s 
    
def _insert_decimal_in_price(row):
    '''
    Keep moving the decimal point towards the end of the string until the sale_price is lower or equal to the price.
    Then transform the string to a float and round it to two decimal places.
    '''
    decimal_position = 1
    row.price = _insert_decimal_at_string_position(row.price, decimal_position)
    
    while float(row.price) < row.sale_price:
        if round(float(row.price), 0) == round(row.sale_price, 0):
            row.sale_price = round(float(row.price), 2)
            return round(float(row.price), 2)
        else:
            row.price = _insert_decimal_at_string_position(row.price, decimal_position)
            decimal_position += 1

    return round(float(row.price), 2)

def transform_price_to_floats(df):
    '''
    Return a copy of df whose price column holds the floats computed by _insert_decimal_in_price.

    The input frame is left untouched, in line with the other pipeline steps, which
    build their result with assign instead of writing into their argument.
    '''
    # Row by row on purpose: the helper needs price and sale_price of the same row
    fixed_prices = [_insert_decimal_in_price(row) for _, row in df.iterrows()]
    return df.assign(price=fixed_prices)

def _insert_decimal_in_promo_price(row, decimal_position = -2):
    '''
    If the euro value of the price is equal to the euro value of the promo_price, 
    set the promo_price equal to price and return it.
    This is because some promo_prices are slightly larger than their equivalent prices, e.g. 12.95 - 12.99
    
    Otherwise, keep moving the decimal point towards the start of the string until the promo_price is lower than the price.
    Then transform the string to a float and round it to two decimal places.
    '''
    
    
    while float(row.promo_price) > row.price:
        if round(float(row.promo_price), 0) == round(row.price, 0):
            row.promo_price = row.price
            return row.promo_price
        else:
            row.promo_price = _insert_decimal_at_string_position(row.promo_price, decimal_position)
            decimal_position += -1
    
    return round(float(row.promo_price), 2)

def transform_promo_price_to_floats(df):
    '''
    Return a copy of df whose promo_price column holds the floats computed by _insert_decimal_in_promo_price.

    The input frame is left untouched, in line with the other pipeline steps, which
    build their result with assign instead of writing into their argument.
    '''
    # Row by row on purpose: the helper needs promo_price and price of the same row
    fixed_promo_prices = [_insert_decimal_in_promo_price(row) for _, row in df.iterrows()]
    return df.assign(promo_price=fixed_promo_prices)
    
def calculate_products_discounts(df):
    '''
    Add two columns, both rounded to two decimals: discount (price minus promo_price)
    and discount_pc (that difference as a percentage of price).
    '''
    price_cut = df['price'] - df['promo_price']
    return df.assign(
        discount=price_cut.round(2),
        discount_pc=(price_cut / df['price'] * 100).round(2),
    )

def calculate_sales_discounts(df):
    '''
    Add two columns, both rounded to two decimals: sales_discount (price minus sale_price)
    and sales_discount_pc (that difference as a percentage of price).
    '''
    price_cut = df['price'] - df['sale_price']
    return df.assign(
        sales_discount=price_cut.round(2),
        sales_discount_pc=(price_cut / df['price'] * 100).round(2),
    )

# Only completed orders are processed because the row-by-row price repair is slow.
# NOTE: sales_info is overwritten at the end of this cell, so re-running the cell on
# its own feeds already-converted float prices into the string-based steps; re-run
# the cell that builds sales_info first.
temp = sales_info.loc[sales_info.state == 'Completed'].copy()

temp = (
    temp
    .pipe(start_pipeline)
    .pipe(split_and_join_prices)
    .pipe(transform_price_to_floats)
    .pipe(split_and_join_promo_prices)
    .pipe(transform_promo_price_to_floats)
    .pipe(calculate_products_discounts)
    .pipe(calculate_sales_discounts)
)

sales_info = temp.copy()

# Persist the enriched sales table next to the raw files
sales_info.to_csv(path + 'sales_info_clean.csv', index=False)

In [21]:
# Display the sales table (completed orders only, with the discount columns added)
sales_info

Unnamed: 0,id,order_id,product_quantity,sku,sale_price,date,name,desc,price,promo_price,brand,total_paid,state,category,discount,discount_pc,sales_discount,sales_discount_pc
2,1121709,300713,1,OTT0133,19.99,2017-01-02 16:51:26,Otterbox iPhone Case Symmetry 2.0 SE / 5s / 5 ...,resistant cover and thin beveled edges for iPh...,34.99,19.99,Otterbox,54.99,Completed,Accessories,15.0,42.87,15.0,42.87
3,1121701,300713,1,APP0437,35.00,2017-01-02 16:50:07,Apple EarPods headset with microphone connecto...,IPhone iPad and iPod headphones with microphon...,35.00,33.00,Apple,54.99,Completed,Accessories,2.0,5.71,0.0,0.00
6,1137443,305815,1,OTT0133,19.99,2017-01-09 15:59:02,Otterbox iPhone Case Symmetry 2.0 SE / 5s / 5 ...,resistant cover and thin beveled edges for iPh...,34.99,19.99,Otterbox,37.98,Completed,Accessories,15.0,42.87,15.0,42.87
7,1137581,305815,1,IFX0022,17.99,2017-01-09 17:05:39,Battery Kit complete iFixit iPhone 5 battery c...,Battery replacement kit with tools for iPhone 5.,30.99,29.99,iFixit,37.98,Completed,Accessories,1.0,3.23,13.0,41.95
12,1164617,317155,1,OTT0133,18.99,2017-01-30 08:38:08,Otterbox iPhone Case Symmetry 2.0 SE / 5s / 5 ...,resistant cover and thin beveled edges for iPh...,34.99,19.99,Otterbox,40.88,Completed,Accessories,15.0,42.87,16.0,45.73
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
291756,1647799,526223,1,SDE0007,65.99,2018-03-12 23:50:43,SDesign iPhone charging dock QI X / 8 / Apple ...,Base load compatible with iPhone and Apple Wat...,69.99,65.99,SDesign,69.98,Completed,Smartwatch,4.0,5.72,4.0,5.72
291760,1641088,523660,1,QAR0007,79.99,2018-03-09 09:48:18,Qardio Arm Voltage Meter iPhone White,Wireless sphygmomanometer for iPhone iPad and ...,129.99,79.99,Qardio,83.98,Completed,unknown,50.0,38.46,50.0,38.46
291762,1641202,523717,1,QAR0007,79.99,2018-03-09 11:19:26,Qardio Arm Voltage Meter iPhone White,Wireless sphygmomanometer for iPhone iPad and ...,129.99,79.99,Qardio,84.98,Completed,unknown,50.0,38.46,50.0,38.46
291763,1642236,524124,1,QAR0007,79.99,2018-03-09 20:18:27,Qardio Arm Voltage Meter iPhone White,Wireless sphygmomanometer for iPhone iPad and ...,129.99,79.99,Qardio,84.98,Completed,unknown,50.0,38.46,50.0,38.46
