## Import the data

In [116]:
import pandas as pd
import numpy as np
import re

# Folder (relative to this notebook) that holds the raw CSV exports
path = '../data/eniac/'
orderlines = pd.read_csv(f'{path}orderlines.csv')
orders = pd.read_csv(f'{path}orders.csv')
brands = pd.read_csv(f'{path}brands.csv')
products = pd.read_csv(f'{path}products.csv')

# Labels and frames are kept in the same order so they can be paired up
files = ['orderlines','orders','brands','products']
df_list = [orderlines, orders, brands, products]

# Parse the timestamp columns in place (the frames are also referenced by df_list)
orderlines['date'] = pd.to_datetime(orderlines['date'])
orders['created_date'] = pd.to_datetime(orders['created_date'])

## Clean orders

In [117]:
# Step 1: discard every order row that has a missing value in any column
orders_before = len(orders)
orders.dropna(inplace=True)
orders_after = len(orders)

rows_removed_from_orders = orders_before - orders_after
print(f'{rows_removed_from_orders} rows were removed from orders after removing NaN values')
print(f'This represents {round(rows_removed_from_orders/orders_before*100, 2)}% of the data')


# Step 2: keep only the orders that have at least one matching row in orderlines
# (22213 orders were dropped by this step in the recorded run)
orders_before = len(orders)
orders = orders.loc[orders['order_id'].isin(orderlines['id_order'])]
orders_after = len(orders)
rows_removed_from_orders = orders_before - orders_after

# Author's note on the orders that have no orderlines:
#   - 12304 have state = Place Order and 9810 have state = Shopping Basket (22114 together).
#   - Across all orders, 40883 have state = Place Order and 117809 have state = Shopping Basket.
print('\n')
print(f'{rows_removed_from_orders} rows were removed from orders after removing order_ids which do not exist in orderlines')
print(f'This represents {round(rows_removed_from_orders/orders_before*100, 2)}% of the data')

orders.to_csv(f'{path}orders_clean.csv', index=False)

5 rows were removed from orders after removing NaN values
This represents 0.0% of the data


22213 rows were removed from orders after removing order_ids which do not exist in orderlines
This represents 9.79% of the data


## Clean products

In [118]:
# Drop the type and in_stock columns (ignored when they are already gone, e.g. on a re-run)
products.drop(columns=['type', 'in_stock'], inplace=True, errors='ignore')
# Drop duplicate rows based on sku value
products.drop_duplicates(subset='sku', inplace=True)
# Check for products without descriptions
names_of_products_without_descriptions = products[products.desc.isna()].name.tolist()
# Add missing descriptions.
# The descriptions are matched to the products BY POSITION: the n-th description
# belongs to the n-th product (in frame order) whose desc is missing.
missing_descriptions = [
    '2TB Mac hard drive and Nas',
    'Apple keyboard for iPad 9.7',
    'NAS server with 10GB RAM',
    'Ethernet adapter for Macbook 12',
    'Luxury power bank combined with powder, 2 mirrors - normal and 3x magnification, Illuminated under mirror with LED, Low weight and compact dimensions',
    'Battery capacity: 20,000 mAh; ultra-stable: outer shell made of durable synthetic rubber (military standard, withstands drops from up to 2 metres) ; protection: dust and splash proof: military standard iP54; battery level indicator and super fast charging; USB port can be connected to charger and other devices',
    'Smart thermostat designed to provide automatic time and temperature control of heating systems in homes and apartments. '
]
# Fail early, before anything is modified, if the data contains more products
# without a description than were written by hand. Indexing the list blindly
# would otherwise raise an IndexError after some rows had already been filled.
if len(names_of_products_without_descriptions) > len(missing_descriptions):
    raise ValueError(
        f'{len(names_of_products_without_descriptions)} products have no description '
        f'but only {len(missing_descriptions)} replacement descriptions are defined'
    )
# zip stops at the shorter sequence, so a re-run (no missing descriptions left) is a no-op
for product_name, description in zip(names_of_products_without_descriptions, missing_descriptions):
    products.loc[products.name == product_name, 'desc'] = description

# Clean the price and promo_price columns
def start_pipeline(df):
    '''Return an independent copy of df so that later pipeline steps cannot alter the caller's frame'''
    working_copy = df.copy()
    return working_copy

def remove_duplicates(df, col):
    '''
    Keep only the rows whose value in column `col` is present (not NaN).

    Note: despite the name, no de-duplication happens here; the function only
    filters out rows where `col` is missing.
    '''
    has_value = df[col].notna()
    return df[has_value]

def clean_prices(df):
    '''
    Convert the string `price` column into floats rounded to two decimals.

    The raw strings are repaired with the following rules, applied in order
    to every value:
      1. no decimal point at all          -> append '.00'
      2. a single digit after the point   -> append '0'
      3. two points, last one 3 from end  -> drop the points and divide by 1000
      4. one point, 3 digits after it     -> drop the point and divide by 10000
    Rule 4 is also applied to the output of rule 3.

    Returns a new DataFrame; the frame passed in is left untouched, so the
    function is safe to call on a filtered slice of another frame.
    '''
    def _repair(raw):
        # Each rule looks at the string produced by the previous one
        if raw.count('.') == 0:
            raw = raw + '.00'
        if raw[-2] == '.':
            raw = raw + '0'
        if raw[-4] == '.' and raw.count('.') == 2:
            raw = str(float(raw.replace('.', '')) / 1000)
        if raw[-4] == '.' and raw.count('.') == 1:
            raw = str(float(raw.replace('.', '')) / 10000)
        return raw

    repaired = df.price.apply(_repair).astype(float).round(decimals=2)
    # assign() builds a new frame instead of writing into the caller's object
    return df.assign(price=repaired)

def split_and_join_promo_prices(df):
    '''Strip every decimal point out of the promo_price strings, then append .00 to each of them'''
    digits_only = df.promo_price.str.replace('.', '', regex=False)
    with_cents = digits_only.apply(lambda digits: digits + '.00')
    return df.assign(promo_price=with_cents)

def _insert_decimal_at_string_position(s, pos):
    '''Insert a decimal point at a given position in a string'''
    s = s.split('.')
    s = s[0] + s[1]
    s = s[:pos]+'.'+s[pos:]
    return s 


def _insert_decimal_in_promo_price(row):
    '''
    Work out a plausible promo_price for one row by repositioning its decimal point.

    `row` must expose `promo_price` (a string that float() can parse) and
    `price` (a number). The row's promo_price attribute is overwritten while
    the function runs.

    While the promo_price is larger than the price:
      - If the euro value of the price is equal to the euro value of the promo_price
        (both rounded to 0 decimals), set the promo_price equal to price and return it.
        This is because some promo_prices are slightly larger than their equivalent
        prices, e.g. 12.95 - 12.99
      - Otherwise, move the decimal point one position further towards the start of
        the string (starting two characters from the end) and compare again.

    When the loop ends, the promo_price string is transformed to a float and
    rounded to two decimal places. Note that the early-return branch hands back
    `row.price` as is, without rounding.

    NOTE(review): once the decimal point has reached the front of the string the
    value no longer shrinks, so a negative price that satisfies neither exit
    condition would keep this loop running forever - confirm prices are never negative.
    '''
    
    # First insertion point: two characters from the end of the digit string
    decimal_position = -2
    while float(row.promo_price) > row.price:
        if round(float(row.promo_price), 0) == round(row.price, 0):
            # Same whole-euro amount: treat the promo price as equal to the price
            row.promo_price = row.price
            return row.promo_price
        else:
            row.promo_price = _insert_decimal_at_string_position(row.promo_price, decimal_position)
            # Next attempt shifts the point one more place to the left
            decimal_position += -1
    
    return round(float(row.promo_price), 2)

def transform_promo_price_to_floats(df):
    '''Replace promo_price with the per-row result of _insert_decimal_in_promo_price and return the same frame'''
    repaired_prices = []
    for _, row in df.iterrows():
        repaired_prices.append(_insert_decimal_in_promo_price(row))
    df.promo_price = repaired_prices
    return df

def calculate_products_discounts(df):
    '''
    Add the catalogue discount columns to a copy of df.

    discount    : price - promo_price, rounded to 2 decimals
    discount_pc : that difference as a percentage of price, rounded to 2 decimals
    '''
    saving = df.price - df.promo_price
    saving_pc = saving / df.price * 100
    return df.assign(discount=saving.round(2), discount_pc=saving_pc.round(2))

# Run the full cleaning pipeline on a copy of the products frame
products = (products
        .pipe(start_pipeline)
        .pipe(remove_duplicates,col='price')
        .pipe(clean_prices)
        .pipe(split_and_join_promo_prices)
        .pipe(transform_promo_price_to_floats)
        .pipe(calculate_products_discounts)
)

# Manually fix the last of the incorrect prices.
# For each sku both price and promo_price are set to the same hand-checked value.
MANUAL_PRICE_FIXES = {
    'DJI0025': 849.00,
    'TAD0008': 179.99,
    'IOT0018': 24.99,
    'APP2490': 1599.00,
    'KIN0150': 17.28,
    'REP0185': 69.99,
    'APP1477': 490.33,
}
for fixed_sku, fixed_price in MANUAL_PRICE_FIXES.items():
    products.loc[products['sku'] == fixed_sku, ['price', 'promo_price']] = fixed_price

# The discount columns were computed from the prices BEFORE the manual fixes,
# so refresh them; otherwise the saved file carries stale discounts for those skus.
products = products.pipe(calculate_products_discounts)

products.to_csv(path + 'products_clean.csv', index=False)
products

Unnamed: 0,sku,name,desc,price,promo_price,discount,discount_pc
0,RAI0007,Silver Rain Design mStand Support,Aluminum support compatible with all MacBook,59.99,49.99,10.00,16.67
1,APP0023,Apple Mac Keyboard Keypad Spanish,USB ultrathin keyboard Apple Mac Spanish.,59.00,59.00,0.00,0.00
2,APP0025,Mighty Mouse Apple Mouse for Mac,mouse Apple USB cable.,59.00,56.99,2.01,3.41
3,APP0072,Apple Dock to USB Cable iPhone and iPod white,IPhone dock and USB Cable Apple iPod.,25.00,23.00,2.00,8.00
4,KIN0007,Mac Memory Kingston 2GB 667MHz DDR2 SO-DIMM,2GB RAM Mac mini and iMac (2006/07) MacBook Pr...,34.99,31.99,3.00,8.57
...,...,...,...,...,...,...,...
19321,BEL0376,Belkin Travel Support Apple Watch Black,compact and portable stand vertically or horiz...,29.99,26.99,3.00,10.00
19322,THU0060,"Enroute Thule 14L Backpack MacBook 13 ""Black",Backpack with capacity of 14 liter compartment...,69.95,64.99,4.96,7.09
19323,THU0061,"Enroute Thule 14L Backpack MacBook 13 ""Blue",Backpack with capacity of 14 liter compartment...,69.95,64.99,4.96,7.09
19324,THU0062,"Enroute Thule 14L Backpack MacBook 13 ""Red",Backpack with capacity of 14 liter compartment...,69.95,64.99,4.96,7.09


## Clean orderlines

In [119]:
# Keep only the orderlines whose order id still exists in the cleaned orders frame
# (240 rows were dropped by this step in the recorded run)
orderlines_before = len(orderlines)
orderlines = orderlines.loc[orderlines['id_order'].isin(orders['order_id'])].copy()
orderlines_after = len(orderlines)
rows_removed_from_orderlines = orderlines_before - orderlines_after

print(f'{rows_removed_from_orderlines} rows were removed from orderlines after removing unmatched order_ids')
print(f'This represents {round(rows_removed_from_orderlines/orderlines_before*100, 2)}% of the data')


# product_id contains no info, so discard it (skipped silently when already removed)
orderlines = orderlines.drop(columns='product_id', errors='ignore')

# Transform the unit_price price column to floats
def transform_unit_price_to_floats(df):
    '''
    Return a copy of df whose unit_price column is parsed into floats.

    In the raw strings the LAST '.' is the decimal point and any earlier '.'
    is a thousands separator, e.g. '1.137.99' -> 1137.99 and '18.99' -> 18.99.
    A string without any '.' (e.g. '399') is taken as a whole number instead
    of raising an IndexError. Values that are not strings (NaN, or floats from
    an earlier run of this cell) are passed through to the float conversion
    unchanged, which keeps the step re-runnable.
    '''
    def _to_decimal_string(raw):
        if not isinstance(raw, str):
            return raw
        whole, point, cents = raw.rpartition('.')
        if not point:
            # No decimal point at all: nothing to rearrange
            return raw
        # Remove the thousands separators from the integer part only
        return whole.replace('.', '') + '.' + cents

    return df.assign(unit_price=df.unit_price.map(_to_decimal_string).astype(float))

# Work on a copy of the frame and parse unit_price into floats
orderlines = transform_unit_price_to_floats(start_pipeline(orderlines))

# Drop these corrupted orders (every line with a unit price of exactly 6.59)
orderlines = orderlines[orderlines['unit_price'] != 6.59]

orderlines.to_csv(f'{path}orderlines_clean.csv', index=False)

orderlines

240 rows were removed from orderlines after removing unmatched order_ids
This represents 0.08% of the data


Unnamed: 0,id,id_order,product_quantity,sku,unit_price,date
0,1119109,299539,1,OTT0133,18.99,2017-01-01 00:07:19
1,1119110,299540,1,LGE0043,399.00,2017-01-01 00:19:45
2,1119111,299541,1,PAR0071,474.05,2017-01-01 00:20:57
3,1119112,299542,1,WDT0315,68.39,2017-01-01 00:51:40
4,1119113,299543,1,JBL0104,23.74,2017-01-01 01:06:38
...,...,...,...,...,...,...
293978,1650199,527398,1,JBL0122,42.99,2018-03-14 13:57:25
293979,1650200,527399,1,PAC0653,141.58,2018-03-14 13:57:34
293980,1650201,527400,2,APP0698,9.99,2018-03-14 13:57:41
293981,1650202,527388,1,BEZ0204,19.99,2018-03-14 13:58:01


## Merge with products to compare discounts

In [120]:
# Attach the product details to every order line; lines whose sku is unknown are dropped
sales_info = pd.merge(orderlines, products, on='sku', how='inner')

def calculate_sales_discounts(df):
    '''
    Add the discount actually given at sale time to a copy of df.

    sales_discount    : price - unit_price, rounded to 2 decimals
    sales_discount_pc : that difference as a percentage of price, rounded to 2 decimals
    '''
    markdown = df.price - df.unit_price
    markdown_pc = markdown / df.price * 100
    return df.assign(sales_discount=markdown.round(2), sales_discount_pc=markdown_pc.round(2))

def merge_brands(df):
    '''Derive the 3-character sku prefix as column short and join the global brands frame on it'''
    with_prefix = df.assign(short=df['sku'].str[:3])
    return with_prefix.merge(brands, on='short')

def rename_columns(df):
    '''Give the merged columns their final names: long -> brand, unit_price -> sale_price, id_order -> order_id'''
    new_names = {'long': 'brand', 'unit_price': 'sale_price', 'id_order': 'order_id'}
    return df.rename(columns=new_names)

def merge_orders(df):
    '''Join the global orders frame onto df using the shared order_id column'''
    with_orders = df.merge(right=orders, on='order_id')
    return with_orders

def drop_deprecated_columns(df):
    '''Return df without the helper columns short and created_date'''
    no_longer_needed = ['short', 'created_date']
    return df.drop(columns=no_longer_needed)

def assign_product_categories(df):
    '''
    Return a copy of df with a `category` column derived from `name`, `desc` and `brand`.

    Every row starts as 'unknown'. The regular expressions below are then tried
    in dictionary order, case-insensitively, against both `desc` and `name`;
    the first label whose pattern matches wins and is never overwritten.
    The Apple-specific labels are tried first and only on rows whose brand is
    exactly 'Apple'; the generic labels are tried afterwards on all rows.

    Missing `name`/`desc` values count as "no match" (na=False), so the
    boolean masks never contain NaN.
    '''
    apple_regexp_dict = {
        'iPod': '^.{0,7}apple ipod',
        'iPhone':  'apple iphone',
        'iPad':  'apple ipad',
        'Mac':  'apple macbook|apple iMac|apple Mac mini|desktop computer',
    }

    other_regexp_dict = {
        'Smartwatch':'withings|watch|fitbit|apple watch|smartwatch|smart watch',
        'Accessories': 'kit|strap|armband|belt|bracelet|stylus|pen|Bamboo Wacom Intuos|pencil|pen|rubber pointers|screwdriver|case|funda|housing|casing|folder|bag|backpack|cable|connector|Lightning to USB|Wall socket|power strip|adapter|battery|headset|headphones|mouse|trackpad|stand|support|protect|cover|sleeve|Screensaver|shellhub|dock|microphone|keyboard|keypad',
        'Hardware': 'Philips Hue|temperature sensor|display|monitor|camera|charger|speaker|router|repeater|Synology|nas|server|Parrot FPV Glasses|Command Pack 2 Skycontroller|Apple TV',
        'Software':  'adobe|Office 365|Office Home and Student|software|parallels',
        'Memory': 'hard disk|hard drive|flash drive|USB 2.0 key|USB 2.0 pen|SSD|pendrive|raid|SDHC|sata|memory card|Portable Hard Thunderbolt',
        'Repairs & warranties': 'repair|parts and labor|warranty|applecare|license|protection|installation',
    }

    def _mentions(frame, pattern):
        '''Boolean Series: True where pattern occurs in desc or name (case-insensitive)'''
        regexp = re.compile(pattern, flags=re.IGNORECASE)
        in_desc = frame['desc'].str.contains(regexp, regex=True, na=False)
        in_name = frame['name'].str.contains(regexp, regex=True, na=False)
        return in_desc | in_name

    def _label_unknown_rows(frame, regexp_dict, eligible):
        '''Apply each label, in order, to the still-unknown rows that match and are eligible'''
        for label, pattern in regexp_dict.items():
            hit = _mentions(frame, pattern) & (frame['category'] == 'unknown') & eligible
            frame = frame.assign(category=np.where(hit, label, frame['category']))
        return frame

    df = df.assign(category = 'unknown')

    # Find main apple items (restricted to rows sold under the Apple brand)
    df = _label_unknown_rows(df, apple_regexp_dict, df['brand'] == 'Apple')

    # Find other items (no brand restriction)
    df = _label_unknown_rows(df, other_regexp_dict, True)

    return df


sales_info = (
    sales_info
    .pipe(start_pipeline)
    .pipe(calculate_sales_discounts)
    .pipe(merge_brands)
    .pipe(rename_columns)
    .pipe(merge_orders)
    .pipe(drop_deprecated_columns)
    .pipe(assign_product_categories)
    # Drop any order which are not completed to minimise corrupted data
    .loc[lambda frame: frame['state'] == 'Completed']
)

sales_info.to_csv(f'{path}sales_info_clean.csv', index=False)

# How many completed order lines ended up in each category
sales_info['category'].value_counts()

Accessories             30123
Memory                   7183
unknown                  7121
Hardware                 6202
iPhone                   3823
Smartwatch               2913
Mac                      2652
iPad                     1432
Repairs & warranties      109
Software                   60
iPod                       54
Name: category, dtype: int64

In [121]:
# Let pandas render up to 500 rows / columns so the inspection tables are not truncated
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
# Name and description of every sale that could not be categorised
check_df = sales_info.loc[sales_info['category'] == 'unknown', ['name', 'desc']]
# Optional: count how often each (name, desc) pair occurs among the unknowns
#check_df.groupby(check_df.columns.tolist(),as_index=False).size().sort_values('size', ascending=False)

In [122]:
# Earlier spot checks, kept for reference:
# sales_info[sales_info['sku'].isin(['APP2348', 'APP2201', 'APP2117', 'APP2358'])]
# sales_info[sales_info['order_id']==514441]

# Sales discounted by more than 90% of the list price, smallest discount first (at most 500 rows)
(
    sales_info
    .loc[sales_info['sales_discount_pc'] > 90]
    .sort_values(by='sales_discount_pc')
    .head(500)
)

Unnamed: 0,id,order_id,product_quantity,sku,sale_price,date,name,desc,price,promo_price,discount,discount_pc,sales_discount,sales_discount_pc,brand,total_paid,state,category
97103,1257935,360505,1,APP1498,829.0,2017-05-30 10:19:14,"Apple iPad Pro 9.7 ""Wi-Fi + Cellular 32GB Silver",9.7-inch Apple iPad Pro Wi-Fi + Cellular 32GB ...,8328.12,8328.11,0.01,0.0,7499.12,90.05,Apple,2167.0,Completed,iPad
245063,1255349,359317,1,TRA0011,44.99,2017-05-26 09:54:58,Transcend JetDrive Lite 330 64GB Macbook Pro R...,MLC memory card 64GB for Macbooks Retina 13 in...,452.81,452.81,0.0,0.0,407.82,90.06,Trascend,49.98,Completed,Memory
251754,1182971,325187,1,SAN0138,49.99,2017-02-16 12:42:14,SanDisk Ultra microSDXC 128GB Memory Card + SD...,And adapter card memory 128 GB microSDXC Class...,502.8,502.8,0.0,0.0,452.81,90.06,SanDisk,54.98,Completed,Accessories
251751,1150767,311402,1,SAN0138,49.99,2017-01-18 08:46:54,SanDisk Ultra microSDXC 128GB Memory Card + SD...,And adapter card memory 128 GB microSDXC Class...,502.8,502.8,0.0,0.0,452.81,90.06,SanDisk,54.98,Completed,Accessories
245082,1519037,473423,1,TRA0011,43.28,2017-12-27 14:13:19,Transcend JetDrive Lite 330 64GB Macbook Pro R...,MLC memory card 64GB for Macbooks Retina 13 in...,452.81,452.81,0.0,0.0,409.53,90.44,Trascend,47.27,Completed,Memory
190861,1522661,475203,1,WAC0235,18.99,2017-12-28 17:39:05,Wacom Intuos 4 Case Transport size S,Carrying case for graphics tablet Intuos 4 S,199.9,199.9,0.0,0.0,180.91,90.5,Wacom,22.98,Completed,Accessories
190860,1519901,473855,1,WAC0235,18.99,2017-12-27 19:39:43,Wacom Intuos 4 Case Transport size S,Carrying case for graphics tablet Intuos 4 S,199.9,199.9,0.0,0.0,180.91,90.5,Wacom,23.98,Completed,Accessories
186775,1527010,477240,1,WAC0235,18.99,2017-12-30 11:55:22,Wacom Intuos 4 Case Transport size S,Carrying case for graphics tablet Intuos 4 S,199.9,199.9,0.0,0.0,180.91,90.5,Wacom,110.97,Completed,Accessories
144394,1402693,424462,1,APP2494,208.05,2017-11-10 12:02:59,Apple TV 4K 64GB,Apple multimedia player with 4K resolution and...,2190.0,2190.0,0.0,0.0,1981.95,90.5,Apple,212.04,Completed,Hardware
26682,1406083,329976,1,APP2494,208.05,2017-11-12 22:39:58,Apple TV 4K 64GB,Apple multimedia player with 4K resolution and...,2190.0,2190.0,0.0,0.0,1981.95,90.5,Apple,541.08,Completed,Hardware
