## Data Cleaning

In [1]:
# Import libraries
import numpy as np 
import pandas as pd 
import datetime as dt
import matplotlib.pyplot as plt 
from IPython.display import display
from collections import defaultdict

In [2]:
# Merge τα δύο spreadsheets
df1_raw = pd.read_excel("online_retail_II.xlsx", sheet_name = "Year 2009-2010")
df1_raw.reset_index()
df2_raw = pd.read_excel("online_retail_II.xlsx", sheet_name = "Year 2010-2011")
df_raw = pd.concat([df1_raw, df2_raw])
df_raw.shape

(1067371, 8)

In [3]:
# Remove 9 days overlap
df2_raw_start_date = df2_raw["InvoiceDate"].min()
df = pd.concat([df1_raw[df1_raw["InvoiceDate"] < df2_raw_start_date], df2_raw])
df.shape 

(1044848, 8)

In [4]:
# Column datatype conversions: Invoice to string 
df["Invoice"] = df["Invoice"].astype(str)
df.shape

(1044848, 8)

In [5]:
# Column datatype conversions: StockCode to string, uppercase letters, substract blank space from value names
df["StockCode"] = df["StockCode"].astype(str).str.upper().str.strip()
df.shape

(1044848, 8)

In [6]:
# Drop missing values in Customer ID (235 287)
df = df.dropna(subset = ["Customer ID"])
df.shape

(809561, 8)

In [7]:
#duplicated = df[df.duplicated(keep = 'first')]
#duplicated
#df = df.drop_duplicates()
#df.shape

In [8]:
len(df[df["Quantity"] < 0])

18446

In [9]:
#df_grp_cid_stc=df.groupby(['Customer ID', 'StockCode'], as_index=True, sort=False, group_keys=True)['Quantity'].sum()
#df_grp_cid_stc = pd.DataFrame(df_grp_cid_stc) # equivalent to the command below
# https://stackoverflow.com/questions/34113203/python-pandas-setting-groupby-group-labels-as-index-in-a-new-dataframe
#df_grp_cid_stc = pd.DataFrame(df_grp_cid_stc.values, index=df_grp_cid_stc.index.values, columns=['Quantity'])
#df_grp_cid_stc
#df_grp_cid_stc.head(30)

In [10]:
# Shape of df before dropping transactions with negative quantities
rows_before_negq_drop = df.shape[0]
df.shape

(809561, 8)

In [11]:
# Number of transactions with negative quantities
len(df[df["Quantity"] < 0])

18446

In [12]:
# Create new column 'CIDxSC Summed Quantity' in which for each combination of Customer ID and StockCode
# we sum the Quantity in order to reach the quantities that were eventually bought from customers
df["CIDxSC Summed Quantity"] = df.groupby(["Customer ID", "StockCode"])["Quantity"].transform("sum")
df.shape

(809561, 9)

In [13]:
df.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country,CIDxSC Summed Quantity
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom,12
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom,28
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom,28
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085.0,United Kingdom,72
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom,36


In [14]:
# Inspect result for Customer ID '13085' and StockCode '79323P'  
# a = df['StockCode']=='79323P'
# b = df['Customer ID'] == 13085.0
# c = a & b
# df[c]

In [15]:
df_grp_cid_stc_negq = df[df["CIDxSC Summed Quantity"] < 0]
df_grp_cid_stc_negq_idx = df_grp_cid_stc_negq.index
df_grp_cid_stc_negq_idx

Int64Index([   221,    285,    318,    399,    400,    401,    576,    589,
               590,    591,
            ...
            531616, 531759, 531799, 531800, 531816, 532722, 533082, 534308,
            537595, 541541],
           dtype='int64', length=2209)

In [16]:
# Drop summed quntities with negative value.
df.drop(df_grp_cid_stc_negq_idx, inplace=True)

In [17]:
# Shape of df after removal of negative values
rows_after_negq_drop = df.shape[0]
df.shape

(805701, 9)

In [18]:
# This is the number of rows deleted: for this combination of Customer ID and StockCode: 
# the summed quantity of those particular stock codes was found to be negative.
rows_before_negq_drop-rows_after_negq_drop

3860

In [19]:
# Confirm that all negative summed quantities are removed
df.shape
len(df[df["CIDxSC Summed Quantity"] < 0])

0

In [20]:
# A number of rows with negative quantities remain after the removal of negative summed quantities. 
# These don't matter because they were already taken into account during the summation. 
# The ones that matter are those of column 'CIDxSC Summed Quantity', which have been dropped already.
df.shape
len(df[df["Quantity"] < 0])

16494

In [21]:
# Now drop the 'CIDxSC Summed Quantity' column: its only utility was for dropping the negative summed quantities
df.drop(["CIDxSC Summed Quantity"], axis=1, inplace=True)
df.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085.0,United Kingdom
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom


In [22]:
# Assert quantity is not null per customer for all customers
df["Quantity"].isnull().any()

False

In [23]:
# Assert price > 0 for all prices and stockcodes
len(df[df["Price"] < 0])

0

In [24]:
# Remove time from InvoiceDate column
df["InvoiceDate"] = pd.to_datetime(df["InvoiceDate"]).dt.date
df.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01,6.95,13085.0,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01,6.75,13085.0,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01,6.75,13085.0,United Kingdom
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01,2.1,13085.0,United Kingdom
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01,1.25,13085.0,United Kingdom


In [25]:
# Review how many cancelled invoices remain in the dataset after the above modifications
len(df[df["Invoice"].str.contains("C")])

16494

In [26]:
# Check the cancelled invoices: 
# we will not remove them although their quantities are negative
# because they have been already been filtered through the `CIDxSC Summed Quantity` analysis
# where all negative summed quantities were already dropped
df[df["Invoice"].str.contains("C")].head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
178,C489449,22087,PAPER BUNTING WHITE LACE,-12,2009-12-01,2.95,16321.0,Australia
179,C489449,85206A,CREAM FELT EASTER EGG BASKET,-6,2009-12-01,1.65,16321.0,Australia
180,C489449,21895,POTTING SHED SOW 'N' GROW SET,-4,2009-12-01,4.25,16321.0,Australia
181,C489449,21896,POTTING SHED TWINE,-6,2009-12-01,2.1,16321.0,Australia
182,C489449,22083,PAPER CHAIN KIT RETRO SPOT,-12,2009-12-01,2.95,16321.0,Australia
