## Data Cleaning

In [28]:
# Import libraries
import numpy as np 
import pandas as pd 
import datetime as dt
import matplotlib.pyplot as plt 
from IPython.display import display
from collections import defaultdict

In [29]:
# Merge the two spreadsheets
df1_raw = pd.read_excel("online_retail_II.xlsx", sheet_name = "Year 2009-2010")
df1_raw.reset_index()
df2_raw = pd.read_excel("online_retail_II.xlsx", sheet_name = "Year 2010-2011")
df_raw = pd.concat([df1_raw, df2_raw])
df_raw.shape

(1067371, 8)

In [30]:
# Remove 9 days overlap
df2_raw_start_date = df2_raw["InvoiceDate"].min()
df = pd.concat([df1_raw[df1_raw["InvoiceDate"] < df2_raw_start_date], df2_raw])
df.shape 

(1044848, 8)

In [31]:
# Column datatype conversions: Invoice to string 
df["Invoice"] = df["Invoice"].astype(str)
df.shape

(1044848, 8)

In [32]:
# Column datatype conversions: StockCode to string, uppercase letters, substract blank space from value names
df["StockCode"] = df["StockCode"].astype(str).str.upper().str.strip()
df.shape

(1044848, 8)

In [33]:
# Drop missing values in Customer ID (235287)
df = df.dropna(subset = ["Customer ID"])
df.shape

(809561, 8)

In [34]:
# Shape of df before dropping transactions with negative quantities
rows_before_negq_drop = df.shape[0]
df.shape

(809561, 8)

In [35]:
# Number of transactions with negative quantities
len(df[df["Quantity"] < 0])

18446

In [36]:
# Create new column 'CIDxSC Summed Quantity' in which for each combination of Customer ID and StockCode
# we sum the Quantity in order to reach the quantities that were eventually bought from customers
df["CIDxSC Summed Quantity"] = df.groupby(["Customer ID", "StockCode"])["Quantity"].transform("sum")
df.shape

(809561, 9)

In [37]:
df.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country,CIDxSC Summed Quantity
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom,12
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom,28
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom,28
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085.0,United Kingdom,72
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom,36


In [38]:
# Keep only summed quantities with positive value
df = df[df["CIDxSC Summed Quantity"] > 0]

In [39]:
# Shape of df after removal of negative values
rows_after_negq_drop = df.shape[0]
df.shape

(800909, 9)

In [40]:
# Confirm that all negative summed quantities are removed
len(df[df["CIDxSC Summed Quantity"] <= 0])

0

In [41]:
# A number of rows with negative quantities remain after the removal of negative summed quantities. 
# These don't matter because they were already taken into account during the summation. 
# The ones that matter are those of column 'CIDxSC Summed Quantity', which have been dropped already.
df.shape
len(df[df["Quantity"] <= 0])

13291

In [42]:
# Now drop the 'CIDxSC Summed Quantity' column: its only utility was for dropping the negative summed quantities
df.drop(["CIDxSC Summed Quantity"], axis=1, inplace=True)
df.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085.0,United Kingdom
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom


In [43]:
# Assert quantity is not null per customer for all customers
df["Quantity"].isnull().any()

False

In [44]:
# Assert price > 0 for all prices and stockcodes
len(df[df["Price"] < 0])

0

In [45]:
# Remove time from InvoiceDate column
df["InvoiceDate"] = pd.to_datetime(df["InvoiceDate"]).dt.date
df.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01,6.95,13085.0,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01,6.75,13085.0,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01,6.75,13085.0,United Kingdom
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01,2.1,13085.0,United Kingdom
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01,1.25,13085.0,United Kingdom


In [46]:
# Review how many cancelled invoices remain in the dataset after the above modifications
len(df[df["Invoice"].str.contains("C")])

13291

In [47]:
# Check the cancelled invoices: 
# we will not remove them although their quantities are negative
# because they have been already been filtered through the `CIDxSC Summed Quantity` analysis
# where all negative summed quantities were already dropped
df[df["Invoice"].str.contains("C")].head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
178,C489449,22087,PAPER BUNTING WHITE LACE,-12,2009-12-01,2.95,16321.0,Australia
179,C489449,85206A,CREAM FELT EASTER EGG BASKET,-6,2009-12-01,1.65,16321.0,Australia
186,C489449,22090,PAPER BUNTING RETRO SPOTS,-12,2009-12-01,2.95,16321.0,Australia
317,C489503,21533,RETRO SPOT LARGE MILK JUG,-1,2009-12-01,4.95,16011.0,United Kingdom
1162,C489554,85042,ANTIQUE LILY FAIRY LIGHTS,-3,2009-12-01,4.25,13767.0,United Kingdom


In [48]:
# Export df to excel: this is the final form of the dataset that will be used for RFM analysis
df_clean = df.to_excel(r'C:\Users\argyr\OneDrive\Υπολογιστής\Diplvm\Giftware dataset\clean_data.xlsx', index=False)