## Data Cleaning

In [77]:
# Import libraries
import numpy as np 
import pandas as pd 
import datetime as dt
import matplotlib.pyplot as plt 
from IPython.display import display
from collections import defaultdict

In [78]:
# Merge τα δύο spreadsheets
df1_raw = pd.read_excel("online_retail_II.xlsx", sheet_name = "Year 2009-2010")
df1_raw.reset_index()
df2_raw = pd.read_excel("online_retail_II.xlsx", sheet_name = "Year 2010-2011")
df_raw = pd.concat([df1_raw, df2_raw])
df_raw.shape

(1067371, 8)

In [79]:
# Remove 9 days overlap
df2_raw_start_date = df2_raw["InvoiceDate"].min()
df = pd.concat([df1_raw[df1_raw["InvoiceDate"] < df2_raw_start_date], df2_raw])
df.shape 

(1044848, 8)

In [80]:
# Column datatype conversions: Invoice to string 
df["Invoice"] = df["Invoice"].astype(str)
df.shape

(1044848, 8)

In [81]:
# Column datatype conversions: StockCode to string, uppercase letters, substract blank space from value names
df['StockCode'] = df['StockCode'].astype(str).str.upper().str.strip()
df.shape

(1044848, 8)

In [82]:
# Drop missing values in Customer ID (235 287)
df = df.dropna(subset = ["Customer ID"])
df.shape

(809561, 8)

In [83]:
#duplicated = df[df.duplicated(keep = 'first')]
#duplicated
#df = df.drop_duplicates()
#df.shape

In [84]:
len(df[df["Quantity"] < 0])

18446

---
---
---

In [85]:
#df_grp_cid_stc=df.groupby(['Customer ID', 'StockCode'], as_index=True, sort=False, group_keys=True)['Quantity'].sum()
#df_grp_cid_stc = pd.DataFrame(df_grp_cid_stc) # equivalent to the command below
# https://stackoverflow.com/questions/34113203/python-pandas-setting-groupby-group-labels-as-index-in-a-new-dataframe
#df_grp_cid_stc = pd.DataFrame(df_grp_cid_stc.values, index=df_grp_cid_stc.index.values, columns=['Quantity'])
#df_grp_cid_stc
#df_grp_cid_stc.head(30)

In [86]:
# Before
rows_before_negq_drop = df.shape[0]
df.shape

(809561, 8)

In [87]:
# Before
len(df[df['Quantity'] < 0])

18446

In [88]:
# https://stackoverflow.com/a/49216587
df['CIDxSC Summed Quantity'] = df.groupby(['Customer ID', 'StockCode'])['Quantity'].transform('sum')
df.shape

(809561, 9)

In [89]:
df.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country,CIDxSC Summed Quantity
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom,12
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom,28
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom,28
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085.0,United Kingdom,72
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom,36


In [90]:
""" gia tou logou to ali9es
a = df['StockCode']=='79323P'
b = df['Customer ID'] == 13085.0
c = a & b
df[c]
"""

" gia tou logou to ali9es\na = df['StockCode']=='79323P'\nb = df['Customer ID'] == 13085.0\nc = a & b\ndf[c]\n"

In [91]:
df_grp_cid_stc_negq = df[df["CIDxSC Summed Quantity"] < 0]
df_grp_cid_stc_negq_idx = df_grp_cid_stc_negq.index
df_grp_cid_stc_negq_idx

Int64Index([   221,    285,    318,    399,    400,    401,    576,    589,
               590,    591,
            ...
            531616, 531759, 531799, 531800, 531816, 532722, 533082, 534308,
            537595, 541541],
           dtype='int64', length=2209)

In [92]:
df.drop(df_grp_cid_stc_negq_idx, inplace=True)

In [93]:
# After
rows_after_negq_drop = df.shape[0]
df.shape

(805701, 9)

In [94]:
# This is the number of rows deleted: for this combination of customer id and stock code: 
# the summed quantity of those particular stock codes was found to be negative.
rows_before_negq_drop-rows_after_negq_drop

3860

In [95]:
# After
df.shape
len(df[df['CIDxSC Summed Quantity'] < 0])

0

In [96]:
# After. These don't matter. The ones that matter are those of column 'CIDxSC Summed Quantity', which have been dropped already
df.shape
len(df[df['Quantity'] < 0])

16494

In [103]:
# Now drop the 'CIDxSC Summed Quantity' column: its only utility was for dropping the negative summed quantities
df.drop(['CIDxSC Summed Quantity'], axis=1, inplace=True)
df.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085.0,United Kingdom
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom


In [97]:
#len(df[df["Invoice"].str.contains("C")])

In [98]:
#df_price_positive = df2[df2["Price"] >= 0.0]

In [99]:
#df_price_negative = df2[df2["Price"] < 0.0]

In [100]:
# We concatenate the two DataFrames.
#df_nnz_prices = pd.concat([df_price_positive, df_price_negative])

In [101]:
# Convert InvoiceDate column to datetime format
#df3["InvoiceDate"] = pd.to_datetime(df3["InvoiceDate"])

# Create Month column
#df3["Month"] = df3["InvoiceDate"].dt.strftime("%B")

# Create TotalAmount column
#df3["TotalAmount"] = df3["Price"] * df3["Quantity"] 