# Imports

In [135]:
import pandas as pd
import numpy as np

# EDA


In [136]:
df = pd.read_excel('online_retail_data.xlsx')
print(df.shape)
df.head()

(541909, 8)


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [137]:
# save a backup copy of our dataframe
df_copy = df.copy()

> let's see the geographical regions that we're dealing with

In [138]:
df.Country.unique()

array(['United Kingdom', 'France', 'Australia', 'Netherlands', 'Germany',
       'Norway', 'EIRE', 'Switzerland', 'Spain', 'Poland', 'Portugal',
       'Italy', 'Belgium', 'Lithuania', 'Japan', 'Iceland',
       'Channel Islands', 'Denmark', 'Cyprus', 'Sweden', 'Austria',
       'Israel', 'Finland', 'Bahrain', 'Greece', 'Hong Kong', 'Singapore',
       'Lebanon', 'United Arab Emirates', 'Saudi Arabia',
       'Czech Republic', 'Canada', 'Unspecified', 'Brazil', 'USA',
       'European Community', 'Malta', 'RSA'], dtype=object)

> Let's view the data from a Customer perspective

In [139]:
customer_data = df.set_index(['CustomerID', 'InvoiceNo'])
print(customer_data.shape)
customer_data.head(30)

(541909, 6)


Unnamed: 0_level_0,Unnamed: 1_level_0,StockCode,Description,Quantity,InvoiceDate,UnitPrice,Country
CustomerID,InvoiceNo,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
17850.0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,United Kingdom
17850.0,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,United Kingdom
17850.0,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,United Kingdom
17850.0,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,United Kingdom
17850.0,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,United Kingdom
17850.0,536365,22752,SET 7 BABUSHKA NESTING BOXES,2,2010-12-01 08:26:00,7.65,United Kingdom
17850.0,536365,21730,GLASS STAR FROSTED T-LIGHT HOLDER,6,2010-12-01 08:26:00,4.25,United Kingdom
17850.0,536366,22633,HAND WARMER UNION JACK,6,2010-12-01 08:28:00,1.85,United Kingdom
17850.0,536366,22632,HAND WARMER RED POLKA DOT,6,2010-12-01 08:28:00,1.85,United Kingdom
13047.0,536367,84879,ASSORTED COLOUR BIRD ORNAMENT,32,2010-12-01 08:34:00,1.69,United Kingdom


In [140]:
len(df.CustomerID.unique())

4373

# Data Discovery

- 4372 Unique CustomerID values
- 25900 Unique InvoiceNo values
- 4070 Unique StockCode values
- 38970 max unit price
- 0.03 min unit price
- 9,287 returns
- date range is (2010-12-01 to 2011-12-09)

 # Data Cleaning / Wrangling / Transformations

> Let's first clean up the dataset by removing uncessary observations, observations without a customer ID (our clustering target) etc...

In [141]:
df[df.StockCode == 'B']

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
299982,A563185,B,Adjust bad debt,1,2011-08-12 14:50:00,11062.06,,United Kingdom
299983,A563186,B,Adjust bad debt,1,2011-08-12 14:51:00,-11062.06,,United Kingdom
299984,A563187,B,Adjust bad debt,1,2011-08-12 14:52:00,-11062.06,,United Kingdom


In [142]:
df = df[df.StockCode != 'B']

> A few items that are priced less than 0.01 and the quantity doesn't bring them to 0.01 will also be dropped

In [143]:
df = df[df.UnitPrice > .01]

> Restrict the dataset to one full year of transaction data

In [144]:
df.InvoiceDate.min()

Timestamp('2010-12-01 08:26:00')

In [145]:
df.InvoiceDate.max()

Timestamp('2011-12-09 12:50:00')

In [146]:
df = df[df['InvoiceDate'] <= '2011-12-01 23:59:59']

In [147]:
df.InvoiceDate.max()

Timestamp('2011-12-01 19:54:00')

> Add total revenue per Unit for future computations

In [148]:
df['TotalRevenue'] = df['Quantity'] * df['UnitPrice']
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,TotalRevenue
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom,15.3
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom,22.0
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34


## RFM Variables Feature Engineering (Recency, Frequency, Monetary_Value)

The original dataset was organized long, with invoices nested within customer. Below, I create a customer-level dataset and add recency, frequency, and monetary value data to it. The recency variable refers to the number of days that have elapsed since the customer last purchased something (so, smaller numbers indicate more recent activity on the customer’s account). Frequency refers to the number of invoices with purchases during the year. Monetary value is the amount that the customer spent during the year. Some customers have negative monetary values. These customers probably returned something during the year that they had purchased before the year started, so I reset their monetary value to zero.

> Let's extract the data we need on a customer level

In [149]:
# total quantity of items each customer ordered (customer ID is ordered by desc)
quantities = df.groupby('CustomerID', as_index=False).Quantity.sum()
quantities.head()

Unnamed: 0,CustomerID,Quantity
0,12346.0,0
1,12347.0,2266
2,12348.0,2341
3,12349.0,631
4,12350.0,197


In [150]:
# frequency of purchases (number of invoices in the dataset, meaning over the year)
frequency = [len(df.loc[df['CustomerID'] == c_id].InvoiceNo.unique()) for c_id in sorted(df.CustomerID.unique())]
frequency[:10]

[2, 6, 4, 1, 1, 11, 3, 5, 1, 12]

In [151]:
print(len(sorted(df['CustomerID'].unique())))
len(df.groupby('CustomerID', as_index=False))

4334


4333

In [152]:
# total spend of each customer
total_spend = df.groupby('CustomerID', as_index=False).TotalRevenue.sum()
print(len(total_spend))
total_spend.head()

4333


Unnamed: 0,CustomerID,TotalRevenue
0,12346.0,0.0
1,12347.0,4085.18
2,12348.0,1797.24
3,12349.0,1757.55
4,12350.0,334.4


In [153]:
# merging of quantity and total spend by customer
quantities_revenues_by_cust = quantities
quantities_revenues_by_cust['MonetaryValue'] = total_spend['TotalRevenue']
quantities_revenues_by_cust['Frequency'] = frequency
quantities_revenues_by_cust.head()

ValueError: Length of values does not match length of index