# Imports

In [54]:
import pandas as pd
import numpy as np

# EDA / Data Cleaning / Transformations


In [55]:
df = pd.read_excel('online_retail_data.xlsx')
print(df.shape)
df.head()

(541909, 8)


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [56]:
df.Country.unique()

array(['United Kingdom', 'France', 'Australia', 'Netherlands', 'Germany',
       'Norway', 'EIRE', 'Switzerland', 'Spain', 'Poland', 'Portugal',
       'Italy', 'Belgium', 'Lithuania', 'Japan', 'Iceland',
       'Channel Islands', 'Denmark', 'Cyprus', 'Sweden', 'Austria',
       'Israel', 'Finland', 'Bahrain', 'Greece', 'Hong Kong', 'Singapore',
       'Lebanon', 'United Arab Emirates', 'Saudi Arabia',
       'Czech Republic', 'Canada', 'Unspecified', 'Brazil', 'USA',
       'European Community', 'Malta', 'RSA'], dtype=object)

> Add total revenue per Unit for future computations

In [57]:
df['TotalRevenue'] = df['Quantity'] * df['UnitPrice']
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,TotalRevenue
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom,15.3
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom,22.0
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34


> Let's view the data from a Customer perspective

In [58]:
customer_data = df.set_index(['CustomerID', 'InvoiceNo'])
print(customer_data.shape)
customer_data.head(30)

(541909, 7)


Unnamed: 0_level_0,Unnamed: 1_level_0,StockCode,Description,Quantity,InvoiceDate,UnitPrice,Country,TotalRevenue
CustomerID,InvoiceNo,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
17850.0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,United Kingdom,15.3
17850.0,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,United Kingdom,20.34
17850.0,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,United Kingdom,22.0
17850.0,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,United Kingdom,20.34
17850.0,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,United Kingdom,20.34
17850.0,536365,22752,SET 7 BABUSHKA NESTING BOXES,2,2010-12-01 08:26:00,7.65,United Kingdom,15.3
17850.0,536365,21730,GLASS STAR FROSTED T-LIGHT HOLDER,6,2010-12-01 08:26:00,4.25,United Kingdom,25.5
17850.0,536366,22633,HAND WARMER UNION JACK,6,2010-12-01 08:28:00,1.85,United Kingdom,11.1
17850.0,536366,22632,HAND WARMER RED POLKA DOT,6,2010-12-01 08:28:00,1.85,United Kingdom,11.1
13047.0,536367,84879,ASSORTED COLOUR BIRD ORNAMENT,32,2010-12-01 08:34:00,1.69,United Kingdom,54.08


In [59]:
len(df.CustomerID.unique())

4373

## RFM Variables Feature Engineering (Recency, Frequency, Monetary_Value)

The original dataset was organized long, with invoices nested within customer. Below, I create a customer-level dataset and add recency, frequency, and monetary value data to it. The recency variable refers to the number of days that have elapsed since the customer last purchased something (so, smaller numbers indicate more recent activity on the customer’s account). Frequency refers to the number of invoices with purchases during the year. Monetary value is the amount that the customer spent during the year. Some customers have negative monetary values. These customers probably returned something during the year that they had purchased before the year started, so I reset their monetary value to zero.

> Let's extract the data we need on a customer level

In [60]:
# total quantity of items each customer ordered (customer ID is ordered by desc)
quantities = df.groupby('CustomerID', as_index=False).Quantity.sum()
quantities.head()

Unnamed: 0,CustomerID,Quantity
0,12346.0,0
1,12347.0,2458
2,12348.0,2341
3,12349.0,631
4,12350.0,197


In [119]:
# frequency of purchases (number of invoices in the dataset, meaning over the year)
frequency = [len(df.loc[df['CustomerID'] == c_id].InvoiceNo.unique()) for c_id in sorted(df.CustomerID.unique())]
frequency[:10]

[2, 7, 4, 1, 1, 11, 3, 6, 1, 13]

In [123]:
print(len(sorted(df['CustomerID'].unique())))
len(df.groupby('CustomerID', as_index=False))

4373


4372

In [117]:
# total spend of each customer
total_spend = df.groupby('CustomerID', as_index=False).TotalRevenue.sum()
print(len(total_spend))
total_spend.head()

4372


Unnamed: 0,CustomerID,TotalRevenue
0,12346.0,0.0
1,12347.0,4310.0
2,12348.0,1797.24
3,12349.0,1757.55
4,12350.0,334.4


In [93]:
# merging of quantity and total spend by customer
quantities_revenues_by_cust = quantities
quantities_revenues_by_cust['MonetaryValue'] = total_spend['TotalRevenue']
quantities_revenues_by_cust['Frequency'] = frequency
quantities_revenues_by_cust.head()

ValueError: Length of values does not match length of index