In [1]:
import pandas as pd

## Calculating CLV

CLV = ((Average sales x purchase frequency) / churn) x profit margin

average sales = totalSales / total no of orders

purchase frequency = total number of orders / total unique customers

retention rate = total no of orders greater than 1 / total unique customers

churn = 1 - retention rate

profit margin = based on business context

https://www.analyticsvidhya.com/blog/2020/10/a-definitive-guide-for-predicting-customer-lifetime-value-clv/

## Ways to Calculate CLV

historic approach: calculate CLVs by aggregate or by cohort

predictive approach: calculate CLVs by regression techniques

## DATA CLEANING

In [140]:
df = pd.read_excel('Data/uci_online_retail.xlsx')

In [141]:
df

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
...,...,...,...,...,...,...,...,...
541904,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,2011-12-09 12:50:00,0.85,12680.0,France
541905,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09 12:50:00,2.10,12680.0,France
541906,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09 12:50:00,4.15,12680.0,France
541907,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680.0,France


## ==== InvoiceNo ====

- InvoiceNo: Invoice number. Nominal, a 6-digit integral number uniquely assigned to each transaction. If this code starts with letter 'c', it indicates a cancellation.

In [142]:
colName = 'InvoiceNo'

# View the rows that are not valid digits.

temp = pd.DataFrame(df.loc[~df[colName].astype(str).str.isdigit(), colName].unique())
#1.take colName column
#2.change to string datatype
#3.use isdigit() function
#4.filter df on the inverse of this condition using loc
#5.get unique values using unique()
#5.convert this array data structure into DataFrame data structure

temp.columns = [colName]
#6.rename column

temp.sort_values(by=colName)
#7.sort values by ascending

Unnamed: 0,InvoiceNo
2388,A563185
2389,A563186
2390,A563187
0,C536379
1,C536383
...,...
3834,C581484
3835,C581490
3836,C581499
3837,C581568


In [143]:
temp = df[~df[colName].astype('str').str.contains('C') & ~df[colName].astype('str').str.isdigit()]
temp
#1.convert column to string
#2.search for strings that contain C
#3.filter dataframe on the inverse of this condition
#4.AND with the previous condition we wrote

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
299982,A563185,B,Adjust bad debt,1,2011-08-12 14:50:00,11062.06,,United Kingdom
299983,A563186,B,Adjust bad debt,1,2011-08-12 14:51:00,-11062.06,,United Kingdom
299984,A563187,B,Adjust bad debt,1,2011-08-12 14:52:00,-11062.06,,United Kingdom


In [144]:
# there's our culprit - the adjustment of bad debt. 
# Hypothesis: the corresponding bad debt source is the listed 6 digit invoice number less the A
# Let's test this out

temp2 = df.iloc[0:0].copy()
#1.create empty dataframe with same column names as original

for invoice in temp[colName].str[1:]:
    temp2.append(df[df[colName] == invoice])
    #append to this empty dataframe any rows that match the stripped invoice number
    
temp2

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country


In [212]:
# no match....
# since these are not customer transactions, nor are they tagged to any existing transaction,
# let's just remove them

if colName == 'InvoiceNo':
    df = df.drop(temp.index)
df

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
...,...,...,...,...,...,...,...,...
541904,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,2011-12-09 12:50:00,0.85,12680.0,France
541905,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09 12:50:00,2.10,12680.0,France
541906,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09 12:50:00,4.15,12680.0,France
541907,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680.0,France


In [191]:
# Hypothesis: The 7 digit StockCodes that are Cancelled have a a 6 digit equivalent,
# indicating the invoice the cancellation corresponds to

# Do a left join of the 7 digit-ers with the 6 digit-ers, 
# on a new column that extracts the last 6 digits of the 7 digit-ers

tempL = df[df[colName].astype('str').str.len() == 7]
#1.get 7 digit-ers
tempL[colName+'Trunc'] = tempL[colName].astype('str').str[1:].astype('int')
#2.extract last 6 digits, this will be the join column. 
#Important: must be same datatype as the other column we are joining on

tempR = df[df[colName].astype('str').str.len() == 6]
tempR[colName+'Trunc'] = tempR[colName].astype('int')
#3.create right table

pd.merge(tempL, tempR, left_on=(colName+'Trunc'), right_on=colName, how='left')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tempL[colName+'Trunc'] = tempL[colName].astype('str').str[1:].astype('int')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tempR[colName+'Trunc'] = tempR[colName].astype('int')


Unnamed: 0,InvoiceNo_x,StockCode_x,Description_x,Quantity_x,InvoiceDate_x,UnitPrice_x,CustomerID_x,Country_x,InvoiceNoTrunc_x,InvoiceNo_y,StockCode_y,Description_y,Quantity_y,InvoiceDate_y,UnitPrice_y,CustomerID_y,Country_y,InvoiceNoTrunc_y
0,C536379,D,Discount,-1,2010-12-01 09:41:00,27.50,14527.0,United Kingdom,536379,,,,,NaT,,,,
1,C536383,35004C,SET OF 3 COLOURED FLYING DUCKS,-1,2010-12-01 09:49:00,4.65,15311.0,United Kingdom,536383,,,,,NaT,,,,
2,C536391,22556,PLASTERS IN TIN CIRCUS PARADE,-12,2010-12-01 10:24:00,1.65,17548.0,United Kingdom,536391,,,,,NaT,,,,
3,C536391,21984,PACK OF 12 PINK PAISLEY TISSUES,-24,2010-12-01 10:24:00,0.29,17548.0,United Kingdom,536391,,,,,NaT,,,,
4,C536391,21983,PACK OF 12 BLUE PAISLEY TISSUES,-24,2010-12-01 10:24:00,0.29,17548.0,United Kingdom,536391,,,,,NaT,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9283,C581490,23144,ZINC T-LIGHT HOLDER STARS SMALL,-11,2011-12-09 09:57:00,0.83,14397.0,United Kingdom,581490,,,,,NaT,,,,
9284,C581499,M,Manual,-1,2011-12-09 10:28:00,224.69,15498.0,United Kingdom,581499,,,,,NaT,,,,
9285,C581568,21258,VICTORIAN SEWING BOX LARGE,-5,2011-12-09 11:57:00,10.95,15311.0,United Kingdom,581568,,,,,NaT,,,,
9286,C581569,84978,HANGING HEART JAR T-LIGHT HOLDER,-1,2011-12-09 11:58:00,1.25,17315.0,United Kingdom,581569,,,,,NaT,,,,


In [None]:
# our hypothesis is false. we will need to group by customer later to try to match these cancellations

## // InvoiceNo is done. //

## ==== Quantity ==== 

- Quantity: The quantities of each product (item) per transaction. Numeric.


In [194]:
colName = 'Quantity'

df[[colName]].describe()

Unnamed: 0,Quantity
count,541906.0
mean,9.552297
std,218.081761
min,-80995.0
25%,1.0
50%,3.0
75%,10.0
max,80995.0


In [195]:
# min should not be negative; in fact all quantities should be positive in theory
df[df[colName]<=0]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
141,C536379,D,Discount,-1,2010-12-01 09:41:00,27.50,14527.0,United Kingdom
154,C536383,35004C,SET OF 3 COLOURED FLYING DUCKS,-1,2010-12-01 09:49:00,4.65,15311.0,United Kingdom
235,C536391,22556,PLASTERS IN TIN CIRCUS PARADE,-12,2010-12-01 10:24:00,1.65,17548.0,United Kingdom
236,C536391,21984,PACK OF 12 PINK PAISLEY TISSUES,-24,2010-12-01 10:24:00,0.29,17548.0,United Kingdom
237,C536391,21983,PACK OF 12 BLUE PAISLEY TISSUES,-24,2010-12-01 10:24:00,0.29,17548.0,United Kingdom
...,...,...,...,...,...,...,...,...
540449,C581490,23144,ZINC T-LIGHT HOLDER STARS SMALL,-11,2011-12-09 09:57:00,0.83,14397.0,United Kingdom
541541,C581499,M,Manual,-1,2011-12-09 10:28:00,224.69,15498.0,United Kingdom
541715,C581568,21258,VICTORIAN SEWING BOX LARGE,-5,2011-12-09 11:57:00,10.95,15311.0,United Kingdom
541716,C581569,84978,HANGING HEART JAR T-LIGHT HOLDER,-1,2011-12-09 11:58:00,1.25,17315.0,United Kingdom


In [213]:
# some of these are the cancellations. we ignore them
temp = df[(~df['InvoiceNo'].astype('str').str.contains('C')) & (df[colName]<=0)]
temp

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
2406,536589,21777,,-10,2010-12-01 16:50:00,0.0,,United Kingdom
4347,536764,84952C,,-38,2010-12-02 14:42:00,0.0,,United Kingdom
7188,536996,22712,,-20,2010-12-03 15:30:00,0.0,,United Kingdom
7189,536997,22028,,-20,2010-12-03 15:30:00,0.0,,United Kingdom
7190,536998,85067,,-6,2010-12-03 15:30:00,0.0,,United Kingdom
...,...,...,...,...,...,...,...,...
535333,581210,23395,check,-26,2011-12-07 18:36:00,0.0,,United Kingdom
535335,581212,22578,lost,-1050,2011-12-07 18:38:00,0.0,,United Kingdom
535336,581213,22576,check,-30,2011-12-07 18:38:00,0.0,,United Kingdom
536908,581226,23090,missing,-338,2011-12-08 09:56:00,0.0,,United Kingdom


In [217]:
temp['CustomerID'].describe()

count    0.0
mean     NaN
std      NaN
min      NaN
25%      NaN
50%      NaN
75%      NaN
max      NaN
Name: CustomerID, dtype: float64

In [218]:
temp['Description'].unique()

array([nan, '?', 'check', 'damages', 'faulty', 'Dotcom sales',
       'reverse 21/5/10 adjustment', 'mouldy, thrown away.', 'counted',
       'Given away', 'Dotcom', 'label mix up', 'samples/damages',
       'thrown away', 'incorrectly made-thrown away.', 'showroom', 'MIA',
       'Dotcom set', 'wrongly sold as sets', 'Amazon sold sets',
       'dotcom sold sets', 'wrongly sold sets', '? sold as sets?',
       '?sold as sets?', 'Thrown away.', 'damages/display',
       'damaged stock', 'broken', 'throw away', 'wrong barcode (22467)',
       'wrong barcode', 'barcode problem', '?lost',
       "thrown away-can't sell.", "thrown away-can't sell", 'damages?',
       're dotcom quick fix.', "Dotcom sold in 6's", 'sold in set?',
       'cracked', 'sold as 22467', 'Damaged',
       'mystery! Only ever imported 1800',
       'MERCHANT CHANDLER CREDIT ERROR, STO', 'POSSIBLE DAMAGES OR LOST?',
       'damaged', 'DAMAGED', 'Display', 'Missing', 'wrong code?',
       'wrong code', 'adjust', 'crush

In [219]:
# these rows mostly consist of inventory upkeep. 
# a rare few are related to sales, but for simplicity we shall just remove them

if colName == 'Quantity':
    df = df.drop(temp.index)
df

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
...,...,...,...,...,...,...,...,...
541904,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,2011-12-09 12:50:00,0.85,12680.0,France
541905,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09 12:50:00,2.10,12680.0,France
541906,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09 12:50:00,4.15,12680.0,France
541907,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680.0,France


In [221]:
df[[colName]].describe()

Unnamed: 0,Quantity
count,540570.0
mean,9.958755
std,216.230153
min,-80995.0
25%,1.0
50%,3.0
75%,10.0
max,80995.0


In [None]:
df['']

## ==== StockCode ==== 

- StockCode: Product (item) code. Nominal, a 5-digit integral number uniquely assigned to each distinct product.


In [7]:
colName = 'StockCode'

# Let's first view StockCodes that don't have 5 digits
df[df[colName].astype('str').str.len() != 5].sort_values(by='StockCode', ascending=True)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
109611,545655,10123C,HEARTS WRAPPING TAPE,3,2011-03-04 13:58:00,0.65,14670.0,United Kingdom
265293,560213,10123C,,-18,2011-07-15 15:05:00,0.00,,United Kingdom
5451,536863,10123C,HEARTS WRAPPING TAPE,1,2010-12-03 11:19:00,0.65,17967.0,United Kingdom
140929,548491,10123C,HEARTS WRAPPING TAPE,1,2011-03-31 13:14:00,0.65,14064.0,United Kingdom
150159,549349,10123G,,-38,2011-04-08 11:13:00,0.00,,United Kingdom
...,...,...,...,...,...,...,...,...
42057,539958,gift_0001_50,Dotcomgiftshop Gift Voucher £50.00,1,2010-12-23 13:26:00,42.55,,United Kingdom
178556,552232,gift_0001_50,Dotcomgiftshop Gift Voucher £50.00,1,2011-05-06 15:54:00,41.67,,United Kingdom
239744,558066,gift_0001_50,Dotcomgiftshop Gift Voucher £50.00,1,2011-06-24 15:45:00,41.67,,United Kingdom
245516,558614,gift_0001_50,Dotcomgiftshop Gift Voucher £50.00,1,2011-06-30 15:56:00,41.67,,United Kingdom


In [97]:
# Hypothesis: Those 6 digit StockCodes are to indicate variants of a base product. 
# Let's see if these 6 digit-ers have a 5 digit equivalent

# Do a left join of the 6 digit-ers with the 5 digit-ers, 
# on a new column that extracts the first 5 digits of the 6 digit-ers

tempL = df[df[colName].astype('str').str.len() == 6][['StockCode', 'Description', 'Quantity']]
#1.get 6 digit-ers
#2.get the 3 relevant columns
tempL['StockCodeTrunc'] = tempL['StockCode'].str[:5].astype('int')
#3.extract first 5 digits, this will be the join column. 
#Important: must be same datatype as the other column we are joining on

tempR = df[df[colName].astype('str').str.len() == 5][['StockCode', 'Description', 'Quantity']]
tempR['StockCode'] = tempR['StockCode'].astype('int')
#4.create right table

pd.merge(tempL, tempR, left_on='StockCodeTrunc', right_on='StockCode', how='left')

Unnamed: 0,StockCode_x,Description_x,Quantity_x,StockCodeTrunc,StockCode_y,Description_y,Quantity_y
0,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,85123,,,
1,84406B,CREAM CUPID HEARTS COAT HANGER,8,84406,,,
2,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,84029,,,
3,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,84029,,,
4,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,85123,,,
...,...,...,...,...,...,...,...
881690,84997C,CHILDRENS CUTLERY POLKADOT BLUE,8,84997,,,
881691,85099C,JUMBO BAG BAROQUE BLACK WHITE,10,85099,,,
881692,84993A,75 GREEN PETIT FOUR CASES,2,84993,,,
881693,85049A,TRADITIONAL CHRISTMAS RIBBONS,1,85049,,,


In [115]:
# No match...hence, we cannot merge these products directly.
# Let's observe what different products belong to each base StockCode

from collections import defaultdict
d1 = defaultdict(set)
#1.dictionary of sets

for i in range(len(tempL)):
    d1[tempL['StockCodeTrunc'].iloc[i]].add(tempL['Description'].iloc[i])
    #for every row in dataframe, add the item description to the set (value).
    #the corresponding key is the base StockCode

d1

defaultdict(set,
            {10123: {'HEARTS WRAPPING TAPE ', nan},
             10124: {'ARMY CAMO BOOKCOVER TAPE',
              'SPOTS ON RED BOOKCOVER TAPE'},
             15044: {'BLUE PAPER PARASOL ',
              'PINK PAPER PARASOL ',
              'PURPLE PAPER PARASOL',
              'RED PAPER PARASOL',
              nan},
             15056: {'EDWARDIAN PARASOL NATURAL', 'EDWARDIAN PARASOL PINK'},
             15058: {'BLUE POLKADOT GARDEN PARASOL',
              'ICE CREAM DESIGN GARDEN PARASOL',
              'PINK POLKADOT GARDEN PARASOL',
              nan,
              'wet/rusty'},
             15060: {'FAIRY CAKE DESIGN UMBRELLA', nan},
             16020: {'CLEAR STATIONERY BOX SET ', nan},
             16151: {'FLOWERS HANDBAG blue and orange', nan},
             16156: {'WRAP CAROUSEL',
              'WRAP PINK FAIRY CAKES ',
              'WRAP, CAROUSEL',
              nan},
             16161: {'WRAP  PINK FLOCK',
              'WRAP BAD HAIR DAY',
         

## ==== Description ==== 

- Description: Product (item) name. Nominal.


## ==== InvoiceDate ==== 

- InvoiceDate: Invice Date and time. Numeric, the day and time when each transaction was generated.


## ==== UnitPrice ====

- UnitPrice: Unit price. Numeric, Product price per unit in sterling.


## ==== CustomerID ====

- CustomerID: Customer number. Nominal, a 5-digit integral number uniquely assigned to each customer.


## ==== Country ====

- Country: Country name. Nominal, the name of the country where each customer resides.

