In [1]:
import math
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import statistics as stat

In [2]:
# Load the starting dataset: a tab-separated file whose first column is the row index
df = pd.read_csv(
    'cleaned_dataframe.csv',
    sep='\t',
    index_col=0,
)

In [3]:
# Preview the first rows to check that the file was parsed into the expected columns.
df.head()

Unnamed: 0,BasketID,BasketDate,Sale,CustomerID,CustomerCountry,ProdID,ProdDescr,Qta
0,536365,01/12/10 08:26,2.55,17850,United Kingdom,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6
1,536365,01/12/10 08:26,3.39,17850,United Kingdom,71053,WHITE METAL LANTERN,6
2,536365,01/12/10 08:26,2.75,17850,United Kingdom,84406B,CREAM CUPID HEARTS COAT HANGER,8
3,536365,01/12/10 08:26,3.39,17850,United Kingdom,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6
4,536365,01/12/10 08:26,3.39,17850,United Kingdom,84029E,RED WOOLLY HOTTIE WHITE HEART.,6


In [4]:
# Show column dtypes and non-null counts of the loaded dataset.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 400220 entries, 0 to 541909
Data columns (total 8 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   BasketID         400220 non-null  object 
 1   BasketDate       400220 non-null  object 
 2   Sale             400220 non-null  float64
 3   CustomerID       400220 non-null  int64  
 4   CustomerCountry  400220 non-null  object 
 5   ProdID           400220 non-null  object 
 6   ProdDescr        400220 non-null  object 
 7   Qta              400220 non-null  int64  
dtypes: float64(1), int64(2), object(5)
memory usage: 19.8+ MB


In [5]:
# Create the dataframe for customers: one row per distinct CustomerID, in order of
# first appearance in `df`. Uses the public `pd.DataFrame` constructor rather than
# the internal `pd.core.frame` module path.
data = pd.DataFrame({'CustomerID': df['CustomerID'].unique()})
print("Number of unique customers: ", len(data))

Number of unique customers:  4339


In [6]:
# TProd - handle negative Qta
# TSale - handle negative Qta

# optimize code: it iterates over the same object multiple times

In [7]:
# Quick check of statistics.fmean: mean of the integers 1..100
stat.fmean(range(1, 101))

50.5

In [8]:
# Per-customer aggregates, computed with grouped (vectorised) operations instead of
# re-filtering `df` and walking it with `iterrows()` several times for every customer.
#
# Every result is a plain list ordered like `df['CustomerID'].unique()`, i.e. in order
# of first appearance, so it can be assigned positionally to a frame built from that
# same array.
#
# Conventions:
#   * a "purchase" row has Qta > 0, a "return" row has Qta < 0 (Qta == 0 is neither);
#   * "amount" is the per-row product Qta * Sale;
#   * count/sum features default to 0 for customers with no matching rows;
#   * min/max/mean over purchases are NaN for customers with no purchase rows
#     (the per-customer loop used to raise on such customers).
# Floating-point sums may differ from a naive sequential sum in the last digits.
customer_ids = df['CustomerID'].unique()

# Narrow working frame with the per-row amount precomputed once.
lines = df[['CustomerID', 'BasketID', 'ProdID', 'Qta', 'Sale']].assign(
    Amount=df['Qta'] * df['Sale']
)
all_rows = lines.groupby('CustomerID', sort=False)
purchases = lines[lines['Qta'] > 0].groupby('CustomerID', sort=False)
returns = lines[lines['Qta'] < 0].groupby('CustomerID', sort=False)


def _in_customer_order(per_customer, fill_value=np.nan):
    """Return a CustomerID-indexed Series as a list aligned with `customer_ids`.

    Customers that are absent from `per_customer` receive `fill_value`.
    """
    return per_customer.reindex(customer_ids, fill_value=fill_value).tolist()


# Distinct products (all rows / return rows only) and distinct baskets with purchases.
# dropna=False mirrors drop_duplicates(), which treats a missing value as a value.
unique_products = _in_customer_order(all_rows['ProdID'].nunique(dropna=False))
unique_products_ret = _in_customer_order(returns['ProdID'].nunique(dropna=False), fill_value=0)
total_orders = _in_customer_order(purchases['BasketID'].nunique(dropna=False), fill_value=0)

# Net quantity over all rows, and the number of return rows (a row count, not a quantity).
total_products = _in_customer_order(all_rows['Qta'].sum())
total_returned = _in_customer_order(returns.size(), fill_value=0)

# Monetary features: net total, purchases-only total, and per-row extremes of purchases.
total_sale = _in_customer_order(all_rows['Amount'].sum())
total_sale_wret = _in_customer_order(purchases['Amount'].sum(), fill_value=0.0)
min_spent = _in_customer_order(purchases['Amount'].min())
max_spent = _in_customer_order(purchases['Amount'].max())

# Unweighted mean of the unit price over purchase rows.
mean_prod_sale = _in_customer_order(purchases['Sale'].mean())

HBox(children=(HTML(value='Iterating over customers'), FloatProgress(value=0.0, max=4339.0), HTML(value='')))




In [9]:
# I: the total number of items purchased by a customer during the period of observation.
# This is the sum of `Qta` over all of the customer's rows; rows with negative `Qta`
# are included, so they reduce the total.
data['TProd'] = total_products

In [10]:
# Iu: the number of distinct items bought by a customer in the period of observation.
# Counts distinct `ProdID` values over all of the customer's rows, including rows
# with negative `Qta`.
data['DProd'] = unique_products

In [11]:
# Imax: the maximum number of items purchased by a customer during a shopping session

In [12]:
# E: the Shannon entropy on the purchasing behaviour of the customer

In [13]:
# ProdPerOrderMean: mean number of items purchased by a customer during a shopping session

In [14]:
# SaleTot: total amount spent during the period of observation.
# Sum of `Qta * Sale` over all of the customer's rows; rows with negative `Qta`
# subtract from the total.
data['TSale'] = total_sale

In [15]:
# SaleTotWithoutReturn: total amount spent during the period of observation,
# summing `Qta * Sale` only over rows with `Qta > 0` (negative-quantity rows are ignored).
data['TSaleWRet'] = total_sale_wret

In [16]:
# SaleMin: smallest `Qta * Sale` of a single row with `Qta > 0`.
# NOTE: this is a per-row (product line) minimum, not a per-basket / per-session total.
data['MinSale'] = min_spent

In [17]:
# SaleMax: largest `Qta * Sale` of a single row with `Qta > 0`.
# NOTE: this is a per-row (product line) maximum, not a per-basket / per-session total.
data['MaxSale'] = max_spent

In [18]:
# SaleMeanPerOrder: mean amount spent for each order during the period of observation

In [19]:
# NumRetProd: number of returned products in the period of observation.
# NOTE: this counts the customer's rows with `Qta < 0`; it is not the sum of the
# returned quantities.
data['TRProd'] = total_returned

In [20]:
# NumDistRetProd: number of distinct returned products in the period of observation,
# i.e. distinct `ProdID` values among the customer's rows with `Qta < 0`.
data['TRDProd'] = unique_products_ret

In [21]:
# MeanProdCostInOrder: mean cost of the products in the orders.
# Plain (not quantity-weighted) mean of `Sale` over the customer's rows with `Qta > 0`.
data['MeanProdSale'] = mean_prod_sale

In [22]:
# NumberOfOrders: total number of orders made by the customer,
# counted as distinct `BasketID` values among rows with `Qta > 0`.
data['TOrder'] = total_orders

In [23]:
# OrderPerMonth

In [24]:
# ProductPerMonth

In [25]:
# PrizePerMonth

In [26]:
# OrderPerTrimester

In [27]:
# ProductPerTrimester

In [28]:
# PrizePerTrimester

In [29]:
# OrderPerTrimester 2-3

In [30]:
# ProductPerTrimester 2-3

In [31]:
# PrizePerTrimester 2-3

In [32]:
# CustomerCountry: value from original dataframe
#data['CustomerCountry'] = ''
#data = data.join(pd.core.frame.DataFrame({'CustomerID':df['CustomerID'], 'CustomerCountry':df['CustomerCountry']}).drop_duplicates().set_index('CustomerID'), on='CustomerID')

#tmp = pd.core.frame.DataFrame({'CustomerID':df['CustomerID'], 'CustomerCountry':df['CustomerCountry']}).drop_duplicates()
#tmp['CustomerID'].value_counts()[0:8]
# 4372 distinct customers - 4380 elements in tmp
# 8 customers have two different nationalities

# Find entries with multiple nationality
#counts = tmp['CustomerID'].value_counts().items()
#count_dict = {x[0]:x[1] for x in counts}
#for _, row in tmp.sort_values(by=['CustomerID']).iterrows():
#    if count_dict[row['CustomerID']] > 1:
#        print(f"{row['CustomerID']}-{row['CustomerCountry']}")

In [33]:
# Display the per-customer feature table assembled in the cells above.
data

Unnamed: 0,CustomerID,TProd,DProd,TSale,TSaleWRet,MinSale,MaxSale,TRProd,TRDProd,MeanProdSale,TOrder
0,17850,1702,21,5317.89,5391.21,6.36,107.25,8,6,3.960370,34
1,13047,1356,106,3094.05,3237.54,4.95,68.00,23,20,3.932035,10
2,12583,5009,115,7187.34,7281.38,6.80,132.80,4,4,3.103603,15
3,13748,439,24,948.25,948.25,9.36,204.00,0,0,3.996429,5
4,15100,58,1,635.10,876.00,175.20,350.40,3,1,10.950000,3
...,...,...,...,...,...,...,...,...,...,...,...
4334,13436,76,12,196.89,196.89,10.20,25.50,0,0,5.830000,1
4335,15520,314,18,343.50,343.50,9.48,34.68,0,0,1.724444,1
4336,13298,96,2,360.00,360.00,90.00,270.00,0,0,3.750000,1
4337,14569,79,10,227.39,227.39,1.65,122.40,0,0,3.920000,1
