In [1]:
import math
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import statistics as stat

In [2]:
# Starting dataset
df = pd.read_csv('cleaned_dataframe.csv', sep='\t', index_col=0)

In [3]:
df.head()

Unnamed: 0,BasketID,BasketDate,Sale,CustomerID,CustomerCountry,ProdID,ProdDescr,Qta
0,536365,01/12/10 08:26,2.55,17850,United Kingdom,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6
1,536365,01/12/10 08:26,3.39,17850,United Kingdom,71053,WHITE METAL LANTERN,6
2,536365,01/12/10 08:26,2.75,17850,United Kingdom,84406B,CREAM CUPID HEARTS COAT HANGER,8
3,536365,01/12/10 08:26,3.39,17850,United Kingdom,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6
4,536365,01/12/10 08:26,3.39,17850,United Kingdom,84029E,RED WOOLLY HOTTIE WHITE HEART.,6


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 400220 entries, 0 to 541909
Data columns (total 8 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   BasketID         400220 non-null  object 
 1   BasketDate       400220 non-null  object 
 2   Sale             400220 non-null  float64
 3   CustomerID       400220 non-null  int64  
 4   CustomerCountry  400220 non-null  object 
 5   ProdID           400220 non-null  object 
 6   ProdDescr        400220 non-null  object 
 7   Qta              400220 non-null  int64  
dtypes: float64(1), int64(2), object(5)
memory usage: 19.8+ MB


In [5]:
# Create the dataframe for customers
data = pd.core.frame.DataFrame({'CustomerID' : df['CustomerID'].unique()})
print("Number of unique customers: ", len(data))

Number of unique customers:  4339


In [6]:
# TProd - handle negative Qta
# TSale - handle negativa Qta

# handle discount
# otptimize code, iteration over same object multiple times

In [7]:
unique_products = []
unique_products_ret = []
total_products = []
total_sale = []
total_sale_wret = []
total_prod_returned = []
min_prod_spent = []
max_prod_spent = []
total_orders = []
mean_prod_sale = []
for customer in tqdm(df['CustomerID'].unique(), total=len(data), desc="Iterating over customers"):
    # use set ?
    unique_products.append(len(df.loc[df['CustomerID'] == customer, ['ProdID']].drop_duplicates()))
    unique_products_ret.append(len(df.loc[(df['CustomerID'] == customer) & (df['Qta'] < 0), ['ProdID']].drop_duplicates()))
    total_orders.append(len(df.loc[(df['CustomerID'] == customer) & (df['Qta'] > 0), ['BasketID']].drop_duplicates()))
    
    df_aux = df.loc[df['CustomerID'] == customer, ['Qta', 'Sale']]
    tp = tpr = ts = tswr = 0
    mps = []
    meanps = []
    for _, r in df_aux.iterrows():
        cost = r['Qta']*r['Sale']
        tp += r['Qta'] 
        ts += cost
        if r['Qta'] > 0:
            tswr += cost
            mps.append(cost)
            meanps.append(r['Sale'])
        else:
            tpr += 1 # or -r['Qta']
            
    total_products.append(tp)
    total_prod_returned.append(tpr)
    total_sale.append(ts)
    total_sale_wret.append(tswr)
    min_prod_spent.append(min(mps))
    max_prod_spent.append(max(mps))
    mean_prod_sale.append(stat.fmean(meanps)) 

HBox(children=(HTML(value='Iterating over customers'), FloatProgress(value=0.0, max=4339.0), HTML(value='')))




In [8]:
order_stats_per_customer = {} # key = customerID, value = list(max_products_order, min, mean_accum, max_spent_order, min, mean_accum, mean_counter) 
mean_counter = 0
data_about_order = df[['BasketID','CustomerID']].drop_duplicates()
for _, r in tqdm(data_about_order.iterrows(), total=len(data_about_order), desc="Iterating over orders"):
    customer = r["CustomerID"]
    order = r["BasketID"]
    df_aux = df.loc[(df['BasketID'] == order) & (df['CustomerID'] == customer), ['Qta','Sale']]  
    temp_prod = 0
    temp_sale = 0    
    for _, int_r in df_aux.iterrows():
        temp_prod += int_r['Qta']
        temp_sale += int_r['Qta']*int_r['Sale']
    is_neg = True
    if temp_prod < 0 or temp_sale < 0:
        is_neg = False  
    if customer not in order_stats_per_customer:
        order_stats_per_customer[customer] = [math.inf, math.inf, math.inf, math.inf, math.inf, math.inf, 1]
    else:
        if not is_neg:
            if order_stats_per_customer[customer][0] < temp_prod or order_stats_per_customer[customer][0] != math.inf:
                order_stats_per_customer[customer][0] = temp_prod
            if order_stats_per_customer[customer][1] > temp_prod or order_stats_per_customer[customer][1] != math.inf:
                order_stats_per_customer[customer][1] = temp_prod
        order_stats_per_customer[customer][2] += temp_prod
        if not is_neg:
            if order_stats_per_customer[customer][3] < temp_sale or order_stats_per_customer[customer][3] != math.inf:
                order_stats_per_customer[customer][3] = temp_sale
            if order_stats_per_customer[customer][4] > temp_sale or order_stats_per_customer[customer][4] != math.inf:
                order_stats_per_customer[customer][4] = temp_sale
        order_stats_per_customer[customer][5] += temp_sale
        order_stats_per_customer[customer][6] += 1

HBox(children=(HTML(value='Iterating over orders'), FloatProgress(value=0.0, max=21678.0), HTML(value='')))




In [9]:
max_spent_order_wise = []
min_spent_order_wise = []
mean_spent_order_wise = []
max_products_order_wise = []
min_products_order_wise = []
mean_products_order_wise = []

for customer in tqdm(df['CustomerID'].unique(), total=len(data), desc="Iterating over customers"):
    max_products_order_wise.append(order_stats_per_customer[customer][0])
    min_products_order_wise.append(order_stats_per_customer[customer][1])
    mean_products_order_wise.append(order_stats_per_customer[customer][2] / order_stats_per_customer[customer][6])
    max_spent_order_wise.append(order_stats_per_customer[customer][3])
    min_spent_order_wise.append(order_stats_per_customer[customer][4])
    mean_spent_order_wise.append(order_stats_per_customer[customer][5] / order_stats_per_customer[customer][6])

HBox(children=(HTML(value='Iterating over customers'), FloatProgress(value=0.0, max=4339.0), HTML(value='')))




In [10]:
mean_order_month = []
mean_prod_month = []
mean_sale_month = []
for customer in tqdm(df['CustomerID'].unique(), total=len(data), desc="Iterating over customers"):
    order_per_month = []
    prod_per_month = []
    sale_per_month = []
    df_aux = df.loc[df['CustomerID'] == customer, ['BasketDate', 'BasketID', 'ProdID', 'Sale', 'Qta']]
    for month in range(1, 13):
        tot = 0
        orders = set()
        prods = set()
        for _, r in df_aux.iterrows():
             if r['BasketDate'][3:5] == str(month):
                orders.add(r['BasketID'])
                prods.add(r['ProdID'])
                tot += r['Qta']*r['Sale']
        order_per_month.append(len(orders))
        prod_per_month.append(len(prods))
        sale_per_month.append(tot)
    mean_order_month.append(stat.fmean(order_per_month))
    mean_prod_month.append(stat.fmean(prod_per_month))
    mean_sale_month.append(stat.fmean(sale_per_month))

HBox(children=(HTML(value='Iterating over customers'), FloatProgress(value=0.0, max=4339.0), HTML(value='')))




In [11]:
# I: the total number of items purchased by a customer during the period of observation.
data['TProd'] = total_products

In [12]:
# Iu: the number of distinct items bought by a customer in the period of observation.
data['DProd'] = unique_products

In [13]:
# Imax: the maximum number of items purchased by a customer during a shopping session
data['MaxPO'] = max_products_order_wise

In [14]:
# Imin: the minimum number of items purchased by a customer during a shopping session
data['MinPO'] = min_products_order_wise

In [15]:
# E: the Shannon entropy on the purchasing behaviour of the customer

In [16]:
# ProdPerOrderMean: mean number of items purchased by a customer during a shopping session
data['MeanProdOrder'] = mean_products_order_wise

In [17]:
# SaleTot: total amount spent during the period of observation
data['TSale'] = total_sale

In [18]:
# SaleTotWithoutReturn: total amount spent without negative quantity during the period of observation
data['TSaleWRet'] = total_sale_wret

In [19]:
# SaleMin: min amount spent for a product during the shopping session
data['MinPSale'] = min_prod_spent

In [20]:
# SaleMax: max amount spent for a product during the shopping session
data['MaxPSale'] = max_prod_spent

In [21]:
# SaleMeanPerOrder: mean amount spent for each order during the period of observation

In [22]:
# NumRetProd: number of returned products in the period of observation
data['TRProd'] = total_prod_returned

In [23]:
# NumDistRetProd: number of distinct returned products in the period of observation
data['TRDProd'] = unique_products_ret

In [24]:
# MeanProdCostInOrder: mean cost of the products in the orders in the period of observation
data['MeanPSale'] = mean_prod_sale

In [25]:
# NumberOfOrders: total number of orders made by customer
data['TOrder'] = total_orders

In [26]:
# MeanOrderPerMonth
data['OrderMonth'] = mean_order_month

In [27]:
# MeanProductPerMonth
data['ProdMonth'] = mean_prod_month

In [28]:
# MeanAmountSpentPerMonth
data['SaleMonth'] = mean_sale_month

In [29]:
# OrderPerTrimester

In [30]:
# ProductPerTrimester

In [31]:
# PrizePerTrimester

In [32]:
# OrderPerTrimester 2-3

In [33]:
# ProductPerTrimester 2-3

In [34]:
# PrizePerTrimester 2-3

In [35]:
# CustomerCounty : value from original dataframe
#data['CustomerCountry'] = ''
#data = data.join(pd.core.frame.DataFrame({'CustomerID':df['CustomerID'], 'CustomerCountry':df['CustomerCountry']}).drop_duplicates().set_index('CustomerID'), on='CustomerID')

#tmp = pd.core.frame.DataFrame({'CustomerID':df['CustomerID'], 'CustomerCountry':df['CustomerCountry']}).drop_duplicates()
#tmp['CustomerID'].value_counts()[0:8]
# 4372 distinct customers - 4380 elements in tmp
# 8 customers have two different nationalities

# Find entries with multiple nationality
#counts = tmp['CustomerID'].value_counts().items()
#count_dict = {x[0]:x[1] for x in counts}
#for _, row in tmp.sort_values(by=['CustomerID']).iterrows():
#    if count_dict[row['CustomerID']] > 1:
#        print(f"{row['CustomerID']}-{row['CustomerCountry']}")

In [36]:
data

Unnamed: 0,CustomerID,TProd,DProd,MaxPO,MinPO,MeanProdOrder,TSale,TSaleWRet,MinPSale,MaxPSale,TRProd,TRDProd,MeanPSale,TOrder,OrderMonth,ProdMonth,SaleMonth
0,17850,1702.0,21,inf,-31.0,inf,5317.89,5391.21,6.36,107.25,8,6,3.960370,34,2.833333,1.750000,449.267500
1,13047,1356.0,106,inf,-6.0,inf,3094.05,3237.54,4.95,68.00,23,20,3.932035,10,0.500000,4.083333,75.105000
2,12583,5009.0,115,inf,-36.0,inf,7187.34,7281.38,6.80,132.80,4,4,3.103603,15,0.500000,6.500000,248.013333
3,13748,439.0,24,inf,inf,inf,948.25,948.25,9.36,204.00,0,0,3.996429,5,0.083333,0.083333,17.000000
4,15100,58.0,1,inf,-3.0,inf,635.10,876.00,175.20,350.40,3,1,10.950000,3,0.333333,0.083333,41.062500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4334,13436,76.0,12,inf,inf,inf,196.89,196.89,10.20,25.50,0,0,5.830000,1,0.083333,1.000000,16.407500
4335,15520,314.0,18,inf,inf,inf,343.50,343.50,9.48,34.68,0,0,1.724444,1,0.083333,1.500000,28.625000
4336,13298,96.0,2,inf,inf,inf,360.00,360.00,90.00,270.00,0,0,3.750000,1,0.083333,0.166667,30.000000
4337,14569,79.0,10,inf,inf,inf,227.39,227.39,1.65,122.40,0,0,3.920000,1,0.083333,0.833333,18.949167


In [None]:
data.info()