In [1]:
import math
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import statistics as stat
import seaborn as sns

In [2]:
# Starting dataset: tab-separated file whose first column is used as the row index.
df = pd.read_csv('cleaned_dataframe.csv', sep='\t', index_col=0)

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Column names, dtypes and non-null counts of the loaded frame.
df.info()

In [3]:
# Create the per-customer dataframe: one row per distinct CustomerID,
# in order of first appearance in `df`. Feature columns are added to it later.
# (Uses the public `pd.DataFrame` constructor rather than the internal
# `pd.core.frame` module path.)
data = pd.DataFrame({'CustomerID': df['CustomerID'].unique()})
print("Number of unique customers: ", len(data))

Number of unique customers:  4339


In [27]:
# Handle Discount

In [8]:
def _customer_product_features(transactions, discount_id='D'):
    """Compute per-customer product and spending features with grouped aggregations.

    Replaces a per-customer loop (three full-frame scans plus a row-by-row
    ``iterrows`` pass for every customer) with one groupby per feature.

    Parameters
    ----------
    transactions : pandas.DataFrame
        Line-level records with the columns 'CustomerID', 'BasketID',
        'ProdID', 'Qta' and 'Sale'.
    discount_id : str, default 'D'
        'ProdID' value that marks a discount line. Discounts are not real
        products: they are left out of product counts, quantities and the
        min/max line amounts, but they do contribute to the money totals and
        to the mean 'Sale' value.

    Returns
    -------
    dict of str -> list
        One list per feature, each aligned with
        ``transactions['CustomerID'].unique()`` (order of first appearance).
        Count/sum features default to 0 for a customer with no matching line;
        min/max/mean features are NaN in that case instead of raising.
    """
    customers = pd.Index(transactions['CustomerID'].unique(), name='CustomerID')

    lines = transactions[['CustomerID', 'BasketID', 'ProdID', 'Qta', 'Sale']].copy()
    lines['Cost'] = lines['Qta'] * lines['Sale']

    # Row masks as plain arrays: filtering is positional and independent of the index.
    every_line = np.ones(len(lines), dtype=bool)
    is_discount = (lines['ProdID'] == discount_id).to_numpy()
    is_purchase = (lines['Qta'] > 0).to_numpy()
    is_negative = (lines['Qta'] < 0).to_numpy()
    # Lines that count towards "sales without returns": purchases plus discounts.
    counts_as_sale = is_purchase | is_discount

    def aggregate(mask, column, how, fill):
        """Aggregate `column` per customer over the rows selected by `mask`."""
        grouped = lines.loc[mask].groupby('CustomerID')[column].agg(how)
        return grouped.reindex(customers, fill_value=fill).tolist()

    return {
        # Distinct products over all lines (purchases and returns), discounts excluded.
        'unique_products': aggregate(~is_discount, 'ProdID', 'nunique', 0),
        # Distinct products among negative-quantity lines, discounts excluded.
        'unique_products_ret': aggregate(is_negative & ~is_discount, 'ProdID', 'nunique', 0),
        # Distinct baskets containing at least one positive-quantity line.
        'total_orders': aggregate(is_purchase, 'BasketID', 'nunique', 0),
        # Net quantity (returns subtract), discounts excluded.
        'total_products': aggregate(~is_discount, 'Qta', 'sum', 0),
        # Total amount over every line, returns and discounts included.
        'total_sale': aggregate(every_line, 'Cost', 'sum', 0),
        # Total amount over purchases and discounts only (returns left out).
        'total_sale_wret': aggregate(counts_as_sale, 'Cost', 'sum', 0),
        # Number of LINES (not units) that are neither purchases nor discounts.
        'total_prod_returned': aggregate(~counts_as_sale, 'Qta', 'size', 0),
        # Smallest / largest line amount among purchase lines. Discounts are
        # excluded explicitly; the previous code only tested Qta > 0 and relied
        # on discount lines never having a positive quantity.
        'min_prod_spent': aggregate(is_purchase & ~is_discount, 'Cost', 'min', np.nan),
        'max_prod_spent': aggregate(is_purchase & ~is_discount, 'Cost', 'max', np.nan),
        # Mean 'Sale' value over purchase and discount lines.
        'mean_prod_sale': aggregate(counts_as_sale, 'Sale', 'mean', np.nan),
    }


_product_features = _customer_product_features(df)

unique_products = _product_features['unique_products']
unique_products_ret = _product_features['unique_products_ret']
total_products = _product_features['total_products']
total_sale = _product_features['total_sale']
total_sale_wret = _product_features['total_sale_wret']
total_prod_returned = _product_features['total_prod_returned']
min_prod_spent = _product_features['min_prod_spent']
max_prod_spent = _product_features['max_prod_spent']
total_orders = _product_features['total_orders']
mean_prod_sale = _product_features['mean_prod_sale']

HBox(children=(HTML(value='Iterating over customers'), FloatProgress(value=0.0, max=4339.0), HTML(value='')))




In [6]:
def _order_stats_per_customer(transactions):
    """Summarise each customer's orders (baskets) with grouped aggregations.

    Replaces a loop that re-scanned the whole frame and iterated row by row
    for every (BasketID, CustomerID) pair.

    An order's quantity is the sum of 'Qta' over its lines and its amount is
    the sum of 'Qta' * 'Sale'; discount lines are included in both. An order
    whose quantity or amount is negative is ignored for the max/min values but
    still counts towards the accumulated sums and the order counter, so it
    lowers the means derived from them.

    Parameters
    ----------
    transactions : pandas.DataFrame
        Line-level records with the columns 'CustomerID', 'BasketID', 'Qta'
        and 'Sale'.

    Returns
    -------
    dict
        Maps each CustomerID to the list
        ``[max_products_order, min_products_order, products_sum,
        max_spent_order, min_spent_order, spent_sum, order_count]``.
        For a customer without any non-negative order the four max/min entries
        are NaN (previously the ``math.inf`` "unset" sentinel leaked through).
    """
    lines = transactions[['CustomerID', 'BasketID', 'Qta', 'Sale']].copy()
    lines['Cost'] = lines['Qta'] * lines['Sale']

    # One row per order: total quantity and total amount.
    per_order = lines.groupby(['CustomerID', 'BasketID'], sort=False)[['Qta', 'Cost']].sum()
    non_negative = (per_order['Qta'] >= 0) & (per_order['Cost'] >= 0)

    all_orders = per_order.groupby(level='CustomerID')
    regular_orders = per_order[non_negative].groupby(level='CustomerID')

    layout = ['max_products', 'min_products', 'products_sum',
              'max_spent', 'min_spent', 'spent_sum', 'order_count']
    summary = pd.DataFrame({
        'max_products': regular_orders['Qta'].max(),
        'min_products': regular_orders['Qta'].min(),
        'products_sum': all_orders['Qta'].sum(),
        'max_spent': regular_orders['Cost'].max(),
        'min_spent': regular_orders['Cost'].min(),
        'spent_sum': all_orders['Cost'].sum(),
        'order_count': all_orders.size(),
    }, columns=layout)

    # Convert column by column so integer columns stay integers in the lists.
    per_column = [summary[name].tolist() for name in layout]
    return {
        customer: list(values)
        for customer, values in zip(summary.index.tolist(), zip(*per_column))
    }


mean_counter = 0  # not used by this cell; kept in case another cell still reads it
data_about_order = df[['BasketID','CustomerID']].drop_duplicates()

# key = customerID, value = list(max_products_order, min, mean_accum, max_spent_order, min, mean_accum, mean_counter)
order_stats_per_customer = _order_stats_per_customer(df)

HBox(children=(HTML(value='Iterating over orders'), FloatProgress(value=0.0, max=21736.0), HTML(value='')))




In [7]:
# Flatten the per-customer order statistics into one list per feature,
# aligned with df['CustomerID'].unique().
max_products_order_wise = []
min_products_order_wise = []
mean_products_order_wise = []
max_spent_order_wise = []
min_spent_order_wise = []
mean_spent_order_wise = []

for customer in tqdm(df['CustomerID'].unique(), total=len(data), desc="Iterating over customers"):
    # Layout: [max products, min products, products sum, max spent, min spent, spent sum, order count]
    (most_products, fewest_products, products_sum,
     highest_spent, lowest_spent, spent_sum, order_count) = order_stats_per_customer[customer]

    max_products_order_wise.append(most_products)
    min_products_order_wise.append(fewest_products)
    mean_products_order_wise.append(products_sum / order_count)
    max_spent_order_wise.append(highest_spent)
    min_spent_order_wise.append(lowest_spent)
    mean_spent_order_wise.append(spent_sum / order_count)

HBox(children=(HTML(value='Iterating over customers'), FloatProgress(value=0.0, max=4339.0), HTML(value='')))




In [42]:
def _monthly_means(transactions, months_in_year=12):
    """Average each customer's monthly activity over the calendar months.

    The month is read from characters 3-4 of the 'BasketDate' string and
    compared numerically. The previous comparison against ``str(month)`` could
    never match a two-character slice for months 1-9, so only October to
    December were ever counted.

    Parameters
    ----------
    transactions : pandas.DataFrame
        Line-level records with the columns 'CustomerID', 'BasketDate',
        'BasketID', 'ProdID', 'Sale' and 'Qta'. 'BasketDate' must be a string
        column.
    months_in_year : int, default 12
        Divisor for the means; months without activity count as zero.

    Returns
    -------
    tuple of three lists
        Mean distinct orders per month, mean distinct products per month and
        mean amount ('Qta' * 'Sale') per month, each aligned with
        ``transactions['CustomerID'].unique()``.

    Notes
    -----
    Only the month number is used, so the same calendar month of different
    years is pooled into one bucket, as before.
    """
    customers = pd.Index(transactions['CustomerID'].unique(), name='CustomerID')

    lines = transactions[['CustomerID', 'BasketDate', 'BasketID', 'ProdID', 'Sale', 'Qta']].copy()
    lines['Month'] = pd.to_numeric(lines['BasketDate'].str[3:5], errors='coerce')
    lines['Cost'] = lines['Qta'] * lines['Sale']

    # Lines whose month cannot be parsed fall in no bucket, as with the string comparison.
    dated = lines[lines['Month'].between(1, months_in_year)]
    by_month = dated.groupby(['CustomerID', 'Month'])
    per_month = pd.DataFrame({
        'orders': by_month['BasketID'].nunique(),
        'prods': by_month['ProdID'].nunique(),
        'sale': by_month['Cost'].sum(),
    })

    per_customer = (
        per_month.groupby(level='CustomerID').sum()
        .reindex(customers, fill_value=0)
        / months_in_year
    )
    return (
        per_customer['orders'].tolist(),
        per_customer['prods'].tolist(),
        per_customer['sale'].tolist(),
    )


mean_order_month, mean_prod_month, mean_sale_month = _monthly_means(df)

HBox(children=(HTML(value='Iterating over customers'), FloatProgress(value=0.0, max=4339.0), HTML(value='')))

HBox(children=(HTML(value='Iterating over customers'), FloatProgress(value=0.0, max=4339.0), HTML(value='')))





In [9]:
# I: the total number of items purchased by a customer during the period of observation.
# Net quantity: the sum of 'Qta' over the customer's lines, so negative (returned)
# quantities subtract; discount lines (ProdID 'D') are ignored.
data['TProd'] = total_products

In [10]:
# Iu: the number of distinct items bought by a customer in the period of observation.
# Distinct ProdID values over all of the customer's lines (purchases and returns),
# not counting the discount code 'D'.
data['DProd'] = unique_products

In [11]:
# Imax: the maximum number of items purchased by a customer during a shopping session.
# Largest per-order total quantity; orders whose total quantity or total amount
# is negative are not considered.
data['MaxPO'] = max_products_order_wise

In [12]:
# Imin: the minimum number of items purchased by a customer during a shopping session.
# Smallest per-order total quantity; orders whose total quantity or total amount
# is negative are not considered.
data['MinPO'] = min_products_order_wise

In [None]:
# E: the Shannon entropy on the purchasing behaviour of the customer

In [13]:
# ProdPerOrderMean: mean number of items purchased by a customer during a shopping session.
# Sum of per-order quantities divided by the number of orders; orders with
# negative totals are included in both the sum and the count.
data['MeanProdOrder'] = mean_products_order_wise

In [14]:
# SaleTot: total amount spent during the period of observation.
# Sum of Qta * Sale over all of the customer's lines, returns and discounts included.
data['TSale'] = total_sale

In [15]:
# SaleTotWithoutReturn: total amount spent, ignoring returns, during the period of observation.
# Sum of Qta * Sale over positive-quantity lines plus discount lines (ProdID 'D').
data['TSaleWRet'] = total_sale_wret

In [16]:
# SaleMin: min amount spent on a single purchase line (Qta * Sale with Qta > 0).
# Taken over the whole period of observation, not per shopping session.
data['MinPSale'] = min_prod_spent

In [17]:
# SaleMax: max amount spent on a single purchase line (Qta * Sale with Qta > 0).
# Taken over the whole period of observation, not per shopping session.
data['MaxPSale'] = max_prod_spent

In [19]:
# SaleMeanPerOrder: mean amount spent for each order during the period of observation.
# Sum of per-order amounts divided by the number of orders; orders with
# negative totals are included in both the sum and the count.
data['MeanSaleOrder'] = mean_spent_order_wise

In [20]:
# NumRetProd: number of returned products in the period of observation.
# Counts lines, not units: each non-discount line whose quantity is not positive adds 1.
data['TRProd'] = total_prod_returned

In [21]:
# NumDistRetProd: number of distinct returned products in the period of observation.
# Distinct ProdID values among negative-quantity lines, not counting the discount code 'D'.
data['TRDProd'] = unique_products_ret

In [22]:
# MeanProdCostInOrder: mean cost of the products in the orders in the period of observation.
# Mean of the 'Sale' value over positive-quantity lines and discount lines
# (one value per line, not weighted by quantity).
data['MeanPSale'] = mean_prod_sale

In [23]:
# NumberOfOrders: total number of orders made by customer.
# Distinct BasketID values that have at least one positive-quantity line.
data['TOrder'] = total_orders

In [57]:
# MeanOrderPerMonth: number of distinct orders (BasketID) matched to each
# calendar month, averaged over the 12 months (months without activity count as 0).
data['OrderMonth'] = mean_order_month

In [58]:
# MeanProductPerMonth: number of distinct ProdID values matched to each
# calendar month, averaged over the 12 months (months without activity count as 0).
data['ProdMonth'] = mean_prod_month

In [59]:
# MeanAmountSpentPerMonth: sum of Qta * Sale matched to each calendar month,
# averaged over the 12 months (months without activity count as 0).
data['SaleMonth'] = mean_sale_month

In [None]:
# CustomerCountry: value from the original dataframe
#data['CustomerCountry'] = ''
#data = data.join(pd.core.frame.DataFrame({'CustomerID':df['CustomerID'], 'CustomerCountry':df['CustomerCountry']}).drop_duplicates().set_index('CustomerID'), on='CustomerID')

#tmp = pd.core.frame.DataFrame({'CustomerID':df['CustomerID'], 'CustomerCountry':df['CustomerCountry']}).drop_duplicates()
#tmp['CustomerID'].value_counts()[0:8]
# 4372 distinct customers - 4380 elements in tmp
# 8 customers have two different nationalities

# Find entries with multiple nationality
#counts = tmp['CustomerID'].value_counts().items()
#count_dict = {x[0]:x[1] for x in counts}
#for _, row in tmp.sort_values(by=['CustomerID']).iterrows():
#    if count_dict[row['CustomerID']] > 1:
#        print(f"{row['CustomerID']}-{row['CustomerCountry']}")

In [26]:
# Display the assembled per-customer feature table.
data

Unnamed: 0,CustomerID,TProd,DProd,MaxPO,MinPO,MeanProdOrder,TSale,TSaleWRet,MinPSale,MaxPSale,MeanSaleOrder,TRProd,TRDProd,MeanPSale,TOrder
9,14527,2084,330,96,1,24.105882,7789.69,7818.29,0.39,87.6,91.643412,3,3,5.859692,55


In [25]:
# Column dtypes and non-null counts of the per-customer feature table.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4339 entries, 0 to 4338
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   CustomerID     4339 non-null   int64  
 1   TProd          4339 non-null   int64  
 2   DProd          4339 non-null   int64  
 3   MaxPO          4339 non-null   int64  
 4   MinPO          4339 non-null   int64  
 5   MeanProdOrder  4339 non-null   float64
 6   TSale          4339 non-null   float64
 7   TSaleWRet      4339 non-null   float64
 8   MinPSale       4339 non-null   float64
 9   MaxPSale       4339 non-null   float64
 10  MeanSaleOrder  4339 non-null   float64
 11  TRProd         4339 non-null   int64  
 12  TRDProd        4339 non-null   int64  
 13  MeanPSale      4339 non-null   float64
 14  TOrder         4339 non-null   int64  
dtypes: float64(7), int64(8)
memory usage: 508.6 KB


In [None]:
# Correlation matrix of the per-customer features, drawn as an annotated heatmap.
f, ax = plt.subplots(figsize=(15, 13))
correlation = data.corr()
sns.heatmap(
    correlation,
    cmap="coolwarm",
    # Correlations range over [-1, 1]; with vmin=0 every negative value was
    # clipped to the same colour as 0 on this diverging colormap.
    vmin=-1,
    vmax=1,
    annot=True,
    # All-False mask, i.e. every cell is shown. The builtin `bool` replaces the
    # `np.bool` alias, which was deprecated in NumPy 1.20 and removed in 1.24.
    mask=np.zeros_like(correlation, dtype=bool),
    square=True,
    ax=ax,
    edgecolor='black',
)
plt.xticks(rotation=315)
plt.title("Correlation matrix")