In [1]:
import math
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt


In [2]:
# Starting dataset: read the tab-separated transactions file,
# using its first column as the DataFrame index.
df = pd.read_csv(
    'customer_supermarket.csv',
    sep='\t',
    index_col=0,
)

In [3]:
# Handle NaN CustomerID rows -> drop them.
# dropna() filters the rows themselves, so it stays correct even if index
# labels are not unique (a label-based drop would remove every row sharing a
# label with a NaN row). Rebinding `df` instead of mutating in place also keeps
# the cell safe to re-run.
df = df.dropna(subset=['CustomerID'])

In [4]:
# Set Sale to have the correct type (float).
# The regex turns a comma between digits into a dot (e.g. '2,55' -> '2.55')
# so that the values can be cast to float.
# The result is assigned back to the column rather than calling
# replace(..., inplace=True) on df['Sale']: that chained in-place form operates
# on a temporary object under pandas Copy-on-Write and leaves `df` unchanged.
df['Sale'] = (
    df['Sale']
    .replace(to_replace=r'(\d+),(\d*)', value=r'\1.\2', regex=True)
    .astype(float)
)

In [5]:
# Clean CustomerID from trailing '.0': cast to string, then keep only the
# digits before the decimal point.
# Assigned back to the column instead of using a chained
# replace(..., inplace=True), which does not modify `df` under pandas
# Copy-on-Write.
df['CustomerID'] = (
    df['CustomerID']
    .astype(str)
    .replace(to_replace=r'(\d+)\.(?:\d*)?', value=r'\1', regex=True)
)

In [6]:
# Preview the first rows of the cleaned transactions to sanity-check the
# Sale and CustomerID conversions above.
df.head()

Unnamed: 0,BasketID,BasketDate,Sale,CustomerID,CustomerCountry,ProdID,ProdDescr,Qta
0,536365,01/12/10 08:26,2.55,17850,United Kingdom,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6
1,536365,01/12/10 08:26,3.39,17850,United Kingdom,71053,WHITE METAL LANTERN,6
2,536365,01/12/10 08:26,2.75,17850,United Kingdom,84406B,CREAM CUPID HEARTS COAT HANGER,8
3,536365,01/12/10 08:26,3.39,17850,United Kingdom,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6
4,536365,01/12/10 08:26,3.39,17850,United Kingdom,84029E,RED WOOLLY HOTTIE WHITE HEART.,6


In [7]:
# Check column dtypes and non-null counts after cleaning
# (Sale should now be float64 and CustomerID should have no missing values).
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 406830 entries, 0 to 541909
Data columns (total 8 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   BasketID         406830 non-null  object 
 1   BasketDate       406830 non-null  object 
 2   Sale             406830 non-null  float64
 3   CustomerID       406830 non-null  object 
 4   CustomerCountry  406830 non-null  object 
 5   ProdID           406830 non-null  object 
 6   ProdDescr        406830 non-null  object 
 7   Qta              406830 non-null  int64  
dtypes: float64(1), int64(1), object(6)
memory usage: 18.6+ MB


In [8]:
# Create the dataframe for customers: one row per distinct CustomerID,
# in order of first appearance in `df`.
# Uses the public pd.DataFrame constructor (pandas.core is a private module path).
data = pd.DataFrame({'CustomerID': df['CustomerID'].unique()})

In [9]:
# CustomerCountry: value from the original dataframe (left empty for now).
data['CustomerCountry'] = ''

# Distinct (CustomerID, CustomerCountry) pairs. Joining these pairs onto `data`
# by CustomerID would duplicate the customers that map to more than one country,
# so first check how many such customers exist.
tmp = df[['CustomerID', 'CustomerCountry']].drop_duplicates()
tmp['CustomerID'].value_counts().head(8)
# 4372 distinct customers - 4380 elements in tmp
# 8 customers have two different nationalities

12394    2
12422    2
12455    2
12457    2
12429    2
12417    2
12431    2
12370    2
Name: CustomerID, dtype: int64

In [10]:
# Find entries with multiple nationality: print one "CustomerID-Country" line
# for every pair whose CustomerID occurs more than once in `tmp`.
count_dict = tmp['CustomerID'].value_counts().to_dict()

# Sort first, then filter, so lines come out in the sorted-by-CustomerID order.
ordered = tmp.sort_values(by=['CustomerID'])
multi_country = ordered[ordered['CustomerID'].map(count_dict) > 1]

# Iterate over the two columns directly instead of building a Series per row
# with iterrows().
for customer_id, country in zip(multi_country['CustomerID'],
                                multi_country['CustomerCountry']):
    print(f"{customer_id}-{country}")

12370-Cyprus
12370-Austria
12394-Denmark
12394-Belgium
12417-Spain
12417-Belgium
12422-Australia
12422-Switzerland
12429-Austria
12429-Denmark
12431-Australia
12431-Belgium
12455-Cyprus
12455-Spain
12457-Cyprus
12457-Switzerland


In [11]:
# I: the total number of items purchased by a customer during the period of observation.
# Initialised to 0 for every customer as a placeholder (presumably filled in by a later cell).
data['I'] = 0

In [12]:
# Iu: the number of distinct items bought by a customer in the period of observation.
# Initialised to 0 for every customer as a placeholder (presumably filled in by a later cell).
data['Iu'] = 0

In [13]:
# Imax: the maximum number of items purchased by a customer during a shopping session.
# Initialised to 0 for every customer as a placeholder (presumably filled in by a later cell).
data['Imax'] = 0

In [14]:
# E: the Shannon entropy on the purchasing behaviour of the customer.
# Initialised to 0 for every customer as a placeholder (presumably filled in by a later cell).
# NOTE(review): the integer 0 gives this column an integer dtype, while entropy is
# real-valued; consider initialising with 0.0 if later cells assign floats -- confirm.
data['E'] = 0

In [15]:
# Display the customer frame with its placeholder columns
# (the rendered output is truncated by pandas for long frames).
data

Unnamed: 0,CustomerID,CustomerCountry,I,Iu,Imax,E
0,17850,,0,0,0,0
1,13047,,0,0,0,0
2,12583,,0,0,0,0
3,13748,,0,0,0,0
4,15100,,0,0,0,0
...,...,...,...,...,...,...
4367,13436,,0,0,0,0
4368,15520,,0,0,0,0
4369,13298,,0,0,0,0
4370,14569,,0,0,0,0
