# Data preparation
In questa sezione estrarremo i dati rilevanti per il clustering dei clienti.

## Import dataset

In [19]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [20]:
df = pd.read_csv("dataset/DQ-dataset.csv", sep="\t",index_col=0, decimal=",")
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 456644 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   BasketID         456644 non-null  int64  
 1   BasketDate       456644 non-null  object 
 2   Sale             456644 non-null  float64
 3   CustomerID       456644 non-null  object 
 4   CustomerCountry  456644 non-null  object 
 5   ProdID           456644 non-null  object 
 6   ProdDescr        456644 non-null  object 
 7   Qta              456644 non-null  int64  
dtypes: float64(1), int64(2), object(5)
memory usage: 31.4+ MB


## Estrazione nuove features

Estraiamo le feature per ogni utente:
* I: the total number of items purchased by a customer during the period of
observation.

⚠️Si intende la somma delle quantità? Si chiarirà anche a lezione

* Iu: the number of distinct items bought by a customer in the period of
observation.

* Imax: the maximum number of items purchased by a customer during a
shopping session

⚠️Si intende la quantità massima? si chiarirà anche a lezione

* E: the Shannon entropy on the purchasing behaviour of the customer

Utilizzeremo la shanno entropy indicata nella seguente formula:
$H(X)=-\sum_{i=1}^{n} \frac{m_{i}}{m} \log _{2} \frac{m_{i}}{m}$


In [21]:
import numpy as np
def entropy(labels, base=None):

  m = len(labels)
  ent = 0.
  base = e if base is None else base

  if m <= 1:
    return 0

  _,mi = np.unique(labels, return_counts=True)
  probs = mi / m

  n_classes = np.count_nonzero(probs)
  if n_classes <= 1:
    return 0

  
  for p in probs:
    ent -= p * np.math.log(p, base)

  return ent

Quindi processiamo i dati e creiamo un entrata per ogni utente

In [24]:
df_pil = pd.read_csv("dataset/Country_PIL.csv", sep=";",index_col=0)

In [25]:
dict_pil={}
for index, row in df_pil.iterrows():
    dict_pil[row["Country"]]=row["PIL"]

In [5]:
new_data=[]
for elem in df.CustomerID.unique():
    I=df[df["CustomerID"]==elem].Qta.sum()
    Iu=len(df[df["CustomerID"]==elem].ProdID.unique())
    Imax=df[df["CustomerID"]==elem].Qta.max()
    E=entropy(df[df["CustomerID"]==elem].ProdID,2)
    
    country=df[df["CustomerID"]==elem].CustomerCountry.iloc[0]
    PIL = dict_pil[country]\
    new_data.append([elem, I, Iu,Imax,E, PIL])

    #print("Customer id:\t"+str(elem))
    #print("l:\t"+str(i))
    #print("lu:\t"+str(iu))
    #print("lmax:\t"+str(imax))
    #print("E:\t"+str(E))
new_df = pd.DataFrame(new_data, columns=['CustomerID', 'I', 'Iu', 'Imax','E','PIL'])


In [27]:
new_df.to_csv('dataset/_temp_CS-dataset.csv', sep='\t', index=False, decimal=",")

NameError: name 'new_df' is not defined

## Media Soldi Spesi per ogni basketID


In [28]:

#media soli spesi per ogni basketid
temp = df
temp["Total"] = temp["Sale"]*temp["Qta"]
temp = temp.drop(["Sale"],axis=1)
temp = temp.drop(["Qta"],axis=1)
temp.groupby(["ProdID","BasketID"])["Total"].mean()
temp.groupby(["BasketID"]).mean()

Unnamed: 0_level_0,Total
BasketID,Unnamed: 1_level_1
536365,19.874286
536366,11.100000
536367,23.227500
536368,17.512500
536369,17.850000
...,...
581583,62.300000
581584,70.320000
581585,15.669048
581586,84.800000


## Media soldi spesi per ogni customer

In [29]:
temp = df
temp["Total"] = temp["Sale"]*temp["Qta"]
temp = temp.drop(["Sale"],axis=1)
temp = temp.drop(["Qta"],axis=1)
temp.groupby(["CustomerID"])["Total"].sum()

CustomerID
10023N    1552.50
10202N      12.75
10353N     741.56
10370N    2780.29
10374N     618.82
           ...   
99606N     888.64
99733N       9.32
9983N       60.00
99958N     224.29
99983N     659.81
Name: Total, Length: 5575, dtype: float64