# Data preparation
In questa sezione estrarremo i dati rilevanti per il clustering dei clienti.

## Import dataset

In [31]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [32]:
# Load the dataset: tab-separated, comma as decimal mark, first column used as the index.
df = pd.read_csv(
    filepath_or_buffer="dataset/DQ-dataset.csv",
    decimal=",",
    index_col=0,
    sep="\t",
)
# Print column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 456644 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   BasketID         456644 non-null  int64  
 1   BasketDate       456644 non-null  object 
 2   Sale             456644 non-null  float64
 3   CustomerID       456644 non-null  object 
 4   CustomerCountry  456644 non-null  object 
 5   ProdID           456644 non-null  object 
 6   ProdDescr        456644 non-null  object 
 7   Qta              456644 non-null  int64  
dtypes: float64(1), int64(2), object(5)
memory usage: 31.4+ MB


## Estrazione nuove features

Estraiamo le feature per ogni utente:
* I: the total number of items purchased by a customer during the period of
observation. (Si intende la somma delle quantità.)

* Iu: the number of distinct items bought by a customer in the period of
observation.

* Imax: the maximum number of items purchased by a customer during a
shopping session (Si intende la quantità massima)

* E: the Shannon entropy on the purchasing behaviour of the customer

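Utilizzeremo la Shannon entropy indicata nella seguente formula, dove $m_i$ è il numero di occorrenze dell'$i$-esimo valore distinto e $m$ è il numero totale di osservazioni:
$H(X)=-\sum_{i=1}^{n} \frac{m_{i}}{m} \log _{2} \frac{m_{i}}{m}$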


In [33]:
import numpy as np
def entropy(labels, base=None):
    """Shannon entropy of the empirical distribution of ``labels``.

    Computes ``H = -sum_i (m_i / m) * log_base(m_i / m)``, where ``m_i`` is the
    number of occurrences of the i-th distinct label and ``m = len(labels)``.

    Parameters
    ----------
    labels : sized collection accepted by ``np.unique``
        The observed values (e.g. a list or a pandas Series).
    base : float, optional
        Base of the logarithm. ``None`` (the default) means the natural
        logarithm, i.e. base ``e``.

    Returns
    -------
    ``0`` when ``labels`` has at most one element or only one distinct value;
    otherwise the entropy as a float.
    """
    m = len(labels)
    if m <= 1:
        return 0

    _, mi = np.unique(labels, return_counts=True)
    # A single distinct value carries no uncertainty.
    if len(mi) <= 1:
        return 0

    probs = mi / m
    # The original referenced an undefined name ``e`` here; use NumPy's constant.
    log_base = np.e if base is None else base

    # Change of base: log_b(p) = ln(p) / ln(b). Counts from np.unique are
    # always >= 1, so every probability is strictly positive and ln is defined.
    return -np.sum(probs * np.log(probs)) / np.log(log_base)

## Gestione Pil

In [34]:
# Load the per-country table: semicolon-separated, first column used as the index.
df_pil = pd.read_csv(
    filepath_or_buffer="dataset/Country_PIL.csv",
    index_col=0,
    sep=";",
)

In [35]:
# Map each country name to its "PIL" value (presumably GDP — confirm against the data source).
# If a country appears more than once, the last row wins.
dict_pil = dict(zip(df_pil["Country"], df_pil["PIL"]))

## Gestione spesa media Clienti

In [36]:
# Spend per row = unit price * quantity. ``assign`` returns a new frame, so the
# shared ``df`` is not silently given an extra "Total" column (the original
# ``temp = df`` was an alias, not a copy).
temp = df.assign(Total=df["Sale"] * df["Qta"]).drop(["Sale", "Qta"], axis=1)

# Total spend of each basket, indexed by (CustomerID, BasketID).
sum_single_basket = temp.groupby(["CustomerID", "BasketID"])[["Total"]].sum()

# Average basket total per customer, as a frame with columns CustomerID, Total.
mean_of_basket = sum_single_basket.groupby(["CustomerID"])["Total"].mean().reset_index()

## Gestione Saldi totali spesi clienti

In [37]:
# Spend per row = unit price * quantity. ``assign`` returns a new frame, so the
# shared ``df`` is left untouched (the original ``temp = df`` was an alias).
temp = df.assign(Total=df["Sale"] * df["Qta"]).drop(["Sale", "Qta"], axis=1)

# Overall spend per customer, as a frame with columns CustomerID, Total.
total_sum = temp.groupby(["CustomerID"])["Total"].sum().reset_index()

## Gestione distanza acquisti

In [38]:
# One row per basket: the customer and date found on the basket's first row.
# ``drop_duplicates(keep="first")`` keeps baskets in order of first appearance,
# exactly like looping over ``BasketID.unique()``, but in a single pass instead
# of re-filtering the whole frame for every basket.
single_basket_date = (
    df.drop_duplicates(subset="BasketID", keep="first")
    .loc[:, ["BasketID", "CustomerID", "BasketDate"]]
    .reset_index(drop=True)
)

In [39]:
# Order the baskets, parse the dates, then compute for each basket the number of
# whole days since the same customer's previous basket (NaN for the first one).
# NOTE(review): the sort runs before BasketDate is parsed and BasketID is the
# leading key, so the gaps follow BasketID order — confirm BasketIDs are chronological.
single_basket_date = (
    single_basket_date
    .sort_values(by=["BasketID", "CustomerID", "BasketDate"])
    .assign(BasketDate=lambda frame: pd.to_datetime(frame["BasketDate"]))
    .assign(time_diff=lambda frame: frame.groupby("CustomerID")["BasketDate"].diff().dt.days)
)

In [40]:
# Baskets with no previous basket for the same customer have no gap; store 0 for them.
# NOTE(review): these zeros take part in the mean/min/max computed later per
# customer — confirm that treating a first purchase as a 0-day gap is intended.
single_basket_date = single_basket_date.fillna({"time_diff": 0})

In [41]:
# Preview only the first rows instead of rendering the whole frame.
single_basket_date.head()

72.0

## Entropia periodo

In [42]:
# Keep only the calendar day of each purchase (time of day set to midnight).
df["BasketDate"] = pd.to_datetime(df["BasketDate"]).dt.normalize()
# Calendar quarter of each purchase, used later for the per-customer period entropy.
df["trim"] = df["BasketDate"].dt.to_period("Q")

## Creazione nuovo dataset

In [45]:
# Per-customer lookups, computed once instead of re-filtering the helper frames
# for every customer inside the loop.
mean_basket_by_customer = mean_of_basket.set_index("CustomerID")["Total"]
total_by_customer = total_sum.set_index("CustomerID")["Total"]
gap_stats = (
    single_basket_date.groupby("CustomerID")["time_diff"]
    .agg(["mean", "max", "min"])
    # Customers with no row in single_basket_date get NaN statistics.
    .reindex(df["CustomerID"].unique())
)

new_data = []
# A single groupby pass hands over each customer's rows directly; sort=False
# keeps customers in order of first appearance, as CustomerID.unique() does.
for elem, purchases in df.groupby("CustomerID", sort=False):
    # I: total quantity, Iu: number of distinct products, Imax: largest single quantity
    I = purchases["Qta"].sum()
    Iu = len(purchases["ProdID"].unique())
    Imax = purchases["Qta"].max()
    # E: base-2 entropy of the products bought
    E = entropy(purchases["ProdID"], 2)

    # PIL indicator, looked up from the country on the customer's first row
    country = purchases["CustomerCountry"].iloc[0]
    PIL = dict_pil[country]

    # Mean basket total
    Mb = mean_basket_by_customer.at[elem]

    # Total spend
    Ts = total_by_customer.at[elem]

    # Mean / min / max number of days between consecutive baskets
    MeanD = gap_stats.at[elem, "mean"]
    MinD = gap_stats.at[elem, "min"]
    MaxD = gap_stats.at[elem, "max"]

    # Base-2 entropy of the quarters in which the customer purchased
    EDate = entropy(purchases["trim"], 2)

    new_data.append([elem, I, Iu, Imax, E, PIL, Mb, Ts, MeanD, MaxD, MinD, EDate])


new_df = pd.DataFrame(
    new_data,
    columns=['CustomerID', 'I', 'Iu', 'Imax', 'E', 'PIL', 'Mb', 'Ts', 'MeanD', 'MaxD', 'MinD', 'EDate'],
)


In [46]:
# Persist the per-customer feature table: tab-separated, comma as decimal mark,
# with the row index written as the first column.
new_df.to_csv(
    path_or_buf='dataset/CS1-dataset.csv',
    index=True,
    decimal=",",
    sep='\t',
)