# Data preparation
In questa sezione estrarremo i dati rilevanti per il clustering dei clienti.

## Import dataset

In [161]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [162]:
df = pd.read_csv("dataset/DQ-dataset.csv", sep="\t",index_col=0, decimal=",")
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 456644 entries, 0 to 541908
Data columns (total 8 columns):
BasketID           456644 non-null int64
BasketDate         456644 non-null object
Sale               456644 non-null float64
CustomerID         456644 non-null object
CustomerCountry    456644 non-null object
ProdID             456644 non-null object
ProdDescr          456644 non-null object
Qta                456644 non-null int64
dtypes: float64(1), int64(2), object(5)
memory usage: 31.4+ MB


## Estrazione nuove features

Estraiamo le feature per ogni utente:
* I: the total number of items purchased by a customer during the period of
observation.(Si intende la somma delle quantità)

* Iu: the number of distinct items bought by a customer in the period of
observation.

* Imax: the maximum number of items purchased by a customer during a
shopping session (Si intende la quantità massima)

* E: the Shannon entropy on the purchasing behaviour of the customer

Utilizzeremo la shanno entropy indicata nella seguente formula:
$H(X)=-\sum_{i=1}^{n} \frac{m_{i}}{m} \log _{2} \frac{m_{i}}{m}$


In [163]:
import numpy as np
def entropy(labels, base=None):

  m = len(labels)
  ent = 0.
  base = e if base is None else base

  if m <= 1:
    return 0

  _,mi = np.unique(labels, return_counts=True)
  probs = mi / m

  n_classes = np.count_nonzero(probs)
  if n_classes <= 1:
    return 0

  
  for p in probs:
    ent -= p * np.math.log(p, base)

  return ent

## Gestione Pil

In [164]:
df_pil = pd.read_csv("dataset/Country_PIL.csv", sep=";",index_col=0)

In [165]:
dict_pil={}
for index, row in df_pil.iterrows():
    dict_pil[row["Country"]]=row["PIL"]

## Gestione spesa media Clienti

In [166]:
temp = df
temp["Total"] = temp["Sale"]*temp["Qta"]
temp = temp.drop(["Sale"],axis=1)
temp = temp.drop(["Qta"],axis=1)
sum_single_basket=pd.DataFrame(temp.groupby(["CustomerID","BasketID"])["Total"].sum())
mean_of_basket=sum_single_basket.groupby(["CustomerID"])["Total"].mean().reset_index()

## Gestione Saldi totali spesi clienti

In [167]:
temp = df
temp["Total"] = temp["Sale"]*temp["Qta"]
temp = temp.drop(["Sale"],axis=1)
temp = temp.drop(["Qta"],axis=1)
total_sum=temp.groupby(["CustomerID"])["Total"].sum().reset_index()

## Gestione distanza acquisti

In [168]:
temp=df.copy()

single_basket_date=[]
for elem in temp.BasketID.unique():
    data=temp[temp["BasketID"]==elem].BasketDate.iloc[0]
    cust=temp[temp["BasketID"]==elem].CustomerID.iloc[0]
    single_basket_date.append([elem,cust,data])

single_basket_date = pd.DataFrame(single_basket_date, columns=["BasketID","CustomerID","BasketDate"])

In [169]:
difference_dataframe=single_basket_date
difference_dataframe["BasketDate"]=pd.to_datetime(difference_dataframe["BasketDate"])
difference_dataframe["previousDate"]=difference_dataframe.BasketDate.shift()
difference_dataframe["difference_days"]=difference_dataframe["BasketDate"]-difference_dataframe["previousDate"]
difference_dataframe["difference_days"]=difference_dataframe["difference_days"].apply(lambda x: x.days)

## Creazione nuovo dataset

In [172]:
new_data=[]
for elem in df.CustomerID.unique():
    I=df[df["CustomerID"]==elem].Qta.sum()
    Iu=len(df[df["CustomerID"]==elem].ProdID.unique())
    Imax=df[df["CustomerID"]==elem].Qta.max()
    E=entropy(df[df["CustomerID"]==elem].ProdID,2)
    
    #Pil Indicator
    country=df[df["CustomerID"]==elem].CustomerCountry.iloc[0]
    PIL = dict_pil[country]

    # Basket Mean
    Mb=mean_of_basket[mean_of_basket["CustomerID"]==elem].Total.iloc[0]

    # Total Sale
    Ts=total_sum[total_sum["CustomerID"]==elem].Total.iloc[0]

    # Mean difference dates
    Md=single_basket_date[difference_dataframe["CustomerID"]==elem].difference_days.mean()


    new_data.append([elem, I, Iu,Imax,E, PIL,Mb, Ts, Md])

    

    #print("Customer id:\t"+str(elem))
    #print("l:\t"+str(i))
    #print("lu:\t"+str(iu))
    #print("lmax:\t"+str(imax))
    #print("E:\t"+str(E))
new_df = pd.DataFrame(new_data, columns=['CustomerID', 'I', 'Iu', 'Imax','E','PIL', 'Mb', 'Ts','Md'])


In [173]:
new_df[new_df["Md"]!=0]

Unnamed: 0,CustomerID,I,Iu,Imax,E,PIL,Mb,Ts,Md
18,13408.0,16172,233,192,7.571949,2567,452.290968,28042.04,0.016129
89,18229.0,2488,66,100,5.688924,2567,363.845,7276.9,0.05
181,13798.0,23804,113,432,6.274834,2567,647.552456,36910.49,0.017544
234,17819.0,1031,36,50,4.912782,2567,529.267143,3704.87,0.142857
253,17243.0,3365,179,96,7.019719,2567,333.019231,8658.5,0.346154
254,15465.0,1257,43,120,4.963964,2567,408.59,7354.62,0.388889
265,16839.0,8874,107,144,6.328929,2567,482.136471,16392.64,0.029412
275,14810.0,1016,106,14,6.406077,2567,189.807273,2087.88,0.090909
276,16353.0,2006,51,96,5.062961,2567,290.248261,6675.71,0.043478
338,12708.0,1292,85,25,6.166405,3570,244.291,2442.91,0.1


In [174]:
new_df.to_csv('dataset/_temp_CS-dataset.csv', sep='\t', index=False, decimal=",")