# Data preparation
In questa sezione estrarremo i dati rilevanti per il clustering dei clienti.

## Import dataset

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the source dataset: tab-separated, "," as the decimal mark, and the
# first column of the file used as the row index.
df = pd.read_csv(
    "dataset/DQ-dataset.csv",
    sep="\t",
    index_col=0,
    decimal=",",
)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 456644 entries, 0 to 541908
Data columns (total 8 columns):
BasketID           456644 non-null int64
BasketDate         456644 non-null object
Sale               456644 non-null float64
CustomerID         456644 non-null object
CustomerCountry    456644 non-null object
ProdID             456644 non-null object
ProdDescr          456644 non-null object
Qta                456644 non-null int64
dtypes: float64(1), int64(2), object(5)
memory usage: 31.4+ MB


## Estrazione nuove features

Estraiamo le feature per ogni utente:
* I: the total number of items purchased by a customer during the period of
observation. (Si intende la somma delle quantità)

* Iu: the number of distinct items bought by a customer in the period of
observation.

* Imax: the maximum number of items purchased by a customer during a
shopping session (Si intende la quantità massima)

* E: the Shannon entropy on the purchasing behaviour of the customer

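Utilizzeremo l'entropia di Shannon indicata nella seguente formula:
$H(X)=-\sum_{i=1}^{n} \frac{m_{i}}{m} \log _{2} \frac{m_{i}}{m}$

dove $m_{i}$ è il numero di occorrenze dell'$i$-esimo valore distinto, $m$ il numero totale di osservazioni e $n$ il numero di valori distinti.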


In [3]:
import numpy as np
def entropy(labels, base=None):
    """Return the Shannon entropy of the empirical distribution of ``labels``.

    Computes ``H = -sum_i (m_i / m) * log_base(m_i / m)``, where ``m_i`` is the
    number of occurrences of the i-th distinct label and ``m = len(labels)``.

    Args:
        labels: Sized collection of labels accepted by ``np.unique``
            (list, array, pandas Series, ...).
        base: Logarithm base. ``None`` selects the natural logarithm.

    Returns:
        The entropy as a float; ``0.0`` when there are fewer than two labels
        or only one distinct label.
    """
    m = len(labels)
    if m <= 1:
        return 0.0

    _, mi = np.unique(labels, return_counts=True)
    if mi.size <= 1:
        # A single distinct label carries no uncertainty.
        return 0.0

    probs = mi / m
    # Counts returned by np.unique are strictly positive, so log(probs) is finite.
    log_probs = np.log(probs)
    if base is not None:
        # Change of base: log_b(p) = ln(p) / ln(b).
        log_probs = log_probs / np.log(base)

    return float(-np.sum(probs * log_probs))

## Gestione Pil

In [4]:
# Load the country/PIL table: ";"-separated, first column used as the row index.
df_pil = pd.read_csv(
    "dataset/Country_PIL.csv",
    sep=";",
    index_col=0,
)

In [5]:
# Map each country name to its PIL value; if a country appears more than once,
# the last row wins.
dict_pil = {
    pil_row["Country"]: pil_row["PIL"]
    for _, pil_row in df_pil.iterrows()
}

## Gestione spesa media Clienti

In [6]:
# Average basket value per customer.
# The line totals are built on a derived frame (``assign`` returns a new frame)
# so the shared ``df`` is not silently given a "Total" column.
temp = df.assign(Total=df["Sale"] * df["Qta"]).drop(columns=["Sale", "Qta"])

# Total value of every (customer, basket) pair, indexed by both keys ...
sum_single_basket = temp.groupby(["CustomerID", "BasketID"])["Total"].sum().to_frame()

# ... then averaged over each customer's baskets.
mean_of_basket = sum_single_basket.groupby(["CustomerID"])["Total"].mean().reset_index()

## Gestione Saldi totali spesi clienti

In [7]:
# Total amount spent by each customer over the whole dataset.
# The line totals are built on a derived frame (``assign`` returns a new frame)
# so the shared ``df`` is not silently given a "Total" column.
temp = df.assign(Total=df["Sale"] * df["Qta"]).drop(columns=["Sale", "Qta"])
total_sum = temp.groupby(["CustomerID"])["Total"].sum().reset_index()

## Gestione distanza acquisti

In [8]:
# One row per basket: the customer and date found on the basket's first line.
# ``drop_duplicates`` keeps the first occurrence of each BasketID in order of
# appearance, which matches scanning ``BasketID.unique()`` and taking ``iloc[0]``
# for each basket, without filtering the whole frame twice per basket.
single_basket_date = (
    df.drop_duplicates(subset="BasketID", keep="first")
    .loc[:, ["BasketID", "CustomerID", "BasketDate"]]
    .reset_index(drop=True)
)

In [9]:
# NOTE(review): rows are ordered by BasketID first, so the per-customer gaps
# below assume BasketID grows with time; confirm against the data.
single_basket_date = single_basket_date.sort_values(
    by=["BasketID", "CustomerID", "BasketDate"]
)
single_basket_date["BasketDate"] = pd.to_datetime(single_basket_date["BasketDate"])

# Whole days elapsed since the same customer's previous basket; a customer's
# first basket has no predecessor and therefore gets NaN.
gap_to_previous = single_basket_date.groupby("CustomerID")["BasketDate"].diff()
single_basket_date["time_diff"] = gap_to_previous.dt.days

In [10]:
# Replace the missing gaps (NaN) in the time_diff column with 0 days.
single_basket_date = single_basket_date.fillna({"time_diff": 0})

In [11]:
# Display the per-basket table (BasketID, CustomerID, BasketDate, time_diff).
single_basket_date

Unnamed: 0,BasketID,CustomerID,BasketDate,time_diff
0,536365,17850.0,2010-12-01 08:26:00,0.0
1,536366,17850.0,2010-12-01 08:28:00,0.0
3,536367,13047.0,2010-12-01 08:34:00,0.0
2,536368,13047.0,2010-12-01 08:34:00,0.0
4,536369,13047.0,2010-12-01 08:35:00,0.0
...,...,...,...,...
19513,581583,13777.0,2011-12-09 12:23:00,8.0
19514,581584,13777.0,2011-12-09 12:25:00,0.0
19515,581585,15804.0,2011-12-09 12:31:00,10.0
19516,581586,13113.0,2011-12-09 12:49:00,14.0


## Entropia orari

In [12]:
# Hour of day of each line. This has to be derived while BasketDate still
# carries the time of day (later cells truncate it to the date).
basket_timestamps = pd.to_datetime(df["BasketDate"])
df["hour"] = basket_timestamps.dt.hour

## Entropia settimana

In [13]:

# Truncate BasketDate to midnight. ``dt.normalize`` gives the same datetime64
# column as the former ``.dt.date`` round trip (whose subtraction of a zero
# timedelta was a no-op) without going through Python date objects.
df["BasketDate"] = pd.to_datetime(df["BasketDate"]).dt.normalize()
df["day_of_week"] = df["BasketDate"].dt.dayofweek  # Monday=0, Sunday=6.

## Entropia mesi

In [14]:

# Truncate BasketDate to midnight. ``dt.normalize`` gives the same datetime64
# column as the former ``.dt.date`` round trip (whose subtraction of a zero
# timedelta was a no-op) without going through Python date objects.
df["BasketDate"] = pd.to_datetime(df["BasketDate"]).dt.normalize()
df["month"] = df["BasketDate"].dt.month  # January=1, December=12.

## Entropia trimestre

In [15]:
# Truncate BasketDate to midnight. ``dt.normalize`` gives the same datetime64
# column as the former ``.dt.date`` round trip (whose subtraction of a zero
# timedelta was a no-op) without going through Python date objects.
df["BasketDate"] = pd.to_datetime(df["BasketDate"]).dt.normalize()
# Calendar quarter of each line, stored as a quarterly period.
df["trim"] = df["BasketDate"].dt.to_period("Q")

## Creazione nuovo dataset

In [16]:
# Build one feature row per customer.
#
# Each customer's lines are obtained once, through a single groupby pass,
# instead of re-filtering the whole frame for every feature. ``sort=False``
# yields customers in order of first appearance, i.e. the same order as
# ``df.CustomerID.unique()``.
#
# NOTE(review): time_diff has the missing gap of each customer's first basket
# filled with 0, so MinD and MeanD include that 0; confirm this is intended.

# Per-customer lookups, indexed by CustomerID.
basket_mean_by_customer = mean_of_basket.set_index("CustomerID")["Total"]
total_by_customer = total_sum.set_index("CustomerID")["Total"]
gap_stats_by_customer = (
    single_basket_date.groupby("CustomerID", sort=False)["time_diff"]
    .agg(["mean", "max", "min"])
    # Customers with no row in single_basket_date get NaN statistics.
    .reindex(total_by_customer.index)
)

new_data = []
for elem, customer_rows in df.groupby("CustomerID", sort=False):
    quantities = customer_rows["Qta"]
    products = customer_rows["ProdID"]

    I = quantities.sum()              # total quantity purchased
    Iu = len(products.unique())       # number of distinct products
    Imax = quantities.max()           # largest quantity on a single line
    E = entropy(products, 2)          # entropy over purchased products

    # PIL indicator of the customer's country (taken from their first line)
    country = customer_rows["CustomerCountry"].iloc[0]
    PIL = dict_pil[country]

    # Mean basket value and total amount spent
    Mb = basket_mean_by_customer.at[elem]
    Ts = total_by_customer.at[elem]

    # Mean / max / min number of days between consecutive baskets
    gap_stats = gap_stats_by_customer.loc[elem]
    MeanD = gap_stats["mean"]
    MaxD = gap_stats["max"]
    MinD = gap_stats["min"]

    # Entropy of the purchase timing at several granularities
    Etrim = entropy(customer_rows["trim"], 2)
    Emonth = entropy(customer_rows["month"], 2)
    Eday_of_week = entropy(customer_rows["day_of_week"], 2)
    Ehour = entropy(customer_rows["hour"], 2)

    new_data.append([elem, I, Iu, Imax, E, PIL, Mb, Ts, MeanD, MaxD, MinD, Etrim, Emonth, Eday_of_week, Ehour])


new_df = pd.DataFrame(new_data, columns=['CustomerID', 'I', 'Iu', 'Imax','E','PIL', 'Mb', 'Ts','MeanD', 'MaxD','MinD', 'Etrim','Emonth','Eday_of_week','Ehour'])


In [17]:
# Write the per-customer feature table: tab-separated, "," as the decimal mark,
# row index included.
new_df.to_csv(
    "dataset/CL_Raw-dataset.csv",
    sep="\t",
    index=True,
    decimal=",",
)