# Loading Data and Preprocessing

In [None]:
%%capture
# Execute the preprocessing notebook inside this kernel; %%capture suppresses
# whatever that notebook prints or displays.
# NOTE(review): presumably this is what brings `df_proc` (and names such as
# `pd` and `StandardScaler` used further down) into scope — confirm against
# preprocessing.ipynb.
%run "preprocessing.ipynb"

In [None]:
# Self-assignment with no runtime effect. `df_proc` is never defined in this
# notebook (presumably it is created by the %run of preprocessing.ipynb above),
# so this line appears to exist only to quiet "undefined name" warnings from
# static analysis tools.
df_proc = df_proc # type: ignore

# RFMVD Score

R(Recency): How recently did the customer purchase?

F(Frequency): How often do they purchase?

M(Monetary): How much do they spend?

V(Variety): How many distinct products did the customer purchase?

D(Duration): How much time passes, on average, between two consecutive transactions?

## RFM

Recency


In [None]:
# Reference point: the latest invoice timestamp present in the data.
# TODO [REVISE]: an older version of this code parsed the dates with
# format='%d-%m-%Y %H:%M'; no explicit parsing is done here.
max_date = df_proc['InvoiceDate'].max()
df_proc['Diff'] = max_date - df_proc['InvoiceDate']

# The smallest gap per customer is the time since their most recent row,
# expressed in whole days.
rfm_r = (
    df_proc.groupby('Customer ID')['Diff']
    .min()
    .dt.days
    .rename('Recency')
    .reset_index()
)
rfm_r.head(1)

Unnamed: 0,Customer ID,Recency
0,12346.0,528


In [None]:
# Frequency: number of non-null 'Invoice' entries per customer.
# NOTE(review): .count() counts rows of df_proc, not distinct invoices —
# confirm this is the intended definition of frequency.
rfm_f = (
    df_proc.groupby('Customer ID')['Invoice']
    .count()
    .rename('Frequency')
    .reset_index()
)
rfm_f.head(1)

Unnamed: 0,Customer ID,Frequency
0,12346.0,33


In [None]:
# Monetary: per-row total (Price x Quantity), summed per customer.
df_proc['Total'] = df_proc['Price'].mul(df_proc['Quantity'])
rfm_m = (
    df_proc.groupby('Customer ID', as_index=False)['Total']
    .sum()
    .rename(columns={'Total': 'Monetary'})
)
rfm_m.head(1)

Unnamed: 0,Customer ID,Monetary
0,12346.0,372.86


In [None]:
# Combine the three per-customer tables into one RFM frame; inner joins keep
# only customers present in all of them.
rfm = (
    rfm_r
    .merge(rfm_f, on='Customer ID', how='inner')
    .merge(rfm_m, on='Customer ID', how='inner')
)
rfm.head(1)

Unnamed: 0,Customer ID,Recency,Frequency,Monetary
0,12346.0,528,33,372.86


## VD

Variety

In [None]:
def div(x: pd.Series):
    """Return the number of distinct non-null values in ``x``."""
    return x.nunique()

# Variety: distinct 'StockCode' values per customer, computed with `div`.
rfm_v = (
    df_proc.groupby('Customer ID')['StockCode']
    .agg(div)
    .reset_index(name='Variety')
)
rfm_v.head(1)

Unnamed: 0,Customer ID,Variety
0,12346.0,26


In [None]:
# Duration
import statistics

def duration(x: pd.Series):
    """Mean gap, in whole days, between consecutive timestamps in ``x``.

    Parameters
    ----------
    x : pd.Series
        Timestamps of one customer's rows. Values are passed through
        ``pd.to_datetime(..., format='%d-%m-%Y %H:%M')``, so strings in that
        format are accepted.

    Returns
    -------
    float or int
        The average of the per-gap day counts, or ``0`` when fewer than two
        timestamps are available (no gap can be formed).

    Notes
    -----
    * Every consecutive pair is used, including the last one; the previous
      loop bound ``range(1, len(x) - 1)`` skipped the final gap and always
      returned 0 for customers with exactly two timestamps.
    * Timestamps are sorted first so the gaps are chronological regardless of
      the row order of the input.
    * Each gap is truncated to its whole-day component before averaging.
    * NOTE(review): if ``x`` holds one timestamp per line item rather than
      per invoice, rows of the same invoice contribute zero-day gaps —
      confirm this is intended.
    """
    stamps = pd.to_datetime(x, format='%d-%m-%Y %H:%M').sort_values()

    # TODO: revisit what value should represent "no gaps available".
    if len(stamps) < 2:
        return 0

    # diff() yields NaT for the first element (and around any missing
    # timestamps); drop those before taking the day component.
    gaps = stamps.diff().dropna().dt.days
    return float(gaps.mean()) if len(gaps) else 0

# Duration: apply `duration` to each customer's 'InvoiceDate' values.
rfm_d = (
    df_proc.groupby('Customer ID')['InvoiceDate']
    .agg(duration)
    .reset_index(name='Duration')
)
rfm_d.head(1)


Unnamed: 0,Customer ID,Duration
0,12346.0,6.193548


In [None]:
# Creating the RFMVD data frame: join Variety and Duration, then attach them
# to the RFM table (inner joins on 'Customer ID').
vd = rfm_v.merge(rfm_d, on='Customer ID', how='inner')
rfmvd = rfm.merge(vd, on='Customer ID', how='inner')
rfmvd.head(1)

Unnamed: 0,Customer ID,Recency,Frequency,Monetary,Variety,Duration
0,12346.0,528,33,372.86,26,6.193548


In [None]:
# Pairwise Pearson correlation across every column of rfmvd.
# NOTE(review): 'Customer ID' is part of rfmvd and therefore appears in the
# matrix even though it is an identifier rather than a feature.
rfmvd.corr(method='pearson')

Unnamed: 0,Customer ID,Recency,Frequency,Monetary,Variety,Duration
Customer ID,1.0,0.028031,0.009918,-0.040094,0.007932,-0.005308
Recency,0.028031,1.0,-0.214235,-0.227478,-0.321802,-0.10389
Frequency,0.009918,-0.214235,1.0,0.78831,0.867775,-0.081312
Monetary,-0.040094,-0.227478,0.78831,1.0,0.717126,-0.065134
Variety,0.007932,-0.321802,0.867775,0.717126,1.0,-0.126965
Duration,-0.005308,-0.10389,-0.081312,-0.065134,-0.126965,1.0


## Scaling

In [None]:
# Standardise the five RFMVD features; 'Customer ID' is left out of the
# selection. The scaled values are wrapped back into a DataFrame carrying the
# same column labels as the selected features.
scaler = StandardScaler()
rfmvd_tmp = rfmvd[['Recency', 'Frequency', 'Monetary', 'Variety', 'Duration']]
rfmvd_scaled = pd.DataFrame(
    scaler.fit_transform(rfmvd_tmp),
    columns=list(rfmvd_tmp.columns),
)
rfmvd_scaled.head(1)

Unnamed: 0,Recency,Frequency,Monetary,Variety,Duration
0,1.569937,-0.281706,-0.309073,-0.459605,0.073869
