In [4]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt

from sklearn import cluster, metrics, decomposition, preprocessing

import plotly.express as px
import plotly.graph_objects as go

# Import des modules contenant les fonctions utilitaires
import src.data_helpers as dth

In [2]:
# Plot settings
# The seaborn style sheets were renamed with a 'seaborn-v0_8-' prefix in
# matplotlib 3.6 and the old names were removed in 3.8. Use whichever name
# this installation provides so the cell runs on both old and new versions,
# and fall back to the default style if neither is available.
_whitegrid_style = next(
    (
        style_name
        for style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
        if style_name in plt.style.available
    ),
    'default',
)
plt.style.use(_whitegrid_style)

plt.rc('font', size=12)
plt.rc('axes', titlesize=20)
plt.rc('axes', labelsize=20)
plt.rc('xtick', labelsize=12)
plt.rc('ytick', labelsize=12)
plt.rc('legend', fontsize=12)

# Figure size tuple kept as a notebook-level setting.
dims_fig = (25,20)

In [62]:
# Load the merged orders table from the comma-separated file in the working directory.
orders_merge = pd.read_csv(
    'merge.csv',
    sep=',',
)

In [63]:
# Build one row per unique customer with three aggregates:
#   recence        -> latest purchase timestamp (still a raw value at this point)
#   frequence      -> number of non-null customer_id rows
#   montant_cumulé -> total of payment_value
rfm = (
    orders_merge
    .groupby('customer_unique_id')
    .agg(**{
        "recence": pd.NamedAgg(column="order_purchase_timestamp", aggfunc="max"),
        "frequence": pd.NamedAgg(column="customer_id", aggfunc="count"),
        "montant_cumulé": pd.NamedAgg(column="payment_value", aggfunc="sum"),
    })
)

In [64]:
date_format = '%Y-%m-%d %H:%M:%S'

# Convert the latest-purchase timestamp of each customer into a whole number
# of days elapsed before the most recent purchase in the dataset.
# The guard keeps the cell safe to re-run: once 'recence' already holds day
# counts it must not be parsed as timestamps a second time.
if not pd.api.types.is_numeric_dtype(rfm['recence']):
    # Vectorised parsing instead of one datetime.strptime call per row.
    purchase_ts = pd.to_datetime(rfm['recence'], format=date_format)
    latest_purchase = purchase_ts.max()
    # Kept as a plain datetime, as before, in case later cells use it.
    max_date = latest_purchase.to_pydatetime()
    rfm['recence'] = (latest_purchase - purchase_ts).dt.days
rfm

Unnamed: 0_level_0,recence,frequence,montant_cumulé
customer_unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0000366f3b9a7992bf8c76cfdf3221e2,160,1,141.90
0000b849f77a49e4a4ce2b2a4ca5be3f,163,1,27.19
0000f46a3911fa3c0805444483337064,585,1,86.22
0000f6ccb0745a6a4b88665a16c9f078,369,1,43.62
0004aac84e0df4da2b147fca70cf8255,336,1,196.89
...,...,...,...
fffcf5a5ff07b0908bd4e2dbc735a684,495,1,2067.42
fffea47cd6d3cc0a88bd621562a9d061,310,1,84.58
ffff371b4d645b6ecea244b27531430a,617,1,112.46
ffff5962728ec6157033ef9805bacc48,168,1,133.69


In [65]:
# Snapshot t0: keep only the customers whose 'recence' is strictly above 500.
f0 = rfm.loc[rfm['recence'].gt(500)]
f0

Unnamed: 0_level_0,recence,frequence,montant_cumulé
customer_unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0000f46a3911fa3c0805444483337064,585,1,86.22
0005e1862207bf6ccc02e4228effd9a0,591,1,150.12
00115fc7123b5310cf6d3a3aa932699e,633,1,76.11
0011805441c0d1b68b48002f1d005526,541,1,297.14
00191a9719ef48ebb5860b130347bf33,546,1,58.86
...,...,...,...
ffebb6424578e7bb153322da9d65634f,639,1,665.70
ffedff0547d809c90c05c2691c51f9b7,566,1,32.42
ffef0ffa736c7b3d9af741611089729b,505,1,139.07
ffff371b4d645b6ecea244b27531430a,617,1,112.46


In [66]:
# Fit a standard scaler on the t0 snapshot, then rebuild a DataFrame of the
# scaled values with the same column names (the index is not carried over).
s0 = preprocessing.StandardScaler()
s0.fit(f0)
f0_std = pd.DataFrame(data=s0.transform(f0), columns=f0.columns)

In [67]:
# K-means with 5 clusters on the standardised t0 snapshot.
# The centroid initialisation is random; a fixed seed makes the labels, and
# every score derived from them, reproducible across kernel restarts.
m0 = cluster.KMeans(n_clusters=5, random_state=0)
c0 = m0.fit_predict(f0_std)

In [68]:
# Snapshot t1: same filter as t0 with the threshold lowered to 470.
f1 = rfm.loc[rfm['recence'].gt(470)]

In [69]:
# Fit a separate scaler on the t1 snapshot and store the scaled values in a
# DataFrame that reuses the column names of f1.
s1 = preprocessing.StandardScaler()
s1.fit(f1)
f1_std = pd.DataFrame(data=s1.transform(f1), columns=f1.columns)

In [70]:
# K-means with 5 clusters refitted on the standardised t1 snapshot.
# A fixed seed keeps this fit reproducible from one run to the next.
m1 = cluster.KMeans(n_clusters=5, random_state=0)
c1 = m1.fit_predict(f1_std)

In [71]:
# Scale the t1 snapshot with the scaler fitted at t0, so the t0 model can be
# applied to it in the feature space it was trained on.
f1_0_std = pd.DataFrame(data=s0.transform(f1), columns=f1.columns)

In [72]:
# Labels that the t0 model assigns to the t1 customers.
c1_0 = m0.predict(X=f1_0_std)

In [73]:
# Adjusted Rand index between the t0-model labels and the refitted t1 labels.
a = metrics.adjusted_rand_score(labels_true=c1_0, labels_pred=c1)
a

0.8204920491718937

In [49]:
def _fit_snapshot(frame, n_clusters, random_state):
    """Standardise ``frame`` and cluster it; return ``(scaler, model, labels)``."""
    scaler = preprocessing.StandardScaler().fit(frame)
    frame_std = pd.DataFrame(
        scaler.transform(frame),
        columns=frame.columns
    )
    model = cluster.KMeans(n_clusters, random_state=random_state)
    labels = model.fit_predict(frame_std)
    return scaler, model, labels


def calculateAriScore(data, time, start=500, n_clusters=5, random_state=None):
    """Compare K-means segmentations of successive snapshots of ``data``.

    For each threshold ``i`` in ``range(start, time, -time)``:

    * snapshot t0 is ``data[data['recence'] > i]``;
    * snapshot t1 is ``data[data['recence'] > i - time]``;
    * a scaler and a K-means model are fitted on each snapshot;
    * the t0 scaler and model are applied to the t1 rows, and the resulting
      labels are compared with the labels of the model fitted on t1 using the
      adjusted Rand index.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``'recence'`` column used for filtering. Every column is
        passed to the scaler and to K-means.
    time : int
        Step between two snapshots, in the unit of ``'recence'``. Must be
        strictly positive.
    start : int, default 500
        First (largest) threshold evaluated.
    n_clusters : int, default 5
        Number of clusters for every K-means fit.
    random_state : int, RandomState instance or None, default None
        Seed forwarded to ``KMeans``. Pass an integer for reproducible scores.

    Returns
    -------
    list of float
        One adjusted Rand index per threshold, in iteration order. Empty when
        ``start <= time``.

    Raises
    ------
    ValueError
        If ``time`` is not strictly positive.
    """
    if time <= 0:
        # A negative step would silently produce an empty range.
        raise ValueError("time must be a strictly positive number of days")

    result = []
    previous_fit = None
    for i in range(start, time, -time):
        # t0: fitted once for the first threshold. Afterwards the fit made
        # for t1 in the previous iteration is reused, because it was computed
        # on exactly this subset (recence > i).
        if previous_fit is None:
            previous_fit = _fit_snapshot(
                data[data['recence'] > i], n_clusters, random_state
            )
        s0, m0, _ = previous_fit

        # t1 = t0 + n days
        f1 = data[data['recence'] > i - time]
        s1, m1, c1 = _fit_snapshot(f1, n_clusters, random_state)

        # Comparison between t0 and t1: label the t1 rows with the t0
        # scaler and model.
        f1_0_std = pd.DataFrame(
            s0.transform(f1),
            columns=f1.columns
        )
        c1_0 = m0.predict(f1_0_std)

        result.append(metrics.adjusted_rand_score(c1_0, c1))

        previous_fit = (s1, m1, c1)

    return result

In [75]:
# Stability scores for the RFM table, using a 15-day step between snapshots.
result = calculateAriScore(data=rfm, time=15)
result

[0.8677655440731991,
 0.9008776516499173,
 0.8865086340292029,
 0.8909575496339828,
 0.8367586564012435,
 0.839126664087998,
 0.8143869054355705,
 0.8702638326256623,
 0.562460090558126,
 0.8889118826226364,
 0.9260396336512854,
 0.8547598869852064,
 0.8669075399085452,
 0.948903160672579,
 0.912736899252271,
 0.9225627783753885,
 0.9193919778302355,
 0.9206745852266548,
 0.9298768330433543,
 0.9350035868729845,
 0.9415923753545058,
 0.9475057505539635,
 0.9508058928095304,
 0.9495721214133882,
 0.9566203131419577,
 0.9628871157796226,
 0.9804142253569919,
 0.9661121137394848,
 0.41684339983437346,
 0.9233294238124661,
 0.9932305369564153,
 0.9978505952236336,
 0.9908512514160511]