In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler 

In [2]:
df = pd.read_csv("../input/online-retail-ii-uci/online_retail_II.csv")
df.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085.0,United Kingdom
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1067371 entries, 0 to 1067370
Data columns (total 8 columns):
 #   Column       Non-Null Count    Dtype  
---  ------       --------------    -----  
 0   Invoice      1067371 non-null  object 
 1   StockCode    1067371 non-null  object 
 2   Description  1062989 non-null  object 
 3   Quantity     1067371 non-null  int64  
 4   InvoiceDate  1067371 non-null  object 
 5   Price        1067371 non-null  float64
 6   Customer ID  824364 non-null   float64
 7   Country      1067371 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 65.1+ MB


In [4]:

df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1067371 entries, 0 to 1067370
Data columns (total 8 columns):
 #   Column       Non-Null Count    Dtype         
---  ------       --------------    -----         
 0   Invoice      1067371 non-null  object        
 1   StockCode    1067371 non-null  object        
 2   Description  1062989 non-null  object        
 3   Quantity     1067371 non-null  int64         
 4   InvoiceDate  1067371 non-null  datetime64[ns]
 5   Price        1067371 non-null  float64       
 6   Customer ID  824364 non-null   float64       
 7   Country      1067371 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 65.1+ MB


In [5]:
df.isna().sum()

Invoice             0
StockCode           0
Description      4382
Quantity            0
InvoiceDate         0
Price               0
Customer ID    243007
Country             0
dtype: int64

In [6]:
df.dropna(inplace = True)
print(df.isna().sum())
print(df.shape)

Invoice        0
StockCode      0
Description    0
Quantity       0
InvoiceDate    0
Price          0
Customer ID    0
Country        0
dtype: int64
(824364, 8)


In [7]:
df["Total_Price"] = df["Quantity"] * df["Price"]
df.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country,Total_Price
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom,83.4
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom,81.0
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom,81.0
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085.0,United Kingdom,100.8
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom,30.0


In [8]:
df = df[~df["Invoice"].str.contains("C", na=False)]
df.shape

(805620, 9)

In [9]:
today_date = max(df["InvoiceDate"])

In [10]:

rfm = df.groupby('Customer ID').agg({'InvoiceDate': lambda InvoiceDate: (today_date - InvoiceDate.max()).days,
                                     'Invoice': lambda Invoice: Invoice.nunique(),
                                     'Total_Price': lambda Total_Price: Total_Price.sum()})
rfm.columns = ["recency", "frequency", "monetary"]
rfm.head()

Unnamed: 0_level_0,recency,frequency,monetary
Customer ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
12346.0,325,12,77556.46
12347.0,1,8,5633.32
12348.0,74,5,2019.4
12349.0,18,4,4428.69
12350.0,309,1,334.4


In [11]:
rfm.index.name = None
rfm.head()

Unnamed: 0,recency,frequency,monetary
12346.0,325,12,77556.46
12347.0,1,8,5633.32
12348.0,74,5,2019.4
12349.0,18,4,4428.69
12350.0,309,1,334.4


In [12]:
rfm.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
recency,5881.0,200.457745,209.474135,0.0,25.0,95.0,379.0,738.0
frequency,5881.0,6.287196,13.012879,1.0,1.0,3.0,7.0,398.0
monetary,5881.0,3017.076888,14734.128619,0.0,347.8,897.62,2304.18,608821.65


# K- Means

In [13]:
scaler = StandardScaler().fit(rfm)
rfm_new = scaler.transform(rfm)
rfm_new = pd.DataFrame(rfm_new, columns = ["Recency", "Frequency", "Monetary"], index = rfm.index)
rfm_new.head()

Unnamed: 0,Recency,Frequency,Monetary
12346.0,0.594598,0.439049,5.059391
12347.0,-0.952264,0.131635,0.177579
12348.0,-0.603743,-0.098925,-0.067718
12349.0,-0.871102,-0.175779,0.095814
12350.0,0.518209,-0.406339,-0.182088


In [14]:
kmeans = KMeans(n_clusters = 4, n_init=25, max_iter=300)
k_means = kmeans.fit(rfm_new)

In [15]:
k_means

KMeans(n_clusters=4, n_init=25)

In [16]:
k_means.labels_

array([0, 0, 0, ..., 1, 1, 0], dtype=int32)

In [17]:
segment = k_means.labels_

In [18]:
rfm["segment"] = segment + 1
rfm.head()

Unnamed: 0,recency,frequency,monetary,segment
12346.0,325,12,77556.46,1
12347.0,1,8,5633.32,1
12348.0,74,5,2019.4,1
12349.0,18,4,4428.69,1
12350.0,309,1,334.4,2


In [19]:
rfm["segment"].value_counts()

1    3842
2    2000
4      35
3       4
Name: segment, dtype: int64

In [20]:
rfm.groupby("segment")["monetary"].mean()

segment
1      3008.619160
2       764.479201
3    436835.792500
4     83086.079771
Name: monetary, dtype: float64

In [21]:
!pip install pingouin

Collecting pingouin
  Downloading pingouin-0.5.2.tar.gz (185 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m185.4/185.4 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- done
Collecting pandas_flavor>=0.2.0
  Downloading pandas_flavor-0.3.0-py3-none-any.whl (6.3 kB)
Collecting outdated
  Downloading outdated-0.2.1-py3-none-any.whl (7.5 kB)
Collecting pandas_flavor>=0.2.0
  Downloading pandas_flavor-0.2.0-py2.py3-none-any.whl (6.6 kB)
Collecting littleutils
  Downloading littleutils-0.2.2.tar.gz (6.6 kB)
  Preparing metadata (setup.py) ... [?25l- done
Building wheels for collected packages: pingouin, littleutils
  Building wheel for pingouin (setup.py) ... [?25l- \ | done
[?25h  Created wheel for pingouin: filename=pingouin-0.5.2-py3-none-any.whl size=196206 sha256=4dd3ecdfbb0f147f2d60dbf20c3bd72b816419bbac80be24fe8bc6b16259adb0
  Stored in directory: /root/.cache/pip/wheels/11/5a/63/a6d32fc26

# MANOVA

In [22]:
import pingouin as pg

print(pg.homoscedasticity(rfm, dv = 'recency', group = 'segment', center = 'mean'))
print(pg.homoscedasticity(rfm, dv = 'frequency', group = 'segment', center = 'mean'))
print(pg.homoscedasticity(rfm, dv = 'monetary', group = 'segment', center = 'mean'))

                 W           pval  equal_var
levene  441.280947  1.452788e-258      False
                 W  pval  equal_var
levene  991.928728   0.0      False
                  W  pval  equal_var
levene  2372.233836   0.0      False


In [23]:
from statsmodels.multivariate.manova import MANOVA

In [24]:
model = MANOVA.from_formula("recency + frequency + monetary ~ segment", data = rfm)
print(model.mv_test())

                   Multivariate linear model
                                                                
----------------------------------------------------------------
       Intercept        Value  Num DF   Den DF   F Value  Pr > F
----------------------------------------------------------------
          Wilks' lambda 0.6255 3.0000 5877.0000 1173.1340 0.0000
         Pillai's trace 0.3745 3.0000 5877.0000 1173.1340 0.0000
 Hotelling-Lawley trace 0.5988 3.0000 5877.0000 1173.1340 0.0000
    Roy's greatest root 0.5988 3.0000 5877.0000 1173.1340 0.0000
----------------------------------------------------------------
                                                                
----------------------------------------------------------------
        segment         Value  Num DF   Den DF   F Value  Pr > F
----------------------------------------------------------------
          Wilks' lambda 0.2711 3.0000 5877.0000 5265.9311 0.0000
         Pillai's trace 0.7289 3.0000 5877.00