In [1]:
from tmall_utils import *  # 引用寫好的 function
from collections import Counter
import datetime as dt
import json
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
import pathlib
from scipy import stats
import seaborn as sns
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')



In [2]:
# Variables
dataset_folderpath = 'TMall_dataset/'
preprocessed_folderpath = 'TMall_preprocessed/'
output_folderpath = 'TMall_output/'
tmall_log_filepath = 'TMall_for_user_states_define_transformed.pkl'
produce_date = str(dt.date.today()).replace('-', '')
start_day = 178
end_day = 184
label = 'none'
check_best_centers = False  # {True, False}
check_cluster_transition = False  # {True, False}
cluster_method = 'hcut'  # {'hcut', 'kmeans'}
center = 8  # center ≥ 2
OM_version = 'OMstran'  # {'OM', 'OMloc', 'OMslen', 'OMspell', 'OMstran'}
sm_method = 'TRATE'
indel_method = 'auto'
version = f'V3.2-duration_{start_day}_{end_day}-label_{label}'
state_sequence_output_filepath = f'TMall_user_state_sequence_table_{produce_date}_{version}.csv'
action_count_output_filepath = f'action_counts-{produce_date}_{version}.csv'
file_name = f'{produce_date}_V3.2-duration_{start_day}_{end_day}-label_{label}'


#### 創建資料夾及載入資料集

In [10]:
# 若本地端沒有該資料夾則則創建
for path in [preprocessed_folderpath, output_folderpath]:
    path = pathlib.Path(path)
    path.mkdir(parents=True, exist_ok=True)

# 載入資料集
df = pd.read_pickle(dataset_folderpath + tmall_log_filepath)

# 排除 11/12 的少量流量，統一將雙 11 當天作為最後一天
df = df[df['time_stamp']<=1111]
print('Shape of tmall web log dataset: ', df.shape)
df.head()
print('Show proportion of user features: ')
for col in ['age_range', 'gender']:
    print(df.groupby([col]).size())


Shape of tmall web log dataset:  (54925284, 17)


Unnamed: 0,user_id,item_id,cat_id,merchant_id,brand_id,time_stamp,age_range,gender,click,add_to_cart,purchase,add_to_favorite,day,month,dayOfWeek,day_stamp,weekOfYear
0,328862,323294,833,2882,2661,829,6.0,1.0,1,0,0,0,29,8,4,110,34
1,328862,844400,1271,2882,2661,829,6.0,1.0,1,0,0,0,29,8,4,110,34
2,328862,575153,1271,2882,2661,829,6.0,1.0,1,0,0,0,29,8,4,110,34
3,328862,996875,1271,2882,2661,829,6.0,1.0,1,0,0,0,29,8,4,110,34
4,328862,1086186,1271,1253,1049,829,6.0,1.0,1,0,0,0,29,8,4,110,34


Show proportion of user features: 
age_range
0.0     9931159
1.0        1721
2.0     5385015
3.0    14976839
4.0    11802045
5.0     6199995
6.0     5413712
7.0     1052264
8.0      162534
dtype: int64
gender
0.0    40734644
1.0    12135516
2.0     2055124
dtype: int64


### 資料前處理與特徵工程

#### 研究對象用戶抽樣
- 根據年齡層抽樣


In [11]:
# 依據年齡層抽樣用戶
user_with_agerange = df[['user_id', 'age_range']].drop_duplicates()
agerange_proportion = user_with_agerange.groupby(['age_range']).size().reset_index().rename(columns={0: 'count'})
agerange_proportion['percentage'] = agerange_proportion['count'] / agerange_proportion['count'].sum()
user_with_agerange = pd.merge(user_with_agerange, agerange_proportion[['age_range', 'percentage']], on='age_range', how='left')
sampled_user_with_agerange = user_with_agerange.sample(frac=0.01, weights='percentage')
print(sampled_user_with_agerange.shape)
sampled_user_with_agerange.head()
user_list = sampled_user_with_agerange.user_id.unique()
sampled_df = df[df['user_id'].isin(user_list)].sort_values('day_stamp').reset_index()
sampled_df = sampled_df.drop(['index'], axis=1)

# 儲存抽樣後用戶備用
sampled_df.to_csv(preprocessed_folderpath + f'Tmall-sampledData_agerange-{produce_date}_{version}.csv', index=False)

# 僅篩選抽樣用戶行為紀錄使用
sampled_df = sampled_df[(sampled_df['day_stamp'] >= start_day) & (sampled_df['day_stamp'] <= end_day)]


(4242, 3)


Unnamed: 0,user_id,age_range,percentage
234853,177050,0.0,0.219049
25003,228403,3.0,0.268456
290620,89261,0.0,0.219049
119369,43439,5.0,0.096134
182143,123529,3.0,0.268456


#### 將物流進程日誌轉換為狀態序列

In [12]:
# 以字典儲存抽樣用戶的每日各項行為次數
accum_by_timestamp = user_daily_behavior_todict(user_list=user_list, sampled_df=sampled_df, start_day=start_day, end_day=end_day)
with open(preprocessed_folderpath + 'TMall_user_counts_{}_{}.json'.format(produce_date, version), 'w') as fp:
    json.dump(accum_by_timestamp, fp)


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4242/4242 [00:46<00:00, 91.79it/s]


In [13]:
# 測試能否順利載入
with open(preprocessed_folderpath + 'TMall_user_counts_{}_{}.json'.format(produce_date, version), 'r') as f:
    tmall_user_count_stats = json.load(f)
# print(tmall_user_count_stats.keys())


In [14]:
# 將字典中所有抽樣用戶每日各項行為次數，轉換為狀態序列矩陣
user_daily_states = user_daily_state_tolist(user_action_dict=tmall_user_count_stats)


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4242/4242 [00:13<00:00, 308.07it/s]


In [23]:
# 儲存用戶狀態序列為 .csv
print(produce_date, version)

columns = ['user_id'] + ['day_' + str(x) for x in range(start_day, end_day + 1)]
user_states_table = pd.DataFrame(user_daily_states, columns=columns)
user_states_table.to_csv(preprocessed_folderpath + state_sequence_output_filepath, index=False)
user_states_table.head()


20210715 V3.2-duration_178_184-label_none


Unnamed: 0,user_id,day_178,day_179,day_180,day_181,day_182,day_183,day_184
0,177050,no_browse,browse_to_add_to_consider,no_browse,no_browse,browse,browse_to_purchase,browse_to_purchase
1,228403,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,browse_to_purchase
2,89261,no_browse,no_browse,no_browse,no_browse,no_browse,browse,browse_to_purchase
3,43439,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,browse_to_purchase
4,123529,browse_to_add_to_consider,browse,no_browse,directly_add_to_consider,no_browse,no_browse,directly_purchase


In [21]:
# 計算每位用戶行為事件次數累計
action_types = list(set(user_states_table.iloc[:, 1:user_states_table.shape[1]].values.reshape(-1)))
seq_agg_count_list = user_states_table.iloc[:, 1:user_states_table.shape[1]].apply(count_action_num, axis=1).values
for _id in tqdm(range(0, len(seq_agg_count_list))):
    if _id == 0:
        df_action_agg = pd.DataFrame([seq_agg_count_list[_id]])
    else:
        df_action_agg = pd.concat([df_action_agg, pd.DataFrame([seq_agg_count_list[_id]])])
        
df_action_agg['user_id'] = user_states_table['user_id'].values
df_action_agg = df_action_agg.reset_index().drop(['index'], axis=1)
df_action_agg.to_csv(preprocessed_folderpath + action_count_output_filepath, index=False)
print(df_action_agg.shape)
df_action_agg.head()


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4242/4242 [00:03<00:00, 1210.65it/s]

(4242, 7)





Unnamed: 0,browse,browse_to_add_to_consider,browse_to_purchase,directly_add_to_consider,directly_purchase,no_browse,user_id
0,1,1,2,0,0,3,177050
1,0,0,1,0,0,6,228403
2,1,0,1,0,0,5,89261
3,0,0,1,0,0,6,43439
4,1,1,0,1,1,3,123529


In [22]:
# 檢視用戶狀態分布比例
states_count_dict = dict(Counter(user_states_table.iloc[:, 1:user_states_table.shape[1]].values.reshape(-1).tolist()))
states_count_df = pd.DataFrame([list(states_count_dict.keys()), list(states_count_dict.values())]).T
states_count_df.columns = ['action', 'count']
states_count_df.to_csv(preprocessed_folderpath + f'TMall_user_state-count_{produce_date}_{version}.csv', index=False)
states_count_df


Unnamed: 0,action,count
0,no_browse,19392
1,browse_to_add_to_consider,1677
2,browse,3387
3,browse_to_purchase,4424
4,directly_add_to_consider,420
5,directly_purchase,394


### Optimal Matching Analysis (using R)


#### 載入資料

```r
library('TraMineR')
library('cluster')
library('factoextra')
library('NbClust')
library('caret')
library('RcmdrMisc')
library('ggplot2')

# Variables
case_name = 'Tmall'
preprocess_folderpath <- sprintf('%s/%s_preprocessed', case_name, case_name)
output_folderpath <- sprintf('%s/%s_output', case_name, case_name)
produce_date <- '20210715'
start_day <- 178
end_day <- 184
label <- 'none'

# Assign case name
if (case_name == 'TMall') {
  file_name <- sprintf('%s_V3.2-duration_%s_%s-label_%s', produce_date, start_day, end_day, label)
  data <- read.csv(sprintf('%s/TMall_user_state_sequence_table_%s.csv', preprocess_folderpath, file_name))
  action_counts <- read.csv(sprintf('%s/action_counts-%s.csv', preprocess_folderpath, file_name))
  colorset <- c("gray", "forestgreen", "darkorange3", "green", "gold", "white")
} else {
  file_name <- sprintf('%s-sentday_%s', produce_date, sent_day)
  data <- read.csv(sprintf('%s/order_logistic_states-%s.csv', preprocess_folderpath, file_name))
  colorset <- c("white","aquamarine3","azure1","azure2","dodgerblue1","aquamarine1","azure3","aquamarine4","green","green3")
}

```

#### 計算狀態轉移矩陣、置換/增刪成本、相異度矩陣


```r
# Check if missing values exist
data[is.na(data)]

# Sequence Formatting
data.seq <- seqdef(data, 2:dim(data)[2], xtstep=6)

# Transition rates between states
data.trate <- round(seqtrate(data.seq, time.varying = FALSE), 2)
View(data.trate)
write.csv(data.trate, sprintf('%s/transition_rate_%s.csv', preprocess_folderpath, file_name))

# Substitution Cost & Indel Cost
data.seq.cost <- seqcost(data.seq, method='TRATE')
write.csv(data.seq.cost$sm, sprintf('%s/substitution_cost_matrix_%s.csv', preprocess_folderpath, file_name))

# Optimal Matching: computing distances between sequences
data.om <- seqdist(data.seq, method = 'OMstran', sm = 'TRATE', indel = "auto")  # method = {'OM', 'OMloc', 'OMslen', 'OMspell', 'OMstran'}

# output - dissimilarity matrix
write.csv(data.om, 
          sprintf('%s/dissimilarity_matrix-%s-seqdist_%s-sm_%s-indel_%s.csv', 
                  preprocess_folderpath, file_name, 'OMstran', 'TRATE', 'auto'), 
          row.names = FALSE)

```



### 分群分析


#### 搜尋最佳群數 (R)

```r
# Find optimal number of clusters, method = "silhouette", "wss"
fviz_nbclust(data.om, FUNcluster = hcut, method = "silhouette", k.max = 8, print.summary = TRUE)
```


#### 搜尋最佳群數 (Python)

In [None]:
if check_best_centers:
    cluster_methods = ['hcut', 'kmeans']
    OM_versions = ['OM', 'OMloc', 'OMslen', 'OMspell', 'OMstran']

    print('Start checking for best cluster center number...')
    for cluster_method in cluster_methods:
        for OM_version in OM_versions:
            silhouette_plot(file_name=file_name, OM_version=OM_version, sm_method=sm_method, indel_method=indel_method,
                            cluster_method=cluster_method, max_cluster=8)


#### 分群演算法並回標原資料集

```r
# Clustering: Hierarchical Clusterin
# (1) With origin data
clusterward <- agnes(action_counts, diss = FALSE, method = 'ward')
cluster.result.ac <- cutree(clusterward, k=8)

# (2) With dissimilarity matrix
clusterward <- agnes(data.om, diss = TRUE, method = 'ward')
cluster.result.om <- cutree(clusterward, k=8)

# Clustering: K-means
# (1) With origin data
cluster.result.ac <- kmeans(action_counts, centers = 8)$cluster

# (2) With dissimilarity matrix
cluster.result.om <- kmeans(data.om, centers = 8)$cluster

# change cluster label name
cluster.label.ac <- factor(cluster.result.ac, labels = paste('Cluster'), 1:8)
cluster.label.om <- factor(cluster.result.om, labels = paste('Cluster'), 1:8)


# output - cluster labels add back to user state sequences data
action_counts$hcut_cluster <- cluster.label.ac

data$hcut_cluster <- cluster.label.om

write.csv(data, 
          sprintf('%s/clustered-%s-seqdist_%s-sm_%s-indel_%s-method_%s-center_%s.csv',
                  output_folderpath, file_name, 'OMstran', 'TRATE', 'auto', 'hcut', '8'), 
          row.names = FALSE)

write.csv(action_counts, 
          sprintf('%s/clustered-%s-seqdist_%s-method_%s-center_%s.csv',
                  output_folderpath, file_name, 'action_counts', 'hcut', '8'), 
          row.names = FALSE)
```

#### 序列狀態分布圖

```r
# Plots
plot_name <- sprintf('%s/cluster_distribution-%s-seqdist_%s-sm_%s-indel_%s-method_%s-center_%s', 
                     output_folderpath, file_name, 'OMstran', 'TRATE', 'auto', 'hcut', '8')

# State distribution plot
seqdplot(data.seq, group = cluster.label.ac, border=NA, cpal=colorset)
seqdplot(data.seq, group = cluster.label.om, border=NA, cpal=colorset)


```

### 計算分群指標及其統計顯著性

#### 載入中間產出資料集

In [3]:
# 已標記分群標籤的用戶序列資料
clusteredSeqs = pd.read_csv(output_folderpath + f'clustered-{file_name}-seqdist_{OM_version}-sm_{sm_method}-indel_{indel_method}-method_{cluster_method}-center_{center}.csv')
clusteredSeqs = clusteredSeqs[['user_id', f'{cluster_method}_cluster']]
print(clusteredSeqs.shape)
clusteredSeqs.head()

# 前面預存之抽樣用戶行為紀錄
sampledWebLog = pd.read_csv(preprocessed_folderpath + f'Tmall-sampledData_agerange-{file_name}.csv')
print(sampledWebLog.shape)
sampledWebLog.head()



(4242, 2)


Unnamed: 0,user_id,hcut_cluster
0,177050,Cluster1
1,228403,Cluster2
2,89261,Cluster1
3,43439,Cluster2
4,123529,Cluster3


(537042, 17)


Unnamed: 0,user_id,item_id,cat_id,merchant_id,brand_id,time_stamp,age_range,gender,click,add_to_cart,purchase,add_to_favorite,day,month,dayOfWeek,day_stamp,weekOfYear
0,380565,265897,119,1783,5041.0,511,3.0,0.0,0,0,1,0,11,5,6,0,18
1,219981,742103,384,1462,4925.0,511,6.0,0.0,0,0,1,0,11,5,6,0,18
2,422537,377212,1526,1505,5472.0,511,4.0,0.0,0,0,1,0,11,5,6,0,18
3,219981,768332,662,3596,532.0,511,6.0,0.0,0,0,1,0,11,5,6,0,18
4,219981,54824,931,1462,4925.0,511,6.0,0.0,0,0,1,0,11,5,6,0,18


#### 計算分群結果移轉情形

In [5]:
if check_cluster_transition:
    
    action_counts_clust = pd.read_csv(output_folderpath + f'clustered-{file_name}-seqdist_action_counts-method_{cluster_method}-center_{center}.csv')
    action_counts_cluster_perc = cluster_result_transition(clusteredSeqs, action_counts_clust, cluster_method)
    action_counts_cluster_perc.to_csv(output_folderpath + f'cluster_transition-action_counts-{file_name}-seqdist_{OM_version}-sm_{sm_method}-indel_{indel_method}-method_{cluster_method}-center_{center}.csv', index=True)


      user_id hcut_cluster action_counts_cluster
0      177050     Cluster1              Cluster1
1      228403     Cluster2              Cluster2
2       89261     Cluster1              Cluster3
3       43439     Cluster2              Cluster4
4      123529     Cluster3              Cluster3
...       ...          ...                   ...
4237    45808     Cluster2              Cluster4
4238    64522     Cluster5              Cluster4
4239   194180     Cluster2              Cluster8
4240   114401     Cluster1              Cluster3
4241   330011     Cluster2              Cluster6

[4242 rows x 3 columns]
hcut_cluster           Cluster1  Cluster2  Cluster3  Cluster4  Cluster5  \
action_counts_cluster                                                     
Cluster1                     81       122        25        43        84   
Cluster2                     79       146        21        43        67   
Cluster3                    131       206        32        91       117   
Cluster4    

#### 計算分群前的總體指標
- 平均值
- 標準差
- 中位數


In [6]:
# 計算每位用戶的指標，並與分群結果合併
df_by_user = compute_user_metrics(df=sampledWebLog, clustered_df=clusteredSeqs)
df_by_user

# 計算分群前的總體指標平均、標準差與中位數
origin_metrics = compute_origin_metrics(df_by_user)
origin_metrics.to_csv(output_folderpath + f'origin_metrics-{file_name}.csv', index=True)

# 計算分群前的性別佔比
gender_perc = df_by_user.groupby(['性別'])['user_id'].count().reset_index().rename(columns={'user_id': 'proportion'})
gender_perc['proportion'] = round(gender_perc['proportion'] / gender_perc['proportion'].sum() * 100, 3)
gender_perc.to_csv(output_folderpath + f'origin_metrics_gender-{file_name}.csv', index=True)
gender_perc

# 計算分群前的年齡層佔比
age_perc = df_by_user.groupby(['年齡層'])['user_id'].count().reset_index().rename(columns={'user_id': 'proportion'})
age_perc['proportion'] = round(age_perc['proportion'] / age_perc['proportion'].sum() * 100, 3)
age_perc.to_csv(output_folderpath + f'origin_metrics_age-{file_name}.csv', index=True)
age_perc


user_id                              0
gender                               0
age_range                            0
daysVisited                          0
clicks                               0
add_to_carts                         0
add_to_favorites                     0
purchases                            0
unique_cat_clicks                    0
unique_merchant_clicks               0
unique_brand_clicks                  0
unique_cat_addToCarts             3941
unique_merchant_addToCarts        3941
unique_brand_addToCarts           3941
unique_cat_addToFavorites         1921
unique_merchant_addToFavorites    1921
unique_brand_addToFavorites       1921
unique_cat_purchases                 0
unique_merchant_purchases            0
unique_brand_purchases               0
purchasesOn1111                      0
purchasesBefore1111                  0
cvr                                  0
isRepeatBuyer                        0
dtype: int64


Unnamed: 0,user_id,性別,年齡層,每人進站天數,每人總點擊數,每人總購物車數,每人總願望清單數,每人總購買數,每人點擊不重複品類數,每人點擊不重複商家數,每人點擊不重複品牌數,每人購物車不重複品類數,每人購物車不重複商家數,每人購物車不重複品牌數,每人願望清單不重複品類數,每人願望清單不重複商家數,每人願望清單不重複品牌數,每人購買不重複品類數,每人購買不重複商家數,每人購買不重複品牌數,雙11當天購買數,雙11前購買數,轉換率,是否為回購者,om_cluster
0,51,0.0,4.0,24,203,0,4,3,27,48,51,0.0,0.0,0.0,3.0,4.0,4.0,3,3,3,1,2,0.014778,1,Cluster1
1,386,0.0,0.0,10,46,0,0,1,13,27,27,0.0,0.0,0.0,0.0,0.0,0.0,1,1,1,1,0,0.021739,0,Cluster2
2,424,0.0,4.0,4,51,0,3,7,23,21,23,0.0,0.0,0.0,3.0,3.0,3.0,6,4,4,7,0,0.137255,1,Cluster4
3,496,0.0,0.0,12,170,0,17,5,6,13,12,0.0,0.0,0.0,3.0,5.0,4.0,4,4,4,2,3,0.029412,1,Cluster4
4,636,0.0,0.0,23,78,0,35,13,29,28,28,0.0,0.0,0.0,16.0,17.0,17.0,8,7,7,3,10,0.166667,1,Cluster6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4237,423882,0.0,0.0,26,215,0,0,7,34,57,50,0.0,0.0,0.0,0.0,0.0,0.0,6,6,6,3,4,0.032558,1,Cluster2
4238,423892,1.0,0.0,7,21,0,0,2,6,9,7,0.0,0.0,0.0,0.0,0.0,0.0,2,2,2,1,1,0.095238,1,Cluster1
4239,423942,1.0,3.0,9,29,0,0,2,11,15,16,0.0,0.0,0.0,0.0,0.0,0.0,2,2,2,1,1,0.068966,1,Cluster2
4240,423971,0.0,0.0,6,36,0,0,3,8,14,14,0.0,0.0,0.0,0.0,0.0,0.0,3,3,3,2,1,0.083333,1,Cluster2


Unnamed: 0,性別,proportion
0,0.0,70.061
1,1.0,27.534
2,2.0,2.405


Unnamed: 0,年齡層,proportion
0,0.0,24.776
1,2.0,8.416
2,3.0,40.264
3,4.0,17.751
4,5.0,5.092
5,6.0,3.465
6,7.0,0.236


#### 計算分群後各群的總體指標
- 平均值
- 標準差
- 中位數


In [8]:
# 計算分群後，各群各指標的平均、標準差與中位數
cluster_metrics = compute_cluster_metrics(df_by_user)
cluster_metrics.to_csv(output_folderpath + f'cluster_metrics-{file_name}-seqdist_{OM_version}-sm_{sm_method}-indel_{indel_method}-method_{cluster_method}-center_{center}.csv',
                       index=True)
cluster_metrics

# 計算各群性別佔比表
gender_perc = df_by_user.groupby(['om_cluster', '性別'])['user_id'].count().groupby(level=0).apply(lambda x: x / x.sum() * 100).reset_index()
gender_perc_pivot = gender_perc.pivot(index='性別', columns='om_cluster', values='user_id').apply(lambda x: round(x, 3))
gender_perc_pivot.to_csv(output_folderpath + f'cluster_metrics_gender-{file_name}-seqdist_{OM_version}-sm_{sm_method}-indel_{indel_method}-method_{cluster_method}-center_{center}.csv',
                         index=True)
gender_perc_pivot

# 計算各群年齡層佔比表
age_perc = df_by_user.groupby(['om_cluster', '年齡層'])['user_id'].count().groupby(level=0).apply(lambda x: x / x.sum() * 100).reset_index()
age_perc_pivot = age_perc.pivot(index='年齡層', columns='om_cluster', values='user_id').apply(lambda x: round(x, 3))
age_perc_pivot.to_csv(output_folderpath + f'cluster_metrics_age-{file_name}-seqdist_{OM_version}-sm_{sm_method}-indel-{indel_method}-method_{cluster_method}-center_{center}.csv',
                      index=True)
age_perc_pivot


Unnamed: 0,Cluster1(mean),Cluster2(mean),Cluster3(mean),Cluster4(mean),Cluster5(mean),Cluster6(mean),Cluster7(mean),Cluster8(mean),Cluster1(std),Cluster2(std),Cluster3(std),Cluster4(std),Cluster5(std),Cluster6(std),Cluster7(std),Cluster8(std),Cluster1(median),Cluster2(median),Cluster3(median),Cluster4(median),Cluster5(median),Cluster6(median),Cluster7(median),Cluster8(median)
每人點擊不重複品類數,23.133,17.154,13.846,29.294,23.989,26.821,34.333,29.037,15.303,13.051,9.347,18.508,16.509,15.572,17.411,20.879,20.0,14.0,11.0,25.0,20.0,24.0,30.0,24.0
每人點擊不重複商家數,33.992,23.531,17.195,47.329,36.141,37.959,59.507,46.504,28.87,25.418,16.08,43.881,33.711,27.659,44.177,43.319,25.0,16.0,13.0,35.0,26.0,30.0,47.0,32.0
每人點擊不重複品牌數,33.644,23.428,17.411,46.925,35.742,37.569,58.522,46.287,28.252,24.855,16.034,42.84,32.969,27.274,42.257,42.634,25.5,16.5,13.0,35.0,26.0,29.0,48.0,32.0
每人購物車不重複品類數,0.082,0.103,0.075,0.325,0.072,0.26,0.261,0.352,0.416,0.551,0.431,0.952,0.412,0.886,0.869,1.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
每人購物車不重複商家數,0.079,0.107,0.075,0.339,0.071,0.236,0.275,0.369,0.384,0.573,0.431,0.992,0.425,0.831,0.922,1.079,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
每人購物車不重複品牌數,0.052,0.055,0.046,0.158,0.042,0.106,0.101,0.148,0.223,0.229,0.209,0.365,0.2,0.309,0.304,0.355,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
每人願望清單不重複品類數,2.056,1.595,3.328,9.162,2.23,13.065,7.377,8.086,4.277,3.731,5.83,9.055,5.276,11.483,6.754,9.459,0.0,0.0,1.0,6.0,0.0,10.0,5.0,4.0
每人願望清單不重複商家數,2.275,1.843,4.162,11.677,2.571,16.927,9.623,10.643,5.233,4.934,8.73,13.794,6.771,16.286,11.461,14.554,0.0,0.0,1.0,7.0,0.0,11.0,5.0,5.0
每人願望清單不重複品牌數,2.284,1.833,4.183,11.586,2.573,16.577,9.58,10.607,5.265,4.885,8.78,13.623,6.693,15.888,11.348,14.409,0.0,0.0,1.0,7.0,0.0,11.0,5.0,5.0
每人購買不重複品類數,5.686,4.869,4.685,6.416,5.801,7.35,6.942,5.615,4.425,3.925,3.634,4.746,4.812,4.622,5.125,4.79,5.0,4.0,4.0,5.0,4.0,6.0,6.0,4.0


om_cluster,Cluster1,Cluster2,Cluster3,Cluster4,Cluster5,Cluster6,Cluster7,Cluster8
性別,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,68.794,65.138,71.784,73.428,73.684,76.423,84.058,74.18
1.0,28.422,32.242,26.556,23.529,25.478,16.26,15.942,22.951
2.0,2.784,2.62,1.66,3.043,0.837,7.317,,2.869


om_cluster,Cluster1,Cluster2,Cluster3,Cluster4,Cluster5,Cluster6,Cluster7,Cluster8
年齡層,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,25.87,26.638,23.651,18.661,25.478,17.073,26.087,25.0
2.0,7.657,8.006,14.938,8.722,7.177,5.691,11.594,11.066
3.0,39.211,39.083,36.929,48.479,40.072,47.967,37.681,34.836
4.0,16.473,18.559,16.598,16.836,17.464,22.764,13.043,20.492
5.0,6.265,4.876,4.979,4.26,5.024,4.065,5.797,4.508
6.0,4.408,2.62,2.905,2.637,4.426,1.626,5.797,4.098
7.0,0.116,0.218,,0.406,0.359,0.813,,


#### 計算兩群間樣本特徵分布顯著性

In [9]:
# 計算兩樣本特徵分布顯著性
metrics_to_compare = ['性別', '年齡層', '每人點擊不重複品類數', '每人點擊不重複商家數', '每人點擊不重複品牌數']
for metric in metrics_to_compare:
    compute_kruskal_wallis_test(df=df_by_user, metric=metric, cluster_num=center)
    

Kruskal-Wallis Significance Test: 性別

Origin  v.s.  Cluster1
KruskalResult(statistic=0.6147161802985099, pvalue=0.43301759072502266)

Cluster1  v.s.  Cluster2
KruskalResult(statistic=2.881385579547441, pvalue=0.08960888550936288)
Cluster1  v.s.  Cluster3
KruskalResult(statistic=0.932015122721681, pvalue=0.3343398435797852)
Cluster1  v.s.  Cluster4
KruskalResult(statistic=2.917440412245045, pvalue=0.08762676062627027)
Cluster1  v.s.  Cluster5
KruskalResult(statistic=5.9065863178876965, pvalue=0.015084375208954738)
Cluster1  v.s.  Cluster6
KruskalResult(statistic=1.8125649793803966, pvalue=0.17820084256603957)
Cluster1  v.s.  Cluster7
KruskalResult(statistic=7.41275897702716, pvalue=0.006476293213179214)
Cluster1  v.s.  Cluster8
KruskalResult(statistic=2.428956506927441, pvalue=0.11911216476672597)
Origin  v.s.  Cluster2
KruskalResult(statistic=11.356117755885936, pvalue=0.000751998498385067)

Cluster2  v.s.  Cluster3
KruskalResult(statistic=4.189227290140264, pvalue=0.04068163585369113)

KruskalResult(statistic=33.97168457433739, pvalue=5.59199697284502e-09)
Cluster4  v.s.  Cluster6
KruskalResult(statistic=3.080069049936793, pvalue=0.07925716114901782)
Cluster4  v.s.  Cluster7
KruskalResult(statistic=9.650106469198912, pvalue=0.0018934060988359074)
Cluster4  v.s.  Cluster8
KruskalResult(statistic=0.6445968574865523, pvalue=0.4220510939620554)
Origin  v.s.  Cluster5
KruskalResult(statistic=17.124596501902623, pvalue=3.500601173738907e-05)

Cluster5  v.s.  Cluster6
KruskalResult(statistic=3.5763037016890245, pvalue=0.05860942655498483)
Cluster5  v.s.  Cluster7
KruskalResult(statistic=31.95790847459757, pvalue=1.575496443198103e-08)
Cluster5  v.s.  Cluster8
KruskalResult(statistic=10.713369693974574, pvalue=0.0010636413763579742)
Origin  v.s.  Cluster6
KruskalResult(statistic=13.868265489277892, pvalue=0.0001960815875320615)

Cluster6  v.s.  Cluster7
KruskalResult(statistic=15.74107333572741, pvalue=7.262985614510052e-05)
Cluster6  v.s.  Cluster8
KruskalResult(statistic=0