In [38]:
import gc
import sys
sys.path.append('../')
import json
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
# import seaborn as sns
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from tqdm import tqdm
from collections import defaultdict

from custom_functions import (
    gen_dayHrMin,
    gen_timeblock
)

%matplotlib inline



### User States 定義
> action_type: 0 is for click, 1 is for add-to-cart, 2 is for purchase and 3 is for add-to-favourite

##### 前一版
> User states 分為兩類，瀏覽類和購買及其意圖類
> 1. 瀏覽類：離站或有瀏覽，瀏覽中又細分8類(依據瀏覽不重複品類數、品牌數)
> 2. 購買及其意圖類：無、加入購物車、加入願望清單、購買

> ### 結論：
> ### **根據 silhouette function 最佳分群為 2 群，能夠解釋的豐富度不夠，看起來只分出進站頻率高低兩群人而已**

---

#### 測試不同 user state 分法增加分群豐富程度
##### V3.1：把購買或加入待購(願望清單/購物車)的意圖動作，和瀏覽(點擊)一併定義 state（９）
> *加入購物車事件因為數量較少，將其跟購物車事件歸成同一件*

> 1. **少類別瀏覽且當天購買**：只瀏覽2個以內類別、接著在同一天下單
> 2. **少類別瀏覽且當天加入待購**：一個只瀏覽2個以內類別、接著在同一天只加入願望清單或購物車
> 3. **少類別瀏覽後無進一步動作**：一個只瀏覽2個以內類別、接著無後續動作
> 4. **多類別瀏覽且當天購買**：一個只瀏覽3個以上類別、接著在同一天下單
> 5. **多類別瀏覽且當天加入待購**：一個只瀏覽3個以上類別、接著在同一天只加入願望清單或購物車
> 6. **多類別瀏覽後無進一步動作**：一個只瀏覽3個以上類別、接著無後續動作
> 7. **當天只有加入待購**
> 8. **當天促成購買**
> 9. **未進站**：該日無事件發生

##### V3.2：不分 多/少 類別瀏覽（６）

> 1. **瀏覽且當天購買**：一個只瀏覽3個以上類別、接著在同一天下單
> 2. **瀏覽且當天加入待購**：一個只瀏覽3個以上類別、接著在同一天只加入願望清單或購物車
> 3. **瀏覽後無進一步動作**：一個只瀏覽3個以上類別、接著無後續動作
> 4. **當天只有加入待購**
> 5. **當天促成購買**
> 6. **未進站**：該日無事件發生

##### V3.3：不分 多/少 類別瀏覽、但加入願望清單和加入購物車分開看（８）

> 1. **瀏覽且當天購買**：一個只瀏覽3個以上類別、接著在同一天下單
> 2. **瀏覽且當天加入購物車**：一個只瀏覽3個以上類別、接著在同一天只加入購物車
> 3. **瀏覽且當天加入願望清單**：一個只瀏覽3個以上類別、接著在同一天只加入願望清單
> 4. **瀏覽後無進一步動作**：一個只瀏覽3個以上類別、接著無後續動作
> 5. **當天只有加入購物車**
> 6. **當天只有加入願望清單**
> 7. **當天促成購買**
> 8. **未進站**：該日無事件發生



In [176]:
produce_date = '201117'
version = 'V3.2-crosscategorypurchase-past14days'
pastDays = 14


In [3]:
df = pd.read_pickle('../TMall_datasets/TMall_for_user_states_define_transformed.pkl')  # moved to "../TMall_datasets"
df.shape
df.head()


(54925330, 17)

Unnamed: 0,user_id,item_id,cat_id,merchant_id,brand_id,time_stamp,label,age_range,gender,click,add_to_cart,purchase,add_to_favorite,day,month,dayOfWeek,day_stamp
0,328862,323294,833,2882,2661,829,-,6.0,1.0,1,0,0,0,29,8,4,110
1,328862,844400,1271,2882,2661,829,-,6.0,1.0,1,0,0,0,29,8,4,110
2,328862,575153,1271,2882,2661,829,-,6.0,1.0,1,0,0,0,29,8,4,110
3,328862,996875,1271,2882,2661,829,-,6.0,1.0,1,0,0,0,29,8,4,110
4,328862,1086186,1271,1253,1049,829,-,6.0,1.0,1,0,0,0,29,8,4,110


In [177]:
### 計算每個 user 在區間內的購買次數
tmp_df = df[(df['time_stamp']!=1111)].groupby(['user_id','cat_id'])['purchase'].sum().reset_index()
tmp_df = tmp_df[tmp_df['purchase']>0].groupby(['user_id'])['cat_id'].nunique().reset_index()
selected_user_1 = tmp_df[(tmp_df['cat_id']>2)].user_id.unique() # 在雙11以外的，在多個品類瀏覽/購買
tmp_df_2 = df[(df['time_stamp']!=1111)].groupby(['user_id'])['click'].sum().reset_index()
selected_user_2 = tmp_df_2[(tmp_df_2['click']>0)].user_id.unique() # 在雙11以外時間造訪一次以上的人


### 篩選 user
selected_user = []
for u in tqdm(selected_user_1):
    if u in selected_user_2:
        selected_user.append(u)


### 抽樣 1000 位用戶
print(len(selected_user))
selected_user = np.random.choice(selected_user, 1000)
print(len(selected_user))


df_target = df[df['user_id'].isin(selected_user)].sort_values('day_stamp')
print(df_target.shape)
df_target.head()


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 198693/198693 [00:43<00:00, 4594.51it/s]


198533
1000
(189532, 17)


Unnamed: 0,user_id,item_id,cat_id,merchant_id,brand_id,time_stamp,label,age_range,gender,click,add_to_cart,purchase,add_to_favorite,day,month,dayOfWeek,day_stamp
24019066,152951,214546,1090,4605,7622,511,-,4.0,0.0,0,0,1,0,11,5,6,0
16640096,175992,333753,662,4173,5376,511,-,3.0,0.0,0,0,0,1,11,5,6,0
42488701,186120,1073970,602,2591,5120,511,-,3.0,0.0,0,0,0,1,11,5,6,0
16640097,175992,207096,1467,3578,3091,511,-,3.0,0.0,0,0,0,1,11,5,6,0
16640098,175992,1057639,1467,3578,3091,511,-,3.0,0.0,0,0,0,1,11,5,6,0


In [178]:
###只留下購買前__天的紀錄

df_target = df_target.sort_values('time_stamp').reset_index().drop('index', axis=1)
user_list = df_target.user_id.unique()
user = user_list[0]
purchase_timing = df_target[(df_target['user_id']==user) & (df_target['purchase']>0)].time_stamp.values[0]
df_purchase_a_week_ago = df_target[(df_target['user_id']==user)&(df_target['time_stamp']<=purchase_timing)&(df_target['time_stamp']>=purchase_timing-pastDays)]

for uid in tqdm(user_list[1:]):
    purchase_timing = df_target[(df_target['user_id']==uid) & (df_target['purchase']>0)].time_stamp.values[0]
    tmp_df_2 = df_target[(df_target['user_id']==uid)&(df_target['time_stamp']<=purchase_timing)&(df_target['time_stamp']>=purchase_timing-pastDays)]
    df_purchase_a_week_ago = pd.concat([df_purchase_a_week_ago, tmp_df_2])

print(df_purchase_a_week_ago.shape)
df_purchase_a_week_ago.head()



100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 999/999 [00:04<00:00, 227.86it/s]

(12505, 17)





Unnamed: 0,user_id,item_id,cat_id,merchant_id,brand_id,time_stamp,label,age_range,gender,click,add_to_cart,purchase,add_to_favorite,day,month,dayOfWeek,day_stamp
0,152951,214546,1090,4605,7622,511,-,4.0,0.0,0,0,1,0,11,5,6,0
1,1368,1036854,177,219,1818,511,-,4.0,0.0,0,0,0,1,11,5,6,0
7,1368,807111,1349,1457,4279,511,-,4.0,0.0,0,0,0,1,11,5,6,0
8,1368,1007438,1349,1457,3650,511,-,4.0,0.0,0,0,0,1,11,5,6,0
37,1368,412630,1349,1457,3650,511,-,4.0,0.0,0,0,0,1,11,5,6,0


In [179]:
accum_by_timestamp = {}

for uid in tqdm(user_list):
    accum_by_timestamp[str(uid)] = {}
    
    tmp_df = df_purchase_a_week_ago[df_purchase_a_week_ago['user_id']==uid].groupby(['time_stamp'])['click','add_to_cart','add_to_favorite','purchase'].sum()
    tmp_df['add_to_favor_or_cart'] = tmp_df['add_to_favorite'] + tmp_df['add_to_cart']
    tmp_df = tmp_df.drop(['add_to_favorite','add_to_cart'], axis=1)
    tmp_df['unique_cat'] = df_purchase_a_week_ago[df_purchase_a_week_ago['user_id']==uid].groupby(['time_stamp'])['cat_id'].nunique()
    tmp_df = tmp_df.reset_index()#.drop('index', axis=1)

    for i in range(tmp_df.time_stamp.max()-pastDays, tmp_df.time_stamp.max()+1):
        if i not in tmp_df.time_stamp.values:
            tmp_df = pd.concat([tmp_df, pd.DataFrame([[i,0,0,0,0]], columns=tmp_df.columns)], ignore_index=True)

    tmp_df = tmp_df.sort_values('time_stamp').reset_index().drop(['index'], axis=1)
    
    for c in tmp_df.columns:
        accum_by_timestamp[str(uid)][c] = tmp_df[c].values.tolist()
        
        
# accum_by_timestamp
with open('TMall_user_counts_{}_{}.json'.format(produce_date, version), 'w') as fp:
    json.dump(accum_by_timestamp, fp)


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:15<00:00, 63.32it/s]


In [180]:
with open('TMall_user_counts_{}_{}.json'.format('201117', 'V3.2-crosscategorypurchase-past14days'), 'r') as f:
    tmall_user_count_stats = json.load(f)
    
tmall_user_count_stats.keys()
# tmall_user_count_stats['395747']

dict_keys(['152951', '1368', '64713', '161229', '285098', '321029', '222608', '235310', '227090', '182365', '352203', '378993', '127571', '130448', '177503', '204494', '27885', '24786', '386700', '269513', '335442', '378377', '318687', '191131', '314176', '167959', '356016', '318040', '404547', '56065', '383743', '227508', '175992', '186120', '269948', '50500', '44162', '13745', '165026', '320688', '13547', '252795', '313354', '168792', '395144', '119396', '44834', '343302', '108413', '238702', '370321', '416065', '379612', '135575', '296726', '38925', '40619', '354677', '159950', '297609', '141503', '72709', '412039', '380890', '306999', '308141', '233395', '269183', '280447', '43904', '142893', '401850', '51455', '324176', '121943', '348357', '403048', '406233', '396372', '221133', '191428', '410475', '34649', '337355', '379594', '376608', '169881', '319416', '282752', '408820', '177572', '4328', '5862', '146171', '3034', '34448', '247023', '369226', '213963', '410323', '133539', '32

##### Label User States

In [97]:
tmp_df = pd.DataFrame(tmall_user_count_stats['395747'])

##user_state
# v3_1_cond_list = [
#     (tmp_df['click']+tmp_df['purchase']+tmp_df['add_to_favor_or_cart']+tmp_df['unique_cat']==0),
#     (tmp_df['click']==0)&(tmp_df['purchase']>0),
#     (tmp_df['click']==0)&(tmp_df['add_to_favor_or_cart']>0),
#     (tmp_df['click']>0)&(tmp_df['unique_cat']<3)&(tmp_df['purchase']>0),
#     (tmp_df['click']>0)&(tmp_df['unique_cat']<3)&(tmp_df['add_to_favor_or_cart']>0),
#     (tmp_df['click']>0)&(tmp_df['unique_cat']<3)&(tmp_df['add_to_favor_or_cart']+tmp_df['purchase']==0),
#     (tmp_df['click']>0)&(tmp_df['unique_cat']>=3)&(tmp_df['purchase']>0),
#     (tmp_df['click']>0)&(tmp_df['unique_cat']>=3)&(tmp_df['add_to_favor_or_cart']>0),
#     (tmp_df['click']>0)&(tmp_df['unique_cat']>=3)&(tmp_df['add_to_favor_or_cart']+tmp_df['purchase']==0),
# ]
# v3_1_choice_list = [
#     'no_browse', 'directly_purchase', 'directly_add_to_consider', 'browse_less_cate_to_purchase', 
#     'browse_less_cate_to_add_to_consider', 'browse_less_cate', 'browse_multi_cate_to_purchase', 
#     'browse_multi_cate_to_add_to_consider', 'browse_multi_cate'
# ]


v3_2_cond_list = [
    (tmp_df['click']+tmp_df['purchase']+tmp_df['add_to_favor_or_cart']+tmp_df['unique_cat']==0),
    (tmp_df['click']==0)&(tmp_df['purchase']>0),
    (tmp_df['click']==0)&(tmp_df['add_to_favor_or_cart']>0),
    (tmp_df['click']>0)&(tmp_df['purchase']>0),
    (tmp_df['click']>0)&(tmp_df['add_to_favor_or_cart']>0),
    (tmp_df['click']>0)&(tmp_df['add_to_favor_or_cart']+tmp_df['purchase']==0)
]
v3_2_choice_list = [
    'no_browse', 'directly_purchase', 'directly_add_to_consider', 'browse_to_purchase', 
    'browse_to_add_to_consider', 'browse'
]


# v3_3_cond_list = [
#     (tmp_df['click']+tmp_df['purchase']+tmp_df['add_to_cart']+tmp_df['add_to_favorite']+tmp_df['unique_cat']==0),
#     (tmp_df['click']==0)&(tmp_df['purchase']>0),
#     (tmp_df['click']==0)&(tmp_df['add_to_cart']>0),
#     (tmp_df['click']==0)&(tmp_df['add_to_favorite']>0),
#     (tmp_df['click']>0)&(tmp_df['purchase']>0),
#     (tmp_df['click']>0)&(tmp_df['add_to_cart']>0),
#     (tmp_df['click']>0)&(tmp_df['add_to_favorite']>0),
#     (tmp_df['click']>0)&(tmp_df['add_to_cart']+tmp_df['add_to_favorite']+tmp_df['purchase']==0)
# ]
# v3_3_choice_list = [
#     'no_browse', 'directly_purchase', 'directly_add_to_cart', 'directly_add_to_favorite', 
#     'browse_to_purchase', 'browse_to_add_to_cart', 'browse_to_add_to_favorite', 'browse'
# ]



tmp_df['user_state'] = np.select(condlist=v3_2_cond_list, choicelist=v3_2_choice_list, default='')
tmp_df

# pd.melt(tmp_df, id_vars=['time_stamp'], value_vars=['user_state']).sort_values(['time_stamp','variable']).reset_index().drop(['index','variable'], axis=1)



Unnamed: 0,time_stamp,click,purchase,add_to_favor_or_cart,unique_cat,user_state
0,504,0,0,0,0,no_browse
1,505,0,0,0,0,no_browse
2,506,0,0,0,0,no_browse
3,507,0,0,0,0,no_browse
4,508,0,0,0,0,no_browse
5,509,0,0,0,0,no_browse
6,510,0,0,0,0,no_browse
7,511,0,1,0,1,directly_purchase


In [181]:
###寫進迴圈應用到所有user

collect = []
for uid in tqdm(tmall_user_count_stats.keys()):

    tmp_df = pd.DataFrame(tmall_user_count_stats[uid])

    ##user_state
#     v3_1_cond_list = [
#         (tmp_df['click']+tmp_df['purchase']+tmp_df['add_to_favor_or_cart']+tmp_df['unique_cat']==0),
#         (tmp_df['click']==0)&(tmp_df['add_to_favor_or_cart']>0),
#         (tmp_df['click']==0)&(tmp_df['purchase']>0),
#         (tmp_df['click']>0)&(tmp_df['unique_cat']<3)&(tmp_df['purchase']>0),
#         (tmp_df['click']>0)&(tmp_df['unique_cat']<3)&(tmp_df['add_to_favor_or_cart']>0),
#         (tmp_df['click']>0)&(tmp_df['unique_cat']<3)&(tmp_df['add_to_favor_or_cart']+tmp_df['purchase']==0),
#         (tmp_df['click']>0)&(tmp_df['unique_cat']>=3)&(tmp_df['purchase']>0),
#         (tmp_df['click']>0)&(tmp_df['unique_cat']>=3)&(tmp_df['add_to_favor_or_cart']>0),
#         (tmp_df['click']>0)&(tmp_df['unique_cat']>=3)&(tmp_df['add_to_favor_or_cart']+tmp_df['purchase']==0),
#     ]
#     v3_1_choice_list = [
#         'no_browse', 'directly_add_to_consider', 'directly_purchase', 'browse_less_cate_to_purchase', 
#         'browse_less_cate_to_add_to_consider', 'browse_less_cate', 'browse_multi_cate_to_purchase', 
#         'browse_multi_cate_to_add_to_consider', 'browse_multi_cate'
#     ]
    
    
    
    v3_2_cond_list = [
        (tmp_df['click']+tmp_df['purchase']+tmp_df['add_to_favor_or_cart']+tmp_df['unique_cat']==0),
        (tmp_df['click']==0)&(tmp_df['purchase']>0),
        (tmp_df['click']==0)&(tmp_df['add_to_favor_or_cart']>0),
        (tmp_df['click']>0)&(tmp_df['purchase']>0),
        (tmp_df['click']>0)&(tmp_df['add_to_favor_or_cart']>0),
        (tmp_df['click']>0)&(tmp_df['add_to_favor_or_cart']+tmp_df['purchase']==0)
    ]
    v3_2_choice_list = [
        'no_browse', 'directly_purchase', 'directly_add_to_consider', 'browse_to_purchase', 
        'browse_to_add_to_consider', 'browse'
    ]


#     v3_3_cond_list = [
#         (tmp_df['click']+tmp_df['purchase']+tmp_df['add_to_cart']+tmp_df['add_to_favorite']+tmp_df['unique_cat']==0),
#         (tmp_df['click']==0)&(tmp_df['purchase']>0),
#         (tmp_df['click']==0)&(tmp_df['add_to_cart']>0),
#         (tmp_df['click']==0)&(tmp_df['add_to_favorite']>0),
#         (tmp_df['click']>0)&(tmp_df['purchase']>0),
#         (tmp_df['click']>0)&(tmp_df['add_to_cart']>0),
#         (tmp_df['click']>0)&(tmp_df['add_to_favorite']>0),
#         (tmp_df['click']>0)&(tmp_df['add_to_cart']+tmp_df['add_to_favorite']+tmp_df['purchase']==0)
#     ]
#     v3_3_choice_list = [
#         'no_browse', 'directly_purchase', 'directly_add_to_cart', 'directly_add_to_favorite', 
#         'browse_to_purchase', 
#         'browse_to_add_to_cart', 'browse_to_add_to_favorite', 'browse'
#     ]


    tmp_df['user_state'] = np.select(condlist=v3_2_cond_list, choicelist=v3_2_choice_list, default='')
    
    ##append to collect list
    uss = tmp_df.user_state.values.tolist()
    uss.insert(0, uid)
    collect.append(uss)

    
##test
collect[98]


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:02<00:00, 344.98it/s]


['213963',
 'no_browse',
 'no_browse',
 'no_browse',
 'no_browse',
 'no_browse',
 'no_browse',
 'directly_add_to_consider',
 'no_browse',
 'no_browse',
 'no_browse',
 'no_browse',
 'no_browse',
 'no_browse',
 'no_browse',
 'browse_to_purchase']

In [182]:
print(produce_date, version)

201117 V3.2-crosscategorypurchase-past14days


In [183]:
user_states_col = ['user_id'] + ['day_' + str(x) for x in range(0, pastDays+1)]
user_states_table = pd.DataFrame(collect, columns=user_states_col)
user_states_table.to_csv('TMall_user_state_sequence_table_{}_{}.csv'.format(produce_date, version), index=False)
user_states_table.head(50)


Unnamed: 0,user_id,day_0,day_1,day_2,day_3,day_4,day_5,day_6,day_7,day_8,day_9,day_10,day_11,day_12,day_13,day_14
0,152951,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,directly_purchase
1,1368,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,directly_add_to_consider,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,directly_purchase
2,64713,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,directly_purchase
3,161229,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,directly_purchase
4,285098,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,directly_purchase
5,321029,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,directly_purchase
6,222608,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,directly_purchase
7,235310,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,directly_purchase
8,227090,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,browse_to_purchase
9,182365,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,no_browse,directly_purchase


In [184]:
collect_all = list()
for l in collect:
    for item in l[1:]:
        collect_all.append(item)
    
pd.DataFrame(collect_all, columns=['items']).groupby(['items']).size().reset_index().rename(columns={0: 'count'})


Unnamed: 0,items,count
0,browse,486
1,browse_to_add_to_consider,131
2,browse_to_purchase,688
3,directly_add_to_consider,116
4,directly_purchase,312
5,no_browse,13267
