# Part1:商品向量

Link to google drive

In [None]:
# 環境設定
import os
import pandas as pd
import sys
import numpy as np
import re
import pickle

In [None]:
# !pip install -U --no-cache-dir gdown --pre
# !pip install --upgrade --no-cache-dir gdown
from google.colab import drive
drive.mount('/content/gdrive')
!ls


import os
import pandas as pd
# Folder on the mounted Drive that holds the raw 91APP data set.
raw_path = os.path.join('/content/gdrive/MyDrive/BDA_Final/Raw')
sale_page_csv = os.path.join(raw_path, '91APP_DataSet_2022/91APP_SalePageData.csv')

# Smoke test: read the sale-page table straight from Drive and print it.
test_file = pd.read_csv(sale_page_csv)
print(test_file)

In [None]:
!gdown --id 1YVz3rpIKtZVxC7pxEmTUnpRPtdnGHoEn --output SalePageData.csv

Part 1: 斷詞->產生各詞向量->產生各商品矩陣

In [None]:
# Read SalePage: SalePageId <-> SalePageTitle
import os
import sys
import pandas as pd
import numpy as np

# Load the SalePageId <-> SalePageTitle table from the CSV in /content.
pages = pd.read_csv(os.path.join('/content', 'SalePageData.csv'))

# Notebook display; the shape noted by the original author is (17742, 2).
pages # (17742, 2)

In [None]:
# Drop live-stream ("直播") listings: titles that contain a 【...】 tag and end
# with "直播". The original author counted 6550 such rows, leaving 11192 items.
live_stream_pattern = r'【.*】.*直播$'
exclude_mask = pages['SalePageTitle'].str.contains(live_stream_pattern, regex=True)
items = pages.loc[~exclude_mask].copy()

items

In [None]:
# Prerequisite for ckip-tagger
!pip install ckiptagger
!pip install tensorflow
!pip install gdown
# WS is the word-segmentation model class; POS and NER are imported but not
# used by the code visible in this notebook.
from ckiptagger import WS, POS, NER
from ckiptagger import data_utils
# Fetch the pretrained CKIP model files, using "./" as the target location.
data_utils.download_data_gdown("./")
# Build the word segmenter from the ./data folder.
ws = WS("./data")

In [None]:
# Process each title
# Extract color information: -顏色 in the end of the title
# Extract size information: (版型偏大/小) after color information
# Eliminate emojis
# Tokenize with ckip-tagger
from tqdm import tqdm
import re

def process_sentence(x):
    """Clean one sale-page title and split off its size and colour suffixes.

    Returns a tuple ``(cleaned_title, size_info, color_info)``.
    ``size_info`` is the trailing parenthesised run of CJK characters
    (parentheses included) or ``None``. ``color_info`` is the trailing
    whitespace-led group of one to three CJK characters (leading whitespace
    included) or ``None``.
    """
    # Size note: "(...)" made only of CJK characters at the very end.
    size_re = re.compile(r'\([\u4e00-\u9fa5]*\)$')
    size_match = size_re.search(x)
    size_info = None if size_match is None else size_match.group(0)
    x = size_re.sub('', x)

    # Collapse every run of disallowed characters (emoji, dashes, brackets...)
    # into a single space. The class keeps CJK, word characters, space, "|",
    # "'", ":", "%" and "."; the spaces and pipes inside it are literal members.
    x = re.sub(r'[^\u4e00-\u9fa5 | \w | \' | \:| % |\.]+', ' ', x)

    # Colour: whitespace followed by 1-3 CJK characters at the very end.
    color_re = re.compile(r'\s([\u4e00-\u9fa5]){1,3}$')
    color_match = color_re.search(x)
    color_info = None if color_match is None else color_match.group(0)
    x = color_re.sub('', x)

    # Drop everything up to and including the (last) "Ann's"-style brand tag.
    x = re.sub(r'^.*Ann.?s', '', x, flags=re.IGNORECASE)

    # Trim surrounding whitespace left over from the substitutions.
    return x.strip(), size_info, color_info

def ckip_split(sentence_lst):
    """Run the module-level CKIP segmenter ``ws`` on a list of sentences and return its result."""
    return ws(sentence_lst)
    
def has_word(str):
    """Return True when the text contains an ASCII letter, a digit, a space or "|".

    The character class is written with spaces and pipes between the ranges,
    so those two characters are matched literally as well.
    """
    return re.search(r'[A-Z | a-z | 0-9]', str) is not None

# Step 1: clean every title and pull out its size / colour annotations.
processed_lst = []
size_lst = []
color_lst = []
for title in tqdm(items['SalePageTitle']):
    cleaned, size, color = process_sentence(title)
    processed_lst.append(cleaned)
    size_lst.append(size)
    color_lst.append(color)
items['ProcessedTitle'] = processed_lst
items['SizeInfo'] = size_lst
items['ColorInfo'] = color_lst

# Step 2: tokenize each cleaned title (plus its size note, if any) and keep
# the usable tokens. Iterating the lists built above avoids the per-row
# Series construction of DataFrame.iterrows and yields the same values.
split_phrases = []
long_enough = []
for cleaned, size, color in tqdm(zip(processed_lst, size_lst, color_lst),
                                 total=len(processed_lst)):
    sentence_lst = [cleaned]
    if size is not None:
        sentence_lst.append(size)

    # The segmenter returns one token list per input sentence; flatten them so
    # the result never depends on indexing a second list that may not exist.
    splitted = [token for sentence in ckip_split(sentence_lst) for token in sentence]
    split_phrases.append(splitted)

    # Keep tokens of at least two characters (after trimming spaces/dashes)
    # that has_word() does not flag.
    slices_long_enough = []
    for token in splitted:
        trimmed = token.strip(' -')
        if len(trimmed) >= 2 and not has_word(token.strip()):
            slices_long_enough.append(trimmed)
    # The colour suffix is appended as its own phrase, without re-tokenizing.
    if color is not None:
        slices_long_enough.append(color.strip())
    long_enough.append(slices_long_enough)
items['SplitPhrases'] = split_phrases
items['LongEnough'] = long_enough

items

In [None]:
phrases_series = items['LongEnough']

# Unique phrases in first-seen order (3630 in the original run); despite the
# name this is a list. The order is kept because the list is later used as the
# input sequence for the embedding loop. dict.fromkeys de-duplicates with a
# hash lookup per phrase instead of scanning the whole list each time.
phr_set = list(dict.fromkeys(
    phr
    for phrases_lst in tqdm(phrases_series)
    for phr in phrases_lst
))

print(phr_set)

In [None]:
# Download BERT pretrain model
!wget https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip
!unzip chinese_L-12_H-768_A-12.zip

In [None]:
# BERT: generate word vector
!pip install keras-bert
import tensorflow
from keras_bert import load_vocabulary
from keras_bert import Tokenizer
from keras_bert import load_trained_model_from_checkpoint

text = phr_set

pretrained = '/content/chinese_L-12_H-768_A-12'
dictpath = os.path.join(pretrained, 'vocab.txt')
config_path = os.path.join(pretrained, 'bert_config.json')
checkpoint_path = os.path.join(pretrained, 'bert_model.ckpt')

# Load the pretrained Chinese BERT checkpoint and its vocabulary.
model = load_trained_model_from_checkpoint(config_path, checkpoint_path)
bert_token_dict = load_vocabulary(dictpath)
bert_tokenizer = Tokenizer(bert_token_dict)
# Report success only after the model and tokenizer actually exist.
print("The pretrained model is loaded")

# Estimated 6 hours
num = len(text)
data = np.zeros((num, 768))  # one 768-dimensional row per phrase
for i, phrase in enumerate(tqdm(text)):
    # encode() returns the token ids and segment ids padded to max_len.
    indices, segments = bert_tokenizer.encode(first=phrase, max_len=512)
    # predict() is given a batch of one; [0] selects that single sample.
    predicts = model.predict([np.array([indices]), np.array([segments])])[0]
    # Use the vector at the first sequence position as the phrase embedding
    # (presumably the [CLS] token - confirm against keras-bert).
    data[i, :] = predicts[0]

In [None]:
# Save output matrix
import datetime
import pytz
from google.colab import files
import pickle
import numpy as np

# Timestamp (Asia/Taipei, MMDDHHMM) used as the version suffix of every file.
timenow_str = datetime.datetime.now(pytz.timezone('Asia/Taipei')).strftime("%m%d%H%M")
array_name = 'array{}.npy'.format(timenow_str)
phrases_name = 'phrases{}.pkl'.format(timenow_str)
items_name = 'items{}.pkl'.format(timenow_str)

# Local copies, each one handed to the browser as a download.
np.save(array_name, data)
files.download(array_name)

with open(phrases_name, 'wb') as f:
    pickle.dump(text, f)
files.download(phrases_name)

items.to_pickle(items_name)
files.download(items_name)

# Second copy of the same three artefacts in the Drive checkpoint folder.
drive_ckpt_dir = '/content/gdrive/MyDrive/BDA_Final/Results/ckpt_bert'
with open(os.path.join(drive_ckpt_dir, array_name), 'wb') as f:
    np.save(f, data)
with open(os.path.join(drive_ckpt_dir, phrases_name), 'wb') as f:
    pickle.dump(text, f)
with open(os.path.join(drive_ckpt_dir, items_name), 'wb') as f:
    items.to_pickle(f)

Vector Dictionary column

In [None]:
# Load checkpoint for previous sections
import os
!gdown --folder --id 1A9iBOw0vb4lX-QvI3sOsoBrVHr27KNmz
ckpt_bert_pth = '/content/ckpt_bert'
version = '05251237'
phrases_path = os.path.join(ckpt_bert_pth, 'phrases{}.pkl'.format(version))
items_path = os.path.join(ckpt_bert_pth, 'items{}.pkl'.format(version))
array_path = os.path.join(ckpt_bert_pth, 'array{}.npy'.format(version))

# NOTE: unpickling can run code stored in the file, so only load checkpoints
# that this project produced.
with open(phrases_path, 'rb') as fp:
    phrase_list = pickle.load(fp)

with open(items_path, 'rb') as fpl:
    item_df = pickle.load(fpl)

# Embedding matrix saved alongside the phrase list.
phrase_array = np.load(array_path)

In [None]:
# Generate Columns for Phrases&Vectors for each Title
# phrase_dict: phrase -> vector (row i of phrase_array belongs to phrase_list[i]).
phrase_dict = {phrase: phrase_array[i] for i, phrase in enumerate(phrase_list)}

# A clean 0..n-1 index keeps the positional lists below aligned with the rows.
item_df.reset_index(drop=True, inplace=True)

tokens_list = item_df['LongEnough'].tolist()
title_list = item_df['SalePageTitle'].tolist()

# item_vec_dict: {title: {phrase: vector, ...}}. A title that occurs more than
# once keeps the mapping built from its last occurrence.
item_vec_dict = {
    title: {str(word): phrase_dict[str(word)] for word in tokens}
    for title, tokens in zip(title_list, tokens_list)
}

# Add the per-item {phrase: vector} mapping as the "dict" column. Assigning
# the whole column at once replaces the element-wise chained assignment
# item_df["dict"][count] = ..., which pandas does not guarantee to write back
# to the frame (and which is a no-op under copy-on-write).
item_df["dict"] = [item_vec_dict[title] for title in title_list]

In [None]:
def avg_dict_values(d):
    """Return the mean of the dictionary's values, or None when it is empty.

    The values are added up starting from 0 and divided by the number of
    keys, so plain numbers and same-shaped NumPy arrays both work.
    """
    key_count = len(d.keys())
    if key_count == 0:
        return None
    return sum(d.values()) / key_count

# Keep four columns: SalePageId, SalePageTitle, Splitted (formerly
# LongEnough) and Vector (formerly dict).
item_df.drop(['ProcessedTitle', 'SizeInfo', 'ColorInfo', 'SplitPhrases'], axis=1, inplace=True)
item_df.rename({'LongEnough': 'Splitted', 'dict': 'Vector'}, axis='columns', inplace=True)

# Per-item mean of its phrase vectors (None when the item has no phrases).
item_df["Vector_avg"] = item_df["Vector"].apply(avg_dict_values)

In [None]:
item_df

In [None]:
# Map each phrase (3630 in the original run) to its row of the embedding
# matrix (768 values per row, per the original author's note).
phrase_vec_dict = {
    phrase: phrase_array[row, :]
    for row, phrase in enumerate(phrase_list)
}

# Two-column frame: one row per phrase with its vector.
phrase_vec_df = pd.DataFrame(phrase_vec_dict.items(), columns=['Phrase', 'Vector'])

In [None]:
# Save Output Dataframe: ID, Title, Splitted(list of phrases), Vector(dictionary with Phrases(appear in title) <-> Vector (BERT generated (768,)))
import datetime
import pytz
from google.colab import files
import pickle

# Timestamp (Asia/Taipei, MMDDHHMM) used as the version suffix of both files.
timenow_str = datetime.datetime.now(pytz.timezone('Asia/Taipei')).strftime("%m%d%H%M")
items_name = 'SalePageWithPhrasesVectors{}.pkl'.format(timenow_str)
phrases_name = 'PhraseVector{}.pkl'.format(timenow_str)

# Local copies, each one handed to the browser as a download.
item_df.to_pickle(items_name)
files.download(items_name)

phrase_vec_df.to_pickle(phrases_name)
files.download(phrases_name)

# Second copy of both frames in the Drive results folder.
drive_out_dir = '/content/gdrive/MyDrive/BDA_Final/Results/Items&Phrases'
item_df.to_pickle(os.path.join(drive_out_dir, items_name))
phrase_vec_df.to_pickle(os.path.join(drive_out_dir, phrases_name))

# Part2: 會員向量

In [None]:
# 環境設定
import os
import pandas as pd
import sys
import numpy as np
import re
import pickle

In [None]:
# 載入Part1 的 output：OneVectorVersion dataframe,TwoVectorVersion dataframe
!gdown --folder --id 1VSrGhg_WLbpep0D9i4thlaz8eJLGm8wL
# Folder holding the Part 1 outputs and the timestamp suffix of the files to load.
part1_path = '/content/Items&Phrases'
version = '05251401'
# Per-item frame and phrase -> vector frame saved by Part 1.
# NOTE: read_pickle unpickles the file; only load files this project produced.
item_df = pd.read_pickle(os.path.join(part1_path, 'SalePageWithPhrasesVectors{}.pkl'.format(version)))
phrase2vector_df= pd.read_pickle(os.path.join(part1_path,  'PhraseVector{}.pkl'.format(version)))

In [None]:
''' 
Description: Aggregated Version of the original SalePageData, information including: 
  SalePageId:[String] Can be mapped to 'SalePageId' in behavior data <-話說好像有些viewproduct、add、checkout、purchase的行為有SalePAgeId, 但是對應不到SalePageTitle
  SalePageTitle: [String]
  Splitted: [list] 斷詞並篩選後的剩下的詞
  Vector: [Dict] 每個Splitted裡的詞對應的向量(768, )
'''
item_df

In [None]:
'''
Description: 詞(不重複，共3630個) 對應到的 Vector(BERT生成，每個768維)
或許你們用的到?
'''
phrase2vector_df

#### 讀入會員行為資料(201806-202012)

In [None]:
# 201806-201812
!gdown --id '1MbGXfWAwTe7GdAxmyyS9i3JfC9z_9l7w' --output 91APP_BehaviorData_20180601.csv
!gdown --id '1B_LNLxeEQFmJq6_GqJMsurASUbntXpGn' --output 91APP_BehaviorData_20180701.csv
!gdown --id '1YOWE7clQ6R7F0C7zxolx8_H7t9IbimZl' --output 91APP_BehaviorData_20180801.csv
!gdown --id '1OOosr9kcJttOPQr55Si5ImCHi-HVCtTj' --output 91APP_BehaviorData_20180901.csv
!gdown --id '14EnXGrQ2rItnMw6FVylkAuuTNgmjvYaX' --output 91APP_BehaviorData_20181001.csv
!gdown --id '13nE7MxOr5gpOPGPO9746jJNF_YkRbcsv' --output 91APP_BehaviorData_20181101.csv
!gdown --id '1aeQZDBJghtkXH1Ul34GCdjvjMMcsNkD0' --output 91APP_BehaviorData_20181201.csv

In [None]:
# 201901-201912
!gdown --id '1nuPR_g4UKR8JjBbtXZtIpLpnlg-LmwFf' --output 91APP_BehaviorData_20190101.csv
!gdown --id '1WOkXxrgAtwT5X34rneoklLTJ5Ux21v3z' --output 91APP_BehaviorData_20190201.csv
!gdown --id '1JHR3_hZi5A58H6J2-c-AQZx3DD-TlCcw' --output 91APP_BehaviorData_20190301.csv
!gdown --id '13-2X4xs1H9qT7qjSJecOlJvhjV7_hWjA' --output 91APP_BehaviorData_20190401.csv
!gdown --id '18pX-Pd6v9d_dQ1h2wNFYk6L5HQ9RPLv_' --output 91APP_BehaviorData_20190501.csv
!gdown --id '1IaC3fM7DsGo91EaiFWHBFRKj4eimiSjR' --output 91APP_BehaviorData_20190601.csv
!gdown --id '1KdC-V2FE9KSXmpQmdppZ30zSSeyqMgrk' --output 91APP_BehaviorData_20190701.csv
!gdown --id '1ZKL2hJopqRTrZxBCFvqCrP9Xqi_neAt4' --output 91APP_BehaviorData_20190801.csv
!gdown --id '1SK4cMqMMfN7gamLyFUC4pMLJc0WirmRt' --output 91APP_BehaviorData_20190901.csv
!gdown --id '17kH5eF9JmAAlzAKUqZLe4q6d8Yz0qB7e' --output 91APP_BehaviorData_20191001.csv
!gdown --id '1niVVjV_n15zSq8sln9i4fc-GFNs2dOMP' --output 91APP_BehaviorData_20191101.csv
!gdown --id '1e5DO0S7aMz2uFVFw4Sjn73J6QXzP89gO' --output 91APP_BehaviorData_20191201.csv

In [None]:
# 202001-202006
!gdown --id '1F_1gAyEtmlW6NRQaHySCRLWtyFmN_7IK' --output 91APP_BehaviorData_20200101.csv
!gdown --id '1RZavefjlWwECDTHGtzT751HNg4h8m_BO' --output 91APP_BehaviorData_20200201.csv
!gdown --id '1e4C7YARl6ZICO3NYr740NGhx7QQn3jJs' --output 91APP_BehaviorData_20200301.csv
!gdown --id '1VNJSqsJpV-CtRse7aA5K0GPDMWloxEml' --output 91APP_BehaviorData_20200401.csv
!gdown --id '1DAbArrKDDmAefsZBDZk7SXeg0eYKNgWa' --output 91APP_BehaviorData_20200501.csv
!gdown --id '1_gxCmW5vRSozvrE23HdAX7yZS6mhaDXt' --output 91APP_BehaviorData_20200601.csv

In [None]:
ymd = ['20180601', '20180701', '20180801', '20180901', '20181001', '20181101', '20181201',
       '20190101', '20190201', '20190301', '20190401', '20190501', '20190601', '20190701',
       '20190801', '20190901', '20191001', '20191101', '20191201', '20200101', '20200201',
       '20200301', '20200401', '20200501', '20200601']
usecols = ['MemberId', 'HitTime', 'Behavior', 'SalePageId', 'EventTime']

# Read every monthly behaviour file, keeping only the needed columns.
monthly_frames = []
for i in ymd:
    df = pd.read_csv('/content/91APP_BehaviorData_'+i+'.csv', usecols=usecols, low_memory=False)
    # Drop rows that have a NaN in any of the selected columns.
    df = df.dropna(axis=0, how='any')
    monthly_frames.append(df)
    print(i+' finished')

# Concatenate once at the end: DataFrame.append was removed in pandas 2.0 and
# re-copied the accumulated frame on every iteration. ignore_index renumbers
# the rows 0..n-1, as the original explicit re-indexing did.
df_all = pd.concat(monthly_frames, ignore_index=True)
print(df_all)

In [None]:
# Convert the epoch-millisecond columns to datetimes shifted by +8 hours
# (presumably UTC -> Taipei time). The conversion is done on whole columns
# instead of one Python call per row, which was slow and memory-hungry.
from datetime import timedelta
utc_offset = timedelta(hours=8)
df_all['HitDateTime'] = pd.to_datetime(df_all['HitTime'], unit='ms') + utc_offset
df_all['EventDateTime'] = pd.to_datetime(df_all['EventTime'], unit='ms') + utc_offset

In [None]:
# Save df_all: try an Excel workbook first and fall back to pickle on failure.
try:
    fname = 'MembersBehavior' + ".xlsx"
    # The context manager writes and closes the workbook; ExcelWriter.save()
    # and the to_excel `encoding` argument no longer exist in pandas 2.x.
    with pd.ExcelWriter(fname) as writer:
        df_all.to_excel(writer, sheet_name='Sheet1')
except Exception as exc:
    # Catch Exception (not a bare except) so Ctrl-C still interrupts the cell,
    # and report why the Excel export was abandoned.
    print('Excel export failed ({}: {}); saving pickle instead'.format(type(exc).__name__, exc))
    fname = 'MembersBehavior' + ".pkl"
    with open(fname, "wb") as fp:
        pickle.dump(df_all, fp)
print('save ', fname)

In [None]:
!gdown --id '11u1K6il5PWQ-wsObkrSzbYVNnIxgxfe5' --output MembersBehavior_timechanged.pkl

In [None]:
# Load the preprocessed behaviour pickle file.
# NOTE: read_pickle unpickles the file; only load files this project produced.
MembersBehavior = pd.read_pickle('/content/MembersBehavior_timechanged.pkl')
MembersBehavior

####補資料(202007-202012)

In [None]:
# 202007-202012
!gdown --id '1uaawxG0b5OJn9yvdho2R9PkKnga1j3tI' --output 91APP_BehaviorData_20200701.csv
!gdown --id '1Ua5GdDYfAFeUaECKcS2v0aJSgcMY1tHl' --output 91APP_BehaviorData_20200801.csv
!gdown --id '1lG3lmlxXkdFknEsoc-ZwuhB23AqOl-YY' --output 91APP_BehaviorData_20200901.csv
!gdown --id '1Ka_ha0qxumklldLBlvmD3D2c8fOcbRlG' --output 91APP_BehaviorData_20201001.csv
!gdown --id '1Tbb1GqOhBTthe7HUAODdRmGClkAsmosq' --output 91APP_BehaviorData_20201101.csv
!gdown --id '1sc4EGutXpeYHn4FhH2qIrEPf369yzCCT' --output 91APP_BehaviorData_20201201.csv

In [None]:
ymd = ['20200701', '20200801', '20200901', '20201001', '20201101', '20201201']
usecols = ['MemberId', 'HitTime', 'Behavior', 'SalePageId', 'EventTime']

# Read every monthly behaviour file, keeping only the needed columns.
monthly_frames = []
for i in ymd:
    df = pd.read_csv('/content/91APP_BehaviorData_'+i+'.csv', usecols=usecols, low_memory=False)
    # Drop rows that have a NaN in any of the selected columns.
    df = df.dropna(axis=0, how='any')
    monthly_frames.append(df)
    print(i+' finished')

# Concatenate once at the end: DataFrame.append was removed in pandas 2.0 and
# re-copied the accumulated frame on every iteration. ignore_index renumbers
# the rows 0..n-1, as the original explicit re-indexing did.
df_all = pd.concat(monthly_frames, ignore_index=True)
print(df_all)

In [None]:
# Convert the epoch-millisecond columns to datetimes shifted by +8 hours
# (presumably UTC -> Taipei time), operating on whole columns rather than
# calling pd.to_datetime once per row.
from datetime import timedelta
utc_offset = timedelta(hours=8)
df_all['HitDateTime'] = pd.to_datetime(df_all['HitTime'], unit='ms') + utc_offset
df_all['EventDateTime'] = pd.to_datetime(df_all['EventTime'], unit='ms') + utc_offset

In [None]:
import pickle

# Save df_all: try an Excel workbook first and fall back to pickle on failure.
try:
    fname = 'MembersBehavior_v2' + ".xlsx"
    # The context manager writes and closes the workbook; ExcelWriter.save()
    # and the to_excel `encoding` argument no longer exist in pandas 2.x.
    with pd.ExcelWriter(fname) as writer:
        df_all.to_excel(writer, sheet_name='Sheet1')
except Exception as exc:
    # Catch Exception (not a bare except) so Ctrl-C still interrupts the cell,
    # and report why the Excel export was abandoned.
    print('Excel export failed ({}: {}); saving pickle instead'.format(type(exc).__name__, exc))
    fname = 'MembersBehavior_v2' + ".pkl"
    with open(fname, "wb") as fp:
        pickle.dump(df_all, fp)
print('save ', fname)

###整合201806-202012會員行為資料

In [None]:
#補資料(202007-202012)
!gdown --id '1RSwUysCRV_CAA-Ipr7dMht6MITYTbNwZ' --output MembersBehavior_v2.pkl

In [None]:
# Load the preprocessed pickle file for the 202007-202012 supplement.
# NOTE: read_pickle unpickles the file; only load files this project produced.
MembersBehavior_v2 = pd.read_pickle('/content/MembersBehavior_v2.pkl')
MembersBehavior_v2

In [None]:
# Combine the 201806-202006 and 202007-202012 member behaviour tables.
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
# and, like the original call, keeps each frame's existing index labels.
MembersBehavior = pd.concat([MembersBehavior, MembersBehavior_v2])

In [None]:
# Keep only members that have at least one "purchase" row.
ever_purchase_memberId_list = MembersBehavior[MembersBehavior["Behavior"]=="purchase"].MemberId.unique()
# Select those members' rows by MemberId label, then turn MemberId back into a column.
Members_df = MembersBehavior.set_index("MemberId")
Members_df = Members_df.loc[ever_purchase_memberId_list,:]
Members_df = Members_df.reset_index()

# Join the behaviour rows with the item vectors on SalePageId.
# score_dict maps each behaviour to the weight used in the weighted average below.
score_dict = {"checkout":1, "viewproduct":2, "add":4, "purchase":8}
merge_df = Members_df.merge(item_df, on="SalePageId")
merge_df = merge_df.loc[:,["MemberId", "Behavior", "Vector_avg"]]
# A Behavior value that is not a key of score_dict raises KeyError here.
merge_df["score"] = merge_df.Behavior.apply(lambda x : score_dict[x])

# One entry per member: the score-weighted average of the Vector_avg values
# of the items that member interacted with.
# NOTE(review): Vector_avg may be None for items without phrases - confirm
# such rows cannot reach np.average.
merge_vector_df = merge_df.groupby("MemberId").apply(lambda x: np.average(x['Vector_avg'], weights=x['score']))
merge_vector_df

In [None]:
# merge_vector_df.to_pickle("vector_df.pkl")

# Part3

In [None]:

"""
Naive Idea:
1. Apply dimensionality reduction (PCA) to the product data and the user behavioral data
2. Use cosine similarity to rank the products to recommend
3. Choose the top K products
4. Examine whether the user actually buys the product in the future (time frame: 6 months)
"""


In [None]:
import pandas as pd
import random
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
#商品向量(Part1最終結果)
!gdown --id '1BKV0_PgPAjD729DZY3RRMZlSGMib4cID' --output item_df.pkl
#會員向量(Part2最終結果)
!gdown --id '1Kj3PEMq9LI-Vd9oTuPztyNLR9kZQC7wJ' --output vector_df.pkl
!gdown --id 1YVz3rpIKtZVxC7pxEmTUnpRPtdnGHoEn --output SalePageData.csv

In [None]:
# Load the preprocessed pickle files: member vectors and the item frame.
# NOTE: read_pickle unpickles the file; only load files this project produced.
members_df = pd.read_pickle('/content/vector_df.pkl')
item_df = pd.read_pickle('/content/item_df.pkl')

In [None]:
item_df

In [None]:
#for a given person
# Build [title, cosine similarity] pairs between a member vector and every item vector.
# NOTE(review): sampling_list is not assigned in any earlier cell of this
# notebook, only in a later one - confirm the intended execution order.
# NOTE(review): rank_list is re-created for each person, so after the loop it
# holds only the last person's scores.
for person in sampling_list:
  rank_list = []
  # NOTE(review): the range starts at 1, so the item at index 0 is never scored - confirm intended.
  for product in range(1, item_df.shape[0]):
      # Rows 635 and 1423 are skipped; presumably their Vector_avg is None - TODO confirm.
      if product not in [635, 1423]:
        rank_list.append([item_df["SalePageTitle"][product],float(cosine_similarity(members_df[person].reshape(1,-1), item_df["Vector_avg"][product].reshape(1,-1)))])
      

In [None]:
def SortByScore(sub_li):
    """Sort ``sub_li`` in place by each entry's second element, largest first, and return it."""
    def _score(entry):
        return entry[1]

    sub_li.sort(key=_score, reverse=True)
    return sub_li
sorted_rank_list = SortByScore(rank_list)

In [None]:
# Top 10 rows of the sorted [product, similarity] ranking, shown as a table.
product_preferene_frame = pd.DataFrame(sorted_rank_list, columns = ["product", "similarity"]).head(10)
product_preferene_frame

In [None]:
# Evaluation idea: check whether the most recent purchase falls within the 10
# recommended candidates; buying a higher-ranked product earns more weight.
# Simple version: 1 if the purchase is in the list, otherwise 0; the total
# divided by the number of orders is the accuracy.
# Complex version: higher-ranked recommendations weigh more - buying the
# rank-1 product scores 10 points and the rank-10 product scores 1. Sum all
# scores and see which range the total falls into (e.g. with 100 people the
# maximum is 1000, and 800-1000 could mean the algorithm works very well).
purchase = Members_df["Behavior"] == "purchase"
print(Members_df[purchase])

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')
!ls

import os
import pandas as pd
import numpy as np
from datetime import timedelta
from datetime import datetime, date
import random
from random import sample 
import pandas as pd
import random
from sklearn.metrics.pairwise import cosine_similarity

# Folder on the mounted Drive that holds the monthly behaviour CSV files.
raw_path = os.path.join('/content/gdrive/MyDrive/BDA')

ymd = ['20200701', '20200801', '20200901', '20201001', '20201101', '20201201', '20210101',
       '20210201', '20210301', '20210401', '20210501', '20210601', '20210701', '20210801',
       '20210901', '20211001', '20211101', '20211201', '20220101', '20220201',
       '20220301', '20220401']
usecols = ['MemberId', 'HitTime', 'Behavior', 'SalePageId', 'EventTime']

# Read every monthly behaviour file, keeping only the needed columns.
monthly_frames = []
for i in ymd:
    # The original spelled the loop variable here with a full-width "ｉ"; it
    # only worked because Python normalises identifiers. Plain ASCII is used now.
    df = pd.read_csv(os.path.join(raw_path, "91APP_BehaviorData_" + i + ".csv"), usecols = usecols)
    # Drop rows that have a NaN in any of the selected columns.
    df = df.dropna(axis=0, how='any')
    monthly_frames.append(df)
    print(i+' finished')

# Concatenate once at the end: DataFrame.append was removed in pandas 2.0 and
# re-copied the accumulated frame on every iteration. ignore_index renumbers
# the rows 0..n-1, as the original explicit re-indexing did.
df_all = pd.concat(monthly_frames, ignore_index=True)
print(df_all)



In [None]:
# Turn the epoch-millisecond columns into datetimes shifted by +8 hours,
# overwriting the raw values in place.
eight_hours = timedelta(hours = 8)
for column in ("HitTime", "EventTime"):
    df_all[column] = pd.to_datetime(df_all[column], unit = 'ms') + eight_hours

In [None]:
# Start and end datetimes of the evaluation period.
st = datetime(2020,7,1)
et = datetime(2022,4,30)

In [None]:
pages = pd.read_csv(os.path.join('/content', 'SalePageData.csv'))

# Window boundaries: st, st+60d, st+120d, ... for as long as the next
# boundary is still strictly before et.
temp_time = st
time_list = [st]
while temp_time + timedelta(days = 60) < et:
    temp_time = temp_time + timedelta(days = 60)
    time_list.append(temp_time)

# Behaviour -> weight used for the per-member weighted average (loop-invariant).
score_dict = {"checkout":1, "viewproduct":2, "add":4, "purchase":8}

for window_start, window_end in zip(time_list[:-1], time_list[1:]):
    # Events strictly inside the window, joined with the sale-page titles.
    df_temp = df_all[(df_all["EventTime"] > window_start) & (df_all["EventTime"] < window_end)]
    df_temp = df_temp.merge(pages, left_on = "SalePageId", right_on = "SalePageId")

    ever_purchase_memberId_list = df_temp[df_temp["Behavior"]=="purchase"].MemberId.unique()
    # Re-seeding per window makes every window draw the same positions.
    random.seed(1)
    # sample() raises ValueError when asked for more elements than exist, so
    # cap the sample size at the number of purchasing members.
    buyer_count = len(ever_purchase_memberId_list)
    sampling_list = sample([i for i in range(buyer_count)], min(100, buyer_count))

    for person in sampling_list:
        personal_temp = df_temp[df_temp["MemberId"] == ever_purchase_memberId_list[person]]
        Members_df = personal_temp.set_index("MemberId")
        Members_df = Members_df.reset_index()

        # Join this member's events with the item vectors.
        merge_df = Members_df.merge(item_df, on="SalePageId")
        merge_df = merge_df.loc[:,["MemberId", "Behavior", "Vector_avg"]]
        if merge_df.empty:
            # None of this member's pages has an entry in item_df, so no
            # member vector can be computed; skip instead of crashing.
            continue
        merge_df["score"] = merge_df.Behavior.apply(lambda x : score_dict[x])
        merge_vector_df = merge_df.groupby("MemberId").apply(lambda x: np.average(x['Vector_avg'], weights=x['score']))

        # The result is indexed by MemberId, so take the single member's
        # vector by position; [0] would be treated as the label 0.
        member_vector = merge_vector_df.iloc[0].reshape(1,-1)

        rank_list = []
        for product in range(1, item_df.shape[0]):
            if product not in [635, 1423]:
                similarity = cosine_similarity(member_vector, item_df["Vector_avg"][product].reshape(1,-1))
                # Pull the scalar out of the 1x1 result explicitly; float() on
                # a non-0-d array is deprecated in recent NumPy.
                rank_list.append([item_df["SalePageTitle"][product], float(similarity[0, 0])])
        print(rank_list)

    # Remaining plan for each window:
    #   - sample members and compute each member's vector (look up item_df
    #     and take the weighted average)
    #   - for each person, output the products we recommend
    #   - compare with the member's purchase data to compute the accuracy and
    #     the weighted ("complex") metric
    #
    # TODO :
    #   1. item
    #   3. member (from item, input:df_temp)
    #   2. similarity



In [None]:
pages