In [None]:
# possible analyzing tools / ML projects / Recommender System for reference
# https://medium.com/data-scientists-playground/wide-deep%E6%A8%A1%E5%9E%8B-%E6%8E%A8%E8%96%A6%E7%B3%BB%E7%B5%B1-%E5%8E%9F%E7%90%86-8badacf777f3
# https://github.com/tensorflow/models/tree/master/official/wide_deep
# https://tensorflow.juejin.im/get_started/feature_columns.html
# https://www.slideshare.net/JamesKirk58/boston-ml-architecting-recommender-systems


In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import seaborn as sn
import matplotlib.pyplot as plt
%matplotlib inline
import time

#set chinese font
from matplotlib.font_manager import FontProperties
sn.set(font=['sans-serif'])
sn.set_style("whitegrid",{"font.sans-serif":['Microsoft JhengHei']})

pd.set_option('display.max_columns', 100)
pd.options.display.max_rows = 999

import sys
sys.path.append("..")
from _module_.dbtool import io, compute

import warnings
warnings.filterwarnings("ignore")

## 會員瀏覽(by member_id & 料號) + 會員購買

In [3]:
# 瀏覽數據
df_pno = io.read_table('select * from felixlin."Proj006_browse+purchase_browse2019_bypno" limit 200000')

df_pno['ndate'] = df_pno['date'].apply(lambda x: len(x))
df_pno['nsess'] = df_pno['sessionnumber'].apply(lambda x: len(x))
df_pno['ntime'] = df_pno['time'].apply(lambda x: len(x))

def time_diff(x):
    tmp = np.sort(x)
    tmp = np.array([(tmp[i]-tmp[i-1]).seconds for i in np.arange(1,len(tmp))])
    return([np.median(tmp), tmp.std()])

df_pno[['freq1','freq2']] = df_pno['time'].apply(lambda x: pd.Series(time_diff(x)))
df_pno.head(2)

Connection Established.


Unnamed: 0,member_id,p_no,p_name,level1,level2,level3,date,sessionnumber,time,ndate,nsess,ntime,freq1,freq2
0,100010,502400149502,吹風機/TESCOM 大風量負離子吹風機 TID192TW 白,生活家電,吹風機,TESCOM,[2019-07-01],[27440359],"[2019-07-01 22:42:18.694000, 2019-07-01 22:45:...",1,1,4,5.0,85.560894
1,100010,502401809532,吹風機/TESCOM TID292TW 大風量負離子吹風機 白,生活家電,吹風機,TESCOM,[2019-07-01],[27440359],"[2019-07-01 22:43:07.958000, 2019-07-01 22:45:...",1,1,2,116.0,0.0


In [4]:
# 購買數據 & 合併瀏覽
df_pno2 = io.read_table('felixlin."Proj006_browse+purchase_purchase2019_bypno"','std')

df_merge = df_pno.merge(df_pno2[['member_id','p_no', 'eorder_no', 'time']].rename({'time':'buytime'}, axis=1), 
                        on=['member_id','p_no'], how='left')
df_merge.head(1)

Connection Established.


Unnamed: 0,member_id,p_no,p_name,level1,level2,level3,date,sessionnumber,time,ndate,nsess,ntime,freq1,freq2,eorder_no,buytime
0,100010,502400149502,吹風機/TESCOM 大風量負離子吹風機 TID192TW 白,生活家電,吹風機,TESCOM,[2019-07-01],[27440359],"[2019-07-01 22:42:18.694000, 2019-07-01 22:45:...",1,1,4,5.0,85.560894,,NaT


In [5]:
#p_no, level1, level2 都要有值
df_merge = df_merge[df_merge.p_no.notnull()]
df_merge = df_merge[df_merge.level2.notnull()]
df_merge = df_merge[df_merge.level3.notnull()]

#### 加入品牌資訊

In [93]:
# 獨資料
df_brand = io.read_table('felixlin."002product_brand_ec_v2"','std')
df_itemno = io.read_table('select item_no, brand from felixlin."002product_brand_v3"')

Connection Established.
Connection Established.


In [96]:
# 清理資料
brand_dict = {df_brand.iloc[i][0]:df_brand.iloc[i][1] for i in np.arange(df_brand.shape[0])}

In [97]:
# 合併表格
df_merge = df_merge.merge(df_itemno, left_on='p_no', right_on='item_no', how='left')
df_merge = df_merge.drop('item_no',1)
# 轉成乾淨的品牌
df_merge['brand'] = df_merge['brand'].map(brand_dict)
df_merge.head(1)

Unnamed: 0,member_id,p_no,p_name,level1,level2,level3,date,sessionnumber,time,ndate,nsess,ntime,freq1,freq2,eorder_no,buytime,buy,item_no_x,brand_x,item_no_y,brand_y,brand
0,100010,502400149502,吹風機/TESCOM 大風量負離子吹風機 TID192TW 白,生活家電,吹風機,TESCOM,[2019-07-01],[27440359],"[2019-07-01 22:42:18.694000, 2019-07-01 22:45:...",1,1,4,5.0,85.560894,,NaT,1,502400149502,東元,502400149502,TESCOM,東元


In [None]:
del df_pno, df_pno2, df_brand, df_itemno

# TensorRec

In [None]:
#tensorrec
#<module 'tensorrec' from 'c:\\users\\017084\\appdata\\local\\continuum\\anaconda3\\envs\\work\\lib\\site-packages\\tensorrec\\__init__.py'>

In [7]:
from collections import defaultdict
import random
from scipy import sparse
import tensorrec
import logging
logging.getLogger().setLevel(logging.INFO)

In [27]:
# 將id做對應，產生dictionary
def value_map(df, col):
    uni = np.sort(df[col].fillna('NaN').unique())
    return({uni[i]:i for i in np.arange(uni.shape[0])})

value_map_reverse = lambda x: {value:key for key, value in x.items()}

def append_value_map(old_dict, new_df, col):
    max_index = max(old_dict.values())
    uni = np.sort(new_df[col].unique())
    uni = np.setdiff1d(uni, list(old_dict.keys()))
    new_dict = old_dict.copy()
    new_dict.update({uni[i]:(max_index+i+1) for i in np.arange(uni.shape[0])})
    return(new_dict)

# This method converts a list of (user, item, rating, time) to a sparse matrix
def interactions_to_sparse(interactions, n_users, n_items):
    users_column, items_column, buy_column  = zip(*interactions.values.tolist())
    return(sparse.coo_matrix((buy_column, (users_column, items_column)),shape=(n_users, n_items)))

# This method consumes item ranks for each user and prints out recall@10 train/test metrics
def check_results(ranks, sparse_train2, sparse_test2, k=10):
    train_recall_at_10 = tensorrec.eval.__recall_at_k(test_interactions=sparse_train2,predicted_ranks=ranks,k=k).mean()
    test_recall_at_10 = tensorrec.eval.__recall_at_k(test_interactions=sparse_test2,predicted_ranks=ranks,k=k).mean()
    print("Recall at {}: Train: {:.4f} Test: {:.4f}".format(k,train_recall_at_10,test_recall_at_10))

### 整理會員和商品數據

In [9]:
# 數據清理
df_merge['buy'] = df_merge.buytime.apply(lambda x: 1 if pd.isnull(x) else 2)
df_use = df_merge[['member_id','p_no','buy']]

# 要去掉為NULL的數據
mmb_map= value_map(df_use, 'member_id')
item_map= value_map(df_use, 'p_no')

df_use['member_id'] = df_use['member_id'].map(mmb_map)
df_use['p_no'] = df_use['p_no'].map(item_map)

# unique的會員數和商品數
n_users = df_use.member_id.nunique()
n_items = df_use.p_no.nunique()

In [10]:
# 拆分成training 和 testing
df_use = df_use.sample(frac=1)
cutoff = int((df_use.shape[0])*0.7)
train_ratings = df_use[:cutoff]
test_ratings = df_use[cutoff:]

# 轉成Sparse Matrix
sparse_train = interactions_to_sparse(train_ratings, n_users, n_items)
sparse_test = interactions_to_sparse(test_ratings, n_users, n_items)

In [11]:
del df_use, train_ratings, test_ratings

In [12]:
# Construct indicator features for users and items
user_indicator = sparse.identity(n_users)
item_indicator = sparse.identity(n_items)

In [13]:
def batch_predict_rank(model, ufeat, ifeat, bsize=1000, nrank=300):
    length = ufeat.shape[0]
    blen = int(length/bsize)+1 if length%bsize!=0 else int(length/bsize)
    ufeat_tmp = ufeat.tocsr()
    for i in np.arange(blen):
        if i != blen-1:
            rank_data = model.predict_rank(user_features=ufeat_tmp[(i*bsize):((i+1)*bsize),],item_features=ifeat)
        else:
            rank_data = model.predict_rank(user_features=ufeat_tmp[(i*bsize):,],item_features=ifeat)
        rank_data = sparse.csr_matrix(rank_data*np.less(rank_data, (nrank + 1)))
        arr = rank_data if i == 0 else sparse.vstack([arr, rank_data])
    return(arr)

In [14]:
df_merge.head(1)

Unnamed: 0,member_id,p_no,p_name,level1,level2,level3,date,sessionnumber,time,ndate,nsess,ntime,freq1,freq2,eorder_no,buytime,buy
0,100010,502400149502,吹風機/TESCOM 大風量負離子吹風機 TID192TW 白,生活家電,吹風機,TESCOM,[2019-07-01],[27440359],"[2019-07-01 22:42:18.694000, 2019-07-01 22:45:...",1,1,4,5.0,85.560894,,NaT,1


### Content-Based Item Data

In [111]:
# 數據清理
df_item = df_merge[['p_no', 'p_name','level1','level2','level3', 'brand']]

level1_map= value_map(df_item, 'level1')
level2_map= value_map(df_item, 'level2')
level3_map= value_map(df_item, 'level3')
brand_map = value_map(df_merge, 'brand')

df_item['p_no'] = df_item['p_no'].map(item_map)
df_item['level1'] = df_item['level1'].map(level1_map)
df_item['level2'] = df_item['level2'].map(level2_map)
df_item['level3'] = df_item['level3'].map(level3_map)
df_item['brand'] = df_item['brand'].map(brand_map)

item_tmp = df_item[['p_no','level1','level2','level3','brand']].drop_duplicates(['p_no','level1','level2','level3','brand']).sort_values(['p_no','level1','level2','level3','brand'])
# 檢查p_no和level1,level2是否1->1
# df_item[['p_no','level1','level2']].p_no.nunique()

item_level1 = sparse.coo_matrix(pd.get_dummies(item_tmp['level1']))
item_level2 = sparse.coo_matrix(pd.get_dummies(item_tmp['level2']))
item_level3 = sparse.coo_matrix(pd.get_dummies(item_tmp['level3']))
item_brand = sparse.coo_matrix(pd.get_dummies(item_tmp['brand']))
item_levels = sparse.hstack([item_level1, item_level2, item_level3, item_brand])
n_item = item_levels.shape[1]

In [124]:
item_levels = sparse.hstack([item_level1, item_level2, item_level3])
n_item = item_levels.shape[1]

## 加入會員 META DATA

In [34]:
df_mmb = io.read_table('felixlin."Proj006_browse+purchase_mmb_info"','std')

Connection Established.


In [35]:
# 年齡層
bins= [-999,0,20,30,40,50,60,120]
labels = np.arange(len(bins)-1)
df_mmb['age'] = df_mmb['age'].fillna(-1)
df_mmb['age'] = pd.cut(df_mmb['age'], bins=bins, labels=labels, right=False)

# 性別
df_mmb['sex'] = df_mmb.sex.fillna('N')
sex_map = value_map(df_mmb,'sex')
df_mmb['sex'] = df_mmb['sex'].map(sex_map)

# 註冊日
bins= [0,180,360,720,10000]
labels = np.arange(len(bins)-1)
df_mmb['rdays'] = pd.cut(df_mmb['rdays'], bins=bins, labels=labels, right=False)
if df_mmb.rdays.isnull().any():
    df_mmb['rdays'] = df_mmb['rdays'].fillna(len(bins)-1)

# 裝置
df_mmb['os_type'] = df_mmb.os_type.fillna('z')
os_map = value_map(df_mmb,'os_type')
df_mmb['os_type'] = df_mmb['os_type'].map(os_map)

# 地區
dict_city={"花蓮縣":"東部與外島地區", "宜蘭縣":"北部地區", "台東縣":"東部與外島地區", "南投縣":"中部地區", 
           "金門縣":"東部與外島地區", "嘉義縣":"南部地區", "基隆市":"北部地區", "彰化縣":"中部地區", 
           "台中市":"中部地區", "雲林縣":"南部地區", "高雄市":"南部地區", "桃園縣":"北部地區", 
           "新竹縣":"北部地區", "新竹市":"北部地區", "苗栗縣":"中部地區", "新北市":"北部地區", 
           "台北市":"北部地區", "連江縣":"東部與外島地區", "桃園市":"北部地區", "屏東縣":"南部地區", 
           "台南市":"南部地區", "嘉義市":"南部地區", "澎湖縣":"東部與外島地區", "未知":"未知"}

df_mmb['c_city'] = df_mmb.c_city.fillna('未知')
df_mmb['c_city'] = df_mmb['c_city'].map(dict_city)
city_map = value_map(df_mmb,'c_city')
df_mmb['c_city'] = df_mmb['c_city'].map(city_map)

In [36]:
# 消失的會員，補上遺失資料
expectmmb = set(df_merge.member_id.unique())
getmmb = set(df_mmb[df_mmb.member_id.isin(df_merge.member_id)].member_id.unique())
if len(getmmb)<len(expectmmb):
    diff = expectmmb.difference(getmmb)
    print('{} members are not listed in member info.\nAlready added to member info.'.format(len(diff)))
    df_mmb = pd.concat([df_mmb, pd.DataFrame([[i,0,2,4,3,2] for i in diff], columns=['member_id', 'age','sex','rdays','c_city','os_type'])],0)
else:
    print("All members are listed")

#df_mmb = pd.concat([df_mmb, pd.DataFrame([['1242148',0,2,0,3,2]], columns=['member_id', 'age','sex','rdays','c_city','os_type'])],0)

1 members are not listed in member info.
Already added to member info.


In [37]:
# 會員資料的ID轉換成建模時的ID
df_mmb2 = df_mmb[df_mmb.member_id.isin(df_merge.member_id)]
df_mmb2['member_id'] = df_mmb2.member_id.map(mmb_map)

# Create User Full Matrix
df_mmb2 = df_mmb2.sort_values('member_id')
user_meta = sparse.hstack([sparse.coo_matrix(pd.get_dummies(df_mmb2['age'])),
                           sparse.coo_matrix(pd.get_dummies(df_mmb2['sex'])),
                           sparse.coo_matrix(pd.get_dummies(df_mmb2['rdays'])),
                           sparse.coo_matrix(pd.get_dummies(df_mmb2['os_type'])),
                           sparse.coo_matrix(pd.get_dummies(df_mmb2['c_city']))
                          ])
# Use Hybrid Model
item_full = sparse.hstack([item_indicator, item_levels])
user_full = sparse.hstack([user_indicator, user_meta])

In [38]:
pno_map_to_pname = df_merge[['p_no','p_name']].drop_duplicates(['p_no','p_name']).set_index('p_no').p_name.to_dict()
item_map_reverse = value_map_reverse(item_map)
mmb_map_reverse = value_map_reverse(mmb_map)

# Modeling

## Collaborative Filter

In [39]:
# Build a matrix factorization collaborative filter model
cf_model = tensorrec.TensorRec(n_components=5)
# Fit the collaborative filter model
print("Training collaborative filter")
cf_model.fit(interactions=sparse_train,user_features=user_indicator,item_features=item_indicator, user_batch_size=500)

Training collaborative filter


In [40]:
# Create sets of train/test interactions that are only ratings >= 4.0
sparse_train2 = sparse_train.multiply(sparse_train >= 2.0)
sparse_test2 = sparse_test.multiply(sparse_test >= 2.0)    

# Check the results of the MF CF model
print("Matrix factorization collaborative filter:")
predicted_ranks = batch_predict_rank(cf_model,user_indicator,item_indicator,2000)
#predicted_ranks = cf_model.predict_rank(user_features=user_indicator,item_features=item_indicator)
check_results(predicted_ranks,sparse_train2, sparse_test2)

Matrix factorization collaborative filter:
Recall at 10: Train: 0.0006 Test: 0.0006


In [41]:
# Let's try a new loss function: WMRB
print("Training collaborative filter with WMRB loss")
ranking_cf_model = tensorrec.TensorRec(n_components=5,loss_graph=tensorrec.loss_graphs.WMRBLossGraph())
ranking_cf_model.fit(interactions=sparse_train,user_features=user_indicator,item_features=item_indicator,n_sampled_items=int(n_items * .01))
# Check the results of the WMRB MF CF model
print("WMRB matrix factorization collaborative filter:")
#predicted_ranks = ranking_cf_model.predict_rank(user_features=user_indicator,item_features=item_indicator)

Training collaborative filter with WMRB loss
WMRB matrix factorization collaborative filter:


In [42]:
#### Try Batches
print("WMRB matrix factorization collaborative filter:")
predicted_ranks = batch_predict_rank(ranking_cf_model,user_indicator,item_indicator,2000)
# 看最後的分數
check_results(predicted_ranks,sparse_train2, sparse_test2, 10)

WMRB matrix factorization collaborative filter:
Recall at 10: Train: 0.1118 Test: 0.0395


## Content-Based Model

In [None]:
# Fit a content-based model using the genres as item features
print("Training content-based recommender...")
content_model = tensorrec.TensorRec(n_components=n_item,item_repr_graph=tensorrec.representation_graphs.FeaturePassThroughRepresentationGraph(),
                                    loss_graph=tensorrec.loss_graphs.WMRBLossGraph())
content_model.fit(interactions=sparse_train,user_features=user_indicator,
                  item_features=item_levels,n_sampled_items=int(n_items * .01),user_batch_size=500)

Training content-based recommender...


In [None]:
# Check the results of the content-based model
print("Content-based recommender:")
#predicted_ranks = content_model.predict_rank(user_features=user_indicator,item_features=item_levels)
predicted_ranks = batch_predict_rank(content_model,user_indicator,item_levels,1000)
check_results(predicted_ranks,sparse_train, sparse_test, 10)

In [None]:
check_results(predicted_ranks,sparse_train, sparse_test, 30)

## Hybrid Model

In [None]:
# Try concatenating the genres on to the indicator features for a hybrid recommender system
item_full = sparse.hstack([item_indicator, item_levels])
print("Training hybrid recommender")

In [None]:
hybrid_model = tensorrec.TensorRec(n_components=20,loss_graph=tensorrec.loss_graphs.WMRBLossGraph())
hybrid_model.fit(interactions=sparse_train,user_features=user_indicator,
                 item_features=item_full,n_sampled_items=int(n_items * .01),user_batch_size=500)
print("Hybrid recommender:")
predicted_ranks = batch_predict_rank(hybrid_model, user_indicator,item_full)

In [None]:
check_results(predicted_ranks,sparse_train, sparse_test, 10)

#### 看單一會員的推薦狀況

In [None]:
# Pull user 432's features out of the user features matrix and predict movie ranks for just that user
mmbtest = 400
u432_features = sparse.csr_matrix(user_indicator)[mmbtest]
u432_rankings = hybrid_model.predict_rank(user_features=u432_features,item_features=item_full)[0]
# Get internal IDs of User 432's top 10 recommendations
# These are sorted by item ID, not by rank
# This may contain items with which User 432 has already interacted
u432_top_ten_recs = np.where(u432_rankings <= 30)[0]
u432_top_ten_recs

print("User {} recommendations:".format(mmbtest))
for m in u432_top_ten_recs:
    print('{}\t{}'.format(item_map_reverse[m], pno_map_to_pname[item_map_reverse[m]]))

In [None]:
df_merge[df_merge.member_id==mmb_map_reverse[mmbtest]]

## Hybrid Model w/ 會員數據

In [None]:
print("Training hybrid recommender")
hybrid_model = tensorrec.TensorRec(n_components=5,loss_graph=tensorrec.loss_graphs.WMRBLossGraph())
hybrid_model.fit(interactions=sparse_train,user_features=user_full,
                 item_features=item_full,n_sampled_items=int(n_items * .01), user_batch_size=1000)

#print("Hybrid recommender:")
predicted_ranks = batch_predict_rank(hybrid_model, user_full,item_full)
check_results(predicted_ranks,sparse_train, sparse_test, 10)

In [None]:
check_results(predicted_ranks,sparse_train, sparse_test, 30)

In [None]:
print("Training hybrid recommender")
hybrid_model = tensorrec.TensorRec(n_components=9,loss_graph=tensorrec.loss_graphs.WMRBLossGraph())
hybrid_model.fit(interactions=sparse_train,user_features=user_full,
                 item_features=item_full,n_sampled_items=int(n_items * .01), user_batch_size=1000)

#print("Hybrid recommender:")
predicted_ranks = batch_predict_rank(hybrid_model, user_full,item_full)
check_results(predicted_ranks,sparse_train, sparse_test, 10)

In [None]:
check_results(predicted_ranks,sparse_train, sparse_test, 30)

## 使用tensorrec內建的NN ReLU作為graph representation

In [None]:
from tensorrec.representation_graphs import (ReLURepresentationGraph, AbstractKerasRepresentationGraph, 
    LinearRepresentationGraph, NormalizedLinearRepresentationGraph, AbstractRepresentationGraph)
from tensorrec.prediction_graphs import CosineSimilarityPredictionGraph

In [None]:
tm1 = compute.timerec()
#tm2 = compute.timerec()

In [None]:
# Use Hybrid Model
tm1.start()
print("Training hybrid neural network recommender")
hybrid_model1 = tensorrec.TensorRec(n_components=10,
                                   loss_graph=tensorrec.loss_graphs.BalancedWMRBLossGraph(),
                                   user_repr_graph = LinearRepresentationGraph(),
                                   item_repr_graph = LinearRepresentationGraph()#,
                                   #attention_graph = AbstractRepresentationGraph()
                                   )
hybrid_model1.fit(interactions=sparse_train,user_features=user_full,
                 item_features=item_full,n_sampled_items=int(n_items * .01), user_batch_size=1000)
tm1.end()
tm1.delta

In [None]:
print("Hybrid recommender:")
predicted_ranks = batch_predict_rank(hybrid_model1, user_full,item_full, bsize=1000)
check_results(predicted_ranks,sparse_train, sparse_test, 10)

In [None]:
check_results(predicted_ranks,sparse_train, sparse_test, 30)

In [None]:
sim1 = hybrid_model1.predict_similar_items(item_full, np.arange(1,100), 5)

In [None]:
for i in range(len(sim1)):
    print('item_id: {}\t{}\nSimilar items:'.format(i, pno_map_to_pname[item_map_reverse[i]]))
    for j in sim1[i]:
        print(pno_map_to_pname[item_map_reverse[j[0]]])
    print('\n')