In [1]:
from scipy import sparse as sp
from lightfm import LightFM
from lightfm.evaluation import precision_at_k,recall_at_k,auc_score
import copy
import itertools
import numpy as np 
import lightfm
import pickle 
import pandas as pd 



In [2]:
#fundid_names_df.to_csv('./funds/fundid_to_name.csv',index=False)

# Load the pre-built train/test interaction matrices and the index <-> id lookup tables.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open('./funds/sp_funds_datasets.pickle', 'rb') as pickle_file:
    data = pickle.load(pickle_file)

test, train = data['test'], data['train']
user_idxs = data['user_idxs']
idx_to_userid, userid_to_idx = data['idx_to_userid'], data['userid_to_idx']
idx_to_itemid, itemid_to_idx = data['idx_to_itemid'], data['itemid_to_idx']

# Fund code -> fund display name, read from a cp950-encoded CSV.
fundid_names_df = pd.read_csv('./funds/fundid_to_name.csv', encoding='cp950')
fundid_to_names = {
    record['基金代碼']: record['基金中文名稱']
    for record in fundid_names_df.to_dict('records')
}
#%% 

In [3]:
# Fit a WARP-loss LightFM model on the training interactions.
# A fixed random_state makes the fit, and therefore every metric below, repeatable
# across Restart & Run All; without it the model is initialised differently each run.
SEED = 42
model = LightFM(learning_rate=0.01, loss='warp', random_state=SEED)
model.fit(train, epochs=10)

# Mean precision@10 / recall@10 over users, on the train and the test interactions.
# NOTE(review): the test metrics are computed without `train_interactions=train`,
# so items already present in a user's training row are presumably still ranked
# and can occupy top-k slots -- confirm this is the intended protocol.
train_precision = precision_at_k(model, train, k=10).mean()
test_precision = precision_at_k(model, test, k=10).mean()
train_recall = recall_at_k(model, train, k=10).mean()
test_recall = recall_at_k(model, test, k=10).mean()

train_auc = auc_score(model, train).mean()
test_auc = auc_score(model, test).mean()
print('Recall: train {:.2f}%, test {:.2f}%'.format(100*train_recall,100*test_recall))
print('Precision: train {:.2f}% , test {:.2f}%.'.format(100*train_precision, 100*test_precision))
print('AUC: train {:.2f}, test {:.2f}.'.format(train_auc, test_auc))

Recall: train 20.99%, test 19.34%
Precision: train 9.41% , test 1.93%.
AUC: train 0.92, test 0.91.


In [8]:
def sample_recommendation_original(model, data, user_ids, print_output=True, n_show=3):
    """Print known positives and exact top-scoring recommendations for some users.

    Every item is scored with ``model.predict`` and ranked by descending score.
    Item indices are translated to fund ids through the notebook-level
    ``idx_to_itemid`` mapping and to display names through ``fundid_to_names``.

    Parameters
    ----------
    model : object exposing ``predict(user_id, item_indices)``.
    data : mapping whose ``'train'`` entry is a sparse user-by-item matrix.
        Any scipy sparse format is accepted; it is converted to CSR for row access.
    user_ids : iterable of int
        Row indices into the train matrix.
    print_output : bool
        Print the report when truthy.
    n_show : int
        How many known positives and how many recommendations to print per user.

    Returns
    -------
    None
    """
    # Row slicing needs CSR; tocsr() is a no-op for matrices that already are CSR.
    train = data['train'].tocsr()
    all_item_idxs = np.arange(train.shape[1])

    for user_id in user_ids:
        known_positive_idxs = train[user_id].indices[:n_show]
        known_positives_item_names = [
            fundid_to_names[idx_to_itemid[e]] for e in known_positive_idxs
        ]

        scores = model.predict(user_id, all_item_idxs)
        # Only the best n_show items are displayed, so only those are translated.
        top_items_ids = [idx_to_itemid[e] for e in np.argsort(-scores)[:n_show]]

        if print_output:
            print("User %s" % user_id)
            print("     Known positives:")

            for x in known_positives_item_names:
                print("        %s" % x)

            print("     Recommended:")

            for x in top_items_ids:
                print("        %s" % fundid_to_names[x])

In [44]:
# Show known positives and exact top recommendations for user indices 0, 1 and 2.
sample_recommendation_original(model,data,range(3))

User 0
     Known positives:
        (百元基金)摩根美國複合收益債券基金-JPM-A股累計(美元)
        富達中國聚焦基金(年配)-配息帳戶-美元
        (百元基金)貝萊德歐洲價值型基金HEDGED A2(美元)
     Recommended:
        (百元基金)貝萊德中國基金A2(美元)
        安聯收益成長基金-AM(穩定月收類股)(美元)
        瑞銀(盧森堡)生化股票基金(美元)
User 1
     Known positives:
        柏瑞印度股票基金A(美元)
        (百元基金)貝萊德歐洲價值型基金HEDGED A2(美元)
        瑞銀(盧森堡)生化股票基金(美元)
     Recommended:
        安聯收益成長基金-AM(穩定月收類股)(美元)
        (百元基金)貝萊德中國基金A2(美元)
        瑞銀(盧森堡)生化股票基金(美元)
User 2
     Known positives:
        摩根日本(日圓)基金
        (百元基金)貝萊德世界能源基金(美元)
        (百元基金)貝萊德中國基金A2(美元)
     Recommended:
        安聯收益成長基金-AM(穩定月收類股)(美元)
        (百元基金)貝萊德中國基金A2(美元)
        (百元基金)永豐滬深300紅利指數基金


Next, an Annoy example: we serve user recommendations from an approximate nearest-neighbour index, using a neat trick outlined by the
[Xbox recommendations team](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/XboxInnerProduct.pdf), which turns a maximum-inner-product search into a nearest-neighbour search.

In [21]:
import annoy 
from annoy import AnnoyIndex

In [12]:
# Item embedding matrix of the fitted model; rows are addressed by item index.
item_vectors = model.item_embeddings

In [80]:
# Reduce maximum-inner-product search to angular nearest-neighbour search:
# pad every item vector with sqrt(max_norm^2 - norm^2) so that all padded vectors
# share the same norm (max_norm). For a query padded with 0, the angular ordering
# of the items then coincides with their ordering by inner product.
norms = np.linalg.norm(item_vectors, axis=1)
max_norm = norms.max()
# Clip at 0 so floating-point rounding can never hand a negative value to sqrt.
extra_dimension = np.sqrt(np.maximum(max_norm ** 2 - norms ** 2, 0))
norm_data = np.append(
    item_vectors, extra_dimension.reshape(norms.shape[0], 1), axis=1)

f_member = norm_data.shape[1]  # Length of item vector that will be indexed
# The metric is passed explicitly: relying on Annoy's implicit default is
# deprecated, and the padding above is only meaningful for the angular metric.
t_member = AnnoyIndex(f_member, 'angular')

for i in range(norm_data.shape[0]):
    t_member.add_item(i, norm_data[i])

# NOTE(review): only `model.item_embeddings` are indexed; if the model's score
# also contains per-item biases, the neighbour ranking will differ from
# `model.predict` -- confirm whether the biases should be appended as well.
t_member.build(200)

True

In [81]:
# User embedding matrix of the fitted model; rows are addressed by user index.
user_vectors = model.user_embeddings

In [82]:
def sample_recommendation_annoy(model,data,user_ids,print_output=True,n_show=3,n_candidates=50):
    """Print known positives and Annoy-based recommendations for some users.

    Recommendations come from the notebook-level Annoy index ``t_member``,
    queried with the user's row of ``user_vectors`` padded with a trailing 0 so
    that it has the same length as the indexed item vectors. Item indices are
    translated to fund ids through ``idx_to_itemid`` and to display names
    through ``fundid_to_names``.

    Parameters
    ----------
    model : unused; kept so the signature mirrors ``sample_recommendation_original``.
    data : mapping whose ``'train'`` entry is a sparse user-by-item matrix.
        Any scipy sparse format is accepted; it is converted to CSR for row access.
    user_ids : iterable of int
        Row indices into the train matrix and into ``user_vectors``.
    print_output : bool
        Print the report when truthy.
    n_show : int
        How many known positives and how many recommendations to print per user.
    n_candidates : int
        How many neighbours to request from Annoy before keeping the first
        ``n_show``; at least ``n_show`` neighbours are always requested.

    Returns
    -------
    None
    """
    # Row slicing needs CSR; tocsr() is a no-op for matrices that already are CSR.
    train = data['train'].tocsr()
    n_neighbours = max(n_candidates, n_show)

    for user_id in user_ids:
        known_positive_idxs = train[user_id].indices[:n_show]
        known_positives_item_names = [
            fundid_to_names[idx_to_itemid[e]] for e in known_positive_idxs
        ]

        query = np.append(user_vectors[user_id], 0)
        neighbour_idxs = t_member.get_nns_by_vector(query, n_neighbours)
        top_items_ids = [idx_to_itemid[e] for e in neighbour_idxs[:n_show]]

        if print_output:
            print("User %s" % user_id)
            print("     Known positives:")

            for x in known_positives_item_names:
                print("        %s" % x)

            print("     Recommended:")

            for x in top_items_ids:
                print("        %s" % fundid_to_names[x])

In [50]:
# Show known positives and Annoy-based recommendations for user indices 0, 1 and 2.
sample_recommendation_annoy(model,data,user_ids=[0,1,2])

User 0
     Known positives:
        (百元基金)摩根美國複合收益債券基金-JPM-A股累計(美元)
        富達中國聚焦基金(年配)-配息帳戶-美元
        (百元基金)貝萊德歐洲價值型基金HEDGED A2(美元)
     Recommended:
        (百元基金)富蘭克林黃金基金(年配權)
        (百元基金)富蘭克林坦伯頓全球生技領航基金(美元)
        路博邁投資基金-NB新興市場股票基金T累積類股(美元)
User 1
     Known positives:
        柏瑞印度股票基金A(美元)
        (百元基金)貝萊德歐洲價值型基金HEDGED A2(美元)
        瑞銀(盧森堡)生化股票基金(美元)
     Recommended:
        (百元基金)富蘭克林黃金基金(年配權)
        路博邁投資基金-NB新興市場股票基金T累積類股(美元)
        第一金中國世紀基金
User 2
     Known positives:
        摩根日本(日圓)基金
        (百元基金)貝萊德世界能源基金(美元)
        (百元基金)貝萊德中國基金A2(美元)
     Recommended:
        (百元基金)富蘭克林坦伯頓全球生技領航基金(美元)
        (百元基金)施羅德環球能源A1累積(美元)
        (百元基金)富蘭克林黃金基金(年配權)


Recall (hit rate) at top-N: Annoy approximation versus exact scoring

In [83]:
def eval_recommendation_test(model,test,topn=10,annoy=True):
    """Hit rate at ``topn`` over the users that have held-out interactions.

    A user counts as a hit when at least one of their held-out items appears
    among their ``topn`` recommended items. Users without any stored entry in
    ``test`` are skipped.

    Parameters
    ----------
    model : object exposing ``predict(user_id, item_indices)``; only used when
        ``annoy`` is false.
    test : scipy sparse user-by-item matrix of held-out interactions.
    topn : int
        Number of recommendations considered per user.
    annoy : bool
        When true, recommendations come from the notebook-level Annoy index
        ``t_member`` queried with the zero-padded row of ``user_vectors``;
        otherwise every item is scored with ``model.predict`` and ranked exactly.

    Returns
    -------
    float
        hits / evaluated users, or 0.0 when no user has held-out items.
    """
    # CSR gives direct access to each user's column indices through indptr.
    test = test.tocsr()
    n_users, n_items = test.shape
    all_item_idxs = np.arange(n_items)

    hits = 0
    n_evaluated = 0
    for user_id in range(n_users):
        held_out = test.indices[test.indptr[user_id]:test.indptr[user_id + 1]]
        # Test emptiness by size: the truth value of an index array is ambiguous
        # for several items and is False for the single item index 0.
        if held_out.size == 0:
            continue

        if annoy:
            top_items_idxs = t_member.get_nns_by_vector(
                np.append(user_vectors[user_id], 0), topn)
        else:
            scores = model.predict(user_id, all_item_idxs)
            top_items_idxs = np.argsort(-scores)[:topn]

        # Compare by overlap size, not by np.any over the overlapping indices,
        # which would miss a hit on item index 0.
        if np.intersect1d(held_out, top_items_idxs).size > 0:
            hits += 1
        n_evaluated += 1

    if n_evaluated == 0:
        return 0.0
    return hits / n_evaluated

In [85]:
%time rat10 = eval_recommendation_test(model,test,annoy=True)
%time rat10_annoy = eval_recommendation_test(model,test,annoy=False)

Wall time: 4.82 s
Wall time: 7.08 s


In [89]:
# Report both rates as percentages. The labels assume `rat10` holds the exact
# (non-approximated) result and `rat10_annoy` the Annoy-based one.
print('recall:{:.2f}% at topn=10, without approximation '.format(rat10*100))
print('recall:{:.2f}% at topn=10, with approximation '.format(rat10_annoy*100))

recall:9.16% at topn=10, without approximation 
recall:19.34% at topn=10, with approximation 
