This notebook uses the Annoy package to run approximate nearest-neighbor search over item vectors instead of computing every pairwise similarity. The experiments below are run on our fund datasets.

In [32]:
import annoy 
import lightfm
import numpy as np 
import pandas as pd
import pickle


In [2]:
# Load the pickled dataset bundle; later cells read keys such as 'train',
# 'test' and the id/index lookup tables from it.
# NOTE(review): pickle.load can execute arbitrary code -- only open bundles
# that come from a trusted source.
with open('./funds/sp_funds_datasets.pickle', 'rb') as bundle_file:
    data = pickle.load(bundle_file)

In [41]:
# Build a fund-code -> fund-name lookup from the cp950-encoded CSV.
fundid_names_df = pd.read_csv('./funds/fundid_to_name.csv', encoding='cp950')

# One dict entry per CSV row; if a fund code repeats, the last row wins.
fundid_to_names = {
    record['基金代碼']: record['基金中文名稱']
    for record in fundid_names_df.to_dict('records')
}

In [3]:
import random
from annoy import AnnoyIndex

In [33]:
# Pull the interaction matrices and the id <-> index lookups out of the
# dataset bundle into notebook-level names.
(train, test,
 idx_to_itemid, itemid_to_idx,
 userid_to_idx, idx_to_userid) = (
    data[key]
    for key in ('train', 'test',
                'idx_to_itemid', 'itemid_to_idx',
                'userid_to_idx', 'idx_to_userid')
)

In [6]:
# Preview the dense vector for item 0: one entry per row of `train`.
train[:, 0].toarray().ravel()

array([1, 0, 0, ..., 0, 0, 0], dtype=int32)

In [37]:
# Index every item by its column of the interaction matrix, so items that
# share the same set of interacting users end up close together.
n_users, n_items = train.shape

# Each item vector has one component per user, hence dimension n_users.
# With the Hamming metric Annoy compares the vectors as binary strings.
tn = AnnoyIndex(n_users, metric="hamming")

# Convert to column-compressed storage once. If `train` is row-compressed
# (the `.indices` row access elsewhere in this notebook suggests CSR -- confirm),
# slicing one column at a time from it rescans the matrix on every iteration.
train_by_item = train.tocsc()

for idx in range(n_items):
    # .toarray() is the documented dense conversion; the `.A` shorthand only
    # exists on the legacy sparse-matrix classes.
    v = train_by_item[:, idx].toarray().ravel()
    tn.add_item(idx, v)
    if idx % 1000 == 0:
        print('idx:{},\tcomplete'.format(idx))

idx:0,	complete
idx:1000,	complete
idx:2000,	complete


In [38]:
# Build the search forest with 10 trees. Per the Annoy documentation, no more
# items can be added to an index once build() has been called.
print('build index,unchangeable.')
tn.build(10)

build index,unchangeable.


True

In [39]:
# Indices of the 10 approximate nearest neighbours of item index 1.
tn.get_nns_by_item(1,10)

[331, 468, 1646, 273, 361, 397, 153, 175, 95, 881]

In [73]:
def get_fundids_names(fundidxs, originidx, item_lookup=None, name_lookup=None):
    """Print the name of a query fund followed by the names of its neighbours.

    Parameters
    ----------
    fundidxs : iterable
        Item indices of the neighbouring funds, e.g. the list returned by
        ``AnnoyIndex.get_nns_by_item``.
    originidx
        Item index of the query fund.
    item_lookup : optional
        Lookup from item index to fund id. Defaults to the notebook-level
        ``idx_to_itemid``.
    name_lookup : optional
        Lookup from fund id to display name. Defaults to the notebook-level
        ``fundid_to_names``.

    Returns
    -------
    None
        The names are printed, not returned.

    Raises
    ------
    Whatever the lookups raise for a missing index or fund id (``KeyError``
    for dict lookups) is propagated unchanged.
    """
    # Fall back to the notebook globals only when no explicit lookup is given,
    # so the function can also be used without that hidden state.
    if item_lookup is None:
        item_lookup = idx_to_itemid
    if name_lookup is None:
        name_lookup = fundid_to_names

    print('origin fund:{}'.format(name_lookup[item_lookup[originidx]]))

    print('\n==========nearest neighbors==========\n')
    for idx in fundidxs:
        print('fund:{}'.format(name_lookup[item_lookup[idx]]))

In [89]:
# Print the 10 approximate nearest neighbours of the fund at index `itemidx`.
# NOTE(review): `itemidx` is not assigned in any cell visible in this part of
# the notebook; it must be defined before this cell runs on a fresh kernel.
get_fundids_names(tn.get_nns_by_item(itemidx,10),itemidx)

origin fund:(百元基金)貝萊德世界能源基金(美元)


fund:(百元基金)瀚亞大中華股票基金A
fund:(百元基金)瀚亞全球新興市場債券基金 A
fund:安本環球-歐元高收益債券基金A2累積(基本貨幣避險-美元)
fund:柏瑞特別股息收益基金A(不配息)(美元)
fund:(百元基金)貝萊德全球股票入息基金A2(美元)
fund:聯博成熟市場多元收益基金AD月配級別美元
fund:永豐新興高收雙債組合基金-月配類型
fund:安本環球-歐元高收益債券基金A1月配息(基本貨幣避險-美元)
fund:貝萊德全球股票收益基金A6(穩定配息)(美元)
fund:摩根美國基金-JPM-A股累計(澳幣)


In [46]:
def sample_recommendation(data, uidxs, print_len=3, recommended_len=10,
                          item_lookup=None, name_lookup=None):
    """Print a few known positive items for each requested user.

    Parameters
    ----------
    data : mapping
        Dataset bundle; ``data['train']`` must support row indexing that
        yields an object with an ``indices`` attribute (as a SciPy CSR
        matrix does).
    uidxs : iterable
        User (row) indices to report on.
    print_len : int, optional
        Maximum number of known items printed per user.
    recommended_len : int, optional
        Currently unused by this function; kept so existing calls stay valid.
    item_lookup : optional
        Lookup from item index to fund id. Defaults to the notebook-level
        ``idx_to_itemid``.
    name_lookup : optional
        Lookup from fund id to display name. Defaults to the notebook-level
        ``fundid_to_names``.

    Returns
    -------
    None
        The report is printed, not returned.
    """
    # Fall back to the notebook globals only when no explicit lookup is given.
    if item_lookup is None:
        item_lookup = idx_to_itemid
    if name_lookup is None:
        name_lookup = fundid_to_names

    # Loop-invariant: fetch the training matrix once rather than per user.
    train_matrix = data['train']

    for uidx in uidxs:
        print('useridx:{}'.format(uidx))
        # Column indices stored for this user's row, i.e. the items the user
        # has an entry for in the training matrix.
        known_items_givenid = train_matrix[uidx].indices
        for item in known_items_givenid[:print_len]:
            print('\tknown positive items:{}'.format(name_lookup[item_lookup[item]]))

In [47]:
# Print up to 3 (the default print_len) known positive items for user
# indices 0 and 1.
sample_recommendation(data,uidxs=[0,1])

useridx:0
	known positive items:(百元基金)摩根美國複合收益債券基金-JPM-A股累計(美元)
	known positive items:富達中國聚焦基金(年配)-配息帳戶-美元
	known positive items:(百元基金)貝萊德歐洲價值型基金HEDGED A2(美元)
useridx:1
	known positive items:柏瑞印度股票基金A(美元)
	known positive items:(百元基金)貝萊德歐洲價值型基金HEDGED A2(美元)
	known positive items:瑞銀(盧森堡)生化股票基金(美元)


In [None]:
# NOTE(review): `item_vectors` is not defined in any cell visible here and this
# cell has no execution count. Presumably it is a dense 2-D array with one
# embedding row per item (e.g. taken from a LightFM model) -- confirm before
# running.

# Pad every row with one extra component chosen so that all rows end up with
# the same L2 norm (the largest norm found among the original rows).
norms = np.linalg.norm(item_vectors, axis=1)
max_norm = norms.max()
extra_dimension = np.sqrt(max_norm ** 2 - norms ** 2)
norm_data = np.concatenate(
    (item_vectors, extra_dimension.reshape(-1, 1)), axis=1)

f_member = norm_data.shape[1]
# Name the metric explicitly: Annoy deprecates relying on the implicit default
# (it emits a FutureWarning). 'angular' is that default, so results are unchanged.
t_member = AnnoyIndex(f_member, metric='angular')

for i, v in enumerate(norm_data):
    t_member.add_item(i, v)

t_member.build(10)  # 10 trees

In [23]:
from annoy import AnnoyIndex
import random

# Minimal Annoy demo: index 1000 random Gaussian vectors of dimension 40.
# The vectors come from `random` with no seed set in this cell, so the
# neighbours returned below can differ from run to run.
f = 40  # length of each item vector that will be indexed

# Name the metric explicitly: Annoy deprecates relying on the implicit default
# (it emits a FutureWarning). 'angular' is that default, so results are unchanged.
t = AnnoyIndex(f, metric='angular')
for i in range(1000):
    v = [random.gauss(0, 1) for _ in range(f)]
    t.add_item(i, v)

t.build(10) # 10 trees

True

In [24]:
# Indices of the 10 approximate nearest neighbours of item 1 in the demo index.
t.get_nns_by_item(1,10)

[1, 637, 603, 366, 167, 404, 959, 326, 78, 205]