In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Load the product catalogue; later cells read its 'id' and 'description' columns.
ds = pd.read_csv("sample-data.csv")

# Build TF-IDF features from the description text using word uni-, bi- and
# tri-grams, with scikit-learn's built-in English stop-word list.
# min_df=1 keeps every term that occurs in at least one document, i.e. no
# pruning at all. An integer min_df of 0 means the same thing but is rejected
# by the constructor-argument validation in scikit-learn >= 1.2.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(ds['description'])

In [4]:
# Show the sparse matrix summary (shape, dtype, stored-element count).
tfidf_matrix

<500x52262 sparse matrix of type '<class 'numpy.float64'>'
	with 148989 stored elements in Compressed Sparse Row format>

In [10]:
# Pairwise dot products between all rows of the TF-IDF matrix.
# TfidfVectorizer L2-normalises each row by default (norm is not overridden
# where `tf` is constructed), so these dot products are cosine similarities.
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

In [11]:
# Inspect the dense similarity matrix (NumPy abbreviates the display).
cosine_similarities

array([[1.        , 0.10110642, 0.06487353, ..., 0.06097409, 0.06546914,
        0.06955608],
       [0.10110642, 1.        , 0.4181664 , ..., 0.03550042, 0.06936414,
        0.06480538],
       [0.06487353, 0.4181664 , 1.        , ..., 0.03402428, 0.0455137 ,
        0.05038512],
       ...,
       [0.06097409, 0.03550042, 0.03402428, ..., 1.        , 0.04187121,
        0.04958298],
       [0.06546914, 0.06936414, 0.0455137 , ..., 0.04187121, 1.        ,
        0.36281626],
       [0.06955608, 0.06480538, 0.05038512, ..., 0.04958298, 0.36281626,
        1.        ]])

In [5]:
# Number of neighbours kept per product: the 99 best-scoring rows minus the
# product itself.
NUM_SIMILAR = 98

# Maps each product id to a list of (similarity score, product id) tuples,
# ordered from most to least similar.
results = {}

# Everything below is positional: row k of cosine_similarities belongs to the
# k-th row of ds, whatever the DataFrame's index labels are.
id_column = ds['id']

for idx, product_id in enumerate(id_column.tolist()):
    scores = cosine_similarities[idx]
    ranked = scores.argsort()[::-1]  # best match first

    # Remove the product itself by position instead of blindly dropping the
    # first hit: with tied scores (e.g. two identical descriptions) the product
    # is not guaranteed to be ranked first, and dropping the first hit would
    # then discard a real neighbour and recommend the product to itself.
    similar_indices = [i for i in ranked if i != idx][:NUM_SIMILAR]
    similar_items = [(scores[i], id_column.iloc[i]) for i in similar_indices]

    results[product_id] = similar_items

print('done!')

done!


In [12]:
# First five stored (score, id) neighbours for product id 1.
results[1][:5]

[(0.22037921472617467, 19),
 (0.16938950913002365, 494),
 (0.16769458065321555, 18),
 (0.1648552774562297, 172),
 (0.1481261546058637, 442)]

In [9]:
def item(id):
    """Return the short name of the product whose 'id' column equals ``id``.

    The short name is the part of the product's description that precedes the
    first ' - ' separator (the whole description if there is no separator).
    Raises IndexError when no row of ``ds`` has the requested id.
    """
    matching_descriptions = ds.loc[ds['id'] == id, 'description']
    description = matching_descriptions.iloc[0]
    short_name, _, _ = description.partition(' - ')
    return short_name

# Just reads the results out of the dictionary.
def recommend(item_id, num):
    """Print up to ``num`` stored neighbours of product ``item_id``.

    Nothing is computed here: the neighbours are looked up in the ``results``
    dictionary and printed one per line with their short name (via ``item``)
    and similarity score, after a two-line header.
    """
    print(f"Recommending {num!s} products similar to {item(item_id)}...")
    print("-------")
    for entry in results[item_id][:num]:
        score, rec_id = entry[0], entry[1]
        print(f"Recommended: {item(rec_id)} (score:{score!s})")

# Example: print the first five stored neighbours of product id 11.
recommend(item_id=11, num=5)

Recommending 5 products similar to Baby sunshade top...
-------
Recommended: Sunshade hoody (score:0.21330296021085024)
Recommended: Baby baggies apron dress (score:0.10975311296284812)
Recommended: Runshade t-shirt (score:0.09988151262780731)
Recommended: Runshade t-shirt (score:0.09530698241688207)
Recommended: Runshade top (score:0.08510550093018411)


In [13]:
from annoy import AnnoyIndex
import random

# Seed both random sources so the generated vectors, the index built from them
# and the neighbour list printed below are reproducible across kernel restarts.
SEED = 42
random.seed(SEED)

f = 40  # Length of item vector that will be indexed
t = AnnoyIndex(f, 'angular')
t.set_seed(SEED)  # only influences tree construction, so set it before adding items
for i in range(1000):
    v = [random.gauss(0, 1) for _ in range(f)]
    t.add_item(i, v)

t.build(10) # 10 trees
t.save('test.ann')

# ...

u = AnnoyIndex(f, 'angular')
u.load('test.ann') # super fast, will just mmap the file
# Asks for 1000 neighbours of item 0 out of the 1000 indexed items, so the
# query item itself is part of the returned list.
print(u.get_nns_by_item(0, 1000)) # will find the 1000 nearest neighbors

[0, 38, 732, 188, 935, 466, 170, 316, 674, 635, 297, 435, 417, 553, 494, 826, 156, 962, 550, 137, 115, 605, 574, 649, 385, 483, 496, 160, 507, 630, 731, 45, 795, 908, 406, 657, 186, 120, 200, 185, 767, 512, 524, 752, 280, 840, 871, 177, 80, 503, 468, 225, 815, 199, 101, 73, 231, 351, 457, 216, 491, 75, 152, 562, 161, 786, 169, 700, 824, 988, 370, 59, 888, 71, 843, 384, 306, 703, 511, 688, 641, 233, 713, 545, 480, 822, 438, 756, 768, 845, 98, 301, 727, 181, 775, 603, 348, 123, 863, 823, 694, 522, 538, 880, 467, 19, 973, 970, 521, 416, 708, 425, 285, 366, 41, 365, 920, 634, 364, 832, 492, 362, 526, 145, 77, 528, 983, 950, 87, 421, 608, 187, 895, 712, 273, 304, 388, 336, 710, 551, 198, 311, 331, 104, 834, 449, 760, 877, 31, 256, 982, 647, 127, 374, 733, 132, 400, 441, 576, 527, 505, 11, 625, 591, 403, 446, 645, 725, 851, 543, 269, 882, 21, 816, 915, 215, 748, 250, 204, 520, 853, 735, 745, 743, 46, 607, 479, 814, 930, 606, 85, 997, 211, 693, 387, 226, 736, 443, 899, 917, 398, 704, 423, 517

In [16]:
# `v` is the variable left over from the index-building loop, i.e. the last
# random vector that was added (item 999) -- assuming that cell was the most
# recent one to assign it.
v

[-0.8001011758851562,
 0.9742643296787099,
 -1.2154792731722794,
 1.9825728398048323,
 -0.6208463437171171,
 0.9375365505327832,
 -0.428954333556072,
 0.45412334212993993,
 -1.4723811814035772,
 -0.5126822511928186,
 -0.7850791915494103,
 0.23780897994617925,
 -0.20616142516991354,
 0.3725166776412706,
 0.45977692654578656,
 0.42127035275223246,
 -0.5313737246054463,
 0.034720724138456914,
 -2.832818163133335,
 -0.8865548869689042,
 -0.1823758948287257,
 -1.1939632183810371,
 -0.10804910717176042,
 0.0408013799735344,
 0.5566634234671195,
 -0.27930416235842526,
 -1.0007106102959211,
 1.1437234543797479,
 0.11712053796182555,
 -1.1817322236226169,
 0.035819030938356235,
 -1.0224365663175146,
 1.449321074127968,
 -0.5862636636897857,
 -0.41851861977651467,
 -0.28824978866704926,
 0.8423204614380185,
 -0.026736695199035376,
 -0.4776572755579751,
 -0.060155925060645335]

In [14]:
# List the attributes and methods available on the loaded AnnoyIndex object.
dir(u)

['__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'add_item',
 'build',
 'f',
 'get_distance',
 'get_item_vector',
 'get_n_items',
 'get_n_trees',
 'get_nns_by_item',
 'get_nns_by_vector',
 'load',
 'on_disk_build',
 'save',
 'set_seed',
 'unbuild',
 'unload',
 'verbose']