In [85]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import numpy as np
import pandas as pd
import csv

In [71]:
# Load the product data; later cells read its `id` and `description` columns.
data = pd.read_csv('sample-data.csv')

In [72]:
# Preview the first rows of the loaded data.
data.head()

Unnamed: 0,id,description
0,1,Active classic boxers - There's a reason why o...
1,2,Active sport boxer briefs - Skinning up Glory ...
2,3,Active sport briefs - These superbreathable no...
3,4,"Alpine guide pants - Skin in, climb ice, switc..."
4,5,"Alpine wind jkt - On high ridges, steep ice an..."


In [73]:
# Lower-case every description and split it into NLTK word tokens;
# the token lists are stored in a new `tokens` column.
data['tokens'] = [word_tokenize(text.lower()) for text in data['description']]

In [74]:
# Preview two rows to check the new `tokens` column.
data.head(2)

Unnamed: 0,id,description,tokens
0,1,Active classic boxers - There's a reason why o...,"[active, classic, boxers, -, there, 's, a, rea..."
1,2,Active sport boxer briefs - Skinning up Glory ...,"[active, sport, boxer, briefs, -, skinning, up..."


In [75]:
# TF-IDF: fit a vectorizer with default settings on the raw description text
# and turn every description into one row of the resulting matrix.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(data['description'])

In [76]:
# Word2Vec: train embeddings on the tokenised descriptions, then represent
# each description by the element-wise mean of its token vectors.
# NOTE(review): training uses 4 worker threads, so vectors may differ slightly
# between runs — confirm against the gensim reproducibility notes.
model_w2v = Word2Vec(
    sentences=data['tokens'],
    vector_size=50,
    window=5,
    min_count=1,  # minimum word frequency
    workers=4,
    epochs=10,
)
word2vec_matrix = np.array([
    np.mean([model_w2v.wv[word] for word in doc_tokens], axis=0)
    for doc_tokens in data['tokens']
])

In [77]:
# Pairwise cosine similarity between all products, computed separately for the
# TF-IDF and the Word2Vec representation. These are similarities, not
# distances: a higher value means the two descriptions are more alike.
cosine_sim_tfidf = cosine_similarity(tfidf_matrix)
cosine_sim_w2v = cosine_similarity(word2vec_matrix)

In [78]:
# Zero the diagonal of both similarity matrices (in place) so that the product
# being compared never shows up in its own list of similar products.

np.fill_diagonal(cosine_sim_tfidf, 0)
np.fill_diagonal(cosine_sim_w2v, 0)

In [79]:
# Similarity cut-off: a pair counts as "similar" only when its score is
# strictly greater than this value.
threshold = 0.7
# Boolean matrices: entry [i, j] is True when product j passes the cut-off for product i.
mask_tfidf = cosine_sim_tfidf > threshold
mask_w2v = cosine_sim_w2v > threshold

In [80]:
# For each product id, collect the ids selected by its row of the boolean mask
# (one mapping per representation).
similar_items_tfidf = {
    item_id: data['id'][row_mask].tolist()
    for item_id, row_mask in zip(data['id'], mask_tfidf)
}
similar_items_w2v = {
    item_id: data['id'][row_mask].tolist()
    for item_id, row_mask in zip(data['id'], mask_w2v)
}

In [81]:
# Show the thresholded TF-IDF matches; products with nothing above the
# threshold map to an empty list.
print("–°–ø–∏—Å–æ–∫ –ø–æ—Ö–æ–∂–∏—Ö —Ç–æ–≤–∞—Ä–æ–≤ –Ω–∞ –æ—Å–Ω–æ–≤–µ TF-IDF:", similar_items_tfidf)

–°–ø–∏—Å–æ–∫ –ø–æ—Ö–æ–∂–∏—Ö —Ç–æ–≤–∞—Ä–æ–≤ –Ω–∞ –æ—Å–Ω–æ–≤–µ TF-IDF: {1: [], 2: [], 3: [], 4: [159], 5: [308], 6: [], 7: [], 8: [220], 9: [], 10: [], 11: [], 12: [], 13: [], 14: [], 15: [16], 16: [15], 17: [], 18: [171], 19: [494], 20: [21, 172, 340, 487, 488], 21: [20, 172, 487, 488], 22: [23, 174, 359, 360, 497], 23: [22, 175, 359, 360, 497], 24: [441, 443], 25: [176], 26: [], 27: [28, 452, 453], 28: [27, 452, 453], 29: [454], 30: [], 31: [], 32: [462, 463], 33: [], 34: [], 35: [179, 282, 473], 36: [], 37: [481, 482], 38: [], 39: [], 40: [], 41: [], 42: [421], 43: [], 44: [], 45: [], 46: [409], 47: [], 48: [69, 238, 319, 396], 49: [134], 50: [439], 51: [], 52: [444], 53: [], 54: [], 55: [], 56: [], 57: [], 58: [63, 64, 65, 432, 433], 59: [], 60: [], 61: [], 62: [], 63: [58, 64, 65, 432, 433], 64: [58, 63, 65, 432, 433], 65: [58, 63, 64, 432, 433], 66: [], 67: [], 68: [], 69: [48, 238, 319, 332, 371, 396, 490], 70: [], 71: [], 72: [334, 470], 73: [335], 74: [], 75: [77, 361], 76: [125

Для каких-то товаров не нашлось похожих, поэтому там пустые результаты.
Сделаем второй вариант сопоставления, в котором заполним пустые результаты самым близким по значению вариантом, даже если он не превышает установленный порог.

In [84]:
def get_most_similar(cosine_sim, ids, threshold=0.7):
    """Map every item id to its similar items, falling back to the nearest one.

    For row ``i`` of ``cosine_sim`` the result lists the ids of all *other*
    items whose similarity is strictly greater than ``threshold``.  When no
    item passes the threshold, the single most similar other item is used
    instead, so a list is only empty when there is no other item at all.

    Args:
        cosine_sim: Square array-like of pairwise similarities; row/column
            ``i`` belongs to the ``i``-th entry of ``ids``. It is not modified.
        ids: Item identifiers (list, array or pandas Series). They are matched
            to matrix rows by position, so a Series with a non-default index
            is handled correctly.
        threshold: Similarity cut-off. The default of 0.7 equals the
            notebook-level ``threshold``; pass a value explicitly to use a
            different cut-off.

    Returns:
        dict mapping each id to a list of similar ids (plain Python scalars),
        in the order they appear in ``ids``.

    Raises:
        ValueError: If ``cosine_sim`` is not an ``n x n`` matrix for ``n`` ids.
    """
    sim = np.asarray(cosine_sim)
    # Work on a plain Python list so lookups are positional (never label-based)
    # and the returned ids are native Python scalars.
    id_list = np.asarray(ids).tolist()
    n_items = len(id_list)
    if sim.shape != (n_items, n_items):
        raise ValueError(
            f"cosine_sim must have shape ({n_items}, {n_items}), got {sim.shape}"
        )

    similar_items = {}
    for pos, item_id in enumerate(id_list):
        # Exclude the item itself explicitly instead of relying on the caller
        # having zeroed the diagonal.
        others = np.ones(n_items, dtype=bool)
        others[pos] = False
        row = sim[pos]

        neighbours = [id_list[j] for j in np.flatnonzero((row > threshold) & others)]
        if not neighbours and others.any():
            # Nothing passed the threshold: take the best-scoring other item,
            # even when every score is zero or negative.
            best = int(np.argmax(np.where(others, row, -np.inf)))
            neighbours = [id_list[best]]
        similar_items[item_id] = neighbours
    return similar_items

# Build the gap-filled TF-IDF mapping (every product gets at least its nearest
# neighbour) and print it for inspection.
similar_items_tfidf_filled = get_most_similar(cosine_sim_tfidf, data['id'])
print(similar_items_tfidf_filled)

{1: [19], 2: [3], 3: [2], 4: [159], 5: [308], 6: [438], 7: [354], 8: [220], 9: [417], 10: [425], 11: [419], 12: [402], 13: [135], 14: [166], 15: [16], 16: [15], 17: [13], 18: [171], 19: [494], 20: [21, 172, 340, 487, 488], 21: [20, 172, 487, 488], 22: [23, 174, 359, 360, 497], 23: [22, 175, 359, 360, 497], 24: [441, 443], 25: [176], 26: [491], 27: [28, 452, 453], 28: [27, 452, 453], 29: [454], 30: [92], 31: [436], 32: [462, 463], 33: [51], 34: [462], 35: [179, 282, 473], 36: [491], 37: [481, 482], 38: [279], 39: [38], 40: [357], 41: [491], 42: [421], 43: [427], 44: [427], 45: [46], 46: [409], 47: [249], 48: [69, 238, 319, 396], 49: [134], 50: [439], 51: [33], 52: [444], 53: [342], 54: [368], 55: [472], 56: [397], 57: [349], 58: [63, 64, 65, 432, 433], 59: [16], 60: [20], 61: [413], 62: [180], 63: [58, 64, 65, 432, 433], 64: [58, 63, 65, 432, 433], 65: [58, 63, 64, 432, 433], 66: [475], 67: [382], 68: [98], 69: [48, 238, 319, 332, 371, 396, 490], 70: [92], 71: [72], 72: [334, 470], 73: 

In [None]:
def save_to_csv(similar_items, filename):
    """Write an id -> similar-ids mapping to a two-column CSV file.

    The file gets a header row ``id,similar_ids`` followed by one row per
    mapping entry; the similar ids are joined with commas into a single field
    (quoted by the csv writer whenever it holds more than one id).

    Args:
        similar_items: Mapping from an item id to an iterable of similar ids.
        filename: Path of the CSV file to create or overwrite.
    """
    # Explicit UTF-8 keeps the output identical on every platform and avoids
    # encode errors for non-ASCII ids; newline='' lets csv control line endings.
    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['id', 'similar_ids'])
        writer.writerows(
            (item_id, ','.join(map(str, similar_ids)))
            for item_id, similar_ids in similar_items.items()
        )

In [88]:
# Save the results to CSV files: the thresholded TF-IDF mapping and its
# gap-filled variant.
save_to_csv(similar_items_tfidf, 'similar_items_tfidf.csv')
save_to_csv(similar_items_tfidf_filled, 'similar_items_tfidf_filled.csv')

## результат в w2v получился довольно странный 👇

In [None]:
# Print the thresholded Word2Vec matches for inspection.
print(similar_items_w2v)