In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the product catalogue into the module-level `df` that the helper
# functions below read; the preview shows an `id` and a `description` column.
df = pd.read_csv("sample-data.csv")
df.head()

Unnamed: 0,id,description
0,1,Active classic boxers - There's a reason why o...
1,2,Active sport boxer briefs - Skinning up Glory ...
2,3,Active sport briefs - These superbreathable no...
3,4,"Alpine guide pants - Skin in, climb ice, switc..."
4,5,"Alpine wind jkt - On high ridges, steep ice an..."


In [3]:
# Description (title part) by id function
def item(item_id):
    """Return the title of the product whose `id` equals `item_id`.

    The title is the part of the description located before the first
    ' - ' separator; if the separator is absent the whole description is
    returned. The lookup is done against the module-level `df`.

    Raises
    ------
    IndexError
        If no row of `df` has the requested id.
    """
    matching_descriptions = df.loc[df['id'] == item_id, 'description']
    first_description = matching_descriptions.iloc[0]
    title, _separator, _rest = first_description.partition(' - ')
    return title

# Print the items most similar to a given item
def recommend(similarity_matrix, item_id, num):
    """Print the `num` items most similar to `item_id`.

    Parameters
    ----------
    similarity_matrix : 2-D array
        Pairwise similarity scores. Row/column i is taken to describe the
        i-th row (positional order) of the module-level `df`.
    item_id :
        Value looked up in `df['id']`.
    num : int
        Number of neighbours to print; values below 1 print only the header.

    Raises
    ------
    IndexError
        If `item_id` does not occur in `df['id']`.
    """
    # Work with the row *position*, because that is what indexes the matrix;
    # index labels only coincide with positions for a default RangeIndex.
    matches = np.flatnonzero((df['id'] == item_id).to_numpy())
    if matches.size == 0:
        raise IndexError("item_id " + repr(item_id) + " not found in df['id']")
    index = matches[0]

    scores = np.asarray(similarity_matrix[index])
    ranked = scores.argsort()[::-1]                      # best score first
    # Remove the query item explicitly instead of assuming it is ranked first:
    # with tied scores (e.g. duplicated descriptions) another row can sort
    # ahead of it, which used to drop a real neighbour and print the query.
    similar_indices = ranked[ranked != index][:max(num, 0)]

    print("Top " + str(num) + " items similar to " + item(item_id) + ":")
    print("--------------------------------------------------")
    for ind in similar_indices:
        print("Item: " + item(df['id'].iloc[ind]) + " [score:" + str(scores[ind]) + "]")

# TF-IDF

In [4]:
# Vectorize every description with TF-IDF over word 1- to 3-grams (English stop
# words removed), then compute the pairwise cosine similarity between all items.
# min_df=1 keeps every term, exactly as min_df=0 did, but is also accepted by
# scikit-learn releases that validate parameters and reject an integer min_df of 0.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(df['description'])

cosine_similarities_tfidf = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [5]:
# Print the 5 highest-scoring neighbours of item 14 under the TF-IDF similarity matrix.
recommend(cosine_similarities_tfidf, item_id=14, num=5)

Top 5 items similar to Better sweater jkt:
--------------------------------------------------
Item: Better sweater jkt [score:0.38209411902102336]
Item: Synch vest [score:0.12173888284287283]
Item: Araveto jkt [score:0.11882679419455687]
Item: Araveto 1/4 zip jkt [score:0.11628244300105817]
Item: Pique fleece 1/4 zip [score:0.109313884864437]


In [6]:
# Print the 5 highest-scoring neighbours of item 73 under the TF-IDF similarity matrix.
recommend(cosine_similarities_tfidf, item_id=73, num=5)

Top 5 items similar to Live simply guitar t-shirt:
--------------------------------------------------
Item: Live simply guitar t-shirt [score:0.8132167761824267]
Item: Live simply deer t-shirt [score:0.39437076691402356]
Item: Flying fish 2 t-shirt [score:0.36263779509999755]
Item: Trout silhouette t-shirt [score:0.3606039591845198]
Item: Live simply bug t-shirt [score:0.3453461123976988]


In [7]:
# Print the 5 highest-scoring neighbours of item 41 under the TF-IDF similarity matrix.
recommend(cosine_similarities_tfidf, item_id=41, num=5)

Top 5 items similar to Fish frenzy t-shirt:
--------------------------------------------------
Item: Peregrine t-shirt [score:0.41103016268642906]
Item: Tarpon t-shirt [score:0.3464251532296172]
Item: Flying fish 2 t-shirt [score:0.34063889656554563]
Item: Trout head t-shirt [score:0.3371382209954106]
Item: Wind path t-shirt [score:0.3370361283516137]


# Word2Vec

In [8]:
import gensim.downloader
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
nltk.download('stopwords')
import re

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\chuna\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
# Tokenization text function
def tokenize_text(text):
    tokenizer = RegexpTokenizer(r"[\w-]+")                                         # Select individual words using regular expressions (include "-")
    html_pattern = re.compile('<.*?>')                                             # Remove html markup
    clean_text = tokenizer.tokenize(html_pattern.sub('', text.lower()))            # Lower case
    stops = set(stopwords.words("english"))                                        # Delete stop words
    sentence = [w for w in clean_text if not w in stops]
    return sentence

In [10]:
# Fill the matrix of text vectors over the corpus.
# The vector of one text is the L2-normalized average of the embeddings of its known words.
def create_embed_matrix(texts):
    """Embed every text as the unit-length mean of its word vectors.

    Parameters
    ----------
    texts : sequence of str
        Raw texts; each one is tokenized with `tokenize_text`.

    Returns
    -------
    numpy.ndarray of shape (len(texts), glove_vectors.vector_size)
        Row i is the normalized mean embedding of texts[i]. A text with no
        token present in the embedding vocabulary (or whose mean vector has
        zero length) is left as an all-zero row instead of NaN.

    Notes
    -----
    Reads the module-level `glove_vectors` model.
    """
    vector_size = glove_vectors.vector_size
    # key_to_index is a dict, so membership is O(1); `index_to_key` is a list
    # and made every lookup scan the whole vocabulary.
    vocabulary = glove_vectors.key_to_index
    embed_matrix = np.zeros((len(texts), vector_size))
    for ind, text in enumerate(texts):
        summed = np.zeros(vector_size)
        num_words = 0
        for word in tokenize_text(text):           # sum the vectors of the known words
            if word in vocabulary:
                summed += glove_vectors[word]
                num_words += 1
        if num_words == 0:
            continue                               # nothing to average: keep the zero row
        mean_vector = summed / num_words
        norm = np.sqrt(np.sum(mean_vector ** 2))
        if norm > 0:                               # avoid 0/0 when the vectors cancel out
            embed_matrix[ind] = mean_vector / norm
    return embed_matrix

In [11]:
# Use the pretrained embedding model 'glove-wiki-gigaword-300' as the word vectors.
# create_embed_matrix reads this module-level `glove_vectors` object, so this
# cell must run before that function is called.
glove_vectors = gensim.downloader.load('glove-wiki-gigaword-300')

In [12]:
# Fill the vector matrix: one normalized mean-embedding row per product description.
# (No similarity is computed here; this cell only builds the matrix.)
mean_word2vec_matrix=create_embed_matrix(df['description'].values)

In [13]:
# Pairwise cosine similarity between the text embeddings.
# `linear_kernel` was never imported in this notebook (NameError on a fresh
# kernel); `cosine_similarity` is already imported and, because the rows are
# L2-normalized by create_embed_matrix, yields the same scores up to rounding.
cosine_similarities_word2vec = cosine_similarity(mean_word2vec_matrix, mean_word2vec_matrix)

In [14]:
# Print the 5 highest-scoring neighbours of item 14 under the embedding-based similarity matrix.
recommend(cosine_similarities_word2vec, item_id=14, num=5)

Top 5 items similar to Better sweater jkt:
--------------------------------------------------
Item: Pique fleece 1/4 zip [score:0.931459435337409]
Item: Araveto jkt [score:0.9112686511082095]
Item: El cap jkt [score:0.9081624147587113]
Item: Synch vest [score:0.9065215591851343]
Item: Synch vest [score:0.9029550854715328]


In [15]:
# Print the 5 highest-scoring neighbours of item 73 under the embedding-based similarity matrix.
recommend(cosine_similarities_word2vec, item_id=73, num=5)

Top 5 items similar to Live simply guitar t-shirt:
--------------------------------------------------
Item: Live simply guitar t-shirt [score:0.9868952161194144]
Item: Planer t-shirt [score:0.9603014565745474]
Item: Trampoli t-shirt [score:0.9589097239833487]
Item: Baby live simply seal t-shirt [score:0.9560173969737962]
Item: Birdwalk t-shirt [score:0.9546466944634275]


In [16]:
# Print the 5 highest-scoring neighbours of item 41 under the embedding-based similarity matrix.
recommend(cosine_similarities_word2vec, item_id=41, num=5)

Top 5 items similar to Fish frenzy t-shirt:
--------------------------------------------------
Item: City by the sea t-shirt [score:0.9556637081416329]
Item: Tarpon t-shirt [score:0.9551927830984399]
Item: Trout head t-shirt [score:0.9528302470727666]
Item: Squid t-shirt [score:0.9511103522027496]
Item: Flying fish 2 t-shirt [score:0.9493322125049987]
