 Import Library

In [89]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

## Convert the pictured items below into text descriptions of their characteristics

In [118]:
# Toy catalogue: one (id, free-text description) record per clothing item.
ds = pd.DataFrame(
    [
        (1, 'blue short-sleeve shirts for man'),
        (2, 'black long-sleeve shirts for man'),
        (3, 'black short-sleeve shirts with dotted for man'),
        (4, 'blue t-shirts for woman'),
        (5, 'long-sleeve floral shirts for woman'),
    ],
    columns=['id', 'description'],
)

ds

Unnamed: 0,id,description
0,1,blue short-sleeve shirts for man
1,2,black long-sleeve shirts for man
2,3,black short-sleeve shirts with dotted for man
3,4,blue t-shirts for woman
4,5,long-sleeve floral shirts for woman


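- TF-IDF Vectorizer  
The TF-IDF (term frequency × inverse document frequency) algorithm weighs each keyword in a document: a keyword's importance increases with the number of times it appears in that document and decreases with the number of documents in the collection that contain it.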

## Encoding the Data 

## Method 1

In [121]:
# Learn a single-word vocabulary (English stop words such as "for" and "with"
# are dropped) and encode every description as one row of TF-IDF weights.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(ds['description'])
# get_feature_names() has been removed from scikit-learn; its replacement is
# get_feature_names_out(). .tolist() keeps the result a plain list of strings.
tfidf.get_feature_names_out().tolist()

['black',
 'blue',
 'dotted',
 'floral',
 'long',
 'man',
 'shirts',
 'short',
 'sleeve',
 'woman']

In [115]:
# Print the sparse TF-IDF matrix: one "(row, column)  weight" line per stored entry.
print(tfidf_matrix)

  (0, 5)	0.44209452928461
  (0, 6)	0.31455389873014655
  (0, 8)	0.37190385524223485
  (0, 7)	0.5325860467690863
  (0, 1)	0.5325860467690863
  (1, 4)	0.5325860467690863
  (1, 0)	0.5325860467690863
  (1, 5)	0.44209452928461
  (1, 6)	0.31455389873014655
  (1, 8)	0.37190385524223485
  (2, 2)	0.5509158478195675
  (2, 0)	0.4444754371725732
  (2, 5)	0.3689547639624452
  (2, 6)	0.262514353315451
  (2, 8)	0.31037637889268227
  (2, 7)	0.4444754371725732
  (3, 9)	0.652490884512534
  (3, 6)	0.3853716274664007
  (3, 1)	0.652490884512534
  (4, 3)	0.5927348611982239
  (4, 9)	0.47821475385255313
  (4, 4)	0.47821475385255313
  (4, 6)	0.2824413372583483
  (4, 8)	0.3339364815702568


## Method 2

In [125]:
# Build unigram, bigram and trigram features from the descriptions.
# min_df=1 keeps every term that occurs in at least one description. The
# previous integer min_df=0 selected exactly the same vocabulary but is
# rejected by the parameter validation in recent scikit-learn releases.
tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=1, stop_words='english')
# Replace missing descriptions with empty strings so the vectorizer only receives text.
ds['description'] = ds['description'].fillna('')
tfidf_matrix = tfidf.fit_transform(ds['description'])

# (number of descriptions, number of n-gram features)
tfidf_matrix.shape

(5, 35)

In [110]:
# List the learned vocabulary in column order. get_feature_names() has been
# removed from scikit-learn; get_feature_names_out() is its replacement, and
# .tolist() keeps the result a plain list of strings.
tfidf.get_feature_names_out().tolist()

['black',
 'black long',
 'black long sleeve',
 'black short',
 'black short sleeve',
 'blue',
 'blue shirts',
 'blue shirts woman',
 'blue short',
 'blue short sleeve',
 'dotted',
 'dotted man',
 'floral',
 'floral shirts',
 'floral shirts woman',
 'long',
 'long sleeve',
 'long sleeve floral',
 'long sleeve shirts',
 'man',
 'shirts',
 'shirts dotted',
 'shirts dotted man',
 'shirts man',
 'shirts woman',
 'short',
 'short sleeve',
 'short sleeve shirts',
 'sleeve',
 'sleeve floral',
 'sleeve floral shirts',
 'sleeve shirts',
 'sleeve shirts dotted',
 'sleeve shirts man',
 'woman']

In [112]:
# TF-IDF weight stored at row 0 (the first description), column 5 of the matrix.
tfidf_matrix[0,5]

0.2976514740312114

In [129]:
# Show the sparse matrix summary (shape, dtype, number of stored entries, storage format).
tfidf_matrix[:]

<5x35 sparse matrix of type '<class 'numpy.float64'>'
	with 57 stored elements in Compressed Sparse Row format>

In [126]:
# Print the sparse TF-IDF matrix: one "(row, column)  weight" line per stored entry.
print(tfidf_matrix)

  (0, 33)	0.2976514740312114
  (0, 27)	0.2976514740312114
  (0, 9)	0.3689313299600426
  (0, 23)	0.2976514740312114
  (0, 31)	0.24707761140380072
  (0, 26)	0.2976514740312114
  (0, 8)	0.3689313299600426
  (0, 19)	0.24707761140380072
  (0, 20)	0.17579775547496954
  (0, 28)	0.20784947593404896
  (0, 25)	0.2976514740312114
  (0, 5)	0.2976514740312114
  (1, 18)	0.36046710176591584
  (1, 2)	0.36046710176591584
  (1, 16)	0.29082258801930444
  (1, 1)	0.36046710176591584
  (1, 15)	0.29082258801930444
  (1, 0)	0.29082258801930444
  (1, 33)	0.29082258801930444
  (1, 23)	0.29082258801930444
  (1, 31)	0.24140901913540203
  (1, 19)	0.24140901913540203
  (1, 20)	0.17176450538879065
  (1, 28)	0.20308087741321856
  (2, 22)	0.300894473312317
  :	:
  (2, 27)	0.24275976648271158
  (2, 31)	0.20151253556770002
  (2, 26)	0.24275976648271158
  (2, 19)	0.20151253556770002
  (2, 20)	0.1433778287380946
  (2, 28)	0.16951869768336103
  (2, 25)	0.24275976648271158
  (3, 7)	0.48912736236523036
  (3, 24)	0.3946248761

## Calculation of cosine similarity

In [133]:
# Pairwise similarity between every pair of descriptions.
# TfidfVectorizer L2-normalises each row by default (norm='l2'), so the plain
# dot product computed by linear_kernel is already the cosine similarity.
# linear_kernel is imported with the other libraries in the first cell.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_sim

array([[1.        , 0.36482714, 0.37679175, 0.15843418, 0.06641151],
       [0.36482714, 1.        , 0.22694714, 0.04003346, 0.22003889],
       [0.37679175, 0.22694714, 1.        , 0.03341733, 0.05416416],
       [0.15843418, 0.04003346, 0.03341733, 1.        , 0.24724761],
       [0.06641151, 0.22003889, 0.05416416, 0.24724761, 1.        ]])

## Use cosine similarity for the recommendation

In [134]:
# Upper bound on the neighbours kept per item. The earlier slice-based version
# (`[:-100:-1]` followed by dropping the first hit) also kept at most 98.
MAX_NEIGHBOURS = 98

# Maps each item id to a list of (rounded similarity, other item id) tuples,
# ordered from most to least similar.
results = {}
# iterrows() yields index labels, which need not be 0..n-1. enumerate() gives
# the row position, which is what lines up with the rows/columns of cosine_sim.
for pos, (_, row) in enumerate(ds.iterrows()):
    ranked = cosine_sim[pos].argsort()[::-1]  # positions, most similar first
    # Remove the item itself by position instead of assuming it sorts first:
    # an identical description also scores 1.0 and could take the top slot.
    neighbours = ranked[ranked != pos][:MAX_NEIGHBOURS]
    results[row['id']] = [
        (round(cosine_sim[pos][i], 2), ds['id'].iloc[i]) for i in neighbours
    ]

In [135]:
# Display the recommendations: item id -> [(similarity rounded to 2 decimals, other item id), ...].
results

{1: [(0.38, 3), (0.36, 2), (0.16, 4), (0.07, 5)],
 2: [(0.36, 1), (0.23, 3), (0.22, 5), (0.04, 4)],
 3: [(0.38, 1), (0.23, 2), (0.05, 5), (0.03, 4)],
 4: [(0.25, 5), (0.16, 1), (0.04, 2), (0.03, 3)],
 5: [(0.25, 4), (0.22, 2), (0.07, 1), (0.05, 3)]}