In [63]:
# import pandas
import pandas as pd
import numpy as np
# import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# import cosine_similarity
from sklearn.metrics.pairwise import cosine_similarity

In [114]:
# Load the sample product-description data straight from GitHub.
DATA_URL = "https://raw.githubusercontent.com/nikitaa30/Content-based-Recommender-System/master/sample-data.csv"
df = pd.read_csv(DATA_URL)

In [115]:
df

Unnamed: 0,id,description
0,1,Active classic boxers - There's a reason why o...
1,2,Active sport boxer briefs - Skinning up Glory ...
2,3,Active sport briefs - These superbreathable no...
3,4,"Alpine guide pants - Skin in, climb ice, switc..."
4,5,"Alpine wind jkt - On high ridges, steep ice an..."
...,...,...
495,496,Cap 2 bottoms - Cut loose from the maddening c...
496,497,Cap 2 crew - This crew takes the edge off fick...
497,498,All-time shell - No need to use that morning T...
498,499,All-wear cargo shorts - All-Wear Cargo Shorts ...


In [167]:
# Each raw description looks like "<item name> - <description text>".
# Split it into an `item` column (the name) and a `description` column (the rest).
ITEM_PREFIX_RE = r"^(.+?) - "

# expand=False returns a Series for the single capture group, which is what a
# single-column assignment expects.
df['item'] = df['description'].str.extract(ITEM_PREFIX_RE, expand=False)

# regex=True must be passed explicitly: the pattern is a regular expression, and
# pandas >= 2.0 defaults to literal matching, which would silently leave the
# item-name prefix inside every description.
df['description'] = df['description'].str.replace(ITEM_PREFIX_RE, "", regex=True)

  df['description'] = df['description'].str.replace(r"^(.+?) - ", "")


In [169]:
import string

# Characters that get stripped from the descriptions (ASCII punctuation).
puncts = string.punctuation


def del_punct(text):
    """Return ``text`` with every character contained in ``puncts`` removed."""
    kept_chars = filter(lambda ch: ch not in puncts, text)
    return "".join(kept_chars)

In [170]:
# Strip punctuation from every description, one string at a time.
df['description'] = df['description'].map(del_punct)

In [172]:
# Lower-case the descriptions so that differently-cased spellings of a word
# are the same string in the text that is vectorised later.
df['description'] = df['description'].str.lower()

In [173]:
df.head()

Unnamed: 0,id,description,item
0,1,theres a reason why our boxers are a cult favo...,Active classic boxers
1,2,skinning up glory requires enough movement wit...,Active sport boxer briefs
2,3,these superbreathable nofly briefs are the min...,Active sport briefs
3,4,skin in climb ice switch to rock traverse a kn...,Alpine guide pants
4,5,on high ridges steep ice and anything alpine t...,Alpine wind jkt


In [174]:
df['item'].duplicated().sum()

0

In [175]:
df.loc[df['item'].duplicated()]

Unnamed: 0,id,description,item


In [176]:
# Inspect the description stored for one specific item name.
is_compound_cargo = df['item'] == 'Compound cargo pants'
print(df.loc[is_compound_cargo, 'description'])

26    long  the ultimate doeverything pants built to...
Name: description, dtype: object


In [177]:
# Keep only the first row for each item name, modifying df in place.
# The index is not reset here, so wherever rows are removed the remaining
# index labels no longer equal the rows' positions (label != .iloc position).
df.drop_duplicates(subset='item', keep='first', inplace=True)

In [178]:
df.loc[df['item'].duplicated()]

Unnamed: 0,id,description,item


* explore DataFrame

We will use TF-IDF to find similar items based on their descriptions.
* instantiate TF-IDF

In [179]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Word-level TF-IDF configuration for the product descriptions.
vectorizer = TfidfVectorizer(
    analyzer='word',       # tokenise into words
    use_idf=True,          # apply inverse-document-frequency weighting
    min_df=2,              # ignore terms found in fewer than 2 descriptions
    max_df=0.90,           # ignore terms found in more than 90% of descriptions
    stop_words='english',  # drop the built-in English stop words
)

* fit and transform 'description' column with TFIDF

In [180]:
# Fit the vectorizer on the cleaned descriptions and encode them in one step;
# rows of desc_matrix follow the row order of df.
desc_matrix = vectorizer.fit_transform(df['description'])

* calculate the cosine similarity of each item with every other item in the dataset

In [181]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise similarity of every description with every other description.
# The result is a square numpy array addressed by row *position*:
# cosine_sim[i, j] compares the descriptions at positions i and j of df
# (shape (417, 417) in the recorded run).
cosine_sim = cosine_similarity(desc_matrix, desc_matrix)

In [182]:
cosine_sim.shape

(417, 417)

In [183]:
type(cosine_sim)

numpy.ndarray

In [184]:
cosine_sim[3,0:10]

array([0.09806029, 0.0639605 , 0.05596036, 1.        , 0.1690285 ,
       0.1531658 , 0.15439078, 0.        , 0.06214684, 0.05074309])

* for each item i, sort all items by their similarity to i and store the sorted values in the dictionary `results`, keyed by item id

```
results = {
    "1": [5,7,9...],
    "2": [45,2,3...]
}
```

In [185]:
# For every row of cosine_sim, the column positions that would sort that row.
# ndarray.argsort() sorts ascending, so each row starts with the LEAST similar
# positions and ends with the most similar ones.
# NOTE(review): if "most similar first" is what is wanted, each row needs to be
# reversed - confirm the intended order.
cosineSim_arg_list = cosine_sim.argsort()

In [186]:
# The `id` column as a plain Python list, in df row order.
id_list = df['id'].values.tolist()

In [187]:
# Map each product id to the argsort row computed for that product.
# NOTE(review): the keys are values of the `id` column, but the arrays contain
# row positions of cosine_sim (0-based), not ids - translate positions to ids
# before treating the values as product ids; confirm the intended format.
results = dict(zip(id_list, cosineSim_arg_list))

In [191]:
results[1]

array([203, 222, 371,   7, 202,   9, 129, 263, 201, 361, 137, 241, 235,
        16, 240, 204, 223,  47, 234,  65, 196, 252, 233, 225,  82, 372,
        48, 363, 308,  40, 360, 288,  80,  67, 299, 411, 312,  46,  29,
       341, 221, 323,  83, 108, 219, 130, 198, 278,  96,  37, 191,  31,
       184, 385, 285, 311, 407, 390, 110, 152, 150, 313,  71, 145, 132,
       366,  33, 302,  36, 242, 253,  12, 208, 307, 128,  52, 148, 271,
       131, 200, 384, 343, 374, 328,  75, 125, 214, 179, 269, 231, 104,
       367, 138, 224, 232, 100,  51, 155, 368, 303, 379,  98, 373, 280,
       126, 309, 401, 400, 286,  49, 282, 207,  99, 265, 258, 396,  26,
       103, 395, 270, 238, 227, 262, 329, 254, 144, 136, 124, 216, 116,
       210, 300, 333,  76, 146,   6,  54, 117,  93, 330, 220, 317, 147,
        57, 118, 236, 306,  72, 281, 391,  90, 247,  81, 291, 229,  39,
       377,  13, 410, 369, 370,  63,  56,  61,  62, 111, 178, 324,  91,
       339, 245, 264,  74, 342,  88, 158, 218, 268,  92, 314, 12

* create function `recommender` that will recommend similar products
    * function must have two input params: **item_id** and **count** of similar products 

In [192]:
id_list[0:10]

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [195]:
# Lookup table: item name -> row *position* in df, which is also the row/column
# of that item in cosine_sim.
# df.index must not be used for the values: duplicates were dropped without
# resetting the index, so index labels run past the 417 rows that cosine_sim
# has and would select the wrong row (or fall outside the matrix).
indices = pd.Series(range(len(df)), index=df['item'])

# Keep one entry per item name so that indices[name] is always a scalar.
indices = indices[~indices.index.duplicated(keep='first')]

In [208]:
def recommender(item_id, count, sim_matrix=None, items=None):
    """Return the ``count`` items most similar to the requested item.

    Parameters
    ----------
    item_id : int or str
        An integer is interpreted as the 0-based row position of the item
        (the position used for the rows of the similarity matrix; negative
        values count from the end). A string is interpreted as an item name
        and matched against ``items``.
    count : int
        Number of recommendations wanted. Values below 1 give an empty result.
    sim_matrix : 2-D array-like, optional
        Pairwise similarity scores, indexed by row position. Defaults to the
        notebook-level ``cosine_sim``.
    items : pandas.Series, optional
        Item names in the same row order as ``sim_matrix``. Defaults to
        ``df['item']``.

    Returns
    -------
    pandas.Series
        Names of the recommended items, most similar first. The query item
        itself is never part of the result.

    Raises
    ------
    KeyError
        If ``item_id`` is a string that is not a known item name.
    IndexError
        If ``item_id`` is an integer outside the range of available rows.
    """
    if sim_matrix is None:
        sim_matrix = cosine_sim
    if items is None:
        items = df['item']

    n_items = len(items)

    # Resolve the query to a row position explicitly instead of relying on
    # pandas' deprecated positional fallback for integer keys on a
    # string-indexed Series.
    if isinstance(item_id, str):
        names = items.tolist()
        if item_id not in names:
            raise KeyError(f"unknown item name: {item_id!r}")
        idx = names.index(item_id)
    else:
        idx = int(item_id)
        if idx < 0:
            idx += n_items
        if not 0 <= idx < n_items:
            raise IndexError(f"item position {item_id!r} is out of range for {n_items} items")

    scores = sim_matrix[idx]

    # Rank all positions by descending similarity. sorted() is stable, so items
    # with equal scores keep their original relative order.
    ranked = sorted(range(n_items), key=lambda pos: scores[pos], reverse=True)

    # Remove the query item by position rather than assuming it sorted first:
    # another item with an identical description ties at the top score and may
    # come before it.
    ranked = [pos for pos in ranked if pos != idx]

    return items.iloc[ranked[:max(count, 0)]]

* show the top 5 most similar items for the item with item_id = 11 (row position 10 in `df`)

In [211]:
df.iloc[10]

id                                                            11
description    soft stretchy polyester fabric is fast wicking...
item                                           Baby sunshade top
Name: 10, dtype: object

In [212]:
recommender(10,5)

418              Sunshade hoody
410                    Sun mask
464    Baby baggies apron dress
387                Lw sun hoody
142                Traverse jkt
Name: item, dtype: object