In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
df_products = pd.read_csv('products.csv')

In [4]:
# Separate the important columns
df_products = df_products[['product_id', 'product_name', 'category', 'subcategory']]

In [5]:
# Splitting both product_name and category into separated words (list)
df_products['product_name'] = df_products['product_name'].apply(lambda x: x.split())
df_products['category'] = df_products['category'].apply(lambda x: x.replace(",", "").split())

In [7]:
# Some adjustments to subcategory turn into a list
df_products['subcategory'] = df_products['subcategory'].fillna('Genérico')
df_products['subcategory'] = df_products['subcategory'].apply(lambda x: x.replace(",", "").split())

In [8]:
# Remove some symbols from subcategory
df_products['subcategory'] = df_products['subcategory'].apply(lambda x: [i for i in x if i.lower() != "/"])
df_products['subcategory'] = df_products['subcategory'].apply(lambda x: [i for i in x if i.lower() != "|"])

In [9]:
# Create a columns called tags using all the other ones
df_products['tags'] = df_products['product_name']+df_products['category']+df_products['subcategory']

In [10]:
# Creates a new df using the product_id and the tags
new_df = df_products[['product_id', 'tags']]

In [11]:
# Join the words on the list
new_df['tags'] = new_df['tags'].apply(lambda x: ' '.join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x: ' '.join(x))


In [12]:
# Make everything lowercase
new_df['tags'] = new_df['tags'].apply(lambda X: X.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda X: X.lower())


In [13]:
# Vectorizing all the words and excluding some of them (portuguese)
from sklearn.feature_extraction.text import CountVectorizer
stop_words_portuguese = [
    "a", "o", "e", "de", "da", "do", "em", "para", "com", "que", "é", 
    "dos", "das", "um", "uma", "não", "na", "no", "por", "se", "nao",
    # Feel free to add more words
]
cv = CountVectorizer(max_features=5000, stop_words=stop_words_portuguese)

In [14]:
cv.fit_transform(new_df['tags']).toarray().shape

(26575, 5000)

In [15]:
vectors = cv.fit_transform(new_df['tags']).toarray()

In [16]:
# Using PorterStemmer to simplify the words, dont even know if that works in portuguese
import nltk
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [17]:
# Function for the PorterStemmer
def stem(text):
    y = []
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

In [18]:
new_df['tags'] = new_df['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(stem)


In [19]:
# Gets the cosine similarity between all the words
similarity = cosine_similarity(vectors)

In [20]:
# Function responsible for the recommendation of products, based on the similarity
def recommend(product):
    product_index = new_df[new_df['product_id'] == product].index[0]
    distances = similarity[product_index]
    products_list = sorted(list(enumerate(distances)), reverse = True, key = lambda x: x[1])[1:6]

    for i in products_list:
        print(new_df.iloc[i[0]].tags)

In [21]:
recommend('(product_id of a apple watch)')

appl watch se gp 40mm caixa de alumínio com pulseira esportiva smartphon tablet e telefon smartwatch e smartband
appl watch se gp 44mm com pulseira esportiva smartphon tablet e telefon smartwatch e smartband
smartwatch appl watch seri 3 38mm smartphon tablet e telefon smartwatch e smartband
smartwatch appl watch seri 3 38mm smartphon tablet e telefon smartwatch e smartband
smartwatch appl watch seri 3 42mm smartphon tablet e telefon smartwatch e smartband
