In [20]:
import pandas as pd
import ast
import numpy as np
from IPython.display import display, clear_output, HTML
from scipy.spatial import distance_matrix
import matplotlib.pyplot as plt

# Similarity Scheme
Each ingredient in a product is given a weight based on its position in the product's ingredient list. To generate the similarity score between two different products, the weights of each matched ingredient are multiplied together, and these products are then summed. Finally, this sum is divided by the maximum possible score (the sum of the squared weights from the shorter list).

To get the weights, we will use the geometric distribution.

![Geometric Distribution (from Wikipedia)](images/geometric_distribution.png)

**Geometric distribution:**
The probability that the first success occurs on trial $i$, where the trials are independent and each succeeds with probability $p$, is
$$
p(1 - p) ^ {i - 1}
$$
A smaller $p$ will produce more evenly distributed weights, whereas a larger $p$ will put more importance on earlier ingredients.

In [7]:
# Uses a geometric distribution so each weight
# decreases geometrically according to its position.
def generate_weights(n, p):
    '''
    Returns n position-based weights that sum to 1.

    The weight for position i (1-indexed) is proportional to the
    geometric pmf p * (1 - p) ** (i - 1), so earlier positions
    receive larger weights. The raw values are then divided by
    their total so the returned weights are normalized.

    Parameters
    ----------
    n : int
        Number of weights to generate. If n <= 0, an empty list
        is returned.
    p : float
        Success probability of the geometric distribution. Must
        satisfy 0 < p <= 1.

    Returns
    -------
    list of float
        The normalized weights, one per position.

    Raises
    ------
    ValueError
        If p is not in (0, 1]. With p == 0 every raw weight is 0
        and normalization would divide by zero; values outside
        the interval produce negative or meaningless weights.
    '''
    if not 0 < p <= 1:
        raise ValueError(
            'p must satisfy 0 < p <= 1, got {!r}'.format(p))

    weights = []
    total_weight = 0
    for i in range(1, n + 1):
        weight = p * ((1 - p) ** (i - 1))  # geometric pmf
        weights.append(weight)
        total_weight += weight

    # Normalize so the truncated distribution sums to 1.
    return [weight / total_weight for weight in weights]

# Example: normalized weights for a 5-ingredient list with p = 0.2
generate_weights(5, 0.2)

[0.29747739171822934,
 0.23798191337458352,
 0.19038553069966682,
 0.15230842455973348,
 0.12184673964778678]

## Weighted Distance Matrix
The goal of this similarity scheme is to produce a distance matrix that judges the "distance" of the products based on the geometric weights assigned to each ingredient. We will use every unique ingredient included in the dataset so that each product has a weight assigned to each possible ingredient (if it doesn't have a particular ingredient, the weight is 0).

First, we need to import the data (formatted in `0_data_preprocess.ipynb`) and add the weights column.

In [86]:
# Dataframe exported/imported as a pickle to preserve
# the columns with a list format (csv gets messy).
# NOTE(review): unpickling can execute arbitrary code, so only
# load pickle files that were produced by a trusted source.
df = pd.read_pickle('data/skincare_products_listed.pickle')
df.head(3)

Unnamed: 0,Label,Brand,Name,Price,Rank,Ingredients,Combination,Dry,Normal,Oily,Sensitive,Cosing Ref No,INCI Name,Function
0,Moisturizer,LA MER,Crème de la Mer,175,4.1,"[Algae (Seaweed) Extract, Mineral Oil, Petro...",1,1,1,1,1,"[54290.0, 95058.0, 79504.0, 34040.0, 34654.0, ...","[ALGAE EXTRACT, HYDROGENATED MINERAL OIL, PETR...","[FRAGRANCE, HUMECTANT, ORAL CARE, SKIN CONDITI..."
1,Moisturizer,SK-II,Facial Treatment Essence,179,4.1,"[Galactomyces Ferment Filtrate (Pitera), Buty...",1,1,1,1,1,"[84397, 74756, 58983, 92472, 37735, 35342, 38173]","[GALACTOMYCES FERMENT FILTRATE, BUTYLENE GLYCO...","[HUMECTANT, FRAGRANCE, HUMECTANT, SKIN CONDITI..."
2,Moisturizer,DRUNK ELEPHANT,Protini™ Polypeptide Cream,68,4.4,"[Water, Dicaprylyl Carbonate, Glycerin, Cet...",1,1,1,1,0,"[92472, 55832, 34040, 75132, 55337, 38182, 583...","[WATER, DICAPRYLYL CARBONATE, GLYCERIN, CETEAR...","[ANTIPLAQUE, SKIN CONDITIONING, SOLVENT, SKIN ..."


In [61]:
# Attach a 'Weights' column: one geometric weight (p = 0.2) for each
# entry of the product's 'INCI Name' list, in list order.
df['Weights'] = df['INCI Name'].map(
    lambda inci_names: generate_weights(len(inci_names), p=0.2)
)

df.head(3)

Unnamed: 0,Label,Brand,Name,Price,Rank,Ingredients,Combination,Dry,Normal,Oily,Sensitive,Cosing Ref No,INCI Name,Function,Weights
0,Moisturizer,LA MER,Crème de la Mer,175,4.1,"[Algae (Seaweed) Extract, Mineral Oil, Petro...",1,1,1,1,1,"[54290.0, 95058.0, 79504.0, 34040.0, 34654.0, ...","[ALGAE EXTRACT, HYDROGENATED MINERAL OIL, PETR...","[FRAGRANCE, HUMECTANT, ORAL CARE, SKIN CONDITI...","[0.20002126990973731, 0.16001701592778989, 0.1..."
1,Moisturizer,SK-II,Facial Treatment Essence,179,4.1,"[Galactomyces Ferment Filtrate (Pitera), Buty...",1,1,1,1,1,"[84397, 74756, 58983, 92472, 37735, 35342, 38173]","[GALACTOMYCES FERMENT FILTRATE, BUTYLENE GLYCO...","[HUMECTANT, FRAGRANCE, HUMECTANT, SKIN CONDITI...","[0.25307332242756025, 0.2024586579420482, 0.16..."
2,Moisturizer,DRUNK ELEPHANT,Protini™ Polypeptide Cream,68,4.4,"[Water, Dicaprylyl Carbonate, Glycerin, Cet...",1,1,1,1,0,"[92472, 55832, 34040, 75132, 55337, 38182, 583...","[WATER, DICAPRYLYL CARBONATE, GLYCERIN, CETEAR...","[ANTIPLAQUE, SKIN CONDITIONING, SOLVENT, SKIN ...","[0.20000038312461918, 0.16000030649969535, 0.1..."


Now, we can define a function that will create a weight matrix out of the dataframe as it is currently formatted. 

In [66]:
def create_weight_matrix(df):
    '''
    Creates a (n, m) array, with n rows of products each
    containing weights for each of the m unique ingredients.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an 'INCI Name' column holding a list of
        ingredient names per product and a 'Weights' column
        holding a list of weights of the same length.

    Returns
    -------
    numpy.ndarray
        Array of shape (len(df), m). Row k corresponds to the k-th
        row of df (by position), and the columns follow the sorted
        order of the unique ingredient names. Ingredients a product
        does not contain have weight 0.

    Notes
    -----
    * Rows are matched by position rather than by index label, so
      a non-unique DataFrame index is handled correctly.
    * Products with an empty ingredient list produce an all-zero
      row instead of introducing a NaN "ingredient".
    * If an ingredient appears more than once in a product's list,
      its weights are summed. Plain fancy-index assignment leaves
      the result unspecified when an index is repeated.
    '''
    inci_lists = df['INCI Name']

    # An empty list explodes to NaN; drop it so only real ingredient
    # names are sorted and given a column.
    ingredients = np.sort(inci_lists.explode().dropna().unique())
    column_of = {name: col for col, name in enumerate(ingredients)}

    weight_matrix = np.zeros((len(df), len(ingredients)))

    product_rows = zip(inci_lists, df['Weights'])
    for row_pos, (names, weights) in enumerate(product_rows):
        columns = np.array(
            [column_of[name] for name in names], dtype=np.intp)
        # np.add.at accumulates correctly for repeated columns.
        np.add.at(
            weight_matrix[row_pos], columns,
            np.asarray(weights, dtype=float))

    return weight_matrix


In [74]:
# One row per product, one column per unique ingredient:
# 1248 products, 3248 unique ingredients
weight_matrix = create_weight_matrix(df)
weight_matrix.shape

(1248, 3248)

In [76]:
# Pairwise distances between the products' weight vectors
# (Minkowski p-norm with p=2, the scipy default).
# Takes ~1min to compute, so the result is also written to
# data/weighted_distance_matrix.npy for later reuse.
weighted_distance_matrix = distance_matrix(
    weight_matrix, weight_matrix, p=2)
np.save(
    'data/weighted_distance_matrix.npy',
    weighted_distance_matrix,
)

In [81]:
# Top-left 3x3 corner: distances among the first three products
weighted_distance_matrix[:3, :3]

array([[0.        , 0.53000315, 0.442218  ],
       [0.53000315, 0.        , 0.47889144],
       [0.442218  , 0.47889144, 0.        ]])

As expected, the diagonal entries are all 0 (each product is at distance 0 from itself). The matrix is also symmetric, so the entries above the diagonal are duplicates of the entries below it.

In [82]:
# Rank every product by its distance to row 1 (aka, product 2),
# closest first; ties keep their original row order.
distances_to_product = weighted_distance_matrix[1]
sorted_indices = [
    position
    for position, _ in sorted(enumerate(distances_to_product),
                              key=lambda pair: pair[1])
]

sorted_indices[:5]

[1, 33, 103, 734, 128]

As expected, the most similar product is itself (ID 1).

In [83]:
# Distances of the 4 closest products to product no. 2
# (row index 1), including the product itself
weighted_distance_matrix[1, sorted_indices[:4]]

array([0.        , 0.        , 0.07237208, 0.27207928])

The smallest score will always be 0 for its distance from itself. Here, though, we see another 0. We can look at which products are associated with these scores to see why that is.

In [85]:
# Gives the product names for the 4 closest products
# (including the product itself).
# sorted_indices are row positions in the distance matrix, which
# line up one-to-one with the rows of df. Index df directly:
# dropping duplicate (Brand, Name) rows first would shift the
# positions and return the wrong products.
most_similar_products = df.iloc[sorted_indices[0:4]]['Name'].values
print(most_similar_products)

['Facial Treatment Essence' 'Facial Treatment Essence Mini'
 'Facial Treatment Essence Karan Singh Limited Edition'
 'Brightening Derm Revival Mask']


As we can see from the results, the product that also had a score of 0 was the mini version of the same product (that tracks).
Now we want to add the top 5 similar products to the dataframe (besides itself).