In [1]:
import pandas as pd
import ast
import numpy as np
from IPython.display import display, clear_output, HTML
from scipy.spatial import distance_matrix
import matplotlib.pyplot as plt

# Similarity Scheme
Each product's ingredients will be given a weight based on its position in the list. To generate the similarity score between two different products, each matched ingredient's weights will be multiplied together, and then all of those will be summed. Finally, this sum is divided by the maximum possible score (which is the weights from the shorter list squared and then summed).

To get the weights, we will use the geometric distribution.

![Geometric Distribution (from Wikipedia)](images/geometric_distribution.png)

**Geometric distribution:**
The probability that the first occurrence of success requires $i$ independent trials each with success probability $p$.
$$
p(1 - p) ^ {i - 1}
$$
A smaller p will produce more evenly distributed weights, whereas a larger p will put more importance on earlier ingredients.

In [2]:
# uses a geometric distribution so each weight
# decreases geometrically according to its position.
def generate_weights(n: int, p: float = 0.2) -> list:
    '''
    n: length of desired weight list
    p: parameter for geometric distribution (between 0 and 1)

    Returns a list of weights that sum to 1 based on the
    geometric distribution.
    '''
    weights = []
    total_weight = 0
    for i in range(1, n + 1):
        weight = p * ((1 - p) ** (i - 1)) # geometric pdf
        weights.append(weight)
        total_weight += weight
    normalized_weights = [weight / total_weight for weight in weights]
    return normalized_weights

generate_weights(5, 0.2)

[0.29747739171822934,
 0.23798191337458352,
 0.19038553069966682,
 0.15230842455973348,
 0.12184673964778678]

## Weighted Distance Matrix
The goal of this similarity scheme is to produce a distance matrix that judges the "distance" of the products based on the geometric weights assigned to each ingredient. We will use every unique ingredient included in the dataset so that each product has a weight assigned to each possible ingredient (if it doesn't have a particular ingredient, the weight is 0).

First, we need to import the data (formatted in `0_data_preprocess.ipynb`) and add the weights column.

In [3]:
# Dataframe exported/imported as a pickle to preserve 
# the columns with a list format (csv gets messy)
df_full = pd.read_pickle('data/skincare_products_listed.pickle')
df = df_full[df_full['Cosing Ref No'].apply(lambda x: len(x) > 1)]
df.head(3)

Unnamed: 0,Label,Brand,Name,Price,Rank,Ingredients,Combination,Dry,Normal,Oily,Sensitive,Cosing Ref No,INCI Name,Function
0,Moisturizer,LA MER,Crème de la Mer,175,4.1,"[Algae (Seaweed) Extract, Mineral Oil, Petro...",1,1,1,1,1,"[54290.0, 95058.0, 79504.0, 34040.0, 34654.0, ...","[ALGAE EXTRACT, HYDROGENATED MINERAL OIL, PETR...","[FRAGRANCE, HUMECTANT, ORAL CARE, SKIN CONDITI..."
1,Moisturizer,SK-II,Facial Treatment Essence,179,4.1,"[Galactomyces Ferment Filtrate (Pitera), Buty...",1,1,1,1,1,"[84397, 74756, 58983, 92472, 37735, 35342, 38173]","[GALACTOMYCES FERMENT FILTRATE, BUTYLENE GLYCO...","[HUMECTANT, FRAGRANCE, HUMECTANT, SKIN CONDITI..."
2,Moisturizer,DRUNK ELEPHANT,Protini™ Polypeptide Cream,68,4.4,"[Water, Dicaprylyl Carbonate, Glycerin, Cet...",1,1,1,1,0,"[92472, 55832, 34040, 75132, 55337, 38182, 583...","[WATER, DICAPRYLYL CARBONATE, GLYCERIN, CETEAR...","[ANTIPLAQUE, SKIN CONDITIONING, SOLVENT, SKIN ..."


In [4]:
# Adds weights column to df
df['Weights'] = \
    df['INCI Name'].apply(lambda x: generate_weights(len(x), p=0.2))

df.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Weights'] = \


Unnamed: 0,Label,Brand,Name,Price,Rank,Ingredients,Combination,Dry,Normal,Oily,Sensitive,Cosing Ref No,INCI Name,Function,Weights
0,Moisturizer,LA MER,Crème de la Mer,175,4.1,"[Algae (Seaweed) Extract, Mineral Oil, Petro...",1,1,1,1,1,"[54290.0, 95058.0, 79504.0, 34040.0, 34654.0, ...","[ALGAE EXTRACT, HYDROGENATED MINERAL OIL, PETR...","[FRAGRANCE, HUMECTANT, ORAL CARE, SKIN CONDITI...","[0.20002126990973731, 0.16001701592778989, 0.1..."
1,Moisturizer,SK-II,Facial Treatment Essence,179,4.1,"[Galactomyces Ferment Filtrate (Pitera), Buty...",1,1,1,1,1,"[84397, 74756, 58983, 92472, 37735, 35342, 38173]","[GALACTOMYCES FERMENT FILTRATE, BUTYLENE GLYCO...","[HUMECTANT, FRAGRANCE, HUMECTANT, SKIN CONDITI...","[0.25307332242756025, 0.2024586579420482, 0.16..."
2,Moisturizer,DRUNK ELEPHANT,Protini™ Polypeptide Cream,68,4.4,"[Water, Dicaprylyl Carbonate, Glycerin, Cet...",1,1,1,1,0,"[92472, 55832, 34040, 75132, 55337, 38182, 583...","[WATER, DICAPRYLYL CARBONATE, GLYCERIN, CETEAR...","[ANTIPLAQUE, SKIN CONDITIONING, SOLVENT, SKIN ...","[0.20000038312461918, 0.16000030649969535, 0.1..."


Now, we can define a function that will create a weight matrix out of the dataframe as it is currently formatted. 

In [5]:
def create_weight_matrix(df):
    '''
    Creates a (n,m) array, with n rows of
    products each containing weights for each of
    the m unique ingredients.
    '''
    names = df['Name']

    ingredients = df.explode("INCI Name")['INCI Name'].unique()
    ingredients.sort()

    weight_matrix = \
        np.zeros((len(names), len(ingredients)))
    
    curr_row = 0

    for i in df.index:
        curr_ingredient_vector = np.zeros(len(ingredients))

        indices = np.searchsorted(
            ingredients, df.loc[i, 'INCI Name'])
        
        curr_ingredient_vector[indices] = \
            df.loc[i, 'Weights']
        
        weight_matrix[curr_row, :] = curr_ingredient_vector
        curr_row += 1

    return weight_matrix 


In [6]:
# 1248 products, 3248 unique ingredients
weight_matrix = create_weight_matrix(df)
weight_matrix.shape

(1248, 3248)

In [7]:
# Saved in data/weighted_distance_matrix.npy
# Otherwise, takes ~1min to compute.
# Uncomment to recompute.

#weighted_distance_matrix = distance_matrix(weight_matrix, weight_matrix)
#np.save('data/weighted_distance_matrix.npy', weighted_distance_matrix)

In [8]:
weighted_distance_matrix = np.load("data/weighted_distance_matrix.npy")
weighted_distance_matrix[0:3, 0:3]

array([[0.        , 0.53000315, 0.442218  ],
       [0.53000315, 0.        , 0.47889144],
       [0.442218  , 0.47889144, 0.        ]])

As expected, diagonals will all be 0 (the closest distance for each product is itself). Diagonals will also be duplicates.

In [9]:
# Creates a list of indices that would sort row 1 (aka, product 2) in
# ascending order, i.e. which products are "closest"
sorted_indices = sorted(range(len(weighted_distance_matrix[1])),
                        key = lambda i: weighted_distance_matrix[1][i])

sorted_indices[0:5] 

[1, 33, 103, 734, 128]

As expected, the most similar product is itself (ID 1).

In [10]:
# Shows the scores associated with the top 5 closest
# products for product no. 2 (including itself)
weighted_distance_matrix[1][sorted_indices[0:4]]

array([0.        , 0.        , 0.07237208, 0.27207928])

The smallest score will always be 0 for its distance from itself. Here, though, we see another 0. We can look at which products are associated with these scores to see why that is.

In [11]:
# Gives the product names for the top 5 closest
# products.
most_similar_products = df.iloc[sorted_indices[1:6],:]['Name'].values
most_similar_products

array(['Facial Treatment Essence Mini',
       'Facial Treatment Essence Karan Singh Limited Edition',
       'Brightening Derm Revival Mask', 'GenOptics Spot Essence Serum',
       'GenOptics Aura Essence Serum'], dtype=object)

As we can see from the results, the product that also had a score of 0 was the mini version of the same product (that tracks).
Now we want to add the top 5 similar products to the dataframe (besides itself). Now, we can put these steps into a function that we can apply to the dataframe.

In [12]:
def get_5_most_similar_products(weighted_distance_matrix: np.ndarray, index: int) -> np.ndarray:
    '''
    weighted_distance_matrix: the distance matrix including the
    similarity "distances" of products
    index: the positional index of the product
    Gets the 5 most similar products (aside from itself) of the 
    product at the given position using the weighted distance matrix. 
    Returns a list.
    '''
    sorted_indices = sorted(range(len(weighted_distance_matrix[index])),
                        key = lambda i: weighted_distance_matrix[index][i])
    
    most_similar_products = df.iloc[sorted_indices[1:6],:]['Name'].values

    return most_similar_products

In [13]:
# Need to reset index here so that when making a
# new column, it uses the same "positional" index
df2 = df.reset_index()

df2['Weighted Similarity Products'] = \
    df.reset_index().reset_index()['level_0'].apply(lambda x: get_5_most_similar_products(weighted_distance_matrix, x))
    
df2.head()

Unnamed: 0,index,Label,Brand,Name,Price,Rank,Ingredients,Combination,Dry,Normal,Oily,Sensitive,Cosing Ref No,INCI Name,Function,Weights,Weighted Similarity Products
0,0,Moisturizer,LA MER,Crème de la Mer,175,4.1,"[Algae (Seaweed) Extract, Mineral Oil, Petro...",1,1,1,1,1,"[54290.0, 95058.0, 79504.0, 34040.0, 34654.0, ...","[ALGAE EXTRACT, HYDROGENATED MINERAL OIL, PETR...","[FRAGRANCE, HUMECTANT, ORAL CARE, SKIN CONDITI...","[0.20002126990973731, 0.16001701592778989, 0.1...",[Little Miss Miracle Limited-Edition Crème de ...
1,1,Moisturizer,SK-II,Facial Treatment Essence,179,4.1,"[Galactomyces Ferment Filtrate (Pitera), Buty...",1,1,1,1,1,"[84397, 74756, 58983, 92472, 37735, 35342, 38173]","[GALACTOMYCES FERMENT FILTRATE, BUTYLENE GLYCO...","[HUMECTANT, FRAGRANCE, HUMECTANT, SKIN CONDITI...","[0.25307332242756025, 0.2024586579420482, 0.16...","[Facial Treatment Essence Mini, Facial Treatme..."
2,2,Moisturizer,DRUNK ELEPHANT,Protini™ Polypeptide Cream,68,4.4,"[Water, Dicaprylyl Carbonate, Glycerin, Cet...",1,1,1,1,0,"[92472, 55832, 34040, 75132, 55337, 38182, 583...","[WATER, DICAPRYLYL CARBONATE, GLYCERIN, CETEAR...","[ANTIPLAQUE, SKIN CONDITIONING, SOLVENT, SKIN ...","[0.20000038312461918, 0.16000030649969535, 0.1...","[C-Tango™ Multivitamin Eye Cream, After-Sun Mi..."
3,3,Moisturizer,LA MER,The Moisturizing Soft Cream,175,3.8,"[Algae (Seaweed) Extract, Cyclopentasiloxane,...",1,1,1,1,1,"[54290.0, 75413.0, 79504.0, 34067.0, 79701.0, ...","[ALGAE EXTRACT, CYCLOPENTASILOXANE, PETROLATUM...","[FRAGRANCE, HUMECTANT, ORAL CARE, SKIN CONDITI...","[0.20000000862718334, 0.16000000690174668, 0.1...","[The Moisturizing Soft Lotion, The Concentrate..."
4,4,Moisturizer,IT COSMETICS,Your Skin But Better™ CC+™ Cream with SPF 50+,38,4.1,"[Water, Snail Secretion Filtrate, Phenyl Tri...",1,1,1,1,1,"[92472.0, 58704.0, 79701.0, 33401.0, 74756.0, ...","[WATER, SNAIL SECRETION FILTRATE, PHENYL TRIME...","[ANTIPLAQUE, SKIN CONDITIONING, SOLVENT, SKIN ...","[0.20000000353369418, 0.16000000282695537, 0.1...",[Your Skin But Better™ CC+Illumination™ Cream ...


In [14]:
# Setting the list of similar products to an empty list
# for products with <2 ingredients

df_full = df_full[~(df_full['Cosing Ref No'].apply(lambda x: len(x) > 1))]
df_full['Weighted Similarity Products'] = [[] for i in range(len(df_full))]
df_full.head()

Unnamed: 0,Label,Brand,Name,Price,Rank,Ingredients,Combination,Dry,Normal,Oily,Sensitive,Cosing Ref No,INCI Name,Function,Weighted Similarity Products
7,Moisturizer,DRUNK ELEPHANT,Virgin Marula Luxury Facial Oil,72,4.4,[100% Unrefined Sclerocraya Birrea (Marula) Ke...,1,1,1,1,0,[],[],[],[]
11,Moisturizer,KIEHL'S SINCE 1851,Midnight Recovery Concentrate,47,4.4,[Caprylic/Capric Triglyceride Dicaprylyl Carbo...,1,1,1,1,1,[58059.0],[OZONIZED SUNFLOWER SEED OIL],[SKIN CONDITIONING],[]
26,Moisturizer,DRUNK ELEPHANT,Virgin Marula Luxury Facial Oil Mini,40,4.5,[100% Unrefined Sclerocraya Birrea (Marula) Ke...,1,1,1,1,0,[],[],[],[]
32,Moisturizer,OLEHENRIKSEN,Sheer Transformation® Perfecting Moisturizer,38,4.2,[Visit the OLEHENRIKSEN boutique],1,1,1,1,1,[],[],[],[]
33,Moisturizer,JOSIE MARAN,100 percent Pure Argan Oil,48,4.5,[Organic Argania Spinosa (Argan) Kernel Oil*. ...,0,1,0,1,1,[],[],[],[]


In [15]:
df2 = df2.set_index("index")
df2.index.name = None

merged_df = pd.concat([df2,df_full])
merged_df.sort_index().head()

Unnamed: 0,Label,Brand,Name,Price,Rank,Ingredients,Combination,Dry,Normal,Oily,Sensitive,Cosing Ref No,INCI Name,Function,Weights,Weighted Similarity Products
0,Moisturizer,LA MER,Crème de la Mer,175,4.1,"[Algae (Seaweed) Extract, Mineral Oil, Petro...",1,1,1,1,1,"[54290.0, 95058.0, 79504.0, 34040.0, 34654.0, ...","[ALGAE EXTRACT, HYDROGENATED MINERAL OIL, PETR...","[FRAGRANCE, HUMECTANT, ORAL CARE, SKIN CONDITI...","[0.20002126990973731, 0.16001701592778989, 0.1...",[Little Miss Miracle Limited-Edition Crème de ...
1,Moisturizer,SK-II,Facial Treatment Essence,179,4.1,"[Galactomyces Ferment Filtrate (Pitera), Buty...",1,1,1,1,1,"[84397, 74756, 58983, 92472, 37735, 35342, 38173]","[GALACTOMYCES FERMENT FILTRATE, BUTYLENE GLYCO...","[HUMECTANT, FRAGRANCE, HUMECTANT, SKIN CONDITI...","[0.25307332242756025, 0.2024586579420482, 0.16...","[Facial Treatment Essence Mini, Facial Treatme..."
2,Moisturizer,DRUNK ELEPHANT,Protini™ Polypeptide Cream,68,4.4,"[Water, Dicaprylyl Carbonate, Glycerin, Cet...",1,1,1,1,0,"[92472, 55832, 34040, 75132, 55337, 38182, 583...","[WATER, DICAPRYLYL CARBONATE, GLYCERIN, CETEAR...","[ANTIPLAQUE, SKIN CONDITIONING, SOLVENT, SKIN ...","[0.20000038312461918, 0.16000030649969535, 0.1...","[C-Tango™ Multivitamin Eye Cream, After-Sun Mi..."
3,Moisturizer,LA MER,The Moisturizing Soft Cream,175,3.8,"[Algae (Seaweed) Extract, Cyclopentasiloxane,...",1,1,1,1,1,"[54290.0, 75413.0, 79504.0, 34067.0, 79701.0, ...","[ALGAE EXTRACT, CYCLOPENTASILOXANE, PETROLATUM...","[FRAGRANCE, HUMECTANT, ORAL CARE, SKIN CONDITI...","[0.20000000862718334, 0.16000000690174668, 0.1...","[The Moisturizing Soft Lotion, The Concentrate..."
4,Moisturizer,IT COSMETICS,Your Skin But Better™ CC+™ Cream with SPF 50+,38,4.1,"[Water, Snail Secretion Filtrate, Phenyl Tri...",1,1,1,1,1,"[92472.0, 58704.0, 79701.0, 33401.0, 74756.0, ...","[WATER, SNAIL SECRETION FILTRATE, PHENYL TRIME...","[ANTIPLAQUE, SKIN CONDITIONING, SOLVENT, SKIN ...","[0.20000000353369418, 0.16000000282695537, 0.1...",[Your Skin But Better™ CC+Illumination™ Cream ...


In [17]:
# Uncomment to re-save dataframe
#merged_df.sort_index().to_json("data/skincare_products_1.json")