In [104]:
import pandas as pd
import numpy as np
from IPython.display import display, clear_output, HTML
from scipy.spatial import distance_matrix
import matplotlib.pyplot as plt

# Similarity Scheme
Each product's ingredients will be given a weight based on its position in the list. To generate the similarity score between two different products, each matched ingredient's weights will be multiplied together, and then all of those will be summed. Finally, this sum is divided by the maximum possible score (which is the weights from the shorter list squared and then summed).

To get the weights, we will use the geometric distribution.

![Geometric Distribution (from Wikipedia)](images/geometric_distribution.png)

**Geometric distribution:**
The probability that the first occurrence of success requires $i$ independent trials each with success probability $p$.
$$
p(1 - p) ^ {i - 1}
$$

In [11]:
# uses a geometric distribution so each weight
# decreases geometrically according to its position.
def generate_weights(n, p):
    weights = []
    total_weight = 0
    for i in range(1, n + 1):
        weight = p * ((1 - p) ** (i - 1)) # geometric pdf
        weights.append(weight)
        total_weight += weight
    normalized_weights = [weight / total_weight for weight in weights]
    return normalized_weights

In [13]:
df = pd.read_csv("data/skincare_products_merged.csv")

In [14]:
def remove_na_ingredients(df):
    # WARNING: modifies df in-place
    
    # Find the brand and names that contain na
    brand_name_contains_na = \
        df[df['INCI name'].isna()][['Brand', 'Name']].drop_duplicates()
    

    # loop through each of these items, replace na ingredient positions
    # with na, adjust the other numbers accordingly, and then drop nas
    for i in range(len(brand_name_contains_na)):
        curr_df = \
            df[(df['Brand'] == brand_name_contains_na.iloc[i]['Brand']) & \
               (df['Name'] == brand_name_contains_na.iloc[i]['Name'])]

        # get the indices of the item and indices where ingredient is nan
        curr_indices = curr_df.index 
        na_indices = curr_df[curr_df['INCI name'].isna()].index

        message = HTML(
            f"<p>Product: {brand_name_contains_na.iloc[i]['Brand']} "
            f"{brand_name_contains_na.iloc[i]['Name']}<br>"
            f"NAN indices: {na_indices}</p>"
        )
        display(message)
            
        for j in na_indices:

            df.loc[j, 'Ingredient_Placement'] = np.nan

            # get integer position of na index to check if it's at the end
            integer_pos = curr_df.index.get_loc(j)

            # if there are more ingredients after, shift their position
            # up by 1
            if len(curr_df) > (integer_pos + 1):
                df.loc[j:curr_indices[-1], 'Ingredient_Placement'] = \
                    df.loc[j:curr_indices[-1], 'Ingredient_Placement'] - 1

        clear_output(wait=True)

    # now dropping nans will yield correct ingredient placement
    return df.dropna(subset=['INCI name'])

    

In [15]:
df2 = remove_na_ingredients(df)

In [16]:
def add_weghts(df, p=0.3):
    new_df = df.copy(deep=True)
    brand_and_names = df[['Brand', 'Name']].drop_duplicates()
    total_weights = []

    for i in range(len(brand_and_names)):
        curr_df = \
            df[(df['Brand'] == brand_and_names.iloc[i]['Brand']) & \
               (df['Name'] == brand_and_names.iloc[i]['Name'])]
        
        curr_weights = generate_weights(len(curr_df), p)

        total_weights += curr_weights
    
    new_df.loc[:,'Weights'] = total_weights

    return new_df
        

In [17]:
df3 = add_weghts(df2)
cols = ['Brand', 'Name', 'Ingredient_Placement', 
        'INCI name', 'Weights']
df3[cols].head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:,'Weights'] = total_weights


Unnamed: 0,Brand,Name,Ingredient_Placement,INCI name,Weights
0,LA MER,Crème de la Mer,1.0,ALGAE EXTRACT,0.3
1,LA MER,Crème de la Mer,2.0,HYDROGENATED MINERAL OIL,0.21
2,LA MER,Crème de la Mer,3.0,PETROLATUM,0.147


In [141]:
df4 = df3[df3["Weights"] != 1]

In [142]:
def create_weight_matrix(df):
    brand_and_names = df.loc[:,['Brand', 'Name']].drop_duplicates()

    ingredients = df['INCI name'].unique()
    ingredients.sort()

    weight_matrix = \
        np.zeros((len(brand_and_names), len(ingredients)))

    for i in range(len(brand_and_names)):
        curr_df = \
            df[(df['Brand'] == brand_and_names.iloc[i]['Brand']) & \
               (df['Name'] == brand_and_names.iloc[i]['Name'])]
        
        curr_ingredient_vector = np.zeros(len(ingredients))

        indices = np.searchsorted(
            ingredients, curr_df.loc[:,'INCI name'].values)

        curr_ingredient_vector[indices] = \
            curr_df.loc[:,'Weights'].values        

        weight_matrix[i,:] = curr_ingredient_vector
    
    return weight_matrix

In [143]:
weight_matrix = create_weight_matrix(df4)

In [144]:
# 1248 products, 3248 ingredients
weight_matrix.shape

(1248, 3248)

let's say I have 100 ingredients and I'm comparing 20 different products that have different amounts of these ingredients. If I wanted to emphasize ingredients that appear first in the list with decreasing importance, does it make sense to have the weights assigned based on position in the list for each product's vector?

"While ingredients listed first may be more common or fundamental in certain contexts, there could be exceptions where less common ingredients listed later are crucial determinants of product similarity."

In [145]:
weighted_distance_matrix = distance_matrix(weight_matrix, weight_matrix)

In [146]:
weighted_distance_matrix[1].shape


(1248,)

In [159]:
sorted_indices = sorted(range(len(weighted_distance_matrix[1])),
                        key = lambda i: weighted_distance_matrix[1][i], reverse=True)

In [160]:
sorted_indices[-1:-5:-1]

[33, 1, 103, 720]

In [163]:
weighted_distance_matrix[1][sorted_indices[-1:-5:-1]]

array([0.        , 0.        , 0.04254434, 0.30658226])

In [164]:
df3[['Brand', 'Name']].drop_duplicates().iloc[sorted_indices[-1:-5:-1],:]['Name'].values

array(['Water Bank Moisture Cream', 'Facial Treatment Essence',
       "Charlotte's Magic Cream Mini",
       '24K Gold Pure Luxury Lift & Firm Hydra-Gel Eye Patches'],
      dtype=object)

In [154]:
weighted_distance_matrix[1][1]

0.0

In [140]:
df3[df3['Name'].isin()]

Unnamed: 0,Label,Brand,Name,Price,Rank,Combination,Dry,Normal,Oily,Sensitive,Ingredient_Placement,Ingredient,INCI name,COSING Ref No,Ingredient Description,Ingredient Function,Weights
453,Moisturizer,KIEHL'S SINCE 1851,Midnight Recovery Concentrate,47,4.4,1,1,1,1,1,1.0,Sunflower Seed Oil.,OZONIZED SUNFLOWER SEED OIL,58059.0,"Helianthus annuus (sunflower) seed oil, produc...",SKIN CONDITIONING,1.0
3540,Moisturizer,JOSIE MARAN,Argan Daily Moisturizer SPF 47,32,3.9,1,1,1,1,1,1.0,**Natural.,HUMAN UMBILICAL BLOOD DERIVED NATURAL KILLER C...,95432.0,Human Umbilical Blood Derived Natural Killer ...,"SKIN CONDITIONING, SKIN PROTECTING",1.0
6096,Moisturizer,FARSÁLI,Volcanic Elixir Polynesian Beauty Oil,39,4.5,0,0,0,0,0,1.0,Calophyllum Tacamahaca Seed oil.,CALOPHYLLUM TACAMAHACA SEED OIL,55204.0,Calophyllum Tacamahaca Seed Oil is the fixed o...,"SKIN CONDITIONING, SKIN CONDITIONING - EMOLLIENT",1.0
19506,Treatment,CAUDALIE,Premier Cru Serum,150,4.1,1,1,1,1,1,1.0,Polyphenols,CAMELLIA SINENSIS POLYPHENOLS,98488.0,Camellia Sinensis Polyphenols is a fraction of...,ANTIOXIDANT,1.0
20382,Treatment,PEACE OUT,Pore Treatment Strips,19,4.2,1,1,1,1,1,1.0,oil,EGG OIL,92397.0,"Egg Yolk Fatty Oil,Egg Yolk Oil,Ovum (EU),Ovum...","HAIR CONDITIONING, SKIN CONDITIONING",1.0
27596,Face Mask,GLAMGLOW,GLAMGLOW x MY LITTLE PONY® #GLITTERMASK GRAVIT...,59,3.9,1,1,1,1,0,1.0,Hyaluronic Acid,HYALURONIC ACID,34315.0,Hyaluronic acid,"ANTISTATIC, HUMECTANT, MOISTURISING, SKIN COND...",1.0
28925,Face Mask,FOREO,Call It A Night Revitalizing & Nourishing UFO™...,9,4.7,1,1,1,1,1,1.0,-Ginseng -Olive Oil,PSEUDOZYMA EPICOLA/SOYBEAN FLOUR/APRICOT KERNE...,93179.0,Pseudozyma Epicola/Soybean Flour/Apricot Kerne...,"EMULSION STABILISING, HAIR CONDITIONING, HUMEC...",1.0
30721,Face Mask,FOREO,H2Overdose Mask,19,5.0,1,1,1,1,1,1.0,-Hyaluronic Acid -Ceramide,ACETOBACTER/BACILLUS/LACTOBACILLUS/SACCHAROMYC...,91507.0,This is a filtrate of the product obtained by ...,SKIN CONDITIONING,1.0
31059,Face Mask,OMOROVICZA,Ultramoor Mud Mask,125,4.5,1,1,1,1,0,1.0,Rosemary,BACILLUS/ROSEMARY LEAF/SOYBEAN FERMENT EXTRACT...,88007.0,Bacillus/Rosemary Leaf/Soybean Ferment Extract...,"ANTIMICROBIAL, HUMECTANT, SKIN CONDITIONING, S...",1.0
40905,Eye cream,APIVITA,Express Beauty Mask With Ginkgo Biloba,30,4.0,0,0,0,0,0,1.0,** Camelia Sinensis,CARTHAMUS TINCTORIUS FLOWER/VITIS VINIFERA FRU...,89462.0,Carthamus Tinctorius Flower/Vitis Vinifera Fru...,"ANTI-SEBUM, ANTIMICROBIAL, ANTIOXIDANT, SKIN C...",1.0
