In [10]:
import pandas as pd
import numpy as np
from IPython.display import display, clear_output, HTML

# Similarity Scheme
Each product's ingredients will be given a weight based on its position in the list. To generate the similarity score between two different products, each matched ingredient's weights will be multiplied together, and then all of those will be summed. Finally, this sum is divided by the maximum possible score (which is the weights from the shorter list squared and then summed).

To get the weights, we will use the geometric distribution.

**Geometric distribution:**
The probability that the first occurrence of success requires $i$ independent trials each with success probability $p$.
$$
p(1 - p) ^ {i - 1}
$$

![Geometric Distribution (from Wikipedia)](images/geometric_distribution.png)

In [11]:
# uses a geometric distribution so each weight
# decreases geometrically according to its position.
def generate_weights(n, p):
    weights = []
    total_weight = 0
    for i in range(1, n + 1):
        weight = p * ((1 - p) ** (i - 1)) # geometric pdf
        weights.append(weight)
        total_weight += weight
    normalized_weights = [weight / total_weight for weight in weights]
    return normalized_weights

In [12]:
generate_weights(4, 0.3)

[0.3947887879984208,
 0.2763521515988946,
 0.19344650611922617,
 0.1354125542834583]

In [13]:
df = pd.read_csv("data/skincare_products_merged.csv")

In [14]:
def remove_na_ingredients(df):
    # WARNING: modifies df in-place
    
    # Find the brand and names that contain na
    brand_name_contains_na = \
        df[df['INCI name'].isna()][['Brand', 'Name']].drop_duplicates()
    

    # loop through each of these items, replace na ingredient positions
    # with na, adjust the other numbers accordingly, and then drop nas
    for i in range(len(brand_name_contains_na)):
        curr_df = \
            df[(df['Brand'] == brand_name_contains_na.iloc[i]['Brand']) & \
               (df['Name'] == brand_name_contains_na.iloc[i]['Name'])]

        # get the indices of the item and indices where ingredient is nan
        curr_indices = curr_df.index 
        na_indices = curr_df[curr_df['INCI name'].isna()].index

        message = HTML(
            f"<p>Product: {brand_name_contains_na.iloc[i]['Brand']} "
            f"{brand_name_contains_na.iloc[i]['Name']}<br>"
            f"NAN indices: {na_indices}</p>"
        )
        display(message)
            
        for j in na_indices:

            df.loc[j, 'Ingredient_Placement'] = np.nan

            # get integer position of na index to check if it's at the end
            integer_pos = curr_df.index.get_loc(j)

            # if there are more ingredients after, shift their position
            # up by 1
            if len(curr_df) > (integer_pos + 1):
                df.loc[j:curr_indices[-1], 'Ingredient_Placement'] = \
                    df.loc[j:curr_indices[-1], 'Ingredient_Placement'] - 1

        clear_output(wait=True)

    # now dropping nans will yield correct ingredient placement
    return df.dropna(subset=['INCI name'])

    

In [15]:
df2 = remove_na_ingredients(df)

In [16]:
def add_weghts(df, p=0.3):
    brand_and_names = df.loc[:,['Brand', 'Name']].drop_duplicates()
    total_weights = []

    for i in range(len(brand_and_names)):
        curr_df = \
            df[(df['Brand'] == brand_and_names.iloc[i]['Brand']) & \
               (df['Name'] == brand_and_names.iloc[i]['Name'])]
        
        curr_weights = generate_weights(len(curr_df), p)

        total_weights += curr_weights
    
    df.loc[:,'Weights'] = total_weights

    return df
        

In [17]:
df3 = add_weghts(df2)
cols = ['Brand', 'Name', 'Ingredient_Placement', 
        'INCI name', 'Weights']
df3[cols].head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:,'Weights'] = total_weights


Unnamed: 0,Brand,Name,Ingredient_Placement,INCI name,Weights
0,LA MER,Crème de la Mer,1.0,ALGAE EXTRACT,0.3
1,LA MER,Crème de la Mer,2.0,HYDROGENATED MINERAL OIL,0.21
2,LA MER,Crème de la Mer,3.0,PETROLATUM,0.147


In [None]:
def create_weight_matrix(df):
    brand_and_names = df.loc[:,['Brand', 'Name']].drop_duplicates()
    ingredients = df['INCI name'].unique()
    ingredients.sort()
    weight_matrix = np.zeros((len(brand_and_names), len(ingredients)))

    for i in range(len(brand_and_names)):
        curr_df = \
            df[(df['Brand'] == brand_and_names.iloc[i]['Brand']) & \
               (df['Name'] == brand_and_names.iloc[i]['Name'])]
        
        curr_ingredient_vector = np.zeros(len(ingredients))
        indices = np.searchsorted(ingredients, curr_df['INCI name'])

        curr_ingredient_vector[indices] = curr_df['Weights']

In [38]:
ingredients = df3['INCI name'].unique()
ingredients.sort()
arr = np.zeros(len(ingredients))
indices = np.searchsorted(ingredients, df3.loc[40:60, 'INCI name'].values)
arr[indices] = df3.loc[40:60, 'Weights'].values
arr

array([0., 0., 0., ..., 0., 0., 0.])

In [40]:
arr[arr != 0]

array([1.91004258e-07, 2.47062900e-02, 2.28846493e-01, 1.02900000e-01,
       7.20300001e-02, 2.10000000e-01, 3.26923561e-01, 1.47000000e-01,
       5.49460429e-02, 1.72944030e-02, 1.60192545e-01, 3.52947000e-02,
       1.21060821e-02, 8.47425748e-03, 5.93198023e-03, 7.84943470e-02,
       3.84622300e-02, 5.04210000e-02, 3.00000000e-01])