# Merging Data
Steps to consider:
* Want to use weighted similarity
    * Multiply the similarity score of each ingredient pair by a weight based on the relative positions of the two ingredients in their respective lists
    * Sum up the weighted similarity scores across all ingredient pairs to obtain an overall similarity score between the two products
    * Assign higher weights to ingredient pairs whose positions in the two lists are close to each other, and to pairs that sit nearer the top of their lists (position 1)
* Need to match ingredients to their place in the ingredient_info data set

In [2]:
import pandas as pd
import re as re

In [3]:
# Load the two preprocessed CSV files used throughout this notebook:
# the ingredient reference table and the skincare product table.
ingredients = pd.read_csv("data/ingredient_info_processed.csv")
products = pd.read_csv("data/skincare_products_processed.csv")

In [16]:
# Preview the first three rows of the product named "Crème de la Mer".
products[products['Name'] == "Crème de la Mer"].head(3)

Unnamed: 0,Label,Brand,Name,Price,Rank,Combination,Dry,Normal,Oily,Sensitive,Ingredient_Placement,Ingredient
0,Moisturizer,LA MER,Crème de la Mer,175,4.1,1,1,1,1,1,1,Algae (Seaweed) Extract
1,Moisturizer,LA MER,Crème de la Mer,175,4.1,1,1,1,1,1,2,Mineral Oil
2,Moisturizer,LA MER,Crème de la Mer,175,4.1,1,1,1,1,1,3,Petrolatum


In [15]:
def ingredient_in_INCI(inci, ingredient_name):
    '''
    Checks whether every word of an ingredient name occurs in an INCI name.

    Both strings are lower-cased and every character that is neither a
    word character nor whitespace is replaced by a space before they are
    split into words, so the comparison ignores case, punctuation and
    word order.

    Returns True only when the ingredient name contains at least one word
    and all of its words are present in the INCI name. Returns False when
    either argument is not a string (e.g. a NaN from a missing cell) or
    when the ingredient name contains no words at all.
    '''
    # Non-string values cannot be tokenised; treat them as "no match"
    # instead of letting re.sub raise a TypeError.
    if not isinstance(inci, str) or not isinstance(ingredient_name, str):
        return False

    words_in_inci = set(re.sub(r'[^\w\s]', ' ', inci).lower().split())
    words_in_name = set(
        re.sub(r'[^\w\s]', ' ', ingredient_name).lower().split()
    )

    # The empty set is a subset of every set, so a blank or
    # punctuation-only name would otherwise match every INCI name.
    if not words_in_name:
        return False

    return words_in_name <= words_in_inci


def get_shortest_INCI(df, ingredient_name):
    ''' 
    Takes the ingredients dataframe (df) and an ingredient name
    as arguments, then returns the matching row from the dataframe if
    a match is found. Otherwise, returns an empty row.
    '''
    filtered_df = \
        df[df['INCI name'].apply(lambda x: ingredient_in_INCI(x, ingredient_name))]
    
    if filtered_df.size == 0:
        ingredient_name_clean = re.sub(r'\([^)]*\)', '', ingredient_name)

        filtered_df = \
            df[df['INCI name'].apply(lambda x: ingredient_in_INCI(x, ingredient_name_clean))]
        
        if filtered_df.size == 0:
            return filtered_df
        
    shortest_index = filtered_df['INCI name'].str.len().idxmin()
    return filtered_df.loc[[shortest_index]]
    

In [84]:
# Example lookup. The recorded output below is ALGAE EXTRACT, which does not
# contain the word "seaweed", so this match comes from the retry that drops
# the parenthesised "(Seaweed)" part of the name.
get_shortest_INCI(ingredients, "Algae (Seaweed) Extract")

Unnamed: 0,COSING Ref No,INCI name,INN name,Ph. Eur. Name,CAS No,EC No,Chem/IUPAC Name / Description,Restriction,Function,Update Date
1475,54290,ALGAE EXTRACT,,,92128-82-0 / 68917-51-1,295-780-4 / -,Algae Extract is an extract of various species...,,"FRAGRANCE, HUMECTANT, ORAL CARE, SKIN CONDITIO...",15/10/2010


In [14]:
# Gets the 'Function' value for the ingredient "CI 14700 (Red 4)".
# .iloc[0] reads it from the first (only) returned row and raises
# IndexError if the lookup found no match.
get_shortest_INCI(ingredients, "CI 14700 (Red 4)")['Function'].iloc[0]

'COLORANT'