In [None]:
import pandas as pd
from rapidfuzz import process, fuzz
import re
import os

from dotenv import load_dotenv
load_dotenv()
FOLDER_PATH = os.getenv("FOLDER_PATH")

if not FOLDER_PATH:
    raise ValueError("FOLDER_PATH not set in .env file!")

In [27]:
prods_data = pd.read_csv(os.path.join(FOLDER_PATH, "data", "products_data.csv"))
ingreds_data = pd.read_csv(os.path.join(FOLDER_PATH, "data", "ingredients_data.csv"))

In [4]:
def clean_text(text):
    if not isinstance(text, str):
        return ""
    return re.sub(r'[^a-z0-9\s]', '', text.lower().strip())

### Product Name Matching

In [5]:
output = ['SW |SS', 'BEAuTh', 'ALL-IN-ONE', 'CC', 'CREAM', 'cover', 'correct', 'conceal', 'with SPF 30']
output_text = ' '.join(output).lower()

output_text = clean_text(output_text)
print(output_text)

sw ss beauth allinone cc cream cover correct conceal with spf 30


In [28]:
prods_data.head()

Unnamed: 0,brand_name,prod_name,prod_descrp,list_of_ingreds
0,Summer Fridays,Lip Butter Balm for Hydration & Shine,summer fridays lip butter balm for hydration ...,"Phytosteryl/Behenyl Dimer Dilinoleate, Diisost..."
1,Glow Recipe,Watermelon Glow PHA + BHA Pore-Tight Toner,glow recipe watermelon glow pha bha poretight...,"Opuntia Ficus-Indica Stem Extract, Citrullus L..."
2,Touchland,Power Mist Hydrating Hand Sanitizer,touchland power mist hydrating hand sanitizer,"Alcohol, Deionized/Demineralized Water, Aloe B..."
3,The Ordinary,Hyaluronic Acid 2% + B5 Hydrating Serum,the ordinary hyaluronic acid 2 b5 hydrating s...,"Aqua/Water/Eau, Hydrolyzed Sodium Hyaluronate,..."
4,LANEIGE,Lip Sleeping Mask Intense Hydration with Vitam...,laneige lip sleeping mask intense hydration wi...,"Diisostearyl Malate, Hydrogenated Polyisobuten..."


In [29]:
# Products Knowledge Base
prods_choices = [clean_text(p) for p in prods_data['prod_descrp'].tolist()]

In [None]:
# Match product name (for image with name/tagline)
def match_product_name(output_text, knowledge_base):
    cleaned_output = clean_text(output_text)
    _, score, idx = process.extractOne(cleaned_output, knowledge_base, scorer=fuzz.token_sort_ratio)
    return prods_data.iloc[idx], score

In [None]:
# For product name image
product_match, score = match_product_name(output_text, prods_choices)
print("Matched Product Name:", product_match)
print("Score:", score)

Matched Product Name: it cosmetics your skin but better cc cream with spf 50
List of Ingredients: ['Water', 'Snail Secretion Filtrate', 'Phenyl Trimethicone', 'Dimethicone', 'Butylene Glycol', 'Butylene Glycol Dicaprylate/Dicaprate', 'Orbignya Oleifera Seed Oil', 'Butyloctyl Salicylate', 'Cetyl Peg/Ppg-10/1 Dimethicone', 'Cyclopentasiloxane', 'Cyclohexasiloxane', 'Magnesium Sulfate', 'Polyglyceryl-4 Isostearate', 'Dimethicone/Vinyl Dimethicone Crosspolymer', 'Aluminum Hydroxide', 'Hexyl Laurate', 'Stearic Acid', 'Calcium Stearate', 'Caprylyl Glycol', 'Triethoxycaprylylsilane', 'Ethylhexylglycerin', 'Citrus Medica Limonum (Lemon) Peel Oil', 'Tocopheryl Acetate', 'Sorbitan Isostearate', 'Phenoxyethanol', 'Citrus Aurantium Bergamia (Bergamot) Fruit Oil', '1,2-Hexanediol', 'Disodium Edta', 'Citrus Aurantium Dulcis (Orange) Peel Oil', 'Citrus Aurantifolia (Lime) Oil', 'Vitis Vinifera (Grape) Seed Oil', 'Punica Granatum Seed Oil', 'Pinus Sylvestris Leaf Oil', 'Persea Gratissima (Avocado) Oil

In [None]:
output_ingreds = product_match['list_of_ingreds'][:10]

### Ingredients Matching

In [45]:
output = ['INGREDIENTS: WATERIAQUAIEAU-GLYCERIN- SODIUM METHYL CO-', 'COYLTAURATE- BUTYLENE GLYCOL:SUCROSE- SALICYLIC ACID.', 'CAFFEINE : ACETYL GLUCOSAMINE: LAMINARIA SACCHARINA EX-', 'TRACT - ARGININE COCOATE- GENTIANA LUTEA (GENTIAN) ROOT', 'EXTRACT -PEG/PPG-18/18 DIMETHICONE: LAMINARIA DIGITATA EX-', 'TRACT - SODIUM HYALURONATE: PPG-6-DECYLTETRADECETH-30:', 'CITRIC ACID-POLYQUATERNIUM', '7-Dl-C12-18 ALKYL DIMONIUM', 'CHLORIDE. DISODIUM', 'PHOSPHATE- SODIUM   HYDROXIDE- DISO-', 'DIUM EDTA- SODIUM BENZOATE- PHENOXYETHANOL : CHLOROXY-', 'LENOL [ILN50436]']

In [39]:
# Ingredients Knowledge Base
ingreds_choices = [clean_text(i) for i in ingreds_data['ingred_name'].tolist()]

In [None]:
def match_ingredients(extracted_ingreds, knowledge_base):
    matched_rows = []

    if len(extracted_ingreds) > 20:
        extracted_ingreds = extracted_ingreds[:20]

    for ing in extracted_ingreds:
        clean_ing = clean_text(ing)
        _, score, idx = process.extractOne(clean_ing, knowledge_base, scorer=fuzz.token_sort_ratio)
        if score > 50:
            matched_rows.append((idx, score))

    return sorted(matched_rows, key=lambda x: x[1], reverse=True)

In [78]:
# For ingredients image
ingreds_matches_idx = match_ingredient_product(output, ingreds_choices)
print("Matched Products by Ingredients:\n", ingreds_matches_idx)

Matched Products by Ingredients:
 [(423, 93.75), (7322, 72.94117647058825), (64200, 71.7948717948718), (704, 71.42857142857143), (1079, 66.66666666666667), (7868, 63.76811594202898), (629, 63.29113924050633), (77313, 61.1764705882353), (10019, 58.82352941176471), (37592, 56.310679611650485), (3331, 55.55555555555556)]


In [79]:
ingreds_data.iloc[[idx[0] for idx in ingreds_matches_idx]]

Unnamed: 0,ingred_name,ratingscore,skin_profile
423,Sodium Chloride,3.9,Normal Dry Combination Oily concerns: Fine Lin...
7322,Gentiana Lutea (Gentian) Root Extract,4.6,Normal Dry Oily Combination concerns: Dryness ...
64200,polyquaternium,4.1,Normal Dry Combination Oily Sensitive concerns:
704,Sodium Ascorbyl Phosphate,4.6,
1079,Laminaria Digitata Extract,4.5,Normal Dry Combination Oily concerns:
7868,Ppg-6-Decyltetradeceth-30,0.0,
629,Laminaria Saccharina Extract,4.2,
77313,Phenoxyethanol (Source Aromatic Ether),4.0,
10019,Butylene Glycol Dicaprylate/Dicaprate,4.5,
37592,Sodium Benzoate. Pineapple Face Mask:INGREDIEN...,5.0,Normal Dry Combination Oily Sensitive concerns:


In [59]:
output[:10]

['INGREDIENTS: WATERIAQUAIEAU-GLYCERIN- SODIUM METHYL CO-',
 'COYLTAURATE- BUTYLENE GLYCOL:SUCROSE- SALICYLIC ACID.',
 'CAFFEINE : ACETYL GLUCOSAMINE: LAMINARIA SACCHARINA EX-',
 'TRACT - ARGININE COCOATE- GENTIANA LUTEA (GENTIAN) ROOT',
 'EXTRACT -PEG/PPG-18/18 DIMETHICONE: LAMINARIA DIGITATA EX-',
 'TRACT - SODIUM HYALURONATE: PPG-6-DECYLTETRADECETH-30:',
 'CITRIC ACID-POLYQUATERNIUM',
 '7-Dl-C12-18 ALKYL DIMONIUM',
 'CHLORIDE. DISODIUM',
 'PHOSPHATE- SODIUM   HYDROXIDE- DISO-']