# Ingredient List Cleaning & Processing

## Data Upload & Pre-Cleaning  

In [1]:
# Packages
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
from math import pi
import plotly.express as px

### Upload list of products with their respective ingredient lists.
* Data was collected manually from different sources (manufacturers. retails & skincare focused websites)
* The ingredient lists were formatted and pre-cleaned.

In [18]:
# ImportProduct List with respective ingredient lists
spf = pd.read_csv('Skinlytics - IngredientList-SPF.csv')

# Dataset pre-cleaning & formatting
def clean_data(df):
    """Clean dataframe by renaming columns, adjusting string formats, and creating a cleaned_list column."""

    # Rename columns
    df.rename(columns={'Ingredient List ': 'ingredient_list', 'Product ': 'Product'}, inplace=True)
    
    # Remove whitespace from 'Brand' and combine 'Brand' and 'Product' into 'Product'
    df['Brand'] = df['Brand'].str.replace(' ', '', regex=False)
    df['Product'] = df['Brand'] + '-' + df['Product']

    # Clean 'ingredient_list'
    df['ingredient_list'] = df['ingredient_list'].str.lower()
    df['ingredient_list'] = df['ingredient_list'].str.replace('|', ',', regex=False)
    df['ingredient_list'] = df['ingredient_list'].str.replace('*', ' ', regex=False)
    df['ingredient_list'] = df['ingredient_list'].str.replace('%', ' ', regex=False)
    df['ingredient_list'] = df['ingredient_list'].str.replace('\n', ' ', regex=False)
    df['ingredient_list'] = df['ingredient_list'].str.replace('\u200b', ' ', regex=False)
    df['ingredient_list'] = df['ingredient_list'].str.replace(' ', '', regex=False)
    df['ingredient_list'] = df['ingredient_list'].str.replace('\(\d\.\d\d\)|\(\d\d\.\d\d\)', '', regex=True)

    # Duplicate 'ingredient_list' to 'cleaned_list'
    df['cleaned_list'] = df['ingredient_list']
    
    return df

# Use the function
spf = clean_data(spf)
spf[:5]

Unnamed: 0,Category,Market,Brand,Product,ingredient_list,cleaned_list
0,SPF,AU,BondiSands,BondiSands-Fragrance Free Body lotion SPF 50,"homosalate,octocrylene,octylsalicylate,butylme...","homosalate,octocrylene,octylsalicylate,butylme..."
1,SPF,DE,DM,DM-Sun Dance Ultra sensitive Gel creme SPF 50,"aqua,alcoholdenat.,dibutyladipate,diethylamino...","aqua,alcoholdenat.,dibutyladipate,diethylamino..."
2,SPF,DE,DM,DM-Balea Niacinamide 10% moisturizer SPF 30,"aqua,niacinamide,dibutyladipate,glycerin,ethyl...","aqua,niacinamide,dibutyladipate,glycerin,ethyl..."
3,SPF,DE,DM,DM-Alverde Clear Beauty Getonte tagescreme,"aqua,oleaeuropaeafruitoil,ci77891,alcoholdenat...","aqua,oleaeuropaeafruitoil,ci77891,alcoholdenat..."
4,SPF,DE,Eucerin,Eucerin-Sun Oil Control,"aqua,homosalate,polymethylsilsesquioxane,butyl...","aqua,homosalate,polymethylsilsesquioxane,butyl..."


### Upload Ingredient Classification from INCIDecoder 
* ingredient_classification.csv stores webscrapped data from https://incidecoder.com/ingredient-functions. INCI Decoder uses a list of standardized and internationally accepted names used in the declaration of ingredients on cosmetic and personal care products published by the Personal Care Product Council, the leading US trade association representing cosmetics and personal care companies. 

In [3]:
ingredient_class = pd.read_csv('ingredient_classification.csv')
ingredient_class['Name_cleaned'] = ingredient_class['Name'].str.lower()
ingredient_class['Name_cleaned'] = ingredient_class['Name_cleaned'].str.replace('-', '', regex=False)
ingredient_class['Subcategory'] = 'N/A'
ingredient_class.head()

Unnamed: 0,Classification,Name,Name_cleaned,Subcategory
0,abrasive-scrub,Actinidia-Chinensis-Seed,actinidiachinensisseed,
1,abrasive-scrub,Adansonia-Digitata-Seed-Powder,adansoniadigitataseedpowder,
2,abrasive-scrub,Alumina,alumina,
3,abrasive-scrub,Aluminum-Silicate,aluminumsilicate,
4,abrasive-scrub,Amethyst-Powder,amethystpowder,


In [17]:
ingredient_class[ingredient_class['Classification'] == 'soothing']

array(['4-T-Butylcyclohexanol', 'Acanthopanax-Senticosus-Root-Extract',
       'Acetyl-Dipeptide-1-Cetyl-Ester', 'Achillea-Millefolium-Extract',
       'Achillea-Millefolium-Oil', 'Aesculus-Hippocastanum-Seed-Extract',
       'Albatrellus-Confluens-Extract', 'Aleurites-Moluccana-Seed-Oil',
       'Allantoin', 'Allantoin-Acetyl-Methionine', 'Allantoin-Ascorbate',
       'Allantoin-Calcium-Pantothenate', 'Allantoin-Glycyrrhetinic-Acid',
       'Aloe-Arborescens-Leaf-Extract', 'Aloe-Barbadensis-Leaf-Extract',
       'Aloe-Barbadensis-Leaf-Juice',
       'Aloe-Barbadensis-Leaf-Juice-Powder', 'Aloe-Ferox-Leaf-Gel',
       'Alteromonas-Ferment-Extract', 'Ananas-Sativus-Fruit-Extract',
       'Anthemis-Nobilis-Flower-Extract', 'Anthemis-Nobilis-Flower-Oil',
       'Arctium-Lappa-Root-Extract', 'Avena-Sativa-Bran-Extract',
       'Avena-Sativa-Kernel-Extract', 'Avena-Sativa-Kernel-Flour',
       'Avena-Sativa-Meal-Extract', 'Azelaic-Acid', 'Azulene',
       'Berberis-Vulgaris-Root-Extract', 'B

#### Add subclassifications to INCIDecoder data

* Skin-identical ingredient Category has two separate sub-categories with disticnt roles:
    - Natural Moisturizing Factors (NMF): Glycerin, Hyaluronic acid
    - Stratum Corneum Lipids (SCL): ceramides, cholesterol, fatty acids 
    - aminoacids 

* Sunscreen has many different types of sub categories:
    - Physical Filters
    - Chemical Filters
    - Hydrid Filters
    - Broad Spectrum 
    - UVA Protection Filters
    - UVB Protection Filters
    
    
* There are also ingredients with water resistant abilities, useful in SPFs

In [11]:
ingredient_class['Classification'].unique()

array(['abrasive-scrub', 'absorbent-mattifier', 'anti-acne',
       'antimicrobial-antibacterial', 'antioxidant', 'astringent',
       'buffering', 'cell-communicating-ingredient', 'chelating',
       'colorant', 'deodorant', 'emollient', 'emulsifying',
       'emulsion-stabilising', 'exfoliant', 'moisturizer-humectant',
       'perfuming', 'preservative', 'skin-brightening',
       'skin-identical-ingredient', 'solvent', 'soothing', 'sunscreen',
       'surfactant-cleansing', 'viscosity-controlling'], dtype=object)

In [13]:
ingredient_class[ingredient_class['Classification'] == 'skin-identical-ingredient']['Name'].unique()

array(['Acetyl-Glucosamine', 'Alanine', 'Arginine', 'Aspartic-Acid',
       'Ceramide', 'Ceramide-AP', 'Ceramide-EOP', 'Ceramide-NG',
       'Ceramide-NP', 'Cholesterol', 'Collagen-Amino-Acids', 'Diglycerin',
       'Glycerin', 'Glycine',
       'Helianthus-Annuus-Seed-Oil-Unsaponifiables', 'Histidine',
       'Hyaluronic-Acid', 'Hydroxypalmitoyl-Sphinganine',
       'Hydroxypropyl-Bispalmitamide-MEA', 'Isoleucine',
       'Jojoba-Oil/Macadamia-Seed-Oil-Esters', 'Lacto-Ceramides',
       'Linoleic-Acid', 'Linolenic-Acid', 'Palmitic-Acid', 'Pca',
       'Phenylalanine', 'Phospholipids', 'Phytosphingosine',
       'Phytosteryl-Macadamiate', 'Proline', 'Serine',
       'Sodium-Chondroitin-Sulfate', 'Sodium-Hyaluronate',
       'Sodium-Hyaluronate-Crosspolymer', 'Sodium-PCA', 'Squalane',
       'Squalene', 'Threonine', 'Urea', 'Valine'], dtype=object)

In [None]:
nmf = ['Acetyl-Glucosamine', 'Alanine', 'Arginine', 'Aspartic-Acid',
       , 'Collagen-Amino-Acids', 'Diglycerin',
       'Glycerin', 'Glycine',
       'Helianthus-Annuus-Seed-Oil-Unsaponifiables', 'Histidine',
       'Hyaluronic-Acid', 'Hydroxypalmitoyl-Sphinganine',
       'Hydroxypropyl-Bispalmitamide-MEA', 'Isoleucine',
       'Jojoba-Oil/Macadamia-Seed-Oil-Esters', 'Lacto-Ceramides',
       'Linoleic-Acid', 'Linolenic-Acid', 'Palmitic-Acid', 'Pca',
       'Sodium-Chondroitin-Sulfate', 'Sodium-Hyaluronate',
       'Sodium-Hyaluronate-Crosspolymer', 'Sodium-PCA',
       'Squalene', 'Threonine', 'Urea']
scl = [ 'Alanine', 'Arginine', 'Aspartic-Acid',
       'Ceramide', 'Ceramide-AP', 'Ceramide-EOP', 'Ceramide-NG',
       'Ceramide-NP', 'Cholesterol', 'Collagen-Amino-Acids', 'Diglycerin',
       , 'Glycine',
       'Helianthus-Annuus-Seed-Oil-Unsaponifiables', 'Histidine',
 'Hydroxypalmitoyl-Sphinganine',
       'Hydroxypropyl-Bispalmitamide-MEA', 'Isoleucine',
       'Jojoba-Oil/Macadamia-Seed-Oil-Esters', 'Lacto-Ceramides',
       'Linoleic-Acid', 'Linolenic-Acid', 'Palmitic-Acid',
        'Phospholipids', 'Phytosphingosine',
       'Phytosteryl-Macadamiate', ,'Squalane']
aminoacids = ['Valine','Threonine', 'Serine', 'Proline', 'Phenylalanine']

In [None]:
## Add subclassifications to INCIDecoder data 
spf_physical = []
spf_chemical = ['avobenzone', 'uvinul_A_plus', 'ensulizole',
                'homosalate', 'tinosorb_S', 'uvasorb_heb', 'uvinul_A_plus']
spf_UVA = ['avobenzone', 'uvinul_A_plus']
spf_UVB = ['ensulizole', 'homosalate',
           'octisalate', 'octocrylene', 'uvasorb_heb']
spf_broad_spectrum = ['tinosorb_S']
spf_water_resistance = ['acrylates/dimethiconecopolymer',
                        'acrylatescopolymer', 'octocrylene', 'uvasorb_heb']
spf_booster = ['c2-5alkylbenzoate', 'dibutyladipate',
               'galactoarabinan', 'aluminumhydroxide', 'butyloctisalate', ]
spf_chemical = ['avobenzone', 'uvinul_A_plus', 'ensulizole',
                'homosalate', 'tinosorb_S', 'uvasorb_heb', 'uvinul_A_plus']
spf_UVB = ['ensulizole', 'homosalate',
           'octisalate', 'octocrylene', 'uvasorb_heb']
spf_broad_spectrum = ['tinosorb_S']

len(spf_chemical)
spf_chemical = ['avobenzone', 'uvinul_A_plus', 'ensulizole',
                'homosalate', 'tinosorb_S', 'uvasorb_heb', 'uvinul_A_plus']
spf_UVB = ['ensulizole', 'homosalate',
           'octisalate', 'octocrylene', 'uvasorb_heb']
spf_broad_spectrum = ['tinosorb_S']

len(spf_chemical)
fragrance_and_essential_oils = [
    'aroma', 'benzylbenzoate', 'benzylsalicylate', ]
drying_alcohols = ['alcohol', 'alcoholdenat']
fatty_alcohols = ['arachidylalcohol', 'behenylalcohol', ]
abrasive_scrub = ['bambusaarundinaceastempowder', ]

## Preparing ingredient list for the Similarity Matrix Calculation



* Standardization: Standardize the format and naming of your ingredients. This might involve converting all text to lower case, removing punctuation, and using the same name for the same ingredient (for example, always using 'vitamin c' instead of sometimes using 'ascorbic acid').

* Remove concentration: Sometimes, the concentration of the ingredient is included in the list. Removing this information can make the cleaning process easier and ensure ingredients are recognized as the same, regardless of their concentration in different products.

* Group synonyms: Different names might be used for the same ingredient. Make sure you group these together under a single standardized name. For example, "Vitamin C", "L-Ascorbic Acid", "Sodium Ascorbyl Phosphate" could all be considered as "Vitamin C".

* Remove non-essential ingredients: Some ingredients such as colorings, fragrances, or preservatives are not typically the key functional ingredients in skincare products. Depending on the nature of your analysis, you may want to remove these to focus on the active ingredients.

* Tokenization: Split the ingredients list into separate 'tokens' or 'words', with one token for each ingredient. This is crucial for text analysis methods and also aids in cleaning, as you can work on each ingredient separately.

### Remove redundant ingredients & format names 

In [7]:
# Cleaning Tool 
all_ingreds = []

for i in spf['ingredient_list']:
    ingreds_list = i.split(',')
    for j in ingreds_list:
        all_ingreds.append(j)

all_ingreds = sorted(set(all_ingreds))
# all_ingreds[0:10]

len(all_ingreds)

572

### Standardization of the terminology for SPF filters

Broad spectrum Filters: 

- Bis-Ethylhexyloxyphenol Methoxyphenyl Triazine (also called Tinosorb S, Bemotrizinol)
- Drometrizole Trisiloxane (Mexoryl XL)
- Methylene Bis-Benzotriazolyl Tetramethylbutylphenol(Tinosorb M, Bisoctrizole)

Chemical SPF Filters:

- 4-Methylbenzylidene Camphor (also called Enzacamene)
- Benzophenone-3 (also called Oxybenzone)
- Benzophenone-4 (also called Sulisobenzone)
- Benzophenone-5 (also called Sodium Sulisobenzone)
- Butyl Methoxydibenzoylmethane (also called Avobenzone)
- Diethylamino Hydroxybenzoyl Hexyl Benzoate (Uvinil A Plus, DHHB) 
- Diethylhexyl Butamido Triazone (also called Iscotrizinol, Uvasorb HEB)
- Disodium Phenyl Dibenzimidazole Tetrasulfonate(Neo Heliopan AP, Bisdisulizole Disodium)
- Ethylhexyl Dimethyl PABA(Padimate O, Octyl Dimethyl PABA, Eusolex 6007) 
- Ethylhexyl Methoxycinnamate (Octinoxate, Octyl Methoxycinnamate)
- Ethylhexyl Salicylate (Octyl Salicylate, Octisalate)
- Ethylhexyl Triazone (Uvinul T 150, Octyltriazone)
- Homosalate
- Isoamyl p-Methoxycinnamate (Amiloxate, Neo Heliopan E1000)
- Methoxycinnamidopropyl Hydroxysultaine (Galaxy Sunbeat)
- Octocrylene
- Phenylbenzimidazole Sulfonic Acid (Ensulizole)
- Polysilicone-15 (Parsol SLX)
- Terephthalylidene Dicamphor Sulfonic Acid (Ecamsule, Mexoryl SX)
- Tris-Biphenyl Triazine (Tinosorb A2B)

Physical SPF Filters
- Titanium Dioxide
- Zinc Oxide

Filters that risk the coral sea life: oxybenzone, octocrylene and octinoxate

Acrylates increase water resistance and boost SPF protection.Common water resistant ingredients: 
- Acrylates/Octylacrylamide Copolymer 
- VA/Butyl Maleate/Isobornyl Acrylate Copolymer
- Styrene Acrylates Copolymer
- Acrylates/C12-22 Alkyl Methacrylate Copolymer
- VP/Eicosene Copolymer
- Acrylates 
- Styrene/Acrylates Copolymer - used to create water resistant formulas, boosts in SPF bwt 11%-18% per 1% of styrene

In [102]:
# Styrene/Acrylates Copolymer ( Water resistant formulas, SPF Booster)
spf['cleaned_list'] = spf['cleaned_list'].str.replace(
    'styrene\/acrylatescopolymer', 'styrene_acrylates_copolymer', regex=True)

# Zinc Oxide
spf['cleaned_list'] = spf['cleaned_list'].str.replace(
    'microparticlezincoxide|zincoxide\(\d\)|zincoxide\(\d\.\d\)', 'zincoxide', regex=True)

# Titanium Dioxide #
# Ci77891 when used as a colorant
spf['cleaned_list'] = spf['cleaned_list'].str.replace(
    'citralmaycontain:ci7789.*\w', 'ci7789/titaniumdioxide', regex=True)
spf['cleaned_list'] = spf['cleaned_list'].str.replace(
    '\w.*ci7789\)', 'ci7789/titaniumdioxide', regex=True)
spf['cleaned_list'] = spf['cleaned_list'].str.replace(
    'ci7789\/titaniumdioxide', 'ci7789', regex=True)
spf['cleaned_list'] = spf['cleaned_list'].str.replace(
    'titaniumoxide', 'titaniumdioxide', regex=True)
spf['cleaned_list'] = spf['cleaned_list'].str.replace(
    '\w.*titaniumdioxide', 'titaniumdioxide', regex=True)
spf['cleaned_list'] = spf['cleaned_list'].str.replace(
    'titaniumdioxide\(\d\)|titaniumdioxide\(nano\)', 'titaniumdioxide', regex=True)

# Tris-Biphenyl Triazine as Tinosorb A2b
spf['cleaned_list'] = spf['cleaned_list'].str.replace(
    'tris-biphenyltriazine(nano)', 'tinosorb_a2b', regex=False)

# Terephthalylidene Dicamphor Sulfonic Acid as Mexoryl SX
spf['cleaned_list'] = spf['cleaned_list'].str.replace(
    'terephthalylidenedicamphorsulfonicacid', 'meroxyl_sx', regex=False)
spf['cleaned_list'] = spf['cleaned_list'].str.replace(
    'terephthalylidene', 'meroxyl_sx', regex=False)

# 4-Methylbenzylidene Camphor as Enzacamene
spf['cleaned_list'] = spf['cleaned_list'].str.replace(
    '4-methylbenzylidenecamphor', 'enzacamene', regex=False)

# Butyl Methoxydibenzoylmethane as Avobenzone
spf['cleaned_list'] = spf['cleaned_list'].str.replace(
    'avobenzone(25)', 'avobenzone', regex=False)
spf['cleaned_list'] = spf['cleaned_list'].str.replace(
    'avobenzone(3)', 'avobenzone', regex=False)
spf['cleaned_list'] = spf['cleaned_list'].str.replace(
    'butylmethoxydibenzoylmethane', 'avobenzone', regex=False)

# Bis-Ethylhexyloxyphenol Methoxyphenyl Triazine and Bemotrizinol as Tinosorb S
spf['cleaned_list'] = spf['cleaned_list'].str.replace(
    'bis-ethylhexyloxyphenolmethoxyphenyltriazine', 'tinosorb_S', regex=False)
spf['cleaned_list'] = spf['cleaned_list'].str.replace(
    'bisethylhexyloxyphenolmethoxyphenyltriazine', 'tinosorb_S', regex=False)
spf['cleaned_list'] = spf['cleaned_list'].str.replace(
    'bis-ethylhexyloxyphenol', 'tinosorb_S', regex=False)
spf['cleaned_list'] = spf['cleaned_list'].str.replace(
    'methoxyphenyltriazine', 'tinosorb_S', regex=False)

# Ethylhexyl Methoxycinnamate and Octyl Methoxycinnamate as Octinoxate
spf['cleaned_list'] = spf['cleaned_list'].str.replace(
    'octinoxate(75)', 'octinoxate', regex=False)
spf['cleaned_list'] = spf['cleaned_list'].str.replace(
    'ethylhexylmethoxycinnamate', 'octinoxate', regex=False)

# Ethylhexyl Salicylate and Octyl Salicylate as Octisalate
spf['cleaned_list'] = spf['cleaned_list'].str.replace(
    'octylsalicylate', 'octisalate', regex=False)
spf['cleaned_list'] = spf['cleaned_list'].str.replace(
    'octisalate(5)', 'octisalate', regex=False)
spf['cleaned_list'] = spf['cleaned_list'].str.replace(
    'octisalate(2)', 'octisalate', regex=False)
spf['cleaned_list'] = spf['cleaned_list'].str.replace(
    'ethylhexylsalicylate', 'octisalate', regex=False)

# Octocrylene
spf['cleaned_list'] = spf['cleaned_list'].str.replace(
    'octocrylene(8)', 'octocrylene', regex=False)

# Ethylhexyl Triazone and Octyltriazone as Uvinul T 150
spf['cleaned_list'] = spf['cleaned_list'].str.replace(
    'ethylhexyltriazone', 'uvinul_t150', regex=False)

# Diethylhexyl Butamido Triazone and Iscotrizinol as Uvasorb HEB
spf['cleaned_list'] = spf['cleaned_list'].str.replace(
    'diethylhexylbutamidotriazone', 'uvasorb_heb', regex=False)

# Homosalate
spf['cleaned_list'] = spf['cleaned_list'].str.replace(
    'homosalate(8)', 'homosalate', regex=False)

# Phenylbenzimidazole Sulfonic Acid as Ensulizole
spf['cleaned_list'] = spf['cleaned_list'].str.replace(
    'phenylbenzimidazolesulfonicacid', 'ensulizole', regex=False)

# Methylene Bis-Benzotriazolyl Tetramethylbutylphenol and Bisoctrizole as Tinosorb M
spf['cleaned_list'] = spf['cleaned_list'].str.replace(
    'methylenebis-benzotriazolyltetramethylbutylphenol', 'tinosorb_m', regex=False)
spf['cleaned_list'] = spf['cleaned_list'].str.replace(
    'methylenebis-benzotriazolyltetramethylbutylphenol(nano)', 'tinosorb_m', regex=False)
spf['cleaned_list'] = spf['cleaned_list'].str.replace(
    'methylenebis-benzotriazolyltetramethylbutyphenol[nano]//methylenebis-benzotriazolyltetramethylbutylphenol', 'tinosorb_m', regex=False)


# diethylaminohydroxybenzoylhexylbenzoate and dhhb as uvinul_A_plus
spf['cleaned_list'] = spf['cleaned_list'].str.replace(
    'diethylaminohydroxybenzoylhexylbenzoate', 'uvinul_A_plus', regex=False)


### Standardization of terminology for other actives 

In [6]:
### Clean up Other Active ingredients 
# Water
spf['cleaned_list'] = spf['cleaned_list'].str.replace(
    'aqua.water', 'water', regex=True)
spf['cleaned_list'] = spf['cleaned_list'].str.replace(
    'aqua', 'water', regex=True)
spf['cleaned_list'] = spf['cleaned_list'].str.replace(
    'water\)', 'water', regex=True)
spf['cleaned_list'] = spf['cleaned_list'].str.replace(
    'purifiedwater', 'water', regex=True)

# Vitamin E #
spf['cleaned_list'] = spf['cleaned_list'].str.replace(
    'tocopher.lacetate', 'vitaminE_acetate', regex=True)
spf['cleaned_list'] = spf['cleaned_list'].str.replace(
    'tocopherol\.', 'vitaminE', regex=True)
spf['cleaned_list'] = spf['cleaned_list'].str.replace(
    'tocopherol', 'vitaminE', regex=True)
spf['cleaned_list'] = spf['cleaned_list'].str.replace(
    'vitamine', 'vitaminE', regex=True)

# Alcohols
# Drying alcohols: alcohol, alcohol denat., T-butyl alcohol, benzyl alcohol
# The other ones in the list are fatty alcohols

spf['cleaned_list'] = spf['cleaned_list'].str.replace(
    'alcoholdenat\.', 'alcoholdenat', regex=True)

# Other cleaning ups
spf['cleaned_list'] = spf['cleaned_list'].str.replace('\.', '', regex=True)
spf['cleaned_list'] = spf['cleaned_list'].str.replace(
    '4-t-butylcyclohexanol', 'symsitive', regex=False)

# Aloe vera
spf['cleaned_list'] = spf['cleaned_list'].str.replace(
    'aloe.*extract', 'aloeveraextract', regex=True)
spf['cleaned_list'] = spf['cleaned_list'].str.replace(
    'aloebarbadensisleafjuicepowder', 'aloeveraextract', regex=True)

# Vitamin C
spf['cleaned_list'] = spf['cleaned_list'].str.replace(
    'sodiumascorbylphosphate', 'vitaminC_derivative', regex=False)
spf['cleaned_list'] = spf['cleaned_list'].str.replace(
    'ascorbyl.*\w', 'vitaminC_derivative', regex=True)

# Centella Asiatica
spf['cleaned_list'] = spf['cleaned_list'].str.replace(
    'asiaticacid', 'centellaasiaticaextract', regex=False)
spf['cleaned_list'] = spf['cleaned_list'].str.replace(
    'asiaticoside', 'centellaasiaticaextract', regex=False)
spf['cleaned_list'] = spf['cleaned_list'].str.replace(
    'madecassicacid', 'centellaasiaticaextract', regex=False)
spf['cleaned_list'] = spf['cleaned_list'].str.replace(
    'madecassoside', 'centellaasiaticaextract', regex=False)
spf['cleaned_list'] = spf['cleaned_list'].str.replace(
    'centellaasiaticaleafextract', 'centellaasiaticaextract', regex=False)
spf['cleaned_list'] = spf['cleaned_list'].str.replace(
    'centellaasiaticarootextract', 'centellaasiaticaextract', regex=False)

# Format cleaning
spf['cleaned_list'] = spf['cleaned_list'].str.replace(
    'vitaminC_derivative/', 'vitaminC_derivative', regex=False)
spf['cleaned_list'] = spf['cleaned_list'].str.replace(
    'tinosorb_m(nano)', 'tinosorb_m', regex=False)

# Feverfrew extract
spf['cleaned_list'] = spf['cleaned_list'].str.replace(
    'chrysanthemumpartheniumextract', 'feverfewextract', regex=False)
spf['cleaned_list'] = spf['cleaned_list'].str.replace(
    'chrysanthemumparthenium(feverfew)flower/leaf/stemjuice', 'feverfewextract', regex=False)
spf['cleaned_list'] = spf['cleaned_list'].str.replace(
    'chrysanthemumpartheniumextract', 'feverfewextract', regex=False)

# Shea butter
spf['cleaned_list'] = spf['cleaned_list'].str.replace(
    'butyrospermumparkii(shea)butter', 'sheabutter', regex=False)
spf['cleaned_list'] = spf['cleaned_list'].str.replace(
    'butyrospermumparkiibutter', 'sheabutter', regex=False)

# Sunflower Oil
spf['cleaned_list'] = spf['cleaned_list'].str.replace(
    'helianthusannuus(sunflower)seedoilunsaponifiables', 'helianthusannuus(sunflower)seedoil', regex=False)

# Remove water
#spf['cleaned_list'] = spf['cleaned_list'].str.replace('water','', regex=True)

#### Cleaning up station
cleaned_list = []

for i in spf['cleaned_list']:
    ingreds_c_list = i.split(',')
    for j in ingreds_c_list:
        cleaned_list.append(j)

    cleaned_list = sorted(set(cleaned_list))

len(cleaned_list)

#[s for s in all_ingreds if 'alcohol' in s][:50]

[s for s in cleaned_list if 'avobenzone' in s][:50]


['avobenzone(25)', 'avobenzone(3)']

### Export the final results into a csv data frame 

In [11]:
spf.to_csv('spf_full')


## Generating a classification profile score table

* For each product there will be a count of ingredients present in its list that belong to a certain ingredient classification.

* This tells how many ingredients are contributing for a certain role (ex:sunscreen filters, antioxidants, humectants, etc). 
* The score is just a rough indicator of the product profile. If I have oily skin and I see a product with 10 emollients in it, I would stay away from it. If I see a product that claims sunscreen protection and it has many filters in it for UVA and UVB, I can think of it as a somewhat reliable sunscreen. The critique to this score would be that you can have very small lists where each ingredient exists in higher concentrations than in products with longer lists. This score classification should be seen with a grain of salt and used only as a pre-analysis tool and nothing more. 

* This program will create a score generator and update the spf table with the scores for each product in the more meaningful ingredient classifications (classifications with actives & other direct benefit for the skin)


In [8]:
# Creating an ingredient classification dictionary to be used on the score generator
ic = ingredient_class['Classification'].unique()
output ={}
for n in ic:
    output[n] = ingredient_class.loc[ingredient_class['Classification'] == n, 'Name_cleaned'].values.tolist()
ingredient_dict = {n:output[n] for n in ic}

# Defining sets based on the dictionary keys 
#spf_chemical_set = set(spf_chemical)
#spf_UVB_set = set(spf_UVB)
#spf_broad_spectrum_set = set(spf_broad_spectrum)
sunscreen_set = set(ingredient_dict['sunscreen'])

soothing_set = set(ingredient_dict['soothing'])

skin_identical_set = set(ingredient_dict['skin-identical-ingredient'])

brightening_set = set(ingredient_dict['skin-brightening'])

exfoliant_set = set(ingredient_dict['exfoliant'])

emollient_set = set(ingredient_dict['emollient'])

cell_communicating_set = set(ingredient_dict['cell-communicating-ingredient'])

antioxidant_set = set(ingredient_dict['antioxidant'])

anti_acne_set = set(ingredient_dict['anti-acne'])

humectant_set = set(ingredient_dict['moisturizer-humectant'])

antibacterial_set = set(ingredient_dict['antimicrobial-antibacterial'])

# Create the score generator function
def ingredient_score(row):
    product = row['cleaned_list'].split(',')
 #   row['chemical_spf'] = sum(1 for i in product if i in spf_chemical_set)
  #  row['UVB_spf'] = sum(1 for i in product if i in spf_UVB_set)
   # row['broad_spectrum_spf'] = sum(1 for i in product if i in spf_broad_spectrum_set)
    row['sunscreen'] = sum(1 for i in product if i in sunscreen_set)
    row['sunscreen_ingred'] = [i for i in product if i in sunscreen_set]

    row['soothing'] = sum(1 for i in product if i in soothing_set)
    row['soothing_ingredients'] = [i for i in product if i in soothing_set]

    row['skin-identical-ingredient'] = sum(1 for i in product if i in skin_identical_set)
    row['skin_identical_ingredients'] = [i for i in product if i in skin_identical_set]

    row['brightening'] = sum(1 for i in product if i in brightening_set)
    row['brightening_ingredients'] = [i for i in product if i in brightening_set]

    row['exfoliants'] = sum(1 for i in product if i in exfoliant_set)
    row['exfoliant_ingredients'] = [i for i in product if i in exfoliant_set]

    row['emollients'] = sum(1 for i in product if i in emollient_set)
    row['emollient_ingredients'] = [i for i in product if i in emollient_set]

    row['cell-communicating-ingredient'] = sum(1 for i in product if i in cell_communicating_set)
    row['cell_communicating_ingredients'] = [i for i in product if i in cell_communicating_set]

    row['antioxidant'] = sum(1 for i in product if i in antioxidant_set)
    row['antioxidant_ingredients'] = [i for i in product if i in antioxidant_set]

    row['anti-acne'] = sum(1 for i in product if i in anti_acne_set)
    row['anti_acne_ingred'] = [i for i in product if i in anti_acne_set]

    row['humectants'] = sum(1 for i in product if i in humectant_set)
    row['humectant_ingredients'] = [i for i in product if i in humectant_set]

    row['antimicrobial/antibacterial'] = sum(1 for i in product if i in antibacterial_set)
    row['antibacterial_ingredients'] = [i for i in product if i in antibacterial_set]

    return row

# Apply the function and update the spf file with the new columns 
spf = spf.apply(ingredient_score, axis=1)
spf.to_csv('ingredient_score',index=None, header=True)


In [9]:
a = spf[spf['Brand'] == 'BeautyofJoseon']
#pd.set_option('display.max_colwidth', None)
a


Unnamed: 0,Category,Market,Brand,Product,ingredient_list,cleaned_list,sunscreen,sunscreen_ingred,soothing,soothing_ingredients,...,cell-communicating-ingredient,cell_communicating_ingredients,antioxidant,antioxidant_ingredients,anti-acne,anti_acne_ingred,humectants,humectant_ingredients,antimicrobial/antibacterial,antibacterial_ingredients
37,SPF,KR,BeautyofJoseon,BeautyofJoseon-Rice Probiotics SPF 50,"oryzasativa(rice)branwater,glycerin,methylmeth...","oryzasativa(rice)branwater,glycerin,methylmeth...",1,[diethylaminohydroxybenzoylhexylbenzoate],1,[lactobacillusferment],...,2,"[niacinamide, adenosine]",0,[],1,[niacinamide],3,"[glycerin, niacinamide, butyleneglycol]",0,[]


In [35]:
a['ingredient_list']

37    oryzasativa(rice)branwater,glycerin,methylmethacrylatecrosspolymer,c12-15alkylbenzoate,1,2-hexanediol,behenylalcohol,diethylaminohydroxybenzoylhexylbenzoate,niacinamide,bis-ethylhexyloxyphenolmethoxyphenyltriazine,sodiumacrylate/sodiumacryloyldimethyltauratecopolymer,potassiumcetylphosphate,ammoniumacryloyldimethyltaurate/vpcopolymer,polyisobutene,silica,caprylyl/caprylglucoside,sorbitanoleate,adenosine,butyleneglycol,disodiumedta,bifidafermentfiltrate,lactobacillusferment,lactococcusfermentlysate,streptococcusthermophilusferment
Name: ingredient_list, dtype: object