In [None]:
"""
Skinlytics Project Workflow 

This notebook is a small prototype of the whole projects and intends to outline all the steps, commands and 

packages necessary for the project in a smaller part of the data (SPF Product category, total 50 products).


The program goes through the following steps:

--------------------
Currently in Progress
--------------------

1. Access the data 

2. Clean the ingredient list

3. Classify the ingredients by role/function  and generate a visualization of ingredient profile per one product

4. Calculate the similarity matrix and generate a visualization of the results 

5. Create an interactive dashboard with Dash that display the previous charts 

------------
Next steps:
------------

6. Collect and transform the data from different sources on reviews about the products listed

7. Create first iteration of the sentiment and topic modeling of these reviews 

8. Generate visualizations and display in on a Dash dashboard 

"""


# Skinlytics Project workflow with SPF Product data (total 50 products)

In [9]:
# Packages 

import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import pandas as pd
from math import pi
import re

import dash
import dash_html_components as html
import plotly.graph_objects as go
import dash_core_components as dcc
import plotly.express as px
from dash.dependencies import Input, Output

The dash_html_components package is deprecated. Please replace
`import dash_html_components as html` with `from dash import html`
  import dash_html_components as html
The dash_core_components package is deprecated. Please replace
`import dash_core_components as dcc` with `from dash import dcc`
  import dash_core_components as dcc


## Step 1. Access the data 

In [5]:
spf = pd.read_csv('Skinlytics - IngredientList-SPF.csv')
spf.head()


Unnamed: 0,Category,Market,Brand,Product,Ingredient List
0,SPF,AU,BondiSands,Fragrance Free Body lotion SPF 50,"Homosalate (10.00%), Octocrylene (8.00%), Octy..."
1,SPF,DE,DM,Sun Dance Ultra sensitive Gel creme SPF 50,"Aqua, Alcohol Denat., Dibutyl Adipate, Diethyl..."
2,SPF,DE,DM,Balea Niacinamide 10% moisturizer SPF 30,"AQUA, NIACINAMIDE, DIBUTYL ADIPATE, GLYCERIN, ..."
3,SPF,DE,DM,Alverde Clear Beauty Getonte tagescreme,"Aqua, Olea Europaea Fruit Oil*, CI 77891, Alco..."
4,SPF,DE,Eucerin,Sun Oil Control,"Aqua, Homosalate, Polymethylsilsesquioxane, Bu..."


## Step 2. Dataset Cleaning

In [6]:
spf.rename(columns={'Ingredient List ':'ingredient_list'},inplace=True)
spf['ingredient_list'] = spf['ingredient_list'].str.lower()
spf['ingredient_list'] = spf['ingredient_list'].str.replace('|',',', regex=False)
spf['ingredient_list'] = spf['ingredient_list'].str.replace('*',' ', regex=False)
spf['ingredient_list'] = spf['ingredient_list'].str.replace('%',' ', regex=False)
spf['ingredient_list'] = spf['ingredient_list'].str.replace('\n',' ', regex=False)
spf['ingredient_list'] = spf['ingredient_list'].str.replace('\u200b',' ',regex=False)
spf['ingredient_list'] = spf['ingredient_list'].str.replace(' ','', regex=False)
spf['ingredient_list'] = spf['ingredient_list'].str.replace('\(\d\.\d\d\)|\(\d\d\.\d\d\)','', regex=True)



spf['cleaned_list'] = spf['ingredient_list']
spf.head()

Unnamed: 0,Category,Market,Brand,Product,ingredient_list,cleaned_list
0,SPF,AU,BondiSands,Fragrance Free Body lotion SPF 50,"homosalate,octocrylene,octylsalicylate,butylme...","homosalate,octocrylene,octylsalicylate,butylme..."
1,SPF,DE,DM,Sun Dance Ultra sensitive Gel creme SPF 50,"aqua,alcoholdenat.,dibutyladipate,diethylamino...","aqua,alcoholdenat.,dibutyladipate,diethylamino..."
2,SPF,DE,DM,Balea Niacinamide 10% moisturizer SPF 30,"aqua,niacinamide,dibutyladipate,glycerin,ethyl...","aqua,niacinamide,dibutyladipate,glycerin,ethyl..."
3,SPF,DE,DM,Alverde Clear Beauty Getonte tagescreme,"aqua,oleaeuropaeafruitoil,ci77891,alcoholdenat...","aqua,oleaeuropaeafruitoil,ci77891,alcoholdenat..."
4,SPF,DE,Eucerin,Sun Oil Control,"aqua,homosalate,polymethylsilsesquioxane,butyl...","aqua,homosalate,polymethylsilsesquioxane,butyl..."


In [7]:
all_ingreds = []

for i in spf['ingredient_list']:
    ingreds_list = i.split(',')
    for j in ingreds_list:
        all_ingreds.append(j)
        
all_ingreds = sorted(set(all_ingreds))
all_ingreds[0:10]



['(acrylicacid/alkylacrylate(c10-30))copolymer',
 '(acrylicacidna/acryloyldimethyltaurinena)copolymer',
 '1',
 '1-methylhydantoin-2-imide',
 '2-hexanediol',
 '2naphosphate',
 '4-methylbenzylidenecamphor',
 '4-t-butylcyclohexanol',
 'acaciasenegalgum',
 'acidgliciretinic']

In [92]:
len(all_ingreds)

542

### Ingredient list cleaning


In [93]:
cleaned_list = []

for i in spf['cleaned_list']:
    ingreds_c_list = i.split(',')
    for j in ingreds_c_list:
        cleaned_list.append(j)
            
    cleaned_list = sorted(set(cleaned_list))


In [94]:
len(cleaned_list)

542

#### Clean up active ingredients and SPF filters

In [95]:
# Styrene/Acrylates Copolymer 
spf['cleaned_list'] = spf['cleaned_list'].str.replace('styrene\/acrylatescopolymer', 'styrene_acrylates_copolymer', regex=True)

#Zinc Oxide
spf['cleaned_list'] = spf['cleaned_list'].str.replace('microparticlezincoxide|zincoxide\(\d\)|zincoxide\(\d\.\d\)', 'zincoxide', regex=True)



In [96]:
cleaned_list = []

for i in spf['cleaned_list']:
    ingreds_c_list = i.split(',')
    for j in ingreds_c_list:
        cleaned_list.append(j)
            
    cleaned_list = sorted(set(cleaned_list))


In [97]:
len(cleaned_list)

539

#### Remove unecessary ingredients 

In [106]:
# Remove unecessary ingredients 

ingred_to_be_removed = ['1', 
                        '1-methylhydantoin-2-imide', 
                        '-methylhydantoin-2-imide',
                        '2-hexanediol',
                        '2naphosphate', 
                        'acaciasenegalgum', 
                        'acidgliciretinic', 
                        'alhydroxide', 
                        'alpha-isomethylionone', 
                        'alumina',
                        'aluminumstarchoctenylsuccinate',
                        'zeamaysstarch',
                        'xanthangum',
                        'undecane',
                        'tromethamine',
                        'trisodiumethylenediaminedisuccinate',
                        'trisodiumedta' ,
                        'vp/hexadecenecopolymer',
                        'vp/hexadecenecopolymer',
                        'vinyldimethicone/methiconesilsesquioxanecrosspolymer',
                        'triethylhexanoin',
                        'triethoxysilylethylpolydimethylsiloxyethylhexyldimethicone', 
                        'triethoxycaprylylsilane',
                        'triethanolamine',
                        'trideceth-6',
                        'tridecane',
                        'tricaprylin', 
                        'triacontanylpvp', 
                        'tinoxide',
                        'tetrasodiumedta', 
                        'tea-cocoylglutamate',
                        'tapiocastarch',
                        't-butylalcohol',
                        'syntheticfluorphlogopite',
                        '(acrylicacid/alkylacrylate(c10-30))copolymer',
                        '(acrylicacidna/acryloyldimethyltaurinena)copolymer',
                        'acrylates-c10-30-alkyl-acrylate-crosspolymer',
                        'acrylates/alkylacrylatec10-30crosspolymer',
                        'acrylates/c10-30alkylacrylatecrosspolymer',
                        'ammoniumacrylatescopolymer',
                        'ammoniumacryloyldimethyltaurate/vpcopolymer',
                        'ammoniumpolyacryloyldimethyltaurate',
                        'c10-30alkylacrylatecrosspolymer',
                        'c12-22alkylacrylate/hydroxyethylacrylatecopolymer',
                        'dimethiconecrosspolymer(laurylmethacrylate/namethacrylate)',
                        'ethylene/acrylicacidcopolymer',
                        'glycoldimethacrylatecrosspolymer',
                        'hydroxyethylacrylate/sodiumacryloyldimethyltauratecopolymer',
                        'methylmethacrylatecrosspolymer',
                        'polyacrylamide',
                        'polyacrylatecrosspolymer-6',
                        'polyc10-30alkylacrylate',
                        'polymethylmethacrylate',
                        'sodiumacrylate/sodiumacryloyldimethyltauratecopolymer',
                        'sodiumacrylatescopolymer',
                        'sodiumacrylatescrosspolymer-2',
                        'sodiumpolyacrylate',
                        'sodiumpolyacrylatestarch',
                        'sodiumpolyacryloyldimethyltaurate',
                        '(acrylicacid/alkylacrylate(c0-30))copolymer',
                        'acrylates-c0-30-alkyl-acrylate-crosspolymer',
                        'acrylates/alkylacrylatec0-30crosspolymer',
                        'acrylates/c0-30alkylacrylatecrosspolymer',
                        'c0-30alkylacrylatecrosspolymer',
                        'c2-22alkylacrylate/hydroxyethylacrylatecopolymer',
                        'polyc0-30alkylacrylate',

                     
                       
                       ]

for i in ingred_to_be_removed:
    spf['cleaned_list'] = spf['cleaned_list'].str.replace(i,'', regex=False)


In [107]:
cleaned_list = []

for i in spf['cleaned_list']:
    ingreds_c_list = i.split(',')
    for j in ingreds_c_list:
        cleaned_list.append(j)
            
    cleaned_list = sorted(set(cleaned_list))


In [108]:
len(cleaned_list)

483

#### Cleaning up station

In [111]:
[s for s in all_ingreds if 'alcohol' in s][:50]

['alcohol',
 'alcoholdenat',
 'alcoholdenat.',
 'arachidylalcohol',
 'behenylalcohol',
 'benzylalcohol',
 'c14-22alcohols',
 'cetearylalcohol',
 'cetylalcohol',
 'myristylalcohol',
 'polyvinylalcohol',
 'stearylalcohol',
 't-butylalcohol']

In [109]:
[s for s in cleaned_list if 'acryl' in s][:50]

['acrylates/dimethiconecopolymer',
 'acrylatescopolymer',
 'styrene_acrylates_copolymer']

In [None]:
# Drying alcohols: alcohol, alcohol denat. 
# Fatty alcohols: 'stearylalcohol, 'cetylalcohol, cetearylalcohol, arachidylalcohol, behenylalcohol

In [112]:
#cleaned_list[0:600]


In [89]:
# Remove water as an ingredient # Fix the water sintax
#spf['cleaned_list'] = spf['cleaned_list'].str.replace('aqua|aqua\(\w+\)|aqua\/.*r','water', regex=True)
#spf['cleaned_list'] = spf['cleaned_list'].str.replace('purifiedwater','water')
#spf['cleaned_list'] = spf['cleaned_list'].str.replace('\(water\)|\/water|\/\\u200bwater','', regex=True)
# issue: spf['cleaned_list'] = spf['cleaned_list'].str.replace('\bwater\b','', regex=True)

# Vitamin E #Fix the \n sintax
#spf['cleaned_list'] = spf['cleaned_list'].str.replace('vitamine\n','vitamin_e_unspecified')
#spf['cleaned_list'] = spf['cleaned_list'].str.replace('\w+acetate','vitamin_e_acetate', regex=True)
#spf['cleaned_list'] = spf['cleaned_list'].str.replace('tocopherol','vitamin_e', regex=True) 
#spf['cleaned_list'] = spf['cleaned_list'].str.replace('vitamin_e\.','vitamin_e', regex=True) 

# Titanium Dioxide # Fix the \n sintax
# Ci77891 when used as a colorant 
# Nano form is an enhanced version of titanium dioxide 
# usually combined with Iron oxide pigments to increase visible light protection
#spf['cleaned_list'] = spf['cleaned_list'].str.replace('\w.*nano\)|\w.*nano.*dioxide|microparticletitaniumdioxide','titaniumdioxide[nano/micro]', regex=True)
#spf['cleaned_list'] = spf['cleaned_list'].str.replace('microparticletitaniumdioxide','titaniumdioxide[micro]', regex=True)
#spf['cleaned_list'] = spf['cleaned_list'].str.replace('titaniumoxide','titaniumdioxide', regex=True)
#spf['cleaned_list'] = spf['cleaned_list'].str.replace('titaniumdioxide\(\d\)','titaniumdioxide', regex=True)
#spf['cleaned_list'] = spf['cleaned_list'].str.replace('\w.*ci77891\/.*\w','titaniumdioxide(ci77891)', regex=True)
#spf['cleaned_list'] = spf['cleaned_list'].str.replace('\w.*ci77891\)\\n','titaniumdioxide(ci77891)', regex=True)
#spf['cleaned_list'] = spf['cleaned_list'].str.replace('citral\\n','', regex=True)


# filters that risk the coral sea life: oxybenzone, octocrylene and octinoxate

# Acrylates increase water resistance and boost SPF protection
#Common water resistant ingredients: 
# Acrylates/Octylacrylamide Copolymer 
# VA/Butyl Maleate/Isobornyl Acrylate Copolymer
# Styrene Acrylates Copolymer
# Acrylates/C12-22 Alkyl Methacrylate Copolymer
# VP/Eicosene Copolymer

# Acrylates 
# Styrene/Acrylates Copolymer - used to create water resistant formulas, boosts in SPF bwt 11%-18% per 1% of styrene
# Sodium Polyacrylate - subperabsorbent polymer with very strong water binding abilities

# Alcohol
#Vitamin C 


## Step 3. Generating a classification profile score system and visualize it 

* For each ingredient in the list, attribute it to a role/function 
* Then agregate it by count of ingredients in each role/function and use it as a score
* Calculate overall score 
* Store the table and visualize the scores


In [None]:
# Store active ingredients by role 

spf_filters = ['zincoxide', 
               'zincoxide(4)', 
               'zincoxide(9.0)',
              'tris-biphenyltriazine(nano)']



active_ingredients = ['water-solublecollagen',
                      'vitisviniferaseedoil',
                      'vitamine\n',
                      'vacciniummyrtillusleafextract', 
                      'ubiquinone',
                      'tremellafuciformispolysaccharide',
                      'trehalose']

fragrances = ['vanillaplanifoliafruitextract','', '', '']


anti_acne = []

antioxidant = []

cell_communicating = []

skin_identical = []

skin_brightening = []

soothing = []

antibacterial = []

pigment = []

emolient = []

humectant = []

preservative = []


## Step 4. Generating a similarity matrix (each column an ingredient, each row a product) and visualize it 

In [None]:

# This is a very basic matrix, weights can be adjusted 

one_hot_list = [[0] * 0 for i in range(len(all_ingreds))]

for i in data['clean_ingreds']:
    k=0
    for j in all_ingreds:
        if j in i:
            one_hot_list[k].append(1)
        else:
            one_hot_list[k].append(0)
        k+=1
        
ingred_matrix = pd.DataFrame(one_hot_list).transpose()
ingred_matrix.columns = [sorted(set(all_ingreds))]

ingred_matrix

In [None]:
# Step 2: Rank, classify and score the ingredient list aggregated by Active Group

## Step 5. Display Visualizations in Dash 

In [12]:
# Dash script with dummy data 

from dash import Dash, html, dcc

app = Dash(__name__)

# assume you have a "long-form" data frame
# see https://plotly.com/python/px-arguments/ for more options

df = pd.DataFrame({
    "Fruit": ["Apples", "Oranges", "Bananas", "Apples", "Oranges", "Bananas"],
    "Amount": [4, 1, 2, 2, 4, 5],
    "City": ["SF", "SF", "SF", "Montreal", "Montreal", "Montreal"]
})

fig = px.bar(df, x="Fruit", y="Amount", color="City", barmode="group")

app.layout = html.Div(children=[
    html.H1(children='Hello Dash'),

    html.Div(children='''
        Dash: A web application framework for your data.
    '''),

    dcc.Graph(
        id='example-graph',
        figure=fig
    )
])

#if __name__ == '__main__':
#    app.run(host='1.0.0.0'
#            port=''
#        debug=True)

------------
Next steps:
------------

6. Collect and transform the data from different sources on reviews about the products listed

7. Create first iteration of the sentiment and topic modeling of these reviews 

8. Generate visualizations and display in on a Dash dashboard 
