# Similarity matrix 
Generating a similarity matrix to feed into the recommender system engine 
(each column an ingredient, each row a product) 

In [16]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
from math import pi
import plotly.express as px
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
from bokeh.io import curdoc, push_notebook, output_notebook
from bokeh.layouts import column, layout
from bokeh.models import ColumnDataSource, Div, Select, Slider, TextInput, HoverTool
from bokeh.plotting import figure, show
from ipywidgets import interact, interactive, fixed, interact_manual


### Import data 

In [17]:
# Import data 
spf = pd.read_csv('spf_full')
spf.head()

Unnamed: 0.1,Unnamed: 0,Category,Market,Brand,Product,ingredient_list,cleaned_list
0,0,SPF,AU,BondiSands,BondiSands-Fragrance Free Body lotion SPF 50,"homosalate,octocrylene,octylsalicylate,butylme...","homosalate,octocrylene,octisalate,avobenzone,,..."
1,1,SPF,DE,DM,DM-Sun Dance Ultra sensitive Gel creme SPF 50,"aqua,alcoholdenat.,dibutyladipate,diethylamino...","water,alcoholdenat,dibutyladipate,uvinul_A_plu..."
2,2,SPF,DE,DM,DM-Balea Niacinamide 10% moisturizer SPF 30,"aqua,niacinamide,dibutyladipate,glycerin,ethyl...","water,niacinamide,dibutyladipate,glycerin,octi..."
3,3,SPF,DE,DM,DM-Alverde Clear Beauty Getonte tagescreme,"aqua,oleaeuropaeafruitoil,ci77891,alcoholdenat...","water,oleaeuropaeafruitoil,ci7789,alcoholdenat..."
4,4,SPF,DE,Eucerin,Eucerin-Sun Oil Control,"aqua,homosalate,polymethylsilsesquioxane,butyl...","water,homosalate,,avobenzone,octisalate,octocr..."


In [18]:

spf_all_ingreds = []

for i in spf['ingredient_list']:
    spf_list = i.split(',')
    for j in spf_list:
        spf_all_ingreds.append(j)
        
        
spf_all_ingreds = sorted(set(spf_all_ingreds))
spf_all_ingreds[0:20]

['(acrylicacid/alkylacrylate(c10-30))copolymer',
 '(acrylicacidna/acryloyldimethyltaurinena)copolymer',
 '1',
 '1-methylhydantoin-2-imide',
 '2-hexanediol',
 '2naphosphate',
 '4-methylbenzylidenecamphor',
 '4-t-butylcyclohexanol',
 'acaciasenegalgum',
 'acidgliciretinic',
 'acrylates-c10-30-alkyl-acrylate-crosspolymer',
 'acrylates/alkylacrylatec10-30crosspolymer',
 'acrylates/c10-30alkylacrylatecrosspolymer',
 'acrylates/dimethiconecopolymer',
 'acrylatescopolymer',
 'adenosine',
 'alcohol',
 'alcoholdenat',
 'alcoholdenat.',
 'alhydroxide']

### Generating the similarity matrix 

In [19]:
# This is a very basic matrix, weights can be adjusted 

one_hot_list = [[0] * 0 for i in range(len(spf_all_ingreds))]

for i in spf['cleaned_list']:
    k=0
    for j in spf_all_ingreds:
        if j in i:
            one_hot_list[k].append(1)
        else:
            one_hot_list[k].append(0)
        k+=1
        
ingred_matrix = pd.DataFrame(one_hot_list).transpose()
ingred_matrix.columns = [sorted(set(spf_all_ingreds))]

ingred_matrix[0:5]


Unnamed: 0,(acrylicacid/alkylacrylate(c10-30))copolymer,(acrylicacidna/acryloyldimethyltaurinena)copolymer,1,1-methylhydantoin-2-imide,2-hexanediol,2naphosphate,4-methylbenzylidenecamphor,4-t-butylcyclohexanol,acaciasenegalgum,acidgliciretinic,...,vitamine,vitisviniferaseedoil,vp/hexadecenecopolymer,water,water-solublecollagen,xanthangum,zeamaysstarch,zincoxide,zincoxide(4),zincoxide(9.0)
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [20]:
ingred_matrix.to_csv('similarity_matrix')

### Extra: Mapped Similarities Visualization 

In [21]:
svd = TruncatedSVD(n_components=300, n_iter = 1000, random_state = 6) # firstly reduce features to 150 with truncatedSVD - this suppresses some noise

svd_features = svd.fit_transform(ingred_matrix)

tsne = TSNE(n_components = 2, n_iter = 1000000, random_state = 6) # reduce 150 features to 2 using t-SNE with exact method

tsne_features = tsne.fit_transform(svd_features)

spf['X'] = tsne_features[:, 0]

spf['Y'] = tsne_features[:, 1]

In [22]:
unique_types = ['SPF']

source = ColumnDataSource(spf)

plot = figure(title = "Mapped Similarities", width = 700, height = 500)
plot.xaxis.axis_label = "t-SNE 1"
plot.yaxis.axis_label = 't-SNE 2'

plot.circle(x = 'X', y = 'Y', source = source, fill_alpha=0.7, size=10,
           color = '#c0a5e3', alpha = 1)

plot.background_fill_color = "#E9E9E9"
plot.background_fill_alpha = 0.3

hover = HoverTool(tooltips=[('Product', '@Product')])
plot.add_tools(hover)


def type_updater(product_type = unique_types[0]):

    new_data = {'X' : spf[spf['Category'] == product_type]['X'],
                'Y' : spf[spf['Category'] == product_type]['Y'],
                'Product' : spf[spf['Category'] == product_type]['Product']
               }
    source.data = new_data
    push_notebook()
  
output_notebook()

show(plot, notebook_handle = True)
interact(type_updater, product_type = unique_types)

interactive(children=(Dropdown(description='product_type', options=('SPF',), value='SPF'), Output()), _dom_cla…

<function __main__.type_updater(product_type='SPF')>