Import Necessary Libraries:

In [21]:
import pandas as pd
import numpy as np
import warnings
from sklearn.manifold import TSNE

In [22]:
warnings.simplefilter(action='ignore', category=pd.errors.SettingWithCopyWarning)

Read the CSV File into a DataFrame:

In [4]:
df = pd.read_csv('datasets/cosmetics.csv')

Display first 5 Rows 

In [5]:
df.head()

Unnamed: 0,Label,Brand,Name,Price,Rank,Ingredients,Combination,Dry,Normal,Oily,Sensitive
0,Moisturizer,LA MER,Crème de la Mer,175,4.1,"Algae (Seaweed) Extract, Mineral Oil, Petrolat...",1,1,1,1,1
1,Moisturizer,SK-II,Facial Treatment Essence,179,4.1,"Galactomyces Ferment Filtrate (Pitera), Butyle...",1,1,1,1,1
2,Moisturizer,DRUNK ELEPHANT,Protini™ Polypeptide Cream,68,4.4,"Water, Dicaprylyl Carbonate, Glycerin, Ceteary...",1,1,1,1,0
3,Moisturizer,LA MER,The Moisturizing Soft Cream,175,3.8,"Algae (Seaweed) Extract, Cyclopentasiloxane, P...",1,1,1,1,1
4,Moisturizer,IT COSMETICS,Your Skin But Better™ CC+™ Cream with SPF 50+,38,4.1,"Water, Snail Secretion Filtrate, Phenyl Trimet...",1,1,1,1,1


Display Counts of Product Types:

In [6]:
product_counts = df['Label'].value_counts()
display(product_counts)

Moisturizer    298
Cleanser       281
Face Mask      266
Treatment      248
Eye cream      209
Sun protect    170
Name: Label, dtype: int64

 Data Cleaning

Removed Duplicates

In [6]:
df.drop_duplicates(inplace=True)

Handled Missing Values

In [7]:
df.dropna(inplace=True)

Data Filtering

Filtered for Moisturizers & Filtered for Sensitive Skin:

In [23]:
moisturizers = df[df['Label'] == 'Moisturizer']
moisturizers_dry = moisturizers[moisturizers['Dry'] == 1]

In [18]:
moisturizers

Unnamed: 0,Label,Brand,Name,Price,Rank,Ingredients,Combination,Dry,Normal,Oily,Sensitive
0,Moisturizer,LA MER,Crème de la Mer,175,4.1,"Algae (Seaweed) Extract, Mineral Oil, Petrolat...",1,1,1,1,1
1,Moisturizer,SK-II,Facial Treatment Essence,179,4.1,"Galactomyces Ferment Filtrate (Pitera), Butyle...",1,1,1,1,1
2,Moisturizer,DRUNK ELEPHANT,Protini™ Polypeptide Cream,68,4.4,"Water, Dicaprylyl Carbonate, Glycerin, Ceteary...",1,1,1,1,0
3,Moisturizer,LA MER,The Moisturizing Soft Cream,175,3.8,"Algae (Seaweed) Extract, Cyclopentasiloxane, P...",1,1,1,1,1
4,Moisturizer,IT COSMETICS,Your Skin But Better™ CC+™ Cream with SPF 50+,38,4.1,"Water, Snail Secretion Filtrate, Phenyl Trimet...",1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...
293,Moisturizer,LA MER,The Moisturizing Matte Lotion,270,3.9,"Water, Algae (Seaweed) Extract, Propanediol, S...",0,0,1,1,0
294,Moisturizer,HERBIVORE,Jasmine Green Tea Balancing Toner,39,4.2,"Jasminum Officinale (Jasmine) Flower Water, Ha...",1,0,0,1,1
295,Moisturizer,CLARINS,Super Restorative Night Age Spot Correcting Re...,136,4.5,"Water, Cetearyl Isononanoate, Dimethicone, Gly...",0,0,0,0,0
296,Moisturizer,KATE SOMERVILLE,Goat Milk Moisturizing Cream,65,4.1,"Water, Ethylhexyl Palmitate, Myristyl Myristat...",1,1,1,1,1


In [19]:
moisturizers_dry

Unnamed: 0,Label,Brand,Name,Price,Rank,Ingredients,Combination,Dry,Normal,Oily,Sensitive
0,Moisturizer,LA MER,Crème de la Mer,175,4.1,"Algae (Seaweed) Extract, Mineral Oil, Petrolat...",1,1,1,1,1
1,Moisturizer,SK-II,Facial Treatment Essence,179,4.1,"Galactomyces Ferment Filtrate (Pitera), Butyle...",1,1,1,1,1
2,Moisturizer,DRUNK ELEPHANT,Protini™ Polypeptide Cream,68,4.4,"Water, Dicaprylyl Carbonate, Glycerin, Ceteary...",1,1,1,1,0
3,Moisturizer,LA MER,The Moisturizing Soft Cream,175,3.8,"Algae (Seaweed) Extract, Cyclopentasiloxane, P...",1,1,1,1,1
4,Moisturizer,IT COSMETICS,Your Skin But Better™ CC+™ Cream with SPF 50+,38,4.1,"Water, Snail Secretion Filtrate, Phenyl Trimet...",1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...
289,Moisturizer,KIEHL'S SINCE 1851,Ultra Facial Deep Moisture Balm,29,4.7,"Water, Glycerin, Shea Butter, Glyceryl Stearat...",0,1,1,0,0
291,Moisturizer,SHISEIDO,White Lucent All Day Brightener Broad Spectrum...,62,4.6,"Water, Sd Alcohol 40-B, Dimethicone, Dipropyle...",1,1,1,0,0
292,Moisturizer,SATURDAY SKIN,Featherweight Daily Moisturizing Cream,49,4.6,"Water, Butylene Glycol, Ethylhexyl Palmitate, ...",1,1,1,1,1
296,Moisturizer,KATE SOMERVILLE,Goat Milk Moisturizing Cream,65,4.1,"Water, Ethylhexyl Palmitate, Myristyl Myristat...",1,1,1,1,1


Tokenisation

In [24]:
# Split ingredients into lists
moisturizers_dry['Ingredients'] = moisturizers_dry['Ingredients'].str.split(', ')

# Explode the list into rows
exploded_df = moisturizers_dry.explode('Ingredients')

Initialize Document Term Matrix & One Hot Encoding

In [26]:
one_hot_encoded_df = pd.get_dummies(exploded_df, columns=['Ingredients'])

In [28]:
# Aggregate back to the original format
one_hot_encoded_df = one_hot_encoded_df.groupby(level=0).sum(numeric_only=True)

print(one_hot_encoded_df)

     Price  Rank  Combination  Dry  Normal  Oily  Sensitive
0      175   4.1            1    1       1     1          1
1      179   4.1            1    1       1     1          1
2       68   4.4            1    1       1     1          0
3      175   3.8            1    1       1     1          1
4       38   4.1            1    1       1     1          1
..     ...   ...          ...  ...     ...   ...        ...
289     29   4.7            0    1       1     0          0
291     62   4.6            1    1       1     0          0
292     49   4.6            1    1       1     1          1
296     65   4.1            1    1       1     1          1
297     34   4.8            1    1       1     1          1

[190 rows x 7 columns]


Dimensionality Reduction with t-SNE

In [29]:
model = TSNE(n_components=2, learning_rate=200, random_state=42)
tsne_features = model.fit_transform(one_hot_encoded_df.values)

In [30]:
# Add t-SNE features to the original DataFrame
moisturizers_dry['X'] = tsne_features[:, 0]
moisturizers_dry['Y'] = tsne_features[:, 1]

print(moisturizers_dry[['X', 'Y']])

            X          Y
0    9.493482 -20.321863
1    9.542826 -20.415232
2    4.596718 -10.922693
3    9.493015 -20.322302
4   -4.574909   6.246821
..        ...        ...
289 -7.202548  11.121629
291  3.561064  -9.162087
292  0.251606  -2.815953
296  4.167188 -10.079400
297 -5.710824   9.251492

[190 rows x 2 columns]


Visualization with Bokeh

In [31]:
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource, HoverTool

In [32]:
# Create a ColumnDataSource from the DataFrame
source = ColumnDataSource(moisturizers_dry)

# Create a Bokeh figure for the scatter plot
plot = figure(title="Ingredient Similarity", x_axis_label='T-SNE 1', y_axis_label='T-SNE 2')


In [33]:
# Plot the data points using circles
plot.circle(x='X', y='Y', source=source, size=10)

In [34]:
# Add a hover tool to show product details
hover = HoverTool()
hover.tooltips = [
    ('Item', '@Name'),
    ('Brand', '@Brand'),
    ('Price', '$@Price'),
    ('Rank', '@Rank')
]
plot.add_tools(hover)

In [35]:
# Display the plot
show(plot)