In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import re
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import find
import numpy  as np
from IPython.display import clear_output

# TF-IDF Similarity
*add description here*

In [2]:
df = pd.read_csv("data/skincare_products.csv")
df.head(3)

Unnamed: 0,Label,Brand,Name,Price,Rank,Ingredients,Combination,Dry,Normal,Oily,Sensitive
0,Moisturizer,LA MER,Crème de la Mer,175,4.1,"Algae (Seaweed) Extract, Mineral Oil, Petrolat...",1,1,1,1,1
1,Moisturizer,SK-II,Facial Treatment Essence,179,4.1,"Galactomyces Ferment Filtrate (Pitera), Butyle...",1,1,1,1,1
2,Moisturizer,DRUNK ELEPHANT,Protini™ Polypeptide Cream,68,4.4,"Water, Dicaprylyl Carbonate, Glycerin, Ceteary...",1,1,1,1,0


In [8]:
# Preprocess the ingredients
def preprocess_ingredients(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation and special characters (keeping spaces)
    text = re.sub(r'[^\w\s]', '', text)
    return text

# Apply the preprocessing function to the 'Ingredient Description' column
df['Ingredients'] = df['Ingredients'].apply(preprocess_ingredients)

# Create a TF-IDF Vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the 'Ingredient Description' column
tfidf_matrix = vectorizer.fit_transform(df['Ingredients'])

# Inspect the shape of the resulting TF-IDF matrix
print("TF-IDF Matrix Shape:", tfidf_matrix.shape)


# Optionally, you can inspect the feature names (terms)
print("Feature names (first 10):", vectorizer.get_feature_names_out()[:10])


TF-IDF Matrix Shape: (1472, 5185)
Feature names (first 10): ['01' '02' '05' '067178' '074905' '10' '100' '1000' '101' '107']


In [9]:
# Get the feature names (terms)
feature_names = vectorizer.get_feature_names_out()

# Find non-zero entries and their indices and values
row_indices, term_indices, scores = find(tfidf_matrix)

# Combine the rows, term indices, and scores into a list of tuples
entries = list(zip(row_indices, term_indices, scores))

# Sort entries by score in descending order
entries_sorted = sorted(entries, key=lambda x: x[2], reverse=True)

# Print the top 10 terms with the highest TF-IDF scores
print("\nTop 10 terms with the highest TF-IDF scores:")
for i in range(min(10, len(entries_sorted))):
    row_index, term_index, score = entries_sorted[i]
    term = feature_names[term_index]
    print(f"Document {row_index}: Term '{term}', Score: {score}")



Top 10 terms with the highest TF-IDF scores:
Document 39: Term 'name', Score: 1.0
Document 93: Term 'natural', Score: 1.0
Document 263: Term 'name', Score: 1.0
Document 310: Term 'hadasei3', Score: 1.0
Document 496: Term 'name', Score: 1.0
Document 656: Term 'name', Score: 1.0
Document 846: Term 'hadasei3', Score: 1.0
Document 90: Term 'essential', Score: 0.9330613898050689
Document 121: Term 'essential', Score: 0.9330613898050689
Document 858: Term 'essential', Score: 0.9330613898050689


In [10]:
# Compute the mean TF-IDF score for each term across the entire dataset
mean_tfidf_scores = tfidf_matrix.mean(axis=0).A1

# Get the feature names (terms)
terms = vectorizer.get_feature_names_out()

# Create a DataFrame to map terms to their mean TF-IDF scores
mean_tfidf_df = pd.DataFrame({'term': terms, 'mean_tfidf': mean_tfidf_scores})

# Sort the DataFrame by mean TF-IDF scores in descending order
mean_tfidf_df = mean_tfidf_df.sort_values(by='mean_tfidf', ascending=False)

# Display the top 10 terms with the highest mean TF-IDF scores
top_10_terms = mean_tfidf_df.head(10)
print("Top 10 terms with the highest mean TF-IDF scores across the entire dataset:")
print(top_10_terms)


Top 10 terms with the highest mean TF-IDF scores across the entire dataset:
         term  mean_tfidf
1878  extract    0.125674
3370      oil    0.070623
4489   sodium    0.063218
207      acid    0.053459
2201   glycol    0.052736
4376     seed    0.039070
2761     leaf    0.037237
2066    fruit    0.036676
1149       ci    0.035016
333   alcohol    0.034404


In [11]:
# Function to calculate similarity score between two products based on their ingredients
def calculate_similarity(product1_ingredients, product2_ingredients):
    # Preprocess ingredient descriptions
    product1_ingredients = preprocess_ingredients(product1_ingredients)
    product2_ingredients = preprocess_ingredients(product2_ingredients)
    
    # Create a TF-IDF Vectorizer
    vectorizer = TfidfVectorizer()

    # Fit and transform the ingredient descriptions of the two products
    tfidf_matrix_product1 = vectorizer.fit_transform([product1_ingredients])
    tfidf_matrix_product2 = vectorizer.transform([product2_ingredients])

    # Compute cosine similarity between the TF-IDF vectors of the two products
    similarity_score = cosine_similarity(tfidf_matrix_product1, tfidf_matrix_product2)

    return similarity_score[0][0]

# Example usage:
product1_ingredients = df.loc[1, 'Ingredients']
product2_ingredients = df.loc[2, 'Ingredients']
similarity_score = calculate_similarity(product1_ingredients, product2_ingredients)
print("Similarity Score between Product 1 and Product 2:", similarity_score)


Similarity Score between Product 1 and Product 2: 0.6071428571428571


In [13]:
tfidf_similarity_matrix = np.zeros((df.shape[0], df.shape[0]))

for i in range(df.shape[0]):
    print("Generating scores for product", i)
    clear_output(wait=True)
    for j in range(i+1, df.shape[0]):
        tfidf_similarity_matrix[i,j] = calculate_similarity(df.iloc[i,5], df.iloc[j,5])

tfidf_similarity_matrix[0:5, 0:5]

KeyboardInterrupt: 