# 1. Configuration

### A. Importing dependencies

In [96]:
import pandas as pd
import numpy as np
import os
from enum import Enum

# Gemini API
import google.generativeai as genai
apiKey = os.getenv('GEMINI_API_KEY')
genai.configure(api_key=apiKey)

# Enables the display of multiple outputs when running a cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

# Progress bar and its size
from tqdm import tqdm
tqdm.pandas(ncols=70)

### B. Loading dataset

In [97]:
file = pd.read_csv('E:/Documents Florian/projects/ai_embeddings/dataset.csv')
file = file.drop(columns={'category'})
file.nunique()
dataset = file.head(800)
dataset

name           396209
subcategory       113
dtype: int64

Unnamed: 0,name,subcategory
0,- Dynamo Motor With Fan Blade For Wind Mill Pr...,Heating & Cooling Appliances
1,- Dynamo Motor With Fan Blade For Wind Mill Pr...,Heating & Cooling Appliances
2,- Home Security Solution (Nano- Max 3 Cameras),Security Cameras
3,- Home Security Solution (Nano- Max 3 Cameras),Security Cameras
4,GREEMITO 4 in 1 Travel Dispenser Bottle - Ref...,Bags & Luggage
...,...,...
795,(Renewed) EDICT by boAt EWE02 in-Ear Wireless ...,Headphones
796,(Renewed) EDICT by boAt EWE02 in-Ear Wireless ...,Headphones
797,(Renewed) EDICT by boAt EWE02 Wireless Bluetoo...,Headphones
798,(Renewed) EDICT by boAt EWE02 Wireless Bluetoo...,Headphones


### C. Checking subcategories

In [98]:
dataset['subcategory'].nunique()
pd.DataFrame(data={'subcategories': dataset['subcategory'].unique()})

42

Unnamed: 0,subcategories
0,Heating & Cooling Appliances
1,Security Cameras
2,Bags & Luggage
3,Fashion & Silver Jewellery
4,All Appliances
5,Kitchen & Home Appliances
6,Baby Fashion
7,Handbags & Clutches
8,Household Supplies
9,Jewellery


# 2. Functions

### A. Task type class

In [99]:
# The Gemini embeddings API generate embeddings differently depending on the task type parameter
# Each task type represents a different use case
# For more details, see the TaskType API reference page
# Two task types are used in this test to compare their results

# Class to set the task type more easily in our functions
class Task_type(Enum):
    SEMANTIC_SIMILARITY = 'semantic_similarity'
    CLASSIFICATION = 'classification'

classification = Task_type.CLASSIFICATION
semantic_similarity = Task_type.SEMANTIC_SIMILARITY

### B. Embedding function

In [139]:
# Generating embeddings

def get_embeddings(text, task_type):
    r = genai.embed_content(
        model='models/text-embedding-004',
        content=text,
        task_type=task_type.value
    )

    return r['embedding']

get_embeddings('hello world', semantic_similarity)[:5], '... TRIMMED]'
get_embeddings('hello world', classification)[:5], '... TRIMMED]'

([-0.0124511365, 0.0053285696, -0.05608754, 0.014377563, -0.005170464],
 '... TRIMMED]')

([-0.0011347869, -0.016465848, -0.03926207, -0.007995497, -0.034482062],
 '... TRIMMED]')

### C. Subcategory function

In [101]:
# Setting a subcategory to a new product based on the product from dataset with the closest embeddings to the new product's embeddings

def get_subcategory(new_product, task_type):
    embd = get_embeddings(new_product, task_type)

    # Comparing the new product's embeddings to the embeddings of every product from the dataset
    if task_type == semantic_similarity:
        check = np.dot(np.stack(dataset['embedding_semantic_similarity']), embd)
    elif task_type == classification:
        check = np.dot(np.stack(dataset['embedding_classification']), embd)

    # Locating the product from dataset whose embeddings are the closest to the embeddings of the new product
    number = np.argmax(check)
    
    # Retrieving the subcategory of the located product from dataset
    subcategory = dataset['subcategory'].iloc[number]
    return subcategory

# 3. Testing

### A. Generating embeddings

In [None]:
# Generating embeddings for every product from dataset, and for each task type

dataset['embedding_semantic_similarity'] = dataset.progress_apply(lambda row: get_embeddings(row['name'], semantic_similarity), axis=1)
dataset['embedding_classification'] = dataset.progress_apply(lambda row: get_embeddings(row['name'], classification), axis=1)

In [140]:
# Checking dataset and the embeddings columns
dataset

Unnamed: 0,name,subcategory,embedding_semantic_similarity,embedding_classification
0,- Dynamo Motor With Fan Blade For Wind Mill Pr...,Heating & Cooling Appliances,"[-0.09697587, 0.00473178, -0.013350259, 0.0240...","[-0.048477206, 0.0027766721, -0.0075184857, 0...."
1,- Dynamo Motor With Fan Blade For Wind Mill Pr...,Heating & Cooling Appliances,"[-0.09697587, 0.00473178, -0.013350259, 0.0240...","[-0.048477206, 0.0027766721, -0.0075184857, 0...."
2,- Home Security Solution (Nano- Max 3 Cameras),Security Cameras,"[-0.03215398, 0.015601758, 0.009883114, 0.0400...","[-0.008784347, -0.0012417992, 0.0005692593, 0...."
3,- Home Security Solution (Nano- Max 3 Cameras),Security Cameras,"[-0.03215398, 0.015601758, 0.009883114, 0.0400...","[-0.008784347, -0.0012417992, 0.0005692593, 0...."
4,GREEMITO 4 in 1 Travel Dispenser Bottle - Ref...,Bags & Luggage,"[-0.012695936, 0.019810265, 0.017035743, -0.01...","[-0.0005123235, 0.029853862, 0.014941086, 0.00..."
...,...,...,...,...
795,(Renewed) EDICT by boAt EWE02 in-Ear Wireless ...,Headphones,"[-0.04471205, -0.01894425, -0.013719039, 0.012...","[-0.007776835, -0.006432484, -0.0005216781, 0...."
796,(Renewed) EDICT by boAt EWE02 in-Ear Wireless ...,Headphones,"[-0.04471205, -0.01894425, -0.013719039, 0.012...","[-0.007776835, -0.006432484, -0.0005216781, 0...."
797,(Renewed) EDICT by boAt EWE02 Wireless Bluetoo...,Headphones,"[-0.02989217, -0.025131183, -0.013902445, 0.01...","[-0.009133643, -0.015499952, -0.008813183, 0.0..."
798,(Renewed) EDICT by boAt EWE02 Wireless Bluetoo...,Headphones,"[-0.02989217, -0.025131183, -0.013902445, 0.01...","[-0.009133643, -0.015499952, -0.008813183, 0.0..."


### B. Creating testing subset

In [103]:
# Subset of products that will be used for the test
# A random sample of n products will be used from this subset

subset = file.iloc[801:1000]
subset = subset.rename(columns={'subcategory':'default_subcategory'})
subset

Unnamed: 0,name,default_subcategory
801,(Renewed) Energy Sistem Sport 2 in-Ear Earphon...,Headphones
802,(Renewed) Energy Sistem Sport 2 in-Ear Earphon...,Headphones
803,(Renewed) EZVIZ by Hikvision C6N Wireless Full...,Cameras
804,(Renewed) EZVIZ by Hikvision C6N Wireless Full...,Security Cameras
805,(Renewed) EZVIZ by Hikvision C6N Wireless Full...,Cameras
...,...,...
995,(Renewed) IMPACT BY HONEYWELL 2MP Bullet CCTV ...,Security Cameras
996,(Renewed) IMPACT BY HONEYWELL 5MP high Resolut...,Refurbished & Open Box
997,(Renewed) IMPACT BY HONEYWELL 5MP high Resolut...,Security Cameras
998,(Renewed) IMPACT BY HONEYWELL 5MP high Resolut...,Refurbished & Open Box


### C. Running test

In [136]:
# See section E. for step-by-step details

# Global variable to store our results dataframe
results_df = ''

def testing(number_of_products):

    # Getting a random sample of n products from our subset
    testing_subset = subset.sample(n=number_of_products, random_state=None).reset_index(drop=True)

    # For every product, generating a new subcategory for each task type
    testing_subset['new_subcategory_semantic_similarity'] = testing_subset.progress_apply(lambda row: get_subcategory(row['name'], semantic_similarity), axis=1)
    testing_subset['new_subcategory_classification'] = testing_subset.progress_apply(lambda row: get_subcategory(row['name'], classification), axis=1)

    # Storing the results in a global dataframe for easier access
    global results_df
    results_df = testing_subset
    
    # Storing the results of successful tests for each task type
    # For each task type, test has passed when the new subcategory is identical to the default subcategory, otherwise it failed
    semantic_similarity_pass = len(testing_subset[(testing_subset['default_subcategory'] == testing_subset['new_subcategory_semantic_similarity'])])
    classification_pass = len(testing_subset[(testing_subset['default_subcategory'] == testing_subset['new_subcategory_classification'])])

    print(f'Tests passed: \nSemantic similarity: {semantic_similarity_pass}/{number_of_products}\nClassification: {classification_pass}/{number_of_products}')

testing(100)

100%|███████████████████████████████| 100/100 [00:35<00:00,  2.81it/s]
100%|███████████████████████████████| 100/100 [00:35<00:00,  2.85it/s]

Tests passed: 
Semantic similarity: 66/100
Classification: 53/100





### D. Test result details

In [137]:
results_df

Unnamed: 0,name,default_subcategory,new_subcategory_semantic_similarity,new_subcategory_classification
0,(Renewed) F&D T300X Covertible Sound Bar with ...,Speakers,Speakers,Speakers
1,(Renewed) GOVO GOCRUSH 410 4W Portable Speaker...,Speakers,All Electronics,All Electronics
2,(Renewed) Havells Ventilair DB 300mm Exhaust F...,Heating & Cooling Appliances,Refurbished & Open Box,Refurbished & Open Box
3,(Renewed) Google Pixel 5 5G 128GB - Just Black,All Electronics,All Electronics,Headphones
4,(Renewed) iBall SoundBuzz i5 Smart Feather Tou...,Speakers,Speakers,Speakers
...,...,...,...,...
95,(Renewed) GIZMORE Gizbar 900 2.0 Channel Home ...,Speakers,Speakers,Speakers
96,(Renewed) EZVIZ C8C 1080p Full HD WiFi Securit...,Security Cameras,Cameras,Cameras
97,(Renewed) Havells Glanzo Garment Steamer 1650 ...,Refurbished & Open Box,All Appliances,Heating & Cooling Appliances
98,(Renewed) Havells Sprint 450mm High Speed Pede...,Heating & Cooling Appliances,Heating & Cooling Appliances,Heating & Cooling Appliances


### E. Step-by-step details

##### 1. Subset sample size

In [106]:
# Size of the random sample of products from the subset
number_of_products = 10

##### 2. Generating sample

In [129]:
# Getting a random sample of n products from our subset
testing_subset = subset.sample(n=number_of_products, random_state=None).reset_index(drop=True)

testing_subset

Unnamed: 0,name,default_subcategory
0,(Renewed) Honor Choice Bluetooth Earbuds Headp...,Headphones
1,(Renewed) EZVIZ by Hikvision|C8C Outdoor WiFi ...,Security Cameras
2,(Renewed) HUL Pureit Advanced Pro Mineral RO+U...,Refurbished & Open Box
3,(Renewed) Hamilton Beach Metal 2 Slice Wide Sl...,Refurbished & Open Box
4,(Renewed) FIFINE K678 USB Podcast Microphone f...,Musical Instruments & Professional Audio
5,(Renewed) Honeywell Moxie V1000 Wireless Bluet...,Speakers
6,(Renewed) F&D A110 3000W 2.1 Channel Wired Mul...,Speakers
7,(Renewed) F&D PA926 60 W Bluetooth Party Speaker,Speakers
8,(Renewed) Fire-Boltt Beast SpO2 1.69” Industry...,Men's Fashion
9,(Renewed) Havells FHVVEDXOWH08 200mm Exhaust F...,Heating & Cooling Appliances


##### 3. Generating subcategories

In [108]:
# For every product, generating a new subcategory for each task type
testing_subset['new_subcategory_semantic_similarity'] = testing_subset.progress_apply(lambda row: get_subcategory(row['name'], semantic_similarity), axis=1)
testing_subset['new_subcategory_classification'] = testing_subset.progress_apply(lambda row: get_subcategory(row['name'], classification), axis=1)

testing_subset

100%|█████████████████████████████████| 10/10 [00:03<00:00,  2.90it/s]
100%|█████████████████████████████████| 10/10 [00:03<00:00,  2.92it/s]


Unnamed: 0,name,default_subcategory,new_subcategory_semantic_similarity,new_subcategory_classification
836,(Renewed) FIFINE Studio Monitor Headphones for...,Headphones,Headphones,All Electronics
935,(Renewed) Havells Swing 400mm Wall Fan (Off Wh...,Heating & Cooling Appliances,Heating & Cooling Appliances,Heating & Cooling Appliances
912,(Renewed) Havells Ciera 300mm Cabin Fan (Ivory),Heating & Cooling Appliances,Refurbished & Open Box,Heating & Cooling Appliances
974,(Renewed) HP 840g3 Elitebook Ultralight 1.48 K...,All Electronics,All Electronics,All Electronics
936,(Renewed) Havells Swing FHWSWSTIVR12 300mm Wal...,Heating & Cooling Appliances,Heating & Cooling Appliances,Heating & Cooling Appliances
835,(Renewed) FIFINE Studio Monitor Headphones for...,Headphones,Headphones,All Electronics
990,"(Renewed) iBall Musi Cube X1, 3W Wireless Blue...",Speakers,Speakers,Speakers
968,(Renewed) Honor Choice Bluetooth Earbuds Headp...,Headphones,Headphones,Headphones
814,(Renewed) F&D A521 X 2.1 Channel Multimedia Bl...,Speakers,Speakers,Speakers
941,(Renewed) Havells Swing Lx 400mm Table Fan (Ch...,Heating & Cooling Appliances,Heating & Cooling Appliances,Heating & Cooling Appliances


##### 4. Measuring results

In [109]:
# Storing the results of successful tests for each task type
# For each task type, test has passed when the new subcategory is identical to the default subcategory, otherwise it failed
semantic_similarity_pass = len(testing_subset[(testing_subset['default_subcategory'] == testing_subset['new_subcategory_semantic_similarity'])])
classification_pass = len(testing_subset[(testing_subset['default_subcategory'] == testing_subset['new_subcategory_classification'])])

semantic_similarity_pass
classification_pass

9

8

##### 5. Returning results

In [110]:
print(f'Tests passed: \nSemantic similarity: {semantic_similarity_pass}/{number_of_products}\nClassification: {classification_pass}/{number_of_products}')

Tests passed: 
Semantic similarity: 9/10
Classification: 8/10
