In [7]:
from IPython.display import Image, display
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from openai import OpenAI
import os
from dotenv import load_dotenv
import ast
import pandas as pd

In [8]:
load_dotenv()
client = OpenAI()

## Image to Keyword

In [9]:
dataset_path =  "products_small.csv"
#dataset_path =  "products.csv"
df = pd.read_csv(dataset_path)

In [10]:
def convert_string_to_list(string_list):
    try:
        actual_list = ast.literal_eval(string_list)
        return actual_list
    except ValueError as e:
        print(f"Error converting string to list: {e}")
        return None

# Apply the string_to_list function to the 'product_image_all' column in the dataframe
df['product_image_all'] = df['product_image_all'].apply(convert_string_to_list)
print(f"The dataframe has {df.shape[0]} rows and {df.shape[1]} columns.")
df.head()

The dataframe has 10 rows and 5 columns.


Unnamed: 0,product_name,product_image_main,product_image_all,product_url,product_description
0,Fidelio Multimedia Cabinet,https://cdn.shopify.com/s/files/1/0093/8033/15...,[https://proof.com.sg/cdn/shop/files/fideliomu...,https://proof.com.sg/products/fidelio-multimed...,Roberto Lazzeroni has enriched the series of F...
1,Get Back 2-Seater Sofa,https://cdn.shopify.com/s/files/1/0093/8033/15...,[https://proof.com.sg/cdn/shop/files/get1.png?...,https://proof.com.sg/products/get-back-sofa,"After Let it Be and Come Together, designed fo..."
2,Grantorino 3-Seater Sofa,https://cdn.shopify.com/s/files/1/0093/8033/15...,[https://proof.com.sg/cdn/shop/files/3seater.p...,https://proof.com.sg/products/grantorino-sofa,"The world of saddlery, a realm of luxury craft..."
3,Mamy Blue Armchair with Ottoman,https://cdn.shopify.com/s/files/1/0093/8033/15...,[https://proof.com.sg/cdn/shop/files/mamy3_1c7...,https://proof.com.sg/products/mamy-blue-armchair,Mamy Blue could be described as an armchair fo...
4,Othello Table,https://cdn.shopify.com/s/files/1/0093/8033/15...,[https://proof.com.sg/cdn/shop/files/OthelloTa...,https://proof.com.sg/products/othello-table,The series of Othello tables are inspired by t...


In [11]:
system_prompt = '''
    You are an agent specialized in tagging images of furniture items, decorative items, or furnishings with relevant keywords that could be used to search for these items on a marketplace.
    
    You will be provided with a list of images of the item, the product_title and product_description of the item depicted in the image, and your goal is to extract keywords for only the item specified. 
    
    Keywords should be concise and in lower case. 
    
    Keywords can describe things like:
    - Item type e.g. 'sofa bed', 'chair', 'desk', 'plant'
    - Item material e.g. 'wood', 'metal', 'fabric'
    - Item style e.g. 'scandinavian', 'vintage', 'industrial', 'modern', 'contemporary'
    - Item color e.g. 'red', 'blue', 'white'
    
    Only deduce material, style or color keywords when it is obvious that they make the item depicted in the image stand out or specifically mentioned in the product_description

    Return keywords in the format of an array of strings, and only the array of strings, like this:
    ['desk', 'industrial', 'metal']
    
'''

In [12]:
def create_user_prompt(product_name, product_description, image_urls):
    user_prompt = [
        {
            "type": "text",
            "text": f"product_name: {product_name} \n",
        }, 
        {
            "type": "text",
            "text": f"product_description: {product_description} \n",
        }
    ]
    user_prompt.extend([
        {
            "type": "image_url",
            "image_url": {
                "url": url,
            },
        } for url in image_urls
    ])
    return user_prompt


In [13]:
def analyze_image(user_prompt):
    response = client.chat.completions.create(
    model="gpt-4-vision-preview",
    messages=[
        {
            "role": "system",
            "content": system_prompt
        },
        {
            "role": "user",
            "content": user_prompt,
        },
    ],
        max_tokens=500,
        top_p=0.1
    )

    return response.choices[0].message.content

In [55]:
keywords = []
for index, row in df.iterrows():
    try:
        user_prompt = create_user_prompt(row['product_name'], row['product_description'], row['product_image_all'])
        keyword_list = analyze_image(user_prompt)
        keywords.append(keyword_list)
    except Exception as e:
        keywords.append(f"Error: {str(e)}")
    print(index)

df['keywords'] = keywords

0
1
2
3
4
5
6
7
8
9


In [56]:
df.to_csv('product_small_keywords.csv', index=False)


## Load Products CSV with Keywords

In [14]:

df = pd.read_csv('product_keywords.csv')

df.head()

Unnamed: 0,product_name,product_image,product_url,keywords
0,Fidelio Multimedia Cabinet,https://cdn.shopify.com/s/files/1/0093/8033/15...,https://proof.com.sg/products/fidelio-multimed...,"['multimedia cabinet', 'wood', 'modern', 'brown']"
1,Get Back 2-Seater Sofa,https://cdn.shopify.com/s/files/1/0093/8033/15...,https://proof.com.sg/products/get-back-sofa,"['sofa', '2-seater', 'leather', 'brown', 'mode..."
2,Grantorino 3-Seater Sofa,https://cdn.shopify.com/s/files/1/0093/8033/15...,https://proof.com.sg/products/grantorino-sofa,"['sofa', '3-seater', 'fabric', 'wood', 'beige'..."
3,Mamy Blue Armchair with Ottoman,https://cdn.shopify.com/s/files/1/0093/8033/15...,https://proof.com.sg/products/mamy-blue-armchair,"['armchair', 'ottoman', 'leather', 'blue', 'br..."
4,Othello Table,https://cdn.shopify.com/s/files/1/0093/8033/15...,https://proof.com.sg/products/othello-table,"['table', 'oval', 'marble top', 'wooden base',..."


In [15]:
unique_keywords = set()

for index, row in df.iterrows():
    try:
        keyword_list = convert_string_to_list(row['keywords'])
        if keyword_list:
            unique_keywords.update(keyword_list)
    except Exception as e:
        print(f"Skipping product '{row['product_name']}' at index {index} due to error: {e}")
        continue

unique_keywords_list = list(unique_keywords)



Skipping product 'Go Figure' at index 61 due to error: unterminated string literal (detected at line 1) (<unknown>, line 1)
Skipping product 'Light The Way' at index 96 due to error: invalid syntax (<unknown>, line 1)
Skipping product 'Burano Rug' at index 102 due to error: invalid syntax (<unknown>, line 1)
Skipping product 'Turn+' at index 155 due to error: invalid syntax (<unknown>, line 1)
Skipping product 'Sister Louise' at index 162 due to error: invalid syntax (<unknown>, line 1)


In [16]:
# Count the occurrences of each keyword in the 'keywords' column of the dataframe
keyword_counts = {}
for keyword in unique_keywords_list:
    count = df['keywords'].str.contains(keyword, regex=False).sum()
    keyword_counts[keyword] = count

# Convert the dictionary to a dataframe for better visualization
df_keyword_counts = pd.DataFrame(list(keyword_counts.items()), columns=['Keyword', 'Count']).sort_values(by='Count', ascending=False)


print(f"There are {len(unique_keywords_list)} unique keywords.")
print("First 10 keywords and their counts:")
print(df_keyword_counts[:10])
print("\nLast 5 keywords and their counts:")
print(df_keyword_counts[-5:])


There are 265 unique keywords.
First 10 keywords and their counts:
          Keyword  Count
255         metal     78
32         modern     70
139          wood     62
29          table     50
85         fabric     48
168  contemporary     46
33          chair     40
69          white     39
237         black     37
235         brown     36

Last 5 keywords and their counts:
           Keyword  Count
109           teal      1
110   leather care      1
111  woven pattern      1
115       two-tone      1
264        quilted      1


## Embedding Keywords

In [17]:
def get_embedding(value, model="text-embedding-3-large"): 
    embeddings = client.embeddings.create(
      model=model,
      input=value,
      encoding_format="float"
    )
    return embeddings.data[0].embedding

In [18]:
df_keywords = pd.DataFrame(unique_keywords_list, columns=['keyword'])
df_keywords['embedding'] = df_keywords['keyword'].apply(lambda x: get_embedding(x))
df_keywords

KeyboardInterrupt: 

### for embeddings of new products/keywords

In [None]:
def compare_keyword(keyword):
    embedded_value = get_embedding(keyword)
    df_keywords['similarity'] = df_keywords['embedding'].apply(lambda x: cosine_similarity(np.array(x).reshape(1,-1), np.array(embedded_value).reshape(1, -1)))
    most_similar = df_keywords.sort_values('similarity', ascending=False).iloc[0]
    return most_similar

def replace_keyword(keyword, threshold = 0.6):
    most_similar = compare_keyword(keyword)
    if most_similar['similarity'] > threshold:
        print(f"Replacing '{keyword}' with existing keyword: '{most_similar['keyword']}'")
        return most_similar['keyword']
    return keyword

## Demo

In [19]:
def search_products_by_keyword(df, keyword):
    # Filter the dataframe for products containing the keyword in their keywords list
    filtered_df = df[df['keywords'].apply(lambda x: keyword in x)]
    # Return the first 5 products
    return filtered_df

def render_products(df):
    # Render the first 5 products, images, and keywords
    for index, row in df.iterrows():
        product_name = row['product_name']
        product_url = row['product_url']
        product_image = row['product_image']
        product_keywords = row['keywords']
        print(f"Product Name: {product_name}")
        print(f"Product URL: {product_url}")
        print(f"Keywords: {product_keywords}")
        display(Image(url=product_image))
        print("\n")


In [20]:
keyword = "tv cabinet"
number = 10

filtered_df = search_products_by_keyword(df, keyword)
print(f"There are {len(filtered_df)} products that contain the keyword '{keyword}' \nHere are {number} of them:\n\n")

render_products(filtered_df[:number])

There are 1 products that contain the keyword 'tv cabinet' 
Here are 10 of them:


Product Name: Full TV Cabinet
Product URL: https://proof.com.sg/products/full-tv-cabinets
Keywords: ['tv cabinet', 'wood', 'modern', 'dark brown']




